// hyparquet/src/utils.js
/**
 * Replace bigint, date, etc with legal JSON types.
 * When parsing parquet files, bigints are used to represent 64-bit integers.
 * However, JSON does not support bigints, so it's helpful to convert to numbers.
 *
 * Conversion rules, applied recursively:
 * - `undefined` becomes `null` (object properties that are `undefined` are dropped instead)
 * - `bigint` becomes `number` (values beyond Number.MAX_SAFE_INTEGER lose precision)
 * - `Uint8Array` becomes a plain array of byte values
 * - `Date` becomes its ISO-8601 string
 * - arrays and objects are copied with their contents converted
 * - everything else is returned unchanged
 *
 * @param {any} obj object to convert
 * @returns {unknown} converted object
 */
export function toJson(obj) {
  if (obj === undefined) return null
  if (typeof obj === 'bigint') return Number(obj)
  if (Array.isArray(obj)) return obj.map(toJson)
  if (obj instanceof Uint8Array) return Array.from(obj)
  if (obj instanceof Date) return obj.toISOString()
  if (obj instanceof Object) {
    /** @type {Record<string, unknown>} */
    const newObj = {}
    for (const key of Object.keys(obj)) {
      // skip undefined properties entirely, the same way JSON.stringify does
      if (obj[key] === undefined) continue
      newObj[key] = toJson(obj[key])
    }
    return newObj
  }
  return obj
}
/**
 * Concatenate two arrays fast.
 *
 * Appends every element of `bbb` to `aaa` in place. Elements are pushed in
 * fixed-size chunks so that spreading a very large array into `push` does not
 * exceed the engine's argument-count limit.
 *
 * @param {any[]} aaa first array, mutated in place
 * @param {DecodedArray} bbb second array, left untouched
 */
export function concat(aaa, bbb) {
  const chunk = 10000
  for (let i = 0; i < bbb.length; i += chunk) {
    aaa.push(...bbb.slice(i, i + chunk))
  }
}
/**
 * Deep equality comparison
 *
 * Primitives are compared with `===`. Uint8Arrays are compared byte by byte,
 * Dates by their timestamp, arrays element by element, and other objects by
 * their own enumerable keys.
 *
 * @param {any} a First object to compare
 * @param {any} b Second object to compare
 * @returns {boolean} true if objects are equal
 */
export function equals(a, b) {
  if (a === b) return true
  if (a instanceof Uint8Array && b instanceof Uint8Array) {
    return a.length === b.length && a.every((v, i) => v === b[i])
  }
  if (!a || !b || typeof a !== typeof b) return false
  if (Array.isArray(a) && Array.isArray(b)) {
    return a.length === b.length && a.every((v, i) => equals(v, b[i]))
  }
  // distinct primitives and functions were already rejected by the === check
  if (typeof a !== 'object') return false
  // Dates have no enumerable keys, so they must be compared by timestamp
  if (a instanceof Date || b instanceof Date) {
    return a instanceof Date && b instanceof Date && a.getTime() === b.getTime()
  }
  const keys = Object.keys(a)
  if (keys.length !== Object.keys(b).length) return false
  // require the key to exist on b, so {x: undefined} does not match {y: undefined}
  return keys.every(k => Object.prototype.hasOwnProperty.call(b, k) && equals(a[k], b[k]))
}
/**
 * Get the byte length of a URL using a HEAD request.
 * If requestInit is provided, it will be passed to fetch.
 * The request method is always forced to HEAD, overriding any method in requestInit.
 *
 * @param {string} url
 * @param {RequestInit} [requestInit] fetch options
 * @returns {Promise<number>} value of the Content-Length response header
 * @throws {Error} if the response is not ok or has no Content-Length header
 */
export async function byteLengthFromUrl(url, requestInit) {
  const res = await fetch(url, { ...requestInit, method: 'HEAD' })
  if (!res.ok) throw new Error(`fetch head failed ${res.status}`)
  const length = res.headers.get('Content-Length')
  if (!length) throw new Error('missing content length')
  // explicit radix: Content-Length is always a decimal number
  return parseInt(length, 10)
}
/**
 * Construct an AsyncBuffer for a URL.
 * If byteLength is not provided, will make a HEAD request to get the file size.
 * If requestInit is provided, it will be passed to fetch.
 *
 * Each slice is fetched with an HTTP Range request. If the server ignores the
 * Range header and answers 200 with the whole object, that body is kept and
 * all later slices are served from it without further requests.
 *
 * @param {object} options
 * @param {string} options.url
 * @param {number} [options.byteLength]
 * @param {RequestInit} [options.requestInit]
 * @returns {Promise<AsyncBuffer>}
 * @throws {Error} if url is missing
 */
export async function asyncBufferFromUrl({ url, byteLength, requestInit }) {
  if (!url) throw new Error('missing url')
  // byte length from HEAD request
  byteLength ||= await byteLengthFromUrl(url, requestInit)

  /**
   * A promise for the whole buffer, if range requests are not supported.
   * @type {Promise<ArrayBuffer>|undefined}
   */
  let buffer = undefined
  const init = requestInit || {}

  return {
    byteLength,
    async slice(start, end) {
      if (buffer) {
        return buffer.then(whole => whole.slice(start, end))
      }

      const headers = new Headers(init.headers)
      // HTTP byte ranges are inclusive, slice ends are exclusive
      const endStr = end === undefined ? '' : end - 1
      headers.set('Range', `bytes=${start}-${endStr}`)

      const res = await fetch(url, { ...init, headers })
      if (!res.ok || !res.body) throw new Error(`fetch failed ${res.status}`)

      if (res.status === 200) {
        // Endpoint does not support range requests and returned the whole object
        buffer = res.arrayBuffer()
        return buffer.then(whole => whole.slice(start, end))
      } else if (res.status === 206) {
        // The endpoint supports range requests and sent us the requested range
        return res.arrayBuffer()
      } else {
        throw new Error(`fetch received unexpected status code ${res.status}`)
      }
    },
  }
}
/**
 * Construct an AsyncBuffer for a local file using node fs package.
 *
 * `slice(start, end)` follows ArrayBuffer.slice semantics: `end` is exclusive,
 * negative offsets count back from the end of the file, and out-of-range
 * offsets are clamped to the file size measured when the buffer was created.
 *
 * @param {string} filename
 * @returns {Promise<AsyncBuffer>}
 */
export async function asyncBufferFromFile(filename) {
  const fsPackage = 'fs' // webpack no include
  const fs = await import(fsPackage)
  const { size } = await fs.promises.stat(filename)
  return {
    byteLength: size,
    async slice(start, end) {
      // resolve negative (suffix) offsets and clamp to the file size
      const from = start < 0 ? Math.max(size + start, 0) : Math.min(start, size)
      let to = size
      if (end !== undefined) {
        to = end < 0 ? Math.max(size + end, 0) : Math.min(end, size)
      }
      // createReadStream rejects start > end, so answer empty ranges directly
      if (to <= from) return new ArrayBuffer(0)
      // read file slice; the stream's `end` option is inclusive, ours is exclusive
      const readStream = fs.createReadStream(filename, { start: from, end: to - 1 })
      return await readStreamToArrayBuffer(readStream)
    },
  }
}
/**
 * Drain a node readable stream and return everything it emitted as a single
 * ArrayBuffer.
 *
 * @param {import('stream').Readable} input stream to consume
 * @returns {Promise<ArrayBuffer>} resolves on 'end', rejects on 'error'
 */
function readStreamToArrayBuffer(input) {
  /** @type {Buffer[]} */
  const parts = []
  return new Promise((resolve, reject) => {
    /** @param {Buffer} piece */
    const collect = piece => {
      parts.push(piece)
    }
    const finish = () => {
      const { buffer, byteOffset, byteLength } = Buffer.concat(parts)
      // copy out only the bytes this Buffer views; its backing store may be larger
      resolve(buffer.slice(byteOffset, byteOffset + byteLength))
    }
    input.on('data', collect)
    input.on('end', finish)
    input.on('error', reject)
  })
}
/**
 * Returns a cached layer on top of an AsyncBuffer. For caching slices of a file
 * that are read multiple times, possibly over a network.
 *
 * Results are cached per byte range, keyed by cacheKey, so repeated and
 * concurrent requests for the same range share one underlying read. A read
 * that rejects is removed from the cache so a later call can retry it.
 *
 * @param {AsyncBuffer} file file-like object to cache
 * @returns {AsyncBuffer} cached file-like object
 */
export function cachedAsyncBuffer({ byteLength, slice }) {
  const cache = new Map()
  return {
    byteLength,
    /**
     * @param {number} start
     * @param {number} [end]
     * @returns {Awaitable<ArrayBuffer>}
     */
    slice(start, end) {
      const key = cacheKey(start, end, byteLength)
      const cached = cache.get(key)
      if (cached) return cached
      // cache miss, read from file
      const promise = slice(start, end)
      cache.set(key, promise)
      // Evict failed reads so a transient error is not served from cache forever.
      // The caller still receives the original rejecting promise.
      Promise.resolve(promise).catch(() => {
        if (cache.get(key) === promise) cache.delete(key)
      })
      return promise
    },
  }
}
/**
 * Build the canonical 'start,end' cache key for a byte range.
 * When the file size is known, open-ended and suffix (negative start) ranges
 * are rewritten as absolute offsets so equivalent requests share a key.
 *
 * @import {AsyncBuffer, Awaitable, DecodedArray} from '../src/types.d.ts'
 * @param {number} start start byte of range
 * @param {number} [end] end byte of range, or undefined for suffix range
 * @param {number} [size] size of file, or undefined for suffix range
 * @returns {string}
 */
function cacheKey(start, end, size) {
  const hasEnd = end !== undefined

  // suffix range: the last |start| bytes of the file
  if (start < 0) {
    if (hasEnd) throw new Error(`invalid suffix range [${start}, ${end}]`)
    return size === undefined ? `${start},` : `${size + start},${size}`
  }

  // explicit range
  if (hasEnd) {
    if (start > end) throw new Error(`invalid empty range [${start}, ${end}]`)
    return `${start},${end}`
  }

  // open-ended range: from start to end of file
  return size === undefined ? `${start},` : `${start},${size}`
}