diff --git a/README.md b/README.md
index 851fea4..2b8467b 100644
--- a/README.md
+++ b/README.md
@@ -52,18 +52,9 @@ npm install hyparquet
 To read the entire contents of a parquet file in a node.js environment:
 
 ```js
-const { parquetRead } = await import('hyparquet')
-const { createReadStream } = await import('fs')
-const file = { // AsyncBuffer
-  byteLength: stat.size,
-  async slice(start, end) {
-    // read file slice
-    const readStream = createReadStream(filename, { start, end })
-    return await readStreamToArrayBuffer(readStream)
-  }
-}
+const { asyncBufferFromFile, parquetRead } = await import('hyparquet')
 await parquetRead({
-  file,
+  file: await asyncBufferFromFile(filename),
   onComplete: data => console.log(data)
 })
 ```
@@ -71,30 +62,16 @@ await parquetRead({
 ### Browser
 
 Hyparquet supports asynchronous fetching of parquet files over a network.
-You can provide an `AsyncBuffer` which is like a js `ArrayBuffer` but the `slice` method returns `Promise<ArrayBuffer>`.
 
 ```js
-const { parquetRead } = await import('https://cdn.jsdelivr.net/npm/hyparquet/src/hyparquet.min.js')
-const file = { // AsyncBuffer
-  byteLength,
-  async slice(start, end) {
-    // fetch byte range from url
-    const headers = new Headers()
-    headers.set('Range', `bytes=${start}-${end - 1}`)
-    const res = await fetch(url, { headers })
-    if (!res.ok || !res.body) throw new Error('fetch failed')
-    return res.arrayBuffer()
-  },
-}
+const { asyncBufferFromUrl, parquetRead } = await import('https://cdn.jsdelivr.net/npm/hyparquet/src/hyparquet.min.js')
+const url = 'https://hyperparam-public.s3.amazonaws.com/bunnies.parquet'
 await parquetRead({
-  file,
+  file: await asyncBufferFromUrl(url),
   onComplete: data => console.log(data)
 })
 ```
 
-In a node.js environment:
-
-
 ## Metadata
 
 You can read just the metadata, including schema and data statistics using the `parquetMetadata` function:
@@ -122,7 +99,7 @@ const metadata = parquetMetadata(arrayBuffer)
 
 To parse parquet files from a user drag-and-drop action, see example in [index.html](index.html).
 
-## Filtering
+## Filtering by Row and Column
 
 To read large parquet files, it is recommended that you filter by row and column.
 Hyparquet is designed to load only the minimal amount of data needed to fulfill a query.
diff --git a/src/hyparquet.d.ts b/src/hyparquet.d.ts
index bf57406..7957e01 100644
--- a/src/hyparquet.d.ts
+++ b/src/hyparquet.d.ts
@@ -88,6 +88,22 @@ export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean
  */
 export function toJson(obj: any): any
 
+/**
+ * Construct an AsyncBuffer for a URL.
+ *
+ * @param {string} url
+ * @returns {Promise<AsyncBuffer>}
+ */
+export function asyncBufferFromUrl(url: string): Promise<AsyncBuffer>
+
+/**
+ * Construct an AsyncBuffer for a local file using node fs package.
+ *
+ * @param {string} filename
+ * @returns {Promise<AsyncBuffer>}
+ */
+export function asyncBufferFromFile(filename: string): Promise<AsyncBuffer>
+
 /**
  * Parquet query options for reading data
  */
diff --git a/src/hyparquet.js b/src/hyparquet.js
index 51a441a..407e7f5 100644
--- a/src/hyparquet.js
+++ b/src/hyparquet.js
@@ -7,5 +7,5 @@ export { parquetRead }
 import { snappyUncompress } from './snappy.js'
 export { snappyUncompress }
 
-import { toJson } from './utils.js'
-export { toJson }
+import { asyncBufferFromFile, asyncBufferFromUrl, toJson } from './utils.js'
+export { asyncBufferFromFile, asyncBufferFromUrl, toJson }
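
The diff re-exports `asyncBufferFromFile` and `asyncBufferFromUrl` from `src/utils.js`, but their implementations are not part of this change set. Below is a rough sketch of what such helpers could look like, assuming the same `AsyncBuffer` shape (a `byteLength` plus a promise-returning `slice`) and the Range-request / file-slice logic the old README examples inlined; names and details beyond what the diff shows are assumptions, not hyparquet's actual code.

```ts
// Hypothetical sketch only: the real helpers live in src/utils.js and are not
// shown in this diff. Behavior is inferred from the removed README examples.
import { open, stat } from 'node:fs/promises'

interface AsyncBuffer {
  byteLength: number
  slice(start: number, end?: number): Promise<ArrayBuffer>
}

// Build an AsyncBuffer that fetches byte ranges of a remote parquet file on demand.
// Assumes the server reports Content-Length and honors Range requests.
async function asyncBufferFromUrl(url: string): Promise<AsyncBuffer> {
  const head = await fetch(url, { method: 'HEAD' })
  if (!head.ok) throw new Error('fetch failed')
  const byteLength = Number(head.headers.get('Content-Length'))
  return {
    byteLength,
    async slice(start, end = byteLength) {
      // fetch only the requested byte range (Range end is inclusive)
      const headers = new Headers({ Range: `bytes=${start}-${end - 1}` })
      const res = await fetch(url, { headers })
      if (!res.ok || !res.body) throw new Error('fetch failed')
      return res.arrayBuffer()
    },
  }
}

// Build an AsyncBuffer that reads byte ranges of a local file via node:fs.
async function asyncBufferFromFile(filename: string): Promise<AsyncBuffer> {
  const { size } = await stat(filename)
  return {
    byteLength: size,
    async slice(start, end = size) {
      const length = end - start
      const arrayBuffer = new ArrayBuffer(length)
      const handle = await open(filename, 'r')
      try {
        // read exactly [start, end) into the buffer
        await handle.read(new Uint8Array(arrayBuffer), 0, length, start)
      } finally {
        await handle.close()
      }
      return arrayBuffer
    },
  }
}
```

Splitting reads this way lets `parquetRead` request only the byte ranges it needs (the footer metadata, then the selected column chunks) instead of loading the whole file, which matches the README's claim that hyparquet loads only the minimal data needed to fulfill a query.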