diff --git a/README.md b/README.md index 85f81ec..fbd75d7 100644 --- a/README.md +++ b/README.md @@ -47,19 +47,6 @@ Check out a minimal parquet viewer demo that shows how to integrate hyparquet in ## Quick Start -### Node.js Example - -To read the contents of a local parquet file in a node.js environment use `asyncBufferFromFile`: - -```javascript -const { asyncBufferFromFile, parquetReadObjects } = await import('hyparquet') - -const file = await asyncBufferFromFile(filename) -const data = await parquetReadObjects({ file }) -``` - -Note: hyparquet is published as an ES module, so dynamic `import()` may be required on the command line. - ### Browser Example In the browser use `asyncBufferFromUrl` to wrap a url for reading asynchronously over the network. @@ -78,6 +65,19 @@ const data = await parquetReadObjects({ }) ``` +### Node.js Example + +To read the contents of a local parquet file in a node.js environment use `asyncBufferFromFile`: + +```javascript +const { asyncBufferFromFile, parquetReadObjects } = await import('hyparquet') + +const file = await asyncBufferFromFile('example.parquet') +const data = await parquetReadObjects({ file }) +``` + +Note: hyparquet is published as an ES module, so dynamic `import()` may be required for old versions of node. + ## Parquet Writing To create parquet files from javascript, check out the [hyparquet-writer](https://github.com/hyparam/hyparquet-writer) package. @@ -124,15 +124,6 @@ interface AsyncBuffer { In most cases, you should probably use `asyncBufferFromUrl` or `asyncBufferFromFile` to create an `AsyncBuffer` for hyparquet. 
-#### asyncBufferFromFile - -If you are in a local node.js environment, use `asyncBufferFromFile` to wrap a local file as an `AsyncBuffer`: - -```typescript -const file: AsyncBuffer = asyncBufferFromFile('local.parquet') -const data = await parquetReadObjects({ file }) -``` - #### asyncBufferFromUrl If you want to read a parquet file remotely over http, use `asyncBufferFromUrl` to wrap an http url as an `AsyncBuffer` using http range requests. @@ -148,6 +139,17 @@ const file: AsyncBuffer = await asyncBufferFromUrl({ url, requestInit, byteLengt const data = await parquetReadObjects({ file }) ``` +#### asyncBufferFromFile + +If you are in a node.js environment, use `asyncBufferFromFile` to wrap a local file as an `AsyncBuffer`: + +```typescript +import { asyncBufferFromFile, parquetReadObjects } from 'hyparquet' + +const file: AsyncBuffer = await asyncBufferFromFile('example.parquet') +const data = await parquetReadObjects({ file }) +``` + #### ArrayBuffer You can provide an `ArrayBuffer` anywhere that an `AsyncBuffer` is expected. This is useful if you already have the entire parquet file in memory. 
@@ -252,7 +254,6 @@ You can include support for ALL parquet `compressors` plus hysnappy using the [h import { parquetReadObjects } from 'hyparquet' import { compressors } from 'hyparquet-compressors' -const file = await asyncBufferFromFile(filename) const data = await parquetReadObjects({ file, compressors }) ``` diff --git a/package.json b/package.json index 0680089..9812e5f 100644 --- a/package.json +++ b/package.json @@ -21,17 +21,23 @@ "type": "git", "url": "git+https://github.com/hyparam/hyparquet.git" }, - "main": "src/hyparquet.js", "files": [ "src", "types" ], "type": "module", "types": "types/hyparquet.d.ts", + "main": "src/hyparquet.js", "exports": { ".": { - "types": "./types/hyparquet.d.ts", - "import": "./src/hyparquet.js" + "browser": { + "types": "./types/hyparquet.d.ts", + "import": "./src/hyparquet.js" + }, + "default": { + "types": "./types/node.d.ts", + "import": "./src/node.js" + } }, "./src/*.js": { "types": "./types/*.d.ts", diff --git a/src/hyparquet.js b/src/hyparquet.js index 893445a..3ca6400 100644 --- a/src/hyparquet.js +++ b/src/hyparquet.js @@ -4,7 +4,7 @@ export { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata export { parquetRead } export { parquetQuery } from './query.js' export { snappyUncompress } from './snappy.js' -export { asyncBufferFromFile, asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, flatten, toJson } from './utils.js' +export { asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, flatten, toJson } from './utils.js' /** * This is a helper function to read parquet row data as a promise. diff --git a/src/node.js b/src/node.js new file mode 100644 index 0000000..ac587c3 --- /dev/null +++ b/src/node.js @@ -0,0 +1,33 @@ +import { createReadStream, promises as fs } from 'fs' + +export * from './hyparquet.js' + +/** + * @import {AsyncBuffer} from '../src/types.js' + */ +/** + * Construct an AsyncBuffer for a local file using node fs package. 
+ * + * @param {string} filename + * @returns {Promise<AsyncBuffer>} + */ +export async function asyncBufferFromFile(filename) { + const { size } = await fs.stat(filename) + return { + byteLength: size, + slice(start, end) { + // read file slice + const reader = createReadStream(filename, { start, end }) + return new Promise((resolve, reject) => { + /** @type {any[]} */ + const chunks = [] + reader.on('data', chunk => chunks.push(chunk)) + reader.on('error', reject) + reader.on('end', () => { + const buffer = Buffer.concat(chunks) + resolve(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)) + }) + }) + }, + } +} diff --git a/src/utils.js b/src/utils.js index ae773f5..7148209 100644 --- a/src/utils.js +++ b/src/utils.js @@ -127,45 +127,6 @@ export async function asyncBufferFromUrl({ url, byteLength, requestInit, fetch: } } -/** - * Construct an AsyncBuffer for a local file using node fs package. - * - * @param {string} filename - * @returns {Promise<AsyncBuffer>} - */ -export async function asyncBufferFromFile(filename) { - const fsPackage = 'fs' // webpack no include - const fs = await import(fsPackage) - const stat = await fs.promises.stat(filename) - return { - byteLength: stat.size, - async slice(start, end) { - // read file slice - const readStream = fs.createReadStream(filename, { start, end }) - return await readStreamToArrayBuffer(readStream) - }, - } -} - -/** - * Convert a node ReadStream to ArrayBuffer. - * - * @param {import('stream').Readable} input - * @returns {Promise<ArrayBuffer>} - */ -function readStreamToArrayBuffer(input) { - return new Promise((resolve, reject) => { - /** @type {Buffer[]} */ - const chunks = [] - input.on('data', chunk => chunks.push(chunk)) - input.on('end', () => { - const buffer = Buffer.concat(chunks) - resolve(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)) - }) - input.on('error', reject) - }) -} - /** * Returns a cached layer on top of an AsyncBuffer. 
For caching slices of a file * that are read multiple times, possibly over a network. diff --git a/test/column.test.js b/test/column.test.js index 47800c4..02be166 100644 --- a/test/column.test.js +++ b/test/column.test.js @@ -1,9 +1,9 @@ import { describe, expect, it } from 'vitest' import { readColumn } from '../src/column.js' import { parquetMetadata } from '../src/hyparquet.js' +import { asyncBufferFromFile } from '../src/node.js' import { getColumnRange } from '../src/plan.js' import { getSchemaPath } from '../src/schema.js' -import { asyncBufferFromFile } from '../src/utils.js' const values = [null, 1, -2, NaN, 0, -1, -0, 2] diff --git a/test/indexes.test.js b/test/indexes.test.js index bf203cd..ca366bd 100644 --- a/test/indexes.test.js +++ b/test/indexes.test.js @@ -1,9 +1,9 @@ import fs from 'fs' import { describe, expect, it } from 'vitest' -import { parquetMetadata } from '../src/hyparquet.js' +import { parquetMetadata, toJson } from '../src/hyparquet.js' import { readColumnIndex, readOffsetIndex } from '../src/indexes.js' +import { asyncBufferFromFile } from '../src/node.js' import { getSchemaPath } from '../src/schema.js' -import { asyncBufferFromFile, toJson } from '../src/utils.js' import { fileToJson } from './helpers.js' describe('readColumnIndex', () => { diff --git a/test/metadata.test.js b/test/metadata.test.js index 8dc6dfa..0f5e322 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -1,7 +1,7 @@ import fs from 'fs' import { describe, expect, it } from 'vitest' -import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js' -import { asyncBufferFromFile, toJson } from '../src/utils.js' +import { parquetMetadata, parquetMetadataAsync, toJson } from '../src/hyparquet.js' +import { asyncBufferFromFile } from '../src/node.js' import { fileToJson } from './helpers.js' const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet')) diff --git a/test/package.test.js b/test/package.test.js index 9731021..874caf3 
100644 --- a/test/package.test.js +++ b/test/package.test.js @@ -23,13 +23,12 @@ describe('package.json', () => { }) it('should have exports with types first', () => { const { exports } = packageJson - expect(exports).toBeDefined() - for (const [, exportObj] of Object.entries(exports)) { - if (typeof exportObj === 'object') { - expect(Object.keys(exportObj)).toEqual(['types', 'import']) - } else { - expect(typeof exportObj).toBe('string') - } - } + expect(Object.keys(exports)).toEqual(['.', './src/*.js']) + // browser vs default (node) + expect(Object.keys(exports['.'])).toEqual(['browser', 'default']) + expect(Object.keys(exports['.'].browser)).toEqual(['types', 'import']) + expect(Object.keys(exports['.'].default)).toEqual(['types', 'import']) + // deep imports + expect(Object.keys(exports['./src/*.js'])).toEqual(['types', 'import']) }) }) diff --git a/test/plan.test.js b/test/plan.test.js index d7be474..056c4f3 100644 --- a/test/plan.test.js +++ b/test/plan.test.js @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest' import { parquetMetadataAsync } from '../src/hyparquet.js' -import { asyncBufferFromFile } from '../src/utils.js' +import { asyncBufferFromFile } from '../src/node.js' import { parquetPlan } from '../src/plan.js' describe('parquetPlan', () => { diff --git a/test/query.test.js b/test/query.test.js index 2cde705..d2a4be8 100644 --- a/test/query.test.js +++ b/test/query.test.js @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest' import { parquetQuery } from '../src/query.js' -import { asyncBufferFromFile } from '../src/utils.js' +import { asyncBufferFromFile } from '../src/node.js' import { countingBuffer } from './helpers.js' describe('parquetQuery', () => { diff --git a/test/read.test.js b/test/read.test.js index aae158e..e7f38d4 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -1,7 +1,7 @@ import { describe, expect, it, vi } from 'vitest' import { convertWithDictionary } from '../src/convert.js' import {
parquetMetadataAsync, parquetRead, parquetReadObjects } from '../src/hyparquet.js' -import { asyncBufferFromFile } from '../src/utils.js' +import { asyncBufferFromFile } from '../src/node.js' import { countingBuffer } from './helpers.js' vi.mock('../src/convert.js', { spy: true }) diff --git a/test/read.utf8.test.js b/test/read.utf8.test.js index 1a7645d..578b371 100644 --- a/test/read.utf8.test.js +++ b/test/read.utf8.test.js @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest' import { parquetReadObjects } from '../src/hyparquet.js' -import { asyncBufferFromFile } from '../src/utils.js' +import { asyncBufferFromFile } from '../src/node.js' describe('parquetRead utf8', () => { it('default utf8 behavior', async () => { diff --git a/test/readFiles.test.js b/test/readFiles.test.js index 260ea6b..a044e47 100644 --- a/test/readFiles.test.js +++ b/test/readFiles.test.js @@ -1,8 +1,8 @@ import fs from 'fs' import { compressors } from 'hyparquet-compressors' import { describe, expect, it } from 'vitest' -import { parquetMetadataAsync, parquetRead } from '../src/hyparquet.js' -import { asyncBufferFromFile, toJson } from '../src/utils.js' +import { parquetMetadataAsync, parquetRead, toJson } from '../src/hyparquet.js' +import { asyncBufferFromFile } from '../src/node.js' import { fileToJson } from './helpers.js' describe('parquetRead test files', () => { diff --git a/test/schemaTree.test.js b/test/schemaTree.test.js index 0f71951..cca0fa5 100644 --- a/test/schemaTree.test.js +++ b/test/schemaTree.test.js @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest' import { parquetMetadataAsync, parquetSchema } from '../src/hyparquet.js' -import { asyncBufferFromFile } from '../src/utils.js' +import { asyncBufferFromFile } from '../src/node.js' describe('parquetSchema', () => { it('parse schema tree from rowgroups.parquet', async () => {