Node-specific exports for asyncBufferFromFile (#80)

* Update README for asyncBufferFromFile
* Simplify asyncBufferFromFile
This commit is contained in:
Kenny Daniel 2025-05-30 13:01:20 -07:00 committed by GitHub
parent ec233fbf74
commit f23b2757ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 86 additions and 86 deletions

@ -47,19 +47,6 @@ Check out a minimal parquet viewer demo that shows how to integrate hyparquet in
## Quick Start
### Node.js Example
To read the contents of a local parquet file in a node.js environment use `asyncBufferFromFile`:
```javascript
const { asyncBufferFromFile, parquetReadObjects } = await import('hyparquet')
const file = await asyncBufferFromFile(filename)
const data = await parquetReadObjects({ file })
```
Note: hyparquet is published as an ES module, so dynamic `import()` may be required on the command line.
### Browser Example
In the browser use `asyncBufferFromUrl` to wrap a url for reading asynchronously over the network.
@ -78,6 +65,19 @@ const data = await parquetReadObjects({
})
```
### Node.js Example
To read the contents of a local parquet file in a node.js environment use `asyncBufferFromFile`:
```javascript
const { asyncBufferFromFile, parquetReadObjects } = await import('hyparquet')
const file = await asyncBufferFromFile('example.parquet')
const data = await parquetReadObjects({ file })
```
Note: hyparquet is published as an ES module, so dynamic `import()` may be required for old versions of node.
## Parquet Writing
To create parquet files from javascript, check out the [hyparquet-writer](https://github.com/hyparam/hyparquet-writer) package.
@ -124,15 +124,6 @@ interface AsyncBuffer {
In most cases, you should probably use `asyncBufferFromUrl` or `asyncBufferFromFile` to create an `AsyncBuffer` for hyparquet.
#### asyncBufferFromFile
If you are in a local node.js environment, use `asyncBufferFromFile` to wrap a local file as an `AsyncBuffer`:
```typescript
const file: AsyncBuffer = await asyncBufferFromFile('local.parquet')
const data = await parquetReadObjects({ file })
```
#### asyncBufferFromUrl
If you want to read a parquet file remotely over http, use `asyncBufferFromUrl` to wrap an http url as an `AsyncBuffer` using http range requests.
@ -148,6 +139,17 @@ const file: AsyncBuffer = await asyncBufferFromUrl({ url, requestInit, byteLengt
const data = await parquetReadObjects({ file })
```
#### asyncBufferFromFile
If you are in a node.js environment, use `asyncBufferFromFile` to wrap a local file as an `AsyncBuffer`:
```typescript
import { asyncBufferFromFile, parquetReadObjects } from 'hyparquet'
const file: AsyncBuffer = await asyncBufferFromFile('example.parquet')
const data = await parquetReadObjects({ file })
```
#### ArrayBuffer
You can provide an `ArrayBuffer` anywhere that an `AsyncBuffer` is expected. This is useful if you already have the entire parquet file in memory.
@ -252,7 +254,6 @@ You can include support for ALL parquet `compressors` plus hysnappy using the [h
import { parquetReadObjects } from 'hyparquet'
import { compressors } from 'hyparquet-compressors'
const file = await asyncBufferFromFile(filename)
const data = await parquetReadObjects({ file, compressors })
```

@ -21,17 +21,23 @@
"type": "git",
"url": "git+https://github.com/hyparam/hyparquet.git"
},
"main": "src/hyparquet.js",
"files": [
"src",
"types"
],
"type": "module",
"types": "types/hyparquet.d.ts",
"main": "src/hyparquet.js",
"exports": {
".": {
"types": "./types/hyparquet.d.ts",
"import": "./src/hyparquet.js"
"browser": {
"types": "./types/hyparquet.d.ts",
"import": "./src/hyparquet.js"
},
"default": {
"types": "./types/node.d.ts",
"import": "./src/node.js"
}
},
"./src/*.js": {
"types": "./types/*.d.ts",

@ -4,7 +4,7 @@ export { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata
export { parquetRead }
export { parquetQuery } from './query.js'
export { snappyUncompress } from './snappy.js'
export { asyncBufferFromFile, asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, flatten, toJson } from './utils.js'
export { asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, flatten, toJson } from './utils.js'
/**
* This is a helper function to read parquet row data as a promise.

33
src/node.js Normal file

@ -0,0 +1,33 @@
import { createReadStream, promises as fs } from 'fs'
export * from './hyparquet.js'
/**
* @import {AsyncBuffer} from '../src/types.js'
*/
/**
* Construct an AsyncBuffer for a local file using node fs package.
*
* @param {string} filename
* @returns {Promise<AsyncBuffer>}
*/
/**
 * Construct an AsyncBuffer for a local file using node fs package.
 *
 * @param {string} filename
 * @returns {Promise<AsyncBuffer>}
 */
export async function asyncBufferFromFile(filename) {
  const stat = await fs.stat(filename)
  return {
    byteLength: stat.size,
    slice(start, end) {
      // Stream the requested byte range and collect it into an ArrayBuffer.
      // NOTE(review): node's createReadStream treats `end` as inclusive, so a
      // defined `end` yields one extra trailing byte vs ArrayBuffer.slice
      // semantics — callers appear to tolerate this; confirm against the
      // AsyncBuffer contract.
      return new Promise((resolve, reject) => {
        const stream = createReadStream(filename, { start, end })
        /** @type {any[]} */
        const parts = []
        stream.on('error', reject)
        stream.on('data', part => parts.push(part))
        stream.on('end', () => {
          const joined = Buffer.concat(parts)
          // Re-slice the backing buffer so the result is exactly the bytes read
          resolve(joined.buffer.slice(joined.byteOffset, joined.byteOffset + joined.byteLength))
        })
      })
    },
  }
}

@ -127,45 +127,6 @@ export async function asyncBufferFromUrl({ url, byteLength, requestInit, fetch:
}
}
/**
* Construct an AsyncBuffer for a local file using node fs package.
*
* @param {string} filename
* @returns {Promise<AsyncBuffer>}
*/
export async function asyncBufferFromFile(filename) {
const fsPackage = 'fs' // webpack no include
const fs = await import(fsPackage)
const stat = await fs.promises.stat(filename)
return {
byteLength: stat.size,
async slice(start, end) {
// read file slice
const readStream = fs.createReadStream(filename, { start, end })
return await readStreamToArrayBuffer(readStream)
},
}
}
/**
* Convert a node ReadStream to ArrayBuffer.
*
* @param {import('stream').Readable} input
* @returns {Promise<ArrayBuffer>}
*/
function readStreamToArrayBuffer(input) {
return new Promise((resolve, reject) => {
/** @type {Buffer[]} */
const chunks = []
input.on('data', chunk => chunks.push(chunk))
input.on('end', () => {
const buffer = Buffer.concat(chunks)
resolve(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))
})
input.on('error', reject)
})
}
/**
* Returns a cached layer on top of an AsyncBuffer. For caching slices of a file
* that are read multiple times, possibly over a network.

@ -1,9 +1,9 @@
import { describe, expect, it } from 'vitest'
import { readColumn } from '../src/column.js'
import { parquetMetadata } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/node.js'
import { getColumnRange } from '../src/plan.js'
import { getSchemaPath } from '../src/schema.js'
import { asyncBufferFromFile } from '../src/utils.js'
const values = [null, 1, -2, NaN, 0, -1, -0, 2]

@ -1,9 +1,9 @@
import fs from 'fs'
import { describe, expect, it } from 'vitest'
import { parquetMetadata } from '../src/hyparquet.js'
import { parquetMetadata, toJson } from '../src/hyparquet.js'
import { readColumnIndex, readOffsetIndex } from '../src/indexes.js'
import { asyncBufferFromFile } from '../src/node.js'
import { getSchemaPath } from '../src/schema.js'
import { asyncBufferFromFile, toJson } from '../src/utils.js'
import { fileToJson } from './helpers.js'
describe('readColumnIndex', () => {

@ -1,7 +1,7 @@
import fs from 'fs'
import { describe, expect, it } from 'vitest'
import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js'
import { asyncBufferFromFile, toJson } from '../src/utils.js'
import { parquetMetadata, parquetMetadataAsync, toJson } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/node.js'
import { fileToJson } from './helpers.js'
const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet'))

@ -23,13 +23,12 @@ describe('package.json', () => {
})
it('should have exports with types first', () => {
const { exports } = packageJson
expect(exports).toBeDefined()
for (const [, exportObj] of Object.entries(exports)) {
if (typeof exportObj === 'object') {
expect(Object.keys(exportObj)).toEqual(['types', 'import'])
} else {
expect(typeof exportObj).toBe('string')
}
}
expect(Object.keys(exports)).toEqual(['.', './src/*.js'])
// node vs default (browser)
expect(Object.keys(exports['.'])).toEqual(['browser', 'default'])
expect(Object.keys(exports['.'].browser)).toEqual(['types', 'import'])
expect(Object.keys(exports['.'].default)).toEqual(['types', 'import'])
// deep imports
expect(Object.keys(exports['./src/*.js'])).toEqual(['types', 'import'])
})
})

@ -1,6 +1,6 @@
import { describe, expect, it } from 'vitest'
import { parquetMetadataAsync } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/utils.js'
import { asyncBufferFromFile } from '../src/node.js'
import { parquetPlan } from '../src/plan.js'
describe('parquetPlan', () => {

@ -1,6 +1,6 @@
import { describe, expect, it } from 'vitest'
import { parquetQuery } from '../src/query.js'
import { asyncBufferFromFile } from '../src/utils.js'
import { asyncBufferFromFile } from '../src/node.js'
import { countingBuffer } from './helpers.js'
describe('parquetQuery', () => {

@ -1,7 +1,7 @@
import { describe, expect, it, vi } from 'vitest'
import { convertWithDictionary } from '../src/convert.js'
import { parquetMetadataAsync, parquetRead, parquetReadObjects } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/utils.js'
import { asyncBufferFromFile } from '../src/node.js'
import { countingBuffer } from './helpers.js'
vi.mock('../src/convert.js', { spy: true })

@ -1,6 +1,6 @@
import { describe, expect, it } from 'vitest'
import { parquetReadObjects } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/utils.js'
import { asyncBufferFromFile } from '../src/node.js'
describe('parquetRead utf8', () => {
it('default utf8 behavior', async () => {

@ -1,8 +1,8 @@
import fs from 'fs'
import { compressors } from 'hyparquet-compressors'
import { describe, expect, it } from 'vitest'
import { parquetMetadataAsync, parquetRead } from '../src/hyparquet.js'
import { asyncBufferFromFile, toJson } from '../src/utils.js'
import { parquetMetadataAsync, parquetRead, toJson } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/node.js'
import { fileToJson } from './helpers.js'
describe('parquetRead test files', () => {

@ -1,6 +1,6 @@
import { describe, expect, it } from 'vitest'
import { parquetMetadataAsync, parquetSchema } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/utils.js'
import { asyncBufferFromFile } from '../src/node.js'
describe('parquetSchema', () => {
it('parse schema tree from rowgroups.parquet', async () => {