mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-06 06:51:54 +00:00
Node-specific exports for asyncBufferFromFile (#80)
* Update README for asyncBufferFromFile * Simplify asyncBufferFromFile
This commit is contained in:
parent
ec233fbf74
commit
f23b2757ca
47
README.md
47
README.md
@ -47,19 +47,6 @@ Check out a minimal parquet viewer demo that shows how to integrate hyparquet in
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Node.js Example
|
||||
|
||||
To read the contents of a local parquet file in a node.js environment use `asyncBufferFromFile`:
|
||||
|
||||
```javascript
|
||||
const { asyncBufferFromFile, parquetReadObjects } = await import('hyparquet')
|
||||
|
||||
const file = await asyncBufferFromFile(filename)
|
||||
const data = await parquetReadObjects({ file })
|
||||
```
|
||||
|
||||
Note: hyparquet is published as an ES module, so dynamic `import()` may be required on the command line.
|
||||
|
||||
### Browser Example
|
||||
|
||||
In the browser use `asyncBufferFromUrl` to wrap a url for reading asynchronously over the network.
|
||||
@ -78,6 +65,19 @@ const data = await parquetReadObjects({
|
||||
})
|
||||
```
|
||||
|
||||
### Node.js Example
|
||||
|
||||
To read the contents of a local parquet file in a node.js environment use `asyncBufferFromFile`:
|
||||
|
||||
```javascript
|
||||
const { asyncBufferFromFile, parquetReadObjects } = await import('hyparquet')
|
||||
|
||||
const file = await asyncBufferFromFile('example.parquet')
|
||||
const data = await parquetReadObjects({ file })
|
||||
```
|
||||
|
||||
Note: hyparquet is published as an ES module, so dynamic `import()` may be required for old versions of node.
|
||||
|
||||
## Parquet Writing
|
||||
|
||||
To create parquet files from javascript, check out the [hyparquet-writer](https://github.com/hyparam/hyparquet-writer) package.
|
||||
@ -124,15 +124,6 @@ interface AsyncBuffer {
|
||||
|
||||
In most cases, you should probably use `asyncBufferFromUrl` or `asyncBufferFromFile` to create an `AsyncBuffer` for hyparquet.
|
||||
|
||||
#### asyncBufferFromFile
|
||||
|
||||
If you are in a local node.js environment, use `asyncBufferFromFile` to wrap a local file as an `AsyncBuffer`:
|
||||
|
||||
```typescript
|
||||
const file: AsyncBuffer = asyncBufferFromFile('local.parquet')
|
||||
const data = await parquetReadObjects({ file })
|
||||
```
|
||||
|
||||
#### asyncBufferFromUrl
|
||||
|
||||
If you want to read a parquet file remotely over http, use `asyncBufferFromUrl` to wrap an http url as an `AsyncBuffer` using http range requests.
|
||||
@ -148,6 +139,17 @@ const file: AsyncBuffer = await asyncBufferFromUrl({ url, requestInit, byteLengt
|
||||
const data = await parquetReadObjects({ file })
|
||||
```
|
||||
|
||||
#### asyncBufferFromFile
|
||||
|
||||
If you are in a node.js environment, use `asyncBufferFromFile` to wrap a local file as an `AsyncBuffer`:
|
||||
|
||||
```typescript
|
||||
import { asyncBufferFromFile, parquetReadObjects } from 'hyparquet'
|
||||
|
||||
const file: AsyncBuffer = await asyncBufferFromFile('example.parquet')
|
||||
const data = await parquetReadObjects({ file })
|
||||
```
|
||||
|
||||
#### ArrayBuffer
|
||||
|
||||
You can provide an `ArrayBuffer` anywhere that an `AsyncBuffer` is expected. This is useful if you already have the entire parquet file in memory.
|
||||
@ -252,7 +254,6 @@ You can include support for ALL parquet `compressors` plus hysnappy using the [h
|
||||
import { parquetReadObjects } from 'hyparquet'
|
||||
import { compressors } from 'hyparquet-compressors'
|
||||
|
||||
const file = await asyncBufferFromFile(filename)
|
||||
const data = await parquetReadObjects({ file, compressors })
|
||||
```
|
||||
|
||||
|
||||
12
package.json
12
package.json
@ -21,17 +21,23 @@
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/hyparam/hyparquet.git"
|
||||
},
|
||||
"main": "src/hyparquet.js",
|
||||
"files": [
|
||||
"src",
|
||||
"types"
|
||||
],
|
||||
"type": "module",
|
||||
"types": "types/hyparquet.d.ts",
|
||||
"main": "src/hyparquet.js",
|
||||
"exports": {
|
||||
".": {
|
||||
"types": "./types/hyparquet.d.ts",
|
||||
"import": "./src/hyparquet.js"
|
||||
"browser": {
|
||||
"types": "./types/hyparquet.d.ts",
|
||||
"import": "./src/hyparquet.js"
|
||||
},
|
||||
"default": {
|
||||
"types": "./types/node.d.ts",
|
||||
"import": "./src/node.js"
|
||||
}
|
||||
},
|
||||
"./src/*.js": {
|
||||
"types": "./types/*.d.ts",
|
||||
|
||||
@ -4,7 +4,7 @@ export { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata
|
||||
export { parquetRead }
|
||||
export { parquetQuery } from './query.js'
|
||||
export { snappyUncompress } from './snappy.js'
|
||||
export { asyncBufferFromFile, asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, flatten, toJson } from './utils.js'
|
||||
export { asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, flatten, toJson } from './utils.js'
|
||||
|
||||
/**
|
||||
* This is a helper function to read parquet row data as a promise.
|
||||
|
||||
33
src/node.js
Normal file
33
src/node.js
Normal file
@ -0,0 +1,33 @@
|
||||
import { createReadStream, promises as fs } from 'fs'
|
||||
|
||||
export * from './hyparquet.js'
|
||||
|
||||
/**
|
||||
* @import {AsyncBuffer} from '../src/types.js'
|
||||
*/
|
||||
/**
|
||||
* Construct an AsyncBuffer for a local file using node fs package.
|
||||
*
|
||||
* @param {string} filename
|
||||
* @returns {Promise<AsyncBuffer>}
|
||||
*/
|
||||
export async function asyncBufferFromFile(filename) {
|
||||
const { size } = await fs.stat(filename)
|
||||
return {
|
||||
byteLength: size,
|
||||
slice(start, end) {
|
||||
// read file slice
|
||||
const reader = createReadStream(filename, { start, end })
|
||||
return new Promise((resolve, reject) => {
|
||||
/** @type {any[]} */
|
||||
const chunks = []
|
||||
reader.on('data', chunk => chunks.push(chunk))
|
||||
reader.on('error', reject)
|
||||
reader.on('end', () => {
|
||||
const buffer = Buffer.concat(chunks)
|
||||
resolve(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))
|
||||
})
|
||||
})
|
||||
},
|
||||
}
|
||||
}
|
||||
39
src/utils.js
39
src/utils.js
@ -127,45 +127,6 @@ export async function asyncBufferFromUrl({ url, byteLength, requestInit, fetch:
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Construct an AsyncBuffer for a local file using node fs package.
 *
 * `slice` follows ArrayBuffer.slice conventions: `start` is inclusive, `end`
 * is exclusive, and an omitted `end` reads through the end of the file.
 *
 * @param {string} filename path of the local file to wrap
 * @returns {Promise<AsyncBuffer>} rejects if the file cannot be stat'ed
 */
export async function asyncBufferFromFile(filename) {
  // The module name is held in a variable and imported dynamically;
  // per the original note this keeps webpack from including 'fs'.
  const fsPackage = 'fs' // webpack no include
  const fs = await import(fsPackage)
  const stat = await fs.promises.stat(filename)
  return {
    byteLength: stat.size,
    async slice(start, end) {
      // An empty range needs no disk access; createReadStream would also
      // reject a range whose end precedes its start.
      if (end !== undefined && end <= start) return new ArrayBuffer(0)
      // createReadStream treats `end` as inclusive, while slice's `end` is
      // exclusive, so shift it down by one to avoid reading an extra byte.
      const range = end === undefined ? { start } : { start, end: end - 1 }
      const readStream = fs.createReadStream(filename, range)
      return await readStreamToArrayBuffer(readStream)
    },
  }
}
|
||||
|
||||
/**
|
||||
* Convert a node ReadStream to ArrayBuffer.
|
||||
*
|
||||
* @param {import('stream').Readable} input
|
||||
* @returns {Promise<ArrayBuffer>}
|
||||
*/
|
||||
function readStreamToArrayBuffer(input) {
|
||||
return new Promise((resolve, reject) => {
|
||||
/** @type {Buffer[]} */
|
||||
const chunks = []
|
||||
input.on('data', chunk => chunks.push(chunk))
|
||||
input.on('end', () => {
|
||||
const buffer = Buffer.concat(chunks)
|
||||
resolve(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))
|
||||
})
|
||||
input.on('error', reject)
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a cached layer on top of an AsyncBuffer. For caching slices of a file
|
||||
* that are read multiple times, possibly over a network.
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { readColumn } from '../src/column.js'
|
||||
import { parquetMetadata } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
import { getColumnRange } from '../src/plan.js'
|
||||
import { getSchemaPath } from '../src/schema.js'
|
||||
import { asyncBufferFromFile } from '../src/utils.js'
|
||||
|
||||
const values = [null, 1, -2, NaN, 0, -1, -0, 2]
|
||||
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
import fs from 'fs'
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetMetadata } from '../src/hyparquet.js'
|
||||
import { parquetMetadata, toJson } from '../src/hyparquet.js'
|
||||
import { readColumnIndex, readOffsetIndex } from '../src/indexes.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
import { getSchemaPath } from '../src/schema.js'
|
||||
import { asyncBufferFromFile, toJson } from '../src/utils.js'
|
||||
import { fileToJson } from './helpers.js'
|
||||
|
||||
describe('readColumnIndex', () => {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import fs from 'fs'
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile, toJson } from '../src/utils.js'
|
||||
import { parquetMetadata, parquetMetadataAsync, toJson } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
import { fileToJson } from './helpers.js'
|
||||
|
||||
const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet'))
|
||||
|
||||
@ -23,13 +23,12 @@ describe('package.json', () => {
|
||||
})
|
||||
it('should have exports with types first', () => {
|
||||
const { exports } = packageJson
|
||||
expect(exports).toBeDefined()
|
||||
for (const [, exportObj] of Object.entries(exports)) {
|
||||
if (typeof exportObj === 'object') {
|
||||
expect(Object.keys(exportObj)).toEqual(['types', 'import'])
|
||||
} else {
|
||||
expect(typeof exportObj).toBe('string')
|
||||
}
|
||||
}
|
||||
expect(Object.keys(exports)).toEqual(['.', './src/*.js'])
|
||||
// browser vs default (node)
|
||||
expect(Object.keys(exports['.'])).toEqual(['browser', 'default'])
|
||||
expect(Object.keys(exports['.'].browser)).toEqual(['types', 'import'])
|
||||
expect(Object.keys(exports['.'].default)).toEqual(['types', 'import'])
|
||||
// deep imports
|
||||
expect(Object.keys(exports['./src/*.js'])).toEqual(['types', 'import'])
|
||||
})
|
||||
})
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetMetadataAsync } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/utils.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
import { parquetPlan } from '../src/plan.js'
|
||||
|
||||
describe('parquetPlan', () => {
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetQuery } from '../src/query.js'
|
||||
import { asyncBufferFromFile } from '../src/utils.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
import { countingBuffer } from './helpers.js'
|
||||
|
||||
describe('parquetQuery', () => {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import { describe, expect, it, vi } from 'vitest'
|
||||
import { convertWithDictionary } from '../src/convert.js'
|
||||
import { parquetMetadataAsync, parquetRead, parquetReadObjects } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/utils.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
import { countingBuffer } from './helpers.js'
|
||||
|
||||
vi.mock('../src/convert.js', { spy: true })
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetReadObjects } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/utils.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
|
||||
describe('parquetRead utf8', () => {
|
||||
it('default utf8 behavior', async () => {
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
import fs from 'fs'
|
||||
import { compressors } from 'hyparquet-compressors'
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetMetadataAsync, parquetRead } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile, toJson } from '../src/utils.js'
|
||||
import { parquetMetadataAsync, parquetRead, toJson } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
import { fileToJson } from './helpers.js'
|
||||
|
||||
describe('parquetRead test files', () => {
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetMetadataAsync, parquetSchema } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/utils.js'
|
||||
import { asyncBufferFromFile } from '../src/node.js'
|
||||
|
||||
describe('parquetSchema', () => {
|
||||
it('parse schema tree from rowgroups.parquet', async () => {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user