diff --git a/benchmark.js b/benchmark.js index 6a496e9..4fa2de5 100644 --- a/benchmark.js +++ b/benchmark.js @@ -3,21 +3,22 @@ import { snappyUncompressor } from 'hysnappy' import { parquetRead } from './src/hyparquet.js' const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet' +const filename = 'example.parquet' // download test parquet file if needed -let stat = await fs.stat('example.parquet').catch(() => undefined) +let stat = await fs.stat(filename).catch(() => undefined) if (!stat) { console.log('downloading ' + url) const res = await fetch(url) if (!res.ok) throw new Error(res.statusText) // write to file async - const writeStream = createWriteStream('example.parquet') + const writeStream = createWriteStream(filename) for await (const chunk of res.body) { writeStream.write(chunk) } writeStream.end() console.log('downloaded example.parquet') - stat = await fs.stat('example.parquet').catch(() => undefined) + stat = await fs.stat(filename).catch(() => undefined) } // asyncBuffer @@ -25,7 +26,7 @@ const file = { byteLength: stat.size, async slice(start, end) { // read file slice - const readStream = createReadStream('example.parquet', { start, end }) + const readStream = createReadStream(filename, { start, end }) const buffer = await readStreamToArrayBuffer(readStream) return new Uint8Array(buffer).buffer }, diff --git a/package.json b/package.json index 63b9a6f..04f1563 100644 --- a/package.json +++ b/package.json @@ -20,7 +20,7 @@ "type": "module", "types": "src/hyparquet.d.ts", "scripts": { - "coverage": "vitest run --coverage", + "coverage": "vitest run --coverage --coverage.include=src", "demo": "http-server -o", "lint": "eslint .", "test": "vitest run", @@ -28,14 +28,14 @@ }, "devDependencies": { "@types/node": "20.12.7", - "@typescript-eslint/eslint-plugin": "7.6.0", - "@vitest/coverage-v8": "1.4.0", + "@typescript-eslint/eslint-plugin": "7.7.0", + "@vitest/coverage-v8": "1.5.0", "eslint": "8.57.0", "eslint-plugin-import": "2.29.1", "eslint-plugin-jsdoc": "48.2.3", "http-server": "14.1.1", "hysnappy": "0.3.0", "typescript": "5.4.5", - "vitest": "1.4.0" + "vitest": "1.5.0" } } diff --git a/src/metadata.js b/src/metadata.js index e5ba68f..249eb2c 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -29,11 +29,12 @@ import { deserializeTCompactProtocol } from './thrift.js' * @returns {Promise} parquet metadata object */ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) { - if (!asyncBuffer) throw new Error('parquet asyncBuffer is required') + if (!asyncBuffer) throw new Error('parquet file is required') + if (!(asyncBuffer.byteLength >= 0)) throw new Error('parquet file byteLength is required') // fetch last bytes (footer) of the file const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize) - const footerBuffer = await asyncBuffer.slice(footerOffset) + const footerBuffer = await asyncBuffer.slice(footerOffset, asyncBuffer.byteLength) // Check for parquet magic number "PAR1" const footerView = new DataView(footerBuffer) @@ -72,7 +73,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << * @returns {FileMetaData} parquet metadata object */ export function parquetMetadata(arrayBuffer) { - if (!arrayBuffer) throw new Error('parquet arrayBuffer is required') + if (!arrayBuffer) throw new Error('parquet file is required') const view = new DataView(arrayBuffer) // Validate footer magic number "PAR1" diff --git a/test/metadata.test.js b/test/metadata.test.js index 539970f..6dd062a 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -19,7 +19,7 @@ describe('parquetMetadata', () => { it('throws for arrayBuffer undefined', () => { // @ts-expect-error testing invalid input - expect(() => parquetMetadata(undefined)).toThrow('parquet arrayBuffer is required') + expect(() => parquetMetadata(undefined)).toThrow('parquet file is required') }) it('throws for a too short file', () => { @@ -64,7 +64,7 @@ describe('parquetMetadataAsync', () => { it('throws for asyncBuffer undefined', async () => { const arrayBuffer = undefined await expect(parquetMetadataAsync(arrayBuffer)).rejects - .toThrow('parquet asyncBuffer is required') + .toThrow('parquet file is required') }) it('throws for invalid magic number', async () => { diff --git a/test/read.test.js b/test/read.test.js index 7f2bf32..9644d4a 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -10,6 +10,12 @@ describe('parquetRead', () => { .rejects.toThrow('parquet file is required') }) + it('throws error for undefined byteLength', async () => { + const file = { byteLength: undefined, slice: () => new ArrayBuffer(0) } + await expect(parquetRead({ file })) + .rejects.toThrow('parquet file byteLength is required') + }) + it('filter by row', async () => { const file = fileToAsyncBuffer('test/files/rowgroups.parquet') await parquetRead({