diff --git a/benchmark.js b/benchmark.js index a812844..6a496e9 100644 --- a/benchmark.js +++ b/benchmark.js @@ -1,4 +1,5 @@ import { createReadStream, createWriteStream, promises as fs } from 'fs' +import { snappyUncompressor } from 'hysnappy' import { parquetRead } from './src/hyparquet.js' const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet' @@ -18,6 +19,7 @@ if (!stat) { console.log('downloaded example.parquet') stat = await fs.stat('example.parquet').catch(() => undefined) } + // asyncBuffer const file = { byteLength: stat.size, @@ -30,8 +32,12 @@ const file = { } const startTime = performance.now() console.log('parsing wikipedia data...') + // read parquet file -await parquetRead({ file }) +await parquetRead({ + file, + compressors: { SNAPPY: snappyUncompressor() }, // hysnappy wasm +}) const ms = performance.now() - startTime console.log(`parsed ${stat.size.toLocaleString()} bytes in ${ms.toFixed(0)} ms`) diff --git a/package.json b/package.json index b18036b..fc106dc 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,7 @@ "eslint-plugin-import": "2.29.1", "eslint-plugin-jsdoc": "48.2.0", "http-server": "14.1.1", + "hysnappy": "0.2.0", "typescript": "5.3.3", "vitest": "1.3.1" } diff --git a/src/datapage.js b/src/datapage.js index 9254983..84433d8 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -1,4 +1,3 @@ -import { ParquetType } from './constants.js' import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' import { getMaxDefinitionLevel,