hyparquet/benchmark.js

59 lines
1.8 KiB
JavaScript
Raw Normal View History

2024-02-04 20:31:57 +00:00
import { createReadStream, createWriteStream, promises as fs } from 'fs'
import { snappyUncompressor } from 'hysnappy'
import { pipeline } from 'stream/promises'
import { parquetRead } from './src/hyparquet.js'
// remote parquet file used as benchmark input, and where it is cached locally
const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'
const filename = 'example.parquet'

// download test parquet file if needed
let stat = await fs.stat(filename).catch(() => undefined)
if (!stat) {
  console.log(`downloading ${url}`)
  const res = await fetch(url)
  if (!res.ok) throw new Error(res.statusText)
  if (!res.body) throw new Error('empty response body')
  // Stream into a temporary file and rename it once complete, so that an
  // interrupted download is never mistaken for a finished file on a later run.
  // pipeline applies backpressure and only resolves after the file has been
  // fully written, so the stat below observes the final size.
  const partial = `${filename}.part`
  await pipeline(res.body, createWriteStream(partial))
  await fs.rename(partial, filename)
  console.log('downloaded example.parquet')
  // let a failure here surface directly instead of as a TypeError on stat.size
  stat = await fs.stat(filename)
}
2024-02-21 22:16:51 +00:00
2024-02-04 20:31:57 +00:00
// asyncBuffer
/**
 * Lazy view of the local parquet file: exposes the file size and reads
 * byte ranges from disk on demand.
 */
const file = {
  byteLength: stat.size,
  /**
   * Read the bytes in [start, end) from the file.
   *
   * @param {number} start offset of the first byte to read
   * @param {number} [end] offset one past the last byte to read; omit to read to end of file
   * @returns {Promise<ArrayBuffer>}
   */
  async slice(start, end) {
    // an empty range has nothing to read, and createReadStream throws when end < start
    if (end !== undefined && end <= start) return new ArrayBuffer(0)
    // createReadStream treats `end` as inclusive, while slice follows the
    // ArrayBuffer.slice convention of an exclusive end
    const range = end === undefined ? { start } : { start, end: end - 1 }
    const readStream = createReadStream(filename, range)
    return readStreamToArrayBuffer(readStream)
  },
}
// start the clock before any parsing work, including creation of the decompressor
const begin = performance.now()
console.log('parsing example.parquet data...')

// read parquet file
const readOptions = {
  file,
  compressors: { SNAPPY: snappyUncompressor() }, // hysnappy wasm
}
await parquetRead(readOptions)

// report wall-clock time for the whole read
const elapsedMs = performance.now() - begin
const byteCount = stat.size.toLocaleString()
console.log(`parsed ${byteCount} bytes in ${elapsedMs.toFixed(0)} ms`)
/**
 * Collect a Node.js readable stream into a single ArrayBuffer.
 *
 * @param {import('stream').Readable} input stream emitting Buffer chunks
 * @returns {Promise<ArrayBuffer>} buffer holding exactly the bytes the stream emitted;
 *   rejects with the stream's error if it emits 'error'
 */
function readStreamToArrayBuffer(input) {
  return new Promise((resolve, reject) => {
    /** @type {Buffer[]} */
    const chunks = []
    input.on('data', chunk => chunks.push(chunk))
    input.on('end', () => {
      const joined = Buffer.concat(chunks)
      const { buffer, byteOffset, byteLength } = joined
      if (byteOffset === 0 && byteLength === buffer.byteLength) {
        // the Buffer owns its whole ArrayBuffer: hand it over without copying
        resolve(buffer)
      } else {
        // Small Buffers are views into Node's shared allocation pool, so
        // `joined.buffer` is larger than the data and may start at a nonzero
        // offset. Copy out only the bytes that belong to this read.
        resolve(buffer.slice(byteOffset, byteOffset + byteLength))
      }
    })
    input.on('error', reject)
  })
}