Perf benchmark

This commit is contained in:
Kenny Daniel 2024-02-04 12:31:57 -08:00
parent 2626c0160e
commit 2e11ab275d
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 63 additions and 1 deletions

1
.gitignore vendored

@ -4,3 +4,4 @@ coverage
dist
*.tgz
example.parquet
benchmark.parquet

50
benchmark.js Normal file

@ -0,0 +1,50 @@
import { createReadStream, createWriteStream, promises as fs } from 'fs'
import { pipeline } from 'stream/promises'
import { parquetRead } from './src/hyparquet.js'
// Benchmark script: makes sure a local copy of a wikipedia parquet file
// exists (downloading it on first run), then times parquetRead over it.
const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'

// download test parquet file if needed
let stat = await fs.stat('benchmark.parquet').catch(() => undefined)
if (!stat) {
  console.log(`downloading ${url}`)
  const res = await fetch(url)
  if (!res.ok) throw new Error(res.statusText)
  try {
    // pipeline handles backpressure, closes the file, and only resolves
    // once every byte has been flushed to disk
    await pipeline(res.body, createWriteStream('benchmark.parquet'))
  } catch (err) {
    // do not leave a truncated file behind: the next run would skip the
    // download and try to parse it
    await fs.rm('benchmark.parquet', { force: true })
    throw err
  }
  console.log('download benchmark.parquet')
  // the file did not exist when stat was first taken, so read its size now
  stat = await fs.stat('benchmark.parquet')
}

// asyncBuffer: file-backed object exposing byteLength and an async slice
const file = {
  byteLength: stat.size,
  /**
   * Read a byte range of benchmark.parquet.
   *
   * @param {number} start first byte offset (inclusive)
   * @param {number} [end] end byte offset (exclusive), defaults to end of file
   * @returns {Promise<ArrayBuffer>}
   */
  async slice(start, end) {
    // createReadStream throws if end < start, so short-circuit empty ranges
    if (end !== undefined && end <= start) return new ArrayBuffer(0)
    // createReadStream treats `end` as inclusive, slice() treats it as exclusive
    const options = end === undefined ? { start } : { start, end: end - 1 }
    const readStream = createReadStream('benchmark.parquet', options)
    return readStreamToArrayBuffer(readStream)
  },
}

const startTime = performance.now()
console.log('parsing wikipedia data...')

// read parquet file
await parquetRead({ file })
const ms = performance.now() - startTime
console.log(`parsed ${stat.size.toLocaleString()} bytes in ${ms.toFixed(0)} ms`)
/**
 * Collect every chunk emitted by a Node.js readable stream into one ArrayBuffer.
 *
 * @param {import('stream').Readable} input stream emitting Buffer chunks (e.g. an fs ReadStream)
 * @returns {Promise<ArrayBuffer>} resolves on 'end' with exactly the bytes
 *   that were emitted; rejects if the stream emits 'error'
 */
function readStreamToArrayBuffer(input) {
  return new Promise((resolve, reject) => {
    /** @type {Buffer[]} */
    const chunks = []
    input.on('data', chunk => chunks.push(chunk))
    input.on('end', () => {
      const joined = Buffer.concat(chunks)
      // A small Buffer can be a view onto Node's shared allocation pool, in
      // which case joined.buffer is larger than the data and the data starts
      // at a non-zero offset. Copy out only the bytes owned by this Buffer.
      const { buffer, byteOffset, byteLength } = joined
      resolve(buffer.slice(byteOffset, byteOffset + byteLength))
    })
    input.on('error', reject)
  })
}

@ -1,4 +1,4 @@
import { parquetMetadata, parquetMetadataAsync, toJson } from './src/hyparquet.js'
import { parquetMetadata, parquetMetadataAsync, parquetRead, toJson } from './src/hyparquet.js'
// Look up the page elements with ids 'dropzone' and 'layout'
const dropzone = document.getElementById('dropzone')
const layout = document.getElementById('layout')
@ -79,6 +79,11 @@ function processFile(file) {
const arrayBuffer = e.target.result
const metadata = parquetMetadata(arrayBuffer)
renderSidebar(arrayBuffer, metadata, file.name)
const startTime = performance.now()
parquetRead({ file: arrayBuffer, onComplete(data) {
const ms = performance.now() - startTime
console.log(`parsed ${file.name} in ${ms.toFixed(0)} ms`)
} }) // TODO
} catch (e) {
console.error('Error parsing file', e)
dropzone.innerHTML = `<strong>${file.name}</strong>`

@ -1,3 +1,9 @@
/**
* The MIT License (MIT)
* Copyright (c) 2016 Zhipeng Jia
* https://github.com/zhipeng-jia/snappyjs
*/
// Bit masks indexed by byte count: WORD_MASK[n] has the low n bytes set (n = 0..4)
const WORD_MASK = [0, 0xff, 0xffff, 0xffffff, 0xffffffff]
/**