mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-06 06:51:54 +00:00
Perf benchmark
This commit is contained in:
parent
2626c0160e
commit
2e11ab275d
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,3 +4,4 @@ coverage
|
||||
dist
|
||||
*.tgz
|
||||
example.parquet
|
||||
benchmark.parquet
|
||||
|
||||
50
benchmark.js
Normal file
50
benchmark.js
Normal file
@ -0,0 +1,50 @@
|
||||
import { createReadStream, createWriteStream, promises as fs } from 'fs'
|
||||
import { parquetRead } from './src/hyparquet.js'
|
||||
|
||||
const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'
const filename = 'benchmark.parquet'

// Download the test parquet file if it is not already on disk.
// `stat` is reassigned after a fresh download so that the size is always known below.
let stat = await fs.stat(filename).catch(() => undefined)
if (!stat) {
  console.log('downloading ' + url)
  const res = await fetch(url)
  if (!res.ok) throw new Error(res.statusText)
  // Stream the response body to disk chunk by chunk
  const writeStream = createWriteStream(filename)
  for await (const chunk of res.body) {
    // write() returns false when the internal buffer is full: wait for 'drain'
    // so a large download is not buffered entirely in memory.
    if (!writeStream.write(chunk)) {
      await new Promise(resolve => writeStream.once('drain', resolve))
    }
  }
  // End the stream and wait for 'finish' so every byte is flushed before reading it back
  await new Promise((resolve, reject) => {
    writeStream.once('error', reject)
    writeStream.once('finish', resolve)
    writeStream.end()
  })
  console.log('download benchmark.parquet')
  // The initial stat failed (file did not exist), so stat again to get the size
  stat = await fs.stat(filename)
}

// asyncBuffer: file-backed object exposing `byteLength` and an async `slice`,
// passed to parquetRead in place of an in-memory ArrayBuffer.
const file = {
  byteLength: stat.size,
  /**
   * Read a byte range of the benchmark file.
   *
   * @param {number} start first byte offset (inclusive)
   * @param {number} [end] end byte offset (exclusive, like ArrayBuffer.slice); defaults to end of file
   * @returns {Promise<ArrayBuffer>}
   */
  async slice(start, end) {
    const stop = end ?? stat.size
    // An empty range cannot be expressed with createReadStream's inclusive `end`
    if (stop <= start) return new ArrayBuffer(0)
    // createReadStream treats `end` as inclusive, slice treats it as exclusive
    const readStream = createReadStream(filename, { start, end: stop - 1 })
    return await readStreamToArrayBuffer(readStream)
  },
}

const startTime = performance.now()
console.log('parsing wikipedia data...')
// read parquet file
await parquetRead({ file })
const ms = performance.now() - startTime
console.log(`parsed ${stat.size.toLocaleString()} bytes in ${ms.toFixed(0)} ms`)
|
||||
|
||||
/**
 * Read a Node.js readable stream to completion and return its bytes as an ArrayBuffer.
 *
 * The stream is consumed through its 'data' / 'end' / 'error' events, and every
 * chunk is expected to be a Buffer (or Uint8Array).
 *
 * @param {import('fs').ReadStream} input stream to drain
 * @returns {Promise<ArrayBuffer>} resolves with exactly the bytes read; rejects on stream error
 */
function readStreamToArrayBuffer(input) {
  return new Promise((resolve, reject) => {
    /** @type {Buffer[]} */
    const chunks = []
    input.on('data', chunk => chunks.push(chunk))
    input.on('end', () => {
      const buffer = Buffer.concat(chunks)
      // A Buffer may be a view into a larger shared ArrayBuffer (Node's allocation pool),
      // so `buffer.buffer` alone can be bigger than the data and start at a non-zero offset.
      // Copy out only the bytes that belong to this Buffer.
      resolve(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength))
    })
    input.on('error', reject)
  })
}
|
||||
7
demo.js
7
demo.js
@ -1,4 +1,4 @@
|
||||
import { parquetMetadata, parquetMetadataAsync, toJson } from './src/hyparquet.js'
|
||||
import { parquetMetadata, parquetMetadataAsync, parquetRead, toJson } from './src/hyparquet.js'
|
||||
|
||||
const dropzone = document.getElementById('dropzone')
|
||||
const layout = document.getElementById('layout')
|
||||
@ -79,6 +79,11 @@ function processFile(file) {
|
||||
const arrayBuffer = e.target.result
|
||||
const metadata = parquetMetadata(arrayBuffer)
|
||||
renderSidebar(arrayBuffer, metadata, file.name)
|
||||
const startTime = performance.now()
|
||||
parquetRead({ file: arrayBuffer, onComplete(data) {
|
||||
const ms = performance.now() - startTime
|
||||
console.log(`parsed ${file.name} in ${ms.toFixed(0)} ms`)
|
||||
} }) // TODO
|
||||
} catch (e) {
|
||||
console.error('Error parsing file', e)
|
||||
dropzone.innerHTML = `<strong>${file.name}</strong>`
|
||||
|
||||
@ -1,3 +1,9 @@
|
||||
/**
|
||||
* The MIT License (MIT)
|
||||
* Copyright (c) 2016 Zhipeng Jia
|
||||
* https://github.com/zhipeng-jia/snappyjs
|
||||
*/
|
||||
|
||||
const WORD_MASK = [0, 0xff, 0xffff, 0xffffff, 0xffffffff]
|
||||
|
||||
/**
|
||||
|
||||
Loading…
Reference in New Issue
Block a user