diff --git a/.gitignore b/.gitignore
index a9dd7ea..cc462ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ coverage
 dist
 *.tgz
 example.parquet
+benchmark.parquet
diff --git a/benchmark.js b/benchmark.js
new file mode 100644
index 0000000..e0da8af
--- /dev/null
+++ b/benchmark.js
@@ -0,0 +1,51 @@
+import { createReadStream, createWriteStream, promises as fs } from 'fs'
+import { pipeline } from 'stream/promises'
+import { parquetRead } from './src/hyparquet.js'
+
+const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'
+
+// download test parquet file if needed
+let stat = await fs.stat('benchmark.parquet').catch(() => undefined)
+if (!stat) {
+  console.log(`downloading ${url}`)
+  const res = await fetch(url)
+  if (!res.ok) throw new Error(res.statusText)
+  // stream the body to disk; pipeline honors backpressure and resolves only
+  // once the file is fully flushed and closed (or rejects on any stream error)
+  await pipeline(res.body, createWriteStream('benchmark.parquet'))
+  console.log('download benchmark.parquet')
+  // the file did not exist at the first stat, so read its size now
+  stat = await fs.stat('benchmark.parquet')
+}
+const byteLength = stat.size
+// asyncBuffer: same contract as ArrayBuffer.slice for non-negative offsets (end is exclusive and optional)
+const file = {
+  byteLength,
+  async slice(start, end = byteLength) {
+    if (end <= start) return new ArrayBuffer(0)
+    // createReadStream treats `end` as inclusive, so stop one byte earlier
+    const readStream = createReadStream('benchmark.parquet', { start, end: end - 1 })
+    return readStreamToArrayBuffer(readStream)
+  },
+}
+const startTime = performance.now()
+console.log('parsing wikipedia data...')
+// read parquet file
+await parquetRead({ file })
+const ms = performance.now() - startTime
+console.log(`parsed ${stat.size.toLocaleString()} bytes in ${ms.toFixed(0)} ms`)
+
+/**
+ * Convert a Node.js readable stream to an ArrayBuffer.
+ *
+ * @param {import('fs').ReadStream} input stream to read until it ends
+ * @returns {Promise<ArrayBuffer>} the bytes read, in an exactly-sized ArrayBuffer
+ */
+async function readStreamToArrayBuffer(input) {
+  const chunks = []
+  for await (const chunk of input) chunks.push(chunk)
+  // Buffer.concat may return a view into Node's shared allocation pool, whose
+  // .buffer is larger than the data and offset from zero. Copy into a fresh
+  // typed array so the returned ArrayBuffer holds exactly the bytes read.
+  return new Uint8Array(Buffer.concat(chunks)).buffer
+}
diff --git a/demo.js b/demo.js
index b7c5f7c..dbc2cfe 100644
--- a/demo.js
+++ b/demo.js
@@ -1,4 +1,4 @@
-import { parquetMetadata, parquetMetadataAsync, toJson } from './src/hyparquet.js'
+import { parquetMetadata, parquetMetadataAsync, parquetRead, toJson } from './src/hyparquet.js'
 
 const dropzone = document.getElementById('dropzone')
 const layout = document.getElementById('layout')
@@ -79,6 +79,11 @@ function processFile(file) {
       const arrayBuffer = e.target.result
       const metadata = parquetMetadata(arrayBuffer)
       renderSidebar(arrayBuffer, metadata, file.name)
+      const startTime = performance.now()
+      parquetRead({ file: arrayBuffer, onComplete(data) {
+        const ms = performance.now() - startTime
+        console.log(`parsed ${file.name} in ${ms.toFixed(0)} ms`)
+      } }) // TODO
     } catch (e) {
       console.error('Error parsing file', e)
       dropzone.innerHTML = `${file.name}`
diff --git a/src/snappy.js b/src/snappy.js
index aedccce..5e7324c 100644
--- a/src/snappy.js
+++ b/src/snappy.js
@@ -1,3 +1,9 @@
+/**
+ * The MIT License (MIT)
+ * Copyright (c) 2016 Zhipeng Jia
+ * https://github.com/zhipeng-jia/snappyjs
+ */
+
 const WORD_MASK = [0, 0xff, 0xffff, 0xffffff, 0xffffffff]
 
 /**