2024-07-26 22:01:01 +00:00
|
|
|
import { createWriteStream, promises as fs } from 'fs'
|
2024-07-02 23:33:42 +00:00
|
|
|
import { compressors } from 'hyparquet-compressors'
|
2024-07-23 14:08:14 +00:00
|
|
|
import { pipeline } from 'stream/promises'
|
2025-05-30 22:47:02 +00:00
|
|
|
import { parquetReadObjects } from './src/index.js'
|
|
|
|
|
import { asyncBufferFromFile } from './src/node.js'
|
2024-02-04 20:31:57 +00:00
|
|
|
|
|
|
|
|
const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'
const filename = 'example.parquet'

// Download the test parquet file unless it is already cached on disk.
// Only a missing file triggers a download; any other stat failure
// (e.g. a permissions error) is rethrown instead of being hidden.
let stat = await fs.stat(filename).catch(err => {
  if (err.code === 'ENOENT') return undefined
  throw err
})
if (!stat) {
  console.log('downloading ' + url)
  const res = await fetch(url)
  // statusText can be empty, so include the numeric status as well
  if (!res.ok) throw new Error(`download failed: ${res.status} ${res.statusText}`)
  if (!res.body) throw new Error('download failed: response has no body')
  // Stream into a temporary file and rename it only once the transfer has
  // completed, so an interrupted download never leaves a truncated file
  // that a later run would accept as a valid cache.
  const partial = filename + '.download'
  try {
    await pipeline(res.body, createWriteStream(partial))
    await fs.rename(partial, filename)
  } catch (err) {
    // best-effort cleanup of the incomplete file, then surface the failure
    await fs.rm(partial, { force: true })
    throw err
  }
  stat = await fs.stat(filename)
  console.log('downloaded example.parquet', stat.size)
}
|
2024-02-21 22:16:51 +00:00
|
|
|
|
2024-02-04 20:31:57 +00:00
|
|
|
// Open the downloaded file through the node helper so it can be handed
// to the parquet reader as its `file` input.
const file = await asyncBufferFromFile(filename)

// Start the clock before announcing the parse, then time the whole read.
const startTime = performance.now()
console.log('parsing example.parquet data...')

// Read the parquet file using the codecs from hyparquet-compressors.
// The returned value is discarded: only the elapsed time is reported.
const readOptions = { file, compressors }
await parquetReadObjects(readOptions)

// Report the file size and the elapsed wall-clock time in whole milliseconds.
const ms = performance.now() - startTime
const byteCount = stat.size.toLocaleString()
const elapsed = ms.toFixed(0)
console.log(`parsed ${byteCount} bytes in ${elapsed} ms`)
|