Mirror of https://github.com/asadbek064/hyparquet.git, synced 2025-12-28 07:56:38 +00:00
for is faster than forEach
This commit is contained in:
parent 97aef2adef
commit f37b2aea9f
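The commit title states a performance claim without demonstrating it. As a rough illustration only (our sketch, not part of the commit; absolute numbers vary by engine, array size, and warm-up):

```javascript
// Micro-benchmark sketch (not from the repo): plain `for` vs Array#forEach
// on the same summation workload. Run with Node; timings are indicative only.
const data = Array.from({ length: 1_000_000 }, (_, i) => i)

let sum = 0
let t0 = performance.now()
for (let i = 0; i < data.length; i++) sum += data[i]
console.log(`for:     ${(performance.now() - t0).toFixed(2)} ms`)

sum = 0
t0 = performance.now()
data.forEach(x => { sum += x })
console.log(`forEach: ${(performance.now() - t0).toFixed(2)} ms`)
```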
@@ -165,8 +165,8 @@ const file = await asyncBufferFromUrl({ url, requestInit })
 
 ### Returned row format
 
-By default, data returned in the `onComplete` function will be one array of columns per row.
-If you would like each row to be an object with each key the name of the column, set the option `rowFormat` to `object`.
+By default, data returned by `parquetRead` in the `onComplete` function will be one **array** of columns per row.
+If you would like each row to be an **object** with each key the name of the column, set the option `rowFormat` to `object`.
 
 ```javascript
 import { parquetRead } from 'hyparquet'
@@ -178,6 +178,8 @@ await parquetRead({
 })
 ```
 
+The `parquetReadObjects` function defaults to returning an array of objects.
+
 ## Supported Parquet Files
 
 The parquet format is known to be a sprawling format which includes options for a wide array of compression schemes, encoding types, and data structures.
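As a usage sketch of the `rowFormat` option described above (our example, not part of the diff; the column names and local file are hypothetical, and the named exports follow the repo README):

```javascript
import { asyncBufferFromFile, parquetRead, parquetReadObjects } from 'hyparquet'

const file = await asyncBufferFromFile('example.parquet') // hypothetical local file

// Default: each row arrives as an array of column values.
await parquetRead({
  file,
  onComplete: rows => console.log(rows[0]), // e.g. [1, 'Alice']
})

// rowFormat 'object': each row is keyed by column name.
await parquetRead({
  file,
  rowFormat: 'object',
  onComplete: rows => console.log(rows[0]), // e.g. { id: 1, name: 'Alice' }
})

// parquetReadObjects resolves directly to an array of objects.
const rows = await parquetReadObjects({ file })
```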
@@ -1,7 +1,7 @@
 import { createWriteStream, promises as fs } from 'fs'
 import { compressors } from 'hyparquet-compressors'
 import { pipeline } from 'stream/promises'
-import { parquetRead } from './src/hyparquet.js'
+import { parquetReadObjects } from './src/hyparquet.js'
 import { asyncBufferFromFile } from './src/utils.js'
 
 const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'
@@ -25,7 +25,7 @@ const startTime = performance.now()
 console.log('parsing example.parquet data...')
 
 // read parquet file
-await parquetRead({
+await parquetReadObjects({
   file,
   compressors,
 })
10 package.json

@@ -19,8 +19,8 @@
   },
   "main": "src/hyparquet.js",
   "files": [
-    "types",
-    "src"
+    "src",
+    "types"
   ],
   "type": "module",
   "types": "types/hyparquet.d.ts",
@@ -34,12 +34,12 @@
   },
   "devDependencies": {
     "@types/node": "22.13.10",
-    "@vitest/coverage-v8": "3.0.8",
+    "@vitest/coverage-v8": "3.0.9",
     "eslint": "9.22.0",
-    "eslint-plugin-jsdoc": "50.6.6",
+    "eslint-plugin-jsdoc": "50.6.8",
     "hyparquet-compressors": "1.0.0",
     "typescript": "5.8.2",
     "typescript-eslint": "8.26.1",
-    "vitest": "3.0.8"
+    "vitest": "3.0.9"
   }
 }
33 src/read.js

@@ -71,17 +71,16 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
 
   // loop through metadata to find min/max bytes to read
   let [groupStartByte, groupEndByte] = [file.byteLength, 0]
-  rowGroup.columns.forEach(({ meta_data: columnMetadata }) => {
-    if (!columnMetadata) throw new Error('parquet column metadata is undefined')
+  for (const { meta_data } of rowGroup.columns) {
+    if (!meta_data) throw new Error('parquet column metadata is undefined')
     // skip columns that are not requested
-    if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return
-
-    const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
+    if (columns && !columns.includes(meta_data.path_in_schema[0])) continue
+    const [columnStartByte, columnEndByte] = getColumnRange(meta_data).map(Number)
     groupStartByte = Math.min(groupStartByte, columnStartByte)
     groupEndByte = Math.max(groupEndByte, columnEndByte)
-  })
+  }
   if (groupStartByte >= groupEndByte && columns?.length) {
     // TODO: should throw if any column is missing
     throw new Error(`parquet columns not found: ${columns.join(', ')}`)
   }
   // if row group size is less than 32mb, pre-load in one read
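A note on the rewrite above (commentary, not part of the commit): inside a `forEach` callback, `return` only skips the current element, so the `for...of` equivalent is `continue`; the loop form also avoids one function invocation per column and permits `break`. A minimal sketch:

```javascript
const xs = [1, 2, 3, 4]

// forEach: `return` exits only the current callback (acts like `continue`);
// there is no way to break out of the loop early.
xs.forEach(x => {
  if (x % 2) return // skip odd values
  console.log('forEach', x)
})

// for...of: `continue` skips, `break` can end the loop early, and no
// callback is invoked per element.
for (const x of xs) {
  if (x % 2) continue // skip odd values
  console.log('for...of', x)
}
```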
@@ -148,16 +147,12 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
       if (subcolumns?.every(name => subcolumnData.has(name))) {
         // For every subcolumn, flatten and assemble the column
         const flatData = new Map(subcolumns.map(name => [name, flatten(subcolumnData.get(name))]))
-        // We have all data needed to assemble a top level column
         assembleNested(flatData, schemaPath[1])
         const flatColumn = flatData.get(columnName)
-        if (flatColumn) {
-          chunks = [flatColumn]
-          subcolumns.forEach(name => subcolumnData.delete(name))
-          subcolumnData.set(columnName, chunks)
-        } else {
-          throw new Error(`parquet column data not assembled: ${columnName}`)
-        }
+        if (!flatColumn) throw new Error(`parquet column data not assembled: ${columnName}`)
+        chunks = [flatColumn]
+        subcolumns.forEach(name => subcolumnData.delete(name))
+        subcolumnData.set(columnName, chunks)
       }
 
       // do not emit column data until structs are fully parsed
@@ -175,8 +170,6 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
   }
   await Promise.all(promises)
   if (options.onComplete) {
-    // transpose columns into rows
-    const groupData = new Array(rowLimit)
     const includedColumnNames = children
       .map(child => child.element.name)
       .filter(name => !columns || columns.includes(name))
@@ -184,14 +177,16 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
     const includedColumns = columnOrder
       .map(name => includedColumnNames.includes(name) ? flatten(subcolumnData.get(name)) : undefined)
 
+    // transpose columns into rows
+    const groupData = new Array(rowLimit)
     for (let row = 0; row < rowLimit; row++) {
       if (options.rowFormat === 'object') {
         // return each row as an object
         /** @type {Record<string, any>} */
         const rowData = {}
-        columnOrder.forEach((name, index) => {
-          rowData[name] = includedColumns[index]?.[row]
-        })
+        for (let i = 0; i < columnOrder.length; i++) {
+          rowData[columnOrder[i]] = includedColumns[i]?.[row]
+        }
         groupData[row] = rowData
       } else {
         // return each row as an array
@@ -160,7 +160,7 @@ function readVarBigInt(reader) {
  * @param {DataReader} reader
  * @returns {number} value
  */
-function readZigZag(reader) {
+export function readZigZag(reader) {
   const zigzag = readVarInt(reader)
   // convert zigzag to int
   return zigzag >>> 1 ^ -(zigzag & 1)
@@ -176,7 +176,7 @@ function readZigZag(reader) {
 export function readZigZagBigInt(reader) {
   const zigzag = readVarBigInt(reader)
   // convert zigzag to int
-  return zigzag >> BigInt(1) ^ -(zigzag & BigInt(1))
+  return zigzag >> 1n ^ -(zigzag & 1n)
 }
 
 /**
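For context on the zigzag decoders above (our addition, not part of the diff): zigzag encoding interleaves signed integers onto unsigned ones so small magnitudes encode to small varints: 0→0, -1→1, 1→2, -2→3. The expression `zigzag >>> 1 ^ -(zigzag & 1)` inverts that mapping. A round-trip sketch:

```javascript
// Sketch (not from the repo): 32-bit zigzag encode/decode round trip.
const zigzagEncode = n => (n << 1) ^ (n >> 31) // signed -> unsigned
const zigzagDecode = z => z >>> 1 ^ -(z & 1)   // unsigned -> signed

for (const n of [0, -1, 1, -2, 2, 123, -456]) {
  const z = zigzagEncode(n)
  console.log(n, '->', z, '->', zigzagDecode(z)) // decode(encode(n)) === n
}
```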