Mirror of https://github.com/asadbek064/hyparquet-writer.git, synced 2025-12-05 23:31:54 +00:00
Pass bitWidth to writeRleBitPackedHybrid to avoid re-scanning data
commit 5a3d6e8d3f (parent 12a12edfe8)
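This change moves the bit-width scan out of writeRleBitPackedHybrid and into its callers: the data page writer finds the maximum dictionary index once, derives the bit width, and passes it in, while the level writers derive it from the maximum definition or repetition level. Below is a minimal sketch of the new calling convention; the module paths, sample data, and variable names are assumptions for illustration, not code from this commit.

// Caller computes the bit width once and hands it to the encoder,
// so writeRleBitPackedHybrid no longer re-scans the values itself.
import { ByteWriter } from './src/bytewriter.js'            // path assumed
import { writeRleBitPackedHybrid } from './src/encoding.js' // path assumed

const indexes = [0, 1, 1, 2, 5, 3] // e.g. dictionary indexes for one page

// smallest bit width that can represent the largest value (max 0 => width 0)
let max = 0
for (const v of indexes) if (v > max) max = v
const bitWidth = Math.ceil(Math.log2(max + 1))

const page = new ByteWriter()
page.appendUint8(bitWidth) // RLE_DICTIONARY pages prepend the bit width byte
const bytesWritten = writeRleBitPackedHybrid(page, indexes, bitWidth)

Hoisting the scan matters most for the level encodings, where the caller already knows the maximum level and no scan is needed at all: the diff below passes bit width 0 for the empty repetition levels and derives the definition-level bit width from maxDefinitionLevel.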
@@ -1,7 +1,7 @@
 import { createWriteStream, promises as fs } from 'fs'
 import { pipeline } from 'stream/promises'
 import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects, parquetSchema } from 'hyparquet'
-import { parquetWriteFile } from './src/write.js'
+import { parquetWriteFile } from './src/node.js'
 
 const url = 'https://s3.hyperparam.app/tpch-lineitem-v2.parquet'
 const filename = 'data/tpch-lineitem-v2.parquet'
@@ -28,7 +28,7 @@ const metadata = await parquetMetadataAsync(file)
 const rows = await parquetReadObjects({
   file,
   metadata,
-  columns: ['l_comment'],
+  // columns: ['l_comment'],
   rowStart: 0,
   rowEnd: 100_000,
 })
@@ -42,7 +42,7 @@ const columnData = schema.children.map(({ element }) => ({
   // type: element.type,
   ...element,
   data: [],
-})).filter(({ name }) => name === 'l_comment')
+})) // .filter(({ name }) => name === 'l_comment')
 for (const row of rows) {
   for (const { name, data } of columnData) {
     data.push(row[name])

@@ -28,10 +28,12 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
   // write page data to temp buffer
   const page = new ByteWriter()
   if (encoding === 'RLE_DICTIONARY') {
-    const maxValue = Math.max(...nonnull)
+    // find max bitwidth
+    let maxValue = 0
+    for (const v of values) if (v > maxValue) maxValue = v
     const bitWidth = Math.ceil(Math.log2(maxValue + 1))
-    page.appendUint8(bitWidth)
-    writeRleBitPackedHybrid(page, nonnull)
+    page.appendUint8(bitWidth) // prepend bitWidth
+    writeRleBitPackedHybrid(page, nonnull, bitWidth)
   } else {
     writePlain(page, nonnull, type)
   }
@@ -115,7 +117,7 @@ function writeLevels(writer, schemaPath, values) {
   const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
   let repetition_levels_byte_length = 0
   if (maxRepetitionLevel) {
-    repetition_levels_byte_length = writeRleBitPackedHybrid(writer, [])
+    repetition_levels_byte_length = writeRleBitPackedHybrid(writer, [], 0)
   }
 
   // definition levels
@@ -131,7 +133,8 @@ function writeLevels(writer, schemaPath, values) {
         definitionLevels.push(maxDefinitionLevel)
       }
     }
-    definition_levels_byte_length = writeRleBitPackedHybrid(writer, definitionLevels)
+    const bitWidth = Math.ceil(Math.log2(maxDefinitionLevel + 1))
+    definition_levels_byte_length = writeRleBitPackedHybrid(writer, definitionLevels, bitWidth)
   }
   return { definition_levels_byte_length, repetition_levels_byte_length, num_nulls }
 }

@@ -5,16 +5,11 @@ import { ByteWriter } from './bytewriter.js'
  * @import {Writer} from '../src/types.js'
  * @param {Writer} writer
  * @param {DecodedArray} values
+ * @param {number} bitWidth
  * @returns {number} bytes written
  */
-export function writeRleBitPackedHybrid(writer, values) {
+export function writeRleBitPackedHybrid(writer, values, bitWidth) {
   const offsetStart = writer.offset
-  // find max bitwidth
-  let max = 0
-  for (const v of values) {
-    if (v > max) max = v
-  }
-  const bitWidth = Math.ceil(Math.log2(max + 1))
 
   // try both RLE and bit-packed and choose the best
   const rle = new ByteWriter()

@@ -14,7 +14,7 @@ function roundTripDeserialize(values) {
 
   // Serialize the values using writeRleBitPackedHybrid
   const writer = new ByteWriter()
-  writeRleBitPackedHybrid(writer, values)
+  writeRleBitPackedHybrid(writer, values, bitWidth)
   const buffer = writer.getBuffer()
   const reader = { view: new DataView(buffer), offset: 0 }
 