diff --git a/benchmark.js b/benchmark.js index 817b5e2..1f0f4be 100644 --- a/benchmark.js +++ b/benchmark.js @@ -1,7 +1,7 @@ import { createWriteStream, promises as fs } from 'fs' import { pipeline } from 'stream/promises' import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects, parquetSchema } from 'hyparquet' -import { parquetWriteFile } from './src/write.js' +import { parquetWriteFile } from './src/node.js' const url = 'https://s3.hyperparam.app/tpch-lineitem-v2.parquet' const filename = 'data/tpch-lineitem-v2.parquet' @@ -28,7 +28,7 @@ const metadata = await parquetMetadataAsync(file) const rows = await parquetReadObjects({ file, metadata, - columns: ['l_comment'], + // columns: ['l_comment'], rowStart: 0, rowEnd: 100_000, }) @@ -42,7 +42,7 @@ const columnData = schema.children.map(({ element }) => ({ // type: element.type, ...element, data: [], -})).filter(({ name }) => name === 'l_comment') +})) // .filter(({ name }) => name === 'l_comment') for (const row of rows) { for (const { name, data } of columnData) { data.push(row[name]) diff --git a/src/datapage.js b/src/datapage.js index 0d3b996..fa47406 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -28,10 +28,12 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp // write page data to temp buffer const page = new ByteWriter() if (encoding === 'RLE_DICTIONARY') { - const maxValue = Math.max(...nonnull) + // find max bitwidth + let maxValue = 0 + for (const v of values) if (v > maxValue) maxValue = v const bitWidth = Math.ceil(Math.log2(maxValue + 1)) - page.appendUint8(bitWidth) - writeRleBitPackedHybrid(page, nonnull) + page.appendUint8(bitWidth) // prepend bitWidth + writeRleBitPackedHybrid(page, nonnull, bitWidth) } else { writePlain(page, nonnull, type) } @@ -115,7 +117,7 @@ function writeLevels(writer, schemaPath, values) { const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) let repetition_levels_byte_length = 0 if (maxRepetitionLevel) { - repetition_levels_byte_length = writeRleBitPackedHybrid(writer, []) + repetition_levels_byte_length = writeRleBitPackedHybrid(writer, [], 0) } // definition levels @@ -131,7 +133,8 @@ function writeLevels(writer, schemaPath, values) { definitionLevels.push(maxDefinitionLevel) } } - definition_levels_byte_length = writeRleBitPackedHybrid(writer, definitionLevels) + const bitWidth = Math.ceil(Math.log2(maxDefinitionLevel + 1)) + definition_levels_byte_length = writeRleBitPackedHybrid(writer, definitionLevels, bitWidth) } return { definition_levels_byte_length, repetition_levels_byte_length, num_nulls } } diff --git a/src/encoding.js b/src/encoding.js index cab7b71..d633ca9 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -5,16 +5,11 @@ import { ByteWriter } from './bytewriter.js' * @import {Writer} from '../src/types.js' * @param {Writer} writer * @param {DecodedArray} values + * @param {number} bitWidth * @returns {number} bytes written */ -export function writeRleBitPackedHybrid(writer, values) { +export function writeRleBitPackedHybrid(writer, values, bitWidth) { const offsetStart = writer.offset - // find max bitwidth - let max = 0 - for (const v of values) { - if (v > max) max = v - } - const bitWidth = Math.ceil(Math.log2(max + 1)) // try both RLE and bit-packed and choose the best const rle = new ByteWriter() diff --git a/test/encoding.test.js b/test/encoding.test.js index 7aa134e..7f59734 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -14,7 +14,7 @@ function roundTripDeserialize(values) { // Serialize the values using writeRleBitPackedHybrid const writer = new ByteWriter() - writeRleBitPackedHybrid(writer, values) + writeRleBitPackedHybrid(writer, values, bitWidth) const buffer = writer.getBuffer() const reader = { view: new DataView(buffer), offset: 0 }