Return ColumnChunk from writeColumn

Kenny Daniel 2025-11-20 23:03:40 -08:00
parent b2ea21b366
commit 5dadd5f7ef
GPG Key ID: 90AB653A8CAD7E45
5 changed files with 38 additions and 39 deletions
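
In short: writeColumn now assembles the full ColumnChunk itself (the file_offset plus a nested meta_data object) rather than returning a bare ColumnMetaData, so the caller can push the result straight onto the row group's column list. A minimal before/after sketch of the caller side, using names from the diff below (the stats argument is abbreviated and the surrounding loop elided):

// Before: writeColumn returned ColumnMetaData; the caller built the ColumnChunk.
const file_offset = BigInt(this.writer.offset)
const meta_data = writeColumn(this.writer, column, groupData, stats)
columns.push({ file_offset, meta_data })

// After: writeColumn returns the whole ColumnChunk, file_offset included.
const columnChunk = writeColumn(this.writer, column, groupData, stats)
columns.push(columnChunk) // { file_offset, meta_data: { type, encodings, codec, ... } }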

@@ -52,15 +52,15 @@
     "test": "vitest run"
   },
   "dependencies": {
-    "hyparquet": "1.20.2"
+    "hyparquet": "1.22.0"
   },
   "devDependencies": {
     "@babel/eslint-parser": "7.28.5",
     "@types/node": "24.10.1",
-    "@vitest/coverage-v8": "4.0.8",
+    "@vitest/coverage-v8": "4.0.14",
     "eslint": "9.39.1",
-    "eslint-plugin-jsdoc": "61.2.0",
+    "eslint-plugin-jsdoc": "61.4.1",
     "typescript": "5.9.3",
-    "vitest": "4.0.8"
+    "vitest": "4.0.14"
   }
 }

@@ -11,7 +11,7 @@ import { unconvert } from './unconvert.js'
  * @param {ColumnEncoder} column
  * @param {DecodedArray} values
  * @param {boolean} stats
- * @returns {ColumnMetaData}
+ * @returns {ColumnChunk}
  */
 export function writeColumn(writer, column, values, stats) {
   const { columnName, element, schemaPath, compressed } = column
@@ -74,17 +74,20 @@ export function writeColumn(writer, column, values, stats) {
   }
   return {
-    type,
-    encodings,
-    path_in_schema: schemaPath.slice(1).map(s => s.name),
-    codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
-    num_values: BigInt(num_values),
-    total_compressed_size: BigInt(writer.offset - offsetStart),
-    total_uncompressed_size: BigInt(writer.offset - offsetStart), // TODO
-    data_page_offset,
-    dictionary_page_offset,
-    statistics,
-    geospatial_statistics,
+    meta_data: {
+      type,
+      encodings,
+      path_in_schema: schemaPath.slice(1).map(s => s.name),
+      codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
+      num_values: BigInt(num_values),
+      total_compressed_size: BigInt(writer.offset - offsetStart),
+      total_uncompressed_size: BigInt(writer.offset - offsetStart), // TODO
+      data_page_offset,
+      dictionary_page_offset,
+      statistics,
+      geospatial_statistics,
+    },
+    file_offset: BigInt(offsetStart),
   }
 }
@@ -137,7 +140,7 @@ function writeDictionaryPage(writer, column, dictionary) {
 }
 /**
- * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
+ * @import {ColumnChunk, ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
  * @import {ColumnEncoder, PageData, Writer} from '../src/types.js'
  * @param {DecodedArray} values
  * @returns {Statistics}
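
The remaining hunks follow a rename of hyparquet's exported constant tables to plural form (Encoding → Encodings, PageType → PageTypes, ParquetType → ParquetTypes, CompressionCodec → CompressionCodecs, ConvertedType → ConvertedTypes, FieldRepetitionType → FieldRepetitionTypes), consistent with the hyparquet 1.22.0 bump above. The serialization pattern itself is unchanged: each enum name is written as its index in the corresponding array. A tiny sketch of that assumed pattern (the 'PLAIN' value and the ordering of the array by Thrift enum value are assumptions, not shown in the diff):

import { Encodings } from 'hyparquet/src/constants.js'

// Thrift stores enums as integers; the writer uses the name's array index as that integer,
// e.g. 'PLAIN' is expected to map to 0 per the Parquet Encoding enum.
const encodingId = Encodings.indexOf('PLAIN')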

@@ -1,4 +1,4 @@
-import { Encoding, PageType } from 'hyparquet/src/constants.js'
+import { Encodings, PageTypes } from 'hyparquet/src/constants.js'
 import { ByteWriter } from './bytewriter.js'
 import { writeRleBitPackedHybrid } from './encoding.js'
 import { writePlain } from './plain.js'
@@ -85,26 +85,26 @@ export function writeDataPageV2(writer, values, column, encoding, listValues) {
 export function writePageHeader(writer, header) {
   /** @type {import('../src/types.js').ThriftObject} */
   const compact = {
-    field_1: PageType.indexOf(header.type),
+    field_1: PageTypes.indexOf(header.type),
     field_2: header.uncompressed_page_size,
     field_3: header.compressed_page_size,
     field_4: header.crc,
     field_5: header.data_page_header && {
       field_1: header.data_page_header.num_values,
-      field_2: Encoding.indexOf(header.data_page_header.encoding),
-      field_3: Encoding.indexOf(header.data_page_header.definition_level_encoding),
-      field_4: Encoding.indexOf(header.data_page_header.repetition_level_encoding),
+      field_2: Encodings.indexOf(header.data_page_header.encoding),
+      field_3: Encodings.indexOf(header.data_page_header.definition_level_encoding),
+      field_4: Encodings.indexOf(header.data_page_header.repetition_level_encoding),
       // field_5: header.data_page_header.statistics,
     },
     field_7: header.dictionary_page_header && {
       field_1: header.dictionary_page_header.num_values,
-      field_2: Encoding.indexOf(header.dictionary_page_header.encoding),
+      field_2: Encodings.indexOf(header.dictionary_page_header.encoding),
     },
     field_8: header.data_page_header_v2 && {
       field_1: header.data_page_header_v2.num_values,
       field_2: header.data_page_header_v2.num_nulls,
       field_3: header.data_page_header_v2.num_rows,
-      field_4: Encoding.indexOf(header.data_page_header_v2.encoding),
+      field_4: Encodings.indexOf(header.data_page_header_v2.encoding),
       field_5: header.data_page_header_v2.definition_levels_byte_length,
       field_6: header.data_page_header_v2.repetition_levels_byte_length,
       field_7: header.data_page_header_v2.is_compressed ? undefined : false, // default true

@@ -1,5 +1,5 @@
 import { getSchemaPath } from 'hyparquet/src/schema.js'
-import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from 'hyparquet/src/constants.js'
+import { CompressionCodecs, ConvertedTypes, Encodings, FieldRepetitionTypes, PageTypes, ParquetTypes } from 'hyparquet/src/constants.js'
 import { serializeTCompactProtocol } from './thrift.js'
 import { unconvertStatistics } from './unconvert.js'
@@ -14,12 +14,12 @@ export function writeMetadata(writer, metadata) {
   const compact = {
     field_1: metadata.version,
     field_2: metadata.schema && metadata.schema.map(element => ({
-      field_1: element.type && ParquetType.indexOf(element.type),
+      field_1: element.type && ParquetTypes.indexOf(element.type),
       field_2: element.type_length,
-      field_3: element.repetition_type && FieldRepetitionType.indexOf(element.repetition_type),
+      field_3: element.repetition_type && FieldRepetitionTypes.indexOf(element.repetition_type),
       field_4: element.name,
       field_5: element.num_children,
-      field_6: element.converted_type && ConvertedType.indexOf(element.converted_type),
+      field_6: element.converted_type && ConvertedTypes.indexOf(element.converted_type),
       field_7: element.scale,
       field_8: element.precision,
       field_9: element.field_id,
@@ -31,10 +31,10 @@ export function writeMetadata(writer, metadata) {
         field_1: c.file_path,
         field_2: c.file_offset,
         field_3: c.meta_data && {
-          field_1: ParquetType.indexOf(c.meta_data.type),
-          field_2: c.meta_data.encodings.map(e => Encoding.indexOf(e)),
+          field_1: ParquetTypes.indexOf(c.meta_data.type),
+          field_2: c.meta_data.encodings.map(e => Encodings.indexOf(e)),
           field_3: c.meta_data.path_in_schema,
-          field_4: CompressionCodec.indexOf(c.meta_data.codec),
+          field_4: CompressionCodecs.indexOf(c.meta_data.codec),
           field_5: c.meta_data.num_values,
           field_6: c.meta_data.total_uncompressed_size,
           field_7: c.meta_data.total_compressed_size,
@@ -50,8 +50,8 @@
            schemaElement(metadata.schema, c.meta_data.path_in_schema, columnIndex + 1)
          ),
          field_13: c.meta_data.encoding_stats && c.meta_data.encoding_stats.map(es => ({
-           field_1: PageType.indexOf(es.page_type),
-           field_2: Encoding.indexOf(es.encoding),
+           field_1: PageTypes.indexOf(es.page_type),
+           field_2: Encodings.indexOf(es.encoding),
            field_3: es.count,
          })),
          field_14: c.meta_data.bloom_filter_offset,

@@ -74,8 +74,7 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
       compressed: this.compressed,
     }
-    const file_offset = BigInt(this.writer.offset)
-    const meta_data = writeColumn(
+    const columnChunk = writeColumn(
       this.writer,
       column,
       groupData,
@@ -83,10 +82,7 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
     )
     // save column chunk metadata
-    columns.push({
-      file_offset,
-      meta_data,
-    })
+    columns.push(columnChunk)
   }
   this.num_rows += BigInt(groupSize)