diff --git a/package.json b/package.json
index f9516b8..36c88c6 100644
--- a/package.json
+++ b/package.json
@@ -52,15 +52,15 @@
     "test": "vitest run"
   },
   "dependencies": {
-    "hyparquet": "1.20.2"
+    "hyparquet": "1.22.0"
   },
   "devDependencies": {
     "@babel/eslint-parser": "7.28.5",
     "@types/node": "24.10.1",
-    "@vitest/coverage-v8": "4.0.8",
+    "@vitest/coverage-v8": "4.0.14",
     "eslint": "9.39.1",
-    "eslint-plugin-jsdoc": "61.2.0",
+    "eslint-plugin-jsdoc": "61.4.1",
     "typescript": "5.9.3",
-    "vitest": "4.0.8"
+    "vitest": "4.0.14"
   }
 }
diff --git a/src/column.js b/src/column.js
index 5390a3a..5cecea3 100644
--- a/src/column.js
+++ b/src/column.js
@@ -11,7 +11,7 @@ import { unconvert } from './unconvert.js'
  * @param {ColumnEncoder} column
  * @param {DecodedArray} values
  * @param {boolean} stats
- * @returns {ColumnMetaData}
+ * @returns {ColumnChunk}
  */
 export function writeColumn(writer, column, values, stats) {
   const { columnName, element, schemaPath, compressed } = column
@@ -74,17 +74,20 @@ export function writeColumn(writer, column, values, stats) {
   }
 
   return {
-    type,
-    encodings,
-    path_in_schema: schemaPath.slice(1).map(s => s.name),
-    codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
-    num_values: BigInt(num_values),
-    total_compressed_size: BigInt(writer.offset - offsetStart),
-    total_uncompressed_size: BigInt(writer.offset - offsetStart), // TODO
-    data_page_offset,
-    dictionary_page_offset,
-    statistics,
-    geospatial_statistics,
+    meta_data: {
+      type,
+      encodings,
+      path_in_schema: schemaPath.slice(1).map(s => s.name),
+      codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
+      num_values: BigInt(num_values),
+      total_compressed_size: BigInt(writer.offset - offsetStart),
+      total_uncompressed_size: BigInt(writer.offset - offsetStart), // TODO
+      data_page_offset,
+      dictionary_page_offset,
+      statistics,
+      geospatial_statistics,
+    },
+    file_offset: BigInt(offsetStart),
   }
 }
 
@@ -137,7 +140,7 @@ function writeDictionaryPage(writer, column, dictionary) {
 }
 
 /**
- * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
+ * @import {ColumnChunk, ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
  * @import {ColumnEncoder, PageData, Writer} from '../src/types.js'
  * @param {DecodedArray} values
  * @returns {Statistics}
diff --git a/src/datapage.js b/src/datapage.js
index 73d1887..8717547 100644
--- a/src/datapage.js
+++ b/src/datapage.js
@@ -1,4 +1,4 @@
-import { Encoding, PageType } from 'hyparquet/src/constants.js'
+import { Encodings, PageTypes } from 'hyparquet/src/constants.js'
 import { ByteWriter } from './bytewriter.js'
 import { writeRleBitPackedHybrid } from './encoding.js'
 import { writePlain } from './plain.js'
@@ -85,26 +85,26 @@ export function writeDataPageV2(writer, values, column, encoding, listValues) {
 export function writePageHeader(writer, header) {
   /** @type {import('../src/types.js').ThriftObject} */
   const compact = {
-    field_1: PageType.indexOf(header.type),
+    field_1: PageTypes.indexOf(header.type),
     field_2: header.uncompressed_page_size,
     field_3: header.compressed_page_size,
     field_4: header.crc,
     field_5: header.data_page_header && {
       field_1: header.data_page_header.num_values,
-      field_2: Encoding.indexOf(header.data_page_header.encoding),
-      field_3: Encoding.indexOf(header.data_page_header.definition_level_encoding),
-      field_4: Encoding.indexOf(header.data_page_header.repetition_level_encoding),
+      field_2: Encodings.indexOf(header.data_page_header.encoding),
+      field_3: Encodings.indexOf(header.data_page_header.definition_level_encoding),
+      field_4: Encodings.indexOf(header.data_page_header.repetition_level_encoding),
       // field_5: header.data_page_header.statistics,
     },
     field_7: header.dictionary_page_header && {
       field_1: header.dictionary_page_header.num_values,
-      field_2: Encoding.indexOf(header.dictionary_page_header.encoding),
+      field_2: Encodings.indexOf(header.dictionary_page_header.encoding),
     },
     field_8: header.data_page_header_v2 && {
       field_1: header.data_page_header_v2.num_values,
       field_2: header.data_page_header_v2.num_nulls,
       field_3: header.data_page_header_v2.num_rows,
-      field_4: Encoding.indexOf(header.data_page_header_v2.encoding),
+      field_4: Encodings.indexOf(header.data_page_header_v2.encoding),
       field_5: header.data_page_header_v2.definition_levels_byte_length,
       field_6: header.data_page_header_v2.repetition_levels_byte_length,
       field_7: header.data_page_header_v2.is_compressed ? undefined : false, // default true
diff --git a/src/metadata.js b/src/metadata.js
index 11cc8df..dcf692e 100644
--- a/src/metadata.js
+++ b/src/metadata.js
@@ -1,5 +1,5 @@
 import { getSchemaPath } from 'hyparquet/src/schema.js'
-import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from 'hyparquet/src/constants.js'
+import { CompressionCodecs, ConvertedTypes, Encodings, FieldRepetitionTypes, PageTypes, ParquetTypes } from 'hyparquet/src/constants.js'
 import { serializeTCompactProtocol } from './thrift.js'
 import { unconvertStatistics } from './unconvert.js'
 
@@ -14,12 +14,12 @@ export function writeMetadata(writer, metadata) {
   const compact = {
     field_1: metadata.version,
     field_2: metadata.schema && metadata.schema.map(element => ({
-      field_1: element.type && ParquetType.indexOf(element.type),
+      field_1: element.type && ParquetTypes.indexOf(element.type),
       field_2: element.type_length,
-      field_3: element.repetition_type && FieldRepetitionType.indexOf(element.repetition_type),
+      field_3: element.repetition_type && FieldRepetitionTypes.indexOf(element.repetition_type),
       field_4: element.name,
       field_5: element.num_children,
-      field_6: element.converted_type && ConvertedType.indexOf(element.converted_type),
+      field_6: element.converted_type && ConvertedTypes.indexOf(element.converted_type),
       field_7: element.scale,
       field_8: element.precision,
       field_9: element.field_id,
@@ -31,10 +31,10 @@
         field_1: c.file_path,
         field_2: c.file_offset,
         field_3: c.meta_data && {
-          field_1: ParquetType.indexOf(c.meta_data.type),
-          field_2: c.meta_data.encodings.map(e => Encoding.indexOf(e)),
+          field_1: ParquetTypes.indexOf(c.meta_data.type),
+          field_2: c.meta_data.encodings.map(e => Encodings.indexOf(e)),
           field_3: c.meta_data.path_in_schema,
-          field_4: CompressionCodec.indexOf(c.meta_data.codec),
+          field_4: CompressionCodecs.indexOf(c.meta_data.codec),
           field_5: c.meta_data.num_values,
           field_6: c.meta_data.total_uncompressed_size,
           field_7: c.meta_data.total_compressed_size,
@@ -50,8 +50,8 @@
             schemaElement(metadata.schema, c.meta_data.path_in_schema, columnIndex + 1)
           ),
           field_13: c.meta_data.encoding_stats && c.meta_data.encoding_stats.map(es => ({
-            field_1: PageType.indexOf(es.page_type),
-            field_2: Encoding.indexOf(es.encoding),
+            field_1: PageTypes.indexOf(es.page_type),
+            field_2: Encodings.indexOf(es.encoding),
             field_3: es.count,
           })),
           field_14: c.meta_data.bloom_filter_offset,
diff --git a/src/parquet-writer.js b/src/parquet-writer.js
index 64da80d..c940522 100644
--- a/src/parquet-writer.js
+++ b/src/parquet-writer.js
@@ -74,8 +74,7 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
       compressed: this.compressed,
     }
 
-    const file_offset = BigInt(this.writer.offset)
-    const meta_data = writeColumn(
+    const columnChunk = writeColumn(
       this.writer,
       column,
       groupData,
@@ -83,10 +82,7 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
     )
 
     // save column chunk metadata
-    columns.push({
-      file_offset,
-      meta_data,
-    })
+    columns.push(columnChunk)
   }
 
   this.num_rows += BigInt(groupSize)