Mirror of https://github.com/asadbek064/hyparquet-writer.git (synced 2025-12-05 23:31:54 +00:00)

Return ColumnChunk from writeColumn
parent b2ea21b366
commit 5dadd5f7ef

@@ -52,15 +52,15 @@
     "test": "vitest run"
   },
   "dependencies": {
-    "hyparquet": "1.20.2"
+    "hyparquet": "1.22.0"
   },
   "devDependencies": {
     "@babel/eslint-parser": "7.28.5",
     "@types/node": "24.10.1",
-    "@vitest/coverage-v8": "4.0.8",
+    "@vitest/coverage-v8": "4.0.14",
     "eslint": "9.39.1",
-    "eslint-plugin-jsdoc": "61.2.0",
+    "eslint-plugin-jsdoc": "61.4.1",
     "typescript": "5.9.3",
-    "vitest": "4.0.8"
+    "vitest": "4.0.14"
   }
 }

@@ -11,7 +11,7 @@ import { unconvert } from './unconvert.js'
  * @param {ColumnEncoder} column
  * @param {DecodedArray} values
  * @param {boolean} stats
- * @returns {ColumnMetaData}
+ * @returns {ColumnChunk}
  */
 export function writeColumn(writer, column, values, stats) {
   const { columnName, element, schemaPath, compressed } = column
@@ -74,17 +74,20 @@ export function writeColumn(writer, column, values, stats) {
   }
 
   return {
-    type,
-    encodings,
-    path_in_schema: schemaPath.slice(1).map(s => s.name),
-    codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
-    num_values: BigInt(num_values),
-    total_compressed_size: BigInt(writer.offset - offsetStart),
-    total_uncompressed_size: BigInt(writer.offset - offsetStart), // TODO
-    data_page_offset,
-    dictionary_page_offset,
-    statistics,
-    geospatial_statistics,
+    meta_data: {
+      type,
+      encodings,
+      path_in_schema: schemaPath.slice(1).map(s => s.name),
+      codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
+      num_values: BigInt(num_values),
+      total_compressed_size: BigInt(writer.offset - offsetStart),
+      total_uncompressed_size: BigInt(writer.offset - offsetStart), // TODO
+      data_page_offset,
+      dictionary_page_offset,
+      statistics,
+      geospatial_statistics,
+    },
+    file_offset: BigInt(offsetStart),
   }
 }
 
@@ -137,7 +140,7 @@ function writeDictionaryPage(writer, column, dictionary) {
 }
 
 /**
- * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
+ * @import {ColumnChunk, ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
  * @import {ColumnEncoder, PageData, Writer} from '../src/types.js'
  * @param {DecodedArray} values
  * @returns {Statistics}

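Net effect of the hunks above: writeColumn no longer returns a bare ColumnMetaData; it returns a ColumnChunk that already pairs that metadata with the chunk's starting offset. A minimal sketch of the call site, assuming writer, column, values, and stats are prepared as in the signature above (rowGroup is an illustrative name, not code from the repository):

// writeColumn now does the pairing that callers previously did by hand.
const columnChunk = writeColumn(writer, column, values, stats)
// columnChunk.file_offset          -> BigInt offset where the chunk begins in the file
// columnChunk.meta_data.num_values -> plus the rest of the ColumnMetaData fields
rowGroup.columns.push(columnChunk)  // previously the caller assembled { file_offset, meta_data } itself
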
@@ -1,4 +1,4 @@
-import { Encoding, PageType } from 'hyparquet/src/constants.js'
+import { Encodings, PageTypes } from 'hyparquet/src/constants.js'
 import { ByteWriter } from './bytewriter.js'
 import { writeRleBitPackedHybrid } from './encoding.js'
 import { writePlain } from './plain.js'
@@ -85,26 +85,26 @@ export function writeDataPageV2(writer, values, column, encoding, listValues) {
 export function writePageHeader(writer, header) {
   /** @type {import('../src/types.js').ThriftObject} */
   const compact = {
-    field_1: PageType.indexOf(header.type),
+    field_1: PageTypes.indexOf(header.type),
     field_2: header.uncompressed_page_size,
     field_3: header.compressed_page_size,
     field_4: header.crc,
     field_5: header.data_page_header && {
       field_1: header.data_page_header.num_values,
-      field_2: Encoding.indexOf(header.data_page_header.encoding),
-      field_3: Encoding.indexOf(header.data_page_header.definition_level_encoding),
-      field_4: Encoding.indexOf(header.data_page_header.repetition_level_encoding),
+      field_2: Encodings.indexOf(header.data_page_header.encoding),
+      field_3: Encodings.indexOf(header.data_page_header.definition_level_encoding),
+      field_4: Encodings.indexOf(header.data_page_header.repetition_level_encoding),
       // field_5: header.data_page_header.statistics,
     },
     field_7: header.dictionary_page_header && {
       field_1: header.dictionary_page_header.num_values,
-      field_2: Encoding.indexOf(header.dictionary_page_header.encoding),
+      field_2: Encodings.indexOf(header.dictionary_page_header.encoding),
     },
     field_8: header.data_page_header_v2 && {
       field_1: header.data_page_header_v2.num_values,
       field_2: header.data_page_header_v2.num_nulls,
       field_3: header.data_page_header_v2.num_rows,
-      field_4: Encoding.indexOf(header.data_page_header_v2.encoding),
+      field_4: Encodings.indexOf(header.data_page_header_v2.encoding),
       field_5: header.data_page_header_v2.definition_levels_byte_length,
       field_6: header.data_page_header_v2.repetition_levels_byte_length,
       field_7: header.data_page_header_v2.is_compressed ? undefined : false, // default true

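The hunks in this file only track a rename in hyparquet 1.22.0's constants module (Encoding becomes Encodings, PageType becomes PageTypes); the lookup pattern itself is unchanged. A small standalone sketch of that pattern — the array literal is written out here for illustration and assumes, as the indexOf calls above do, that the array follows the Parquet thrift PageType enum order:

// A constants array maps a thrift enum value (its index) to the enum's name,
// so Array.prototype.indexOf turns a name back into the integer that the
// TCompactProtocol serializer expects.
const PageTypes = ['DATA_PAGE', 'INDEX_PAGE', 'DICTIONARY_PAGE', 'DATA_PAGE_V2'] // illustrative copy
PageTypes.indexOf('DICTIONARY_PAGE') // 2, written as field_1 of a dictionary page header
PageTypes.indexOf('DATA_PAGE_V2')    // 3, written as field_1 of a v2 data page header
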
@@ -1,5 +1,5 @@
 import { getSchemaPath } from 'hyparquet/src/schema.js'
-import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from 'hyparquet/src/constants.js'
+import { CompressionCodecs, ConvertedTypes, Encodings, FieldRepetitionTypes, PageTypes, ParquetTypes } from 'hyparquet/src/constants.js'
 import { serializeTCompactProtocol } from './thrift.js'
 import { unconvertStatistics } from './unconvert.js'
 
@@ -14,12 +14,12 @@ export function writeMetadata(writer, metadata) {
   const compact = {
     field_1: metadata.version,
     field_2: metadata.schema && metadata.schema.map(element => ({
-      field_1: element.type && ParquetType.indexOf(element.type),
+      field_1: element.type && ParquetTypes.indexOf(element.type),
       field_2: element.type_length,
-      field_3: element.repetition_type && FieldRepetitionType.indexOf(element.repetition_type),
+      field_3: element.repetition_type && FieldRepetitionTypes.indexOf(element.repetition_type),
       field_4: element.name,
       field_5: element.num_children,
-      field_6: element.converted_type && ConvertedType.indexOf(element.converted_type),
+      field_6: element.converted_type && ConvertedTypes.indexOf(element.converted_type),
       field_7: element.scale,
       field_8: element.precision,
       field_9: element.field_id,
@@ -31,10 +31,10 @@ export function writeMetadata(writer, metadata) {
       field_1: c.file_path,
       field_2: c.file_offset,
       field_3: c.meta_data && {
-        field_1: ParquetType.indexOf(c.meta_data.type),
-        field_2: c.meta_data.encodings.map(e => Encoding.indexOf(e)),
+        field_1: ParquetTypes.indexOf(c.meta_data.type),
+        field_2: c.meta_data.encodings.map(e => Encodings.indexOf(e)),
         field_3: c.meta_data.path_in_schema,
-        field_4: CompressionCodec.indexOf(c.meta_data.codec),
+        field_4: CompressionCodecs.indexOf(c.meta_data.codec),
         field_5: c.meta_data.num_values,
         field_6: c.meta_data.total_uncompressed_size,
         field_7: c.meta_data.total_compressed_size,
@@ -50,8 +50,8 @@ export function writeMetadata(writer, metadata) {
         schemaElement(metadata.schema, c.meta_data.path_in_schema, columnIndex + 1)
       ),
       field_13: c.meta_data.encoding_stats && c.meta_data.encoding_stats.map(es => ({
-        field_1: PageType.indexOf(es.page_type),
-        field_2: Encoding.indexOf(es.encoding),
+        field_1: PageTypes.indexOf(es.page_type),
+        field_2: Encodings.indexOf(es.encoding),
         field_3: es.count,
       })),
       field_14: c.meta_data.bloom_filter_offset,

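These writeMetadata hunks are also just the constants rename, but they show where the new ColumnChunk shape lands in the thrift layout: field_2 of each serialized column chunk is fed from file_offset and field_3 from meta_data. A hedged sketch of that correspondence, using only the field mappings visible in the hunks; the chunk variable and its literal values are illustrative, and the numeric results assume the constants arrays follow the Parquet thrift enum order (which the indexOf calls rely on):

import { CompressionCodecs, Encodings, ParquetTypes } from 'hyparquet/src/constants.js'

// A ColumnChunk as returned by writeColumn, with illustrative values:
const chunk = { file_offset: 4n, meta_data: { type: 'INT32', codec: 'SNAPPY', encodings: ['PLAIN'] } }
// How writeMetadata lays it out in thrift field numbers (per the hunk above):
const thriftChunk = {
  field_1: chunk.file_path,                                    // not set by writeColumn, so omitted
  field_2: chunk.file_offset,                                  // BigInt(offsetStart) from writeColumn
  field_3: chunk.meta_data && {
    field_1: ParquetTypes.indexOf(chunk.meta_data.type),       // 1 for 'INT32'
    field_2: chunk.meta_data.encodings.map(e => Encodings.indexOf(e)), // [0] for ['PLAIN']
    field_4: CompressionCodecs.indexOf(chunk.meta_data.codec), // 1 for 'SNAPPY'
  },
}
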
@@ -74,8 +74,7 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
         compressed: this.compressed,
       }
 
-      const file_offset = BigInt(this.writer.offset)
-      const meta_data = writeColumn(
+      const columnChunk = writeColumn(
         this.writer,
         column,
         groupData,
@@ -83,10 +82,7 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
       )
 
       // save column chunk metadata
-      columns.push({
-        file_offset,
-        meta_data,
-      })
+      columns.push(columnChunk)
     }
     this.num_rows += BigInt(groupSize)
 

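The net effect in ParquetWriter.prototype.write is that the per-column file_offset bookkeeping disappears. A simplified sketch of the new inner loop, under the assumptions visible in this hunk (columnEncoders and stats are illustrative names; the loop header and the trailing writeColumn arguments are not shown in the diff):

for (const column of columnEncoders) {
  // writeColumn records its own starting offset and returns it inside the ColumnChunk
  const columnChunk = writeColumn(this.writer, column, groupData, stats)
  columns.push(columnChunk) // previously: columns.push({ file_offset, meta_data })
}
this.num_rows += BigInt(groupSize)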