diff --git a/package.json b/package.json index 9c96ee0..f9cc551 100644 --- a/package.json +++ b/package.json @@ -45,7 +45,7 @@ "test": "vitest run" }, "dependencies": { - "hyparquet": "1.11.1" + "hyparquet": "1.12.0" }, "devDependencies": { "@babel/eslint-parser": "7.27.0", diff --git a/src/column.js b/src/column.js index cef283e..87d5165 100644 --- a/src/column.js +++ b/src/column.js @@ -9,7 +9,7 @@ import { ByteWriter } from './bytewriter.js' /** * @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType, SchemaElement, Statistics} from 'hyparquet' - * @import {Writer} from '../src/types.js' + * @import {ThriftObject, Writer} from '../src/types.js' * @param {Writer} writer * @param {SchemaElement[]} schemaPath * @param {DecodedArray} values @@ -141,6 +141,7 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) { * @param {PageHeader} header */ function writePageHeader(writer, header) { + /** @type {ThriftObject} */ const compact = { field_1: PageType.indexOf(header.type), field_2: header.uncompressed_page_size, diff --git a/src/metadata.js b/src/metadata.js index 198fd29..a5df77f 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -3,12 +3,13 @@ import { serializeTCompactProtocol } from './thrift.js' import { unconvertMetadata } from './unconvert.js' /** - * @import {FileMetaData} from 'hyparquet' - * @import {Writer} from '../src/types.js' + * @import {FileMetaData, LogicalType, TimeUnit} from 'hyparquet' + * @import {ThriftObject, Writer} from '../src/types.js' * @param {Writer} writer * @param {FileMetaData} metadata */ export function writeMetadata(writer, metadata) { + /** @type {ThriftObject} */ const compact = { field_1: metadata.version, field_2: metadata.schema && metadata.schema.map(element => ({ @@ -21,7 +22,7 @@ export function writeMetadata(writer, metadata) { field_7: element.scale, field_8: element.precision, field_9: element.field_id, - field_10: element.logical_type, + field_10: logicalType(element.logical_type), })), field_3: metadata.num_rows, field_4: metadata.row_groups.map(rg => ({ @@ -36,7 +37,10 @@ export function writeMetadata(writer, metadata) { field_5: c.meta_data.num_values, field_6: c.meta_data.total_uncompressed_size, field_7: c.meta_data.total_compressed_size, - field_8: c.meta_data.key_value_metadata, + field_8: c.meta_data.key_value_metadata && c.meta_data.key_value_metadata.map(kv => ({ + field_1: kv.key, + field_2: kv.value, + })), field_9: c.meta_data.data_page_offset, field_10: c.meta_data.index_page_offset, field_11: c.meta_data.dictionary_page_offset, @@ -67,7 +71,7 @@ export function writeMetadata(writer, metadata) { field_5: c.offset_index_length, field_6: c.column_index_offset, field_7: c.column_index_length, - field_8: c.crypto_metadata, + // field_8: c.crypto_metadata, field_9: c.encrypted_column_metadata, })), field_2: rg.total_byte_size, @@ -93,3 +97,52 @@ export function writeMetadata(writer, metadata) { const metadataLength = writer.offset - metadataStart writer.appendUint32(metadataLength) } + +/** + * @param {LogicalType | undefined} type + * @returns {ThriftObject | undefined} + */ +function logicalType(type) { + if (type === undefined) return undefined + if (type.type === 'STRING') return { field_1: {} } + if (type.type === 'MAP') return { field_2: {} } + if (type.type === 'LIST') return { field_3: {} } + if (type.type === 'ENUM') return { field_4: {} } + if (type.type === 'DECIMAL') return { field_5: { + field_1: type.scale, + field_2: type.precision, + } } + if (type.type === 'DATE') return { field_6: {} } + if (type.type === 'TIME') return { field_7: { + field_1: type.isAdjustedToUTC, + field_2: timeUnit(type.unit), + } } + if (type.type === 'TIMESTAMP') return { field_8: { + field_1: type.isAdjustedToUTC, + field_2: timeUnit(type.unit), + } } + if (type.type === 'INTEGER') return { field_10: { + field_1: type.bitWidth, + field_2: type.isSigned, + } } + if (type.type === 'NULL') return { field_11: {} } + if (type.type === 'JSON') return { field_12: {} } + if (type.type === 'BSON') return { field_13: {} } + if (type.type === 'UUID') return { field_14: {} } + if (type.type === 'FLOAT16') return { field_15: {} } + if (type.type === 'VARIANT') return { field_16: {} } + if (type.type === 'GEOMETRY') return { field_17: {} } + if (type.type === 'GEOGRAPHY') return { field_18: {} } + throw new Error(`unknown logical type: ${type.type}`) +} + +/** + * @param {TimeUnit} unit + * @returns {ThriftObject} + */ +function timeUnit(unit) { + if (unit === 'MILLIS') return { field_1: {} } + if (unit === 'MICROS') return { field_2: {} } + if (unit === 'NANOS') return { field_3: {} } + throw new Error(`unknown time unit: ${unit}`) +} diff --git a/src/types.d.ts b/src/types.d.ts index 576ea1d..810432a 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -41,3 +41,6 @@ export interface Writer { appendVarInt(value: number): void appendVarBigInt(value: bigint): void } + +export type ThriftObject = { [ key: `field_${number}` ]: ThriftType } +export type ThriftType = boolean | number | bigint | string | Uint8Array | ThriftType[] | ThriftObject | undefined