diff --git a/src/constants.js b/src/constants.js
index 5a3f55c..e577e0a 100644
--- a/src/constants.js
+++ b/src/constants.js
@@ -1,3 +1,7 @@
+/**
+ * @typedef {import('./types.js').ParquetType} ParquetTypeType
+ * @type {ParquetTypeType[]}
+ */
 export const ParquetType = [
   'BOOLEAN',
   'INT32',
@@ -28,6 +32,10 @@ export const FieldRepetitionType = [
   'REPEATED',
 ]
 
+/**
+ * @typedef {import('./types.js').ConvertedType} ConvertedTypeType
+ * @type {ConvertedTypeType[]}
+ */
 export const ConvertedType = [
   'UTF8',
   'MAP',
@@ -53,6 +61,28 @@ export const ConvertedType = [
   'INTERVAL',
 ]
 
+/**
+ * @typedef {import('./types.js').LogicalTypeType} LogicalTypeType
+ * @type {LogicalTypeType[]}
+ */
+export const logicalTypeType = [
+  'NULL',
+  'STRING',
+  'MAP',
+  'LIST',
+  'ENUM',
+  'DECIMAL',
+  'DATE',
+  'TIME',
+  'TIMESTAMP',
+  'INTERVAL',
+  'INTEGER',
+  'NULL',
+  'JSON',
+  'BSON',
+  'UUID',
+]
+
 export const CompressionCodec = [
   'UNCOMPRESSED',
   'SNAPPY',
diff --git a/src/metadata.js b/src/metadata.js
index fe51cdd..317ef41 100644
--- a/src/metadata.js
+++ b/src/metadata.js
@@ -105,6 +105,7 @@ export function parquetMetadata(arrayBuffer) {
     scale: field.field_7,
     precision: field.field_8,
     field_id: field.field_9,
+    logical_type: logicalType(field.field_10),
   }))
   const num_rows = metadata.field_3
   const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({
@@ -171,3 +172,32 @@ export function parquetMetadata(arrayBuffer) {
 export function parquetSchema(metadata) {
   return schemaTree(metadata.schema, 0)
 }
+
+/**
+ * Convert a parsed logical-type struct (keyed by field_N) into a LogicalType object.
+ *
+ * @typedef {import("./types.d.ts").LogicalType} LogicalType
+ * @param {any} logicalType parsed struct with numbered field_N keys, or undefined
+ * @returns {LogicalType | undefined} typed logical type, or undefined when the input is falsy
+ */
+function logicalType(logicalType) {
+  if (logicalType?.field_5) { // field_5 carries the DECIMAL parameters
+    return {
+      logicalType: 'DECIMAL',
+      scale: logicalType.field_5.field_1,
+      precision: logicalType.field_5.field_2,
+    }
+  }
+  // TODO: TimestampType
+  // TODO: TimeType
+  if (logicalType?.field_10) { // field_10 carries the INTEGER parameters
+    return {
+      logicalType: 'INTEGER',
+      bitWidth: logicalType.field_10.field_1,
+      isSigned: logicalType.field_10.field_2,
+    }
+  }
+  if (logicalType) {
+    return logicalType // NOTE(review): input is returned unchanged (field_N keys, no `logicalType` key) - confirm callers expect this
+  }
+}
diff --git a/src/types.d.ts b/src/types.d.ts
index 8a97668..cb1c233 100644
--- a/src/types.d.ts
+++ b/src/types.d.ts
@@ -42,6 +42,7 @@ export interface SchemaElement {
   scale?: number
   precision?: number
   field_id?: number
+  logical_type?: LogicalType
 }
 
 export type ParquetType =
@@ -83,6 +84,40 @@ export type ConvertedType =
   'BSON' |
   'INTERVAL'
 
+type LogicalDecimalType = {
+  logicalType: 'DECIMAL'
+  precision: number
+  scale: number
+}
+
+type LogicalIntType = {
+  logicalType: 'INTEGER'
+  bitWidth: number
+  isSigned: boolean
+}
+
+export type LogicalType =
+  { logicalType: LogicalTypeType } |
+  LogicalDecimalType |
+  LogicalIntType
+
+export type LogicalTypeType =
+  'STRING' | // convertedType UTF8
+  'MAP' | // convertedType MAP
+  'LIST' | // convertedType LIST
+  'ENUM' | // convertedType ENUM
+  'DECIMAL' | // convertedType DECIMAL + precision/scale
+  'DATE' | // convertedType DATE
+  'TIME' | // convertedType TIME_MILLIS or TIME_MICROS
+  'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS
+  'INTEGER' | // convertedType INT or UINT
+  'INTERVAL' | // convertedType INTERVAL
+  'NULL' | // no convertedType
+  'JSON' | // convertedType JSON
+  'BSON' | // convertedType BSON
+  'UUID' | // no convertedType
+  'FLOAT16' // no convertedType
+
 export interface RowGroup {
   columns: ColumnChunk[]
   total_byte_size: number
diff --git a/test/files/concatenated_gzip_members.metadata.json b/test/files/concatenated_gzip_members.metadata.json
index 29f4ef7..cee5ddf 100644
--- a/test/files/concatenated_gzip_members.metadata.json
+++ b/test/files/concatenated_gzip_members.metadata.json
@@ -36,6 +36,11 @@
     },
     {
       "converted_type": "UINT_64",
+      "logical_type": {
+        "logicalType": "INTEGER",
+        "bitWidth": 64,
+        "isSigned": false
+      },
       "name": "long_col",
       "repetition_type": "OPTIONAL",
       "type": "INT64"