Parse logical types from metadata

This commit is contained in:
Kenny Daniel 2024-03-12 00:00:20 -07:00
parent 595c87a82b
commit 8a98407734
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 100 additions and 0 deletions

@ -1,3 +1,7 @@
/**
* @typedef {import('./types.js').ParquetType} ParquetTypeType
* @type {ParquetTypeType[]}
*/
export const ParquetType = [
'BOOLEAN',
'INT32',
@ -28,6 +32,10 @@ export const FieldRepetitionType = [
'REPEATED',
]
/**
* @typedef {import('./types.js').ConvertedType} ConvertedTypeType
* @type {ConvertedTypeType[]}
*/
export const ConvertedType = [
'UTF8',
'MAP',
@ -53,6 +61,28 @@ export const ConvertedType = [
'INTERVAL',
]
/**
 * Logical type names, positioned so that the array index equals the field id
 * of the corresponding member of the Parquet Thrift `LogicalType` union
 * (index 5 is DECIMAL, index 10 is INTEGER, ...).
 *
 * Thrift field ids start at 1, so index 0 is only a placeholder; the real
 * NULL type lives at index 11.
 *
 * @typedef {import('./types.js').LogicalTypeType} LogicalTypeType
 * @type {LogicalTypeType[]}
 */
export const logicalTypeType = [
  'NULL', // 0: placeholder, field ids are 1-based
  'STRING',
  'MAP',
  'LIST',
  'ENUM',
  'DECIMAL',
  'DATE',
  'TIME',
  'TIMESTAMP',
  'INTERVAL',
  'INTEGER',
  'NULL',
  'JSON',
  'BSON',
  'UUID',
  'FLOAT16', // 15: listed in the LogicalTypeType declaration, was missing here
]
export const CompressionCodec = [
'UNCOMPRESSED',
'SNAPPY',

@ -105,6 +105,7 @@ export function parquetMetadata(arrayBuffer) {
scale: field.field_7,
precision: field.field_8,
field_id: field.field_9,
logical_type: logicalType(field.field_10),
}))
const num_rows = metadata.field_3
const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({
@ -171,3 +172,32 @@ export function parquetMetadata(arrayBuffer) {
export function parquetSchema(metadata) {
  // Hand the schema list to schemaTree with 0 as its second argument
  // (presumably the index of the root element -- confirm against schemaTree).
  const { schema } = metadata
  return schemaTree(schema, 0)
}
/**
 * Logical types that carry no parameters, keyed by the `field_N` property
 * under which the decoded LogicalType union exposes them.
 *
 * @type {[string, import("./types.d.ts").LogicalTypeType][]}
 */
const simpleLogicalTypes = [
  ['field_1', 'STRING'],
  ['field_2', 'MAP'],
  ['field_3', 'LIST'],
  ['field_4', 'ENUM'],
  ['field_6', 'DATE'],
  ['field_11', 'NULL'],
  ['field_12', 'JSON'],
  ['field_13', 'BSON'],
  ['field_14', 'UUID'],
  ['field_15', 'FLOAT16'],
]

/**
 * Convert a thrift-decoded LogicalType union into a typed LogicalType object.
 *
 * The decoded union exposes its populated member as `field_N`:
 * - `field_5` (DECIMAL) yields `scale` from its `field_1` and `precision` from its `field_2`
 * - `field_10` (INTEGER) yields `bitWidth` from its `field_1` and `isSigned` from its `field_2`
 * - parameterless members are reduced to `{ logicalType: NAME }`
 *
 * Anything not recognised (currently including TIME and TIMESTAMP) is
 * returned unchanged so that no information is dropped.
 *
 * @typedef {import("./types.d.ts").LogicalType} LogicalType
 * @param {any} logicalType decoded union, or a falsy value when none is present
 * @returns {LogicalType | undefined} undefined when no logical type was given
 */
function logicalType(logicalType) {
  if (!logicalType) return undefined

  if (logicalType.field_5) {
    return {
      logicalType: 'DECIMAL',
      scale: logicalType.field_5.field_1,
      precision: logicalType.field_5.field_2,
    }
  }
  if (logicalType.field_10) {
    return {
      logicalType: 'INTEGER',
      bitWidth: logicalType.field_10.field_1,
      isSigned: logicalType.field_10.field_2,
    }
  }

  // Parameterless types: only the name matters, so return the declared
  // { logicalType: NAME } shape instead of leaking the raw decoded struct.
  for (const [key, name] of simpleLogicalTypes) {
    if (logicalType[key] != null) {
      return { logicalType: name }
    }
  }

  // TODO: TimestampType (field_8)
  // TODO: TimeType (field_7)
  // Until those are parsed, pass the raw decoded object through untouched.
  return logicalType
}

35
src/types.d.ts vendored

@ -42,6 +42,7 @@ export interface SchemaElement {
scale?: number
precision?: number
field_id?: number
logicalType?: LogicalType
}
export type ParquetType =
@ -83,6 +84,40 @@ export type ConvertedType =
'BSON' |
'INTERVAL'
/**
 * Logical type variant for DECIMAL, carrying its two numeric parameters.
 * The `logicalType` literal acts as the tag identifying this variant.
 */
type LogicalDecimalType = {
logicalType: 'DECIMAL'
precision: number
scale: number
}
/**
 * Logical type variant for INTEGER, carrying its width and signedness.
 * Example: a column with converted type UINT_64 is described as
 * bitWidth 64, isSigned false.
 */
type LogicalIntType = {
logicalType: 'INTEGER'
bitWidth: number
isSigned: boolean
}
/**
 * Logical type annotation: either a bare type name, or one of the
 * parameterized variants (DECIMAL, INTEGER).
 *
 * NOTE(review): the first member accepts every LogicalTypeType, including
 * 'DECIMAL' and 'INTEGER', so checking `logicalType` alone does not narrow
 * a value to the parameterized variants.
 */
export type LogicalType =
{ logicalType: LogicalTypeType } |
LogicalDecimalType |
LogicalIntType
/**
 * Names of the logical types. The trailing comment on each member gives the
 * legacy converted type it corresponds to, if any.
 */
export type LogicalTypeType =
'STRING' | // convertedType UTF8
'MAP' | // convertedType MAP
'LIST' | // convertedType LIST
'ENUM' | // convertedType ENUM
'DECIMAL' | // convertedType DECIMAL + precision/scale
'DATE' | // convertedType DATE
'TIME' | // convertedType TIME_MILLIS or TIME_MICROS
'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS
'INTEGER' | // convertedType INT_* or UINT_* (e.g. UINT_64)
'INTERVAL' | // convertedType INTERVAL
'NULL' | // no convertedType
'JSON' | // convertedType JSON
'BSON' | // convertedType BSON
'UUID' | // no convertedType
'FLOAT16' // no convertedType
export interface RowGroup {
columns: ColumnChunk[]
total_byte_size: number

@ -36,6 +36,11 @@
},
{
"converted_type": "UINT_64",
"logical_type": {
"logicalType": "INTEGER",
"bitWidth": 64,
"isSigned": false
},
"name": "long_col",
"repetition_type": "OPTIONAL",
"type": "INT64"