From 619f79079549b4c94d5b90393b00595f82bfbb88 Mon Sep 17 00:00:00 2001
From: Kenny Daniel
Date: Sat, 20 Jan 2024 18:28:56 -0800
Subject: [PATCH] Convert rich types

---
Review notes (this section is ignored by `git am`):
- dayMillis is 86400000 (24 * 60 * 60 * 1000): the DATE branch multiplies a
  day count by it and passes the product to `new Date()`, which takes
  milliseconds.
- DECIMAL numbers are divided by 10^scale (value = unscaled * 10^-scale).
- NOTE(review): the JSON branch hands values straight to JSON.parse, while the
  UTF8 branch decodes them with TextDecoder first; confirm JSON values are
  already strings when they reach convert().
- The two fixes above mean the src/column.js post-image no longer matches the
  blob id on its `index` line.

 src/column.js    | 56 ++++++++++++++++++++++++++++++++++++++++++++----
 src/constants.js | 25 +++++++++++++++++++++
 src/types.d.ts   | 11 ++++++++++
 3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/src/column.js b/src/column.js
index b909a88..2f0524a 100644
--- a/src/column.js
+++ b/src/column.js
@@ -1,7 +1,7 @@
-import { CompressionCodec, Encoding, PageType } from './constants.js'
+import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
 import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
 import { parquetHeader } from './header.js'
-import { getMaxDefinitionLevel, isRequired } from './schema.js'
+import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
 import { snappyUncompress } from './snappy.js'
 
 /**
@@ -11,6 +11,8 @@ import { snappyUncompress } from './snappy.js'
  * @typedef {import('./types.js').RowGroup} RowGroup
  */
 
+const dayMillis = 86400000 // 1 day in milliseconds
+
 /**
  * Read a column from the file.
  *
@@ -113,11 +115,16 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) {
       } else {
         if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
           // dereference dictionary values
+          values = []
           for (let i = 0; i < dataPage.length; i++) {
-            dataPage[i] = dictionary[dataPage[i]]
+            values[i] = dictionary[dataPage[i]]
           }
+        } else if (Array.isArray(dataPage)) {
+          // convert primitive types to rich types
+          values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
+        } else {
+          values = dataPage // TODO: data page shouldn't be a fixed byte array?
         }
-        values = dataPage
       }
 
       // TODO: check that we are at the end of the page
@@ -155,3 +162,44 @@ export function getColumnOffset(columnMetadata) {
   }
   return Number(columnOffset)
 }
+
+/**
+ * Convert known types from primitive to rich.
+ *
+ * @param {any[]} data series of primitive types
+ * @param {SchemaElement} schemaElement schema element for the data
+ * @returns {any[]} series of rich types
+ */
+function convert(data, schemaElement) {
+  const ctype = schemaElement.converted_type
+  if (!ctype) return data
+  if (ctype === ConvertedType.UTF8) {
+    const decoder = new TextDecoder()
+    return data.map(v => decoder.decode(v))
+  }
+  if (ctype === ConvertedType.DECIMAL) {
+    const scaleFactor = Math.pow(10, schemaElement.scale || 0)
+    if (typeof data[0] === 'number') {
+      return data.map(v => v / scaleFactor) // value = unscaled / 10^scale
+    } else {
+      // TODO: parse byte string
+      throw new Error('parquet decimal byte string not supported')
+    }
+  }
+  if (ctype === ConvertedType.DATE) {
+    return data.map(v => new Date(v * dayMillis))
+  }
+  if (ctype === ConvertedType.TIME_MILLIS) {
+    return data.map(v => new Date(v))
+  }
+  if (ctype === ConvertedType.JSON) {
+    return data.map(v => JSON.parse(v))
+  }
+  if (ctype === ConvertedType.BSON) {
+    throw new Error('parquet bson not supported')
+  }
+  if (ctype === ConvertedType.INTERVAL) {
+    throw new Error('parquet interval not supported')
+  }
+  return data
+}
diff --git a/src/constants.js b/src/constants.js
index c742c7b..de4c1c1 100644
--- a/src/constants.js
+++ b/src/constants.js
@@ -27,6 +27,31 @@ export const FieldRepetitionType = {
   REPEATED: 2,
 }
 
+export const ConvertedType = {
+  UTF8: 0,
+  MAP: 1,
+  MAP_KEY_VALUE: 2,
+  LIST: 3,
+  ENUM: 4,
+  DECIMAL: 5,
+  DATE: 6,
+  TIME_MILLIS: 7,
+  TIME_MICROS: 8,
+  TIMESTAMP_MILLIS: 9,
+  TIMESTAMP_MICROS: 10,
+  UINT_8: 11,
+  UINT_16: 12,
+  UINT_32: 13,
+  UINT_64: 14,
+  INT_8: 15,
+  INT_16: 16,
+  INT_32: 17,
+  INT_64: 18,
+  JSON: 19,
+  BSON: 20,
+  INTERVAL: 21,
+}
+
 export const CompressionCodec = {
   UNCOMPRESSED: 0,
   SNAPPY: 1,
diff --git a/src/types.d.ts b/src/types.d.ts
index 394b14f..b5bb73e 100644
--- a/src/types.d.ts
+++ b/src/types.d.ts
@@ -81,6 +81,17 @@ export enum ConvertedType {
   TIME_MICROS = 8,
   TIMESTAMP_MILLIS = 9,
   TIMESTAMP_MICROS = 10,
+  UINT_8 = 11,
+  UINT_16 = 12,
+  UINT_32 = 13,
+  UINT_64 = 14,
+  INT_8 = 15,
+  INT_16 = 16,
+  INT_32 = 17,
+  INT_64 = 18,
+  JSON = 19,
+  BSON = 20,
+  INTERVAL = 21,
 }
 
 export interface RowGroup {