Convert rich types

This commit is contained in:
Kenny Daniel 2024-01-20 18:28:56 -08:00
parent 6d03bd6d86
commit 619f790795
No known key found for this signature in database
GPG Key ID: 6A3C5E318BE71391
3 changed files with 88 additions and 4 deletions

@ -1,7 +1,7 @@
import { CompressionCodec, Encoding, PageType } from './constants.js'
import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
import { parquetHeader } from './header.js'
import { getMaxDefinitionLevel, isRequired } from './schema.js'
import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
import { snappyUncompress } from './snappy.js'
/**
@ -11,6 +11,8 @@ import { snappyUncompress } from './snappy.js'
* @typedef {import('./types.js').RowGroup} RowGroup
*/
// Milliseconds in one day (24 * 60 * 60 * 1000); multiplied by a day count to build a Date
const dayMillis = 86400000
/**
* Read a column from the file.
*
@ -113,11 +115,16 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) {
} else {
if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
// dereference dictionary values
values = []
for (let i = 0; i < dataPage.length; i++) {
dataPage[i] = dictionary[dataPage[i]]
values[i] = dictionary[dataPage[i]]
}
} else if (Array.isArray(dataPage)) {
// convert primitive types to rich types
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
} else {
values = dataPage // TODO: data page shouldn't be a fixed byte array?
}
values = dataPage
}
// TODO: check that we are at the end of the page
@ -155,3 +162,44 @@ export function getColumnOffset(columnMetadata) {
}
return Number(columnOffset)
}
/**
 * Convert known types from primitive to rich.
 *
 * Dispatches on the schema element's converted_type:
 * - no converted_type: data is returned unchanged
 * - UTF8: each value is decoded from bytes to a string
 * - DECIMAL: each unscaled number is divided by 10^scale
 * - DATE: each value is multiplied by dayMillis and wrapped in a Date
 * - TIME_MILLIS: each value is wrapped in a Date
 * - JSON: each value is parsed as JSON (byte values are UTF-8 decoded first)
 * - any other converted_type: data is returned unchanged
 *
 * @param {any[]} data series of primitive types
 * @param {SchemaElement} schemaElement schema element for the data
 * @returns {any[]} series of rich types
 * @throws {Error} for BSON, INTERVAL and byte-string DECIMAL, which are not supported
 */
function convert(data, schemaElement) {
  const ctype = schemaElement.converted_type
  // ConvertedType.UTF8 is 0, so a truthiness check here would skip string decoding
  if (ctype === undefined || ctype === null) return data
  if (ctype === ConvertedType.UTF8) {
    const decoder = new TextDecoder()
    return data.map(v => decoder.decode(v))
  }
  if (ctype === ConvertedType.DECIMAL) {
    // an empty series has no element to inspect and nothing to convert
    if (data.length === 0) return []
    if (typeof data[0] !== 'number') {
      // TODO: parse byte string
      throw new Error('parquet decimal byte string not supported')
    }
    // stored numbers are unscaled integers: value = unscaled / 10^scale
    const divisor = Math.pow(10, schemaElement.scale || 0)
    return data.map(v => v / divisor)
  }
  if (ctype === ConvertedType.DATE) {
    return data.map(v => new Date(v * dayMillis))
  }
  if (ctype === ConvertedType.TIME_MILLIS) {
    return data.map(v => new Date(v))
  }
  if (ctype === ConvertedType.JSON) {
    const decoder = new TextDecoder()
    // JSON.parse would stringify a byte array as "123,34,..." so decode the bytes first
    return data.map(v => JSON.parse(ArrayBuffer.isView(v) ? decoder.decode(v) : v))
  }
  if (ctype === ConvertedType.BSON) {
    throw new Error('parquet bson not supported')
  }
  if (ctype === ConvertedType.INTERVAL) {
    throw new Error('parquet interval not supported')
  }
  return data
}

@ -27,6 +27,31 @@ export const FieldRepetitionType = {
REPEATED: 2,
}
/**
 * Numeric codes for the converted_type field of a schema element.
 *
 * The codes are consecutive integers starting at 0 and line up with the
 * ConvertedType enum declared in types.d.ts; keep the two in sync.
 *
 * NOTE: UTF8 is 0 and therefore falsy. Test for an absent converted_type with
 * an explicit undefined check, not with truthiness.
 */
export const ConvertedType = {
UTF8: 0, // falsy code, see note above
MAP: 1,
MAP_KEY_VALUE: 2,
LIST: 3,
ENUM: 4,
DECIMAL: 5,
DATE: 6,
TIME_MILLIS: 7,
TIME_MICROS: 8,
TIMESTAMP_MILLIS: 9,
TIMESTAMP_MICROS: 10,
UINT_8: 11,
UINT_16: 12,
UINT_32: 13,
UINT_64: 14,
INT_8: 15,
INT_16: 16,
INT_32: 17,
INT_64: 18,
JSON: 19,
BSON: 20,
INTERVAL: 21,
}
export const CompressionCodec = {
UNCOMPRESSED: 0,
SNAPPY: 1,

11
src/types.d.ts vendored

@ -81,6 +81,17 @@ export enum ConvertedType {
TIME_MICROS = 8,
TIMESTAMP_MILLIS = 9,
TIMESTAMP_MICROS = 10,
UINT_8 = 11,
UINT_16 = 12,
UINT_32 = 13,
UINT_64 = 14,
INT_8 = 15,
INT_16 = 16,
INT_32 = 17,
INT_64 = 18,
JSON = 19,
BSON = 20,
INTERVAL = 21,
}
export interface RowGroup {