mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-03 10:26:36 +00:00
Convert rich types
This commit is contained in:
parent
6d03bd6d86
commit
619f790795
@ -1,7 +1,7 @@
|
||||
import { CompressionCodec, Encoding, PageType } from './constants.js'
|
||||
import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
|
||||
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
|
||||
import { parquetHeader } from './header.js'
|
||||
import { getMaxDefinitionLevel, isRequired } from './schema.js'
|
||||
import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
|
||||
import { snappyUncompress } from './snappy.js'
|
||||
|
||||
/**
|
||||
@ -11,6 +11,8 @@ import { snappyUncompress } from './snappy.js'
|
||||
* @typedef {import('./types.js').RowGroup} RowGroup
|
||||
*/
|
||||
|
||||
// Milliseconds in one day (24 * 60 * 60 * 1000); multiplied by a day count to get a JS Date timestamp
const dayMillis = 86400000 // 1 day in milliseconds
|
||||
|
||||
/**
|
||||
* Read a column from the file.
|
||||
*
|
||||
@ -113,11 +115,16 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) {
|
||||
} else {
|
||||
if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
|
||||
// dereference dictionary values
|
||||
values = []
|
||||
for (let i = 0; i < dataPage.length; i++) {
|
||||
dataPage[i] = dictionary[dataPage[i]]
|
||||
values[i] = dictionary[dataPage[i]]
|
||||
}
|
||||
} else if (Array.isArray(dataPage)) {
|
||||
// convert primitive types to rich types
|
||||
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
|
||||
} else {
|
||||
values = dataPage // TODO: data page shouldn't be a fixed byte array?
|
||||
}
|
||||
values = dataPage
|
||||
}
|
||||
|
||||
// TODO: check that we are at the end of the page
|
||||
@ -155,3 +162,44 @@ export function getColumnOffset(columnMetadata) {
|
||||
}
|
||||
return Number(columnOffset)
|
||||
}
|
||||
|
||||
/**
 * Convert known types from primitive to rich.
 *
 * Columns with no converted_type, or with a converted_type that has no
 * conversion below, are returned unchanged (same array instance).
 *
 * @param {any[]} data series of primitive types
 * @param {SchemaElement} schemaElement schema element for the data
 * @returns {any[]} series of rich types
 * @throws {Error} for non-numeric DECIMAL values, BSON and INTERVAL, which are not supported
 */
function convert(data, schemaElement) {
  const ctype = schemaElement.converted_type
  // ConvertedType.UTF8 is 0, so a falsy check would wrongly skip string
  // decoding; only a missing converted_type means "no conversion".
  if (ctype === undefined || ctype === null) return data

  switch (ctype) {
    case ConvertedType.UTF8: {
      const decoder = new TextDecoder()
      // values that are already strings are passed through untouched
      return data.map(v => typeof v === 'string' ? v : decoder.decode(v))
    }
    case ConvertedType.DECIMAL: {
      // an empty column has nothing to inspect and nothing to convert
      if (data.length > 0 && typeof data[0] !== 'number') {
        // TODO: parse byte string
        throw new Error('parquet decimal byte string not supported')
      }
      // stored value is the unscaled integer: value = unscaled / 10^scale.
      // Dividing (rather than multiplying by 10^-scale) avoids extra rounding error.
      const divisor = Math.pow(10, schemaElement.scale || 0)
      return data.map(v => v / divisor)
    }
    case ConvertedType.DATE:
      // v is a day count; dayMillis scales it to a Date timestamp
      return data.map(v => new Date(v * dayMillis))
    case ConvertedType.TIME_MILLIS:
      return data.map(v => new Date(v))
    case ConvertedType.JSON: {
      const decoder = new TextDecoder()
      // JSON documents arrive as encoded bytes; decode before parsing
      return data.map(v => JSON.parse(typeof v === 'string' ? v : decoder.decode(v)))
    }
    case ConvertedType.BSON:
      throw new Error('parquet bson not supported')
    case ConvertedType.INTERVAL:
      throw new Error('parquet interval not supported')
    default:
      return data
  }
}
|
||||
|
||||
@ -27,6 +27,31 @@ export const FieldRepetitionType = {
|
||||
REPEATED: 2,
|
||||
}
|
||||
|
||||
/**
 * Numeric ids of the Parquet ConvertedType annotation, keyed by name.
 * The values line up with the ConvertedType enum declared in src/types.d.ts.
 */
export const ConvertedType = {
  // strings, nested structures and enums
  UTF8: 0,
  MAP: 1,
  MAP_KEY_VALUE: 2,
  LIST: 3,
  ENUM: 4,

  // numeric and temporal annotations
  DECIMAL: 5,
  DATE: 6,
  TIME_MILLIS: 7,
  TIME_MICROS: 8,
  TIMESTAMP_MILLIS: 9,
  TIMESTAMP_MICROS: 10,

  // unsigned integers
  UINT_8: 11,
  UINT_16: 12,
  UINT_32: 13,
  UINT_64: 14,

  // signed integers
  INT_8: 15,
  INT_16: 16,
  INT_32: 17,
  INT_64: 18,

  // documents and intervals
  JSON: 19,
  BSON: 20,
  INTERVAL: 21,
}
|
||||
|
||||
export const CompressionCodec = {
|
||||
UNCOMPRESSED: 0,
|
||||
SNAPPY: 1,
|
||||
|
||||
11
src/types.d.ts
vendored
11
src/types.d.ts
vendored
@ -81,6 +81,17 @@ export enum ConvertedType {
|
||||
TIME_MICROS = 8,
|
||||
TIMESTAMP_MILLIS = 9,
|
||||
TIMESTAMP_MICROS = 10,
|
||||
UINT_8 = 11,
|
||||
UINT_16 = 12,
|
||||
UINT_32 = 13,
|
||||
UINT_64 = 14,
|
||||
INT_8 = 15,
|
||||
INT_16 = 16,
|
||||
INT_32 = 17,
|
||||
INT_64 = 18,
|
||||
JSON = 19,
|
||||
BSON = 20,
|
||||
INTERVAL = 21,
|
||||
}
|
||||
|
||||
export interface RowGroup {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user