diff --git a/src/column.js b/src/column.js index 74ccdd4..f263e93 100644 --- a/src/column.js +++ b/src/column.js @@ -1,4 +1,4 @@ -import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js' +import { Encoding, PageType } from './constants.js' import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js' import { parquetHeader } from './header.js' import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js' @@ -49,14 +49,13 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, let page const uncompressed_page_size = Number(header.uncompressed_page_size) const { codec } = columnMetadata - if (codec === CompressionCodec.UNCOMPRESSED) { + if (codec === 'UNCOMPRESSED') { page = compressedBytes - } else if (codec === CompressionCodec.SNAPPY) { + } else if (codec === 'SNAPPY') { page = new Uint8Array(uncompressed_page_size) snappyUncompress(compressedBytes, page) } else { - const compressor = Object.entries(CompressionCodec).find(([, value]) => value === codec) - throw new Error(`parquet unsupported compression codec: ${codec} ${compressor?.[0]}`) + throw new Error(`parquet unsupported compression codec: ${codec}`) } if (page?.length !== uncompressed_page_size) { throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`) @@ -178,11 +177,11 @@ export function getColumnOffset(columnMetadata) { function convert(data, schemaElement) { const ctype = schemaElement.converted_type if (ctype === undefined) return data - if (ctype === ConvertedType.UTF8) { + if (ctype === 'UTF8') { const decoder = new TextDecoder() return data.map(v => decoder.decode(v)) } - if (ctype === ConvertedType.DECIMAL) { + if (ctype === 'DECIMAL') { const scaleFactor = Math.pow(10, schemaElement.scale || 0) if (typeof data[0] === 'number') { return data.map(v => v * scaleFactor) @@ -191,19 +190,19 @@ function convert(data, schemaElement) { throw new Error('parquet decimal byte string not supported') } } - if (ctype === ConvertedType.DATE) { + if (ctype === 'DATE') { return data.map(v => new Date(v * dayMillis)) } - if (ctype === ConvertedType.TIME_MILLIS) { + if (ctype === 'TIME_MILLIS') { return data.map(v => new Date(v)) } - if (ctype === ConvertedType.JSON) { + if (ctype === 'JSON') { return data.map(v => JSON.parse(v)) } - if (ctype === ConvertedType.BSON) { + if (ctype === 'BSON') { throw new Error('parquet bson not supported') } - if (ctype === ConvertedType.INTERVAL) { + if (ctype === 'INTERVAL') { throw new Error('parquet interval not supported') } return data diff --git a/src/constants.js b/src/constants.js index de4c1c1..356cf8a 100644 --- a/src/constants.js +++ b/src/constants.js @@ -9,67 +9,6 @@ export const ParquetType = { FIXED_LEN_BYTE_ARRAY: 7, } -export const ParquetEncoding = { - PLAIN: 0, - PLAIN_DICTIONARY: 2, - RLE: 3, - BIT_PACKED: 4, // deprecated - DELTA_BINARY_PACKED: 5, - DELTA_LENGTH_BYTE_ARRAY: 6, - DELTA_BYTE_ARRAY: 7, - RLE_DICTIONARY: 8, - BYTE_STREAM_SPLIT: 9, -} - -export const FieldRepetitionType = { - REQUIRED: 0, - OPTIONAL: 1, - REPEATED: 2, -} - -export const ConvertedType = { - UTF8: 0, - MAP: 1, - MAP_KEY_VALUE: 2, - LIST: 3, - ENUM: 4, - DECIMAL: 5, - DATE: 6, - TIME_MILLIS: 7, - TIME_MICROS: 8, - TIMESTAMP_MILLIS: 9, - TIMESTAMP_MICROS: 10, - UINT_8: 11, - UINT_16: 12, - UINT_32: 13, - UINT_64: 14, - INT_8: 15, - INT_16: 16, - INT_32: 17, - INT_64: 18, - JSON: 19, - BSON: 20, - INTERVAL: 21, -} - -export const CompressionCodec = { - UNCOMPRESSED: 0, - SNAPPY: 1, - GZIP: 2, - LZO: 3, - BROTLI: 4, - LZ4: 5, - ZSTD: 6, - LZ4_RAW: 7, -} - -export const PageType = { - DATA_PAGE: 0, - INDEX_PAGE: 1, - DICTIONARY_PAGE: 2, - DATA_PAGE_V2: 3, -} - export const Encoding = { PLAIN: 0, PLAIN_DICTIONARY: 2, @@ -81,3 +20,52 @@ export const Encoding = { RLE_DICTIONARY: 8, BYTE_STREAM_SPLIT: 9, } + +export const FieldRepetitionType = [ + 'REQUIRED', + 'OPTIONAL', + 'REPEATED', +] + +export const ConvertedType = [ + 'UTF8', + 'MAP', + 'MAP_KEY_VALUE', + 'LIST', + 'ENUM', + 'DECIMAL', + 'DATE', + 'TIME_MILLIS', + 'TIME_MICROS', + 'TIMESTAMP_MILLIS', + 'TIMESTAMP_MICROS', + 'UINT_8', + 'UINT_16', + 'UINT_32', + 'UINT_64', + 'INT_8', + 'INT_16', + 'INT_32', + 'INT_64', + 'JSON', + 'BSON', + 'INTERVAL', +] + +export const CompressionCodec = [ + 'UNCOMPRESSED', + 'SNAPPY', + 'GZIP', + 'LZO', + 'BROTLI', + 'LZ4', + 'ZSTD', + 'LZ4_RAW', +] + +export const PageType = { + DATA_PAGE: 0, + INDEX_PAGE: 1, + DICTIONARY_PAGE: 2, + DATA_PAGE_V2: 3, +} diff --git a/src/encoding.js b/src/encoding.js index 2265997..f8397ec 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -1,4 +1,4 @@ -import { ParquetEncoding, ParquetType } from './constants.js' +import { Encoding, ParquetType } from './constants.js' import { readVarInt } from './thrift.js' /** @@ -203,7 +203,7 @@ export function widthFromMaxInt(value) { export function readData(dataView, encoding, offset, count, bitWidth) { const value = [] let byteLength = 0 - if (encoding === ParquetEncoding.RLE) { + if (encoding === Encoding.RLE) { let seen = 0 while (seen < count) { const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count) diff --git a/src/metadata.js b/src/metadata.js index 1b02da7..b920d08 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,3 +1,4 @@ +import { CompressionCodec, ConvertedType, FieldRepetitionType } from './constants.js' import { schemaTree } from './schema.js' import { deserializeTCompactProtocol } from './thrift.js' @@ -97,10 +98,10 @@ export function parquetMetadata(arrayBuffer) { const schema = metadata.field_2.map((/** @type {any} */ field) => ({ type: field.field_1, type_length: field.field_2, - repetition_type: field.field_3, + repetition_type: FieldRepetitionType[field.field_3], name: field.field_4, num_children: field.field_5, - converted_type: field.field_6, + converted_type: ConvertedType[field.field_6], scale: field.field_7, precision: field.field_8, field_id: field.field_9, @@ -114,7 +115,7 @@ export function parquetMetadata(arrayBuffer) { type: column.field_3.field_1, encodings: column.field_3.field_2, path_in_schema: column.field_3.field_3, - codec: column.field_3.field_4, + codec: CompressionCodec[column.field_3.field_4], num_values: column.field_3.field_5, total_uncompressed_size: column.field_3.field_6, total_compressed_size: column.field_3.field_7, diff --git a/src/schema.js b/src/schema.js index 8be3941..5f54659 100644 --- a/src/schema.js +++ b/src/schema.js @@ -1,5 +1,3 @@ -import { FieldRepetitionType } from './constants.js' - /** * @typedef {import('./types.js').SchemaElement} SchemaElement * @typedef {import('./types.js').SchemaTree} SchemaTree @@ -57,7 +55,7 @@ export function schemaElement(schema, name) { * @returns {boolean} true if the element is required */ export function isRequired(schema, name) { - return schemaElement(schema, name).repetition_type === FieldRepetitionType.REQUIRED + return schemaElement(schema, name).repetition_type === 'REQUIRED' } /** @@ -71,7 +69,7 @@ export function getMaxRepetitionLevel(schema, parts) { let maxLevel = 0 parts.forEach((part, i) => { const element = schemaElement(schema, parts.slice(0, i + 1)) - if (element.repetition_type === FieldRepetitionType.REPEATED) { + if (element.repetition_type === 'REPEATED') { maxLevel += 1 } }) @@ -89,7 +87,7 @@ export function getMaxDefinitionLevel(schema, parts) { let maxLevel = 0 parts.forEach((part, i) => { const element = schemaElement(schema, parts.slice(0, i + 1)) - if (element.repetition_type !== FieldRepetitionType.REQUIRED) { + if (element.repetition_type !== 'REQUIRED') { maxLevel += 1 } }) diff --git a/src/types.d.ts b/src/types.d.ts index b5bb73e..21e8738 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -63,36 +63,34 @@ export enum ParquetType { FIXED_LEN_BYTE_ARRAY = 7, } -export enum FieldRepetitionType { - REQUIRED = 0, - OPTIONAL = 1, - REPEATED = 2, -} +export type FieldRepetitionType = + 'REQUIRED' | + 'OPTIONAL' | + 'REPEATED' -export enum ConvertedType { - UTF8 = 0, - MAP = 1, - MAP_KEY_VALUE = 2, - LIST = 3, - ENUM = 4, - DECIMAL = 5, - DATE = 6, - TIME_MILLIS = 7, - TIME_MICROS = 8, - TIMESTAMP_MILLIS = 9, - TIMESTAMP_MICROS = 10, - UINT_8 = 11, - UINT_16 = 12, - UINT_32 = 13, - UINT_64 = 14, - INT_8 = 15, - INT_16 = 16, - INT_32 = 17, - INT_64 = 18, - JSON = 19, - BSON = 20, - INTERVAL = 21, -} +export type ConvertedType = + 'UTF8' | + 'MAP' | + 'MAP_KEY_VALUE' | + 'LIST' | + 'ENUM' | + 'DECIMAL' | + 'DATE' | + 'TIME_MILLIS' | + 'TIME_MICROS' | + 'TIMESTAMP_MILLIS' | + 'TIMESTAMP_MICROS' | + 'UINT_8' | + 'UINT_16' | + 'UINT_32' | + 'UINT_64' | + 'INT_8' | + 'INT_16' | + 'INT_32' | + 'INT_64' | + 'JSON' | + 'BSON' | + 'INTERVAL' export interface RowGroup { columns: ColumnChunk[] @@ -135,16 +133,15 @@ export enum Encoding { BYTE_STREAM_SPLIT = 9, } -export enum CompressionCodec { - UNCOMPRESSED = 0, - SNAPPY = 1, - GZIP = 2, - LZO = 3, - BROTLI = 4, - LZ4 = 5, - ZSTD = 6, - LZ4_RAW = 7, -} +export type CompressionCodec = + 'UNCOMPRESSED' | + 'SNAPPY' | + 'GZIP' | + 'LZO' | + 'BROTLI' | + 'LZ4' | + 'ZSTD' | + 'LZ4_RAW' interface KeyValue { key: string diff --git a/test/metadata.test.js b/test/metadata.test.js index 2cb60f8..6d92f50 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -77,8 +77,17 @@ const addrtypeMetadata = { created_by: 'DuckDB', metadata_length: 149, schema: [ - { repetition_type: 0, name: 'duckdb_schema', num_children: 1 }, - { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 }, + { + repetition_type: 'REQUIRED', + name: 'duckdb_schema', + num_children: 1, + }, + { + type: 6, + repetition_type: 'OPTIONAL', + name: 'ADDRTYPE', + converted_type: 'UTF8', + }, ], num_rows: 10, row_groups: [ @@ -90,7 +99,7 @@ const addrtypeMetadata = { type: 6, encodings: [0, 8], path_in_schema: ['ADDRTYPE'], - codec: 1, + codec: 'SNAPPY', num_values: 10, total_uncompressed_size: 78, total_compressed_size: 82, @@ -118,13 +127,13 @@ const rowgroupsMetadata = { metadata_length: 1602, schema: [ { - repetition_type: 0, + repetition_type: 'REQUIRED', name: 'schema', num_children: 1, }, { type: 2, - repetition_type: 1, + repetition_type: 'OPTIONAL', name: 'numbers', }, ], @@ -136,7 +145,7 @@ const rowgroupsMetadata = { file_offset: 150, file_path: undefined, meta_data: { - codec: 1, + codec: 'SNAPPY', data_page_offset: 71, dictionary_page_offset: 4, encoding_stats: [ @@ -165,7 +174,7 @@ const rowgroupsMetadata = { { file_offset: 368, meta_data: { - codec: 1, + codec: 'SNAPPY', data_page_offset: 294, dictionary_page_offset: 248, encoding_stats: [ diff --git a/test/schema.test.js b/test/schema.test.js index c53dc93..b2effc9 100644 --- a/test/schema.test.js +++ b/test/schema.test.js @@ -1,5 +1,4 @@ import { describe, expect, it } from 'vitest' -import { FieldRepetitionType } from '../src/constants.js' import { getMaxDefinitionLevel, getMaxRepetitionLevel, @@ -9,10 +8,14 @@ import { } from '../src/schema.js' describe('Parquet schema utils', () => { + /** + * @typedef {import('../src/types.js').SchemaElement} SchemaElement + * @type {SchemaElement[]} + */ const schema = [ - { name: 'root', num_children: 2, repetition_type: FieldRepetitionType.REQUIRED }, - { name: 'child1', repetition_type: FieldRepetitionType.OPTIONAL }, - { name: 'child2', repetition_type: FieldRepetitionType.REPEATED }, + { name: 'root', num_children: 2, repetition_type: 'REQUIRED' }, + { name: 'child1', repetition_type: 'OPTIONAL' }, + { name: 'child2', repetition_type: 'REPEATED' }, ] describe('schemaElement', () => { diff --git a/test/schemaTree.test.js b/test/schemaTree.test.js index ce8a338..704dd53 100644 --- a/test/schemaTree.test.js +++ b/test/schemaTree.test.js @@ -25,9 +25,9 @@ const addrtypeSchema = { children: [], count: 1, element: { - converted_type: 0, + converted_type: 'UTF8', name: 'ADDRTYPE', - repetition_type: 1, + repetition_type: 'OPTIONAL', type: 6, }, }, @@ -36,7 +36,7 @@ const addrtypeSchema = { element: { name: 'duckdb_schema', num_children: 1, - repetition_type: 0, + repetition_type: 'REQUIRED', }, } @@ -48,7 +48,7 @@ const rowgroupsSchema = { count: 1, element: { name: 'numbers', - repetition_type: 1, + repetition_type: 'OPTIONAL', type: 2, }, }, @@ -57,6 +57,6 @@ const rowgroupsSchema = { element: { name: 'schema', num_children: 1, - repetition_type: 0, + repetition_type: 'REQUIRED', }, }