From 11f35c9e4309083a1102add5726bc3a4ebb63c67 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Tue, 27 Feb 2024 10:33:17 -0800 Subject: [PATCH] Encoding as string --- src/column.js | 4 +-- src/constants.js | 23 ++++++++-------- src/datapage.js | 10 +++---- src/datapageV2.js | 11 ++++---- src/encoding.js | 4 +-- src/header.js | 11 ++++---- src/metadata.js | 6 ++--- src/types.d.ts | 21 +++++++-------- .../addrtype-missing-value.metadata.json | 2 +- .../concatenated_gzip_members.metadata.json | 4 +-- test/files/datapage_v2.snappy.metadata.json | 19 ++++---------- test/files/nonnullable.impala.metadata.json | 26 +++++++++---------- test/files/rowgroups.metadata.json | 12 ++++----- 13 files changed, 72 insertions(+), 81 deletions(-) diff --git a/src/column.js b/src/column.js index a428f1a..d5e2456 100644 --- a/src/column.js +++ b/src/column.js @@ -1,4 +1,4 @@ -import { Encoding, PageType } from './constants.js' +import { PageType } from './constants.js' import { convert } from './convert.js' import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js' import { readDataPageV2 } from './datapageV2.js' @@ -57,7 +57,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schema, columnMetadata) valuesSeen += daph.num_values - const dictionaryEncoding = daph.encoding === Encoding.PLAIN_DICTIONARY || daph.encoding === Encoding.RLE_DICTIONARY + const dictionaryEncoding = daph.encoding === 'PLAIN_DICTIONARY' || daph.encoding === 'RLE_DICTIONARY' // construct output values: skip nulls and construct lists /** @type {any[]} */ diff --git a/src/constants.js b/src/constants.js index 356cf8a..fc23fa8 100644 --- a/src/constants.js +++ b/src/constants.js @@ -9,17 +9,18 @@ export const ParquetType = { FIXED_LEN_BYTE_ARRAY: 7, } -export const Encoding = { - PLAIN: 0, - PLAIN_DICTIONARY: 2, - RLE: 3, - BIT_PACKED: 4, // deprecated - DELTA_BINARY_PACKED: 5, - DELTA_LENGTH_BYTE_ARRAY: 6, - DELTA_BYTE_ARRAY: 7, - RLE_DICTIONARY: 8, - BYTE_STREAM_SPLIT: 9, -} +export const Encoding = [ + 'PLAIN', + undefined, + 'PLAIN_DICTIONARY', + 'RLE', + 'BIT_PACKED', // deprecated + 'DELTA_BINARY_PACKED', + 'DELTA_LENGTH_BYTE_ARRAY', + 'DELTA_BYTE_ARRAY', + 'RLE_DICTIONARY', + 'BYTE_STREAM_SPLIT', +] export const FieldRepetitionType = [ 'REQUIRED', diff --git a/src/datapage.js b/src/datapage.js index 410f14d..44e1281 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -1,4 +1,4 @@ -import { Encoding, ParquetType } from './constants.js' +import { ParquetType } from './constants.js' import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' import { getMaxDefinitionLevel, @@ -61,16 +61,16 @@ export function readDataPage(bytes, daph, schema, columnMetadata) { // read values based on encoding const nValues = daph.num_values - numNulls - if (daph.encoding === Encoding.PLAIN) { + if (daph.encoding === 'PLAIN') { const se = schemaElement(schema, columnMetadata.path_in_schema) const utf8 = se.converted_type === 'UTF8' const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8) values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value) offset += plainObj.byteLength } else if ( - daph.encoding === Encoding.PLAIN_DICTIONARY || - daph.encoding === Encoding.RLE_DICTIONARY || - daph.encoding === Encoding.RLE + daph.encoding === 'PLAIN_DICTIONARY' || + daph.encoding === 'RLE_DICTIONARY' || + daph.encoding === 'RLE' ) { // bit width is stored as single byte let bitWidth diff --git a/src/datapageV2.js b/src/datapageV2.js index d715b92..4f1d51e 100644 --- a/src/datapageV2.js +++ b/src/datapageV2.js @@ -1,5 +1,4 @@ import { decompressPage } from './column.js' -import { Encoding } from './constants.js' import { readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' import { getMaxDefinitionLevel, getMaxRepetitionLevel, schemaElement } from './schema.js' import { readVarInt, readZigZag } from './thrift.js' @@ -47,7 +46,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp // read values based on encoding const nValues = daph2.num_values - daph2.num_nulls - if (daph2.encoding === Encoding.PLAIN) { + if (daph2.encoding === 'PLAIN') { const se = schemaElement(schema, columnMetadata.path_in_schema) const utf8 = se.converted_type === 'UTF8' let page = compressedBytes.slice(offset) @@ -57,7 +56,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) const plainObj = readPlain(pageView, columnMetadata.type, nValues, 0, utf8) values = plainObj.value - } else if (daph2.encoding === Encoding.RLE) { + } else if (daph2.encoding === 'RLE') { const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors) const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) const bitWidth = 1 @@ -69,8 +68,8 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp ).value } } else if ( - daph2.encoding === Encoding.PLAIN_DICTIONARY || - daph2.encoding === Encoding.RLE_DICTIONARY + daph2.encoding === 'PLAIN_DICTIONARY' || + daph2.encoding === 'RLE_DICTIONARY' ) { compressedBytes = compressedBytes.subarray(offset) const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors) @@ -81,7 +80,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp pageView, 1, bitWidth, uncompressedPageSize, nValues ) values = value - } else if (daph2.encoding === Encoding.DELTA_BINARY_PACKED) { + } else if (daph2.encoding === 'DELTA_BINARY_PACKED') { if (daph2.num_nulls) throw new Error('parquet delta-int not supported') const codec = daph2.is_compressed ? columnMetadata.codec : 'UNCOMPRESSED' const page = decompressPage(compressedBytes, uncompressedPageSize, codec, compressors) diff --git a/src/encoding.js b/src/encoding.js index ad56349..8638535 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -1,4 +1,4 @@ -import { Encoding, ParquetType } from './constants.js' +import { ParquetType } from './constants.js' import { readVarInt } from './thrift.js' /** @@ -213,7 +213,7 @@ export function widthFromMaxInt(value) { export function readData(dataView, encoding, offset, count, bitWidth) { const value = [] let byteLength = 0 - if (encoding === Encoding.RLE) { + if (encoding === 'RLE') { let seen = 0 while (seen < count) { const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count) diff --git a/src/header.js b/src/header.js index 171017f..81255ef 100644 --- a/src/header.js +++ b/src/header.js @@ -1,3 +1,4 @@ +import { Encoding } from './constants.js' import { deserializeTCompactProtocol } from './thrift.js' /** @@ -26,9 +27,9 @@ export function parquetHeader(arrayBuffer, offset) { const crc = header.field_4 const data_page_header = header.field_5 && { num_values: header.field_5.field_1, - encoding: header.field_5.field_2, - definition_level_encoding: header.field_5.field_3, - repetition_level_encoding: header.field_5.field_4, + encoding: Encoding[header.field_5.field_2], + definition_level_encoding: Encoding[header.field_5.field_3], + repetition_level_encoding: Encoding[header.field_5.field_4], statistics: header.field_5.field_5 && { max: header.field_5.field_5.field_1, min: header.field_5.field_5.field_2, @@ -41,14 +42,14 @@ export function parquetHeader(arrayBuffer, offset) { const index_page_header = header.field_6 const dictionary_page_header = header.field_7 && { num_values: header.field_7.field_1, - encoding: header.field_7.field_2, + encoding: Encoding[header.field_7.field_2], is_sorted: header.field_7.field_3, } const data_page_header_v2 = header.field_8 && { num_values: header.field_8.field_1, num_nulls: header.field_8.field_2, num_rows: header.field_8.field_3, - encoding: header.field_8.field_4, + encoding: Encoding[header.field_8.field_4], definition_levels_byte_length: header.field_8.field_5, repetition_levels_byte_length: header.field_8.field_6, is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default to true diff --git a/src/metadata.js b/src/metadata.js index b920d08..22e982d 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,4 +1,4 @@ -import { CompressionCodec, ConvertedType, FieldRepetitionType } from './constants.js' +import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType } from './constants.js' import { schemaTree } from './schema.js' import { deserializeTCompactProtocol } from './thrift.js' @@ -113,7 +113,7 @@ export function parquetMetadata(arrayBuffer) { file_offset: column.field_2, meta_data: column.field_3 && { type: column.field_3.field_1, - encodings: column.field_3.field_2, + encodings: column.field_3.field_2?.map((/** @type {number} */ e) => Encoding[e]), path_in_schema: column.field_3.field_3, codec: CompressionCodec[column.field_3.field_4], num_values: column.field_3.field_5, @@ -131,7 +131,7 @@ export function parquetMetadata(arrayBuffer) { }, encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({ page_type: encodingStat.field_1, - encoding: encodingStat.field_2, + encoding: Encoding[encodingStat.field_2], count: encodingStat.field_3, })), }, diff --git a/src/types.d.ts b/src/types.d.ts index 4f1396f..7c6655e 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -113,17 +113,16 @@ export interface ColumnMetaData { encoding_stats?: PageEncodingStats[] } -export enum Encoding { - PLAIN = 0, - PLAIN_DICTIONARY = 2, - RLE = 3, - BIT_PACKED = 4, // deprecated - DELTA_BINARY_PACKED = 5, - DELTA_LENGTH_BYTE_ARRAY = 6, - DELTA_BYTE_ARRAY = 7, - RLE_DICTIONARY = 8, - BYTE_STREAM_SPLIT = 9, -} +export type Encoding = + 'PLAIN' | + 'PLAIN_DICTIONARY' | + 'RLE' | + 'BIT_PACKED' | // deprecated + 'DELTA_BINARY_PACKED' | + 'DELTA_LENGTH_BYTE_ARRAY' | + 'DELTA_BYTE_ARRAY' | + 'RLE_DICTIONARY' | + 'BYTE_STREAM_SPLIT' export type CompressionCodec = 'UNCOMPRESSED' | diff --git a/test/files/addrtype-missing-value.metadata.json b/test/files/addrtype-missing-value.metadata.json index 14e7beb..fa0b14b 100644 --- a/test/files/addrtype-missing-value.metadata.json +++ b/test/files/addrtype-missing-value.metadata.json @@ -23,7 +23,7 @@ "file_offset": 0, "meta_data": { "type": 6, - "encodings": [0, 8], + "encodings": ["PLAIN", "RLE_DICTIONARY"], "path_in_schema": ["ADDRTYPE"], "codec": "SNAPPY", "num_values": 10, diff --git a/test/files/concatenated_gzip_members.metadata.json b/test/files/concatenated_gzip_members.metadata.json index 29da81a..0e21c6c 100644 --- a/test/files/concatenated_gzip_members.metadata.json +++ b/test/files/concatenated_gzip_members.metadata.json @@ -11,8 +11,8 @@ "codec": "GZIP", "data_page_offset": 4, "encodings": [ - 0, - 3 + "PLAIN", + "RLE" ], "num_values": 513, "path_in_schema": [ diff --git a/test/files/datapage_v2.snappy.metadata.json b/test/files/datapage_v2.snappy.metadata.json index 1527643..d38e215 100644 --- a/test/files/datapage_v2.snappy.metadata.json +++ b/test/files/datapage_v2.snappy.metadata.json @@ -17,10 +17,7 @@ "meta_data": { "codec": "SNAPPY", "data_page_offset": 4, - "encodings": [ - 0, - 8 - ], + "encodings": ["PLAIN", "RLE_DICTIONARY"], "num_values": 5, "path_in_schema": ["a"], "statistics": { @@ -38,7 +35,7 @@ "meta_data": { "codec": "SNAPPY", "data_page_offset": 67, - "encodings": [5], + "encodings": ["DELTA_BINARY_PACKED"], "num_values": 5, "path_in_schema": ["b"], "statistics": { @@ -56,10 +53,7 @@ "meta_data": { "codec": "SNAPPY", "data_page_offset": 116, - "encodings": [ - 0, - 8 - ], + "encodings": ["PLAIN", "RLE_DICTIONARY"], "num_values": 5, "path_in_schema": ["c"], "statistics": { @@ -77,7 +71,7 @@ "meta_data": { "codec": "SNAPPY", "data_page_offset": 204, - "encodings": [3], + "encodings": ["RLE"], "num_values": 5, "path_in_schema": ["d"], "statistics": { @@ -95,10 +89,7 @@ "meta_data": { "codec": "SNAPPY", "data_page_offset": 243, - "encodings": [ - 0, - 8 - ], + "encodings": ["PLAIN", "RLE_DICTIONARY"], "num_values": 10, "path_in_schema": [ "e", diff --git a/test/files/nonnullable.impala.metadata.json b/test/files/nonnullable.impala.metadata.json index 826cfa2..654b49e 100644 --- a/test/files/nonnullable.impala.metadata.json +++ b/test/files/nonnullable.impala.metadata.json @@ -17,7 +17,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 4, - "encodings": [ 0, 4 ], + "encodings": [ "PLAIN", "BIT_PACKED" ], "num_values": 1, "path_in_schema": [ "ID" ], "statistics": { @@ -35,7 +35,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 53, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 1, "path_in_schema": [ "Int_Array", "list", "element" ], "statistics": { @@ -53,7 +53,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 102, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 3, "path_in_schema": [ "int_array_array", @@ -77,7 +77,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 157, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 1, "path_in_schema": [ "Int_Map", "map", "key" ], "statistics": { @@ -95,7 +95,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 204, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 1, "path_in_schema": [ "Int_Map", "map", "value" ], "statistics": { @@ -113,7 +113,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 253, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 4, "path_in_schema": [ "int_map_array", @@ -137,7 +137,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 302, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 4, "path_in_schema": [ "int_map_array", @@ -161,7 +161,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 353, - "encodings": [ 0, 4 ], + "encodings": [ "PLAIN", "BIT_PACKED" ], "num_values": 1, "path_in_schema": [ "nested_Struct", "a" ], "statistics": { @@ -179,7 +179,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 390, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 1, "path_in_schema": [ "nested_Struct", "B", "list", "element" ], "statistics": { @@ -197,7 +197,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 439, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 1, "path_in_schema": [ "nested_Struct", @@ -224,7 +224,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 490, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 1, "path_in_schema": [ "nested_Struct", @@ -251,7 +251,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 566, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 1, "path_in_schema": [ "nested_Struct", "G", "map", "key" ], "statistics": { @@ -267,7 +267,7 @@ "meta_data": { "codec": "UNCOMPRESSED", "data_page_offset": 599, - "encodings": [ 0, 3 ], + "encodings": [ "PLAIN", "RLE" ], "num_values": 1, "path_in_schema": [ "nested_Struct", diff --git a/test/files/rowgroups.metadata.json b/test/files/rowgroups.metadata.json index 1560d55..1c931a1 100644 --- a/test/files/rowgroups.metadata.json +++ b/test/files/rowgroups.metadata.json @@ -25,10 +25,10 @@ "data_page_offset": 71, "dictionary_page_offset": 4, "encoding_stats": [ - { "count": 1, "encoding": 0, "page_type": 2 }, - { "count": 1, "encoding": 8, "page_type": 0 } + { "count": 1, "encoding": "PLAIN", "page_type": 2 }, + { "count": 1, "encoding": "RLE_DICTIONARY", "page_type": 0 } ], - "encodings": [0, 3, 8], + "encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"], "num_values": 10, "path_in_schema": ["numbers"], "statistics": { @@ -54,10 +54,10 @@ "data_page_offset": 294, "dictionary_page_offset": 248, "encoding_stats": [ - { "count": 1, "encoding": 0, "page_type": 2 }, - { "count": 1, "encoding": 8, "page_type": 0 } + { "count": 1, "encoding": "PLAIN", "page_type": 2 }, + { "count": 1, "encoding": "RLE_DICTIONARY", "page_type": 0 } ], - "encodings": [0, 3, 8], + "encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"], "num_values": 5, "path_in_schema": ["numbers"], "statistics": {