Encoding as string

This commit is contained in:
Kenny Daniel 2024-02-27 10:33:17 -08:00
parent e3b5fca883
commit 11f35c9e43
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
13 changed files with 72 additions and 81 deletions

@ -1,4 +1,4 @@
import { Encoding, PageType } from './constants.js'
import { PageType } from './constants.js'
import { convert } from './convert.js'
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
@ -57,7 +57,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schema, columnMetadata)
valuesSeen += daph.num_values
const dictionaryEncoding = daph.encoding === Encoding.PLAIN_DICTIONARY || daph.encoding === Encoding.RLE_DICTIONARY
const dictionaryEncoding = daph.encoding === 'PLAIN_DICTIONARY' || daph.encoding === 'RLE_DICTIONARY'
// construct output values: skip nulls and construct lists
/** @type {any[]} */

@ -9,17 +9,18 @@ export const ParquetType = {
FIXED_LEN_BYTE_ARRAY: 7,
}
export const Encoding = {
PLAIN: 0,
PLAIN_DICTIONARY: 2,
RLE: 3,
BIT_PACKED: 4, // deprecated
DELTA_BINARY_PACKED: 5,
DELTA_LENGTH_BYTE_ARRAY: 6,
DELTA_BYTE_ARRAY: 7,
RLE_DICTIONARY: 8,
BYTE_STREAM_SPLIT: 9,
}
/**
 * Encoding names, positioned so that the array index is the numeric
 * encoding code: `Encoding[code]` gives the name as a string
 * (e.g. index 0 is 'PLAIN', index 8 is 'RLE_DICTIONARY').
 *
 * The order of the entries is significant; inserting or removing an
 * element shifts every name after it to a different code.
 * Looking up a code with no entry yields `undefined`.
 */
export const Encoding = [
'PLAIN',
// No name is assigned to code 1; this placeholder keeps the following
// entries at their numeric positions.
// NOTE(review): presumably code 1 is unused in the Parquet format — confirm against parquet.thrift
undefined,
'PLAIN_DICTIONARY',
'RLE',
'BIT_PACKED', // deprecated
'DELTA_BINARY_PACKED',
'DELTA_LENGTH_BYTE_ARRAY',
'DELTA_BYTE_ARRAY',
'RLE_DICTIONARY',
'BYTE_STREAM_SPLIT',
]
export const FieldRepetitionType = [
'REQUIRED',

@ -1,4 +1,4 @@
import { Encoding, ParquetType } from './constants.js'
import { ParquetType } from './constants.js'
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import {
getMaxDefinitionLevel,
@ -61,16 +61,16 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
// read values based on encoding
const nValues = daph.num_values - numNulls
if (daph.encoding === Encoding.PLAIN) {
if (daph.encoding === 'PLAIN') {
const se = schemaElement(schema, columnMetadata.path_in_schema)
const utf8 = se.converted_type === 'UTF8'
const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8)
values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value)
offset += plainObj.byteLength
} else if (
daph.encoding === Encoding.PLAIN_DICTIONARY ||
daph.encoding === Encoding.RLE_DICTIONARY ||
daph.encoding === Encoding.RLE
daph.encoding === 'PLAIN_DICTIONARY' ||
daph.encoding === 'RLE_DICTIONARY' ||
daph.encoding === 'RLE'
) {
// bit width is stored as single byte
let bitWidth

@ -1,5 +1,4 @@
import { decompressPage } from './column.js'
import { Encoding } from './constants.js'
import { readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel, schemaElement } from './schema.js'
import { readVarInt, readZigZag } from './thrift.js'
@ -47,7 +46,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
// read values based on encoding
const nValues = daph2.num_values - daph2.num_nulls
if (daph2.encoding === Encoding.PLAIN) {
if (daph2.encoding === 'PLAIN') {
const se = schemaElement(schema, columnMetadata.path_in_schema)
const utf8 = se.converted_type === 'UTF8'
let page = compressedBytes.slice(offset)
@ -57,7 +56,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const plainObj = readPlain(pageView, columnMetadata.type, nValues, 0, utf8)
values = plainObj.value
} else if (daph2.encoding === Encoding.RLE) {
} else if (daph2.encoding === 'RLE') {
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const bitWidth = 1
@ -69,8 +68,8 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
).value
}
} else if (
daph2.encoding === Encoding.PLAIN_DICTIONARY ||
daph2.encoding === Encoding.RLE_DICTIONARY
daph2.encoding === 'PLAIN_DICTIONARY' ||
daph2.encoding === 'RLE_DICTIONARY'
) {
compressedBytes = compressedBytes.subarray(offset)
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
@ -81,7 +80,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
pageView, 1, bitWidth, uncompressedPageSize, nValues
)
values = value
} else if (daph2.encoding === Encoding.DELTA_BINARY_PACKED) {
} else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
if (daph2.num_nulls) throw new Error('parquet delta-int not supported')
const codec = daph2.is_compressed ? columnMetadata.codec : 'UNCOMPRESSED'
const page = decompressPage(compressedBytes, uncompressedPageSize, codec, compressors)

@ -1,4 +1,4 @@
import { Encoding, ParquetType } from './constants.js'
import { ParquetType } from './constants.js'
import { readVarInt } from './thrift.js'
/**
@ -213,7 +213,7 @@ export function widthFromMaxInt(value) {
export function readData(dataView, encoding, offset, count, bitWidth) {
const value = []
let byteLength = 0
if (encoding === Encoding.RLE) {
if (encoding === 'RLE') {
let seen = 0
while (seen < count) {
const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)

@ -1,3 +1,4 @@
import { Encoding } from './constants.js'
import { deserializeTCompactProtocol } from './thrift.js'
/**
@ -26,9 +27,9 @@ export function parquetHeader(arrayBuffer, offset) {
const crc = header.field_4
const data_page_header = header.field_5 && {
num_values: header.field_5.field_1,
encoding: header.field_5.field_2,
definition_level_encoding: header.field_5.field_3,
repetition_level_encoding: header.field_5.field_4,
encoding: Encoding[header.field_5.field_2],
definition_level_encoding: Encoding[header.field_5.field_3],
repetition_level_encoding: Encoding[header.field_5.field_4],
statistics: header.field_5.field_5 && {
max: header.field_5.field_5.field_1,
min: header.field_5.field_5.field_2,
@ -41,14 +42,14 @@ export function parquetHeader(arrayBuffer, offset) {
const index_page_header = header.field_6
const dictionary_page_header = header.field_7 && {
num_values: header.field_7.field_1,
encoding: header.field_7.field_2,
encoding: Encoding[header.field_7.field_2],
is_sorted: header.field_7.field_3,
}
const data_page_header_v2 = header.field_8 && {
num_values: header.field_8.field_1,
num_nulls: header.field_8.field_2,
num_rows: header.field_8.field_3,
encoding: header.field_8.field_4,
encoding: Encoding[header.field_8.field_4],
definition_levels_byte_length: header.field_8.field_5,
repetition_levels_byte_length: header.field_8.field_6,
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default to true

@ -1,4 +1,4 @@
import { CompressionCodec, ConvertedType, FieldRepetitionType } from './constants.js'
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType } from './constants.js'
import { schemaTree } from './schema.js'
import { deserializeTCompactProtocol } from './thrift.js'
@ -113,7 +113,7 @@ export function parquetMetadata(arrayBuffer) {
file_offset: column.field_2,
meta_data: column.field_3 && {
type: column.field_3.field_1,
encodings: column.field_3.field_2,
encodings: column.field_3.field_2?.map((/** @type {number} */ e) => Encoding[e]),
path_in_schema: column.field_3.field_3,
codec: CompressionCodec[column.field_3.field_4],
num_values: column.field_3.field_5,
@ -131,7 +131,7 @@ export function parquetMetadata(arrayBuffer) {
},
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
page_type: encodingStat.field_1,
encoding: encodingStat.field_2,
encoding: Encoding[encodingStat.field_2],
count: encodingStat.field_3,
})),
},

21
src/types.d.ts vendored

@ -113,17 +113,16 @@ export interface ColumnMetaData {
encoding_stats?: PageEncodingStats[]
}
export enum Encoding {
PLAIN = 0,
PLAIN_DICTIONARY = 2,
RLE = 3,
BIT_PACKED = 4, // deprecated
DELTA_BINARY_PACKED = 5,
DELTA_LENGTH_BYTE_ARRAY = 6,
DELTA_BYTE_ARRAY = 7,
RLE_DICTIONARY = 8,
BYTE_STREAM_SPLIT = 9,
}
/**
 * Name of an encoding, expressed as a string literal rather than a
 * numeric code. The members are the same names that appear in the
 * `Encoding` lookup array exported from constants.js.
 */
export type Encoding =
'PLAIN' |
'PLAIN_DICTIONARY' |
'RLE' |
'BIT_PACKED' | // deprecated
'DELTA_BINARY_PACKED' |
'DELTA_LENGTH_BYTE_ARRAY' |
'DELTA_BYTE_ARRAY' |
'RLE_DICTIONARY' |
'BYTE_STREAM_SPLIT'
export type CompressionCodec =
'UNCOMPRESSED' |

@ -23,7 +23,7 @@
"file_offset": 0,
"meta_data": {
"type": 6,
"encodings": [0, 8],
"encodings": ["PLAIN", "RLE_DICTIONARY"],
"path_in_schema": ["ADDRTYPE"],
"codec": "SNAPPY",
"num_values": 10,

@ -11,8 +11,8 @@
"codec": "GZIP",
"data_page_offset": 4,
"encodings": [
0,
3
"PLAIN",
"RLE"
],
"num_values": 513,
"path_in_schema": [

@ -17,10 +17,7 @@
"meta_data": {
"codec": "SNAPPY",
"data_page_offset": 4,
"encodings": [
0,
8
],
"encodings": ["PLAIN", "RLE_DICTIONARY"],
"num_values": 5,
"path_in_schema": ["a"],
"statistics": {
@ -38,7 +35,7 @@
"meta_data": {
"codec": "SNAPPY",
"data_page_offset": 67,
"encodings": [5],
"encodings": ["DELTA_BINARY_PACKED"],
"num_values": 5,
"path_in_schema": ["b"],
"statistics": {
@ -56,10 +53,7 @@
"meta_data": {
"codec": "SNAPPY",
"data_page_offset": 116,
"encodings": [
0,
8
],
"encodings": ["PLAIN", "RLE_DICTIONARY"],
"num_values": 5,
"path_in_schema": ["c"],
"statistics": {
@ -77,7 +71,7 @@
"meta_data": {
"codec": "SNAPPY",
"data_page_offset": 204,
"encodings": [3],
"encodings": ["RLE"],
"num_values": 5,
"path_in_schema": ["d"],
"statistics": {
@ -95,10 +89,7 @@
"meta_data": {
"codec": "SNAPPY",
"data_page_offset": 243,
"encodings": [
0,
8
],
"encodings": ["PLAIN", "RLE_DICTIONARY"],
"num_values": 10,
"path_in_schema": [
"e",

@ -17,7 +17,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 4,
"encodings": [ 0, 4 ],
"encodings": [ "PLAIN", "BIT_PACKED" ],
"num_values": 1,
"path_in_schema": [ "ID" ],
"statistics": {
@ -35,7 +35,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 53,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 1,
"path_in_schema": [ "Int_Array", "list", "element" ],
"statistics": {
@ -53,7 +53,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 102,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 3,
"path_in_schema": [
"int_array_array",
@ -77,7 +77,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 157,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 1,
"path_in_schema": [ "Int_Map", "map", "key" ],
"statistics": {
@ -95,7 +95,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 204,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 1,
"path_in_schema": [ "Int_Map", "map", "value" ],
"statistics": {
@ -113,7 +113,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 253,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 4,
"path_in_schema": [
"int_map_array",
@ -137,7 +137,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 302,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 4,
"path_in_schema": [
"int_map_array",
@ -161,7 +161,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 353,
"encodings": [ 0, 4 ],
"encodings": [ "PLAIN", "BIT_PACKED" ],
"num_values": 1,
"path_in_schema": [ "nested_Struct", "a" ],
"statistics": {
@ -179,7 +179,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 390,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 1,
"path_in_schema": [ "nested_Struct", "B", "list", "element" ],
"statistics": {
@ -197,7 +197,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 439,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 1,
"path_in_schema": [
"nested_Struct",
@ -224,7 +224,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 490,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 1,
"path_in_schema": [
"nested_Struct",
@ -251,7 +251,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 566,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 1,
"path_in_schema": [ "nested_Struct", "G", "map", "key" ],
"statistics": {
@ -267,7 +267,7 @@
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 599,
"encodings": [ 0, 3 ],
"encodings": [ "PLAIN", "RLE" ],
"num_values": 1,
"path_in_schema": [
"nested_Struct",

@ -25,10 +25,10 @@
"data_page_offset": 71,
"dictionary_page_offset": 4,
"encoding_stats": [
{ "count": 1, "encoding": 0, "page_type": 2 },
{ "count": 1, "encoding": 8, "page_type": 0 }
{ "count": 1, "encoding": "PLAIN", "page_type": 2 },
{ "count": 1, "encoding": "RLE_DICTIONARY", "page_type": 0 }
],
"encodings": [0, 3, 8],
"encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"],
"num_values": 10,
"path_in_schema": ["numbers"],
"statistics": {
@ -54,10 +54,10 @@
"data_page_offset": 294,
"dictionary_page_offset": 248,
"encoding_stats": [
{ "count": 1, "encoding": 0, "page_type": 2 },
{ "count": 1, "encoding": 8, "page_type": 0 }
{ "count": 1, "encoding": "PLAIN", "page_type": 2 },
{ "count": 1, "encoding": "RLE_DICTIONARY", "page_type": 0 }
],
"encodings": [0, 3, 8],
"encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"],
"num_values": 5,
"path_in_schema": ["numbers"],
"statistics": {