mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-02 10:06:38 +00:00
Encoding as string
This commit is contained in:
parent
e3b5fca883
commit
11f35c9e43
@ -1,4 +1,4 @@
|
||||
import { Encoding, PageType } from './constants.js'
|
||||
import { PageType } from './constants.js'
|
||||
import { convert } from './convert.js'
|
||||
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
|
||||
import { readDataPageV2 } from './datapageV2.js'
|
||||
@ -57,7 +57,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
|
||||
const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schema, columnMetadata)
|
||||
valuesSeen += daph.num_values
|
||||
|
||||
const dictionaryEncoding = daph.encoding === Encoding.PLAIN_DICTIONARY || daph.encoding === Encoding.RLE_DICTIONARY
|
||||
const dictionaryEncoding = daph.encoding === 'PLAIN_DICTIONARY' || daph.encoding === 'RLE_DICTIONARY'
|
||||
|
||||
// construct output values: skip nulls and construct lists
|
||||
/** @type {any[]} */
|
||||
|
||||
@ -9,17 +9,18 @@ export const ParquetType = {
|
||||
FIXED_LEN_BYTE_ARRAY: 7,
|
||||
}
|
||||
|
||||
export const Encoding = {
|
||||
PLAIN: 0,
|
||||
PLAIN_DICTIONARY: 2,
|
||||
RLE: 3,
|
||||
BIT_PACKED: 4, // deprecated
|
||||
DELTA_BINARY_PACKED: 5,
|
||||
DELTA_LENGTH_BYTE_ARRAY: 6,
|
||||
DELTA_BYTE_ARRAY: 7,
|
||||
RLE_DICTIONARY: 8,
|
||||
BYTE_STREAM_SPLIT: 9,
|
||||
}
|
||||
export const Encoding = [
|
||||
'PLAIN',
|
||||
undefined,
|
||||
'PLAIN_DICTIONARY',
|
||||
'RLE',
|
||||
'BIT_PACKED', // deprecated
|
||||
'DELTA_BINARY_PACKED',
|
||||
'DELTA_LENGTH_BYTE_ARRAY',
|
||||
'DELTA_BYTE_ARRAY',
|
||||
'RLE_DICTIONARY',
|
||||
'BYTE_STREAM_SPLIT',
|
||||
]
|
||||
|
||||
export const FieldRepetitionType = [
|
||||
'REQUIRED',
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { Encoding, ParquetType } from './constants.js'
|
||||
import { ParquetType } from './constants.js'
|
||||
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
|
||||
import {
|
||||
getMaxDefinitionLevel,
|
||||
@ -61,16 +61,16 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
|
||||
|
||||
// read values based on encoding
|
||||
const nValues = daph.num_values - numNulls
|
||||
if (daph.encoding === Encoding.PLAIN) {
|
||||
if (daph.encoding === 'PLAIN') {
|
||||
const se = schemaElement(schema, columnMetadata.path_in_schema)
|
||||
const utf8 = se.converted_type === 'UTF8'
|
||||
const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8)
|
||||
values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value)
|
||||
offset += plainObj.byteLength
|
||||
} else if (
|
||||
daph.encoding === Encoding.PLAIN_DICTIONARY ||
|
||||
daph.encoding === Encoding.RLE_DICTIONARY ||
|
||||
daph.encoding === Encoding.RLE
|
||||
daph.encoding === 'PLAIN_DICTIONARY' ||
|
||||
daph.encoding === 'RLE_DICTIONARY' ||
|
||||
daph.encoding === 'RLE'
|
||||
) {
|
||||
// bit width is stored as single byte
|
||||
let bitWidth
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
import { decompressPage } from './column.js'
|
||||
import { Encoding } from './constants.js'
|
||||
import { readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
|
||||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, schemaElement } from './schema.js'
|
||||
import { readVarInt, readZigZag } from './thrift.js'
|
||||
@ -47,7 +46,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
|
||||
|
||||
// read values based on encoding
|
||||
const nValues = daph2.num_values - daph2.num_nulls
|
||||
if (daph2.encoding === Encoding.PLAIN) {
|
||||
if (daph2.encoding === 'PLAIN') {
|
||||
const se = schemaElement(schema, columnMetadata.path_in_schema)
|
||||
const utf8 = se.converted_type === 'UTF8'
|
||||
let page = compressedBytes.slice(offset)
|
||||
@ -57,7 +56,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
|
||||
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
|
||||
const plainObj = readPlain(pageView, columnMetadata.type, nValues, 0, utf8)
|
||||
values = plainObj.value
|
||||
} else if (daph2.encoding === Encoding.RLE) {
|
||||
} else if (daph2.encoding === 'RLE') {
|
||||
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
|
||||
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
|
||||
const bitWidth = 1
|
||||
@ -69,8 +68,8 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
|
||||
).value
|
||||
}
|
||||
} else if (
|
||||
daph2.encoding === Encoding.PLAIN_DICTIONARY ||
|
||||
daph2.encoding === Encoding.RLE_DICTIONARY
|
||||
daph2.encoding === 'PLAIN_DICTIONARY' ||
|
||||
daph2.encoding === 'RLE_DICTIONARY'
|
||||
) {
|
||||
compressedBytes = compressedBytes.subarray(offset)
|
||||
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
|
||||
@ -81,7 +80,7 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
|
||||
pageView, 1, bitWidth, uncompressedPageSize, nValues
|
||||
)
|
||||
values = value
|
||||
} else if (daph2.encoding === Encoding.DELTA_BINARY_PACKED) {
|
||||
} else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
|
||||
if (daph2.num_nulls) throw new Error('parquet delta-int not supported')
|
||||
const codec = daph2.is_compressed ? columnMetadata.codec : 'UNCOMPRESSED'
|
||||
const page = decompressPage(compressedBytes, uncompressedPageSize, codec, compressors)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { Encoding, ParquetType } from './constants.js'
|
||||
import { ParquetType } from './constants.js'
|
||||
import { readVarInt } from './thrift.js'
|
||||
|
||||
/**
|
||||
@ -213,7 +213,7 @@ export function widthFromMaxInt(value) {
|
||||
export function readData(dataView, encoding, offset, count, bitWidth) {
|
||||
const value = []
|
||||
let byteLength = 0
|
||||
if (encoding === Encoding.RLE) {
|
||||
if (encoding === 'RLE') {
|
||||
let seen = 0
|
||||
while (seen < count) {
|
||||
const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import { Encoding } from './constants.js'
|
||||
import { deserializeTCompactProtocol } from './thrift.js'
|
||||
|
||||
/**
|
||||
@ -26,9 +27,9 @@ export function parquetHeader(arrayBuffer, offset) {
|
||||
const crc = header.field_4
|
||||
const data_page_header = header.field_5 && {
|
||||
num_values: header.field_5.field_1,
|
||||
encoding: header.field_5.field_2,
|
||||
definition_level_encoding: header.field_5.field_3,
|
||||
repetition_level_encoding: header.field_5.field_4,
|
||||
encoding: Encoding[header.field_5.field_2],
|
||||
definition_level_encoding: Encoding[header.field_5.field_3],
|
||||
repetition_level_encoding: Encoding[header.field_5.field_4],
|
||||
statistics: header.field_5.field_5 && {
|
||||
max: header.field_5.field_5.field_1,
|
||||
min: header.field_5.field_5.field_2,
|
||||
@ -41,14 +42,14 @@ export function parquetHeader(arrayBuffer, offset) {
|
||||
const index_page_header = header.field_6
|
||||
const dictionary_page_header = header.field_7 && {
|
||||
num_values: header.field_7.field_1,
|
||||
encoding: header.field_7.field_2,
|
||||
encoding: Encoding[header.field_7.field_2],
|
||||
is_sorted: header.field_7.field_3,
|
||||
}
|
||||
const data_page_header_v2 = header.field_8 && {
|
||||
num_values: header.field_8.field_1,
|
||||
num_nulls: header.field_8.field_2,
|
||||
num_rows: header.field_8.field_3,
|
||||
encoding: header.field_8.field_4,
|
||||
encoding: Encoding[header.field_8.field_4],
|
||||
definition_levels_byte_length: header.field_8.field_5,
|
||||
repetition_levels_byte_length: header.field_8.field_6,
|
||||
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default to true
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { CompressionCodec, ConvertedType, FieldRepetitionType } from './constants.js'
|
||||
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType } from './constants.js'
|
||||
import { schemaTree } from './schema.js'
|
||||
import { deserializeTCompactProtocol } from './thrift.js'
|
||||
|
||||
@ -113,7 +113,7 @@ export function parquetMetadata(arrayBuffer) {
|
||||
file_offset: column.field_2,
|
||||
meta_data: column.field_3 && {
|
||||
type: column.field_3.field_1,
|
||||
encodings: column.field_3.field_2,
|
||||
encodings: column.field_3.field_2?.map((/** @type {number} */ e) => Encoding[e]),
|
||||
path_in_schema: column.field_3.field_3,
|
||||
codec: CompressionCodec[column.field_3.field_4],
|
||||
num_values: column.field_3.field_5,
|
||||
@ -131,7 +131,7 @@ export function parquetMetadata(arrayBuffer) {
|
||||
},
|
||||
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
|
||||
page_type: encodingStat.field_1,
|
||||
encoding: encodingStat.field_2,
|
||||
encoding: Encoding[encodingStat.field_2],
|
||||
count: encodingStat.field_3,
|
||||
})),
|
||||
},
|
||||
|
||||
21
src/types.d.ts
vendored
21
src/types.d.ts
vendored
@ -113,17 +113,16 @@ export interface ColumnMetaData {
|
||||
encoding_stats?: PageEncodingStats[]
|
||||
}
|
||||
|
||||
export enum Encoding {
|
||||
PLAIN = 0,
|
||||
PLAIN_DICTIONARY = 2,
|
||||
RLE = 3,
|
||||
BIT_PACKED = 4, // deprecated
|
||||
DELTA_BINARY_PACKED = 5,
|
||||
DELTA_LENGTH_BYTE_ARRAY = 6,
|
||||
DELTA_BYTE_ARRAY = 7,
|
||||
RLE_DICTIONARY = 8,
|
||||
BYTE_STREAM_SPLIT = 9,
|
||||
}
|
||||
export type Encoding =
|
||||
'PLAIN' |
|
||||
'PLAIN_DICTIONARY' |
|
||||
'RLE' |
|
||||
'BIT_PACKED' | // deprecated
|
||||
'DELTA_BINARY_PACKED' |
|
||||
'DELTA_LENGTH_BYTE_ARRAY' |
|
||||
'DELTA_BYTE_ARRAY' |
|
||||
'RLE_DICTIONARY' |
|
||||
'BYTE_STREAM_SPLIT'
|
||||
|
||||
export type CompressionCodec =
|
||||
'UNCOMPRESSED' |
|
||||
|
||||
@ -23,7 +23,7 @@
|
||||
"file_offset": 0,
|
||||
"meta_data": {
|
||||
"type": 6,
|
||||
"encodings": [0, 8],
|
||||
"encodings": ["PLAIN", "RLE_DICTIONARY"],
|
||||
"path_in_schema": ["ADDRTYPE"],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 10,
|
||||
|
||||
@ -11,8 +11,8 @@
|
||||
"codec": "GZIP",
|
||||
"data_page_offset": 4,
|
||||
"encodings": [
|
||||
0,
|
||||
3
|
||||
"PLAIN",
|
||||
"RLE"
|
||||
],
|
||||
"num_values": 513,
|
||||
"path_in_schema": [
|
||||
|
||||
@ -17,10 +17,7 @@
|
||||
"meta_data": {
|
||||
"codec": "SNAPPY",
|
||||
"data_page_offset": 4,
|
||||
"encodings": [
|
||||
0,
|
||||
8
|
||||
],
|
||||
"encodings": ["PLAIN", "RLE_DICTIONARY"],
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["a"],
|
||||
"statistics": {
|
||||
@ -38,7 +35,7 @@
|
||||
"meta_data": {
|
||||
"codec": "SNAPPY",
|
||||
"data_page_offset": 67,
|
||||
"encodings": [5],
|
||||
"encodings": ["DELTA_BINARY_PACKED"],
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["b"],
|
||||
"statistics": {
|
||||
@ -56,10 +53,7 @@
|
||||
"meta_data": {
|
||||
"codec": "SNAPPY",
|
||||
"data_page_offset": 116,
|
||||
"encodings": [
|
||||
0,
|
||||
8
|
||||
],
|
||||
"encodings": ["PLAIN", "RLE_DICTIONARY"],
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["c"],
|
||||
"statistics": {
|
||||
@ -77,7 +71,7 @@
|
||||
"meta_data": {
|
||||
"codec": "SNAPPY",
|
||||
"data_page_offset": 204,
|
||||
"encodings": [3],
|
||||
"encodings": ["RLE"],
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["d"],
|
||||
"statistics": {
|
||||
@ -95,10 +89,7 @@
|
||||
"meta_data": {
|
||||
"codec": "SNAPPY",
|
||||
"data_page_offset": 243,
|
||||
"encodings": [
|
||||
0,
|
||||
8
|
||||
],
|
||||
"encodings": ["PLAIN", "RLE_DICTIONARY"],
|
||||
"num_values": 10,
|
||||
"path_in_schema": [
|
||||
"e",
|
||||
|
||||
@ -17,7 +17,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 4,
|
||||
"encodings": [ 0, 4 ],
|
||||
"encodings": [ "PLAIN", "BIT_PACKED" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "ID" ],
|
||||
"statistics": {
|
||||
@ -35,7 +35,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 53,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "Int_Array", "list", "element" ],
|
||||
"statistics": {
|
||||
@ -53,7 +53,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 102,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 3,
|
||||
"path_in_schema": [
|
||||
"int_array_array",
|
||||
@ -77,7 +77,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 157,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "Int_Map", "map", "key" ],
|
||||
"statistics": {
|
||||
@ -95,7 +95,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 204,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "Int_Map", "map", "value" ],
|
||||
"statistics": {
|
||||
@ -113,7 +113,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 253,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 4,
|
||||
"path_in_schema": [
|
||||
"int_map_array",
|
||||
@ -137,7 +137,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 302,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 4,
|
||||
"path_in_schema": [
|
||||
"int_map_array",
|
||||
@ -161,7 +161,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 353,
|
||||
"encodings": [ 0, 4 ],
|
||||
"encodings": [ "PLAIN", "BIT_PACKED" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "nested_Struct", "a" ],
|
||||
"statistics": {
|
||||
@ -179,7 +179,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 390,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "nested_Struct", "B", "list", "element" ],
|
||||
"statistics": {
|
||||
@ -197,7 +197,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 439,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [
|
||||
"nested_Struct",
|
||||
@ -224,7 +224,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 490,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [
|
||||
"nested_Struct",
|
||||
@ -251,7 +251,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 566,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "nested_Struct", "G", "map", "key" ],
|
||||
"statistics": {
|
||||
@ -267,7 +267,7 @@
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 599,
|
||||
"encodings": [ 0, 3 ],
|
||||
"encodings": [ "PLAIN", "RLE" ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [
|
||||
"nested_Struct",
|
||||
|
||||
@ -25,10 +25,10 @@
|
||||
"data_page_offset": 71,
|
||||
"dictionary_page_offset": 4,
|
||||
"encoding_stats": [
|
||||
{ "count": 1, "encoding": 0, "page_type": 2 },
|
||||
{ "count": 1, "encoding": 8, "page_type": 0 }
|
||||
{ "count": 1, "encoding": "PLAIN", "page_type": 2 },
|
||||
{ "count": 1, "encoding": "RLE_DICTIONARY", "page_type": 0 }
|
||||
],
|
||||
"encodings": [0, 3, 8],
|
||||
"encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"],
|
||||
"num_values": 10,
|
||||
"path_in_schema": ["numbers"],
|
||||
"statistics": {
|
||||
@ -54,10 +54,10 @@
|
||||
"data_page_offset": 294,
|
||||
"dictionary_page_offset": 248,
|
||||
"encoding_stats": [
|
||||
{ "count": 1, "encoding": 0, "page_type": 2 },
|
||||
{ "count": 1, "encoding": 8, "page_type": 0 }
|
||||
{ "count": 1, "encoding": "PLAIN", "page_type": 2 },
|
||||
{ "count": 1, "encoding": "RLE_DICTIONARY", "page_type": 0 }
|
||||
],
|
||||
"encodings": [0, 3, 8],
|
||||
"encodings": ["PLAIN", "RLE", "RLE_DICTIONARY"],
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["numbers"],
|
||||
"statistics": {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user