mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-05 22:41:55 +00:00
Human readable metadata types
This commit is contained in:
parent
17f7ace840
commit
8f7cd07734
@ -1,4 +1,4 @@
|
||||
import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
|
||||
import { Encoding, PageType } from './constants.js'
|
||||
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
|
||||
import { parquetHeader } from './header.js'
|
||||
import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
|
||||
@ -49,14 +49,13 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
|
||||
let page
|
||||
const uncompressed_page_size = Number(header.uncompressed_page_size)
|
||||
const { codec } = columnMetadata
|
||||
if (codec === CompressionCodec.UNCOMPRESSED) {
|
||||
if (codec === 'UNCOMPRESSED') {
|
||||
page = compressedBytes
|
||||
} else if (codec === CompressionCodec.SNAPPY) {
|
||||
} else if (codec === 'SNAPPY') {
|
||||
page = new Uint8Array(uncompressed_page_size)
|
||||
snappyUncompress(compressedBytes, page)
|
||||
} else {
|
||||
const compressor = Object.entries(CompressionCodec).find(([, value]) => value === codec)
|
||||
throw new Error(`parquet unsupported compression codec: ${codec} ${compressor?.[0]}`)
|
||||
throw new Error(`parquet unsupported compression codec: ${codec}`)
|
||||
}
|
||||
if (page?.length !== uncompressed_page_size) {
|
||||
throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
|
||||
@ -178,11 +177,11 @@ export function getColumnOffset(columnMetadata) {
|
||||
function convert(data, schemaElement) {
|
||||
const ctype = schemaElement.converted_type
|
||||
if (ctype === undefined) return data
|
||||
if (ctype === ConvertedType.UTF8) {
|
||||
if (ctype === 'UTF8') {
|
||||
const decoder = new TextDecoder()
|
||||
return data.map(v => decoder.decode(v))
|
||||
}
|
||||
if (ctype === ConvertedType.DECIMAL) {
|
||||
if (ctype === 'DECIMAL') {
|
||||
const scaleFactor = Math.pow(10, schemaElement.scale || 0)
|
||||
if (typeof data[0] === 'number') {
|
||||
return data.map(v => v * scaleFactor)
|
||||
@ -191,19 +190,19 @@ function convert(data, schemaElement) {
|
||||
throw new Error('parquet decimal byte string not supported')
|
||||
}
|
||||
}
|
||||
if (ctype === ConvertedType.DATE) {
|
||||
if (ctype === 'DATE') {
|
||||
return data.map(v => new Date(v * dayMillis))
|
||||
}
|
||||
if (ctype === ConvertedType.TIME_MILLIS) {
|
||||
if (ctype === 'TIME_MILLIS') {
|
||||
return data.map(v => new Date(v))
|
||||
}
|
||||
if (ctype === ConvertedType.JSON) {
|
||||
if (ctype === 'JSON') {
|
||||
return data.map(v => JSON.parse(v))
|
||||
}
|
||||
if (ctype === ConvertedType.BSON) {
|
||||
if (ctype === 'BSON') {
|
||||
throw new Error('parquet bson not supported')
|
||||
}
|
||||
if (ctype === ConvertedType.INTERVAL) {
|
||||
if (ctype === 'INTERVAL') {
|
||||
throw new Error('parquet interval not supported')
|
||||
}
|
||||
return data
|
||||
|
||||
110
src/constants.js
110
src/constants.js
@ -9,67 +9,6 @@ export const ParquetType = {
|
||||
FIXED_LEN_BYTE_ARRAY: 7,
|
||||
}
|
||||
|
||||
export const ParquetEncoding = {
|
||||
PLAIN: 0,
|
||||
PLAIN_DICTIONARY: 2,
|
||||
RLE: 3,
|
||||
BIT_PACKED: 4, // deprecated
|
||||
DELTA_BINARY_PACKED: 5,
|
||||
DELTA_LENGTH_BYTE_ARRAY: 6,
|
||||
DELTA_BYTE_ARRAY: 7,
|
||||
RLE_DICTIONARY: 8,
|
||||
BYTE_STREAM_SPLIT: 9,
|
||||
}
|
||||
|
||||
export const FieldRepetitionType = {
|
||||
REQUIRED: 0,
|
||||
OPTIONAL: 1,
|
||||
REPEATED: 2,
|
||||
}
|
||||
|
||||
export const ConvertedType = {
|
||||
UTF8: 0,
|
||||
MAP: 1,
|
||||
MAP_KEY_VALUE: 2,
|
||||
LIST: 3,
|
||||
ENUM: 4,
|
||||
DECIMAL: 5,
|
||||
DATE: 6,
|
||||
TIME_MILLIS: 7,
|
||||
TIME_MICROS: 8,
|
||||
TIMESTAMP_MILLIS: 9,
|
||||
TIMESTAMP_MICROS: 10,
|
||||
UINT_8: 11,
|
||||
UINT_16: 12,
|
||||
UINT_32: 13,
|
||||
UINT_64: 14,
|
||||
INT_8: 15,
|
||||
INT_16: 16,
|
||||
INT_32: 17,
|
||||
INT_64: 18,
|
||||
JSON: 19,
|
||||
BSON: 20,
|
||||
INTERVAL: 21,
|
||||
}
|
||||
|
||||
export const CompressionCodec = {
|
||||
UNCOMPRESSED: 0,
|
||||
SNAPPY: 1,
|
||||
GZIP: 2,
|
||||
LZO: 3,
|
||||
BROTLI: 4,
|
||||
LZ4: 5,
|
||||
ZSTD: 6,
|
||||
LZ4_RAW: 7,
|
||||
}
|
||||
|
||||
export const PageType = {
|
||||
DATA_PAGE: 0,
|
||||
INDEX_PAGE: 1,
|
||||
DICTIONARY_PAGE: 2,
|
||||
DATA_PAGE_V2: 3,
|
||||
}
|
||||
|
||||
export const Encoding = {
|
||||
PLAIN: 0,
|
||||
PLAIN_DICTIONARY: 2,
|
||||
@ -81,3 +20,52 @@ export const Encoding = {
|
||||
RLE_DICTIONARY: 8,
|
||||
BYTE_STREAM_SPLIT: 9,
|
||||
}
|
||||
|
||||
export const FieldRepetitionType = [
|
||||
'REQUIRED',
|
||||
'OPTIONAL',
|
||||
'REPEATED',
|
||||
]
|
||||
|
||||
export const ConvertedType = [
|
||||
'UTF8',
|
||||
'MAP',
|
||||
'MAP_KEY_VALUE',
|
||||
'LIST',
|
||||
'ENUM',
|
||||
'DECIMAL',
|
||||
'DATE',
|
||||
'TIME_MILLIS',
|
||||
'TIME_MICROS',
|
||||
'TIMESTAMP_MILLIS',
|
||||
'TIMESTAMP_MICROS',
|
||||
'UINT_8',
|
||||
'UINT_16',
|
||||
'UINT_32',
|
||||
'UINT_64',
|
||||
'INT_8',
|
||||
'INT_16',
|
||||
'INT_32',
|
||||
'INT_64',
|
||||
'JSON',
|
||||
'BSON',
|
||||
'INTERVAL',
|
||||
]
|
||||
|
||||
export const CompressionCodec = [
|
||||
'UNCOMPRESSED',
|
||||
'SNAPPY',
|
||||
'GZIP',
|
||||
'LZO',
|
||||
'BROTLI',
|
||||
'LZ4',
|
||||
'ZSTD',
|
||||
'LZ4_RAW',
|
||||
]
|
||||
|
||||
export const PageType = {
|
||||
DATA_PAGE: 0,
|
||||
INDEX_PAGE: 1,
|
||||
DICTIONARY_PAGE: 2,
|
||||
DATA_PAGE_V2: 3,
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { ParquetEncoding, ParquetType } from './constants.js'
|
||||
import { Encoding, ParquetType } from './constants.js'
|
||||
import { readVarInt } from './thrift.js'
|
||||
|
||||
/**
|
||||
@ -203,7 +203,7 @@ export function widthFromMaxInt(value) {
|
||||
export function readData(dataView, encoding, offset, count, bitWidth) {
|
||||
const value = []
|
||||
let byteLength = 0
|
||||
if (encoding === ParquetEncoding.RLE) {
|
||||
if (encoding === Encoding.RLE) {
|
||||
let seen = 0
|
||||
while (seen < count) {
|
||||
const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import { CompressionCodec, ConvertedType, FieldRepetitionType } from './constants.js'
|
||||
import { schemaTree } from './schema.js'
|
||||
import { deserializeTCompactProtocol } from './thrift.js'
|
||||
|
||||
@ -97,10 +98,10 @@ export function parquetMetadata(arrayBuffer) {
|
||||
const schema = metadata.field_2.map((/** @type {any} */ field) => ({
|
||||
type: field.field_1,
|
||||
type_length: field.field_2,
|
||||
repetition_type: field.field_3,
|
||||
repetition_type: FieldRepetitionType[field.field_3],
|
||||
name: field.field_4,
|
||||
num_children: field.field_5,
|
||||
converted_type: field.field_6,
|
||||
converted_type: ConvertedType[field.field_6],
|
||||
scale: field.field_7,
|
||||
precision: field.field_8,
|
||||
field_id: field.field_9,
|
||||
@ -114,7 +115,7 @@ export function parquetMetadata(arrayBuffer) {
|
||||
type: column.field_3.field_1,
|
||||
encodings: column.field_3.field_2,
|
||||
path_in_schema: column.field_3.field_3,
|
||||
codec: column.field_3.field_4,
|
||||
codec: CompressionCodec[column.field_3.field_4],
|
||||
num_values: column.field_3.field_5,
|
||||
total_uncompressed_size: column.field_3.field_6,
|
||||
total_compressed_size: column.field_3.field_7,
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
import { FieldRepetitionType } from './constants.js'
|
||||
|
||||
/**
|
||||
* @typedef {import('./types.js').SchemaElement} SchemaElement
|
||||
* @typedef {import('./types.js').SchemaTree} SchemaTree
|
||||
@ -57,7 +55,7 @@ export function schemaElement(schema, name) {
|
||||
* @returns {boolean} true if the element is required
|
||||
*/
|
||||
export function isRequired(schema, name) {
|
||||
return schemaElement(schema, name).repetition_type === FieldRepetitionType.REQUIRED
|
||||
return schemaElement(schema, name).repetition_type === 'REQUIRED'
|
||||
}
|
||||
|
||||
/**
|
||||
@ -71,7 +69,7 @@ export function getMaxRepetitionLevel(schema, parts) {
|
||||
let maxLevel = 0
|
||||
parts.forEach((part, i) => {
|
||||
const element = schemaElement(schema, parts.slice(0, i + 1))
|
||||
if (element.repetition_type === FieldRepetitionType.REPEATED) {
|
||||
if (element.repetition_type === 'REPEATED') {
|
||||
maxLevel += 1
|
||||
}
|
||||
})
|
||||
@ -89,7 +87,7 @@ export function getMaxDefinitionLevel(schema, parts) {
|
||||
let maxLevel = 0
|
||||
parts.forEach((part, i) => {
|
||||
const element = schemaElement(schema, parts.slice(0, i + 1))
|
||||
if (element.repetition_type !== FieldRepetitionType.REQUIRED) {
|
||||
if (element.repetition_type !== 'REQUIRED') {
|
||||
maxLevel += 1
|
||||
}
|
||||
})
|
||||
|
||||
75
src/types.d.ts
vendored
75
src/types.d.ts
vendored
@ -63,36 +63,34 @@ export enum ParquetType {
|
||||
FIXED_LEN_BYTE_ARRAY = 7,
|
||||
}
|
||||
|
||||
export enum FieldRepetitionType {
|
||||
REQUIRED = 0,
|
||||
OPTIONAL = 1,
|
||||
REPEATED = 2,
|
||||
}
|
||||
export type FieldRepetitionType =
|
||||
'REQUIRED' |
|
||||
'OPTIONAL' |
|
||||
'REPEATED'
|
||||
|
||||
export enum ConvertedType {
|
||||
UTF8 = 0,
|
||||
MAP = 1,
|
||||
MAP_KEY_VALUE = 2,
|
||||
LIST = 3,
|
||||
ENUM = 4,
|
||||
DECIMAL = 5,
|
||||
DATE = 6,
|
||||
TIME_MILLIS = 7,
|
||||
TIME_MICROS = 8,
|
||||
TIMESTAMP_MILLIS = 9,
|
||||
TIMESTAMP_MICROS = 10,
|
||||
UINT_8 = 11,
|
||||
UINT_16 = 12,
|
||||
UINT_32 = 13,
|
||||
UINT_64 = 14,
|
||||
INT_8 = 15,
|
||||
INT_16 = 16,
|
||||
INT_32 = 17,
|
||||
INT_64 = 18,
|
||||
JSON = 19,
|
||||
BSON = 20,
|
||||
INTERVAL = 21,
|
||||
}
|
||||
export type ConvertedType =
|
||||
'UTF8' |
|
||||
'MAP' |
|
||||
'MAP_KEY_VALUE' |
|
||||
'LIST' |
|
||||
'ENUM' |
|
||||
'DECIMAL' |
|
||||
'DATE' |
|
||||
'TIME_MILLIS' |
|
||||
'TIME_MICROS' |
|
||||
'TIMESTAMP_MILLIS' |
|
||||
'TIMESTAMP_MICROS' |
|
||||
'UINT_8' |
|
||||
'UINT_16' |
|
||||
'UINT_32' |
|
||||
'UINT_64' |
|
||||
'INT_8' |
|
||||
'INT_16' |
|
||||
'INT_32' |
|
||||
'INT_64' |
|
||||
'JSON' |
|
||||
'BSON' |
|
||||
'INTERVAL'
|
||||
|
||||
export interface RowGroup {
|
||||
columns: ColumnChunk[]
|
||||
@ -135,16 +133,15 @@ export enum Encoding {
|
||||
BYTE_STREAM_SPLIT = 9,
|
||||
}
|
||||
|
||||
export enum CompressionCodec {
|
||||
UNCOMPRESSED = 0,
|
||||
SNAPPY = 1,
|
||||
GZIP = 2,
|
||||
LZO = 3,
|
||||
BROTLI = 4,
|
||||
LZ4 = 5,
|
||||
ZSTD = 6,
|
||||
LZ4_RAW = 7,
|
||||
}
|
||||
export type CompressionCodec =
|
||||
'UNCOMPRESSED' |
|
||||
'SNAPPY' |
|
||||
'GZIP' |
|
||||
'LZO' |
|
||||
'BROTLI' |
|
||||
'LZ4' |
|
||||
'ZSTD' |
|
||||
'LZ4_RAW'
|
||||
|
||||
interface KeyValue {
|
||||
key: string
|
||||
|
||||
@ -77,8 +77,17 @@ const addrtypeMetadata = {
|
||||
created_by: 'DuckDB',
|
||||
metadata_length: 149,
|
||||
schema: [
|
||||
{ repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
|
||||
{ type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
|
||||
{
|
||||
repetition_type: 'REQUIRED',
|
||||
name: 'duckdb_schema',
|
||||
num_children: 1,
|
||||
},
|
||||
{
|
||||
type: 6,
|
||||
repetition_type: 'OPTIONAL',
|
||||
name: 'ADDRTYPE',
|
||||
converted_type: 'UTF8',
|
||||
},
|
||||
],
|
||||
num_rows: 10,
|
||||
row_groups: [
|
||||
@ -90,7 +99,7 @@ const addrtypeMetadata = {
|
||||
type: 6,
|
||||
encodings: [0, 8],
|
||||
path_in_schema: ['ADDRTYPE'],
|
||||
codec: 1,
|
||||
codec: 'SNAPPY',
|
||||
num_values: 10,
|
||||
total_uncompressed_size: 78,
|
||||
total_compressed_size: 82,
|
||||
@ -118,13 +127,13 @@ const rowgroupsMetadata = {
|
||||
metadata_length: 1602,
|
||||
schema: [
|
||||
{
|
||||
repetition_type: 0,
|
||||
repetition_type: 'REQUIRED',
|
||||
name: 'schema',
|
||||
num_children: 1,
|
||||
},
|
||||
{
|
||||
type: 2,
|
||||
repetition_type: 1,
|
||||
repetition_type: 'OPTIONAL',
|
||||
name: 'numbers',
|
||||
},
|
||||
],
|
||||
@ -136,7 +145,7 @@ const rowgroupsMetadata = {
|
||||
file_offset: 150,
|
||||
file_path: undefined,
|
||||
meta_data: {
|
||||
codec: 1,
|
||||
codec: 'SNAPPY',
|
||||
data_page_offset: 71,
|
||||
dictionary_page_offset: 4,
|
||||
encoding_stats: [
|
||||
@ -165,7 +174,7 @@ const rowgroupsMetadata = {
|
||||
{
|
||||
file_offset: 368,
|
||||
meta_data: {
|
||||
codec: 1,
|
||||
codec: 'SNAPPY',
|
||||
data_page_offset: 294,
|
||||
dictionary_page_offset: 248,
|
||||
encoding_stats: [
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { FieldRepetitionType } from '../src/constants.js'
|
||||
import {
|
||||
getMaxDefinitionLevel,
|
||||
getMaxRepetitionLevel,
|
||||
@ -9,10 +8,14 @@ import {
|
||||
} from '../src/schema.js'
|
||||
|
||||
describe('Parquet schema utils', () => {
|
||||
/**
|
||||
* @typedef {import('../src/types.js').SchemaElement} SchemaElement
|
||||
* @type {SchemaElement[]}
|
||||
*/
|
||||
const schema = [
|
||||
{ name: 'root', num_children: 2, repetition_type: FieldRepetitionType.REQUIRED },
|
||||
{ name: 'child1', repetition_type: FieldRepetitionType.OPTIONAL },
|
||||
{ name: 'child2', repetition_type: FieldRepetitionType.REPEATED },
|
||||
{ name: 'root', num_children: 2, repetition_type: 'REQUIRED' },
|
||||
{ name: 'child1', repetition_type: 'OPTIONAL' },
|
||||
{ name: 'child2', repetition_type: 'REPEATED' },
|
||||
]
|
||||
|
||||
describe('schemaElement', () => {
|
||||
|
||||
@ -25,9 +25,9 @@ const addrtypeSchema = {
|
||||
children: [],
|
||||
count: 1,
|
||||
element: {
|
||||
converted_type: 0,
|
||||
converted_type: 'UTF8',
|
||||
name: 'ADDRTYPE',
|
||||
repetition_type: 1,
|
||||
repetition_type: 'OPTIONAL',
|
||||
type: 6,
|
||||
},
|
||||
},
|
||||
@ -36,7 +36,7 @@ const addrtypeSchema = {
|
||||
element: {
|
||||
name: 'duckdb_schema',
|
||||
num_children: 1,
|
||||
repetition_type: 0,
|
||||
repetition_type: 'REQUIRED',
|
||||
},
|
||||
}
|
||||
|
||||
@ -48,7 +48,7 @@ const rowgroupsSchema = {
|
||||
count: 1,
|
||||
element: {
|
||||
name: 'numbers',
|
||||
repetition_type: 1,
|
||||
repetition_type: 'OPTIONAL',
|
||||
type: 2,
|
||||
},
|
||||
},
|
||||
@ -57,6 +57,6 @@ const rowgroupsSchema = {
|
||||
element: {
|
||||
name: 'schema',
|
||||
num_children: 1,
|
||||
repetition_type: 0,
|
||||
repetition_type: 'REQUIRED',
|
||||
},
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user