Human readable metadata types

This commit is contained in:
Kenny Daniel 2024-02-11 14:33:56 -08:00
parent 17f7ace840
commit 8f7cd07734
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
9 changed files with 133 additions and 138 deletions

@ -1,4 +1,4 @@
import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
import { Encoding, PageType } from './constants.js'
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
import { parquetHeader } from './header.js'
import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
@ -49,14 +49,13 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
let page
const uncompressed_page_size = Number(header.uncompressed_page_size)
const { codec } = columnMetadata
if (codec === CompressionCodec.UNCOMPRESSED) {
if (codec === 'UNCOMPRESSED') {
page = compressedBytes
} else if (codec === CompressionCodec.SNAPPY) {
} else if (codec === 'SNAPPY') {
page = new Uint8Array(uncompressed_page_size)
snappyUncompress(compressedBytes, page)
} else {
const compressor = Object.entries(CompressionCodec).find(([, value]) => value === codec)
throw new Error(`parquet unsupported compression codec: ${codec} ${compressor?.[0]}`)
throw new Error(`parquet unsupported compression codec: ${codec}`)
}
if (page?.length !== uncompressed_page_size) {
throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
@ -178,11 +177,11 @@ export function getColumnOffset(columnMetadata) {
function convert(data, schemaElement) {
const ctype = schemaElement.converted_type
if (ctype === undefined) return data
if (ctype === ConvertedType.UTF8) {
if (ctype === 'UTF8') {
const decoder = new TextDecoder()
return data.map(v => decoder.decode(v))
}
if (ctype === ConvertedType.DECIMAL) {
if (ctype === 'DECIMAL') {
const scaleFactor = Math.pow(10, schemaElement.scale || 0)
if (typeof data[0] === 'number') {
return data.map(v => v * scaleFactor)
@ -191,19 +190,19 @@ function convert(data, schemaElement) {
throw new Error('parquet decimal byte string not supported')
}
}
if (ctype === ConvertedType.DATE) {
if (ctype === 'DATE') {
return data.map(v => new Date(v * dayMillis))
}
if (ctype === ConvertedType.TIME_MILLIS) {
if (ctype === 'TIME_MILLIS') {
return data.map(v => new Date(v))
}
if (ctype === ConvertedType.JSON) {
if (ctype === 'JSON') {
return data.map(v => JSON.parse(v))
}
if (ctype === ConvertedType.BSON) {
if (ctype === 'BSON') {
throw new Error('parquet bson not supported')
}
if (ctype === ConvertedType.INTERVAL) {
if (ctype === 'INTERVAL') {
throw new Error('parquet interval not supported')
}
return data

@ -9,67 +9,6 @@ export const ParquetType = {
FIXED_LEN_BYTE_ARRAY: 7,
}
export const ParquetEncoding = {
PLAIN: 0,
PLAIN_DICTIONARY: 2,
RLE: 3,
BIT_PACKED: 4, // deprecated
DELTA_BINARY_PACKED: 5,
DELTA_LENGTH_BYTE_ARRAY: 6,
DELTA_BYTE_ARRAY: 7,
RLE_DICTIONARY: 8,
BYTE_STREAM_SPLIT: 9,
}
export const FieldRepetitionType = {
REQUIRED: 0,
OPTIONAL: 1,
REPEATED: 2,
}
export const ConvertedType = {
UTF8: 0,
MAP: 1,
MAP_KEY_VALUE: 2,
LIST: 3,
ENUM: 4,
DECIMAL: 5,
DATE: 6,
TIME_MILLIS: 7,
TIME_MICROS: 8,
TIMESTAMP_MILLIS: 9,
TIMESTAMP_MICROS: 10,
UINT_8: 11,
UINT_16: 12,
UINT_32: 13,
UINT_64: 14,
INT_8: 15,
INT_16: 16,
INT_32: 17,
INT_64: 18,
JSON: 19,
BSON: 20,
INTERVAL: 21,
}
export const CompressionCodec = {
UNCOMPRESSED: 0,
SNAPPY: 1,
GZIP: 2,
LZO: 3,
BROTLI: 4,
LZ4: 5,
ZSTD: 6,
LZ4_RAW: 7,
}
export const PageType = {
DATA_PAGE: 0,
INDEX_PAGE: 1,
DICTIONARY_PAGE: 2,
DATA_PAGE_V2: 3,
}
export const Encoding = {
PLAIN: 0,
PLAIN_DICTIONARY: 2,
@ -81,3 +20,52 @@ export const Encoding = {
RLE_DICTIONARY: 8,
BYTE_STREAM_SPLIT: 9,
}
/**
 * Human readable names for the field repetition type enum.
 *
 * The array index is the numeric enum value stored in the file metadata
 * (0 = REQUIRED, 1 = OPTIONAL, 2 = REPEATED); parquetMetadata resolves the
 * raw number to its name with `FieldRepetitionType[value]`.
 * Frozen because this table is shared by every importer and its order
 * must never change at runtime.
 */
export const FieldRepetitionType = Object.freeze([
  'REQUIRED',
  'OPTIONAL',
  'REPEATED',
])
/**
 * Human readable names for the converted type enum.
 *
 * The array index is the numeric enum value stored in the file metadata
 * (0 = UTF8 ... 21 = INTERVAL); parquetMetadata resolves the raw number
 * to its name with `ConvertedType[value]`, so the order of the entries
 * is significant.
 * Frozen because this table is shared by every importer and its order
 * must never change at runtime.
 */
export const ConvertedType = Object.freeze([
  'UTF8',
  'MAP',
  'MAP_KEY_VALUE',
  'LIST',
  'ENUM',
  'DECIMAL',
  'DATE',
  'TIME_MILLIS',
  'TIME_MICROS',
  'TIMESTAMP_MILLIS',
  'TIMESTAMP_MICROS',
  'UINT_8',
  'UINT_16',
  'UINT_32',
  'UINT_64',
  'INT_8',
  'INT_16',
  'INT_32',
  'INT_64',
  'JSON',
  'BSON',
  'INTERVAL',
])
/**
 * Human readable names for the compression codec enum.
 *
 * The array index is the numeric enum value stored in the column metadata
 * (0 = UNCOMPRESSED ... 7 = LZ4_RAW); parquetMetadata resolves the raw
 * number to its name with `CompressionCodec[value]`, so the order of the
 * entries is significant.
 * Frozen because this table is shared by every importer and its order
 * must never change at runtime.
 */
export const CompressionCodec = Object.freeze([
  'UNCOMPRESSED',
  'SNAPPY',
  'GZIP',
  'LZO',
  'BROTLI',
  'LZ4',
  'ZSTD',
  'LZ4_RAW',
])
/**
 * Page type names mapped to their numeric codes.
 *
 * Unlike the name tables in this module (which map number -> name), this
 * stays a name -> number map.
 * Frozen so that the shared constant cannot be modified by an importer.
 */
export const PageType = Object.freeze({
  DATA_PAGE: 0,
  INDEX_PAGE: 1,
  DICTIONARY_PAGE: 2,
  DATA_PAGE_V2: 3,
})

@ -1,4 +1,4 @@
import { ParquetEncoding, ParquetType } from './constants.js'
import { Encoding, ParquetType } from './constants.js'
import { readVarInt } from './thrift.js'
/**
@ -203,7 +203,7 @@ export function widthFromMaxInt(value) {
export function readData(dataView, encoding, offset, count, bitWidth) {
const value = []
let byteLength = 0
if (encoding === ParquetEncoding.RLE) {
if (encoding === Encoding.RLE) {
let seen = 0
while (seen < count) {
const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)

@ -1,3 +1,4 @@
import { CompressionCodec, ConvertedType, FieldRepetitionType } from './constants.js'
import { schemaTree } from './schema.js'
import { deserializeTCompactProtocol } from './thrift.js'
@ -97,10 +98,10 @@ export function parquetMetadata(arrayBuffer) {
const schema = metadata.field_2.map((/** @type {any} */ field) => ({
type: field.field_1,
type_length: field.field_2,
repetition_type: field.field_3,
repetition_type: FieldRepetitionType[field.field_3],
name: field.field_4,
num_children: field.field_5,
converted_type: field.field_6,
converted_type: ConvertedType[field.field_6],
scale: field.field_7,
precision: field.field_8,
field_id: field.field_9,
@ -114,7 +115,7 @@ export function parquetMetadata(arrayBuffer) {
type: column.field_3.field_1,
encodings: column.field_3.field_2,
path_in_schema: column.field_3.field_3,
codec: column.field_3.field_4,
codec: CompressionCodec[column.field_3.field_4],
num_values: column.field_3.field_5,
total_uncompressed_size: column.field_3.field_6,
total_compressed_size: column.field_3.field_7,

@ -1,5 +1,3 @@
import { FieldRepetitionType } from './constants.js'
/**
* @typedef {import('./types.js').SchemaElement} SchemaElement
* @typedef {import('./types.js').SchemaTree} SchemaTree
@ -57,7 +55,7 @@ export function schemaElement(schema, name) {
* @returns {boolean} true if the element is required
*/
export function isRequired(schema, name) {
return schemaElement(schema, name).repetition_type === FieldRepetitionType.REQUIRED
return schemaElement(schema, name).repetition_type === 'REQUIRED'
}
/**
@ -71,7 +69,7 @@ export function getMaxRepetitionLevel(schema, parts) {
let maxLevel = 0
parts.forEach((part, i) => {
const element = schemaElement(schema, parts.slice(0, i + 1))
if (element.repetition_type === FieldRepetitionType.REPEATED) {
if (element.repetition_type === 'REPEATED') {
maxLevel += 1
}
})
@ -89,7 +87,7 @@ export function getMaxDefinitionLevel(schema, parts) {
let maxLevel = 0
parts.forEach((part, i) => {
const element = schemaElement(schema, parts.slice(0, i + 1))
if (element.repetition_type !== FieldRepetitionType.REQUIRED) {
if (element.repetition_type !== 'REQUIRED') {
maxLevel += 1
}
})

75
src/types.d.ts vendored

@ -63,36 +63,34 @@ export enum ParquetType {
FIXED_LEN_BYTE_ARRAY = 7,
}
export enum FieldRepetitionType {
REQUIRED = 0,
OPTIONAL = 1,
REPEATED = 2,
}
/**
 * Name of a schema element's repetition type.
 *
 * parquetMetadata produces this string by looking up the numeric value
 * from the file metadata in the FieldRepetitionType table of constants.js.
 */
export type FieldRepetitionType =
'REQUIRED' |
'OPTIONAL' |
'REPEATED'
export enum ConvertedType {
UTF8 = 0,
MAP = 1,
MAP_KEY_VALUE = 2,
LIST = 3,
ENUM = 4,
DECIMAL = 5,
DATE = 6,
TIME_MILLIS = 7,
TIME_MICROS = 8,
TIMESTAMP_MILLIS = 9,
TIMESTAMP_MICROS = 10,
UINT_8 = 11,
UINT_16 = 12,
UINT_32 = 13,
UINT_64 = 14,
INT_8 = 15,
INT_16 = 16,
INT_32 = 17,
INT_64 = 18,
JSON = 19,
BSON = 20,
INTERVAL = 21,
}
/**
 * Name of a schema element's converted type.
 *
 * parquetMetadata produces this string by looking up the numeric value
 * from the file metadata in the ConvertedType table of constants.js.
 */
export type ConvertedType =
'UTF8' |
'MAP' |
'MAP_KEY_VALUE' |
'LIST' |
'ENUM' |
'DECIMAL' |
'DATE' |
'TIME_MILLIS' |
'TIME_MICROS' |
'TIMESTAMP_MILLIS' |
'TIMESTAMP_MICROS' |
'UINT_8' |
'UINT_16' |
'UINT_32' |
'UINT_64' |
'INT_8' |
'INT_16' |
'INT_32' |
'INT_64' |
'JSON' |
'BSON' |
'INTERVAL'
export interface RowGroup {
columns: ColumnChunk[]
@ -135,16 +133,15 @@ export enum Encoding {
BYTE_STREAM_SPLIT = 9,
}
export enum CompressionCodec {
UNCOMPRESSED = 0,
SNAPPY = 1,
GZIP = 2,
LZO = 3,
BROTLI = 4,
LZ4 = 5,
ZSTD = 6,
LZ4_RAW = 7,
}
/**
 * Name of the compression codec used by a column chunk.
 *
 * parquetMetadata produces this string by looking up the numeric value
 * from the column metadata in the CompressionCodec table of constants.js.
 * readColumn decodes only 'UNCOMPRESSED' and 'SNAPPY' and throws
 * "parquet unsupported compression codec" for any other value.
 */
export type CompressionCodec =
'UNCOMPRESSED' |
'SNAPPY' |
'GZIP' |
'LZO' |
'BROTLI' |
'LZ4' |
'ZSTD' |
'LZ4_RAW'
interface KeyValue {
key: string

@ -77,8 +77,17 @@ const addrtypeMetadata = {
created_by: 'DuckDB',
metadata_length: 149,
schema: [
{ repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
{ type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
{
repetition_type: 'REQUIRED',
name: 'duckdb_schema',
num_children: 1,
},
{
type: 6,
repetition_type: 'OPTIONAL',
name: 'ADDRTYPE',
converted_type: 'UTF8',
},
],
num_rows: 10,
row_groups: [
@ -90,7 +99,7 @@ const addrtypeMetadata = {
type: 6,
encodings: [0, 8],
path_in_schema: ['ADDRTYPE'],
codec: 1,
codec: 'SNAPPY',
num_values: 10,
total_uncompressed_size: 78,
total_compressed_size: 82,
@ -118,13 +127,13 @@ const rowgroupsMetadata = {
metadata_length: 1602,
schema: [
{
repetition_type: 0,
repetition_type: 'REQUIRED',
name: 'schema',
num_children: 1,
},
{
type: 2,
repetition_type: 1,
repetition_type: 'OPTIONAL',
name: 'numbers',
},
],
@ -136,7 +145,7 @@ const rowgroupsMetadata = {
file_offset: 150,
file_path: undefined,
meta_data: {
codec: 1,
codec: 'SNAPPY',
data_page_offset: 71,
dictionary_page_offset: 4,
encoding_stats: [
@ -165,7 +174,7 @@ const rowgroupsMetadata = {
{
file_offset: 368,
meta_data: {
codec: 1,
codec: 'SNAPPY',
data_page_offset: 294,
dictionary_page_offset: 248,
encoding_stats: [

@ -1,5 +1,4 @@
import { describe, expect, it } from 'vitest'
import { FieldRepetitionType } from '../src/constants.js'
import {
getMaxDefinitionLevel,
getMaxRepetitionLevel,
@ -9,10 +8,14 @@ import {
} from '../src/schema.js'
describe('Parquet schema utils', () => {
/**
* @typedef {import('../src/types.js').SchemaElement} SchemaElement
* @type {SchemaElement[]}
*/
const schema = [
{ name: 'root', num_children: 2, repetition_type: FieldRepetitionType.REQUIRED },
{ name: 'child1', repetition_type: FieldRepetitionType.OPTIONAL },
{ name: 'child2', repetition_type: FieldRepetitionType.REPEATED },
{ name: 'root', num_children: 2, repetition_type: 'REQUIRED' },
{ name: 'child1', repetition_type: 'OPTIONAL' },
{ name: 'child2', repetition_type: 'REPEATED' },
]
describe('schemaElement', () => {

@ -25,9 +25,9 @@ const addrtypeSchema = {
children: [],
count: 1,
element: {
converted_type: 0,
converted_type: 'UTF8',
name: 'ADDRTYPE',
repetition_type: 1,
repetition_type: 'OPTIONAL',
type: 6,
},
},
@ -36,7 +36,7 @@ const addrtypeSchema = {
element: {
name: 'duckdb_schema',
num_children: 1,
repetition_type: 0,
repetition_type: 'REQUIRED',
},
}
@ -48,7 +48,7 @@ const rowgroupsSchema = {
count: 1,
element: {
name: 'numbers',
repetition_type: 1,
repetition_type: 'OPTIONAL',
type: 2,
},
},
@ -57,6 +57,6 @@ const rowgroupsSchema = {
element: {
name: 'schema',
num_children: 1,
repetition_type: 0,
repetition_type: 'REQUIRED',
},
}