ParquetType as string

This commit is contained in:
Kenny Daniel 2024-02-27 11:06:31 -08:00
parent 11f35c9e43
commit 8b575ad2d8
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
13 changed files with 90 additions and 91 deletions

@ -1,13 +1,13 @@
/**
 * Parquet types as a name -> numeric id lookup (ids 0 through 7).
 * NOTE(review): this looks like the pre-change form that the array-based
 * ParquetType in this same hunk replaces — confirm against the commit.
 */
export const ParquetType = {
BOOLEAN: 0,
INT32: 1,
INT64: 2,
INT96: 3, // deprecated
FLOAT: 4,
DOUBLE: 5,
BYTE_ARRAY: 6,
FIXED_LEN_BYTE_ARRAY: 7,
}
/**
 * Parquet type names, ordered so that each name's array index is its numeric
 * type id (0 = BOOLEAN ... 7 = FIXED_LEN_BYTE_ARRAY).
 * Metadata parsing converts the numeric id to a name with ParquetType[id],
 * so entries must not be reordered or removed.
 */
export const ParquetType = [
'BOOLEAN',
'INT32',
'INT64',
'INT96', // deprecated
'FLOAT',
'DOUBLE',
'BYTE_ARRAY',
'FIXED_LEN_BYTE_ARRAY',
]
export const Encoding = [
'PLAIN',

@ -75,7 +75,7 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
// bit width is stored as single byte
let bitWidth
// TODO: RLE encoding uses bitWidth = schemaElement.type_length
if (columnMetadata.type === ParquetType.BOOLEAN) {
if (columnMetadata.type === 'BOOLEAN') {
bitWidth = 1
} else {
bitWidth = dataView.getUint8(offset)

@ -1,4 +1,3 @@
import { ParquetType } from './constants.js'
import { readVarInt } from './thrift.js'
/**
@ -150,8 +149,9 @@ function readPlainByteArrayFixed(dataView, offset, fixedLength) {
* Read `count` values of the given type from the dataView.
*
* @typedef {import("./types.d.ts").DecodedArray} DecodedArray
* @typedef {import("./types.d.ts").ParquetType} ParquetType
* @param {DataView} dataView - buffer to read data from
* @param {number} type - parquet type of the data
* @param {ParquetType} type - parquet type of the data
* @param {number} count - number of values to read
* @param {number} offset - offset to start reading from the DataView
* @param {boolean} utf8 - whether to decode byte arrays as UTF-8
@ -159,19 +159,19 @@ function readPlainByteArrayFixed(dataView, offset, fixedLength) {
*/
export function readPlain(dataView, type, count, offset, utf8) {
if (count === 0) return { value: [], byteLength: 0 }
if (type === ParquetType.BOOLEAN) {
if (type === 'BOOLEAN') {
return readPlainBoolean(dataView, offset, count)
} else if (type === ParquetType.INT32) {
} else if (type === 'INT32') {
return readPlainInt32(dataView, offset, count)
} else if (type === ParquetType.INT64) {
} else if (type === 'INT64') {
return readPlainInt64(dataView, offset, count)
} else if (type === ParquetType.INT96) {
} else if (type === 'INT96') {
return readPlainInt96(dataView, offset, count)
} else if (type === ParquetType.FLOAT) {
} else if (type === 'FLOAT') {
return readPlainFloat(dataView, offset, count)
} else if (type === ParquetType.DOUBLE) {
} else if (type === 'DOUBLE') {
return readPlainDouble(dataView, offset, count)
} else if (type === ParquetType.BYTE_ARRAY) {
} else if (type === 'BYTE_ARRAY') {
const byteArray = readPlainByteArray(dataView, offset, count)
if (utf8) {
const decoder = new TextDecoder()
@ -181,7 +181,7 @@ export function readPlain(dataView, type, count, offset, utf8) {
}
}
return byteArray
} else if (type === ParquetType.FIXED_LEN_BYTE_ARRAY) {
} else if (type === 'FIXED_LEN_BYTE_ARRAY') {
return readPlainByteArrayFixed(dataView, offset, count)
} else {
throw new Error(`parquet unhandled type: ${type}`)

@ -1,4 +1,4 @@
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType } from './constants.js'
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js'
import { schemaTree } from './schema.js'
import { deserializeTCompactProtocol } from './thrift.js'
@ -96,7 +96,7 @@ export function parquetMetadata(arrayBuffer) {
// Parse parquet metadata from thrift data
const version = metadata.field_1
const schema = metadata.field_2.map((/** @type {any} */ field) => ({
type: field.field_1,
type: ParquetType[field.field_1],
type_length: field.field_2,
repetition_type: FieldRepetitionType[field.field_3],
name: field.field_4,
@ -112,7 +112,7 @@ export function parquetMetadata(arrayBuffer) {
file_path: column.field_1,
file_offset: column.field_2,
meta_data: column.field_3 && {
type: column.field_3.field_1,
type: ParquetType[column.field_3.field_1],
encodings: column.field_3.field_2?.map((/** @type {number} */ e) => Encoding[e]),
path_in_schema: column.field_3.field_3,
codec: CompressionCodec[column.field_3.field_4],

19
src/types.d.ts vendored

@ -44,16 +44,15 @@ export interface SchemaElement {
field_id?: number
}
/**
 * Parquet types as a numeric enum (values 0 through 7).
 * NOTE(review): appears to be the pre-change declaration that the
 * string-literal union of the same name replaces — confirm against the commit.
 */
export enum ParquetType {
BOOLEAN = 0,
INT32 = 1,
INT64 = 2,
INT96 = 3, // deprecated
FLOAT = 4,
DOUBLE = 5,
BYTE_ARRAY = 6,
FIXED_LEN_BYTE_ARRAY = 7,
}
/**
 * Name of a parquet type, expressed as a string-literal union.
 * Used for the `type` of schema elements and column metadata, and as the
 * `type` argument of readPlain.
 */
export type ParquetType =
'BOOLEAN' |
'INT32' |
'INT64' |
'INT96' | // deprecated
'FLOAT' |
'DOUBLE' |
'BYTE_ARRAY' |
'FIXED_LEN_BYTE_ARRAY'
export type FieldRepetitionType =
'REQUIRED' |

@ -1,5 +1,4 @@
import { describe, expect, it } from 'vitest'
import { ParquetType } from '../src/constants.js'
import { readPlain, readRleBitPackedHybrid } from '../src/encoding.js'
describe('readPlain', () => {
@ -7,21 +6,21 @@ describe('readPlain', () => {
it('reads BOOLEAN values correctly', () => {
const dataView = new DataView(new ArrayBuffer(1))
dataView.setUint8(0, 0b00000001) // Set the first bit to 1
const result = readPlain(dataView, ParquetType.BOOLEAN, 1, 0, false)
const result = readPlain(dataView, 'BOOLEAN', 1, 0, false)
expect(result).toEqual({ value: [true], byteLength: 1 })
})
it('reads INT32 values correctly', () => {
const dataView = new DataView(new ArrayBuffer(4))
dataView.setInt32(0, 123456789, true) // little-endian
const result = readPlain(dataView, ParquetType.INT32, 1, 0, false)
const result = readPlain(dataView, 'INT32', 1, 0, false)
expect(result).toEqual({ value: [123456789], byteLength: 4 })
})
it('reads INT64 values correctly', () => {
const dataView = new DataView(new ArrayBuffer(8))
dataView.setBigInt64(0, BigInt('1234567890123456789'), true)
const result = readPlain(dataView, ParquetType.INT64, 1, 0, false)
const result = readPlain(dataView, 'INT64', 1, 0, false)
expect(result).toEqual({ value: [1234567890123456789n], byteLength: 8 })
})
@ -36,7 +35,7 @@ describe('readPlain', () => {
dataView.setInt32(8, high, true)
const expectedValue = (BigInt(high) << BigInt(32)) | low
const result = readPlain(dataView, ParquetType.INT96, 1, 0, false)
const result = readPlain(dataView, 'INT96', 1, 0, false)
expect(result).toEqual({
value: [expectedValue],
byteLength: 12,
@ -46,14 +45,14 @@ describe('readPlain', () => {
it('reads FLOAT values correctly', () => {
const dataView = new DataView(new ArrayBuffer(4))
dataView.setFloat32(0, 1234.5, true) // little-endian
const result = readPlain(dataView, ParquetType.FLOAT, 1, 0, false)
const result = readPlain(dataView, 'FLOAT', 1, 0, false)
expect(result).toEqual({ value: [1234.5], byteLength: 4 })
})
it('reads DOUBLE values correctly', () => {
const dataView = new DataView(new ArrayBuffer(8))
dataView.setFloat64(0, 12345.6789, true) // little-endian
const result = readPlain(dataView, ParquetType.DOUBLE, 1, 0, false)
const result = readPlain(dataView, 'DOUBLE', 1, 0, false)
expect(result).toEqual({ value: [12345.6789], byteLength: 8 })
})
@ -63,7 +62,7 @@ describe('readPlain', () => {
dataView.setUint8(4, 1) // first byte array data
dataView.setUint8(5, 2)
dataView.setUint8(6, 3)
const result = readPlain(dataView, ParquetType.BYTE_ARRAY, 1, 0, false)
const result = readPlain(dataView, 'BYTE_ARRAY', 1, 0, false)
expect(result).toEqual({
value: [new Uint8Array([1, 2, 3])],
byteLength: 7,
@ -76,7 +75,7 @@ describe('readPlain', () => {
dataView.setUint8(0, 4)
dataView.setUint8(1, 5)
dataView.setUint8(2, 6)
const result = readPlain(dataView, ParquetType.FIXED_LEN_BYTE_ARRAY, fixedLength, 0, false)
const result = readPlain(dataView, 'FIXED_LEN_BYTE_ARRAY', fixedLength, 0, false)
expect(result).toEqual({
value: new Uint8Array([4, 5, 6]),
byteLength: fixedLength,
@ -85,7 +84,8 @@ describe('readPlain', () => {
it('throws an error for unhandled types', () => {
const dataView = new DataView(new ArrayBuffer(0))
const invalidType = 999
// readPlain's `type` parameter is typed as ParquetType, so the deliberately
// invalid string is cast to any to get past type checking.
/** @type {any} */
const invalidType = 'invalidType'
expect(() => readPlain(dataView, invalidType, 1, 0, false))
.toThrow(`parquet unhandled type: ${invalidType}`)
})

@ -9,7 +9,7 @@
"num_children": 1
},
{
"type": 6,
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "ADDRTYPE",
"converted_type": "UTF8"
@ -22,7 +22,7 @@
{
"file_offset": 0,
"meta_data": {
"type": 6,
"type": "BYTE_ARRAY",
"encodings": ["PLAIN", "RLE_DICTIONARY"],
"path_in_schema": ["ADDRTYPE"],
"codec": "SNAPPY",

@ -16,7 +16,7 @@
"path_in_schema": [ "value" ],
"total_compressed_size": 168,
"total_uncompressed_size": 168,
"type": 6
"type": "BYTE_ARRAY"
}
}
],
@ -37,7 +37,7 @@
"precision": 4,
"repetition_type": "OPTIONAL",
"scale": 2,
"type": 6
"type": "BYTE_ARRAY"
}
]
}

@ -21,7 +21,7 @@
"statistics": {},
"total_compressed_size": 1467,
"total_uncompressed_size": 4155,
"type": 2
"type": "INT64"
}
}
],
@ -38,7 +38,7 @@
"converted_type": "UINT_64",
"name": "long_col",
"repetition_type": "OPTIONAL",
"type": 2
"type": "INT64"
}
]
}

@ -27,7 +27,7 @@
},
"total_compressed_size": 63,
"total_uncompressed_size": 59,
"type": 6
"type": "BYTE_ARRAY"
}
},
{
@ -45,7 +45,7 @@
},
"total_compressed_size": 49,
"total_uncompressed_size": 47,
"type": 1
"type": "INT32"
}
},
{
@ -63,7 +63,7 @@
},
"total_compressed_size": 88,
"total_uncompressed_size": 94,
"type": 5
"type": "DOUBLE"
}
},
{
@ -81,7 +81,7 @@
},
"total_compressed_size": 39,
"total_uncompressed_size": 37,
"type": 0
"type": "BOOLEAN"
}
},
{
@ -103,7 +103,7 @@
},
"total_compressed_size": 78,
"total_uncompressed_size": 74,
"type": 1
"type": "INT32"
}
}
],
@ -120,22 +120,22 @@
"converted_type": "UTF8",
"name": "a",
"repetition_type": "OPTIONAL",
"type": 6
"type": "BYTE_ARRAY"
},
{
"name": "b",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
},
{
"name": "c",
"repetition_type": "REQUIRED",
"type": 5
"type": "DOUBLE"
},
{
"name": "d",
"repetition_type": "REQUIRED",
"type": 0
"type": "BOOLEAN"
},
{
"converted_type": "LIST",
@ -151,7 +151,7 @@
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
}
]
}

@ -27,7 +27,7 @@
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 2
"type": "INT64"
}
},
{
@ -45,7 +45,7 @@
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 1
"type": "INT32"
}
},
{
@ -69,7 +69,7 @@
},
"total_compressed_size": 55,
"total_uncompressed_size": 55,
"type": 1
"type": "INT32"
}
},
{
@ -87,7 +87,7 @@
},
"total_compressed_size": 47,
"total_uncompressed_size": 47,
"type": 6
"type": "BYTE_ARRAY"
}
},
{
@ -105,7 +105,7 @@
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 1
"type": "INT32"
}
},
{
@ -129,7 +129,7 @@
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 6
"type": "BYTE_ARRAY"
}
},
{
@ -153,7 +153,7 @@
},
"total_compressed_size": 51,
"total_uncompressed_size": 51,
"type": 1
"type": "INT32"
}
},
{
@ -171,7 +171,7 @@
},
"total_compressed_size": 37,
"total_uncompressed_size": 37,
"type": 1
"type": "INT32"
}
},
{
@ -189,7 +189,7 @@
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 1
"type": "INT32"
}
},
{
@ -216,7 +216,7 @@
},
"total_compressed_size": 51,
"total_uncompressed_size": 51,
"type": 1
"type": "INT32"
}
},
{
@ -243,7 +243,7 @@
},
"total_compressed_size": 76,
"total_uncompressed_size": 76,
"type": 6
"type": "BYTE_ARRAY"
}
},
{
@ -259,7 +259,7 @@
},
"total_compressed_size": 33,
"total_uncompressed_size": 33,
"type": 6
"type": "BYTE_ARRAY"
}
},
{
@ -284,7 +284,7 @@
},
"total_compressed_size": 35,
"total_uncompressed_size": 35,
"type": 5
"type": "DOUBLE"
}
}
],
@ -300,7 +300,7 @@
{
"name": "ID",
"repetition_type": "REQUIRED",
"type": 2
"type": "INT64"
},
{
"converted_type": "LIST",
@ -316,7 +316,7 @@
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
},
{
"converted_type": "LIST",
@ -343,7 +343,7 @@
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
},
{
"converted_type": "MAP",
@ -361,12 +361,12 @@
"converted_type": "UTF8",
"name": "key",
"repetition_type": "REQUIRED",
"type": 6
"type": "BYTE_ARRAY"
},
{
"name": "value",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
},
{
"converted_type": "LIST",
@ -395,12 +395,12 @@
"converted_type": "UTF8",
"name": "key",
"repetition_type": "REQUIRED",
"type": 6
"type": "BYTE_ARRAY"
},
{
"name": "value",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
},
{
"name": "nested_Struct",
@ -410,7 +410,7 @@
{
"name": "a",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
},
{
"converted_type": "LIST",
@ -426,7 +426,7 @@
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
},
{
"name": "c",
@ -463,13 +463,13 @@
{
"name": "e",
"repetition_type": "REQUIRED",
"type": 1
"type": "INT32"
},
{
"converted_type": "UTF8",
"name": "f",
"repetition_type": "REQUIRED",
"type": 6
"type": "BYTE_ARRAY"
},
{
"converted_type": "MAP",
@ -487,7 +487,7 @@
"converted_type": "UTF8",
"name": "key",
"repetition_type": "REQUIRED",
"type": 6
"type": "BYTE_ARRAY"
},
{
"name": "value",
@ -513,7 +513,7 @@
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 5
"type": "DOUBLE"
}
]
}

@ -9,7 +9,7 @@
"num_children": 1
},
{
"type": 2,
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "numbers"
}
@ -38,7 +38,7 @@
},
"total_compressed_size": 146,
"total_uncompressed_size": 172,
"type": 2
"type": "INT64"
}
}
],
@ -67,7 +67,7 @@
},
"total_compressed_size": 120,
"total_uncompressed_size": 126,
"type": 2
"type": "INT64"
}
}
],

@ -28,7 +28,7 @@ const addrtypeSchema = {
converted_type: 'UTF8',
name: 'ADDRTYPE',
repetition_type: 'OPTIONAL',
type: 6,
type: 'BYTE_ARRAY',
},
},
],
@ -49,7 +49,7 @@ const rowgroupsSchema = {
element: {
name: 'numbers',
repetition_type: 'OPTIONAL',
type: 2,
type: 'INT64',
},
},
],