diff --git a/src/constants.js b/src/constants.js index fc23fa8..5a3f55c 100644 --- a/src/constants.js +++ b/src/constants.js @@ -1,13 +1,13 @@ -export const ParquetType = { - BOOLEAN: 0, - INT32: 1, - INT64: 2, - INT96: 3, // deprecated - FLOAT: 4, - DOUBLE: 5, - BYTE_ARRAY: 6, - FIXED_LEN_BYTE_ARRAY: 7, -} +export const ParquetType = [ + 'BOOLEAN', + 'INT32', + 'INT64', + 'INT96', // deprecated + 'FLOAT', + 'DOUBLE', + 'BYTE_ARRAY', + 'FIXED_LEN_BYTE_ARRAY', +] export const Encoding = [ 'PLAIN', diff --git a/src/datapage.js b/src/datapage.js index 44e1281..9254983 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -75,7 +75,7 @@ export function readDataPage(bytes, daph, schema, columnMetadata) { // bit width is stored as single byte let bitWidth // TODO: RLE encoding uses bitWidth = schemaElement.type_length - if (columnMetadata.type === ParquetType.BOOLEAN) { + if (columnMetadata.type === 'BOOLEAN') { bitWidth = 1 } else { bitWidth = dataView.getUint8(offset) diff --git a/src/encoding.js b/src/encoding.js index 8638535..78bc900 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -1,4 +1,3 @@ -import { ParquetType } from './constants.js' import { readVarInt } from './thrift.js' /** @@ -150,8 +149,9 @@ function readPlainByteArrayFixed(dataView, offset, fixedLength) { * Read `count` values of the given type from the dataView. * * @typedef {import("./types.d.ts").DecodedArray} DecodedArray + * @typedef {import("./types.d.ts").ParquetType} ParquetType * @param {DataView} dataView - buffer to read data from - * @param {number} type - parquet type of the data + * @param {ParquetType} type - parquet type of the data * @param {number} count - number of values to read * @param {number} offset - offset to start reading from the DataView * @param {boolean} utf8 - whether to decode byte arrays as UTF-8 @@ -159,19 +159,19 @@ function readPlainByteArrayFixed(dataView, offset, fixedLength) { */ export function readPlain(dataView, type, count, offset, utf8) { if (count === 0) return { value: [], byteLength: 0 } - if (type === ParquetType.BOOLEAN) { + if (type === 'BOOLEAN') { return readPlainBoolean(dataView, offset, count) - } else if (type === ParquetType.INT32) { + } else if (type === 'INT32') { return readPlainInt32(dataView, offset, count) - } else if (type === ParquetType.INT64) { + } else if (type === 'INT64') { return readPlainInt64(dataView, offset, count) - } else if (type === ParquetType.INT96) { + } else if (type === 'INT96') { return readPlainInt96(dataView, offset, count) - } else if (type === ParquetType.FLOAT) { + } else if (type === 'FLOAT') { return readPlainFloat(dataView, offset, count) - } else if (type === ParquetType.DOUBLE) { + } else if (type === 'DOUBLE') { return readPlainDouble(dataView, offset, count) - } else if (type === ParquetType.BYTE_ARRAY) { + } else if (type === 'BYTE_ARRAY') { const byteArray = readPlainByteArray(dataView, offset, count) if (utf8) { const decoder = new TextDecoder() @@ -181,7 +181,7 @@ export function readPlain(dataView, type, count, offset, utf8) { } } return byteArray - } else if (type === ParquetType.FIXED_LEN_BYTE_ARRAY) { + } else if (type === 'FIXED_LEN_BYTE_ARRAY') { return readPlainByteArrayFixed(dataView, offset, count) } else { throw new Error(`parquet unhandled type: ${type}`) diff --git a/src/metadata.js b/src/metadata.js index 22e982d..fe51cdd 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,4 +1,4 @@ -import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType } from './constants.js' +import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js' import { schemaTree } from './schema.js' import { deserializeTCompactProtocol } from './thrift.js' @@ -96,7 +96,7 @@ export function parquetMetadata(arrayBuffer) { // Parse parquet metadata from thrift data const version = metadata.field_1 const schema = metadata.field_2.map((/** @type {any} */ field) => ({ - type: field.field_1, + type: ParquetType[field.field_1], type_length: field.field_2, repetition_type: FieldRepetitionType[field.field_3], name: field.field_4, @@ -112,7 +112,7 @@ export function parquetMetadata(arrayBuffer) { file_path: column.field_1, file_offset: column.field_2, meta_data: column.field_3 && { - type: column.field_3.field_1, + type: ParquetType[column.field_3.field_1], encodings: column.field_3.field_2?.map((/** @type {number} */ e) => Encoding[e]), path_in_schema: column.field_3.field_3, codec: CompressionCodec[column.field_3.field_4], diff --git a/src/types.d.ts b/src/types.d.ts index 7c6655e..7fbedbb 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -44,16 +44,15 @@ export interface SchemaElement { field_id?: number } -export enum ParquetType { - BOOLEAN = 0, - INT32 = 1, - INT64 = 2, - INT96 = 3, // deprecated - FLOAT = 4, - DOUBLE = 5, - BYTE_ARRAY = 6, - FIXED_LEN_BYTE_ARRAY = 7, -} +export type ParquetType = + 'BOOLEAN' | + 'INT32' | + 'INT64' | + 'INT96' | // deprecated + 'FLOAT' | + 'DOUBLE' | + 'BYTE_ARRAY' | + 'FIXED_LEN_BYTE_ARRAY' export type FieldRepetitionType = 'REQUIRED' | diff --git a/test/encoding.test.js b/test/encoding.test.js index 4eba549..5b52c37 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -1,5 +1,4 @@ import { describe, expect, it } from 'vitest' -import { ParquetType } from '../src/constants.js' import { readPlain, readRleBitPackedHybrid } from '../src/encoding.js' describe('readPlain', () => { @@ -7,21 +6,21 @@ describe('readPlain', () => { it('reads BOOLEAN values correctly', () => { const dataView = new DataView(new ArrayBuffer(1)) dataView.setUint8(0, 0b00000001) // Set the first bit to 1 - const result = readPlain(dataView, ParquetType.BOOLEAN, 1, 0, false) + const result = readPlain(dataView, 'BOOLEAN', 1, 0, false) expect(result).toEqual({ value: [true], byteLength: 1 }) }) it('reads INT32 values correctly', () => { const dataView = new DataView(new ArrayBuffer(4)) dataView.setInt32(0, 123456789, true) // little-endian - const result = readPlain(dataView, ParquetType.INT32, 1, 0, false) + const result = readPlain(dataView, 'INT32', 1, 0, false) expect(result).toEqual({ value: [123456789], byteLength: 4 }) }) it('reads INT64 values correctly', () => { const dataView = new DataView(new ArrayBuffer(8)) dataView.setBigInt64(0, BigInt('1234567890123456789'), true) - const result = readPlain(dataView, ParquetType.INT64, 1, 0, false) + const result = readPlain(dataView, 'INT64', 1, 0, false) expect(result).toEqual({ value: [1234567890123456789n], byteLength: 8 }) }) @@ -36,7 +35,7 @@ describe('readPlain', () => { dataView.setInt32(8, high, true) const expectedValue = (BigInt(high) << BigInt(32)) | low - const result = readPlain(dataView, ParquetType.INT96, 1, 0, false) + const result = readPlain(dataView, 'INT96', 1, 0, false) expect(result).toEqual({ value: [expectedValue], byteLength: 12, @@ -46,14 +45,14 @@ describe('readPlain', () => { it('reads FLOAT values correctly', () => { const dataView = new DataView(new ArrayBuffer(4)) dataView.setFloat32(0, 1234.5, true) // little-endian - const result = readPlain(dataView, ParquetType.FLOAT, 1, 0, false) + const result = readPlain(dataView, 'FLOAT', 1, 0, false) expect(result).toEqual({ value: [1234.5], byteLength: 4 }) }) it('reads DOUBLE values correctly', () => { const dataView = new DataView(new ArrayBuffer(8)) dataView.setFloat64(0, 12345.6789, true) // little-endian - const result = readPlain(dataView, ParquetType.DOUBLE, 1, 0, false) + const result = readPlain(dataView, 'DOUBLE', 1, 0, false) expect(result).toEqual({ value: [12345.6789], byteLength: 8 }) }) @@ -63,7 +62,7 @@ describe('readPlain', () => { dataView.setUint8(4, 1) // first byte array data dataView.setUint8(5, 2) dataView.setUint8(6, 3) - const result = readPlain(dataView, ParquetType.BYTE_ARRAY, 1, 0, false) + const result = readPlain(dataView, 'BYTE_ARRAY', 1, 0, false) expect(result).toEqual({ value: [new Uint8Array([1, 2, 3])], byteLength: 7, @@ -76,7 +75,7 @@ describe('readPlain', () => { dataView.setUint8(0, 4) dataView.setUint8(1, 5) dataView.setUint8(2, 6) - const result = readPlain(dataView, ParquetType.FIXED_LEN_BYTE_ARRAY, fixedLength, 0, false) + const result = readPlain(dataView, 'FIXED_LEN_BYTE_ARRAY', fixedLength, 0, false) expect(result).toEqual({ value: new Uint8Array([4, 5, 6]), byteLength: fixedLength, @@ -85,7 +84,8 @@ describe('readPlain', () => { it('throws an error for unhandled types', () => { const dataView = new DataView(new ArrayBuffer(0)) - const invalidType = 999 + /** @type any */ + const invalidType = 'invalidType' expect(() => readPlain(dataView, invalidType, 1, 0, false)) .toThrow(`parquet unhandled type: ${invalidType}`) }) diff --git a/test/files/addrtype-missing-value.metadata.json b/test/files/addrtype-missing-value.metadata.json index fa0b14b..15cd13c 100644 --- a/test/files/addrtype-missing-value.metadata.json +++ b/test/files/addrtype-missing-value.metadata.json @@ -9,7 +9,7 @@ "num_children": 1 }, { - "type": 6, + "type": "BYTE_ARRAY", "repetition_type": "OPTIONAL", "name": "ADDRTYPE", "converted_type": "UTF8" @@ -22,7 +22,7 @@ { "file_offset": 0, "meta_data": { - "type": 6, + "type": "BYTE_ARRAY", "encodings": ["PLAIN", "RLE_DICTIONARY"], "path_in_schema": ["ADDRTYPE"], "codec": "SNAPPY", diff --git a/test/files/byte_array_decimal.metadata.json b/test/files/byte_array_decimal.metadata.json index 4abe4d0..a0b62db 100644 --- a/test/files/byte_array_decimal.metadata.json +++ b/test/files/byte_array_decimal.metadata.json @@ -16,7 +16,7 @@ "path_in_schema": [ "value" ], "total_compressed_size": 168, "total_uncompressed_size": 168, - "type": 6 + "type": "BYTE_ARRAY" } } ], @@ -37,7 +37,7 @@ "precision": 4, "repetition_type": "OPTIONAL", "scale": 2, - "type": 6 + "type": "BYTE_ARRAY" } ] } diff --git a/test/files/concatenated_gzip_members.metadata.json b/test/files/concatenated_gzip_members.metadata.json index 0e21c6c..29f4ef7 100644 --- a/test/files/concatenated_gzip_members.metadata.json +++ b/test/files/concatenated_gzip_members.metadata.json @@ -21,7 +21,7 @@ "statistics": {}, "total_compressed_size": 1467, "total_uncompressed_size": 4155, - "type": 2 + "type": "INT64" } } ], @@ -38,7 +38,7 @@ "converted_type": "UINT_64", "name": "long_col", "repetition_type": "OPTIONAL", - "type": 2 + "type": "INT64" } ] } diff --git a/test/files/datapage_v2.snappy.metadata.json b/test/files/datapage_v2.snappy.metadata.json index d38e215..a5b583f 100644 --- a/test/files/datapage_v2.snappy.metadata.json +++ b/test/files/datapage_v2.snappy.metadata.json @@ -27,7 +27,7 @@ }, "total_compressed_size": 63, "total_uncompressed_size": 59, - "type": 6 + "type": "BYTE_ARRAY" } }, { @@ -45,7 +45,7 @@ }, "total_compressed_size": 49, "total_uncompressed_size": 47, - "type": 1 + "type": "INT32" } }, { @@ -63,7 +63,7 @@ }, "total_compressed_size": 88, "total_uncompressed_size": 94, - "type": 5 + "type": "DOUBLE" } }, { @@ -81,7 +81,7 @@ }, "total_compressed_size": 39, "total_uncompressed_size": 37, - "type": 0 + "type": "BOOLEAN" } }, { @@ -103,7 +103,7 @@ }, "total_compressed_size": 78, "total_uncompressed_size": 74, - "type": 1 + "type": "INT32" } } ], @@ -120,22 +120,22 @@ "converted_type": "UTF8", "name": "a", "repetition_type": "OPTIONAL", - "type": 6 + "type": "BYTE_ARRAY" }, { "name": "b", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" }, { "name": "c", "repetition_type": "REQUIRED", - "type": 5 + "type": "DOUBLE" }, { "name": "d", "repetition_type": "REQUIRED", - "type": 0 + "type": "BOOLEAN" }, { "converted_type": "LIST", @@ -151,7 +151,7 @@ { "name": "element", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" } ] } diff --git a/test/files/nonnullable.impala.metadata.json b/test/files/nonnullable.impala.metadata.json index 654b49e..05e1517 100644 --- a/test/files/nonnullable.impala.metadata.json +++ b/test/files/nonnullable.impala.metadata.json @@ -27,7 +27,7 @@ }, "total_compressed_size": 49, "total_uncompressed_size": 49, - "type": 2 + "type": "INT64" } }, { @@ -45,7 +45,7 @@ }, "total_compressed_size": 49, "total_uncompressed_size": 49, - "type": 1 + "type": "INT32" } }, { @@ -69,7 +69,7 @@ }, "total_compressed_size": 55, "total_uncompressed_size": 55, - "type": 1 + "type": "INT32" } }, { @@ -87,7 +87,7 @@ }, "total_compressed_size": 47, "total_uncompressed_size": 47, - "type": 6 + "type": "BYTE_ARRAY" } }, { @@ -105,7 +105,7 @@ }, "total_compressed_size": 49, "total_uncompressed_size": 49, - "type": 1 + "type": "INT32" } }, { @@ -129,7 +129,7 @@ }, "total_compressed_size": 49, "total_uncompressed_size": 49, - "type": 6 + "type": "BYTE_ARRAY" } }, { @@ -153,7 +153,7 @@ }, "total_compressed_size": 51, "total_uncompressed_size": 51, - "type": 1 + "type": "INT32" } }, { @@ -171,7 +171,7 @@ }, "total_compressed_size": 37, "total_uncompressed_size": 37, - "type": 1 + "type": "INT32" } }, { @@ -189,7 +189,7 @@ }, "total_compressed_size": 49, "total_uncompressed_size": 49, - "type": 1 + "type": "INT32" } }, { @@ -216,7 +216,7 @@ }, "total_compressed_size": 51, "total_uncompressed_size": 51, - "type": 1 + "type": "INT32" } }, { @@ -243,7 +243,7 @@ }, "total_compressed_size": 76, "total_uncompressed_size": 76, - "type": 6 + "type": "BYTE_ARRAY" } }, { @@ -259,7 +259,7 @@ }, "total_compressed_size": 33, "total_uncompressed_size": 33, - "type": 6 + "type": "BYTE_ARRAY" } }, { @@ -284,7 +284,7 @@ }, "total_compressed_size": 35, "total_uncompressed_size": 35, - "type": 5 + "type": "DOUBLE" } } ], @@ -300,7 +300,7 @@ { "name": "ID", "repetition_type": "REQUIRED", - "type": 2 + "type": "INT64" }, { "converted_type": "LIST", @@ -316,7 +316,7 @@ { "name": "element", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" }, { "converted_type": "LIST", @@ -343,7 +343,7 @@ { "name": "element", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" }, { "converted_type": "MAP", @@ -361,12 +361,12 @@ "converted_type": "UTF8", "name": "key", "repetition_type": "REQUIRED", - "type": 6 + "type": "BYTE_ARRAY" }, { "name": "value", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" }, { "converted_type": "LIST", @@ -395,12 +395,12 @@ "converted_type": "UTF8", "name": "key", "repetition_type": "REQUIRED", - "type": 6 + "type": "BYTE_ARRAY" }, { "name": "value", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" }, { "name": "nested_Struct", @@ -410,7 +410,7 @@ { "name": "a", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" }, { "converted_type": "LIST", @@ -426,7 +426,7 @@ { "name": "element", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" }, { "name": "c", @@ -463,13 +463,13 @@ { "name": "e", "repetition_type": "REQUIRED", - "type": 1 + "type": "INT32" }, { "converted_type": "UTF8", "name": "f", "repetition_type": "REQUIRED", - "type": 6 + "type": "BYTE_ARRAY" }, { "converted_type": "MAP", @@ -487,7 +487,7 @@ "converted_type": "UTF8", "name": "key", "repetition_type": "REQUIRED", - "type": 6 + "type": "BYTE_ARRAY" }, { "name": "value", @@ -513,7 +513,7 @@ { "name": "element", "repetition_type": "REQUIRED", - "type": 5 + "type": "DOUBLE" } ] } diff --git a/test/files/rowgroups.metadata.json b/test/files/rowgroups.metadata.json index 1c931a1..12dea07 100644 --- a/test/files/rowgroups.metadata.json +++ b/test/files/rowgroups.metadata.json @@ -9,7 +9,7 @@ "num_children": 1 }, { - "type": 2, + "type": "INT64", "repetition_type": "OPTIONAL", "name": "numbers" } @@ -38,7 +38,7 @@ }, "total_compressed_size": 146, "total_uncompressed_size": 172, - "type": 2 + "type": "INT64" } } ], @@ -67,7 +67,7 @@ }, "total_compressed_size": 120, "total_uncompressed_size": 126, - "type": 2 + "type": "INT64" } } ], diff --git a/test/schemaTree.test.js b/test/schemaTree.test.js index 704dd53..7c73ba8 100644 --- a/test/schemaTree.test.js +++ b/test/schemaTree.test.js @@ -28,7 +28,7 @@ const addrtypeSchema = { converted_type: 'UTF8', name: 'ADDRTYPE', repetition_type: 'OPTIONAL', - type: 6, + type: 'BYTE_ARRAY', }, }, ], @@ -49,7 +49,7 @@ const rowgroupsSchema = { element: { name: 'numbers', repetition_type: 'OPTIONAL', - type: 2, + type: 'INT64', }, }, ],