diff --git a/src/encoding.js b/src/encoding.js
index 51aa2ad..8e7ca49 100644
--- a/src/encoding.js
+++ b/src/encoding.js
@@ -2,6 +2,9 @@ import { ParquetEncoding, ParquetType } from './constants.js'
 import { readVarInt } from './thrift.js'
 
 /**
+ * Return type with bytes read.
+ * This is useful to advance an offset through a buffer.
+ *
  * @typedef {import("./types.d.ts").Decoded} Decoded
  * @template T
  */
diff --git a/src/header.js b/src/header.js
new file mode 100644
index 0000000..d6c6e6c
--- /dev/null
+++ b/src/header.js
@@ -0,0 +1,80 @@
+import { deserializeTCompactProtocol } from './thrift.js'
+
+/**
+ * Return type with bytes read.
+ * This is useful to advance an offset through a buffer.
+ *
+ * @typedef {import("./types.d.ts").Decoded} Decoded
+ * @template T
+ */
+
+/**
+ * Map a thrift-decoded statistics struct (keyed field_1..field_6) to named fields.
+ * Shared by both data page header versions so they expose the same shape.
+ *
+ * @param {any} stats thrift struct, or a falsy value when the field is absent
+ * @returns {any} statistics with named fields, or the falsy input unchanged
+ */
+function parseStatistics(stats) {
+  return stats && {
+    max: stats.field_1,
+    min: stats.field_2,
+    null_count: stats.field_3,
+    distinct_count: stats.field_4,
+    max_value: stats.field_5,
+    min_value: stats.field_6,
+  }
+}
+
+/**
+ * Read a parquet page header from a buffer.
+ *
+ * @typedef {import("./types.d.ts").PageHeader} PageHeader
+ * @param {ArrayBuffer} arrayBuffer parquet file contents
+ * @param {number} offset offset to start reading from
+ * @returns {Decoded<PageHeader>} page header and bytes read
+ */
+export function parquetHeader(arrayBuffer, offset) {
+  // Decode the thrift struct that begins at offset
+  const headerBuffer = arrayBuffer.slice(offset)
+  const { value: header, byteLength } = deserializeTCompactProtocol(headerBuffer)
+
+  // Optional sub-headers stay falsy when the thrift field is missing
+  const data_page_header = header.field_5 && {
+    num_values: header.field_5.field_1,
+    encoding: header.field_5.field_2,
+    definition_level_encoding: header.field_5.field_3,
+    repetition_level_encoding: header.field_5.field_4,
+    statistics: parseStatistics(header.field_5.field_5),
+  }
+  const dictionary_page_header = header.field_7 && {
+    num_values: header.field_7.field_1,
+    encoding: header.field_7.field_2,
+    is_sorted: header.field_7.field_3,
+  }
+  const data_page_header_v2 = header.field_8 && {
+    num_values: header.field_8.field_1,
+    num_nulls: header.field_8.field_2,
+    num_rows: header.field_8.field_3,
+    encoding: header.field_8.field_4,
+    definition_levels_byte_length: header.field_8.field_5,
+    repetition_levels_byte_length: header.field_8.field_6,
+    is_compressed: header.field_8.field_7,
+    // Named fields, matching data_page_header.statistics
+    statistics: parseStatistics(header.field_8.field_8),
+  }
+
+  return {
+    byteLength,
+    value: {
+      type: header.field_1,
+      uncompressed_page_size: header.field_2,
+      compressed_page_size: header.field_3,
+      crc: header.field_4,
+      data_page_header,
+      index_page_header: header.field_6,
+      dictionary_page_header,
+      data_page_header_v2,
+    },
+  }
+}
diff --git a/src/types.d.ts b/src/types.d.ts
index fa6057e..daa27b8 100644
--- a/src/types.d.ts
+++ b/src/types.d.ts
@@ -142,3 +142,42 @@ interface SortingColumn {
   descending: boolean
   nulls_first: boolean
 }
+
+// Parquet page header types
+export interface PageHeader {
+  type: PageType
+  uncompressed_page_size: number
+  compressed_page_size: number
+  crc?: number
+  data_page_header?: DataPageHeader
+  index_page_header?: IndexPageHeader
+  dictionary_page_header?: DictionaryPageHeader
+  data_page_header_v2?: DataPageHeaderV2
+}
+
+export interface DataPageHeader {
+  num_values: number
+  encoding: Encoding
+  definition_level_encoding: Encoding
+  repetition_level_encoding: Encoding
+  statistics?: Statistics
+}
+
+interface IndexPageHeader {}
+
+export interface DictionaryPageHeader {
+  num_values: number
+  encoding: Encoding
+  is_sorted?: boolean
+}
+
+interface DataPageHeaderV2 {
+  num_values: number
+  num_nulls: number
+  num_rows: number
+  encoding: Encoding
+  definition_levels_byte_length: number
+  repetition_levels_byte_length: number
+  is_compressed?: boolean
+  statistics?: Statistics
+}
diff --git a/test/encoding.test.js b/test/encoding.test.js
index b6e5fb1..8a15644 100644
--- a/test/encoding.test.js
+++ b/test/encoding.test.js
@@ -89,3 +89,5 @@ describe('readPlain', () => {
     expect(() => readPlain(dataView, invalidType, 1, 0)).toThrow(`Unhandled type: ${invalidType}`)
   })
 })
+
+// TODO: Add tests for readData