From 86273b110c84b01e3b1eb29acc936a802108457d Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Thu, 18 Apr 2024 00:02:29 -0700 Subject: [PATCH] PageType enum to string --- src/column.js | 8 ++++---- src/constants.js | 16 ++++++++++------ src/datapage.js | 17 ++++------------- src/encoding.js | 4 ---- src/header.js | 9 +++------ src/read.js | 1 - src/types.d.ts | 11 +++++------ 7 files changed, 26 insertions(+), 40 deletions(-) diff --git a/src/column.js b/src/column.js index 3f9d27f..a3bca29 100644 --- a/src/column.js +++ b/src/column.js @@ -1,5 +1,4 @@ import { assembleObjects } from './assemble.js' -import { PageType } from './constants.js' import { convert } from './convert.js' import { readDataPage, readDictionaryPage } from './datapage.js' import { readDataPageV2 } from './datapageV2.js' @@ -49,7 +48,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, ) // parse page data by type - if (header.type === PageType.DATA_PAGE) { + if (header.type === 'DATA_PAGE') { const daph = header.data_page_header if (!daph) throw new Error('parquet data page header is undefined') @@ -95,7 +94,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, // you need the total number of children, not the number of top-level values. concat(rowData, values) - } else if (header.type === PageType.DICTIONARY_PAGE) { + } else if (header.type === 'DICTIONARY_PAGE') { const diph = header.dictionary_page_header if (!diph) throw new Error('parquet dictionary page header is undefined') @@ -103,7 +102,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec, compressors ) dictionary = readDictionaryPage(page, diph, schema, columnMetadata) - } else if (header.type === PageType.DATA_PAGE_V2) { + } else if (header.type === 'DATA_PAGE_V2') { const daph2 = header.data_page_header_v2 if (!daph2) throw new Error('parquet data page header v2 is undefined') @@ -200,6 +199,7 @@ export function decompressPage(compressedBytes, uncompressed_page_size, codec, c /** * Expand data page list with nulls and convert to utf8. + * * @param {number[]} definitionLevels * @param {number} maxDefinitionLevel * @param {ArrayLike} dataPage diff --git a/src/constants.js b/src/constants.js index e577e0a..ca2e3a2 100644 --- a/src/constants.js +++ b/src/constants.js @@ -94,9 +94,13 @@ export const CompressionCodec = [ 'LZ4_RAW', ] -export const PageType = { - DATA_PAGE: 0, - INDEX_PAGE: 1, - DICTIONARY_PAGE: 2, - DATA_PAGE_V2: 3, -} +/** + * @typedef {import('./types.js').PageType} PageType + * @type {PageType[]} + */ +export const PageType = [ + 'DATA_PAGE', + 'INDEX_PAGE', + 'DICTIONARY_PAGE', + 'DATA_PAGE_V2', +] diff --git a/src/datapage.js b/src/datapage.js index 6922f45..ecb5b4d 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -1,26 +1,16 @@ import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' -import { - getMaxDefinitionLevel, - getMaxRepetitionLevel, - isRequired, - schemaElement, - skipDefinitionBytes, -} from './schema.js' +import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, schemaElement, skipDefinitionBytes } from './schema.js' const skipNulls = false // TODO /** + * Read a data page from the given Uint8Array. + * * @typedef {{ definitionLevels: number[], numNulls: number }} DefinitionLevels * @typedef {import("./types.d.ts").DataPage} DataPage * @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData * @typedef {import("./types.d.ts").DataPageHeader} DataPageHeader - * @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader * @typedef {import("./types.d.ts").SchemaElement} SchemaElement - */ - -/** - * Read a data page from the given Uint8Array. - * * @param {Uint8Array} bytes raw page data (should already be decompressed) * @param {DataPageHeader} daph data page header * @param {SchemaElement[]} schema schema for the file @@ -92,6 +82,7 @@ export function readDataPage(bytes, daph, schema, columnMetadata) { /** * Read a page containing dictionary data. * + * @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader * @param {Uint8Array} bytes raw page data * @param {DictionaryPageHeader} diph dictionary page header * @param {SchemaElement[]} schema schema for the file diff --git a/src/encoding.js b/src/encoding.js index 909f2b6..35bbd97 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -331,9 +331,6 @@ function readBitPacked(reader, header, bitWidth, remaining) { reader.offset++ left += 8 } else { - // otherwise, read bitWidth number of bits - // don't write more than remaining number of rows - // even if there are still bits to read if (remaining > 0) { // emit value by shifting off to the right and masking value.push((data >> right) & mask) @@ -344,7 +341,6 @@ function readBitPacked(reader, header, bitWidth, remaining) { } } - // return values and number of bytes read return value } diff --git a/src/header.js b/src/header.js index 81255ef..f96f070 100644 --- a/src/header.js +++ b/src/header.js @@ -1,10 +1,7 @@ -import { Encoding } from './constants.js' +import { Encoding, PageType } from './constants.js' import { deserializeTCompactProtocol } from './thrift.js' /** - * Return type with bytes read. - * This is useful to advance an offset through a buffer. - * * @typedef {import("./types.d.ts").Decoded} Decoded * @template T */ @@ -21,7 +18,7 @@ export function parquetHeader(arrayBuffer, offset) { const { value: header, byteLength } = deserializeTCompactProtocol(arrayBuffer, offset) // Parse parquet header from thrift data - const type = header.field_1 + const type = PageType[header.field_1] const uncompressed_page_size = header.field_2 const compressed_page_size = header.field_3 const crc = header.field_4 @@ -52,7 +49,7 @@ export function parquetHeader(arrayBuffer, offset) { encoding: Encoding[header.field_8.field_4], definition_levels_byte_length: header.field_8.field_5, repetition_levels_byte_length: header.field_8.field_6, - is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default to true + is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default true statistics: header.field_8.field_8, } diff --git a/src/read.js b/src/read.js index 33c03d6..e64b4e0 100644 --- a/src/read.js +++ b/src/read.js @@ -67,7 +67,6 @@ export async function parquetRead(options) { /** * Read a row group from a file-like object. - * Reads the minimal number of columns to satisfy the request. * * @typedef {import('./types.js').RowGroup} RowGroup * @param {object} options read options diff --git a/src/types.d.ts b/src/types.d.ts index d1ae293..3f90c40 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -195,12 +195,11 @@ interface PageEncodingStats { count: number } -export enum PageType { - DATA_PAGE = 0, - INDEX_PAGE = 1, - DICTIONARY_PAGE = 2, - DATA_PAGE_V2 = 3, -} +export type PageType = + 'DATA_PAGE' | + 'INDEX_PAGE' | + 'DICTIONARY_PAGE' | + 'DATA_PAGE_V2' interface SortingColumn { column_idx: number