PageType enum to string

This commit is contained in:
Kenny Daniel 2024-04-18 00:02:29 -07:00
parent f826bff757
commit 86273b110c
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
7 changed files with 26 additions and 40 deletions

@@ -1,5 +1,4 @@
import { assembleObjects } from './assemble.js'
import { PageType } from './constants.js'
import { convert } from './convert.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
@@ -49,7 +48,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
)
// parse page data by type
if (header.type === PageType.DATA_PAGE) {
if (header.type === 'DATA_PAGE') {
const daph = header.data_page_header
if (!daph) throw new Error('parquet data page header is undefined')
@@ -95,7 +94,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
// you need the total number of children, not the number of top-level values.
concat(rowData, values)
} else if (header.type === PageType.DICTIONARY_PAGE) {
} else if (header.type === 'DICTIONARY_PAGE') {
const diph = header.dictionary_page_header
if (!diph) throw new Error('parquet dictionary page header is undefined')
@@ -103,7 +102,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec, compressors
)
dictionary = readDictionaryPage(page, diph, schema, columnMetadata)
} else if (header.type === PageType.DATA_PAGE_V2) {
} else if (header.type === 'DATA_PAGE_V2') {
const daph2 = header.data_page_header_v2
if (!daph2) throw new Error('parquet data page header v2 is undefined')
@@ -200,6 +199,7 @@ export function decompressPage(compressedBytes, uncompressed_page_size, codec, c
/**
* Expand data page list with nulls and convert to utf8.
*
* @param {number[]} definitionLevels
* @param {number} maxDefinitionLevel
* @param {ArrayLike<any>} dataPage

@@ -94,9 +94,13 @@ export const CompressionCodec = [
'LZ4_RAW',
]
export const PageType = {
DATA_PAGE: 0,
INDEX_PAGE: 1,
DICTIONARY_PAGE: 2,
DATA_PAGE_V2: 3,
}
/**
* @typedef {import('./types.js').PageType} PageType
* @type {PageType[]}
*/
export const PageType = [
'DATA_PAGE',
'INDEX_PAGE',
'DICTIONARY_PAGE',
'DATA_PAGE_V2',
]

@@ -1,26 +1,16 @@
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import {
getMaxDefinitionLevel,
getMaxRepetitionLevel,
isRequired,
schemaElement,
skipDefinitionBytes,
} from './schema.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, schemaElement, skipDefinitionBytes } from './schema.js'
const skipNulls = false // TODO
/**
* Read a data page from the given Uint8Array.
*
* @typedef {{ definitionLevels: number[], numNulls: number }} DefinitionLevels
* @typedef {import("./types.d.ts").DataPage} DataPage
* @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData
* @typedef {import("./types.d.ts").DataPageHeader} DataPageHeader
* @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader
* @typedef {import("./types.d.ts").SchemaElement} SchemaElement
*/
/**
* Read a data page from the given Uint8Array.
*
* @param {Uint8Array} bytes raw page data (should already be decompressed)
* @param {DataPageHeader} daph data page header
* @param {SchemaElement[]} schema schema for the file
@@ -92,6 +82,7 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
/**
* Read a page containing dictionary data.
*
* @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader
* @param {Uint8Array} bytes raw page data
* @param {DictionaryPageHeader} diph dictionary page header
* @param {SchemaElement[]} schema schema for the file

@@ -331,9 +331,6 @@ function readBitPacked(reader, header, bitWidth, remaining) {
reader.offset++
left += 8
} else {
// otherwise, read bitWidth number of bits
// don't write more than remaining number of rows
// even if there are still bits to read
if (remaining > 0) {
// emit value by shifting off to the right and masking
value.push((data >> right) & mask)
@@ -344,7 +341,6 @@ function readBitPacked(reader, header, bitWidth, remaining) {
}
}
// return values and number of bytes read
return value
}

@@ -1,10 +1,7 @@
import { Encoding } from './constants.js'
import { Encoding, PageType } from './constants.js'
import { deserializeTCompactProtocol } from './thrift.js'
/**
* Return type with bytes read.
* This is useful to advance an offset through a buffer.
*
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
* @template T
*/
@@ -21,7 +18,7 @@ export function parquetHeader(arrayBuffer, offset) {
const { value: header, byteLength } = deserializeTCompactProtocol(arrayBuffer, offset)
// Parse parquet header from thrift data
const type = header.field_1
const type = PageType[header.field_1]
const uncompressed_page_size = header.field_2
const compressed_page_size = header.field_3
const crc = header.field_4
@@ -52,7 +49,7 @@ export function parquetHeader(arrayBuffer, offset) {
encoding: Encoding[header.field_8.field_4],
definition_levels_byte_length: header.field_8.field_5,
repetition_levels_byte_length: header.field_8.field_6,
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default to true
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default true
statistics: header.field_8.field_8,
}

@@ -67,7 +67,6 @@ export async function parquetRead(options) {
/**
* Read a row group from a file-like object.
* Reads the minimal number of columns to satisfy the request.
*
* @typedef {import('./types.js').RowGroup} RowGroup
* @param {object} options read options

11
src/types.d.ts vendored

@@ -195,12 +195,11 @@ interface PageEncodingStats {
count: number
}
export enum PageType {
DATA_PAGE = 0,
INDEX_PAGE = 1,
DICTIONARY_PAGE = 2,
DATA_PAGE_V2 = 3,
}
export type PageType =
'DATA_PAGE' |
'INDEX_PAGE' |
'DICTIONARY_PAGE' |
'DATA_PAGE_V2'
interface SortingColumn {
column_idx: number