mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-11 05:16:36 +00:00
Parquet header parser
This commit is contained in:
parent
04fa052565
commit
92902c41d3
@ -2,6 +2,9 @@ import { ParquetEncoding, ParquetType } from './constants.js'
|
||||
import { readVarInt } from './thrift.js'
|
||||
|
||||
/**
|
||||
* Return type with bytes read.
|
||||
* This is useful to advance an offset through a buffer.
|
||||
*
|
||||
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
|
||||
* @template T
|
||||
*/
|
||||
|
||||
75
src/header.js
Normal file
75
src/header.js
Normal file
@ -0,0 +1,75 @@
|
||||
import { deserializeTCompactProtocol } from './thrift.js'
|
||||
|
||||
/**
|
||||
* Return type with bytes read.
|
||||
* This is useful to advance an offset through a buffer.
|
||||
*
|
||||
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
|
||||
* @template T
|
||||
*/
|
||||
|
||||
/**
 * Read a parquet page header from a buffer.
 *
 * The bytes starting at `offset` are handed to the thrift compact protocol
 * deserializer, and the resulting generic struct (keyed by `field_N`) is
 * mapped onto named page header properties.
 *
 * @typedef {import("./types.d.ts").PageHeader} PageHeader
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @param {number} offset offset to start reading from
 * @returns {Decoded<PageHeader>} metadata object and bytes read
 */
export function parquetHeader(arrayBuffer, offset) {
  // ArrayBuffer.slice copies everything from offset to the end of the buffer,
  // so the deserializer sees the header starting at byte 0.
  const headerBuffer = arrayBuffer.slice(offset)
  const { value: header, byteLength } = deserializeTCompactProtocol(headerBuffer)

  // Optional sub-headers stay falsy (normally undefined) when the thrift
  // struct does not contain the corresponding field.
  const data_page_header = header.field_5 && {
    num_values: header.field_5.field_1,
    encoding: header.field_5.field_2,
    definition_level_encoding: header.field_5.field_3,
    repetition_level_encoding: header.field_5.field_4,
    statistics: decodeStatistics(header.field_5.field_5),
  }
  const dictionary_page_header = header.field_7 && {
    num_values: header.field_7.field_1,
    encoding: header.field_7.field_2,
    is_sorted: header.field_7.field_3,
  }
  const data_page_header_v2 = header.field_8 && {
    num_values: header.field_8.field_1,
    num_nulls: header.field_8.field_2,
    num_rows: header.field_8.field_3,
    encoding: header.field_8.field_4,
    definition_levels_byte_length: header.field_8.field_5,
    repetition_levels_byte_length: header.field_8.field_6,
    is_compressed: header.field_8.field_7,
    // Decode like data_page_header.statistics instead of leaking raw field_N keys
    statistics: decodeStatistics(header.field_8.field_8),
  }

  return {
    byteLength,
    value: {
      type: header.field_1,
      uncompressed_page_size: header.field_2,
      compressed_page_size: header.field_3,
      crc: header.field_4,
      data_page_header,
      // The index page header is passed through as decoded; it has no named fields here
      index_page_header: header.field_6,
      dictionary_page_header,
      data_page_header_v2,
    },
  }
}

/**
 * Map a thrift-decoded statistics struct onto named properties.
 *
 * @param {any} stats thrift struct keyed by field_N, or undefined when absent
 * @returns {any} statistics with named fields, or the falsy input unchanged
 */
function decodeStatistics(stats) {
  return stats && {
    max: stats.field_1,
    min: stats.field_2,
    null_count: stats.field_3,
    distinct_count: stats.field_4,
    max_value: stats.field_5,
    min_value: stats.field_6,
  }
}
|
||||
39
src/types.d.ts
vendored
39
src/types.d.ts
vendored
@ -142,3 +142,42 @@ interface SortingColumn {
|
||||
descending: boolean
|
||||
nulls_first: boolean
|
||||
}
|
||||
|
||||
// Parquet file header types
|
||||
/**
 * Page header as produced by parquetHeader in src/header.js.
 * Each property is read from the thrift struct field noted beside it.
 */
export interface PageHeader {
|
||||
/** Thrift field 1. */
type: PageType
|
||||
/** Thrift field 2. */
uncompressed_page_size: number
|
||||
/** Thrift field 3. */
compressed_page_size: number
|
||||
/** Thrift field 4; optional. */
crc?: number
|
||||
/** Thrift field 5; present only when the struct carries it. */
data_page_header?: DataPageHeader
|
||||
/** Thrift field 6; present only when the struct carries it. */
index_page_header?: IndexPageHeader
|
||||
/** Thrift field 7; present only when the struct carries it. */
dictionary_page_header?: DictionaryPageHeader
|
||||
/** Thrift field 8; present only when the struct carries it. */
data_page_header_v2?: DataPageHeaderV2
|
||||
}
|
||||
|
||||
/**
 * Data page sub-header, built by parquetHeader from thrift field 5 of the
 * page header struct.
 */
export interface DataPageHeader {
|
||||
/** Nested thrift field 1. */
num_values: number
|
||||
/** Nested thrift field 2. */
encoding: Encoding
|
||||
/** Nested thrift field 3. */
definition_level_encoding: Encoding
|
||||
/** Nested thrift field 4. */
repetition_level_encoding: Encoding
|
||||
/** Nested thrift field 5; optional. */
statistics?: Statistics
|
||||
}
|
||||
|
||||
/**
 * Index page sub-header. No properties are declared; parquetHeader passes
 * thrift field 6 of the page header struct through as decoded.
 */
interface IndexPageHeader {}
|
||||
|
||||
/**
 * Dictionary page sub-header, built by parquetHeader from thrift field 7 of
 * the page header struct.
 */
export interface DictionaryPageHeader {
|
||||
/** Nested thrift field 1. */
num_values: number
|
||||
/** Nested thrift field 2. */
encoding: Encoding
|
||||
/** Nested thrift field 3; optional. */
is_sorted?: boolean
|
||||
}
|
||||
|
||||
/**
 * Version 2 data page sub-header, built by parquetHeader from thrift field 8
 * of the page header struct.
 */
interface DataPageHeaderV2 {
|
||||
/** Nested thrift field 1. */
num_values: number
|
||||
/** Nested thrift field 2. */
num_nulls: number
|
||||
/** Nested thrift field 3. */
num_rows: number
|
||||
/** Nested thrift field 4. */
encoding: Encoding
|
||||
/** Nested thrift field 5. */
definition_levels_byte_length: number
|
||||
/** Nested thrift field 6. */
repetition_levels_byte_length: number
|
||||
/** Nested thrift field 7; optional. */
is_compressed?: boolean
|
||||
/** Nested thrift field 8; optional. */
statistics?: Statistics
|
||||
}
|
||||
|
||||
@ -89,3 +89,5 @@ describe('readPlain', () => {
|
||||
expect(() => readPlain(dataView, invalidType, 1, 0)).toThrow(`Unhandled type: ${invalidType}`)
|
||||
})
|
||||
})
|
||||
|
||||
// TODO: Add tests for readData
|
||||
|
||||
Loading…
Reference in New Issue
Block a user