// hyparquet/src/header.js

import { Encoding, PageType } from './constants.js'
import { deserializeTCompactProtocol } from './thrift.js'

/**
 * Convert a thrift-decoded statistics struct into an object with named fields.
 *
 * Shared by the v1 and v2 data page headers so both expose the same shape.
 *
 * @param {any} stats - raw struct keyed by field_1..field_6, or a falsy value when absent
 * @returns {any} statistics with named fields, or the falsy input unchanged
 */
function pageStatistics(stats) {
  return stats && {
    max: stats.field_1,
    min: stats.field_2,
    null_count: stats.field_3,
    distinct_count: stats.field_4,
    max_value: stats.field_5,
    min_value: stats.field_6,
  }
}

/**
 * Read a parquet page header from a reader.
 *
 * The header is decoded with the thrift compact protocol and its numbered
 * fields are mapped onto named properties. Sub-headers that are not present
 * in the decoded data are returned as-is (undefined).
 *
 * @typedef {import("./types.d.ts").DataReader} DataReader
 * @typedef {import("./types.d.ts").PageHeader} PageHeader
 * @param {DataReader} reader - parquet file reader
 * @returns {PageHeader} page header with named fields
 */
export function parquetHeader(reader) {
  const header = deserializeTCompactProtocol(reader)

  // Numeric enum codes are translated to names through the lookup tables
  const type = PageType[header.field_1]
  const uncompressed_page_size = header.field_2
  const compressed_page_size = header.field_3
  const crc = header.field_4

  const v1 = header.field_5
  const data_page_header = v1 && {
    num_values: v1.field_1,
    encoding: Encoding[v1.field_2],
    definition_level_encoding: Encoding[v1.field_3],
    repetition_level_encoding: Encoding[v1.field_4],
    statistics: pageStatistics(v1.field_5),
  }

  // The index page header is passed through without any field mapping
  const index_page_header = header.field_6

  const dict = header.field_7
  const dictionary_page_header = dict && {
    num_values: dict.field_1,
    encoding: Encoding[dict.field_2],
    is_sorted: dict.field_3,
  }

  const v2 = header.field_8
  const data_page_header_v2 = v2 && {
    num_values: v2.field_1,
    num_nulls: v2.field_2,
    num_rows: v2.field_3,
    encoding: Encoding[v2.field_4],
    definition_levels_byte_length: v2.field_5,
    repetition_levels_byte_length: v2.field_6,
    // A missing flag means the page is compressed (default true)
    is_compressed: v2.field_7 === undefined ? true : v2.field_7,
    // Decoded the same way as the v1 statistics instead of leaking raw field_N keys
    statistics: pageStatistics(v2.field_8),
  }

  return {
    type,
    uncompressed_page_size,
    compressed_page_size,
    crc,
    data_page_header,
    index_page_header,
    dictionary_page_header,
    data_page_header_v2,
  }
}