Parse column and offset indexes (#29)

* Parse indices

* Add parsed offset indices

* Add parsed column indices

* Test readColumnIndex and readOffsetIndex

* Add more parsed offset indices

* Remove unnecessary toJson when loading expected results

* Add length checks to convertMetadata

* Rename indicies.js to indexes.js

* Rename indices.test.js to indexes.test.js

* Rename *_indices.json to *_indexes.json

* Use asyncBufferFromFile in indexes.test.js

---------

Co-authored-by: Brian Park <park-brian@users.noreply.github.com>
This commit is contained in:
Kenny 2024-08-18 18:23:54 -07:00 committed by GitHub
parent 7eb1e05515
commit a2024a781c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
23 changed files with 3270 additions and 50 deletions

@ -129,7 +129,7 @@ function renderSidebar(asyncBuffer, metadata, name) {
const sidebar = /** @type {HTMLElement} */ (document.getElementById('sidebar'))
sidebar.innerHTML = `<div id="filename">${name}</div>`
sidebar.appendChild(fileMetadata(toJson(metadata)))
sidebar.appendChild(fileLayout(metadata, asyncBuffer.byteLength))
sidebar.appendChild(fileLayout(metadata, asyncBuffer))
}
welcome.addEventListener('click', () => {

@ -24,12 +24,14 @@ export function fileMetadata(metadata) {
* Render parquet file layout.
*
* @param {FileMetaData} metadata
* @param {number} byteLength
* @param {import('../src/types.js').AsyncBuffer} asyncBuffer
* @returns {HTMLDivElement}
*/
export function fileLayout(metadata, byteLength) {
export function fileLayout(metadata, asyncBuffer) {
let html = '<h2>File layout</h2>'
html += cell('PAR1', 0n, 4n) // magic number
// data pages by row group and column
/** @type {[string, bigint, bigint][]} */
const indexPages = []
for (const rowGroupIndex in metadata.row_groups) {
@ -67,13 +69,17 @@ export function fileLayout(metadata, byteLength) {
}
html += '</div>'
}
for (const [name, start, length] of indexPages) {
// column and offset indexes
for (const [name, start, length] of indexPages.sort((a, b) => Number(a[1]) - Number(b[1]))) {
html += cell(name, start, start + length)
}
const metadataStart = BigInt(byteLength - metadata.metadata_length - 4)
const metadataEnd = BigInt(byteLength - 4)
// metadata footer
const metadataStart = BigInt(asyncBuffer.byteLength - metadata.metadata_length - 4)
const metadataEnd = BigInt(asyncBuffer.byteLength - 4)
html += cell('Metadata', metadataStart, metadataEnd)
html += cell('PAR1', metadataEnd, BigInt(byteLength)) // magic number
html += cell('PAR1', metadataEnd, BigInt(asyncBuffer.byteLength)) // magic number
const div = document.createElement('div')
div.innerHTML = html
div.classList.add('layout', 'collapsed') // start collapsed

@ -1,6 +1,4 @@
/**
* @type {import('./types.js').ParquetType[]}
*/
/** @type {import('./types.js').ParquetType[]} */
export const ParquetType = [
'BOOLEAN',
'INT32',
@ -31,9 +29,7 @@ export const FieldRepetitionType = [
'REPEATED',
]
/**
* @type {import('./types.js').ConvertedType[]}
*/
/** @type {import('./types.js').ConvertedType[]} */
export const ConvertedType = [
'UTF8',
'MAP',
@ -59,9 +55,7 @@ export const ConvertedType = [
'INTERVAL',
]
/**
* @type {import('./types.js').LogicalTypeType[]}
*/
/** @type {import('./types.js').LogicalTypeType[]} */
export const logicalTypeType = [
'NULL',
'STRING',
@ -91,12 +85,17 @@ export const CompressionCodec = [
'LZ4_RAW',
]
/**
* @type {import('./types.js').PageType[]}
*/
/** @type {import('./types.js').PageType[]} */
export const PageType = [
'DATA_PAGE',
'INDEX_PAGE',
'DICTIONARY_PAGE',
'DATA_PAGE_V2',
]
/** @type {import('./types.js').BoundaryOrder[]} */
export const BoundaryOrder = [
'UNORDERED',
'ASCENDING',
'DESCENDING',
]

46
src/indexes.js Normal file

@ -0,0 +1,46 @@
import { BoundaryOrder } from './constants.js'
import { convertMetadata } from './metadata.js'
import { deserializeTCompactProtocol } from './thrift.js'
/**
 * Deserialize a thrift compact-protocol struct from the reader and map its
 * numbered fields onto a ColumnIndex object.
 *
 * The min and max entries are decoded with convertMetadata using the given
 * schema element; the boundary order integer is translated to its name via
 * the BoundaryOrder table. All other fields are passed through unchanged.
 *
 * @typedef {import('./types.d.ts').DataReader} DataReader
 * @param {DataReader} reader
 * @param {import('./types.d.ts').SchemaElement} schema
 * @returns {import('./types.d.ts').ColumnIndex}
 */
export function readColumnIndex(reader, schema) {
  const {
    field_1: nullPages,
    field_2: rawMins,
    field_3: rawMaxes,
    field_4: orderCode,
    field_5: nullCounts,
    field_6: repetitionHistograms,
    field_7: definitionHistograms,
  } = deserializeTCompactProtocol(reader)

  // one decoder shared by the min and max lists
  const decode = (/** @type {any} */ raw) => convertMetadata(raw, schema)

  return {
    null_pages: nullPages,
    min_values: rawMins.map(decode),
    max_values: rawMaxes.map(decode),
    boundary_order: BoundaryOrder[orderCode],
    null_counts: nullCounts,
    repetition_level_histograms: repetitionHistograms,
    definition_level_histograms: definitionHistograms,
  }
}
/**
 * Deserialize a thrift compact-protocol struct from the reader and map it
 * onto an OffsetIndex object. Each entry of field_1 is converted with
 * pageLocation; field_2 is passed through unchanged.
 *
 * @param {DataReader} reader
 * @returns {import('./types.d.ts').OffsetIndex}
 */
export function readOffsetIndex(reader) {
  const { field_1: locations, field_2: byteArrayDataBytes } = deserializeTCompactProtocol(reader)
  const page_locations = locations.map(pageLocation)
  return {
    page_locations,
    unencoded_byte_array_data_bytes: byteArrayDataBytes,
  }
}
/**
 * Rename the numbered thrift fields of one page location entry to the
 * property names of PageLocation.
 *
 * @param {any} loc deserialized thrift struct with field_1..field_3
 * @returns {import('./types.d.ts').PageLocation}
 */
function pageLocation(loc) {
  const {
    field_1: offset,
    field_2: compressed_page_size,
    field_3: first_row_index,
  } = loc
  return { offset, compressed_page_size, first_row_index }
}

@ -137,7 +137,7 @@ export function parquetMetadata(arrayBuffer) {
data_page_offset: column.field_3.field_9,
index_page_offset: column.field_3.field_10,
dictionary_page_offset: column.field_3.field_11,
statistics: columnStats(column.field_3.field_12, columnSchema[columnIndex]),
statistics: convertStats(column.field_3.field_12, columnSchema[columnIndex]),
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
page_type: PageType[encodingStat.field_1],
encoding: Encoding[encodingStat.field_2],
@ -252,33 +252,41 @@ function timeUnit(unit) {
* @param {SchemaElement} schema
* @returns {import("./types.d.ts").Statistics}
*/
function columnStats(stats, schema) {
const { type, converted_type, logical_type } = schema
function convert(/** @type {Uint8Array} */ value) {
if (value === undefined) return value
if (type === 'BOOLEAN') return value[0] === 1
if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value)
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
if (type === 'FLOAT') return view.getFloat32(0, true)
if (type === 'DOUBLE') return view.getFloat64(0, true)
if (type === 'INT32' && converted_type === 'DATE') return new Date(view.getInt32(0, true) * 86400000)
if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS') return new Date(Number(view.getBigInt64(0, true) / 1000n))
if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS') return new Date(Number(view.getBigInt64(0, true)))
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true)))
if (type === 'INT32') return view.getInt32(0, true)
if (type === 'INT64') return view.getBigInt64(0, true)
if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0))
if (logical_type?.type === 'FLOAT16') return parseFloat16(value)
return value
}
function convertStats(stats, schema) {
return stats && {
max: convert(stats.field_1),
min: convert(stats.field_2),
max: convertMetadata(stats.field_1, schema),
min: convertMetadata(stats.field_2, schema),
null_count: stats.field_3,
distinct_count: stats.field_4,
max_value: convert(stats.field_5),
min_value: convert(stats.field_6),
max_value: convertMetadata(stats.field_5, schema),
min_value: convertMetadata(stats.field_6, schema),
is_max_value_exact: stats.field_7,
is_min_value_exact: stats.field_8,
}
}
/**
 * Decode a raw min/max entry (from column statistics or a column index)
 * into a JavaScript value, according to the column's schema element.
 *
 * Branches are tried in order; the first one whose type matches AND whose
 * value holds enough bytes wins. A value that matches no branch, or that is
 * too short for its typed branch, is returned as the original bytes instead
 * of throwing a RangeError from the DataView read.
 *
 * @param {Uint8Array | undefined} value raw bytes, or undefined when absent
 * @param {SchemaElement} schema schema element of the column
 * @returns {import('./types.d.ts').MinMaxType | undefined}
 */
export function convertMetadata(value, schema) {
  const { type, converted_type, logical_type } = schema
  if (value === undefined) return value
  if (type === 'BOOLEAN') return value[0] === 1
  if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value)

  // respect byteOffset so subarray views decode their own bytes
  const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
  // date/timestamp reads only need the leading bytes, so guard with >=
  // (keeps every previously working input working, removes the throw)
  const canReadInt32 = view.byteLength >= 4
  const canReadInt64 = view.byteLength >= 8

  if (type === 'FLOAT' && view.byteLength === 4) return view.getFloat32(0, true)
  if (type === 'DOUBLE' && view.byteLength === 8) return view.getFloat64(0, true)
  if (type === 'INT32' && converted_type === 'DATE' && canReadInt32) {
    // days since epoch -> milliseconds
    return new Date(view.getInt32(0, true) * 86400000)
  }
  if (type === 'INT64' && canReadInt64) {
    if (converted_type === 'TIMESTAMP_MICROS') return new Date(Number(view.getBigInt64(0, true) / 1000n))
    if (converted_type === 'TIMESTAMP_MILLIS') return new Date(Number(view.getBigInt64(0, true)))
    if (logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true)))
  }
  if (type === 'INT32' && view.byteLength === 4) return view.getInt32(0, true)
  if (type === 'INT64' && view.byteLength === 8) return view.getBigInt64(0, true)
  if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0))
  if (logical_type?.type === 'FLOAT16') return parseFloat16(value)
  // FIXED_LEN_BYTE_ARRAY and anything unrecognised: hand back the raw bytes
  return value
}

@ -78,9 +78,10 @@ function readElement(reader, type) {
}
case CompactType.LIST: {
const [elemType, listSize] = readCollectionBegin(reader)
const boolType = elemType === CompactType.TRUE || elemType === CompactType.FALSE
const values = new Array(listSize)
for (let i = 0; i < listSize; i++) {
values[i] = readElement(reader, elemType)
values[i] = boolType ? readElement(reader, CompactType.BYTE) === 1 : readElement(reader, elemType)
}
return values
}
@ -203,12 +204,11 @@ function readFieldBegin(reader, lastFid) {
}
const delta = type >> 4
let fid // field id
if (delta === 0) {
// not a delta, read zigzag varint field id
fid = readZigZag(reader)
} else {
if (delta) {
// add delta to last field id
fid = lastFid + delta
} else {
throw new Error('non-delta field id not supported')
}
return [getCompactType(type), fid, fid]
}

25
src/types.d.ts vendored

@ -211,7 +211,7 @@ interface KeyValue {
value?: string
}
type MinMaxType = bigint | boolean | number | string
type MinMaxType = bigint | boolean | number | string | Date | Uint8Array
export interface Statistics {
max?: MinMaxType
@ -301,3 +301,26 @@ export type DecodedArray =
Float32Array |
Float64Array |
any[]
/** Parsed offset index, the shape returned by readOffsetIndex in src/indexes.js. */
export interface OffsetIndex {
/** page location entries, each mapped from a thrift struct in field_1 */
page_locations: PageLocation[]
/** optional; passed through from thrift field_2 without conversion */
unencoded_byte_array_data_bytes?: bigint[]
}
/**
 * One entry of OffsetIndex.page_locations, built from thrift fields 1-3.
 * NOTE(review): field meanings below are inferred from the names — confirm
 * against the parquet format specification.
 */
interface PageLocation {
/** thrift field_1; presumably the byte offset of the page within the file */
offset: bigint
/** thrift field_2; presumably the page size in bytes as stored in the file */
compressed_page_size: number
/** thrift field_3; presumably the index of the page's first row in its row group */
first_row_index: bigint
}
/** Parsed column index, the shape returned by readColumnIndex in src/indexes.js. */
export interface ColumnIndex {
/** thrift field_1, passed through unchanged */
null_pages: boolean[]
/** thrift field_2 entries, each decoded with convertMetadata using the column schema */
min_values: MinMaxType[]
/** thrift field_3 entries, each decoded with convertMetadata using the column schema */
max_values: MinMaxType[]
/** name looked up from the integer in thrift field_4 */
boundary_order: BoundaryOrder
/** optional; thrift field_5, passed through unchanged */
null_counts?: bigint[]
/** optional; thrift field_6, passed through unchanged */
repetition_level_histograms?: bigint[]
/** optional; thrift field_7, passed through unchanged */
definition_level_histograms?: bigint[]
}
/** Boundary order names; the same strings, in the same order, as the BoundaryOrder array in src/constants.js. */
export type BoundaryOrder = 'UNORDERED' | 'ASCENDING' | 'DESCENDING'

File diff suppressed because it is too large Load Diff

@ -0,0 +1,823 @@
[
[
{
"page_locations": [
{
"offset": 4,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 54,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 104,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 154,
"compressed_page_size": 59,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 213,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 264,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 314,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 364,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 414,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 464,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 514,
"compressed_page_size": 53,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 567,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 617,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 667,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 717,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 767,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 817,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 867,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 917,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 967,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1017,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1067,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1117,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1167,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1217,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1267,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1317,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1367,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1421,
"compressed_page_size": 55,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1476,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1530,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1584,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1638,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1692,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1746,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1800,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1854,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1908,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1962,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2016,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2070,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2124,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2178,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2232,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2286,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2340,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2394,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2448,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2502,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2556,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2610,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2664,
"compressed_page_size": 54,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2718,
"compressed_page_size": 55,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2773,
"compressed_page_size": 53,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2826,
"compressed_page_size": 53,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2879,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2928,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2977,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3026,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3076,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3126,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3177,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3227,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3277,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3327,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3377,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3427,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3477,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3527,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3578,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3629,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3679,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3729,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3780,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3831,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3880,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3929,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3978,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4029,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4078,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4127,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4176,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4225,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4274,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4323,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4374,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4425,
"compressed_page_size": 51,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4476,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4525,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4574,
"compressed_page_size": 49,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4623,
"compressed_page_size": 49,
"first_row_index": 0
}
]
}
]
]

@ -0,0 +1,259 @@
[
[
{
"null_pages": [
false
],
"min_values": [
1
],
"max_values": [
105
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
8817
],
"max_values": [
1895444
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
37
],
"max_values": [
7135
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
464
],
"max_values": [
49388
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
2449130
],
"max_values": [
2452641
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
2449100
],
"max_values": [
2452611
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
1
],
"max_values": [
30
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
1
],
"max_values": [
12
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
1925
],
"max_values": [
1991
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"AAAAAAAAABAAAAAA"
],
"max_values": [
"AAAAAAAAPFAAAAAA"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"Dr."
],
"max_values": [
"Sir"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"Albert"
],
"max_values": [
"William"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"Baker"
],
"max_values": [
"Young"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"N"
],
"max_values": [
"Y"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"AFGHANISTAN"
],
"max_values": [
"WALLIS AND FUTUNA"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"Albert.Brunson@62.com"
],
"max_values": [
"William.Warner@zegnrzurU.org"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"2452293"
],
"max_values": [
"2452644"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
}
]
]

@ -0,0 +1,157 @@
[
[
{
"page_locations": [
{
"offset": 4,
"compressed_page_size": 50,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 54,
"compressed_page_size": 388,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 442,
"compressed_page_size": 261,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 703,
"compressed_page_size": 307,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1010,
"compressed_page_size": 247,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1257,
"compressed_page_size": 247,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1504,
"compressed_page_size": 131,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1635,
"compressed_page_size": 115,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1750,
"compressed_page_size": 144,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1894,
"compressed_page_size": 933,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 2827,
"compressed_page_size": 378,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3205,
"compressed_page_size": 707,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 3912,
"compressed_page_size": 751,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4663,
"compressed_page_size": 154,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 4817,
"compressed_page_size": 1154,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 5971,
"compressed_page_size": 2857,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 8828,
"compressed_page_size": 405,
"first_row_index": 0
}
]
}
]
]

@ -0,0 +1,19 @@
[
[
{
"null_pages": [
false
],
"min_values": [
1552
],
"max_values": [
1552
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
}
]
]

@ -0,0 +1,13 @@
[
[
{
"page_locations": [
{
"offset": 4,
"compressed_page_size": 40,
"first_row_index": 0
}
]
}
]
]

@ -0,0 +1,21 @@
[
[
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null,
null
]
]

@ -0,0 +1,157 @@
[
[
{
"page_locations": [
{
"offset": 4,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 73,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 142,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 207,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 271,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 335,
"compressed_page_size": 38,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 403,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 466,
"compressed_page_size": 32,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 525,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 586,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 659,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 731,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 802,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 870,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 938,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1009,
"compressed_page_size": 34,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 1079,
"compressed_page_size": 30,
"first_row_index": 0
}
]
}
]
]

@ -0,0 +1,83 @@
[
[
{
"boundary_order": "UNORDERED",
"max_values": [
"2022-11-27T17:42:44.280Z"
],
"min_values": [
"2022-11-27T17:42:43.514Z"
],
"null_counts": [
0
],
"null_pages": [
false
]
},
{
"boundary_order": "UNORDERED",
"max_values": [
85016
],
"min_values": [
1184
],
"null_counts": [
0
],
"null_pages": [
false
]
},
{
"boundary_order": "UNORDERED",
"max_values": [
[
0
]
],
"min_values": [
[
0
]
],
"null_counts": [
4
],
"null_pages": [
true
]
},
{
"boundary_order": "UNORDERED",
"max_values": [
1
],
"min_values": [
-1
],
"null_counts": [
0
],
"null_pages": [
false
]
},
{
"boundary_order": "UNORDERED",
"max_values": [
343
],
"min_values": [
343
],
"null_counts": [
0
],
"null_pages": [
false
]
}
]
]

@ -0,0 +1,49 @@
[
[
{
"page_locations": [
{
"offset": 4,
"compressed_page_size": 73,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 132,
"compressed_page_size": 65,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 242,
"compressed_page_size": 37,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 315,
"compressed_page_size": 68,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 435,
"compressed_page_size": 57,
"first_row_index": 0
}
]
}
]
]

@ -0,0 +1,34 @@
[
[
{
"null_pages": [
false
],
"min_values": [
"name"
],
"max_values": [
"parent"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"another"
],
"max_values": [
"report"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
}
]
]

@ -0,0 +1,22 @@
[
[
{
"page_locations": [
{
"offset": 4,
"compressed_page_size": 69,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 73,
"compressed_page_size": 72,
"first_row_index": 0
}
]
}
]
]

@ -0,0 +1,34 @@
[
[
{
"null_pages": [
false
],
"min_values": [
0
],
"max_values": [
0
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
},
{
"null_pages": [
false
],
"min_values": [
"a655fd0e-9949-4059-bcae-fd6a002a4652"
],
"max_values": [
"a655fd0e-9949-4059-bcae-fd6a002a4652"
],
"boundary_order": "ASCENDING",
"null_counts": [
0
]
}
]
]

@ -0,0 +1,22 @@
[
[
{
"page_locations": [
{
"offset": 31,
"compressed_page_size": 27,
"first_row_index": 0
}
]
},
{
"page_locations": [
{
"offset": 117,
"compressed_page_size": 27,
"first_row_index": 0
}
]
}
]
]

62
test/indexes.test.js Normal file

@ -0,0 +1,62 @@
import fs from 'fs'
import { describe, expect, it } from 'vitest'
import { parquetMetadata } from '../src/hyparquet.js'
import { readColumnIndex, readOffsetIndex } from '../src/indexes.js'
import { getSchemaPath } from '../src/schema.js'
import { asyncBufferFromFile, toJson } from '../src/utils.js'
import { fileToJson } from './helpers.js'
describe('readColumnIndex', () => {
  // Fixtures are named <base>.column_indexes.json; the parquet file under test
  // is <base>.parquet in the same directory.
  const suffix = '.column_indexes.json'
  const columnIndexesFiles = fs.readdirSync('test/files').filter(f => f.endsWith(suffix))

  columnIndexesFiles.forEach((expectedFile) => {
    // Strip the exact suffix the filter matched (no regex, so no unescaped dots
    // or case-insensitivity that could disagree with the filter).
    const file = `${expectedFile.slice(0, -suffix.length)}.parquet`

    it(`parse column indexes from ${file}`, async () => {
      const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
      const metadata = parquetMetadata(arrayBuffer)

      const result = metadata.row_groups.map((rowGroup) => rowGroup.columns.map((column) => {
        // columns without a column index are reported as null
        if (column.column_index_offset === undefined || column.column_index_length === undefined) return null
        const columnIndexOffset = Number(column.column_index_offset)
        const columnIndexLength = Number(column.column_index_length)
        const columnIndexArrayBuffer = arrayBuffer.slice(columnIndexOffset, columnIndexOffset + columnIndexLength)
        const columnIndexReader = { view: new DataView(columnIndexArrayBuffer), offset: 0 }
        // the last element of the schema path is the column's own schema element
        const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? [])
        return readColumnIndex(columnIndexReader, schemaPath.at(-1)?.element || { name: '' })
      }))

      const expected = fileToJson(`test/files/${expectedFile}`)
      expect(toJson(result)).toEqual(expected)
    })
  })
})
describe('readOffsetIndex', () => {
  // Fixtures are named <base>.offset_indexes.json; the parquet file under test
  // is <base>.parquet in the same directory.
  const suffix = '.offset_indexes.json'
  const offsetIndexesFiles = fs.readdirSync('test/files').filter(f => f.endsWith(suffix))

  offsetIndexesFiles.forEach((expectedFile) => {
    // Strip the exact suffix the filter matched (no regex, so no unescaped dots
    // or case-insensitivity that could disagree with the filter).
    const file = `${expectedFile.slice(0, -suffix.length)}.parquet`

    it(`parse offset indexes from ${file}`, async () => {
      const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
      const metadata = parquetMetadata(arrayBuffer)

      const result = metadata.row_groups.map((rowGroup) => rowGroup.columns.map((column) => {
        // columns without an offset index are reported as null
        if (column.offset_index_offset === undefined || column.offset_index_length === undefined) return null
        const offsetIndexOffset = Number(column.offset_index_offset)
        const offsetIndexLength = Number(column.offset_index_length)
        const offsetIndexArrayBuffer = arrayBuffer.slice(offsetIndexOffset, offsetIndexOffset + offsetIndexLength)
        const offsetIndexReader = { view: new DataView(offsetIndexArrayBuffer), offset: 0 }
        return readOffsetIndex(offsetIndexReader)
      }))

      const expected = fileToJson(`test/files/${expectedFile}`)
      expect(toJson(result)).toEqual(expected)
    })
  })
})
/**
 * Open a file through asyncBufferFromFile and resolve to the result of
 * slicing it from byte 0 onward.
 *
 * @param {string} filename
 * @returns {Promise<ArrayBuffer>}
 */
function readFileToArrayBuffer(filename) {
  const pending = asyncBufferFromFile(filename)
  return pending.then(function sliceFromStart(buffer) {
    return buffer.slice(0)
  })
}

@ -5,7 +5,6 @@ import { reader } from './helpers.js'
describe('deserializeTCompactProtocol function', () => {
it('parses basic types correctly', () => {
// Setup a buffer with thrift encoded data for basic types
const buffer = new ArrayBuffer(128)
const view = new DataView(buffer)
let index = 0
@ -79,6 +78,21 @@ describe('deserializeTCompactProtocol function', () => {
expect(new TextDecoder().decode(value.field_8)).toBe('Hello, Thrift!') // STRING
})
// Deserializes a hand-written byte sequence and checks each numbered field of the result.
it('parses rle-dict column index correctly', () => {
// Raw thrift compact-protocol bytes; per the assertions below they carry fields 1-5
// (a boolean list, two lists of 8-byte binaries, an integer, and a bigint list).
const buffer = new Uint8Array([25, 17, 2, 25, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, 21, 2, 25, 22, 0, 0])
const view = new DataView(buffer.buffer)
const reader = { view, offset: 0 }
const value = deserializeTCompactProtocol(reader)
// list elements of boolean type must come back as real booleans
expect(value.field_1).toEqual([false])
expect(value.field_2).toEqual([new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])])
expect(value.field_3).toEqual([new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])])
expect(value.field_4).toEqual(1)
expect(value.field_5).toEqual([0n])
// fields not present in the bytes are left undefined
expect(value.field_6).toBeUndefined()
expect(value.field_7).toBeUndefined()
expect(value.field_8).toBeUndefined()
})
})
describe('readVarInt', () => {