diff --git a/demo/demo.js b/demo/demo.js index 1430145..c81a592 100644 --- a/demo/demo.js +++ b/demo/demo.js @@ -129,7 +129,7 @@ function renderSidebar(asyncBuffer, metadata, name) { const sidebar = /** @type {HTMLElement} */ (document.getElementById('sidebar')) sidebar.innerHTML = `
${name}
` sidebar.appendChild(fileMetadata(toJson(metadata))) - sidebar.appendChild(fileLayout(metadata, asyncBuffer.byteLength)) + sidebar.appendChild(fileLayout(metadata, asyncBuffer)) } welcome.addEventListener('click', () => { diff --git a/demo/layout.js b/demo/layout.js index 317ed94..d52a1e9 100644 --- a/demo/layout.js +++ b/demo/layout.js @@ -24,12 +24,14 @@ export function fileMetadata(metadata) { * Render parquet file layout. * * @param {FileMetaData} metadata - * @param {number} byteLength + * @param {import('../src/types.js').AsyncBuffer} asyncBuffer * @returns {HTMLDivElement} */ -export function fileLayout(metadata, byteLength) { +export function fileLayout(metadata, asyncBuffer) { let html = '

File layout

' html += cell('PAR1', 0n, 4n) // magic number + + // data pages by row group and column /** @type {[string, bigint, bigint][]} */ const indexPages = [] for (const rowGroupIndex in metadata.row_groups) { @@ -67,13 +69,17 @@ export function fileLayout(metadata, byteLength) { } html += '' } - for (const [name, start, length] of indexPages) { + + // column and offset indexes + for (const [name, start, length] of indexPages.sort((a, b) => Number(a[1]) - Number(b[1]))) { html += cell(name, start, start + length) } - const metadataStart = BigInt(byteLength - metadata.metadata_length - 4) - const metadataEnd = BigInt(byteLength - 4) + + // metadata footer + const metadataStart = BigInt(asyncBuffer.byteLength - metadata.metadata_length - 4) + const metadataEnd = BigInt(asyncBuffer.byteLength - 4) html += cell('Metadata', metadataStart, metadataEnd) - html += cell('PAR1', metadataEnd, BigInt(byteLength)) // magic number + html += cell('PAR1', metadataEnd, BigInt(asyncBuffer.byteLength)) // magic number const div = document.createElement('div') div.innerHTML = html div.classList.add('layout', 'collapsed') // start collapsed diff --git a/src/constants.js b/src/constants.js index 17bd3f2..42a581e 100644 --- a/src/constants.js +++ b/src/constants.js @@ -1,6 +1,4 @@ -/** - * @type {import('./types.js').ParquetType[]} - */ +/** @type {import('./types.js').ParquetType[]} */ export const ParquetType = [ 'BOOLEAN', 'INT32', @@ -31,9 +29,7 @@ export const FieldRepetitionType = [ 'REPEATED', ] -/** - * @type {import('./types.js').ConvertedType[]} - */ +/** @type {import('./types.js').ConvertedType[]} */ export const ConvertedType = [ 'UTF8', 'MAP', @@ -59,9 +55,7 @@ export const ConvertedType = [ 'INTERVAL', ] -/** - * @type {import('./types.js').LogicalTypeType[]} - */ +/** @type {import('./types.js').LogicalTypeType[]} */ export const logicalTypeType = [ 'NULL', 'STRING', @@ -91,12 +85,17 @@ export const CompressionCodec = [ 'LZ4_RAW', ] -/** - * @type {import('./types.js').PageType[]} - */ +/** @type {import('./types.js').PageType[]} */ export const PageType = [ 'DATA_PAGE', 'INDEX_PAGE', 'DICTIONARY_PAGE', 'DATA_PAGE_V2', ] + +/** @type {import('./types.js').BoundaryOrder[]} */ +export const BoundaryOrder = [ + 'UNORDERED', + 'ASCENDING', + 'DESCENDING', +] diff --git a/src/indexes.js b/src/indexes.js new file mode 100644 index 0000000..f4c5f07 --- /dev/null +++ b/src/indexes.js @@ -0,0 +1,46 @@ +import { BoundaryOrder } from './constants.js' +import { convertMetadata } from './metadata.js' +import { deserializeTCompactProtocol } from './thrift.js' + +/** + * @typedef {import('./types.d.ts').DataReader} DataReader + * @param {DataReader} reader + * @param {import('./types.d.ts').SchemaElement} schema + * @returns {import('./types.d.ts').ColumnIndex} + */ +export function readColumnIndex(reader, schema) { + const thrift = deserializeTCompactProtocol(reader) + return { + null_pages: thrift.field_1, + min_values: thrift.field_2.map((/** @type {any} */ m) => convertMetadata(m, schema)), + max_values: thrift.field_3.map((/** @type {any} */ m) => convertMetadata(m, schema)), + boundary_order: BoundaryOrder[thrift.field_4], + null_counts: thrift.field_5, + repetition_level_histograms: thrift.field_6, + definition_level_histograms: thrift.field_7, + } +} + +/** + * @param {DataReader} reader + * @returns {import('./types.d.ts').OffsetIndex} + */ +export function readOffsetIndex(reader) { + const thrift = deserializeTCompactProtocol(reader) + return { + page_locations: thrift.field_1.map(pageLocation), + unencoded_byte_array_data_bytes: thrift.field_2, + } +} + +/** + * @param {any} loc + * @returns {import('./types.d.ts').PageLocation} + */ +function pageLocation(loc) { + return { + offset: loc.field_1, + compressed_page_size: loc.field_2, + first_row_index: loc.field_3, + } +} diff --git a/src/metadata.js b/src/metadata.js index f95c602..ce2dc9e 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -137,7 +137,7 @@ export function parquetMetadata(arrayBuffer) { data_page_offset: column.field_3.field_9, index_page_offset: column.field_3.field_10, dictionary_page_offset: column.field_3.field_11, - statistics: columnStats(column.field_3.field_12, columnSchema[columnIndex]), + statistics: convertStats(column.field_3.field_12, columnSchema[columnIndex]), encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({ page_type: PageType[encodingStat.field_1], encoding: Encoding[encodingStat.field_2], @@ -252,33 +252,41 @@ function timeUnit(unit) { * @param {SchemaElement} schema * @returns {import("./types.d.ts").Statistics} */ -function columnStats(stats, schema) { - const { type, converted_type, logical_type } = schema - function convert(/** @type {Uint8Array} */ value) { - if (value === undefined) return value - if (type === 'BOOLEAN') return value[0] === 1 - if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value) - const view = new DataView(value.buffer, value.byteOffset, value.byteLength) - if (type === 'FLOAT') return view.getFloat32(0, true) - if (type === 'DOUBLE') return view.getFloat64(0, true) - if (type === 'INT32' && converted_type === 'DATE') return new Date(view.getInt32(0, true) * 86400000) - if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS') return new Date(Number(view.getBigInt64(0, true) / 1000n)) - if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS') return new Date(Number(view.getBigInt64(0, true))) - if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true))) - if (type === 'INT32') return view.getInt32(0, true) - if (type === 'INT64') return view.getBigInt64(0, true) - if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0)) - if (logical_type?.type === 'FLOAT16') return parseFloat16(value) - return value - } +function convertStats(stats, schema) { return stats && { - max: convert(stats.field_1), - min: convert(stats.field_2), + max: convertMetadata(stats.field_1, schema), + min: convertMetadata(stats.field_2, schema), null_count: stats.field_3, distinct_count: stats.field_4, - max_value: convert(stats.field_5), - min_value: convert(stats.field_6), + max_value: convertMetadata(stats.field_5, schema), + min_value: convertMetadata(stats.field_6, schema), is_max_value_exact: stats.field_7, is_min_value_exact: stats.field_8, } } + +/** + * @param {Uint8Array | undefined} value + * @param {SchemaElement} schema + * @returns {import('./types.d.ts').MinMaxType | undefined} + */ +export function convertMetadata(value, schema) { + const { type, converted_type, logical_type } = schema + if (value === undefined) return value + if (type === 'BOOLEAN') return value[0] === 1 + if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value) + const view = new DataView(value.buffer, value.byteOffset, value.byteLength) + if (type === 'FLOAT' && view.byteLength === 4) return view.getFloat32(0, true) + if (type === 'DOUBLE' && view.byteLength === 8) return view.getFloat64(0, true) + if (type === 'INT32' && converted_type === 'DATE') return new Date(view.getInt32(0, true) * 86400000) + if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS') return new Date(Number(view.getBigInt64(0, true) / 1000n)) + if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS') return new Date(Number(view.getBigInt64(0, true))) + if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true))) + if (type === 'INT32' && view.byteLength === 4) return view.getInt32(0, true) + if (type === 'INT64' && view.byteLength === 8) return view.getBigInt64(0, true) + if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0)) + if (logical_type?.type === 'FLOAT16') return parseFloat16(value) + if (type === 'FIXED_LEN_BYTE_ARRAY') return value + // assert(false) + return value +} diff --git a/src/thrift.js b/src/thrift.js index 9a48523..75fb735 100644 --- a/src/thrift.js +++ b/src/thrift.js @@ -78,9 +78,10 @@ function readElement(reader, type) { } case CompactType.LIST: { const [elemType, listSize] = readCollectionBegin(reader) + const boolType = elemType === CompactType.TRUE || elemType === CompactType.FALSE const values = new Array(listSize) for (let i = 0; i < listSize; i++) { - values[i] = readElement(reader, elemType) + values[i] = boolType ? readElement(reader, CompactType.BYTE) === 1 : readElement(reader, elemType) } return values } @@ -203,12 +204,11 @@ function readFieldBegin(reader, lastFid) { } const delta = type >> 4 let fid // field id - if (delta === 0) { - // not a delta, read zigzag varint field id - fid = readZigZag(reader) - } else { + if (delta) { // add delta to last field id fid = lastFid + delta + } else { + throw new Error('non-delta field id not supported') } return [getCompactType(type), fid, fid] } diff --git a/src/types.d.ts b/src/types.d.ts index 997a280..df29925 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -211,7 +211,7 @@ interface KeyValue { value?: string } -type MinMaxType = bigint | boolean | number | string +type MinMaxType = bigint | boolean | number | string | Date | Uint8Array export interface Statistics { max?: MinMaxType @@ -301,3 +301,26 @@ export type DecodedArray = Float32Array | Float64Array | any[] + +export interface OffsetIndex { + page_locations: PageLocation[] + unencoded_byte_array_data_bytes?: bigint[] +} + +interface PageLocation { + offset: bigint + compressed_page_size: number + first_row_index: bigint +} + +export interface ColumnIndex { + null_pages: boolean[] + min_values: MinMaxType[] + max_values: MinMaxType[] + boundary_order: BoundaryOrder + null_counts?: bigint[] + repetition_level_histograms?: bigint[] + definition_level_histograms?: bigint[] +} + +export type BoundaryOrder = 'UNORDERED' | 'ASCENDING' | 'DESCENDING' diff --git a/test/files/adam_genotypes.column_indexes.json b/test/files/adam_genotypes.column_indexes.json new file mode 100644 index 0000000..a378149 --- /dev/null +++ b/test/files/adam_genotypes.column_indexes.json @@ -0,0 +1,1369 @@ +[ + [ + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "name" + ], + "max_values": [ + "name" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + "" + ], + "max_values": [ + "" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + false + ], + "max_values": [ + false + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + }, + { + "null_pages": [ + true + ], + "min_values": [ + [] + ], + "max_values": [ + [] + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 1 + ] + } + ] +] diff --git a/test/files/adam_genotypes.offset_indexes.json b/test/files/adam_genotypes.offset_indexes.json new file mode 100644 index 0000000..ec8f158 --- /dev/null +++ b/test/files/adam_genotypes.offset_indexes.json @@ -0,0 +1,823 @@ +[ + [ + { + "page_locations": [ + { + "offset": 4, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 54, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 104, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 154, + "compressed_page_size": 59, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 213, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 264, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 314, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 364, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 414, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 464, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 514, + "compressed_page_size": 53, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 567, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 617, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 667, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 717, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 767, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 817, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 867, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 917, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 967, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1017, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1067, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1117, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1167, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1217, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1267, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1317, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1367, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1421, + "compressed_page_size": 55, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1476, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1530, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1584, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1638, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1692, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1746, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1800, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1854, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1908, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1962, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2016, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2070, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2124, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2178, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2232, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2286, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2340, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2394, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2448, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2502, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2556, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2610, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2664, + "compressed_page_size": 54, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2718, + "compressed_page_size": 55, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2773, + "compressed_page_size": 53, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2826, + "compressed_page_size": 53, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2879, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2928, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2977, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3026, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3076, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3126, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3177, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3227, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3277, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3327, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3377, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3427, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3477, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3527, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3578, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3629, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3679, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3729, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3780, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3831, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3880, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3929, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3978, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4029, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4078, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4127, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4176, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4225, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4274, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4323, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4374, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4425, + "compressed_page_size": 51, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4476, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4525, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4574, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4623, + "compressed_page_size": 49, + "first_row_index": 0 + } + ] + } + ] +] \ No newline at end of file diff --git a/test/files/delta_encoding_required_column.column_indexes.json b/test/files/delta_encoding_required_column.column_indexes.json new file mode 100644 index 0000000..6f3b921 --- /dev/null +++ b/test/files/delta_encoding_required_column.column_indexes.json @@ -0,0 +1,259 @@ +[ + [ + { + "null_pages": [ + false + ], + "min_values": [ + 1 + ], + "max_values": [ + 105 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + 8817 + ], + "max_values": [ + 1895444 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + 37 + ], + "max_values": [ + 7135 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + 464 + ], + "max_values": [ + 49388 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + 2449130 + ], + "max_values": [ + 2452641 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + 2449100 + ], + "max_values": [ + 2452611 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + 1 + ], + "max_values": [ + 30 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + 1 + ], + "max_values": [ + 12 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + 1925 + ], + "max_values": [ + 1991 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "AAAAAAAAABAAAAAA" + ], + "max_values": [ + "AAAAAAAAPFAAAAAA" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "Dr." + ], + "max_values": [ + "Sir" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "Albert" + ], + "max_values": [ + "William" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "Baker" + ], + "max_values": [ + "Young" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "N" + ], + "max_values": [ + "Y" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "AFGHANISTAN" + ], + "max_values": [ + "WALLIS AND FUTUNA" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "Albert.Brunson@62.com" + ], + "max_values": [ + "William.Warner@zegnrzurU.org" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "2452293" + ], + "max_values": [ + "2452644" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + } + ] +] \ No newline at end of file diff --git a/test/files/delta_encoding_required_column.offset_indexes.json b/test/files/delta_encoding_required_column.offset_indexes.json new file mode 100644 index 0000000..c9d54d1 --- /dev/null +++ b/test/files/delta_encoding_required_column.offset_indexes.json @@ -0,0 +1,157 @@ +[ + [ + { + "page_locations": [ + { + "offset": 4, + "compressed_page_size": 50, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 54, + "compressed_page_size": 388, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 442, + "compressed_page_size": 261, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 703, + "compressed_page_size": 307, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1010, + "compressed_page_size": 247, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1257, + "compressed_page_size": 247, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1504, + "compressed_page_size": 131, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1635, + "compressed_page_size": 115, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1750, + "compressed_page_size": 144, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1894, + "compressed_page_size": 933, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 2827, + "compressed_page_size": 378, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3205, + "compressed_page_size": 707, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 3912, + "compressed_page_size": 751, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4663, + "compressed_page_size": 154, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 4817, + "compressed_page_size": 1154, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 5971, + "compressed_page_size": 2857, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 8828, + "compressed_page_size": 405, + "first_row_index": 0 + } + ] + } + ] +] \ No newline at end of file diff --git a/test/files/dict-page-offset-zero.column_indexes.json b/test/files/dict-page-offset-zero.column_indexes.json new file mode 100644 index 0000000..9cbd1e7 --- /dev/null +++ b/test/files/dict-page-offset-zero.column_indexes.json @@ -0,0 +1,19 @@ +[ + [ + { + "null_pages": [ + false + ], + "min_values": [ + 1552 + ], + "max_values": [ + 1552 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + } + ] +] \ No newline at end of file diff --git a/test/files/dict-page-offset-zero.offset_indexes.json b/test/files/dict-page-offset-zero.offset_indexes.json new file mode 100644 index 0000000..fadc4ee --- /dev/null +++ b/test/files/dict-page-offset-zero.offset_indexes.json @@ -0,0 +1,13 @@ +[ + [ + { + "page_locations": [ + { + "offset": 4, + "compressed_page_size": 40, + "first_row_index": 0 + } + ] + } + ] +] \ No newline at end of file diff --git a/test/files/duckdb4442.column_indexes.json b/test/files/duckdb4442.column_indexes.json new file mode 100644 index 0000000..f237184 --- /dev/null +++ b/test/files/duckdb4442.column_indexes.json @@ -0,0 +1,21 @@ +[ + [ + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null + ] +] diff --git a/test/files/duckdb4442.offset_indexes.json b/test/files/duckdb4442.offset_indexes.json new file mode 100644 index 0000000..c1d9fc3 --- /dev/null +++ b/test/files/duckdb4442.offset_indexes.json @@ -0,0 +1,157 @@ +[ + [ + { + "page_locations": [ + { + "offset": 4, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 73, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 142, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 207, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 271, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 335, + "compressed_page_size": 38, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 403, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 466, + "compressed_page_size": 32, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 525, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 586, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 659, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 731, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 802, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 870, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 938, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1009, + "compressed_page_size": 34, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 1079, + "compressed_page_size": 30, + "first_row_index": 0 + } + ] + } + ] +] \ No newline at end of file diff --git a/test/files/duckdb5533.column_indexes.json b/test/files/duckdb5533.column_indexes.json new file mode 100644 index 0000000..ef20121 --- /dev/null +++ b/test/files/duckdb5533.column_indexes.json @@ -0,0 +1,83 @@ +[ + [ + { + "boundary_order": "UNORDERED", + "max_values": [ + "2022-11-27T17:42:44.280Z" + ], + "min_values": [ + "2022-11-27T17:42:43.514Z" + ], + "null_counts": [ + 0 + ], + "null_pages": [ + false + ] + }, + { + "boundary_order": "UNORDERED", + "max_values": [ + 85016 + ], + "min_values": [ + 1184 + ], + "null_counts": [ + 0 + ], + "null_pages": [ + false + ] + }, + { + "boundary_order": "UNORDERED", + "max_values": [ + [ + 0 + ] + ], + "min_values": [ + [ + 0 + ] + ], + "null_counts": [ + 4 + ], + "null_pages": [ + true + ] + }, + { + "boundary_order": "UNORDERED", + "max_values": [ + 1 + ], + "min_values": [ + -1 + ], + "null_counts": [ + 0 + ], + "null_pages": [ + false + ] + }, + { + "boundary_order": "UNORDERED", + "max_values": [ + 343 + ], + "min_values": [ + 343 + ], + "null_counts": [ + 0 + ], + "null_pages": [ + false + ] + } + ] +] diff --git a/test/files/duckdb5533.offset_indexes.json b/test/files/duckdb5533.offset_indexes.json new file mode 100644 index 0000000..cc27ddb --- /dev/null +++ b/test/files/duckdb5533.offset_indexes.json @@ -0,0 +1,49 @@ +[ + [ + { + "page_locations": [ + { + "offset": 4, + "compressed_page_size": 73, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 132, + "compressed_page_size": 65, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 242, + "compressed_page_size": 37, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 315, + "compressed_page_size": 68, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 435, + "compressed_page_size": 57, + "first_row_index": 0 + } + ] + } + ] +] \ No newline at end of file diff --git a/test/files/incorrect_map_schema.column_indexes.json b/test/files/incorrect_map_schema.column_indexes.json new file mode 100644 index 0000000..9e37bfd --- /dev/null +++ b/test/files/incorrect_map_schema.column_indexes.json @@ -0,0 +1,34 @@ +[ + [ + { + "null_pages": [ + false + ], + "min_values": [ + "name" + ], + "max_values": [ + "parent" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "another" + ], + "max_values": [ + "report" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + } + ] +] \ No newline at end of file diff --git a/test/files/incorrect_map_schema.offset_indexes.json b/test/files/incorrect_map_schema.offset_indexes.json new file mode 100644 index 0000000..97ea239 --- /dev/null +++ b/test/files/incorrect_map_schema.offset_indexes.json @@ -0,0 +1,22 @@ +[ + [ + { + "page_locations": [ + { + "offset": 4, + "compressed_page_size": 69, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 73, + "compressed_page_size": 72, + "first_row_index": 0 + } + ] + } + ] +] \ No newline at end of file diff --git a/test/files/plain-dict-uncompressed-checksum.column_indexes.json b/test/files/plain-dict-uncompressed-checksum.column_indexes.json new file mode 100644 index 0000000..3653778 --- /dev/null +++ b/test/files/plain-dict-uncompressed-checksum.column_indexes.json @@ -0,0 +1,34 @@ +[ + [ + { + "null_pages": [ + false + ], + "min_values": [ + 0 + ], + "max_values": [ + 0 + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + }, + { + "null_pages": [ + false + ], + "min_values": [ + "a655fd0e-9949-4059-bcae-fd6a002a4652" + ], + "max_values": [ + "a655fd0e-9949-4059-bcae-fd6a002a4652" + ], + "boundary_order": "ASCENDING", + "null_counts": [ + 0 + ] + } + ] +] \ No newline at end of file diff --git a/test/files/plain-dict-uncompressed-checksum.offset_indexes.json b/test/files/plain-dict-uncompressed-checksum.offset_indexes.json new file mode 100644 index 0000000..0464637 --- /dev/null +++ b/test/files/plain-dict-uncompressed-checksum.offset_indexes.json @@ -0,0 +1,22 @@ +[ + [ + { + "page_locations": [ + { + "offset": 31, + "compressed_page_size": 27, + "first_row_index": 0 + } + ] + }, + { + "page_locations": [ + { + "offset": 117, + "compressed_page_size": 27, + "first_row_index": 0 + } + ] + } + ] +] \ No newline at end of file diff --git a/test/indexes.test.js b/test/indexes.test.js new file mode 100644 index 0000000..bf203cd --- /dev/null +++ b/test/indexes.test.js @@ -0,0 +1,62 @@ +import fs from 'fs' +import { describe, expect, it } from 'vitest' +import { parquetMetadata } from '../src/hyparquet.js' +import { readColumnIndex, readOffsetIndex } from '../src/indexes.js' +import { getSchemaPath } from '../src/schema.js' +import { asyncBufferFromFile, toJson } from '../src/utils.js' +import { fileToJson } from './helpers.js' + +describe('readColumnIndex', () => { + const columnIndexesFiles = fs.readdirSync('test/files').filter(f => f.endsWith('.column_indexes.json')) + const parquetFiles = columnIndexesFiles.map(f => f.replace(/.column_indexes.json$/i, '.parquet')) + + parquetFiles.forEach((file, i) => { + it(`parse column indexes from ${file}`, async () => { + const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`) + const metadata = parquetMetadata(arrayBuffer) + + const result = metadata.row_groups.map((rowGroup) => rowGroup.columns.map((column) => { + if (column.column_index_offset === undefined || column.column_index_length === undefined) return null + const columnIndexOffset = Number(column.column_index_offset) + const columnIndexLength = Number(column.column_index_length) + const columnIndexArrayBuffer = arrayBuffer.slice(columnIndexOffset, columnIndexOffset + columnIndexLength) + const columnIndexReader = { view: new DataView(columnIndexArrayBuffer), offset: 0 } + const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? []) + return readColumnIndex(columnIndexReader, schemaPath.at(-1)?.element || { name: '' }) + })) + const expected = fileToJson(`test/files/${columnIndexesFiles[i]}`) + expect(toJson(result)).toEqual(expected) + }) + }) +}) + +describe('readOffsetIndex', () => { + const offsetIndexesFiles = fs.readdirSync('test/files').filter(f => f.endsWith('.offset_indexes.json')) + const parquetFiles = offsetIndexesFiles.map(f => f.replace(/.offset_indexes.json$/i, '.parquet')) + + parquetFiles.forEach((file, i) => { + it(`parse offset indexes from ${file}`, async () => { + const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`) + const metadata = parquetMetadata(arrayBuffer) + + const result = metadata.row_groups.map((rowGroup) => rowGroup.columns.map((column) => { + if (column.offset_index_offset === undefined || column.offset_index_length === undefined) return null + const offsetIndexOffset = Number(column.offset_index_offset) + const offsetIndexLength = Number(column.offset_index_length) + const offsetIndexArrayBuffer = arrayBuffer.slice(offsetIndexOffset, offsetIndexOffset + offsetIndexLength) + const offsetIndexReader = { view: new DataView(offsetIndexArrayBuffer), offset: 0 } + return readOffsetIndex(offsetIndexReader) + })) + const expected = fileToJson(`test/files/${offsetIndexesFiles[i]}`) + expect(toJson(result)).toEqual(expected) + }) + }) +}) + +/** + * @param {string} filename + * @returns {Promise} + */ +function readFileToArrayBuffer(filename) { + return asyncBufferFromFile(filename).then((buffer) => buffer.slice(0)) +} diff --git a/test/thrift.test.js b/test/thrift.test.js index d947353..d43377e 100644 --- a/test/thrift.test.js +++ b/test/thrift.test.js @@ -5,7 +5,6 @@ import { reader } from './helpers.js' describe('deserializeTCompactProtocol function', () => { it('parses basic types correctly', () => { - // Setup a buffer with thrift encoded data for basic types const buffer = new ArrayBuffer(128) const view = new DataView(buffer) let index = 0 @@ -79,6 +78,21 @@ describe('deserializeTCompactProtocol function', () => { expect(new TextDecoder().decode(value.field_8)).toBe('Hello, Thrift!') // STRING }) + it('parses rle-dict column index correctly', () => { + const buffer = new Uint8Array([25, 17, 2, 25, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, 21, 2, 25, 22, 0, 0]) + const view = new DataView(buffer.buffer) + const reader = { view, offset: 0 } + const value = deserializeTCompactProtocol(reader) + expect(value.field_1).toEqual([false]) + expect(value.field_2).toEqual([new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])]) + expect(value.field_3).toEqual([new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0])]) + expect(value.field_4).toEqual(1) + expect(value.field_5).toEqual([0n]) + expect(value.field_6).toBeUndefined() + expect(value.field_7).toBeUndefined() + expect(value.field_8).toBeUndefined() + }) + }) describe('readVarInt', () => {