hyparquet/src/column.js

import { assembleLists } from './assemble.js'
import { convert, dereferenceDictionary } from './convert.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
import { parquetHeader } from './header.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { snappyUncompress } from './snappy.js'
import { concat } from './utils.js'

/**
 * Parse column data from a buffer.
 *
 * @typedef {import('./types.js').ColumnMetaData} ColumnMetaData
 * @typedef {import('./types.js').DecodedArray} DecodedArray
 * @param {import('./types.js').DataReader} reader
 * @param {import('./types.js').RowGroup} rowGroup row group metadata
 * @param {ColumnMetaData} columnMetadata column metadata
 * @param {import('./types.js').SchemaTree[]} schemaPath schema path for the column
 * @param {import('./hyparquet.js').ParquetReadOptions} options read options
 * @returns {any[]} array of values
 */
export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compressors, utf8 }) {
  const { element } = schemaPath[schemaPath.length - 1]
  /** @type {DecodedArray | undefined} */
  let dictionary = undefined
  let seen = 0
  /** @type {any[]} */
  const rowData = []

  while (seen < rowGroup.num_rows) {
    // parse column header
    const header = parquetHeader(reader)
    // assert(header.compressed_page_size !== undefined)

    // read compressed_page_size bytes starting at offset
    const compressedBytes = new Uint8Array(
      reader.view.buffer, reader.view.byteOffset + reader.offset, header.compressed_page_size
    )

    // parse page data by type
    /** @type {DecodedArray} */
    let values
    if (header.type === 'DATA_PAGE') {
      const daph = header.data_page_header
      if (!daph) throw new Error('parquet data page header is undefined')

      const page = decompressPage(compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec, compressors)
      const { definitionLevels, repetitionLevels, dataPage } = readDataPage(page, daph, schemaPath, columnMetadata)
      seen += daph.num_values
      // assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))

      // construct output values: skip nulls and construct lists
      values = dereferenceDictionary(dictionary, dataPage)
      values = convert(values, element, utf8)
      if (repetitionLevels.length || definitionLevels?.length) {
        // Use repetition levels to construct lists
        const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
        const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
        const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
        values = assembleLists(
          definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel
        )
      } else {
        // wrap nested flat data by depth
        for (let i = 2; i < schemaPath.length; i++) {
          if (schemaPath[i].element.repetition_type !== 'REQUIRED') {
            values = [values]
          }
        }
      }
      // assert(BigInt(values.length) === rowGroup.num_rows)
      concat(rowData, values)
    } else if (header.type === 'DATA_PAGE_V2') {
      const daph2 = header.data_page_header_v2
      if (!daph2) throw new Error('parquet data page header v2 is undefined')

      const { definitionLevels, repetitionLevels, dataPage } = readDataPageV2(
        compressedBytes, header, schemaPath, columnMetadata, compressors
      )
      seen += daph2.num_values

      values = dereferenceDictionary(dictionary, dataPage)
      values = convert(values, element, utf8)
      if (repetitionLevels.length || definitionLevels?.length) {
        // Use repetition levels to construct lists
        const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
        const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
        const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
        values = assembleLists(
          definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel
        )
      }
      concat(rowData, values)
    } else if (header.type === 'DICTIONARY_PAGE') {
      const diph = header.dictionary_page_header
      if (!diph) throw new Error('parquet dictionary page header is undefined')
      const page = decompressPage(
        compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec, compressors
      )
      dictionary = readDictionaryPage(page, diph, columnMetadata, element.type_length)
    } else {
      throw new Error(`parquet unsupported page type: ${header.type}`)
    }
    reader.offset += header.compressed_page_size
  }
  if (rowData.length !== Number(rowGroup.num_rows)) {
    throw new Error(`parquet row data length ${rowData.length} does not match row group length ${rowGroup.num_rows}`)
  }
  return rowData
}
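
/*
 * Example (illustrative sketch, not part of this module): reading one column
 * chunk from the first row group. `arrayBuffer` holds the whole parquet file,
 * `metadata` is its parsed footer, and `schemaPath` is the SchemaTree path for
 * this column (e.g. built via getSchemaPath in schema.js). All are assumed
 * inputs shown only to clarify the expected reader shape.
 *
 *   const rowGroup = metadata.row_groups[0]
 *   const columnMetadata = rowGroup.columns[0].meta_data
 *   const reader = { view: new DataView(arrayBuffer), offset: getColumnOffset(columnMetadata) }
 *   const values = readColumn(reader, rowGroup, columnMetadata, schemaPath, {})
 */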

/**
 * Find the start byte offset for a column chunk.
 *
 * @param {ColumnMetaData} columnMetadata
 * @returns {number} byte offset
 */
export function getColumnOffset({ dictionary_page_offset, data_page_offset }) {
  let columnOffset = dictionary_page_offset
  // if there is no dictionary page, or the data page somehow precedes it, start at the data page
  if (!dictionary_page_offset || data_page_offset < dictionary_page_offset) {
    columnOffset = data_page_offset
  }
  return Number(columnOffset)
}
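
/*
 * Example (illustrative values): for a dictionary-encoded chunk the dictionary
 * page precedes the first data page, so its offset wins:
 *
 *   getColumnOffset({ dictionary_page_offset: 4n, data_page_offset: 100n }) // 4
 *   getColumnOffset({ data_page_offset: 100n }) // 100 (no dictionary page)
 */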

/**
 * @param {Uint8Array} compressedBytes
 * @param {number} uncompressed_page_size
 * @param {import('./types.js').CompressionCodec} codec
 * @param {import('./types.js').Compressors | undefined} compressors
 * @returns {Uint8Array}
 */
export function decompressPage(compressedBytes, uncompressed_page_size, codec, compressors) {
  /** @type {Uint8Array} */
  let page
  const customDecompressor = compressors?.[codec]
  if (codec === 'UNCOMPRESSED') {
    page = compressedBytes
  } else if (customDecompressor) {
    page = customDecompressor(compressedBytes, uncompressed_page_size)
  } else if (codec === 'SNAPPY') {
    page = new Uint8Array(uncompressed_page_size)
    snappyUncompress(compressedBytes, page)
  } else {
    throw new Error(`parquet unsupported compression codec: ${codec}`)
  }
  if (page?.length !== uncompressed_page_size) {
    throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
  }
  return page
}
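
/*
 * Example (illustrative sketch): wiring a custom codec through the
 * `compressors` option. Only UNCOMPRESSED and SNAPPY are handled natively
 * above; `gunzip` here is an assumed synchronous gzip decompressor with the
 * (input: Uint8Array, outputLength: number) => Uint8Array shape.
 *
 *   const compressors = {
 *     GZIP: (input, outputLength) => gunzip(input, outputLength),
 *   }
 *   const page = decompressPage(compressedBytes, uncompressedPageSize, 'GZIP', compressors)
 */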