hyparquet/src/datapageV2.js

import { decompressPage } from './column.js'
import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import { readPlain } from './plain.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { readVarInt, readZigZag } from './thrift.js'
/**
 * Read a data page v2 from the given Uint8Array.
 *
 * @typedef {import("./types.d.ts").DataPage} DataPage
 * @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData
 * @typedef {import("./types.d.ts").Compressors} Compressors
 * @typedef {import("./types.d.ts").DataPageHeaderV2} DataPageHeaderV2
 * @typedef {import("./types.d.ts").SchemaTree} SchemaTree
 * @param {Uint8Array} compressedBytes raw page data; the level sections are stored uncompressed, and the values section is decompressed below when the codec requires it
 * @param {import("./types.d.ts").PageHeader} ph page header
 * @param {SchemaTree[]} schemaPath
 * @param {ColumnMetaData} columnMetadata
 * @param {Compressors | undefined} compressors
 * @returns {DataPage} definition levels, repetition levels, and array of values
 */
export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, compressors) {
const view = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength)
const reader = { view, offset: 0 }
/** @type {any} */
let values = []
const daph2 = ph.data_page_header_v2
if (!daph2) throw new Error('parquet data page header v2 is undefined')
// repetition levels
const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schemaPath)
if (reader.offset !== daph2.repetition_levels_byte_length) {
throw new Error(`parquet repetition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length}`)
}
// definition levels
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
const definitionLevels = readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel)
if (reader.offset !== daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) {
throw new Error(`parquet definition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length}`)
}
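  // data page v2 layout: repetition levels, then definition levels, then values;
  // the level sections are never compressed, so their lengths are excluded here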
const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length
// read values based on encoding
const nValues = daph2.num_values - daph2.num_nulls
if (daph2.encoding === 'PLAIN') {
const { element } = schemaPath[schemaPath.length - 1]
const utf8 = element.converted_type === 'UTF8'
let page = compressedBytes.slice(reader.offset)
if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)
}
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const pageReader = { view: pageView, offset: 0 }
values = readPlain(pageReader, columnMetadata.type, nValues, utf8)
  } else if (daph2.encoding === 'RLE') {
    // values start after the level data; for booleans the bit width is always 1
    const page = decompressPage(compressedBytes.subarray(reader.offset), uncompressedPageSize, columnMetadata.codec, compressors)
    const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
    const bitWidth = 1
    if (daph2.num_nulls) {
      throw new Error('parquet RLE encoding with nulls not supported')
    } else {
      // skip the 4-byte length prefix that precedes the RLE run data
      const pageReader = { view: pageView, offset: 4 }
      values = new Array(nValues)
      readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, values)
    }
  } else if (
    daph2.encoding === 'PLAIN_DICTIONARY' ||
    daph2.encoding === 'RLE_DICTIONARY'
  ) {
    compressedBytes = compressedBytes.subarray(reader.offset)
    const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
    const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
    // the first byte gives the bit width of the dictionary indexes
    const bitWidth = pageView.getUint8(0)
    const pageReader = { view: pageView, offset: 1 }
    values = new Array(nValues)
    readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, values)
  } else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
    if (daph2.num_nulls) throw new Error('parquet DELTA_BINARY_PACKED with nulls not supported')
    const codec = daph2.is_compressed ? columnMetadata.codec : 'UNCOMPRESSED'
    // values start after the level data (reader.offset bytes in)
    const page = decompressPage(compressedBytes.subarray(reader.offset), uncompressedPageSize, codec, compressors)
    deltaBinaryUnpack(page, nValues, values)
} else {
throw new Error(`parquet unsupported encoding: ${daph2.encoding}`)
}
return { definitionLevels, repetitionLevels, value: values }
}
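
// Example usage (a minimal sketch; `pageBytes`, `pageHeader`, `schemaPath`, and
// `columnMetadata` are assumed to come from the surrounding column reader):
//
//   const { definitionLevels, repetitionLevels, value } =
//     readDataPageV2(pageBytes, pageHeader, schemaPath, columnMetadata, undefined)
//   // value holds the decoded values (dictionary indexes for dictionary pages);
//   // the levels describe nesting and nullability for row reassembly.
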
/**
* Read the repetition levels from this page, if any.
*
* @typedef {import("./types.d.ts").DataReader} DataReader
* @param {DataReader} reader data view for the page
 * @param {DataPageHeaderV2} daph2 data page header
 * @param {SchemaTree[]} schemaPath
 * @returns {any[]} repetition levels
 */
export function readRepetitionLevelsV2(reader, daph2, schemaPath) {
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
  if (!maxRepetitionLevel) return []
  const bitWidth = widthFromMaxInt(maxRepetitionLevel)
  // a repetition level is stored for every value, including nulls
  const values = new Array(daph2.num_values)
  readRleBitPackedHybrid(
    reader, bitWidth, daph2.repetition_levels_byte_length, values
  )
  return values
}
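
// For intuition (illustrative example, not from this file): for a list column,
// rows [[1, 2], [3]] decode with repetition levels [0, 1, 0]; a 0 starts a new
// row, and a 1 continues the innermost repeated level of the current row.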
/**
* Read the definition levels from this page, if any.
*
 * @param {DataReader} reader data view for the page
 * @param {DataPageHeaderV2} daph2 data page header v2
 * @param {number} maxDefinitionLevel
 * @returns {number[] | undefined} definition levels, or undefined if the column has none
 */
function readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) {
if (maxDefinitionLevel) {
// not the same as V1, because we know the length
const bitWidth = widthFromMaxInt(maxDefinitionLevel)
const values = new Array(daph2.num_values)
readRleBitPackedHybrid(reader, bitWidth, daph2.definition_levels_byte_length, values)
return values
}
}
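
// For intuition (illustrative example, not from this file): an optional
// top-level column with rows [7, null, 9] has maxDefinitionLevel 1 and yields
// definition levels [1, 0, 1]; a level below the max marks a null at that depth.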
/**
* Unpack the delta binary packed encoding.
*
* @param {Uint8Array} page page data
* @param {number} nValues number of values to read
* @param {any[]} values array to write to
*/
function deltaBinaryUnpack(page, nValues, values) {
  if (!nValues) return
  const dataView = new DataView(page.buffer, page.byteOffset, page.byteLength)
  // header: <block size> <miniblocks per block> <total value count> <first value>
  const [blockSize, index1] = readVarInt(dataView, 0)
  const [miniblockPerBlock, index2] = readVarInt(dataView, index1)
  const [count, index3] = readVarInt(dataView, index2) // total count, should equal nValues
  let [value, offset] = readZigZag(dataView, index3)
  const valuesPerMiniblock = blockSize / miniblockPerBlock

  // the first value is stored directly in the header
  let valueIndex = 0
  values[valueIndex++] = value

  while (valueIndex < nValues) {
    // each block: <min delta> <miniblock bit widths> <bit-packed miniblock data>
    const [minDelta, index4] = readZigZag(dataView, offset)
    offset = index4
    const bitWidths = new Uint8Array(miniblockPerBlock)
    for (let i = 0; i < miniblockPerBlock; i++, offset++) {
      bitWidths[i] = page[offset]
    }
    for (let i = 0; i < miniblockPerBlock && valueIndex < nValues; i++) {
      const bitWidth = bitWidths[i]
      if (bitWidth) {
        // deltas are bit-packed least-significant-bit first;
        // this 32-bit accumulator only works for bitWidth <= 24
        const mask = (1 << bitWidth) - 1
        let data = 0
        let bits = 0
        for (let j = 0; j < valuesPerMiniblock; j++) {
          while (bits < bitWidth) {
            data |= dataView.getUint8(offset++) << bits
            bits += 8
          }
          const delta = data & mask
          data >>>= bitWidth
          bits -= bitWidth
          // the last miniblock may be padded past nValues; drop the padding
          if (valueIndex < nValues) {
            value += minDelta + delta
            values[valueIndex++] = value
          }
        }
      } else {
        // bit width 0 means every delta in the miniblock is exactly minDelta
        for (let j = 0; j < valuesPerMiniblock && valueIndex < nValues; j++) {
          value += minDelta
          values[valueIndex++] = value
        }
      }
    }
  }
}
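
// For reference, a hand-worked decode of this format (illustrative bytes only):
// the sequence [7, 5, 3] can be encoded as header { blockSize: 128,
// miniblocksPerBlock: 4, count: 3, firstValue: zigzag(7) }, followed by one
// block with minDelta = zigzag(-2) and all four miniblock bit widths 0.
// Decoding emits 7 from the header, then 7 + (-2) = 5, then 5 + (-2) = 3,
// consuming no packed delta bytes because every bit width is zero.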