hyparquet/src/encoding.js

356 lines
10 KiB
JavaScript
Raw Normal View History

2024-01-05 10:16:15 +00:00
import { readVarInt } from './thrift.js'
2024-04-07 16:33:57 +00:00
import { concat } from './utils.js'
2024-01-05 10:16:15 +00:00
/**
 * Read `count` boolean values.
 *
 * Booleans are bit-packed eight to a byte, least-significant bit first.
 * Advances `reader.offset` by `ceil(count / 8)` bytes.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {boolean[]} array of boolean values
 */
function readPlainBoolean(reader, count) {
  const values = []
  for (let i = 0; i < count; i++) {
    const byteOffset = reader.offset + Math.floor(i / 8)
    const bitOffset = i % 8
    const byte = reader.view.getUint8(byteOffset)
    values.push((byte & (1 << bitOffset)) !== 0)
  }
  // a partially used trailing byte is consumed whole
  reader.offset += Math.ceil(count / 8)
  return values
}
/**
 * Read `count` int32 values.
 *
 * Values are read as consecutive 4-byte little-endian signed integers.
 * Advances `reader.offset` by `count * 4` bytes.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {number[]} array of int32 values
 */
function readPlainInt32(reader, count) {
  const values = []
  for (let i = 0; i < count; i++) {
    values.push(reader.view.getInt32(reader.offset + i * 4, true))
  }
  reader.offset += count * 4
  return values
}
/**
 * Read `count` int64 values.
 *
 * Values are read as consecutive 8-byte little-endian signed integers and
 * returned as bigints. Advances `reader.offset` by `count * 8` bytes.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {bigint[]} array of int64 values
 */
function readPlainInt64(reader, count) {
  const values = []
  for (let i = 0; i < count; i++) {
    values.push(reader.view.getBigInt64(reader.offset + i * 8, true))
  }
  reader.offset += count * 8
  return values
}
/**
 * Read `count` int96 values.
 *
 * Each value occupies 12 little-endian bytes: an 8-byte low word followed by
 * a 4-byte high word. The result is the signed 96-bit integer
 * `(high << 64) | low`. Advances `reader.offset` by `count * 12` bytes.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {bigint[]} array of int96 values
 */
function readPlainInt96(reader, count) {
  const values = []
  for (let i = 0; i < count; i++) {
    const base = reader.offset + i * 12
    // low word is read unsigned so its top bit cannot sign-extend over the high word
    const low = reader.view.getBigUint64(base, true)
    // high word carries the sign of the whole 96-bit value
    const high = reader.view.getInt32(base + 8, true)
    // the low word is 64 bits wide, so the high word starts at bit 64
    values.push((BigInt(high) << 64n) | low)
  }
  reader.offset += count * 12
  return values
}
/**
 * Read `count` float values.
 *
 * Values are read as consecutive 4-byte little-endian IEEE-754 floats.
 * Advances `reader.offset` by `count * 4` bytes.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {number[]} array of float values
 */
function readPlainFloat(reader, count) {
  const values = []
  for (let i = 0; i < count; i++) {
    values.push(reader.view.getFloat32(reader.offset + i * 4, true))
  }
  reader.offset += count * 4
  return values
}
/**
 * Read `count` double values.
 *
 * Values are read as consecutive 8-byte little-endian IEEE-754 doubles.
 * Advances `reader.offset` by `count * 8` bytes.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {number[]} array of double values
 */
function readPlainDouble(reader, count) {
  const values = []
  for (let i = 0; i < count; i++) {
    values.push(reader.view.getFloat64(reader.offset + i * 8, true))
  }
  reader.offset += count * 8
  return values
}
/**
 * Read `count` byte array values.
 *
 * Each value is a 4-byte little-endian length followed by that many bytes.
 * The returned arrays are views onto the reader's underlying buffer, not
 * copies. Advances `reader.offset` past every value read.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {Uint8Array[]} array of byte arrays
 */
function readPlainByteArray(reader, count) {
  const values = []
  for (let i = 0; i < count; i++) {
    const length = reader.view.getInt32(reader.offset, true)
    reader.offset += 4
    // reader.offset is relative to the view, so add the view's own byteOffset
    const start = reader.view.byteOffset + reader.offset
    values.push(new Uint8Array(reader.view.buffer, start, length))
    reader.offset += length
  }
  return values
}
/**
 * Read a fixed length byte array.
 *
 * Returns a view (not a copy) of the next `fixedLength` bytes of the reader's
 * underlying buffer and advances `reader.offset` by `fixedLength`.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} fixedLength - number of bytes to read
 * @returns {Uint8Array} the bytes read
 */
function readPlainByteArrayFixed(reader, fixedLength) {
  // reader.offset is relative to the view, so add the view's own byteOffset
  const start = reader.view.byteOffset + reader.offset
  reader.offset += fixedLength
  return new Uint8Array(reader.view.buffer, start, fixedLength)
}
/**
 * Read `count` values of the given type from the reader.view.
 *
 * Dispatches to the plain reader for the given parquet type. A `count` of
 * zero returns an empty array without touching the reader.
 *
 * @typedef {import("./types.d.ts").DecodedArray} DecodedArray
 * @typedef {import("./types.d.ts").ParquetType} ParquetType
 * @param {DataReader} reader - buffer to read data from
 * @param {ParquetType} type - parquet type of the data
 * @param {number} count - number of values to read
 * @param {boolean} utf8 - whether to decode byte arrays as UTF-8
 * @returns {DecodedArray} array of values
 * @throws {Error} if `type` is not a handled parquet type
 */
export function readPlain(reader, type, count, utf8) {
  if (count === 0) return []
  switch (type) {
    case 'BOOLEAN':
      return readPlainBoolean(reader, count)
    case 'INT32':
      return readPlainInt32(reader, count)
    case 'INT64':
      return readPlainInt64(reader, count)
    case 'INT96':
      return readPlainInt96(reader, count)
    case 'FLOAT':
      return readPlainFloat(reader, count)
    case 'DOUBLE':
      return readPlainDouble(reader, count)
    case 'BYTE_ARRAY': {
      const byteArrays = readPlainByteArray(reader, count)
      if (!utf8) return byteArrays
      // one decoder is shared across all values
      const decoder = new TextDecoder()
      return byteArrays.map(bytes => decoder.decode(bytes))
    }
    case 'FIXED_LEN_BYTE_ARRAY':
      // NOTE: `count` is forwarded as the byte length here, so this returns a
      // single Uint8Array of `count` bytes rather than `count` values.
      return readPlainByteArrayFixed(reader, count)
    default:
      throw new Error(`parquet unhandled type: ${type}`)
  }
}
/**
 * Compute how many bits are needed to represent integers up to `value`.
 *
 * @param {number} value - largest integer that must be representable
 * @returns {number} number of bits required
 */
export function widthFromMaxInt(value) {
  const distinctValues = value + 1
  const exactBits = Math.log2(distinctValues)
  return Math.ceil(exactBits)
}
/**
 * Read data from the file-object using the given encoding.
 * The data could be definition levels, repetition levels, or actual values.
 *
 * Only the 'RLE' encoding is handled. Length-prefixed rle/bit-packed hybrid
 * sections are read repeatedly until `count` values have been collected or a
 * section yields no values.
 *
 * @typedef {import("./types.d.ts").Encoding} Encoding
 * @param {DataReader} reader - buffer to read data from
 * @param {Encoding} encoding - encoding type
 * @param {number} count - number of values to read
 * @param {number} bitWidth - width of each bit-packed group
 * @returns {any[]} array of values
 * @throws {Error} if `encoding` is not 'RLE'
 */
export function readData(reader, encoding, count, bitWidth) {
  if (encoding !== 'RLE') {
    throw new Error(`parquet encoding not supported ${encoding}`)
  }
  /** @type {any[]} */
  const value = []
  let seen = 0
  while (seen < count) {
    // length 0 makes the callee read its own int32 length prefix;
    // only ask for the values still missing so a later pass cannot over-read
    const rle = readRleBitPackedHybrid(reader, bitWidth, 0, count - seen)
    if (!rle.length) break // EOF
    concat(value, rle)
    seen += rle.length
  }
  return value
}
/**
 * Read values from a run-length encoded/bit-packed hybrid encoding.
 *
 * If length is zero, then read as int32 at the start of the encoded data.
 *
 * Runs are read until either `length` bytes have been consumed or at least
 * `numValues` values have been collected. Each run starts with a varint
 * header whose lowest bit selects rle (0) or bit-packed (1). Rle runs are
 * appended whole, so the result may hold more than `numValues` entries.
 *
 * @typedef {import("./types.d.ts").DataReader} DataReader
 * @param {DataReader} reader - buffer to read data from
 * @param {number} width - width of each bit-packed group
 * @param {number} length - length of the encoded data
 * @param {number} numValues - number of values to read
 * @returns {number[]} array of rle/bit-packed values
 * @throws {Error} if the length prefix read from the buffer is negative
 */
export function readRleBitPackedHybrid(reader, width, length, numValues) {
  let byteLength = length
  if (!byteLength) {
    // no explicit length: a little-endian int32 prefix gives the encoded size
    byteLength = reader.view.getInt32(reader.offset, true)
    reader.offset += 4
    if (byteLength < 0) throw new Error(`parquet invalid rle/bitpack length ${byteLength}`)
  }
  /** @type {number[]} */
  const value = []
  const startOffset = reader.offset
  while (reader.offset - startOffset < byteLength && value.length < numValues) {
    const [header, newOffset] = readVarInt(reader.view, reader.offset)
    reader.offset = newOffset
    if ((header & 1) === 0) {
      // rle
      concat(value, readRle(reader, header, width))
    } else {
      // bit-packed
      concat(value, readBitPacked(reader, header, width, numValues - value.length))
    }
  }
  return value
}
/**
 * Read a run-length encoded value.
 *
 * The count is determined from the header and the width is used to grab the
 * value that's repeated. Returns the value repeated count times.
 *
 * The repeated value is stored little-endian in `ceil(bitWidth / 8)` bytes,
 * so byte widths 0 through 4 are accepted. A byte width of 0 consumes no
 * bytes and repeats the value 0.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} header - header information
 * @param {number} bitWidth - width of each bit-packed group
 * @returns {number[]} array of rle values
 * @throws {Error} if the byte width is outside 0..4
 */
function readRle(reader, header, bitWidth) {
  // the run length lives in the header bits above the rle/bit-packed flag
  const count = header >>> 1
  // round the bit width up to whole bytes
  const width = (bitWidth + 7) >> 3
  let readValue
  switch (width) {
    case 0:
      readValue = 0
      break
    case 1:
      readValue = reader.view.getUint8(reader.offset)
      reader.offset++
      break
    case 2:
      readValue = reader.view.getUint16(reader.offset, true)
      reader.offset += 2
      break
    case 3:
      // DataView has no 24-bit accessor: combine a uint16 with the third byte
      readValue = reader.view.getUint16(reader.offset, true) |
        (reader.view.getUint8(reader.offset + 2) << 16)
      reader.offset += 3
      break
    case 4:
      readValue = reader.view.getUint32(reader.offset, true)
      reader.offset += 4
      break
    default:
      throw new Error(`parquet invalid rle width ${width}`)
  }
  // repeat value count times
  return new Array(count).fill(readValue)
}
/**
 * Read a bit-packed run of the rle/bitpack hybrid.
 * Supports width > 8 (crossing bytes).
 *
 * The header encodes the number of 8-value groups in the run. Every value in
 * the run is consumed from the buffer, but only the first `remaining` of them
 * are returned.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} header - header information
 * @param {number} bitWidth - width of each bit-packed group
 * @param {number} remaining - number of values remaining to be read
 * @returns {number[]} array of bit-packed values
 * @throws {Error} if the run starts past the end of the buffer and bitWidth is non-zero
 */
function readBitPacked(reader, header, bitWidth, remaining) {
  // extract number of values to read from header
  let count = (header >> 1) << 3
  const mask = maskForBits(bitWidth)

  // Sometimes it tries to read outside of available memory, but it will be masked out anyway
  let data = 0
  if (reader.offset < reader.view.byteLength) {
    data = reader.view.getUint8(reader.offset)
    reader.offset++
  } else if (mask) {
    throw new Error(`parquet bitpack offset ${reader.offset} out of range`)
  }

  // `data` holds buffered bits: [right, left) are the bits not yet emitted
  let left = 8
  let right = 0
  /** @type {number[]} */
  const value = []

  // read values
  while (count) {
    if (right > 8) {
      // if we have crossed a byte boundary, shift the data
      right -= 8
      left -= 8
      data >>= 8
    } else if (left - right < bitWidth) {
      // if we don't have bitWidth number of bits to read, read next byte
      data |= reader.view.getUint8(reader.offset) << left
      reader.offset++
      left += 8
    } else {
      if (remaining > 0) {
        // emit value by shifting off to the right and masking
        value.push((data >> right) & mask)
        remaining--
      }
      // values beyond `remaining` are still consumed, just not emitted
      count--
      right += bitWidth
    }
  }

  return value
}
/**
 * Build a bitmask whose lowest `bits` bits are set.
 *
 * The shift is a 32-bit integer operation, so the result is only a usable
 * mask for 0 <= bits <= 30.
 *
 * @param {number} bits - how many low bits the mask should cover
 * @returns {number} mask with the lowest `bits` bits set
 */
function maskForBits(bits) {
  const nextPowerOfTwo = 1 << bits
  return nextPowerOfTwo - 1
}