2024-02-11 22:33:56 +00:00
|
|
|
import { Encoding, ParquetType } from './constants.js'
|
2024-01-05 10:16:15 +00:00
|
|
|
import { readVarInt } from './thrift.js'
|
|
|
|
|
|
|
|
|
|
/**
|
2024-01-05 11:06:27 +00:00
|
|
|
* Return type with bytes read.
|
|
|
|
|
* This is useful to advance an offset through a buffer.
|
|
|
|
|
*
|
2024-01-05 10:16:15 +00:00
|
|
|
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
|
|
|
|
|
* @template T
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/**
 * Decode `count` booleans stored one bit per value.
 *
 * Bits are consumed least-significant first within each byte, so value `i`
 * lives in bit `i % 8` of byte `offset + floor(i / 8)`.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} count - number of values to read
 * @returns {Decoded<boolean[]>} decoded booleans and the number of bytes they occupy
 */
function readPlainBoolean(dataView, offset, count) {
  const value = []
  for (let index = 0; index < count; index++) {
    // index is always a small non-negative integer, so >> 3 is floor(index / 8)
    const packed = dataView.getUint8(offset + (index >> 3))
    const bit = (packed >> (index & 7)) & 1
    value.push(bit === 1)
  }
  // a partially used trailing byte still counts as consumed
  const byteLength = Math.ceil(count / 8)
  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decode `count` consecutive little-endian signed 32-bit integers.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} count - number of values to read
 * @returns {Decoded<number[]>} decoded int32 values and the number of bytes they occupy
 */
function readPlainInt32(dataView, offset, count) {
  const byteLength = count * 4
  const value = []
  // step through the run one 4-byte slot at a time
  for (let pos = 0; pos < byteLength; pos += 4) {
    value.push(dataView.getInt32(offset + pos, true))
  }
  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decode `count` consecutive little-endian signed 64-bit integers as bigints.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} count - number of values to read
 * @returns {Decoded<bigint[]>} decoded int64 values and the number of bytes they occupy
 */
function readPlainInt64(dataView, offset, count) {
  const byteLength = count * 8
  const value = []
  // each value occupies one 8-byte slot
  for (let pos = 0; pos < byteLength; pos += 8) {
    value.push(dataView.getBigInt64(offset + pos, true))
  }
  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read `count` int96 values.
 *
 * Each value is 12 little-endian bytes: an unsigned 64-bit low word followed
 * by a signed 32-bit high word. The result is the signed 96-bit integer
 * `high * 2^64 + low`.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} count - number of values to read
 * @returns {Decoded<bigint[]>} array of int96 values
 */
function readPlainInt96(dataView, offset, count) {
  const value = []
  for (let i = 0; i < count; i++) {
    const base = offset + i * 12
    // The low word must be read unsigned: a negative bigint sign-extends
    // indefinitely and would wipe out the high word when OR-ed in.
    const low = dataView.getBigUint64(base, true)
    // The high word carries the sign of the whole 96-bit value.
    const high = dataView.getInt32(base + 8, true)
    // Shift past all 64 low bits so the two words do not overlap.
    value.push((BigInt(high) << BigInt(64)) | low)
  }
  return { value, byteLength: count * 12 }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decode `count` consecutive little-endian 32-bit floats.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} count - number of values to read
 * @returns {Decoded<number[]>} decoded float values and the number of bytes they occupy
 */
function readPlainFloat(dataView, offset, count) {
  const byteLength = count * 4
  const value = []
  // each value occupies one 4-byte slot
  for (let pos = 0; pos < byteLength; pos += 4) {
    value.push(dataView.getFloat32(offset + pos, true))
  }
  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decode `count` consecutive little-endian 64-bit floats.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} count - number of values to read
 * @returns {Decoded<number[]>} decoded double values and the number of bytes they occupy
 */
function readPlainDouble(dataView, offset, count) {
  const byteLength = count * 8
  const value = []
  // each value occupies one 8-byte slot
  for (let pos = 0; pos < byteLength; pos += 8) {
    value.push(dataView.getFloat64(offset + pos, true))
  }
  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decode `count` length-prefixed byte arrays.
 *
 * Each entry is a little-endian int32 length followed by that many bytes.
 * The returned arrays are views onto the DataView's underlying buffer; no
 * bytes are copied.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} count - number of values to read
 * @returns {Decoded<Uint8Array[]>} byte array views and the total bytes consumed
 */
function readPlainByteArray(dataView, offset, count) {
  const value = []
  let cursor = offset // position of the next length prefix, relative to the view
  for (let i = 0; i < count; i++) {
    const length = dataView.getInt32(cursor, true)
    const start = cursor + 4 // payload begins right after the 4-byte prefix
    // translate the view-relative position into a buffer-relative one
    value.push(new Uint8Array(dataView.buffer, dataView.byteOffset + start, length))
    cursor = start + length
  }
  return { value, byteLength: cursor - offset }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read a single fixed length byte array.
 *
 * The result is a view onto the DataView's underlying buffer; no bytes are
 * copied.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} fixedLength - number of bytes in the array
 * @returns {Decoded<Uint8Array>} byte array view of `fixedLength` bytes
 */
function readPlainByteArrayFixed(dataView, offset, fixedLength) {
  // translate the view-relative offset into a buffer-relative one
  const start = dataView.byteOffset + offset
  const value = new Uint8Array(dataView.buffer, start, fixedLength)
  return { value, byteLength: fixedLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decode `count` PLAIN-encoded values of the given parquet type.
 *
 * Dispatches to the type-specific reader. Returns an empty result without
 * inspecting `type` when `count` is zero.
 *
 * @typedef {import("./types.d.ts").DecodedArray} DecodedArray
 * @param {DataView} dataView - buffer to read data from
 * @param {number} type - parquet type of the data
 * @param {number} count - number of values to read
 * @param {number} offset - offset to start reading from the DataView
 * @param {boolean} utf8 - whether to decode byte arrays as UTF-8
 * @returns {Decoded<DecodedArray>} array of values
 * @throws {Error} if `type` is not a handled parquet type
 */
export function readPlain(dataView, type, count, offset, utf8) {
  if (count === 0) return { value: [], byteLength: 0 }

  switch (type) {
    case ParquetType.BOOLEAN:
      return readPlainBoolean(dataView, offset, count)
    case ParquetType.INT32:
      return readPlainInt32(dataView, offset, count)
    case ParquetType.INT64:
      return readPlainInt64(dataView, offset, count)
    case ParquetType.INT96:
      return readPlainInt96(dataView, offset, count)
    case ParquetType.FLOAT:
      return readPlainFloat(dataView, offset, count)
    case ParquetType.DOUBLE:
      return readPlainDouble(dataView, offset, count)
    case ParquetType.BYTE_ARRAY: {
      const byteArray = readPlainByteArray(dataView, offset, count)
      if (!utf8) return byteArray
      // one decoder instance is shared across every value in the page
      const decoder = new TextDecoder()
      return {
        value: byteArray.value.map(bytes => decoder.decode(bytes)),
        byteLength: byteArray.byteLength,
      }
    }
    case ParquetType.FIXED_LEN_BYTE_ARRAY:
      // NOTE(review): `count` is forwarded as the fixed byte length here —
      // confirm callers pass the type length in this position.
      return readPlainByteArrayFixed(dataView, offset, count)
    default:
      throw new Error(`parquet unhandled type: ${type}`)
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Compute the bit width for a given maximum value.
 *
 * Evaluates `ceil(log2(value + 1))`, so 0 maps to 0 bits, 1 to 1 bit,
 * 2 and 3 to 2 bits, and so on.
 *
 * @param {number} value - value to convert to bitwidth
 * @returns {number} bit width of the value
 */
export function widthFromMaxInt(value) {
  const exponent = Math.log2(value + 1)
  return Math.ceil(exponent)
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read data from the file-object using the given encoding.
 * The data could be definition levels, repetition levels, or actual values.
 *
 * Only the RLE encoding is handled. Length-prefixed rle/bit-packed hybrid
 * blocks are read back to back until at least `count` values have been
 * collected or a block yields no values.
 *
 * @typedef {import("./types.d.ts").Encoding} Encoding
 * @param {DataView} dataView - buffer to read data from
 * @param {Encoding} encoding - encoding type
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} count - number of values to read
 * @param {number} bitWidth - width of each bit-packed group
 * @returns {Decoded<any>} array of values
 * @throws {Error} if the encoding is not RLE
 */
export function readData(dataView, encoding, offset, count, bitWidth) {
  if (encoding !== Encoding.RLE) {
    throw new Error(`parquet encoding not supported ${encoding}`)
  }

  const value = []
  let byteLength = 0
  let seen = 0
  while (seen < count) {
    // NOTE(review): each block is asked for up to `count` values rather than
    // `count - seen`, so the result can hold more than `count` entries.
    const block = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)
    const produced = block.value.length
    if (produced === 0) break // EOF
    value.push(...block.value)
    seen += produced
    byteLength += block.byteLength
  }
  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read values from a run-length encoded/bit-packed hybrid encoding.
 *
 * If length is zero, then read as int32 at the start of the encoded data.
 * Runs are decoded until either the encoded length is exhausted or at least
 * `numValues` values have been collected.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} width - width of each bit-packed group
 * @param {number} length - length of the encoded data
 * @param {number} numValues - number of values to read
 * @returns {Decoded<number[]>} array of rle/bit-packed values
 * @throws {Error} if the length prefix read from the data is negative
 */
export function readRleBitPackedHybrid(dataView, offset, width, length, numValues) {
  let encodedLength = length
  let byteLength = 0
  if (!encodedLength) {
    // no length supplied: the data starts with a 4-byte length prefix
    encodedLength = dataView.getInt32(offset, true)
    if (encodedLength < 0) throw new Error(`parquet invalid rle/bitpack length ${encodedLength}`)
    byteLength = 4
  }

  // the encoded runs end this many bytes past `offset` (prefix included)
  const runsEnd = byteLength + encodedLength
  const value = []
  while (byteLength < runsEnd && value.length < numValues) {
    const [header, afterHeader] = readVarInt(dataView, offset + byteLength)
    byteLength = afterHeader - offset
    // the lowest header bit selects the run kind: 0 = rle, 1 = bit-packed
    const isBitPacked = (header & 1) !== 0
    const run = isBitPacked
      ? readBitPacked(dataView, offset + byteLength, header, width, numValues - value.length)
      : readRle(dataView, offset + byteLength, header, width)
    value.push(...run.value)
    byteLength += run.byteLength
  }

  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read a run-length encoded value.
 *
 * The count is determined from the header and the width is used to grab the
 * value that's repeated. Yields the value repeated count times.
 *
 * The repeated value is stored little-endian in `ceil(bitWidth / 8)` bytes.
 * Byte widths 0 through 4 are supported; a zero width consumes no bytes and
 * repeats the value 0.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} header - header information
 * @param {number} bitWidth - width of each bit-packed group
 * @returns {Decoded<number[]>} array of rle values
 * @throws {Error} if the value would need more than 4 bytes
 */
function readRle(dataView, offset, header, bitWidth) {
  // the run length is the header without its low (run kind) bit
  const count = header >>> 1
  // round the bit width up to whole bytes
  const byteLength = (bitWidth + 7) >> 3
  if (byteLength < 0 || byteLength > 4) {
    throw new Error(`parquet invalid rle width ${byteLength}`)
  }

  // Assemble the little-endian value byte by byte. Multiplication is used
  // instead of `<<` so that a 4-byte value stays unsigned.
  let readValue = 0
  for (let i = 0; i < byteLength; i++) {
    readValue += dataView.getUint8(offset + i) * 2 ** (8 * i)
  }

  // repeat value count times
  const value = new Array(count).fill(readValue)
  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read a bit-packed run of the rle/bitpack hybrid.
 * Supports width > 8 (crossing bytes).
 *
 * The header encodes the number of 8-value groups in the run. Every value in
 * the run is stepped over so the byte count covers the whole run, but at most
 * `remaining` values are emitted.
 *
 * @param {DataView} dataView - buffer to read data from
 * @param {number} offset - offset to start reading from the DataView
 * @param {number} header - header information
 * @param {number} bitWidth - width of each bit-packed group
 * @param {number} remaining - number of values remaining to be read
 * @returns {Decoded<number[]>} array of bit-packed values
 * @throws {Error} if the run starts past the end of the view and the mask is non-zero
 */
function readBitPacked(dataView, offset, header, bitWidth, remaining) {
  const mask = maskForBits(bitWidth)
  // header (minus its low bit) counts groups of 8 values
  let pending = (header >> 1) << 3

  // Sometimes it tries to read outside of available memory, but it will be masked out anyway
  let bits = 0
  if (offset < dataView.byteLength) {
    bits = dataView.getUint8(offset)
  } else if (mask) {
    throw new Error(`parquet bitpack offset ${offset} out of range`)
  }

  let byteLength = 1 // the first byte is always counted as consumed
  let loaded = 8 // number of bits currently held in `bits`
  let used = 0 // number of low bits of `bits` already consumed
  let budget = remaining // values still allowed to be emitted
  /** @type {number[]} */
  const value = []

  while (pending) {
    if (used > 8) {
      // more than a whole byte has been consumed: drop it from the window
      bits >>= 8
      loaded -= 8
      used -= 8
      continue
    }
    if (loaded - used < bitWidth) {
      // not enough unread bits for one value: append the next byte on top
      bits |= dataView.getUint8(offset + byteLength) << loaded
      byteLength++
      loaded += 8
      continue
    }
    // a full value is available; emit it only while the caller still wants
    // values, but always advance so the whole run is stepped over
    if (budget > 0) {
      value.push((bits >> used) & mask)
      budget--
    }
    pending--
    used += bitWidth
  }

  // return values and number of bytes read
  return { value, byteLength }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build a mask with the lowest `bits` bits set.
 *
 * @param {number} bits - number of bits for the mask
 * @returns {number} a mask for the given number of bits
 */
function maskForBits(bits) {
  const topBit = 1 << bits
  return topBit - 1
}
|