hyparquet/src/convert.js

137 lines
3.9 KiB
JavaScript
Raw Normal View History

2024-05-13 01:12:30 +00:00
// Length of one day in milliseconds (24 * 60 * 60 * 1000). Used below to turn
// day counts into the millisecond timestamps that the Date constructor takes.
const dayMillis = 86400000 // 1 day in milliseconds
2024-02-26 20:20:48 +00:00
/**
 * Convert known types from primitive to rich.
 *
 * Dispatches on the schema element's converted type, falling back to its
 * physical type (INT96, BYTE_ARRAY) and logical type (FLOAT16):
 * - DECIMAL: numbers scaled by 10^-scale
 * - INT96 without a converted type, DATE, TIMESTAMP_MILLIS, TIMESTAMP_MICROS: Date objects
 * - JSON: parsed JSON values
 * - UTF8, or BYTE_ARRAY when `utf8` is truthy: strings
 * - FLOAT16: numbers
 * Anything else is returned unchanged (the same array instance).
 *
 * @typedef {import('./types.js').DecodedArray} DecodedArray
 * @param {DecodedArray} data series of primitive types
 * @param {import('./types.js').SchemaElement} schemaElement schema element for the data
 * @param {boolean | undefined} utf8 decode bytes as utf8?
 * @returns {DecodedArray} series of rich types
 * @throws {Error} if the converted type is BSON or INTERVAL (not supported)
 */
export function convert(data, schemaElement, utf8 = true) {
  const ctype = schemaElement.converted_type
  if (ctype === 'DECIMAL') {
    const scale = schemaElement.scale ?? 0
    const factor = Math.pow(10, -scale)
    // The representation is decided from the first element only, so check it
    // once instead of on every iteration.
    const isByteEncoded = data[0] instanceof Uint8Array
    const arr = new Array(data.length)
    for (let i = 0; i < arr.length; i++) {
      const unscaled = isByteEncoded ? parseDecimal(data[i]) : Number(data[i])
      arr[i] = unscaled * factor
    }
    return arr
  }
  if (ctype === undefined && schemaElement.type === 'INT96') {
    // Array.from first: a typed array's own map() could not hold Date objects
    return Array.from(data).map(parseInt96Date)
  }
  if (ctype === 'DATE') {
    // values are day counts; scale to milliseconds for the Date constructor
    const arr = new Array(data.length)
    for (let i = 0; i < arr.length; i++) {
      arr[i] = new Date(data[i] * dayMillis)
    }
    return arr
  }
  if (ctype === 'TIMESTAMP_MILLIS') {
    const arr = new Array(data.length)
    for (let i = 0; i < arr.length; i++) {
      arr[i] = new Date(Number(data[i]))
    }
    return arr
  }
  if (ctype === 'TIMESTAMP_MICROS') {
    // bigint division: values must be bigint; sub-millisecond digits are dropped
    const arr = new Array(data.length)
    for (let i = 0; i < arr.length; i++) {
      arr[i] = new Date(Number(data[i] / 1000n))
    }
    return arr
  }
  if (ctype === 'JSON') {
    const decoder = new TextDecoder()
    return data.map(v => JSON.parse(decoder.decode(v)))
  }
  if (ctype === 'BSON') {
    throw new Error('parquet bson not supported')
  }
  if (ctype === 'INTERVAL') {
    throw new Error('parquet interval not supported')
  }
  if (ctype === 'UTF8' || (utf8 && schemaElement.type === 'BYTE_ARRAY')) {
    const decoder = new TextDecoder()
    const arr = new Array(data.length)
    for (let i = 0; i < arr.length; i++) {
      // falsy entries (e.g. undefined) are passed through undecoded
      arr[i] = data[i] && decoder.decode(data[i])
    }
    return arr
  }
  // TODO: ctype UINT
  const logicalType = schemaElement.logical_type?.type
  if (logicalType === 'FLOAT16') {
    return Array.from(data).map(parseFloat16)
  }
  return data
}
/**
 * Parse the unscaled value of a decimal stored as a big-endian
 * two's-complement byte array.
 *
 * The result is a double, so it is exact only while the magnitude stays
 * within Number.MAX_SAFE_INTEGER; wider values are rounded.
 *
 * @param {Uint8Array} bytes big-endian two's-complement bytes
 * @returns {number} signed integer value (0 for an empty array)
 */
function parseDecimal(bytes) {
  let value = 0
  for (let i = 0; i < bytes.length; i++) {
    const byte = bytes[i]
    // The most significant byte carries the sign, so read it as a signed int8.
    const digit = i === 0 && byte > 0x7f ? byte - 0x100 : byte
    // Multiply instead of `<< 8`: bitwise operators truncate to 32 bits and
    // would corrupt values wider than 4 bytes.
    value = value * 256 + digit
  }
  return value
}
2024-05-13 01:12:30 +00:00
/**
 * Turn a 96-bit integer timestamp into a Date.
 *
 * The bits above the low 64 are read as a Julian day number and the low
 * 64 bits as nanoseconds within that day.
 *
 * @param {bigint} value 96-bit integer timestamp
 * @returns {Date} date with millisecond precision (finer digits are truncated)
 */
function parseInt96Date(value) {
  const julianDay = value >> 64n
  const nanosOfDay = value & 0xffffffffffffffffn
  // 2440588 is the Julian day number of 1970-01-01, the Date epoch
  const daysSinceEpoch = Number(julianDay - 2440588n)
  const millisOfDay = Number(nanosOfDay / 1000000n)
  return new Date(daysSinceEpoch * dayMillis + millisOfDay)
}
2024-05-13 16:22:55 +00:00
/**
 * Decode a 16-bit half-precision float (1 sign bit, 5 exponent bits,
 * 10 fraction bits) stored as two little-endian bytes.
 *
 * @param {Uint8Array | undefined} bytes two bytes, least significant first
 * @returns {number | undefined} decoded value, or undefined if bytes is missing
 */
export function parseFloat16(bytes) {
  if (!bytes) return undefined
  const int16 = bytes[1] << 8 | bytes[0]
  const sign = int16 >> 15 ? -1 : 1
  const exp = int16 >> 10 & 0x1f
  const frac = int16 & 0x3ff
  // exponent 0: no implicit leading 1, fixed scale of 2^-14 (covers +/-0 too)
  if (exp === 0) return sign * Math.pow(2, -14) * (frac / 1024) // subnormals
  // exponent all ones: NaN when any fraction bit is set, otherwise +/-Infinity
  if (exp === 0x1f) return frac ? NaN : sign * Infinity
  // normal numbers: exponent bias is 15 and the leading 1 is implicit
  return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
}
/**
 * Replace every entry of a data page with the dictionary value it indexes.
 *
 * The page is normally overwritten in place and returned. The exception is a
 * Uint8Array page paired with a dictionary that is not a Uint8Array: a new
 * array of the dictionary's type is allocated, filled, and returned, and the
 * page itself is left untouched. With no dictionary the page is returned as is.
 *
 * @param {DecodedArray | undefined} dictionary
 * @param {DecodedArray} dataPage
 * @returns {DecodedArray}
 */
export function dereferenceDictionary(dictionary, dataPage) {
  // nothing to look up against
  if (!dictionary) return dataPage

  let target = dataPage
  if (dataPage instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
    // a byte array cannot hold the dictionary's values; allocate a matching container
    // @ts-expect-error not my fault typescript doesn't understand constructors
    target = new dictionary.constructor(dataPage.length)
  }

  const count = dataPage.length
  let index = 0
  while (index < count) {
    target[index] = dictionary[dataPage[index]]
    index += 1
  }
  return target
}