mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-06 06:51:54 +00:00
Fix UTF8 decoding
This commit is contained in:
parent
d02c68e883
commit
e2b85304b3
@ -1,6 +1,12 @@
|
||||
import { Encoding, ParquetType } from './constants.js'
|
||||
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
|
||||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js'
|
||||
import {
|
||||
getMaxDefinitionLevel,
|
||||
getMaxRepetitionLevel,
|
||||
isRequired,
|
||||
schemaElement,
|
||||
skipDefinitionBytes,
|
||||
} from './schema.js'
|
||||
|
||||
const skipNulls = false // TODO
|
||||
|
||||
@ -54,7 +60,9 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
|
||||
// read values based on encoding
|
||||
const nval = daph.num_values - numNulls
|
||||
if (daph.encoding === Encoding.PLAIN) {
|
||||
const plainObj = readPlain(dataView, columnMetadata.type, nval, offset)
|
||||
const se = schemaElement(schema, columnMetadata.path_in_schema)
|
||||
const utf8 = se.converted_type === 'UTF8'
|
||||
const plainObj = readPlain(dataView, columnMetadata.type, nval, offset, utf8)
|
||||
values = plainObj.value
|
||||
offset += plainObj.byteLength
|
||||
} else if (
|
||||
@ -100,7 +108,7 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
|
||||
export function readDictionaryPage(bytes, diph, schema, columnMetadata) {
|
||||
const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
|
||||
// read values based on encoding
|
||||
const { value } = readPlain(dataView, columnMetadata.type, diph.num_values)
|
||||
const { value } = readPlain(dataView, columnMetadata.type, diph.num_values, 0, false)
|
||||
return value
|
||||
}
|
||||
|
||||
|
||||
@ -153,9 +153,10 @@ function readPlainByteArrayFixed(dataView, offset, fixedLength) {
|
||||
* @param {number} type - parquet type of the data
|
||||
* @param {number} count - number of values to read
|
||||
* @param {number} offset - offset to start reading from the DataView
|
||||
* @param {boolean} utf8 - whether to decode byte arrays as UTF-8
|
||||
* @returns {Decoded<ArrayLike<any>>} array of values
|
||||
*/
|
||||
export function readPlain(dataView, type, count, offset = 0) {
|
||||
export function readPlain(dataView, type, count, offset, utf8) {
|
||||
if (count === 0) return { value: [], byteLength: 0 }
|
||||
if (type === ParquetType.BOOLEAN) {
|
||||
return readPlainBoolean(dataView, offset, count)
|
||||
@ -170,7 +171,15 @@ export function readPlain(dataView, type, count, offset = 0) {
|
||||
} else if (type === ParquetType.DOUBLE) {
|
||||
return readPlainDouble(dataView, offset, count)
|
||||
} else if (type === ParquetType.BYTE_ARRAY) {
|
||||
return readPlainByteArray(dataView, offset, count)
|
||||
const byteArray = readPlainByteArray(dataView, offset, count)
|
||||
if (utf8) {
|
||||
const decoder = new TextDecoder()
|
||||
return {
|
||||
value: byteArray.value.map(bytes => decoder.decode(bytes)),
|
||||
byteLength: byteArray.byteLength,
|
||||
}
|
||||
}
|
||||
return byteArray
|
||||
} else if (type === ParquetType.FIXED_LEN_BYTE_ARRAY) {
|
||||
return readPlainByteArrayFixed(dataView, offset, count)
|
||||
} else {
|
||||
|
||||
@ -7,21 +7,21 @@ describe('readPlain', () => {
|
||||
it('reads BOOLEAN values correctly', () => {
|
||||
const dataView = new DataView(new ArrayBuffer(1))
|
||||
dataView.setUint8(0, 0b00000001) // Set the first bit to 1
|
||||
const result = readPlain(dataView, ParquetType.BOOLEAN, 1, 0)
|
||||
const result = readPlain(dataView, ParquetType.BOOLEAN, 1, 0, false)
|
||||
expect(result).toEqual({ value: [true], byteLength: 1 })
|
||||
})
|
||||
|
||||
it('reads INT32 values correctly', () => {
|
||||
const dataView = new DataView(new ArrayBuffer(4))
|
||||
dataView.setInt32(0, 123456789, true) // little-endian
|
||||
const result = readPlain(dataView, ParquetType.INT32, 1, 0)
|
||||
const result = readPlain(dataView, ParquetType.INT32, 1, 0, false)
|
||||
expect(result).toEqual({ value: [123456789], byteLength: 4 })
|
||||
})
|
||||
|
||||
it('reads INT64 values correctly', () => {
|
||||
const dataView = new DataView(new ArrayBuffer(8))
|
||||
dataView.setBigInt64(0, BigInt('1234567890123456789'), true)
|
||||
const result = readPlain(dataView, ParquetType.INT64, 1, 0)
|
||||
const result = readPlain(dataView, ParquetType.INT64, 1, 0, false)
|
||||
expect(result).toEqual({ value: [1234567890123456789n], byteLength: 8 })
|
||||
})
|
||||
|
||||
@ -36,7 +36,7 @@ describe('readPlain', () => {
|
||||
dataView.setInt32(8, high, true)
|
||||
const expectedValue = (BigInt(high) << BigInt(32)) | low
|
||||
|
||||
const result = readPlain(dataView, ParquetType.INT96, 1, 0)
|
||||
const result = readPlain(dataView, ParquetType.INT96, 1, 0, false)
|
||||
expect(result).toEqual({
|
||||
value: [expectedValue],
|
||||
byteLength: 12,
|
||||
@ -46,14 +46,14 @@ describe('readPlain', () => {
|
||||
it('reads FLOAT values correctly', () => {
|
||||
const dataView = new DataView(new ArrayBuffer(4))
|
||||
dataView.setFloat32(0, 1234.5, true) // little-endian
|
||||
const result = readPlain(dataView, ParquetType.FLOAT, 1, 0)
|
||||
const result = readPlain(dataView, ParquetType.FLOAT, 1, 0, false)
|
||||
expect(result).toEqual({ value: [1234.5], byteLength: 4 })
|
||||
})
|
||||
|
||||
it('reads DOUBLE values correctly', () => {
|
||||
const dataView = new DataView(new ArrayBuffer(8))
|
||||
dataView.setFloat64(0, 12345.6789, true) // little-endian
|
||||
const result = readPlain(dataView, ParquetType.DOUBLE, 1, 0)
|
||||
const result = readPlain(dataView, ParquetType.DOUBLE, 1, 0, false)
|
||||
expect(result).toEqual({ value: [12345.6789], byteLength: 8 })
|
||||
})
|
||||
|
||||
@ -63,7 +63,7 @@ describe('readPlain', () => {
|
||||
dataView.setUint8(4, 1) // first byte array data
|
||||
dataView.setUint8(5, 2)
|
||||
dataView.setUint8(6, 3)
|
||||
const result = readPlain(dataView, ParquetType.BYTE_ARRAY, 1, 0)
|
||||
const result = readPlain(dataView, ParquetType.BYTE_ARRAY, 1, 0, false)
|
||||
expect(result).toEqual({
|
||||
value: [new Uint8Array([1, 2, 3])],
|
||||
byteLength: 7,
|
||||
@ -76,7 +76,7 @@ describe('readPlain', () => {
|
||||
dataView.setUint8(0, 4)
|
||||
dataView.setUint8(1, 5)
|
||||
dataView.setUint8(2, 6)
|
||||
const result = readPlain(dataView, ParquetType.FIXED_LEN_BYTE_ARRAY, fixedLength, 0)
|
||||
const result = readPlain(dataView, ParquetType.FIXED_LEN_BYTE_ARRAY, fixedLength, 0, false)
|
||||
expect(result).toEqual({
|
||||
value: new Uint8Array([4, 5, 6]),
|
||||
byteLength: fixedLength,
|
||||
@ -86,7 +86,8 @@ describe('readPlain', () => {
|
||||
it('throws an error for unhandled types', () => {
|
||||
const dataView = new DataView(new ArrayBuffer(0))
|
||||
const invalidType = 999
|
||||
expect(() => readPlain(dataView, invalidType, 1, 0)).toThrow(`parquet unhandled type: ${invalidType}`)
|
||||
expect(() => readPlain(dataView, invalidType, 1, 0, false))
|
||||
.toThrow(`parquet unhandled type: ${invalidType}`)
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
@ -11,10 +11,7 @@
|
||||
[],
|
||||
[
|
||||
null,
|
||||
{
|
||||
"0": 107,
|
||||
"1": 49
|
||||
},
|
||||
"k1",
|
||||
null,
|
||||
null
|
||||
],
|
||||
|
||||
Loading…
Reference in New Issue
Block a user