Fix UTF8 decoding

This commit is contained in:
Kenny Daniel 2024-02-16 16:25:06 -08:00
parent d02c68e883
commit e2b85304b3
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 33 additions and 18 deletions

@ -1,6 +1,12 @@
import { Encoding, ParquetType } from './constants.js'
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js'
import {
getMaxDefinitionLevel,
getMaxRepetitionLevel,
isRequired,
schemaElement,
skipDefinitionBytes,
} from './schema.js'
const skipNulls = false // TODO
@ -54,7 +60,9 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
// read values based on encoding
const nval = daph.num_values - numNulls
if (daph.encoding === Encoding.PLAIN) {
const plainObj = readPlain(dataView, columnMetadata.type, nval, offset)
const se = schemaElement(schema, columnMetadata.path_in_schema)
const utf8 = se.converted_type === 'UTF8'
const plainObj = readPlain(dataView, columnMetadata.type, nval, offset, utf8)
values = plainObj.value
offset += plainObj.byteLength
} else if (
@ -100,7 +108,7 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
export function readDictionaryPage(bytes, diph, schema, columnMetadata) {
// Wrap the page bytes in a DataView. byteOffset/byteLength are forwarded so the
// view covers only this slice of the underlying buffer, not the whole buffer.
const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
// Dictionary values are always read with readPlain here: diph.num_values values
// of columnMetadata.type. No encoding field is inspected in this function.
// NOTE(review): the two readPlain lines below appear to be the before/after pair
// of this commit's diff; the second one (explicit offset 0, utf8 = false) looks
// like the replacement for the first. Confirm against the real file.
// NOTE(review): utf8 is passed as false, so BYTE_ARRAY dictionary entries come
// back as raw byte arrays even for UTF8 columns, and `schema` is not consulted
// here. Verify that callers decode dictionary strings elsewhere.
const { value } = readPlain(dataView, columnMetadata.type, diph.num_values)
const { value } = readPlain(dataView, columnMetadata.type, diph.num_values, 0, false)
// Only the decoded values are returned; readPlain's byteLength is discarded.
return value
}

@ -153,9 +153,10 @@ function readPlainByteArrayFixed(dataView, offset, fixedLength) {
* @param {number} type - parquet type of the data
* @param {number} count - number of values to read
* @param {number} offset - offset to start reading from the DataView
* @param {boolean} utf8 - whether to decode byte arrays as UTF-8
* @returns {Decoded<ArrayLike<any>>} object with the decoded values (`value`) and the number of bytes read (`byteLength`)
*/
export function readPlain(dataView, type, count, offset = 0) {
export function readPlain(dataView, type, count, offset, utf8) {
if (count === 0) return { value: [], byteLength: 0 }
if (type === ParquetType.BOOLEAN) {
return readPlainBoolean(dataView, offset, count)
@ -170,7 +171,15 @@ export function readPlain(dataView, type, count, offset = 0) {
} else if (type === ParquetType.DOUBLE) {
return readPlainDouble(dataView, offset, count)
} else if (type === ParquetType.BYTE_ARRAY) {
return readPlainByteArray(dataView, offset, count)
const byteArray = readPlainByteArray(dataView, offset, count)
if (utf8) {
const decoder = new TextDecoder()
return {
value: byteArray.value.map(bytes => decoder.decode(bytes)),
byteLength: byteArray.byteLength,
}
}
return byteArray
} else if (type === ParquetType.FIXED_LEN_BYTE_ARRAY) {
return readPlainByteArrayFixed(dataView, offset, count)
} else {

@ -7,21 +7,21 @@ describe('readPlain', () => {
it('reads BOOLEAN values correctly', () => {
const dataView = new DataView(new ArrayBuffer(1))
dataView.setUint8(0, 0b00000001) // Set the first bit to 1
const result = readPlain(dataView, ParquetType.BOOLEAN, 1, 0)
const result = readPlain(dataView, ParquetType.BOOLEAN, 1, 0, false)
expect(result).toEqual({ value: [true], byteLength: 1 })
})
it('reads INT32 values correctly', () => {
const dataView = new DataView(new ArrayBuffer(4))
dataView.setInt32(0, 123456789, true) // little-endian
const result = readPlain(dataView, ParquetType.INT32, 1, 0)
const result = readPlain(dataView, ParquetType.INT32, 1, 0, false)
expect(result).toEqual({ value: [123456789], byteLength: 4 })
})
it('reads INT64 values correctly', () => {
const dataView = new DataView(new ArrayBuffer(8))
dataView.setBigInt64(0, BigInt('1234567890123456789'), true)
const result = readPlain(dataView, ParquetType.INT64, 1, 0)
const result = readPlain(dataView, ParquetType.INT64, 1, 0, false)
expect(result).toEqual({ value: [1234567890123456789n], byteLength: 8 })
})
@ -36,7 +36,7 @@ describe('readPlain', () => {
dataView.setInt32(8, high, true)
const expectedValue = (BigInt(high) << BigInt(32)) | low
const result = readPlain(dataView, ParquetType.INT96, 1, 0)
const result = readPlain(dataView, ParquetType.INT96, 1, 0, false)
expect(result).toEqual({
value: [expectedValue],
byteLength: 12,
@ -46,14 +46,14 @@ describe('readPlain', () => {
it('reads FLOAT values correctly', () => {
const dataView = new DataView(new ArrayBuffer(4))
dataView.setFloat32(0, 1234.5, true) // little-endian
const result = readPlain(dataView, ParquetType.FLOAT, 1, 0)
const result = readPlain(dataView, ParquetType.FLOAT, 1, 0, false)
expect(result).toEqual({ value: [1234.5], byteLength: 4 })
})
it('reads DOUBLE values correctly', () => {
const dataView = new DataView(new ArrayBuffer(8))
dataView.setFloat64(0, 12345.6789, true) // little-endian
const result = readPlain(dataView, ParquetType.DOUBLE, 1, 0)
const result = readPlain(dataView, ParquetType.DOUBLE, 1, 0, false)
expect(result).toEqual({ value: [12345.6789], byteLength: 8 })
})
@ -63,7 +63,7 @@ describe('readPlain', () => {
dataView.setUint8(4, 1) // first byte array data
dataView.setUint8(5, 2)
dataView.setUint8(6, 3)
const result = readPlain(dataView, ParquetType.BYTE_ARRAY, 1, 0)
const result = readPlain(dataView, ParquetType.BYTE_ARRAY, 1, 0, false)
expect(result).toEqual({
value: [new Uint8Array([1, 2, 3])],
byteLength: 7,
@ -76,7 +76,7 @@ describe('readPlain', () => {
dataView.setUint8(0, 4)
dataView.setUint8(1, 5)
dataView.setUint8(2, 6)
const result = readPlain(dataView, ParquetType.FIXED_LEN_BYTE_ARRAY, fixedLength, 0)
const result = readPlain(dataView, ParquetType.FIXED_LEN_BYTE_ARRAY, fixedLength, 0, false)
expect(result).toEqual({
value: new Uint8Array([4, 5, 6]),
byteLength: fixedLength,
@ -86,7 +86,8 @@ describe('readPlain', () => {
it('throws an error for unhandled types', () => {
const dataView = new DataView(new ArrayBuffer(0))
const invalidType = 999
expect(() => readPlain(dataView, invalidType, 1, 0)).toThrow(`parquet unhandled type: ${invalidType}`)
expect(() => readPlain(dataView, invalidType, 1, 0, false))
.toThrow(`parquet unhandled type: ${invalidType}`)
})
})

@ -11,10 +11,7 @@
[],
[
null,
{
"0": 107,
"1": 49
},
"k1",
null,
null
],