diff --git a/src/datapage.js b/src/datapage.js index 31e4e9b..606ff22 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -1,6 +1,12 @@ import { Encoding, ParquetType } from './constants.js' import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' -import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js' +import { + getMaxDefinitionLevel, + getMaxRepetitionLevel, + isRequired, + schemaElement, + skipDefinitionBytes, +} from './schema.js' const skipNulls = false // TODO @@ -54,7 +60,9 @@ export function readDataPage(bytes, daph, schema, columnMetadata) { // read values based on encoding const nval = daph.num_values - numNulls if (daph.encoding === Encoding.PLAIN) { - const plainObj = readPlain(dataView, columnMetadata.type, nval, offset) + const se = schemaElement(schema, columnMetadata.path_in_schema) + const utf8 = se.converted_type === 'UTF8' + const plainObj = readPlain(dataView, columnMetadata.type, nval, offset, utf8) values = plainObj.value offset += plainObj.byteLength } else if ( @@ -100,7 +108,7 @@ export function readDataPage(bytes, daph, schema, columnMetadata) { export function readDictionaryPage(bytes, diph, schema, columnMetadata) { const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) // read values based on encoding - const { value } = readPlain(dataView, columnMetadata.type, diph.num_values) + const { value } = readPlain(dataView, columnMetadata.type, diph.num_values, 0, false) return value } diff --git a/src/encoding.js b/src/encoding.js index f8397ec..5f41aa8 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -153,9 +153,10 @@ function readPlainByteArrayFixed(dataView, offset, fixedLength) { * @param {number} type - parquet type of the data * @param {number} count - number of values to read * @param {number} offset - offset to start reading from the DataView + * @param {boolean} utf8 - whether to decode byte arrays as UTF-8 * @returns {Decoded>} array of values */ -export function readPlain(dataView, type, count, offset = 0) { +export function readPlain(dataView, type, count, offset, utf8) { if (count === 0) return { value: [], byteLength: 0 } if (type === ParquetType.BOOLEAN) { return readPlainBoolean(dataView, offset, count) @@ -170,7 +171,15 @@ export function readPlain(dataView, type, count, offset = 0) { } else if (type === ParquetType.DOUBLE) { return readPlainDouble(dataView, offset, count) } else if (type === ParquetType.BYTE_ARRAY) { - return readPlainByteArray(dataView, offset, count) + const byteArray = readPlainByteArray(dataView, offset, count) + if (utf8) { + const decoder = new TextDecoder() + return { + value: byteArray.value.map(bytes => decoder.decode(bytes)), + byteLength: byteArray.byteLength, + } + } + return byteArray } else if (type === ParquetType.FIXED_LEN_BYTE_ARRAY) { return readPlainByteArrayFixed(dataView, offset, count) } else { diff --git a/test/encoding.test.js b/test/encoding.test.js index b89d14b..4eba549 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -7,21 +7,21 @@ describe('readPlain', () => { it('reads BOOLEAN values correctly', () => { const dataView = new DataView(new ArrayBuffer(1)) dataView.setUint8(0, 0b00000001) // Set the first bit to 1 - const result = readPlain(dataView, ParquetType.BOOLEAN, 1, 0) + const result = readPlain(dataView, ParquetType.BOOLEAN, 1, 0, false) expect(result).toEqual({ value: [true], byteLength: 1 }) }) it('reads INT32 values correctly', () => { const dataView = new DataView(new ArrayBuffer(4)) dataView.setInt32(0, 123456789, true) // little-endian - const result = readPlain(dataView, ParquetType.INT32, 1, 0) + const result = readPlain(dataView, ParquetType.INT32, 1, 0, false) expect(result).toEqual({ value: [123456789], byteLength: 4 }) }) it('reads INT64 values correctly', () => { const dataView = new DataView(new ArrayBuffer(8)) dataView.setBigInt64(0, BigInt('1234567890123456789'), true) - const result = readPlain(dataView, ParquetType.INT64, 1, 0) + const result = readPlain(dataView, ParquetType.INT64, 1, 0, false) expect(result).toEqual({ value: [1234567890123456789n], byteLength: 8 }) }) @@ -36,7 +36,7 @@ describe('readPlain', () => { dataView.setInt32(8, high, true) const expectedValue = (BigInt(high) << BigInt(32)) | low - const result = readPlain(dataView, ParquetType.INT96, 1, 0) + const result = readPlain(dataView, ParquetType.INT96, 1, 0, false) expect(result).toEqual({ value: [expectedValue], byteLength: 12, @@ -46,14 +46,14 @@ describe('readPlain', () => { it('reads FLOAT values correctly', () => { const dataView = new DataView(new ArrayBuffer(4)) dataView.setFloat32(0, 1234.5, true) // little-endian - const result = readPlain(dataView, ParquetType.FLOAT, 1, 0) + const result = readPlain(dataView, ParquetType.FLOAT, 1, 0, false) expect(result).toEqual({ value: [1234.5], byteLength: 4 }) }) it('reads DOUBLE values correctly', () => { const dataView = new DataView(new ArrayBuffer(8)) dataView.setFloat64(0, 12345.6789, true) // little-endian - const result = readPlain(dataView, ParquetType.DOUBLE, 1, 0) + const result = readPlain(dataView, ParquetType.DOUBLE, 1, 0, false) expect(result).toEqual({ value: [12345.6789], byteLength: 8 }) }) @@ -63,7 +63,7 @@ describe('readPlain', () => { dataView.setUint8(4, 1) // first byte array data dataView.setUint8(5, 2) dataView.setUint8(6, 3) - const result = readPlain(dataView, ParquetType.BYTE_ARRAY, 1, 0) + const result = readPlain(dataView, ParquetType.BYTE_ARRAY, 1, 0, false) expect(result).toEqual({ value: [new Uint8Array([1, 2, 3])], byteLength: 7, @@ -76,7 +76,7 @@ describe('readPlain', () => { dataView.setUint8(0, 4) dataView.setUint8(1, 5) dataView.setUint8(2, 6) - const result = readPlain(dataView, ParquetType.FIXED_LEN_BYTE_ARRAY, fixedLength, 0) + const result = readPlain(dataView, ParquetType.FIXED_LEN_BYTE_ARRAY, fixedLength, 0, false) expect(result).toEqual({ value: new Uint8Array([4, 5, 6]), byteLength: fixedLength, @@ -86,7 +86,8 @@ describe('readPlain', () => { it('throws an error for unhandled types', () => { const dataView = new DataView(new ArrayBuffer(0)) const invalidType = 999 - expect(() => readPlain(dataView, invalidType, 1, 0)).toThrow(`parquet unhandled type: ${invalidType}`) + expect(() => readPlain(dataView, invalidType, 1, 0, false)) + .toThrow(`parquet unhandled type: ${invalidType}`) }) }) diff --git a/test/files/nonnullable.impala.json b/test/files/nonnullable.impala.json index e00db3a..9e8379d 100644 --- a/test/files/nonnullable.impala.json +++ b/test/files/nonnullable.impala.json @@ -11,10 +11,7 @@ [], [ null, - { - "0": 107, - "1": 49 - }, + "k1", null, null ],