diff --git a/src/datapage.js b/src/datapage.js index af0d49c..8fcd085 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -31,10 +31,7 @@ export function readDataPage(bytes, daph, schemaPath, columnMetadata) { // read values based on encoding const nValues = daph.num_values - numNulls if (daph.encoding === 'PLAIN') { - const { element } = schemaPath[schemaPath.length - 1] - const utf8 = element.converted_type === 'UTF8' - const plainObj = readPlain(reader, columnMetadata.type, nValues, utf8) - dataPage = plainObj + dataPage = readPlain(reader, columnMetadata.type, nValues) } else if ( daph.encoding === 'PLAIN_DICTIONARY' || daph.encoding === 'RLE_DICTIONARY' || @@ -75,7 +72,7 @@ export function readDataPage(bytes, daph, schemaPath, columnMetadata) { export function readDictionaryPage(bytes, diph, columnMetadata) { const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) const reader = { view, offset: 0 } - return readPlain(reader, columnMetadata.type, diph.num_values, false) + return readPlain(reader, columnMetadata.type, diph.num_values) } /** diff --git a/src/datapageV2.js b/src/datapageV2.js index d0a9106..b350675 100644 --- a/src/datapageV2.js +++ b/src/datapageV2.js @@ -48,15 +48,13 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, // read values based on encoding const nValues = daph2.num_values - daph2.num_nulls if (daph2.encoding === 'PLAIN') { - const { element } = schemaPath[schemaPath.length - 1] - const utf8 = element.converted_type === 'UTF8' let page = compressedBytes.slice(reader.offset) if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') { page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors) } const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) const pageReader = { view: pageView, offset: 0 } - dataPage = readPlain(pageReader, columnMetadata.type, nValues, utf8) + dataPage = readPlain(pageReader, columnMetadata.type, nValues) } else if (daph2.encoding === 'RLE') { const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors) const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) diff --git a/src/plain.js b/src/plain.js index 870aece..c89f835 100644 --- a/src/plain.js +++ b/src/plain.js @@ -7,10 +7,9 @@ * @param {DataReader} reader - buffer to read data from * @param {ParquetType} type - parquet type of the data * @param {number} count - number of values to read - * @param {boolean} utf8 - whether to decode byte arrays as UTF-8 * @returns {DecodedArray} array of values */ -export function readPlain(reader, type, count, utf8) { +export function readPlain(reader, type, count) { if (count === 0) return [] if (type === 'BOOLEAN') { return readPlainBoolean(reader, count) @@ -25,12 +24,7 @@ export function readPlain(reader, type, count, utf8) { } else if (type === 'DOUBLE') { return readPlainDouble(reader, count) } else if (type === 'BYTE_ARRAY') { - const byteArray = readPlainByteArray(reader, count) - if (utf8) { - const decoder = new TextDecoder() - return byteArray.map(bytes => decoder.decode(bytes)) - } - return byteArray + return readPlainByteArray(reader, count) } else if (type === 'FIXED_LEN_BYTE_ARRAY') { return readPlainByteArrayFixed(reader, count) } else { diff --git a/test/plain.test.js b/test/plain.test.js index 807f6f6..30343a9 100644 --- a/test/plain.test.js +++ b/test/plain.test.js @@ -7,7 +7,7 @@ describe('readPlain', () => { const view = new DataView(new ArrayBuffer(1)) view.setUint8(0, 0b00000101) // true, false, true const reader = { view, offset: 0 } - const result = readPlain(reader, 'BOOLEAN', 3, false) + const result = readPlain(reader, 'BOOLEAN', 3) expect(result).toEqual([true, false, true]) expect(reader.offset).toBe(1) }) @@ -16,7 +16,7 @@ describe('readPlain', () => { const view = new DataView(new ArrayBuffer(4)) view.setInt32(0, 123456789, true) // little-endian const reader = { view, offset: 0 } - const result = readPlain(reader, 'INT32', 1, false) + const result = readPlain(reader, 'INT32', 1) expect(result).toEqual(new Int32Array([123456789])) expect(reader.offset).toBe(4) }) @@ -25,7 +25,7 @@ describe('readPlain', () => { const view = new DataView(new ArrayBuffer(8)) view.setBigInt64(0, BigInt('1234567890123456789'), true) const reader = { view, offset: 0 } - const result = readPlain(reader, 'INT64', 1, false) + const result = readPlain(reader, 'INT64', 1) expect(result).toEqual(new BigInt64Array([1234567890123456789n])) expect(reader.offset).toBe(8) }) @@ -40,7 +40,7 @@ describe('readPlain', () => { view.setBigInt64(0, low, true) view.setInt32(8, high, true) const reader = { view, offset: 0 } - const result = readPlain(reader, 'INT96', 1, false) + const result = readPlain(reader, 'INT96', 1) const expectedValue = (BigInt(high) << BigInt(32)) | low expect(result).toEqual([expectedValue]) expect(reader.offset).toBe(12) @@ -50,7 +50,7 @@ describe('readPlain', () => { const view = new DataView(new ArrayBuffer(4)) view.setFloat32(0, 1234.5, true) // little-endian const reader = { view, offset: 0 } - const result = readPlain(reader, 'FLOAT', 1, false) + const result = readPlain(reader, 'FLOAT', 1) expect(result).toEqual(new Float32Array([1234.5])) expect(reader.offset).toBe(4) }) @@ -59,7 +59,7 @@ describe('readPlain', () => { const view = new DataView(new ArrayBuffer(8)) view.setFloat64(0, 12345.6789, true) // little-endian const reader = { view, offset: 0 } - const result = readPlain(reader, 'DOUBLE', 1, false) + const result = readPlain(reader, 'DOUBLE', 1) expect(result).toEqual(new Float64Array([12345.6789])) expect(reader.offset).toBe(8) }) @@ -71,23 +71,11 @@ describe('readPlain', () => { view.setUint8(5, 2) view.setUint8(6, 3) const reader = { view, offset: 0 } - const result = readPlain(reader, 'BYTE_ARRAY', 1, false) + const result = readPlain(reader, 'BYTE_ARRAY', 1) expect(result).toEqual([new Uint8Array([1, 2, 3])]) expect(reader.offset).toBe(7) }) - it('reads BYTE_ARRAY values as strings', () => { - const view = new DataView(new ArrayBuffer(10)) - view.setInt32(0, 3, true) // length 3 - view.setUint8(4, 65) - view.setUint8(5, 66) - view.setUint8(6, 67) - const reader = { view, offset: 0 } - const result = readPlain(reader, 'BYTE_ARRAY', 1, true) - expect(result).toEqual(['ABC']) - expect(reader.offset).toBe(7) - }) - it('reads FIXED_LEN_BYTE_ARRAY values', () => { const fixedLength = 3 const view = new DataView(new ArrayBuffer(fixedLength)) @@ -95,7 +83,7 @@ describe('readPlain', () => { view.setUint8(1, 5) view.setUint8(2, 6) const reader = { view, offset: 0 } - const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength, false) + const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength) expect(result).toEqual(new Uint8Array([4, 5, 6])) expect(reader.offset).toBe(fixedLength) }) @@ -105,7 +93,7 @@ describe('readPlain', () => { const reader = { view, offset: 0 } /** @type any */ const invalidType = 'invalidType' - expect(() => readPlain(reader, invalidType, 1, false)) + expect(() => readPlain(reader, invalidType, 1)) .toThrow(`parquet unhandled type: ${invalidType}`) }) })