From b8660baea155d49a133479c299c22bc2b048f8a9 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Tue, 30 Apr 2024 20:28:50 -0700 Subject: [PATCH] Split out plain encoding --- src/datapage.js | 3 +- src/datapageV2.js | 3 +- src/encoding.js | 174 ------------------------------------------ src/plain.js | 174 ++++++++++++++++++++++++++++++++++++++++++ test/encoding.test.js | 99 +----------------------- test/plain.test.js | 99 ++++++++++++++++++++++++ 6 files changed, 278 insertions(+), 274 deletions(-) create mode 100644 src/plain.js create mode 100644 test/plain.test.js diff --git a/src/datapage.js b/src/datapage.js index 7a0cbeb..1cbcf1e 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -1,4 +1,5 @@ -import { readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' +import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' +import { readPlain } from './plain.js' import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js' const skipNulls = false // TODO diff --git a/src/datapageV2.js b/src/datapageV2.js index fc38f0c..52fdaf6 100644 --- a/src/datapageV2.js +++ b/src/datapageV2.js @@ -1,5 +1,6 @@ import { decompressPage } from './column.js' -import { readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' +import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' +import { readPlain } from './plain.js' import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js' import { readVarInt, readZigZag } from './thrift.js' diff --git a/src/encoding.js b/src/encoding.js index afc2c6f..7ceb84f 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -1,179 +1,5 @@ import { readVarInt } from './thrift.js' -/** - * Read `count` boolean values. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} count - number of values to read - * @returns {boolean[]} array of boolean values - */ -function readPlainBoolean(reader, count) { - const values = new Array(count) - for (let i = 0; i < count; i++) { - const byteOffset = reader.offset + Math.floor(i / 8) - const bitOffset = i % 8 - const byte = reader.view.getUint8(byteOffset) - values[i] = (byte & (1 << bitOffset)) !== 0 - } - reader.offset += Math.ceil(count / 8) - return values -} - -/** - * Read `count` int32 values. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} count - number of values to read - * @returns {number[]} array of int32 values - */ -function readPlainInt32(reader, count) { - const values = new Array(count) - for (let i = 0; i < count; i++) { - values[i] = reader.view.getInt32(reader.offset + i * 4, true) - } - reader.offset += count * 4 - return values -} - -/** - * Read `count` int64 values. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} count - number of values to read - * @returns {bigint[]} array of int64 values - */ -function readPlainInt64(reader, count) { - const values = new Array(count) - for (let i = 0; i < count; i++) { - values[i] = reader.view.getBigInt64(reader.offset + i * 8, true) - } - reader.offset += count * 8 - return values -} - -/** - * Read `count` int96 values. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} count - number of values to read - * @returns {bigint[]} array of int96 values - */ -function readPlainInt96(reader, count) { - const values = new Array(count) - for (let i = 0; i < count; i++) { - const low = reader.view.getBigInt64(reader.offset + i * 12, true) - const high = reader.view.getInt32(reader.offset + i * 12 + 8, true) - values[i] = (BigInt(high) << BigInt(32)) | low - } - reader.offset += count * 12 - return values -} - -/** - * Read `count` float values. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} count - number of values to read - * @returns {number[]} array of float values - */ -function readPlainFloat(reader, count) { - const values = new Array(count) - for (let i = 0; i < count; i++) { - values[i] = reader.view.getFloat32(reader.offset + i * 4, true) - } - reader.offset += count * 4 - return values -} - -/** - * Read `count` double values. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} count - number of values to read - * @returns {number[]} array of double values - */ -function readPlainDouble(reader, count) { - const values = new Array(count) - for (let i = 0; i < count; i++) { - values[i] = reader.view.getFloat64(reader.offset + i * 8, true) - } - reader.offset += count * 8 - return values -} - -/** - * Read `count` byte array values. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} count - number of values to read - * @returns {Uint8Array[]} array of byte arrays - */ -function readPlainByteArray(reader, count) { - const values = new Array(count) - for (let i = 0; i < count; i++) { - const length = reader.view.getInt32(reader.offset, true) - reader.offset += 4 - values[i] = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length) - reader.offset += length - } - return values -} - -/** - * Read a fixed length byte array. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} fixedLength - length of each fixed length byte array - * @returns {Uint8Array} array of fixed length byte arrays - */ -function readPlainByteArrayFixed(reader, fixedLength) { - reader.offset += fixedLength - return new Uint8Array( - reader.view.buffer, - reader.view.byteOffset + reader.offset - fixedLength, - fixedLength - ) -} - -/** - * Read `count` values of the given type from the reader.view. - * - * @typedef {import("./types.d.ts").DecodedArray} DecodedArray - * @typedef {import("./types.d.ts").ParquetType} ParquetType - * @param {DataReader} reader - buffer to read data from - * @param {ParquetType} type - parquet type of the data - * @param {number} count - number of values to read - * @param {boolean} utf8 - whether to decode byte arrays as UTF-8 - * @returns {DecodedArray} array of values - */ -export function readPlain(reader, type, count, utf8) { - if (count === 0) return [] - if (type === 'BOOLEAN') { - return readPlainBoolean(reader, count) - } else if (type === 'INT32') { - return readPlainInt32(reader, count) - } else if (type === 'INT64') { - return readPlainInt64(reader, count) - } else if (type === 'INT96') { - return readPlainInt96(reader, count) - } else if (type === 'FLOAT') { - return readPlainFloat(reader, count) - } else if (type === 'DOUBLE') { - return readPlainDouble(reader, count) - } else if (type === 'BYTE_ARRAY') { - const byteArray = readPlainByteArray(reader, count) - if (utf8) { - const decoder = new TextDecoder() - return byteArray.map(bytes => decoder.decode(bytes)) - } - return byteArray - } else if (type === 'FIXED_LEN_BYTE_ARRAY') { - return readPlainByteArrayFixed(reader, count) - } else { - throw new Error(`parquet unhandled type: ${type}`) - } -} - /** * Convert the value specified to a bit width. * diff --git a/src/plain.js b/src/plain.js new file mode 100644 index 0000000..cfd6e37 --- /dev/null +++ b/src/plain.js @@ -0,0 +1,174 @@ +/** + * Read `count` boolean values. + * + * @typedef {import("./types.d.ts").DataReader} DataReader + * @param {DataReader} reader - buffer to read data from + * @param {number} count - number of values to read + * @returns {boolean[]} array of boolean values + */ +function readPlainBoolean(reader, count) { + const values = new Array(count) + for (let i = 0; i < count; i++) { + const byteOffset = reader.offset + Math.floor(i / 8) + const bitOffset = i % 8 + const byte = reader.view.getUint8(byteOffset) + values[i] = (byte & (1 << bitOffset)) !== 0 + } + reader.offset += Math.ceil(count / 8) + return values +} + +/** + * Read `count` int32 values. + * + * @param {DataReader} reader - buffer to read data from + * @param {number} count - number of values to read + * @returns {number[]} array of int32 values + */ +function readPlainInt32(reader, count) { + const values = new Array(count) + for (let i = 0; i < count; i++) { + values[i] = reader.view.getInt32(reader.offset + i * 4, true) + } + reader.offset += count * 4 + return values +} + +/** + * Read `count` int64 values. + * + * @param {DataReader} reader - buffer to read data from + * @param {number} count - number of values to read + * @returns {bigint[]} array of int64 values + */ +function readPlainInt64(reader, count) { + const values = new Array(count) + for (let i = 0; i < count; i++) { + values[i] = reader.view.getBigInt64(reader.offset + i * 8, true) + } + reader.offset += count * 8 + return values +} + +/** + * Read `count` int96 values. + * + * @param {DataReader} reader - buffer to read data from + * @param {number} count - number of values to read + * @returns {bigint[]} array of int96 values + */ +function readPlainInt96(reader, count) { + const values = new Array(count) + for (let i = 0; i < count; i++) { + const low = reader.view.getBigInt64(reader.offset + i * 12, true) + const high = reader.view.getInt32(reader.offset + i * 12 + 8, true) + values[i] = (BigInt(high) << BigInt(32)) | low + } + reader.offset += count * 12 + return values +} + +/** + * Read `count` float values. + * + * @param {DataReader} reader - buffer to read data from + * @param {number} count - number of values to read + * @returns {number[]} array of float values + */ +function readPlainFloat(reader, count) { + const values = new Array(count) + for (let i = 0; i < count; i++) { + values[i] = reader.view.getFloat32(reader.offset + i * 4, true) + } + reader.offset += count * 4 + return values +} + +/** + * Read `count` double values. + * + * @param {DataReader} reader - buffer to read data from + * @param {number} count - number of values to read + * @returns {number[]} array of double values + */ +function readPlainDouble(reader, count) { + const values = new Array(count) + for (let i = 0; i < count; i++) { + values[i] = reader.view.getFloat64(reader.offset + i * 8, true) + } + reader.offset += count * 8 + return values +} + +/** + * Read `count` byte array values. + * + * @param {DataReader} reader - buffer to read data from + * @param {number} count - number of values to read + * @returns {Uint8Array[]} array of byte arrays + */ +function readPlainByteArray(reader, count) { + const values = new Array(count) + for (let i = 0; i < count; i++) { + const length = reader.view.getInt32(reader.offset, true) + reader.offset += 4 + values[i] = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length) + reader.offset += length + } + return values +} + +/** + * Read a fixed length byte array. + * + * @param {DataReader} reader - buffer to read data from + * @param {number} fixedLength - length of each fixed length byte array + * @returns {Uint8Array} array of fixed length byte arrays + */ +function readPlainByteArrayFixed(reader, fixedLength) { + reader.offset += fixedLength + return new Uint8Array( + reader.view.buffer, + reader.view.byteOffset + reader.offset - fixedLength, + fixedLength + ) +} + +/** + * Read `count` values of the given type from the reader.view. + * + * @typedef {import("./types.d.ts").DecodedArray} DecodedArray + * @typedef {import("./types.d.ts").ParquetType} ParquetType + * @param {DataReader} reader - buffer to read data from + * @param {ParquetType} type - parquet type of the data + * @param {number} count - number of values to read + * @param {boolean} utf8 - whether to decode byte arrays as UTF-8 + * @returns {DecodedArray} array of values + */ +export function readPlain(reader, type, count, utf8) { + if (count === 0) return [] + if (type === 'BOOLEAN') { + return readPlainBoolean(reader, count) + } else if (type === 'INT32') { + return readPlainInt32(reader, count) + } else if (type === 'INT64') { + return readPlainInt64(reader, count) + } else if (type === 'INT96') { + return readPlainInt96(reader, count) + } else if (type === 'FLOAT') { + return readPlainFloat(reader, count) + } else if (type === 'DOUBLE') { + return readPlainDouble(reader, count) + } else if (type === 'BYTE_ARRAY') { + const byteArray = readPlainByteArray(reader, count) + if (utf8) { + const decoder = new TextDecoder() + return byteArray.map(bytes => decoder.decode(bytes)) + } + return byteArray + } else if (type === 'FIXED_LEN_BYTE_ARRAY') { + return readPlainByteArrayFixed(reader, count) + } else { + throw new Error(`parquet unhandled type: ${type}`) + } +} diff --git a/test/encoding.test.js b/test/encoding.test.js index dede011..58b6a7a 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -1,102 +1,5 @@ import { describe, expect, it } from 'vitest' -import { readPlain, readRleBitPackedHybrid } from '../src/encoding.js' - -describe('readPlain', () => { - - it('reads BOOLEAN values correctly', () => { - const view = new DataView(new ArrayBuffer(1)) - view.setUint8(0, 0b00000001) // Set the first bit to 1 - const reader = { view, offset: 0 } - const result = readPlain(reader, 'BOOLEAN', 1, false) - expect(result).toEqual([true]) - expect(reader.offset).toBe(1) - }) - - it('reads INT32 values correctly', () => { - const view = new DataView(new ArrayBuffer(4)) - view.setInt32(0, 123456789, true) // little-endian - const reader = { view, offset: 0 } - const result = readPlain(reader, 'INT32', 1, false) - expect(result).toEqual([123456789]) - expect(reader.offset).toBe(4) - }) - - it('reads INT64 values correctly', () => { - const view = new DataView(new ArrayBuffer(8)) - view.setBigInt64(0, BigInt('1234567890123456789'), true) - const reader = { view, offset: 0 } - const result = readPlain(reader, 'INT64', 1, false) - expect(result).toEqual([1234567890123456789n]) - expect(reader.offset).toBe(8) - }) - - it('reads INT96 values correctly', () => { - const buffer = new ArrayBuffer(12) - const view = new DataView(buffer) - - // Example INT96 value split into 64-bit low part and 32-bit high part - const low = BigInt('0x0123456789ABCDEF') - const high = 0x02345678 - view.setBigInt64(0, low, true) - view.setInt32(8, high, true) - const reader = { view, offset: 0 } - const result = readPlain(reader, 'INT96', 1, false) - const expectedValue = (BigInt(high) << BigInt(32)) | low - expect(result).toEqual([expectedValue]) - expect(reader.offset).toBe(12) - }) - - it('reads FLOAT values correctly', () => { - const view = new DataView(new ArrayBuffer(4)) - view.setFloat32(0, 1234.5, true) // little-endian - const reader = { view, offset: 0 } - const result = readPlain(reader, 'FLOAT', 1, false) - expect(result).toEqual([1234.5]) - expect(reader.offset).toBe(4) - }) - - it('reads DOUBLE values correctly', () => { - const view = new DataView(new ArrayBuffer(8)) - view.setFloat64(0, 12345.6789, true) // little-endian - const reader = { view, offset: 0 } - const result = readPlain(reader, 'DOUBLE', 1, false) - expect(result).toEqual([12345.6789]) - expect(reader.offset).toBe(8) - }) - - it('reads BYTE_ARRAY values correctly', () => { - const view = new DataView(new ArrayBuffer(10)) - view.setInt32(0, 3, true) // length of the first byte array - view.setUint8(4, 1) // first byte array data - view.setUint8(5, 2) - view.setUint8(6, 3) - const reader = { view, offset: 0 } - const result = readPlain(reader, 'BYTE_ARRAY', 1, false) - expect(result).toEqual([new Uint8Array([1, 2, 3])]) - expect(reader.offset).toBe(7) - }) - - it('reads FIXED_LEN_BYTE_ARRAY values correctly', () => { - const fixedLength = 3 - const view = new DataView(new ArrayBuffer(fixedLength)) - view.setUint8(0, 4) - view.setUint8(1, 5) - view.setUint8(2, 6) - const reader = { view, offset: 0 } - const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength, false) - expect(result).toEqual(new Uint8Array([4, 5, 6])) - expect(reader.offset).toBe(fixedLength) - }) - - it('throws an error for unhandled types', () => { - const view = new DataView(new ArrayBuffer(0)) - const reader = { view, offset: 0 } - /** @type any */ - const invalidType = 'invalidType' - expect(() => readPlain(reader, invalidType, 1, false)) - .toThrow(`parquet unhandled type: ${invalidType}`) - }) -}) +import { readRleBitPackedHybrid } from '../src/encoding.js' describe('readRleBitPackedHybrid', () => { it('reads RLE bit-packed hybrid values with explicit length', () => { diff --git a/test/plain.test.js b/test/plain.test.js new file mode 100644 index 0000000..9d1de6b --- /dev/null +++ b/test/plain.test.js @@ -0,0 +1,99 @@ +import { describe, expect, it } from 'vitest' +import { readPlain } from '../src/plain.js' + +describe('readPlain', () => { + + it('reads BOOLEAN values correctly', () => { + const view = new DataView(new ArrayBuffer(1)) + view.setUint8(0, 0b00000001) // Set the first bit to 1 + const reader = { view, offset: 0 } + const result = readPlain(reader, 'BOOLEAN', 1, false) + expect(result).toEqual([true]) + expect(reader.offset).toBe(1) + }) + + it('reads INT32 values correctly', () => { + const view = new DataView(new ArrayBuffer(4)) + view.setInt32(0, 123456789, true) // little-endian + const reader = { view, offset: 0 } + const result = readPlain(reader, 'INT32', 1, false) + expect(result).toEqual([123456789]) + expect(reader.offset).toBe(4) + }) + + it('reads INT64 values correctly', () => { + const view = new DataView(new ArrayBuffer(8)) + view.setBigInt64(0, BigInt('1234567890123456789'), true) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'INT64', 1, false) + expect(result).toEqual([1234567890123456789n]) + expect(reader.offset).toBe(8) + }) + + it('reads INT96 values correctly', () => { + const buffer = new ArrayBuffer(12) + const view = new DataView(buffer) + + // Example INT96 value split into 64-bit low part and 32-bit high part + const low = BigInt('0x0123456789ABCDEF') + const high = 0x02345678 + view.setBigInt64(0, low, true) + view.setInt32(8, high, true) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'INT96', 1, false) + const expectedValue = (BigInt(high) << BigInt(32)) | low + expect(result).toEqual([expectedValue]) + expect(reader.offset).toBe(12) + }) + + it('reads FLOAT values correctly', () => { + const view = new DataView(new ArrayBuffer(4)) + view.setFloat32(0, 1234.5, true) // little-endian + const reader = { view, offset: 0 } + const result = readPlain(reader, 'FLOAT', 1, false) + expect(result).toEqual([1234.5]) + expect(reader.offset).toBe(4) + }) + + it('reads DOUBLE values correctly', () => { + const view = new DataView(new ArrayBuffer(8)) + view.setFloat64(0, 12345.6789, true) // little-endian + const reader = { view, offset: 0 } + const result = readPlain(reader, 'DOUBLE', 1, false) + expect(result).toEqual([12345.6789]) + expect(reader.offset).toBe(8) + }) + + it('reads BYTE_ARRAY values correctly', () => { + const view = new DataView(new ArrayBuffer(10)) + view.setInt32(0, 3, true) // length of the first byte array + view.setUint8(4, 1) // first byte array data + view.setUint8(5, 2) + view.setUint8(6, 3) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'BYTE_ARRAY', 1, false) + expect(result).toEqual([new Uint8Array([1, 2, 3])]) + expect(reader.offset).toBe(7) + }) + + it('reads FIXED_LEN_BYTE_ARRAY values correctly', () => { + const fixedLength = 3 + const view = new DataView(new ArrayBuffer(fixedLength)) + view.setUint8(0, 4) + view.setUint8(1, 5) + view.setUint8(2, 6) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength, false) + expect(result).toEqual(new Uint8Array([4, 5, 6])) + expect(reader.offset).toBe(fixedLength) + }) + + it('throws an error for unhandled types', () => { + const view = new DataView(new ArrayBuffer(0)) + const reader = { view, offset: 0 } + /** @type any */ + const invalidType = 'invalidType' + expect(() => readPlain(reader, invalidType, 1, false)) + .toThrow(`parquet unhandled type: ${invalidType}`) + }) +})