diff --git a/src/encoding.js b/src/encoding.js index 1fa6c1d..43a7faf 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -93,12 +93,11 @@ function readBitPacked(reader, header, bitWidth, values, seen) { // mask for bitWidth number of bits const mask = (1 << bitWidth) - 1 - // Sometimes it tries to read outside of available memory, but it will be masked out anyway let data = 0 if (reader.offset < reader.view.byteLength) { - data = reader.view.getUint8(reader.offset) - reader.offset++ + data = reader.view.getUint8(reader.offset++) } else if (mask) { + // sometimes out-of-bounds reads are masked out throw new Error(`parquet bitpack offset ${reader.offset} out of range`) } let left = 8 diff --git a/test/encoding.test.js b/test/encoding.test.js index 58b6a7a..01e62f0 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -1,39 +1,124 @@ import { describe, expect, it } from 'vitest' -import { readRleBitPackedHybrid } from '../src/encoding.js' +import { readRleBitPackedHybrid, widthFromMaxInt } from '../src/encoding.js' describe('readRleBitPackedHybrid', () => { - it('reads RLE bit-packed hybrid values with explicit length', () => { - // Example buffer: 1 RLE group followed by 1 bit-packed group - // RLE values: true x3 - // Bit-packed values: false, false, true + it('reads RLE values with explicit length', () => { const buffer = new ArrayBuffer(4) const view = new DataView(buffer) - view.setUint8(0, 0b00000110) // RLE header for 3 true values - view.setUint8(1, 0b00000001) // RLE value (true) - view.setUint8(2, 0b00000011) // Bit-packed header for 3 values - view.setUint8(3, 0b00000100) // Bit-packed values (false, false, true) + // RLE 3x true + view.setUint8(0, 0b00000110) + view.setUint8(1, 1) + // RLE 3x 100 + view.setUint8(2, 0b00000110) + view.setUint8(3, 100) const reader = { view, offset: 0 } const values = new Array(6) - readRleBitPackedHybrid(reader, 1, 3, values) + readRleBitPackedHybrid(reader, 1, 6, values) expect(reader.offset).toBe(4) - expect(values).toEqual([1, 1, 1, 0, 0, 1]) + expect(values).toEqual([1, 1, 1, 100, 100, 100]) }) - it('reads RLE bit-packed hybrid values with implicit length', () => { - // Example buffer: same as previous test, but with implicit length + it('reads RLE values with bitwidth=16', () => { + const buffer = new ArrayBuffer(6) + const view = new DataView(buffer) + // RLE 3x 65535 + view.setUint8(3, 0b00000110) + view.setUint16(4, 65535, true) + const reader = { view, offset: 0 } + + const values = new Array(3) + readRleBitPackedHybrid(reader, 16, 6, values) + expect(reader.offset).toBe(6) + expect(values).toEqual([65535, 65535, 65535]) + }) + + it('reads RLE values with bitwidth=32', () => { + const buffer = new ArrayBuffer(5) + const view = new DataView(buffer) + // RLE 3x 234000 + view.setUint8(0, 0b00000110) + view.setUint32(1, 234000, true) + const reader = { view, offset: 0 } + + const values = new Array(3) + readRleBitPackedHybrid(reader, 32, 3, values) + expect(reader.offset).toBe(5) + expect(values).toEqual([234000, 234000, 234000]) + }) + + it('throws for invalid bitwidth', () => { + const buffer = new ArrayBuffer(1) + const view = new DataView(buffer) + view.setUint8(0, 0b00000110) + const reader = { view, offset: 0 } + + const values = new Array(3) + expect(() => readRleBitPackedHybrid(reader, 24, 3, values)) + .toThrow('parquet invalid rle width 3') + }) + + it('reads bit-packed values with implicit length', () => { + // Bit-packed values: false, false, true const buffer = new ArrayBuffer(8) const view = new DataView(buffer) view.setInt32(0, 3, true) // length 3 little-endian - view.setUint8(4, 0b00000110) // RLE header for 3 true values - view.setUint8(5, 0b00000001) // RLE value (true) - view.setUint8(6, 0b00000011) // Bit-packed header for 3 values - view.setUint8(7, 0b00000100) // Bit-packed values (false, false, true) + view.setUint8(4, 0b00000011) // Bit-packed header for 1-8 values + view.setUint8(5, 0b00000100) // Bit-packed values (false, false, true) const reader = { view, offset: 0 } - const values = new Array(6) + const values = new Array(3) readRleBitPackedHybrid(reader, 1, 0, values) - expect(reader.offset).toBe(8) - expect(values).toEqual([1, 1, 1, 0, 0, 1]) + expect(reader.offset).toBe(6) + expect(values).toEqual([0, 0, 1]) + }) + + it('reads multi-byte bit-packed values', () => { + // Bit-packed 9x true + const buffer = new ArrayBuffer(3) + const view = new DataView(buffer) + view.setUint8(0, 0b00000101) // Bit-packed header for 9-16 values + view.setUint8(1, 0b11111111) + view.setUint8(2, 0b00000001) + const reader = { view, offset: 0 } + + const values = new Array(9) + readRleBitPackedHybrid(reader, 1, 9, values) + expect(reader.offset).toBe(3) + expect(values).toEqual([1, 1, 1, 1, 1, 1, 1, 1, 1]) + }) + + it('throws for invalid bit-packed offset', () => { + const buffer = new ArrayBuffer(1) + const view = new DataView(buffer) + view.setUint8(0, 0b00000011) // Bit-packed header for 3 values + const reader = { view, offset: 0 } + + const values = new Array(3) + expect(() => readRleBitPackedHybrid(reader, 1, 3, values)) + .toThrow('parquet bitpack offset 1 out of range') + }) + + it('throws for negative implicit length', () => { + const buffer = new ArrayBuffer(4) + const view = new DataView(buffer) + view.setInt32(0, -1, true) // negative length + const reader = { view, offset: 0 } + + const values = new Array(3) + expect(() => readRleBitPackedHybrid(reader, 1, 0, values)) + .toThrow('parquet invalid rle/bitpack length -1') + }) +}) + +describe('widthFromMaxInt', () => { + it('calculates bit widths', () => { + // Test a range of inputs and their expected outputs + expect(widthFromMaxInt(0)).toBe(0) + expect(widthFromMaxInt(1)).toBe(1) + expect(widthFromMaxInt(255)).toBe(8) + expect(widthFromMaxInt(256)).toBe(9) + expect(widthFromMaxInt(1023)).toBe(10) + expect(widthFromMaxInt(1048575)).toBe(20) }) }) diff --git a/test/plain.test.js b/test/plain.test.js index 1a29ab8..807f6f6 100644 --- a/test/plain.test.js +++ b/test/plain.test.js @@ -3,16 +3,16 @@ import { readPlain } from '../src/plain.js' describe('readPlain', () => { - it('reads BOOLEAN values correctly', () => { + it('reads BOOLEAN values', () => { const view = new DataView(new ArrayBuffer(1)) - view.setUint8(0, 0b00000001) // Set the first bit to 1 + view.setUint8(0, 0b00000101) // true, false, true const reader = { view, offset: 0 } - const result = readPlain(reader, 'BOOLEAN', 1, false) - expect(result).toEqual([true]) + const result = readPlain(reader, 'BOOLEAN', 3, false) + expect(result).toEqual([true, false, true]) expect(reader.offset).toBe(1) }) - it('reads INT32 values correctly', () => { + it('reads INT32 values', () => { const view = new DataView(new ArrayBuffer(4)) view.setInt32(0, 123456789, true) // little-endian const reader = { view, offset: 0 } @@ -21,7 +21,7 @@ describe('readPlain', () => { expect(reader.offset).toBe(4) }) - it('reads INT64 values correctly', () => { + it('reads INT64 values', () => { const view = new DataView(new ArrayBuffer(8)) view.setBigInt64(0, BigInt('1234567890123456789'), true) const reader = { view, offset: 0 } @@ -30,11 +30,11 @@ describe('readPlain', () => { expect(reader.offset).toBe(8) }) - it('reads INT96 values correctly', () => { + it('reads INT96 values', () => { const buffer = new ArrayBuffer(12) const view = new DataView(buffer) - // Example INT96 value split into 64-bit low part and 32-bit high part + // INT96 value split into 64-bit low part and 32-bit high part const low = BigInt('0x0123456789ABCDEF') const high = 0x02345678 view.setBigInt64(0, low, true) @@ -46,7 +46,7 @@ describe('readPlain', () => { expect(reader.offset).toBe(12) }) - it('reads FLOAT values correctly', () => { + it('reads FLOAT values', () => { const view = new DataView(new ArrayBuffer(4)) view.setFloat32(0, 1234.5, true) // little-endian const reader = { view, offset: 0 } @@ -55,7 +55,7 @@ describe('readPlain', () => { expect(reader.offset).toBe(4) }) - it('reads DOUBLE values correctly', () => { + it('reads DOUBLE values', () => { const view = new DataView(new ArrayBuffer(8)) view.setFloat64(0, 12345.6789, true) // little-endian const reader = { view, offset: 0 } @@ -64,10 +64,10 @@ describe('readPlain', () => { expect(reader.offset).toBe(8) }) - it('reads BYTE_ARRAY values correctly', () => { + it('reads BYTE_ARRAY values', () => { const view = new DataView(new ArrayBuffer(10)) - view.setInt32(0, 3, true) // length of the first byte array - view.setUint8(4, 1) // first byte array data + view.setInt32(0, 3, true) // length 3 + view.setUint8(4, 1) view.setUint8(5, 2) view.setUint8(6, 3) const reader = { view, offset: 0 } @@ -76,7 +76,19 @@ describe('readPlain', () => { expect(reader.offset).toBe(7) }) - it('reads FIXED_LEN_BYTE_ARRAY values correctly', () => { + it('reads BYTE_ARRAY values as strings', () => { + const view = new DataView(new ArrayBuffer(10)) + view.setInt32(0, 3, true) // length 3 + view.setUint8(4, 65) + view.setUint8(5, 66) + view.setUint8(6, 67) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'BYTE_ARRAY', 1, true) + expect(result).toEqual(['ABC']) + expect(reader.offset).toBe(7) + }) + + it('reads FIXED_LEN_BYTE_ARRAY values', () => { const fixedLength = 3 const view = new DataView(new ArrayBuffer(fixedLength)) view.setUint8(0, 4)