diff --git a/.eslintrc.json b/.eslintrc.json index 766a59b..7a7655b 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -4,6 +4,7 @@ "eslint:recommended", "plugin:@typescript-eslint/recommended" ], + "ignorePatterns": ["dist/"], "plugins": ["import", "jsdoc"], "rules": { "@typescript-eslint/no-explicit-any": "warn", diff --git a/README.md b/README.md index 2be0c53..6bc7123 100644 --- a/README.md +++ b/README.md @@ -8,3 +8,11 @@ JavaScript parser for [Apache Parquet](https://parquet.apache.org) files. Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. + +## References + + - https://github.com/apache/parquet-format + - https://github.com/dask/fastparquet + - https://github.com/apache/thrift + - https://github.com/google/snappy + - https://github.com/zhipeng-jia/snappyjs diff --git a/src/thrift.ts b/src/thrift.ts index 3a0d25f..a3e8bb6 100644 --- a/src/thrift.ts +++ b/src/thrift.ts @@ -1,3 +1,12 @@ +/** + * Represents a decoded value, and includes the number of bytes read. + * This is used to read data from the file and advance a virtual file pointer. 
+ */ +interface Decoded<T> { + value: T + byteLength: number +} + // TCompactProtocol types const CompactType = { STOP: 0, @@ -19,16 +28,16 @@ const CompactType = { /** * Parse TCompactProtocol */ -export function deserializeTCompactProtocol(buffer: ArrayBuffer): [number, Record<string, any>] { +export function deserializeTCompactProtocol(buffer: ArrayBuffer): Decoded<Record<string, any>> { const view = new DataView(buffer) - let index = 0 + let byteLength = 0 let lastFid = 0 - const result: Record<string, any> = {} + const value: Record<string, any> = {} - while (index < buffer.byteLength) { + while (byteLength < buffer.byteLength) { // Parse each field based on its type and add to the result object - const [type, fid, newIndex, newLastFid] = readFieldBegin(view, index, lastFid) - index = newIndex + const [type, fid, newIndex, newLastFid] = readFieldBegin(view, byteLength, lastFid) + byteLength = newIndex lastFid = newLastFid if (type === CompactType.STOP) { @@ -37,11 +46,11 @@ export function deserializeTCompactProtocol(buffer: ArrayBuffer): [number, Recor // Handle the field based on its type let fieldValue - [fieldValue, index] = readElement(view, type, index) - result[`field_${fid}`] = fieldValue + [fieldValue, byteLength] = readElement(view, type, byteLength) + value[`field_${fid}`] = fieldValue } - return [ index, result ] + return { value, byteLength } } /** diff --git a/test/thrift.test.ts b/test/thrift.test.ts index 80b0333..ec51136 100644 --- a/test/thrift.test.ts +++ b/test/thrift.test.ts @@ -63,18 +63,18 @@ describe('deserializeTCompactProtocol function', () => { // Mark the end of the structure view.setUint8(index, 0x00) // STOP field - const [bufferLength, result] = deserializeTCompactProtocol(buffer) - expect(bufferLength).toBe(index + 1) + const { byteLength, value } = deserializeTCompactProtocol(buffer) + expect(byteLength).toBe(index + 1) // Assertions for each basic type - expect(result.field_1).toBe(true) // TRUE - expect(result.field_2).toBe(false) // FALSE - expect(result.field_3).toBe(0x7f) // BYTE - 
expect(result.field_4).toBe(0x7fff) // I16 - expect(result.field_5).toBe(0x7fffffff) // I32 - expect(result.field_6).toBe(BigInt('0x7fffffffffffffff')) // I64 - expect(result.field_7).toBeCloseTo(123.456) // DOUBLE - expect(result.field_8).toBe('Hello, Thrift!') // STRING + expect(value.field_1).toBe(true) // TRUE + expect(value.field_2).toBe(false) // FALSE + expect(value.field_3).toBe(0x7f) // BYTE + expect(value.field_4).toBe(0x7fff) // I16 + expect(value.field_5).toBe(0x7fffffff) // I32 + expect(value.field_6).toBe(BigInt('0x7fffffffffffffff')) // I64 + expect(value.field_7).toBeCloseTo(123.456) // DOUBLE + expect(value.field_8).toBe('Hello, Thrift!') // STRING }) })