diff --git a/package.json b/package.json index 25fc21e..98230a9 100644 --- a/package.json +++ b/package.json @@ -54,10 +54,10 @@ "test": "vitest run" }, "devDependencies": { - "@types/node": "24.5.1", + "@types/node": "24.5.2", "@vitest/coverage-v8": "3.2.4", - "eslint": "9.35.0", - "eslint-plugin-jsdoc": "58.1.0", + "eslint": "9.36.0", + "eslint-plugin-jsdoc": "60.4.0", "hyparquet-compressors": "1.1.1", "typescript": "5.9.2", "vitest": "3.2.4" diff --git a/src/convert.js b/src/convert.js index 271d52f..927741c 100644 --- a/src/convert.js +++ b/src/convert.js @@ -1,7 +1,9 @@ /** - * @import {ColumnDecoder, DecodedArray, Encoding, ParquetParsers, SchemaElement} from '../src/types.d.ts' + * @import {ColumnDecoder, DecodedArray, Encoding, ParquetParsers} from '../src/types.d.ts' */ +const decoder = new TextDecoder() + /** * Default type parsers when no custom ones are given * @type ParquetParsers @@ -20,6 +22,9 @@ export const DEFAULT_PARSERS = { const dayInMillis = 86400000 return new Date(days * dayInMillis) }, + stringFromBytes(bytes) { + return bytes && decoder.decode(bytes) + }, } /** @@ -109,10 +114,10 @@ export function convert(data, columnDecoder) { throw new Error('parquet interval not supported') } if (ctype === 'UTF8' || ltype?.type === 'STRING' || utf8 && type === 'BYTE_ARRAY') { - const decoder = new TextDecoder() const arr = new Array(data.length) for (let i = 0; i < arr.length; i++) { - arr[i] = data[i] && decoder.decode(data[i]) + const value = data[i] + arr[i] = value instanceof Uint8Array ? parsers.stringFromBytes(value) : value } return arr } diff --git a/src/metadata.js b/src/metadata.js index 55cd6f3..c70f8d9 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -279,7 +279,7 @@ export function convertMetadata(value, schema, parsers) { const { type, converted_type, logical_type } = schema if (value === undefined) return value if (type === 'BOOLEAN') return value[0] === 1 - if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value) + if (type === 'BYTE_ARRAY') return parsers.stringFromBytes(value) const view = new DataView(value.buffer, value.byteOffset, value.byteLength) if (type === 'FLOAT' && view.byteLength === 4) return view.getFloat32(0, true) if (type === 'DOUBLE' && view.byteLength === 8) return view.getFloat64(0, true) diff --git a/src/types.d.ts b/src/types.d.ts index ea2b446..23a7b45 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -2,10 +2,11 @@ * Custom parsers for columns */ export interface ParquetParsers { - timestampFromMilliseconds(millis: bigint): any; - timestampFromMicroseconds(micros: bigint): any; - timestampFromNanoseconds(nanos: bigint): any; - dateFromDays(days: number): any; + timestampFromMilliseconds(millis: bigint): any + timestampFromMicroseconds(micros: bigint): any + timestampFromNanoseconds(nanos: bigint): any + dateFromDays(days: number): any + stringFromBytes(bytes: Uint8Array): any } /** diff --git a/test/convert.test.js b/test/convert.test.js index b2b3437..00cddcd 100644 --- a/test/convert.test.js +++ b/test/convert.test.js @@ -225,6 +225,24 @@ describe('convert function', () => { expect(convert(data, columnParser)).toEqual([ 12358656, 12358656 ]) }) + + it('respects custom parsers - stringFromBytes', () => { + const encoder = new TextEncoder() + const data = [encoder.encode('foo'), undefined] + /** @type {SchemaElement} */ + const element = { name, converted_type: 'UTF8' } + const columnParser = { + element, + parsers: { + ...parsers, + stringFromBytes(/** @type {Uint8Array} */ bytes) { + return `custom-${new TextDecoder().decode(bytes)}` + }, + }, + } + + expect(convert(data, columnParser)).toEqual(['custom-foo', undefined]) + }) }) describe('parseFloat16', () => {