Custom string parser option (#129)

This commit is contained in:
Kenny Daniel 2025-09-26 19:07:25 -07:00 committed by GitHub
parent 3c1fce4b3f
commit 8611663334
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 35 additions and 11 deletions

@ -54,10 +54,10 @@
"test": "vitest run"
},
"devDependencies": {
"@types/node": "24.5.1",
"@types/node": "24.5.2",
"@vitest/coverage-v8": "3.2.4",
"eslint": "9.35.0",
"eslint-plugin-jsdoc": "58.1.0",
"eslint": "9.36.0",
"eslint-plugin-jsdoc": "60.4.0",
"hyparquet-compressors": "1.1.1",
"typescript": "5.9.2",
"vitest": "3.2.4"

@ -1,7 +1,9 @@
/**
* @import {ColumnDecoder, DecodedArray, Encoding, ParquetParsers, SchemaElement} from '../src/types.d.ts'
* @import {ColumnDecoder, DecodedArray, Encoding, ParquetParsers} from '../src/types.d.ts'
*/
const decoder = new TextDecoder()
/**
* Default type parsers when no custom ones are given
* @type ParquetParsers
@ -20,6 +22,9 @@ export const DEFAULT_PARSERS = {
const dayInMillis = 86400000
return new Date(days * dayInMillis)
},
stringFromBytes(bytes) {
return bytes && decoder.decode(bytes)
},
}
/**
@ -109,10 +114,10 @@ export function convert(data, columnDecoder) {
throw new Error('parquet interval not supported')
}
if (ctype === 'UTF8' || ltype?.type === 'STRING' || utf8 && type === 'BYTE_ARRAY') {
const decoder = new TextDecoder()
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = data[i] && decoder.decode(data[i])
const value = data[i]
arr[i] = value instanceof Uint8Array ? parsers.stringFromBytes(value) : value
}
return arr
}

@ -279,7 +279,7 @@ export function convertMetadata(value, schema, parsers) {
const { type, converted_type, logical_type } = schema
if (value === undefined) return value
if (type === 'BOOLEAN') return value[0] === 1
if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value)
if (type === 'BYTE_ARRAY') return parsers.stringFromBytes(value)
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
if (type === 'FLOAT' && view.byteLength === 4) return view.getFloat32(0, true)
if (type === 'DOUBLE' && view.byteLength === 8) return view.getFloat64(0, true)

9
src/types.d.ts vendored

@ -2,10 +2,11 @@
* Custom parsers for columns
*/
export interface ParquetParsers {
timestampFromMilliseconds(millis: bigint): any;
timestampFromMicroseconds(micros: bigint): any;
timestampFromNanoseconds(nanos: bigint): any;
dateFromDays(days: number): any;
/** Parser for a timestamp given as a bigint millisecond count (presumably since the Unix epoch — confirm against the caller). */
timestampFromMilliseconds(millis: bigint): any
/** Parser for a timestamp given as a bigint microsecond count (presumably since the Unix epoch — confirm against the caller). */
timestampFromMicroseconds(micros: bigint): any
/** Parser for a timestamp given as a bigint nanosecond count (presumably since the Unix epoch — confirm against the caller). */
timestampFromNanoseconds(nanos: bigint): any
/** Parser for a date given as a number of days (presumably days since the Unix epoch — confirm against the default parser). */
dateFromDays(days: number): any
/**
 * Parser for a string value given as raw bytes.
 * The return type is `any`, so a custom parser is not restricted to returning a string.
 */
stringFromBytes(bytes: Uint8Array): any
}
/**

@ -225,6 +225,24 @@ describe('convert function', () => {
expect(convert(data, columnParser)).toEqual([ 12358656, 12358656 ])
})
it('respects custom parsers - stringFromBytes', () => {
  // One encoded value plus one missing value.
  const data = [new TextEncoder().encode('foo'), undefined]
  /** @type {SchemaElement} */
  const element = { name, converted_type: 'UTF8' }
  // Override only the string parser; every other parser comes from `parsers`.
  const customParsers = {
    ...parsers,
    stringFromBytes(/** @type {Uint8Array} */ bytes) {
      const text = new TextDecoder().decode(bytes)
      return `custom-${text}`
    },
  }
  const result = convert(data, { element, parsers: customParsers })
  // The encoded entry must go through the custom parser; the undefined entry stays undefined.
  expect(result).toEqual(['custom-foo', undefined])
})
})
describe('parseFloat16', () => {