Faster decimal conversion

This commit is contained in:
Kenny Daniel 2024-05-14 00:35:39 -07:00
parent c83aa2ea5b
commit 9f95eff222
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
5 changed files with 27 additions and 30 deletions

@ -23,4 +23,4 @@ jobs:
steps:
- uses: actions/checkout@v3
- run: npm i
- run: npm test
- run: npm run coverage

@ -27,8 +27,8 @@
"typecheck": "tsc"
},
"devDependencies": {
"@types/node": "20.12.11",
"@typescript-eslint/eslint-plugin": "7.8.0",
"@types/node": "20.12.12",
"@typescript-eslint/eslint-plugin": "7.9.0",
"@vitest/coverage-v8": "1.6.0",
"eslint": "8.57.0",
"eslint-plugin-import": "2.29.1",

@ -60,6 +60,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
)
const { definitionLevels, repetitionLevels, dataPage } = readDataPage(page, daph, schemaPath, columnMetadata)
valuesSeen += daph.num_values
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
// construct output values: skip nulls and construct lists
if (repetitionLevels.length) {
@ -83,10 +84,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
dereferenceDictionary(dictionary, dataPage)
values = convert(dataPage, element)
}
// TODO: check that we are at the end of the page
// values.length !== daph.num_values isn't right. In cases like arrays,
// you need the total number of children, not the number of top-level values.
// assert(BigInt(values.length) === rowGroup.num_rows)
concat(rowData, values)
} else if (header.type === 'DICTIONARY_PAGE') {

@ -12,29 +12,34 @@ export function convert(data, schemaElement) {
const ctype = schemaElement.converted_type
if (ctype === 'UTF8') {
const decoder = new TextDecoder()
return data.map(v => v && decoder.decode(v))
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = data[i] && decoder.decode(data[i])
}
return arr
}
if (ctype === 'DECIMAL') {
const scale = schemaElement.scale || 0
const factor = Math.pow(10, -scale)
if (typeof data[0] === 'number') {
if (factor === 1) return data
return Array.from(data).map(v => v * factor)
} else if (typeof data[0] === 'bigint') {
if (factor === 1) return data
return Array.from(data).map(v => Number(v) * factor)
} else {
return Array.from(data).map(v => parseDecimal(v) * factor)
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
if (data[0] instanceof Uint8Array) {
arr[i] = parseDecimal(data[i]) * factor
} else {
arr[i] = Number(data[i]) * factor
}
}
}
if (ctype === 'DATE') {
return Array.from(data).map(v => new Date(v * dayMillis))
return arr
}
if (ctype === undefined && schemaElement.type === 'INT96') {
return Array.from(data).map(parseInt96Date)
}
if (ctype === 'TIME_MILLIS') {
return Array.from(data).map(v => new Date(v))
if (ctype === 'DATE') {
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = new Date(data[i] * dayMillis)
}
return arr
}
if (ctype === 'JSON') {
return data.map(v => JSON.parse(v))
@ -45,10 +50,12 @@ export function convert(data, schemaElement) {
if (ctype === 'INTERVAL') {
throw new Error('parquet interval not supported')
}
// TODO: ctype UINT
const logicalType = schemaElement.logical_type?.type
if (logicalType === 'FLOAT16') {
return Array.from(data).map(parseFloat16)
}
// TODO: logical types
return data
}

@ -38,7 +38,7 @@ describe('convert function', () => {
const data = [BigInt(1000), BigInt(2000)]
/** @type {SchemaElement} */
const schemaElement = { name, converted_type: 'DECIMAL' }
expect(convert(data, schemaElement)).toEqual([1000n, 2000n])
expect(convert(data, schemaElement)).toEqual([1000, 2000])
})
it('converts bigint to DECIMAL with scale', () => {
@ -62,14 +62,6 @@ describe('convert function', () => {
expect(convert(data, schemaElement)).toEqual([new Date(86400000), new Date(86400000 * 2)])
})
it('converts milliseconds to TIME_MILLIS', () => {
const now = Date.now()
const data = [now]
/** @type {SchemaElement} */
const schemaElement = { name, converted_type: 'TIME_MILLIS' }
expect(convert(data, schemaElement)).toEqual([new Date(now)])
})
it('converts INT96 to DATE', () => {
// from alltypes_plain.parquet
const data = [45284764452596988585705472n, 45284764452597048585705472n]