mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-04 02:36:36 +00:00
Faster decimal conversion
This commit is contained in:
parent
c83aa2ea5b
commit
9f95eff222
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@ -23,4 +23,4 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- run: npm i
|
||||
- run: npm test
|
||||
- run: npm run coverage
|
||||
|
||||
@ -27,8 +27,8 @@
|
||||
"typecheck": "tsc"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "20.12.11",
|
||||
"@typescript-eslint/eslint-plugin": "7.8.0",
|
||||
"@types/node": "20.12.12",
|
||||
"@typescript-eslint/eslint-plugin": "7.9.0",
|
||||
"@vitest/coverage-v8": "1.6.0",
|
||||
"eslint": "8.57.0",
|
||||
"eslint-plugin-import": "2.29.1",
|
||||
|
||||
@ -60,6 +60,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
|
||||
)
|
||||
const { definitionLevels, repetitionLevels, dataPage } = readDataPage(page, daph, schemaPath, columnMetadata)
|
||||
valuesSeen += daph.num_values
|
||||
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
|
||||
|
||||
// construct output values: skip nulls and construct lists
|
||||
if (repetitionLevels.length) {
|
||||
@ -83,10 +84,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
|
||||
dereferenceDictionary(dictionary, dataPage)
|
||||
values = convert(dataPage, element)
|
||||
}
|
||||
|
||||
// TODO: check that we are at the end of the page
|
||||
// values.length !== daph.num_values isn't right. In cases like arrays,
|
||||
// you need the total number of children, not the number of top-level values.
|
||||
// assert(BigInt(values.length) === rowGroup.num_rows)
|
||||
|
||||
concat(rowData, values)
|
||||
} else if (header.type === 'DICTIONARY_PAGE') {
|
||||
|
||||
@ -12,29 +12,34 @@ export function convert(data, schemaElement) {
|
||||
const ctype = schemaElement.converted_type
|
||||
if (ctype === 'UTF8') {
|
||||
const decoder = new TextDecoder()
|
||||
return data.map(v => v && decoder.decode(v))
|
||||
const arr = new Array(data.length)
|
||||
for (let i = 0; i < arr.length; i++) {
|
||||
arr[i] = data[i] && decoder.decode(data[i])
|
||||
}
|
||||
return arr
|
||||
}
|
||||
if (ctype === 'DECIMAL') {
|
||||
const scale = schemaElement.scale || 0
|
||||
const factor = Math.pow(10, -scale)
|
||||
if (typeof data[0] === 'number') {
|
||||
if (factor === 1) return data
|
||||
return Array.from(data).map(v => v * factor)
|
||||
} else if (typeof data[0] === 'bigint') {
|
||||
if (factor === 1) return data
|
||||
return Array.from(data).map(v => Number(v) * factor)
|
||||
} else {
|
||||
return Array.from(data).map(v => parseDecimal(v) * factor)
|
||||
const arr = new Array(data.length)
|
||||
for (let i = 0; i < arr.length; i++) {
|
||||
if (data[0] instanceof Uint8Array) {
|
||||
arr[i] = parseDecimal(data[i]) * factor
|
||||
} else {
|
||||
arr[i] = Number(data[i]) * factor
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ctype === 'DATE') {
|
||||
return Array.from(data).map(v => new Date(v * dayMillis))
|
||||
return arr
|
||||
}
|
||||
if (ctype === undefined && schemaElement.type === 'INT96') {
|
||||
return Array.from(data).map(parseInt96Date)
|
||||
}
|
||||
if (ctype === 'TIME_MILLIS') {
|
||||
return Array.from(data).map(v => new Date(v))
|
||||
if (ctype === 'DATE') {
|
||||
const arr = new Array(data.length)
|
||||
for (let i = 0; i < arr.length; i++) {
|
||||
arr[i] = new Date(data[i] * dayMillis)
|
||||
}
|
||||
return arr
|
||||
}
|
||||
if (ctype === 'JSON') {
|
||||
return data.map(v => JSON.parse(v))
|
||||
@ -45,10 +50,12 @@ export function convert(data, schemaElement) {
|
||||
if (ctype === 'INTERVAL') {
|
||||
throw new Error('parquet interval not supported')
|
||||
}
|
||||
// TODO: ctype UINT
|
||||
const logicalType = schemaElement.logical_type?.type
|
||||
if (logicalType === 'FLOAT16') {
|
||||
return Array.from(data).map(parseFloat16)
|
||||
}
|
||||
// TODO: logical types
|
||||
return data
|
||||
}
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ describe('convert function', () => {
|
||||
const data = [BigInt(1000), BigInt(2000)]
|
||||
/** @type {SchemaElement} */
|
||||
const schemaElement = { name, converted_type: 'DECIMAL' }
|
||||
expect(convert(data, schemaElement)).toEqual([1000n, 2000n])
|
||||
expect(convert(data, schemaElement)).toEqual([1000, 2000])
|
||||
})
|
||||
|
||||
it('converts bigint to DECIMAL with scale', () => {
|
||||
@ -62,14 +62,6 @@ describe('convert function', () => {
|
||||
expect(convert(data, schemaElement)).toEqual([new Date(86400000), new Date(86400000 * 2)])
|
||||
})
|
||||
|
||||
it('converts milliseconds to TIME_MILLIS', () => {
|
||||
const now = Date.now()
|
||||
const data = [now]
|
||||
/** @type {SchemaElement} */
|
||||
const schemaElement = { name, converted_type: 'TIME_MILLIS' }
|
||||
expect(convert(data, schemaElement)).toEqual([new Date(now)])
|
||||
})
|
||||
|
||||
it('converts INT96 to DATE', () => {
|
||||
// from alltypes_plain.parquet
|
||||
const data = [45284764452596988585705472n, 45284764452597048585705472n]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user