mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-05 22:41:55 +00:00
Fix handling of signed decimals (#60)
This commit is contained in:
parent
5675560266
commit
36d8ea2e1d
@ -58,6 +58,7 @@ export default [
|
||||
object: true,
|
||||
array: false,
|
||||
}],
|
||||
'prefer-exponentiation-operator': 'error',
|
||||
'prefer-promise-reject-errors': 'error',
|
||||
quotes: ['error', 'single'],
|
||||
'require-await': 'warn',
|
||||
|
||||
@ -41,7 +41,7 @@ export function convert(data, schemaElement, utf8 = true) {
|
||||
const ctype = schemaElement.converted_type
|
||||
if (ctype === 'DECIMAL') {
|
||||
const scale = schemaElement.scale || 0
|
||||
const factor = Math.pow(10, -scale)
|
||||
const factor = 10 ** -scale
|
||||
const arr = new Array(data.length)
|
||||
for (let i = 0; i < arr.length; i++) {
|
||||
if (data[0] instanceof Uint8Array) {
|
||||
@ -123,11 +123,17 @@ export function convert(data, schemaElement, utf8 = true) {
|
||||
* @returns {number}
|
||||
*/
|
||||
export function parseDecimal(bytes) {
  // Decodes a big-endian two's-complement integer (the unscaled DECIMAL value)
  // into a JS number. Scaling by 10 ** -scale is left to the caller.
  if (bytes.length === 0) return 0

  // Sign-extend the most significant byte up front instead of subtracting
  // 2 ** bits at the end: for encodings wider than 53 bits the unsigned
  // magnitude cannot be represented exactly in a double, so the late
  // subtraction turned small negatives (e.g. 8- or 16-byte -1) into 0.
  // Multiplication is used rather than `<< 8` because bit shifts truncate
  // to 32 bits.
  let value = bytes[0] >= 0x80 ? bytes[0] - 0x100 : bytes[0]
  for (let i = 1; i < bytes.length; i++) {
    value = value * 256 + bytes[i]
  }

  // Values whose magnitude exceeds Number.MAX_SAFE_INTEGER are still rounded
  // to the nearest representable double.
  return value
}
|
||||
|
||||
@ -152,7 +158,7 @@ export function parseFloat16(bytes) {
|
||||
const sign = int16 >> 15 ? -1 : 1
|
||||
const exp = int16 >> 10 & 0x1f
|
||||
const frac = int16 & 0x3ff
|
||||
if (exp === 0) return sign * Math.pow(2, -14) * (frac / 1024) // subnormals
|
||||
if (exp === 0) return sign * 2 ** -14 * (frac / 1024) // subnormals
|
||||
if (exp === 0x1f) return frac ? NaN : sign * Infinity
|
||||
return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
|
||||
return sign * 2 ** (exp - 15) * (1 + frac / 1024)
|
||||
}
|
||||
|
||||
@ -284,7 +284,7 @@ export function convertMetadata(value, schema) {
|
||||
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true)))
|
||||
if (type === 'INT32' && view.byteLength === 4) return view.getInt32(0, true)
|
||||
if (type === 'INT64' && view.byteLength === 8) return view.getBigInt64(0, true)
|
||||
if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0))
|
||||
if (converted_type === 'DECIMAL') return parseDecimal(value) * 10 ** -(schema.scale || 0)
|
||||
if (logical_type?.type === 'FLOAT16') return parseFloat16(value)
|
||||
if (type === 'FIXED_LEN_BYTE_ARRAY') return value
|
||||
// assert(false)
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { convert, parseFloat16 } from '../src/convert.js'
|
||||
import { convert, parseDecimal, parseFloat16 } from '../src/convert.js'
|
||||
|
||||
/**
|
||||
* @import {SchemaElement} from '../src/types.js'
|
||||
@ -71,6 +71,13 @@ describe('convert function', () => {
|
||||
expect(convert(data, schemaElement)).toEqual([100, 200])
|
||||
})
|
||||
|
||||
it('converts byte array from issue #59 to DECIMAL', () => {
|
||||
const data = [new Uint8Array([18, 83, 137, 151, 156, 0])]
|
||||
/** @type {SchemaElement} */
|
||||
const schemaElement = { name, converted_type: 'DECIMAL', scale: 10, precision: 14 }
|
||||
expect(convert(data, schemaElement)).toEqual([2015])
|
||||
})
|
||||
|
||||
it('converts epoch time to DATE', () => {
|
||||
const data = [1, 2] // days since epoch
|
||||
/** @type {SchemaElement} */
|
||||
@ -180,6 +187,33 @@ describe('parseFloat16', () => {
|
||||
|
||||
it('convert float16 subnormal number', () => {
|
||||
expect(parseFloat16(new Uint8Array([0xff, 0x03])))
|
||||
.toBeCloseTo(Math.pow(2, -14) * (1023 / 1024), 5)
|
||||
.toBeCloseTo(2 ** -14 * (1023 / 1024), 5)
|
||||
})
|
||||
})
|
||||
|
||||
describe('parseDecimal', () => {
|
||||
it('should return 0 for an empty Uint8Array', () => {
|
||||
const result = parseDecimal(new Uint8Array())
|
||||
expect(result).toBe(0)
|
||||
})
|
||||
|
||||
it('should parse a single byte', () => {
|
||||
const result = parseDecimal(new Uint8Array([42]))
|
||||
expect(result).toBe(42)
|
||||
})
|
||||
|
||||
it('should parse two bytes in big-endian order', () => {
|
||||
const result = parseDecimal(new Uint8Array([1, 0]))
|
||||
expect(result).toBe(256)
|
||||
})
|
||||
|
||||
it('should parse three bytes', () => {
|
||||
const result = parseDecimal(new Uint8Array([1, 2, 3]))
|
||||
expect(result).toBe(66051)
|
||||
})
|
||||
|
||||
it('should parse -1 as a 32-bit number', () => {
|
||||
const result = parseDecimal(new Uint8Array([255, 255, 255, 255]))
|
||||
expect(result).toBe(-1)
|
||||
})
|
||||
})
|
||||
|
||||
22
test/files/decimal-column.json
Normal file
22
test/files/decimal-column.json
Normal file
@ -0,0 +1,22 @@
|
||||
[
|
||||
[
|
||||
40,
|
||||
2015
|
||||
],
|
||||
[
|
||||
74,
|
||||
2015
|
||||
],
|
||||
[
|
||||
140,
|
||||
2015
|
||||
],
|
||||
[
|
||||
152,
|
||||
2015
|
||||
],
|
||||
[
|
||||
190,
|
||||
2015
|
||||
]
|
||||
]
|
||||
131
test/files/decimal-column.metadata.json
Normal file
131
test/files/decimal-column.metadata.json
Normal file
@ -0,0 +1,131 @@
|
||||
{
|
||||
"version": 2,
|
||||
"schema": [
|
||||
{
|
||||
"repetition_type": "REQUIRED",
|
||||
"name": "schema",
|
||||
"num_children": 2
|
||||
},
|
||||
{
|
||||
"type": "INT64",
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "mid"
|
||||
},
|
||||
{
|
||||
"type": "FIXED_LEN_BYTE_ARRAY",
|
||||
"type_length": 6,
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "value",
|
||||
"converted_type": "DECIMAL",
|
||||
"scale": 10,
|
||||
"precision": 14,
|
||||
"logical_type": {
|
||||
"type": "DECIMAL",
|
||||
"scale": 10,
|
||||
"precision": 14
|
||||
}
|
||||
}
|
||||
],
|
||||
"num_rows": 5,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 0,
|
||||
"meta_data": {
|
||||
"type": "INT64",
|
||||
"encodings": [
|
||||
"PLAIN",
|
||||
"RLE",
|
||||
"RLE_DICTIONARY"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"mid"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 5,
|
||||
"total_uncompressed_size": 126,
|
||||
"total_compressed_size": 120,
|
||||
"data_page_offset": 50,
|
||||
"dictionary_page_offset": 4,
|
||||
"statistics": {
|
||||
"max": 190,
|
||||
"min": 40,
|
||||
"null_count": 0,
|
||||
"max_value": 190,
|
||||
"min_value": 40
|
||||
},
|
||||
"encoding_stats": [
|
||||
{
|
||||
"page_type": "DICTIONARY_PAGE",
|
||||
"encoding": "PLAIN",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"page_type": "DATA_PAGE",
|
||||
"encoding": "RLE_DICTIONARY",
|
||||
"count": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 0,
|
||||
"meta_data": {
|
||||
"type": "FIXED_LEN_BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN",
|
||||
"RLE",
|
||||
"RLE_DICTIONARY"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"value"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 5,
|
||||
"total_uncompressed_size": 82,
|
||||
"total_compressed_size": 86,
|
||||
"data_page_offset": 146,
|
||||
"dictionary_page_offset": 124,
|
||||
"statistics": {
|
||||
"max": 2015,
|
||||
"min": 2015,
|
||||
"null_count": 0,
|
||||
"max_value": 2015,
|
||||
"min_value": 2015
|
||||
},
|
||||
"encoding_stats": [
|
||||
{
|
||||
"page_type": "DICTIONARY_PAGE",
|
||||
"encoding": "PLAIN",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"page_type": "DATA_PAGE",
|
||||
"encoding": "RLE_DICTIONARY",
|
||||
"count": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 208,
|
||||
"num_rows": 5,
|
||||
"file_offset": 4,
|
||||
"total_compressed_size": 206,
|
||||
"ordinal": 0
|
||||
}
|
||||
],
|
||||
"key_value_metadata": [
|
||||
{
|
||||
"key": "pandas",
|
||||
"value": "{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"stop\": 5, \"step\": 1}], \"column_indexes\": [{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\", \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}], \"columns\": [{\"name\": \"mid\", \"field_name\": \"mid\", \"pandas_type\": \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}, {\"name\": \"value\", \"field_name\": \"value\", \"pandas_type\": \"decimal\", \"numpy_type\": \"object\", \"metadata\": {\"precision\": 14, \"scale\": 10}}], \"creator\": {\"library\": \"pyarrow\", \"version\": \"19.0.0\"}, \"pandas_version\": \"2.2.3\"}"
|
||||
},
|
||||
{
|
||||
"key": "ARROW:schema",
|
||||
"value": "/////xgDAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAGACAAAEAAAAAQAAAAQAAACA/f//QAIAAAQAAAAyAgAAeyJpbmRleF9jb2x1bW5zIjogW3sia2luZCI6ICJyYW5nZSIsICJuYW1lIjogbnVsbCwgInN0YXJ0IjogMCwgInN0b3AiOiA1LCAic3RlcCI6IDF9XSwgImNvbHVtbl9pbmRleGVzIjogW3sibmFtZSI6IG51bGwsICJmaWVsZF9uYW1lIjogbnVsbCwgInBhbmRhc190eXBlIjogInVuaWNvZGUiLCAibnVtcHlfdHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiB7ImVuY29kaW5nIjogIlVURi04In19XSwgImNvbHVtbnMiOiBbeyJuYW1lIjogIm1pZCIsICJmaWVsZF9uYW1lIjogIm1pZCIsICJwYW5kYXNfdHlwZSI6ICJpbnQ2NCIsICJudW1weV90eXBlIjogImludDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJ2YWx1ZSIsICJmaWVsZF9uYW1lIjogInZhbHVlIiwgInBhbmRhc190eXBlIjogImRlY2ltYWwiLCAibnVtcHlfdHlwZSI6ICJvYmplY3QiLCAibWV0YWRhdGEiOiB7InByZWNpc2lvbiI6IDE0LCAic2NhbGUiOiAxMH19XSwgImNyZWF0b3IiOiB7ImxpYnJhcnkiOiAicHlhcnJvdyIsICJ2ZXJzaW9uIjogIjE5LjAuMCJ9LCAicGFuZGFzX3ZlcnNpb24iOiAiMi4yLjMifQAABgAAAHBhbmRhcwAAAgAAAFAAAAAEAAAAyP///wAAAQcQAAAAIAAAAAQAAAAAAAAABQAAAHZhbHVlAAAACAAMAAQACAAIAAAADgAAAAoAAAAQABQACAAGAAcADAAAABAAEAAAAAAAAQIQAAAAHAAAAAQAAAAAAAAAAwAAAG1pZAAIAAwACAAHAAgAAAAAAAABQAAAAAAAAAA="
|
||||
}
|
||||
],
|
||||
"created_by": "parquet-cpp-arrow version 19.0.0",
|
||||
"metadata_length": 1959
|
||||
}
|
||||
BIN
test/files/decimal-column.parquet
Normal file
BIN
test/files/decimal-column.parquet
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user