Convert unsigned types

This commit is contained in:
Kenny Daniel 2025-04-14 22:07:12 -07:00
parent 447a58eca4
commit 9a04cbccd3
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
9 changed files with 438 additions and 234 deletions

@ -47,13 +47,13 @@
"test": "vitest run"
},
"devDependencies": {
"@types/node": "22.14.0",
"@types/node": "22.14.1",
"@vitest/coverage-v8": "3.1.1",
"eslint": "9.24.0",
"eslint-plugin-jsdoc": "50.6.9",
"hyparquet-compressors": "1.1.1",
"typescript": "5.8.3",
"typescript-eslint": "8.29.1",
"typescript-eslint": "8.30.1",
"vitest": "3.1.1"
}
}

@ -36,7 +36,7 @@ export function convertWithDictionary(data, dictionary, schemaElement, encoding,
* @returns {DecodedArray} series of rich types
*/
export function convert(data, schemaElement, utf8 = true) {
const ctype = schemaElement.converted_type
const { type, converted_type: ctype, logical_type: ltype } = schemaElement
if (ctype === 'DECIMAL') {
const scale = schemaElement.scale || 0
const factor = 10 ** -scale
@ -50,7 +50,7 @@ export function convert(data, schemaElement, utf8 = true) {
}
return arr
}
if (ctype === undefined && schemaElement.type === 'INT96') {
if (!ctype && type === 'INT96') {
return Array.from(data).map(parseInt96Date)
}
if (ctype === 'DATE') {
@ -84,7 +84,7 @@ export function convert(data, schemaElement, utf8 = true) {
if (ctype === 'INTERVAL') {
throw new Error('parquet interval not supported')
}
if (ctype === 'UTF8' || utf8 && schemaElement.type === 'BYTE_ARRAY') {
if (ctype === 'UTF8' || utf8 && type === 'BYTE_ARRAY') {
const decoder = new TextDecoder()
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
@ -92,18 +92,27 @@ export function convert(data, schemaElement, utf8 = true) {
}
return arr
}
if (ctype === 'UINT_64') {
const arr = new BigUint64Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = BigInt(data[i])
if (ctype === 'UINT_64' || ltype?.type === 'INTEGER' && ltype.bitWidth === 64 && !ltype.isSigned) {
if (data instanceof BigInt64Array) {
return new BigUint64Array(data.buffer, data.byteOffset, data.length)
}
const arr = new BigUint64Array(data.length)
for (let i = 0; i < arr.length; i++) arr[i] = BigInt(data[i])
return arr
}
if (schemaElement.logical_type?.type === 'FLOAT16') {
if (ctype === 'UINT_32' || ltype?.type === 'INTEGER' && ltype.bitWidth === 32 && !ltype.isSigned) {
if (data instanceof Int32Array) {
return new Uint32Array(data.buffer, data.byteOffset, data.length)
}
const arr = new Uint32Array(data.length)
for (let i = 0; i < arr.length; i++) arr[i] = data[i]
return arr
}
if (ltype?.type === 'FLOAT16') {
return Array.from(data).map(parseFloat16)
}
if (schemaElement.logical_type?.type === 'TIMESTAMP') {
const { unit } = schemaElement.logical_type
if (ltype?.type === 'TIMESTAMP') {
const { unit } = ltype
let factor = 1n
if (unit === 'MICROS') factor = 1000n
if (unit === 'NANOS') factor = 1000000n

1
src/types.d.ts vendored

@ -344,6 +344,7 @@ interface DataPage {
export type DecodedArray =
Uint8Array |
Uint32Array |
Int32Array |
BigInt64Array |
BigUint64Array |

6
test/files/signs.json Normal file

@ -0,0 +1,6 @@
[
[0, 0, 0, 0, -128, -32768, -2147483648, -9223372036854775808],
[127, 32767, 2147483647, 9223372036854775807, -1, -1, -1, -1],
[128, 32768, 2147483648, 9223372036854775808, 0, 0, 0, 0],
[255, 65535, 4294967295, 18446744073709551615, 127, 32767, 2147483647, 9223372036854775807]
]

@ -0,0 +1,410 @@
{
"version": 2,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "schema",
"num_children": 8
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "unsigned_int8",
"converted_type": "UINT_8",
"logical_type": {
"type": "INTEGER",
"bitWidth": 8,
"isSigned": false
}
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "unsigned_int16",
"converted_type": "UINT_16",
"logical_type": {
"type": "INTEGER",
"bitWidth": 16,
"isSigned": false
}
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "unsigned_int32",
"converted_type": "UINT_32",
"logical_type": {
"type": "INTEGER",
"bitWidth": 32,
"isSigned": false
}
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "unsigned_int64",
"converted_type": "UINT_64",
"logical_type": {
"type": "INTEGER",
"bitWidth": 64,
"isSigned": false
}
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "signed_int8",
"converted_type": "INT_8",
"logical_type": {
"type": "INTEGER",
"bitWidth": 8,
"isSigned": true
}
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "signed_int16",
"converted_type": "INT_16",
"logical_type": {
"type": "INTEGER",
"bitWidth": 16,
"isSigned": true
}
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "signed_int32"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "signed_int64"
}
],
"num_rows": 4,
"row_groups": [
{
"columns": [
{
"file_offset": 0,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"unsigned_int8"
],
"codec": "SNAPPY",
"num_values": 4,
"total_uncompressed_size": 73,
"total_compressed_size": 77,
"data_page_offset": 36,
"dictionary_page_offset": 4,
"statistics": {
"null_count": 0,
"max_value": 255,
"min_value": 0
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 0,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"unsigned_int16"
],
"codec": "SNAPPY",
"num_values": 4,
"total_uncompressed_size": 73,
"total_compressed_size": 77,
"data_page_offset": 113,
"dictionary_page_offset": 81,
"statistics": {
"null_count": 0,
"max_value": 65535,
"min_value": 0
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 0,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"unsigned_int32"
],
"codec": "SNAPPY",
"num_values": 4,
"total_uncompressed_size": 73,
"total_compressed_size": 77,
"data_page_offset": 190,
"dictionary_page_offset": 158,
"statistics": {
"null_count": 0,
"max_value": -1,
"min_value": 0
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 0,
"meta_data": {
"type": "INT64",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"unsigned_int64"
],
"codec": "SNAPPY",
"num_values": 4,
"total_uncompressed_size": 97,
"total_compressed_size": 90,
"data_page_offset": 272,
"dictionary_page_offset": 235,
"statistics": {
"null_count": 0,
"max_value": -1,
"min_value": 0
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 0,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"signed_int8"
],
"codec": "SNAPPY",
"num_values": 4,
"total_uncompressed_size": 85,
"total_compressed_size": 89,
"data_page_offset": 357,
"dictionary_page_offset": 325,
"statistics": {
"max": 127,
"min": -128,
"null_count": 0,
"max_value": 127,
"min_value": -128
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 0,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"signed_int16"
],
"codec": "SNAPPY",
"num_values": 4,
"total_uncompressed_size": 85,
"total_compressed_size": 89,
"data_page_offset": 446,
"dictionary_page_offset": 414,
"statistics": {
"max": 32767,
"min": -32768,
"null_count": 0,
"max_value": 32767,
"min_value": -32768
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 0,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"signed_int32"
],
"codec": "SNAPPY",
"num_values": 4,
"total_uncompressed_size": 85,
"total_compressed_size": 89,
"data_page_offset": 535,
"dictionary_page_offset": 503,
"statistics": {
"max": 2147483647,
"min": -2147483648,
"null_count": 0,
"max_value": 2147483647,
"min_value": -2147483648
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 0,
"meta_data": {
"type": "INT64",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"signed_int64"
],
"codec": "SNAPPY",
"num_values": 4,
"total_uncompressed_size": 117,
"total_compressed_size": 110,
"data_page_offset": 629,
"dictionary_page_offset": 592,
"statistics": {
"max": 9223372036854776000,
"min": -9223372036854776000,
"null_count": 0,
"max_value": 9223372036854776000,
"min_value": -9223372036854776000
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
}
],
"total_byte_size": 688,
"num_rows": 4,
"file_offset": 4,
"total_compressed_size": 698,
"ordinal": 0
}
],
"key_value_metadata": [
{
"key": "ARROW:schema",
"value": "/////xgCAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAgAAACwAQAAaAEAADABAAD4AAAAuAAAAHwAAABAAAAABAAAAID+//8AAAECEAAAACAAAAAEAAAAAAAAAAwAAABzaWduZWRfaW50NjQAAAAAXP///wAAAAFAAAAAuP7//wAAAQIQAAAAIAAAAAQAAAAAAAAADAAAAHNpZ25lZF9pbnQzMgAAAACU////AAAAASAAAADw/v//AAABAhAAAAAgAAAABAAAAAAAAAAMAAAAc2lnbmVkX2ludDE2AAAAAMz///8AAAABEAAAACj///8AAAECEAAAACQAAAAEAAAAAAAAAAsAAABzaWduZWRfaW50OAAIAAwACAAHAAgAAAAAAAABCAAAAGT///8AAAECEAAAACAAAAAEAAAAAAAAAA4AAAB1bnNpZ25lZF9pbnQ2NAAAVv///0AAAACY////AAABAhAAAAAgAAAABAAAAAAAAAAOAAAAdW5zaWduZWRfaW50MzIAAIr///8gAAAAzP///wAAAQIQAAAAIAAAAAQAAAAAAAAADgAAAHVuc2lnbmVkX2ludDE2AAC+////EAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAhAAAAAkAAAABAAAAAAAAAANAAAAdW5zaWduZWRfaW50OAAGAAgABAAGAAAACAAAAA=="
}
],
"created_by": "parquet-cpp-arrow version 19.0.1",
"metadata_length": 1733
}

BIN
test/files/signs.parquet Normal file

Binary file not shown.

@ -1,4 +0,0 @@
[
[0,0,0,0],
[255,65535,4294967295,18446744073709552000]
]

@ -1,218 +0,0 @@
{
"version": 1,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "schema",
"num_children": 4
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "utiny",
"converted_type": "UINT_8",
"logical_type": {
"type": "INTEGER",
"bitWidth": 8,
"isSigned": false
}
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "usmall",
"converted_type": "UINT_16",
"logical_type": {
"type": "INTEGER",
"bitWidth": 16,
"isSigned": false
}
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "uint"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "ubig",
"converted_type": "UINT_64",
"logical_type": {
"type": "INTEGER",
"bitWidth": 64,
"isSigned": false
}
}
],
"num_rows": 2,
"row_groups": [
{
"columns": [
{
"file_offset": 72,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN_DICTIONARY",
"PLAIN",
"RLE"
],
"path_in_schema": [
"utiny"
],
"codec": "SNAPPY",
"num_values": 2,
"total_uncompressed_size": 64,
"total_compressed_size": 68,
"data_page_offset": 28,
"dictionary_page_offset": 4,
"statistics": {
"null_count": 0,
"max_value": 255,
"min_value": 0
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN_DICTIONARY",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "PLAIN_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 207,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN_DICTIONARY",
"PLAIN",
"RLE"
],
"path_in_schema": [
"usmall"
],
"codec": "SNAPPY",
"num_values": 2,
"total_uncompressed_size": 64,
"total_compressed_size": 68,
"data_page_offset": 163,
"dictionary_page_offset": 139,
"statistics": {
"null_count": 0,
"max_value": 65535,
"min_value": 0
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN_DICTIONARY",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "PLAIN_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 381,
"meta_data": {
"type": "INT64",
"encodings": [
"PLAIN_DICTIONARY",
"PLAIN",
"RLE"
],
"path_in_schema": [
"uint"
],
"codec": "SNAPPY",
"num_values": 2,
"total_uncompressed_size": 100,
"total_compressed_size": 104,
"data_page_offset": 309,
"dictionary_page_offset": 277,
"statistics": {
"max": 4294967295,
"min": 0,
"null_count": 0,
"max_value": 4294967295,
"min_value": 0
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN_DICTIONARY",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "PLAIN_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 561,
"meta_data": {
"type": "INT64",
"encodings": [
"PLAIN_DICTIONARY",
"PLAIN",
"RLE"
],
"path_in_schema": [
"ubig"
],
"codec": "SNAPPY",
"num_values": 2,
"total_uncompressed_size": 80,
"total_compressed_size": 84,
"data_page_offset": 509,
"dictionary_page_offset": 477,
"statistics": {
"null_count": 0,
"max_value": -1,
"min_value": 0
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN_DICTIONARY",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "PLAIN_DICTIONARY",
"count": 1
}
]
}
}
],
"total_byte_size": 308,
"num_rows": 2,
"file_offset": 4,
"total_compressed_size": 324,
"ordinal": 0
}
],
"key_value_metadata": [
{
"key": "ARROW:schema",
"value": "/////wgBAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAQAAACkAAAAZAAAADQAAAAEAAAAfP///wAAAQIQAAAAGAAAAAQAAAAAAAAABAAAAHViaWcAAAAAbv///0AAAACo////AAABAhAAAAAYAAAABAAAAAAAAAAEAAAAdWludAAAAACa////IAAAANT///8AAAECEAAAABgAAAAEAAAAAAAAAAYAAAB1c21hbGwAAMb///8QAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAECEAAAABwAAAAEAAAAAAAAAAUAAAB1dGlueQAGAAgABAAGAAAACAAAAAAAAAA="
}
],
"created_by": "parquet-cpp-arrow version 6.0.1",
"metadata_length": 851
}

Binary file not shown.