From 9a04cbccd301f5d168cbacdd67378434cb97d964 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Mon, 14 Apr 2025 22:07:12 -0700 Subject: [PATCH] Convert unsigned types --- package.json | 4 +- src/convert.js | 29 ++- src/types.d.ts | 1 + test/files/signs.json | 6 + test/files/signs.metadata.json | 410 ++++++++++++++++++++++++++++++ test/files/signs.parquet | Bin 0 -> 2443 bytes test/files/unsigned.json | 4 - test/files/unsigned.metadata.json | 218 ---------------- test/files/unsigned.parquet | Bin 1496 -> 0 bytes 9 files changed, 438 insertions(+), 234 deletions(-) create mode 100644 test/files/signs.json create mode 100644 test/files/signs.metadata.json create mode 100644 test/files/signs.parquet delete mode 100644 test/files/unsigned.json delete mode 100644 test/files/unsigned.metadata.json delete mode 100644 test/files/unsigned.parquet diff --git a/package.json b/package.json index a5cc1c3..853fc02 100644 --- a/package.json +++ b/package.json @@ -47,13 +47,13 @@ "test": "vitest run" }, "devDependencies": { - "@types/node": "22.14.0", + "@types/node": "22.14.1", "@vitest/coverage-v8": "3.1.1", "eslint": "9.24.0", "eslint-plugin-jsdoc": "50.6.9", "hyparquet-compressors": "1.1.1", "typescript": "5.8.3", - "typescript-eslint": "8.29.1", + "typescript-eslint": "8.30.1", "vitest": "3.1.1" } } diff --git a/src/convert.js b/src/convert.js index 262c2ea..955f6c6 100644 --- a/src/convert.js +++ b/src/convert.js @@ -36,7 +36,7 @@ export function convertWithDictionary(data, dictionary, schemaElement, encoding, * @returns {DecodedArray} series of rich types */ export function convert(data, schemaElement, utf8 = true) { - const ctype = schemaElement.converted_type + const { type, converted_type: ctype, logical_type: ltype } = schemaElement if (ctype === 'DECIMAL') { const scale = schemaElement.scale || 0 const factor = 10 ** -scale @@ -50,7 +50,7 @@ export function convert(data, schemaElement, utf8 = true) { } return arr } - if (ctype === undefined && schemaElement.type === 'INT96') { + if (!ctype && type === 'INT96') { return Array.from(data).map(parseInt96Date) } if (ctype === 'DATE') { @@ -84,7 +84,7 @@ export function convert(data, schemaElement, utf8 = true) { if (ctype === 'INTERVAL') { throw new Error('parquet interval not supported') } - if (ctype === 'UTF8' || utf8 && schemaElement.type === 'BYTE_ARRAY') { + if (ctype === 'UTF8' || utf8 && type === 'BYTE_ARRAY') { const decoder = new TextDecoder() const arr = new Array(data.length) for (let i = 0; i < arr.length; i++) { @@ -92,18 +92,27 @@ export function convert(data, schemaElement, utf8 = true) { } return arr } - if (ctype === 'UINT_64') { - const arr = new BigUint64Array(data.length) - for (let i = 0; i < arr.length; i++) { - arr[i] = BigInt(data[i]) + if (ctype === 'UINT_64' || ltype?.type === 'INTEGER' && ltype.bitWidth === 64 && !ltype.isSigned) { + if (data instanceof BigInt64Array) { + return new BigUint64Array(data.buffer, data.byteOffset, data.length) } + const arr = new BigUint64Array(data.length) + for (let i = 0; i < arr.length; i++) arr[i] = BigInt(data[i]) return arr } - if (schemaElement.logical_type?.type === 'FLOAT16') { + if (ctype === 'UINT_32' || ltype?.type === 'INTEGER' && ltype.bitWidth === 32 && !ltype.isSigned) { + if (data instanceof Int32Array) { + return new Uint32Array(data.buffer, data.byteOffset, data.length) + } + const arr = new Uint32Array(data.length) + for (let i = 0; i < arr.length; i++) arr[i] = data[i] + return arr + } + if (ltype?.type === 'FLOAT16') { return Array.from(data).map(parseFloat16) } - if (schemaElement.logical_type?.type === 'TIMESTAMP') { - const { unit } = schemaElement.logical_type + if (ltype?.type === 'TIMESTAMP') { + const { unit } = ltype let factor = 1n if (unit === 'MICROS') factor = 1000n if (unit === 'NANOS') factor = 1000000n diff --git a/src/types.d.ts b/src/types.d.ts index 129c9a2..f5e44db 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -344,6 +344,7 @@ interface DataPage { export type DecodedArray = Uint8Array | + Uint32Array | Int32Array | BigInt64Array | BigUint64Array | diff --git a/test/files/signs.json b/test/files/signs.json new file mode 100644 index 0000000..2a1557b --- /dev/null +++ b/test/files/signs.json @@ -0,0 +1,6 @@ +[ + [0, 0, 0, 0, -128, -32768, -2147483648, -9223372036854775808], + [127, 32767, 2147483647, 9223372036854775807, -1, -1, -1, -1], + [128, 32768, 2147483648, 9223372036854775808, 0, 0, 0, 0], + [255, 65535, 4294967295, 18446744073709551615, 127, 32767, 2147483647, 9223372036854775807] +] diff --git a/test/files/signs.metadata.json b/test/files/signs.metadata.json new file mode 100644 index 0000000..9b5bcc0 --- /dev/null +++ b/test/files/signs.metadata.json @@ -0,0 +1,410 @@ +{ + "version": 2, + "schema": [ + { + "repetition_type": "REQUIRED", + "name": "schema", + "num_children": 8 + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "unsigned_int8", + "converted_type": "UINT_8", + "logical_type": { + "type": "INTEGER", + "bitWidth": 8, + "isSigned": false + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "unsigned_int16", + "converted_type": "UINT_16", + "logical_type": { + "type": "INTEGER", + "bitWidth": 16, + "isSigned": false + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "unsigned_int32", + "converted_type": "UINT_32", + "logical_type": { + "type": "INTEGER", + "bitWidth": 32, + "isSigned": false + } + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "unsigned_int64", + "converted_type": "UINT_64", + "logical_type": { + "type": "INTEGER", + "bitWidth": 64, + "isSigned": false + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "signed_int8", + "converted_type": "INT_8", + "logical_type": { + "type": "INTEGER", + "bitWidth": 8, + "isSigned": true + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "signed_int16", + "converted_type": "INT_16", + "logical_type": { + "type": "INTEGER", + "bitWidth": 16, + "isSigned": true + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "signed_int32" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "signed_int64" + } + ], + "num_rows": 4, + "row_groups": [ + { + "columns": [ + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "unsigned_int8" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 73, + "total_compressed_size": 77, + "data_page_offset": 36, + "dictionary_page_offset": 4, + "statistics": { + "null_count": 0, + "max_value": 255, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "unsigned_int16" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 73, + "total_compressed_size": 77, + "data_page_offset": 113, + "dictionary_page_offset": 81, + "statistics": { + "null_count": 0, + "max_value": 65535, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "unsigned_int32" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 73, + "total_compressed_size": 77, + "data_page_offset": 190, + "dictionary_page_offset": 158, + "statistics": { + "null_count": 0, + "max_value": -1, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "unsigned_int64" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 97, + "total_compressed_size": 90, + "data_page_offset": 272, + "dictionary_page_offset": 235, + "statistics": { + "null_count": 0, + "max_value": -1, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "signed_int8" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 85, + "total_compressed_size": 89, + "data_page_offset": 357, + "dictionary_page_offset": 325, + "statistics": { + "max": 127, + "min": -128, + "null_count": 0, + "max_value": 127, + "min_value": -128 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "signed_int16" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 85, + "total_compressed_size": 89, + "data_page_offset": 446, + "dictionary_page_offset": 414, + "statistics": { + "max": 32767, + "min": -32768, + "null_count": 0, + "max_value": 32767, + "min_value": -32768 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "signed_int32" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 85, + "total_compressed_size": 89, + "data_page_offset": 535, + "dictionary_page_offset": 503, + "statistics": { + "max": 2147483647, + "min": -2147483648, + "null_count": 0, + "max_value": 2147483647, + "min_value": -2147483648 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "signed_int64" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 117, + "total_compressed_size": 110, + "data_page_offset": 629, + "dictionary_page_offset": 592, + "statistics": { + "max": 9223372036854776000, + "min": -9223372036854776000, + "null_count": 0, + "max_value": 9223372036854776000, + "min_value": -9223372036854776000 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + } + ], + "total_byte_size": 688, + "num_rows": 4, + "file_offset": 4, + "total_compressed_size": 698, + "ordinal": 0 + } + ], + "key_value_metadata": [ + { + "key": "ARROW:schema", + "value": "/////xgCAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAgAAACwAQAAaAEAADABAAD4AAAAuAAAAHwAAABAAAAABAAAAID+//8AAAECEAAAACAAAAAEAAAAAAAAAAwAAABzaWduZWRfaW50NjQAAAAAXP///wAAAAFAAAAAuP7//wAAAQIQAAAAIAAAAAQAAAAAAAAADAAAAHNpZ25lZF9pbnQzMgAAAACU////AAAAASAAAADw/v//AAABAhAAAAAgAAAABAAAAAAAAAAMAAAAc2lnbmVkX2ludDE2AAAAAMz///8AAAABEAAAACj///8AAAECEAAAACQAAAAEAAAAAAAAAAsAAABzaWduZWRfaW50OAAIAAwACAAHAAgAAAAAAAABCAAAAGT///8AAAECEAAAACAAAAAEAAAAAAAAAA4AAAB1bnNpZ25lZF9pbnQ2NAAAVv///0AAAACY////AAABAhAAAAAgAAAABAAAAAAAAAAOAAAAdW5zaWduZWRfaW50MzIAAIr///8gAAAAzP///wAAAQIQAAAAIAAAAAQAAAAAAAAADgAAAHVuc2lnbmVkX2ludDE2AAC+////EAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAhAAAAAkAAAABAAAAAAAAAANAAAAdW5zaWduZWRfaW50OAAGAAgABAAGAAAACAAAAA==" + } + ], + "created_by": "parquet-cpp-arrow version 19.0.1", + "metadata_length": 1733 +} diff --git a/test/files/signs.parquet b/test/files/signs.parquet new file mode 100644 index 0000000000000000000000000000000000000000..100e64de1a7a7c8ef99e1f138d48ed8a84ca8979 GIT binary patch literal 2443 zcmb7GOK%cU6uv_#4z*a+&SWse%%V(8Y)helt+7cj&=z7J6t%S77+R1%DMBfRO}jQ- zxil^sm&Uj-F1s>u;}0+{To~i33s)|i;5m0@V1Vhvlg^zv559Z8@7|L%L6bhgD#$|L zw7?7G1|iM|1Z*Fk19)oi2t?==q<+Xd1)E@VhsYf(#w9B(XkC34Sb5IU@tc@|)iksk z{K^yz;+F@jsce=XeNlfngLWE$!c~UXB`bo zk$NkrWW}gt#X1qhY$2a);8$2wER1hIeCn$|#IvT*bGA^+SED+Myc=WEocz!t%%xd?DYn z_(2PlW>UeXzGkYRXuOE~Gu5iP{%7fpRk4L~yG{`}bFY_)09OG{4}h0wxT4-Gh^q(O z=x-DIQEv@?QKZw>A2dCMdO;n$^{X^CMkKkAE3~-x3eo3RXNKUPw|xkIJQiP z^7sB~4*uhLNE1o#AroY%yXwb}}5cI&U}?OuRM3 zsUdfyH9y-K^HN-eapVE_uMAi(RtDg)GCm}xGR$yr?6=!9I=)f)o{p7F31XF``HiR{ zsa@PF+Bq2dc}2DrWo!%cXo9T9ppckt+F8=&Y|=CqwmL(t^}8W{81grhpP73J>dWl| z70>Sj4p?i(+;k^%ntxienyalqYyN16Plh~Ol3$mwzi~7%`l$0_i`eF%)@&U6+#;N2 z=;ZcfMB&Rd3-03>oSbFG1~jEn2Bo@ZaB*}9QZg>2?_gRm$8`z|xRr7tH%k8Jla>oS z74eG+CM|KATp`49?W?L>eptCZJ|z1O-9Q@!Ii*wyUbOp4jZ`jB_mm2h7icU81xFZp zN*$ty@5sX!PW4_P{v3z9nra`zmqp}U zf}CpV)H57W=)^RCz0wZr8t{Cq*Pe&7+1jmavziBSM^&tk zL_fiO!;}N;16XOgXvC-S89tZ-eD!=gY?Ksi5#}rOx?kwf_K1DG=l5TSjp16Y*IV=Z h{q87z(&!Ib-A;OYeRDnQ6ySe-@sLLeLW=ND{|iAmA#wl!