diff --git a/package.json b/package.json index a5cc1c3..853fc02 100644 --- a/package.json +++ b/package.json @@ -47,13 +47,13 @@ "test": "vitest run" }, "devDependencies": { - "@types/node": "22.14.0", + "@types/node": "22.14.1", "@vitest/coverage-v8": "3.1.1", "eslint": "9.24.0", "eslint-plugin-jsdoc": "50.6.9", "hyparquet-compressors": "1.1.1", "typescript": "5.8.3", - "typescript-eslint": "8.29.1", + "typescript-eslint": "8.30.1", "vitest": "3.1.1" } } diff --git a/src/convert.js b/src/convert.js index 262c2ea..955f6c6 100644 --- a/src/convert.js +++ b/src/convert.js @@ -36,7 +36,7 @@ export function convertWithDictionary(data, dictionary, schemaElement, encoding, * @returns {DecodedArray} series of rich types */ export function convert(data, schemaElement, utf8 = true) { - const ctype = schemaElement.converted_type + const { type, converted_type: ctype, logical_type: ltype } = schemaElement if (ctype === 'DECIMAL') { const scale = schemaElement.scale || 0 const factor = 10 ** -scale @@ -50,7 +50,7 @@ export function convert(data, schemaElement, utf8 = true) { } return arr } - if (ctype === undefined && schemaElement.type === 'INT96') { + if (!ctype && type === 'INT96') { return Array.from(data).map(parseInt96Date) } if (ctype === 'DATE') { @@ -84,7 +84,7 @@ export function convert(data, schemaElement, utf8 = true) { if (ctype === 'INTERVAL') { throw new Error('parquet interval not supported') } - if (ctype === 'UTF8' || utf8 && schemaElement.type === 'BYTE_ARRAY') { + if (ctype === 'UTF8' || utf8 && type === 'BYTE_ARRAY') { const decoder = new TextDecoder() const arr = new Array(data.length) for (let i = 0; i < arr.length; i++) { @@ -92,18 +92,27 @@ export function convert(data, schemaElement, utf8 = true) { } return arr } - if (ctype === 'UINT_64') { - const arr = new BigUint64Array(data.length) - for (let i = 0; i < arr.length; i++) { - arr[i] = BigInt(data[i]) + if (ctype === 'UINT_64' || ltype?.type === 'INTEGER' && ltype.bitWidth === 64 && !ltype.isSigned) { + if (data instanceof BigInt64Array) { + return new BigUint64Array(data.buffer, data.byteOffset, data.length) } + const arr = new BigUint64Array(data.length) + for (let i = 0; i < arr.length; i++) arr[i] = BigInt(data[i]) return arr } - if (schemaElement.logical_type?.type === 'FLOAT16') { + if (ctype === 'UINT_32' || ltype?.type === 'INTEGER' && ltype.bitWidth === 32 && !ltype.isSigned) { + if (data instanceof Int32Array) { + return new Uint32Array(data.buffer, data.byteOffset, data.length) + } + const arr = new Uint32Array(data.length) + for (let i = 0; i < arr.length; i++) arr[i] = data[i] + return arr + } + if (ltype?.type === 'FLOAT16') { return Array.from(data).map(parseFloat16) } - if (schemaElement.logical_type?.type === 'TIMESTAMP') { - const { unit } = schemaElement.logical_type + if (ltype?.type === 'TIMESTAMP') { + const { unit } = ltype let factor = 1n if (unit === 'MICROS') factor = 1000n if (unit === 'NANOS') factor = 1000000n diff --git a/src/types.d.ts b/src/types.d.ts index 129c9a2..f5e44db 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -344,6 +344,7 @@ interface DataPage { export type DecodedArray = Uint8Array | + Uint32Array | Int32Array | BigInt64Array | BigUint64Array | diff --git a/test/files/signs.json b/test/files/signs.json new file mode 100644 index 0000000..2a1557b --- /dev/null +++ b/test/files/signs.json @@ -0,0 +1,6 @@ +[ + [0, 0, 0, 0, -128, -32768, -2147483648, -9223372036854775808], + [127, 32767, 2147483647, 9223372036854775807, -1, -1, -1, -1], + [128, 32768, 2147483648, 9223372036854775808, 0, 0, 0, 0], + [255, 65535, 4294967295, 18446744073709551615, 127, 32767, 2147483647, 9223372036854775807] +] diff --git a/test/files/signs.metadata.json b/test/files/signs.metadata.json new file mode 100644 index 0000000..9b5bcc0 --- /dev/null +++ b/test/files/signs.metadata.json @@ -0,0 +1,410 @@ +{ + "version": 2, + "schema": [ + { + "repetition_type": "REQUIRED", + "name": "schema", + "num_children": 8 + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "unsigned_int8", + "converted_type": "UINT_8", + "logical_type": { + "type": "INTEGER", + "bitWidth": 8, + "isSigned": false + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "unsigned_int16", + "converted_type": "UINT_16", + "logical_type": { + "type": "INTEGER", + "bitWidth": 16, + "isSigned": false + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "unsigned_int32", + "converted_type": "UINT_32", + "logical_type": { + "type": "INTEGER", + "bitWidth": 32, + "isSigned": false + } + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "unsigned_int64", + "converted_type": "UINT_64", + "logical_type": { + "type": "INTEGER", + "bitWidth": 64, + "isSigned": false + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "signed_int8", + "converted_type": "INT_8", + "logical_type": { + "type": "INTEGER", + "bitWidth": 8, + "isSigned": true + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "signed_int16", + "converted_type": "INT_16", + "logical_type": { + "type": "INTEGER", + "bitWidth": 16, + "isSigned": true + } + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "signed_int32" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "signed_int64" + } + ], + "num_rows": 4, + "row_groups": [ + { + "columns": [ + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "unsigned_int8" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 73, + "total_compressed_size": 77, + "data_page_offset": 36, + "dictionary_page_offset": 4, + "statistics": { + "null_count": 0, + "max_value": 255, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "unsigned_int16" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 73, + "total_compressed_size": 77, + "data_page_offset": 113, + "dictionary_page_offset": 81, + "statistics": { + "null_count": 0, + "max_value": 65535, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "unsigned_int32" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 73, + "total_compressed_size": 77, + "data_page_offset": 190, + "dictionary_page_offset": 158, + "statistics": { + "null_count": 0, + "max_value": -1, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "unsigned_int64" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 97, + "total_compressed_size": 90, + "data_page_offset": 272, + "dictionary_page_offset": 235, + "statistics": { + "null_count": 0, + "max_value": -1, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "signed_int8" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 85, + "total_compressed_size": 89, + "data_page_offset": 357, + "dictionary_page_offset": 325, + "statistics": { + "max": 127, + "min": -128, + "null_count": 0, + "max_value": 127, + "min_value": -128 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "signed_int16" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 85, + "total_compressed_size": 89, + "data_page_offset": 446, + "dictionary_page_offset": 414, + "statistics": { + "max": 32767, + "min": -32768, + "null_count": 0, + "max_value": 32767, + "min_value": -32768 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "signed_int32" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 85, + "total_compressed_size": 89, + "data_page_offset": 535, + "dictionary_page_offset": 503, + "statistics": { + "max": 2147483647, + "min": -2147483648, + "null_count": 0, + "max_value": 2147483647, + "min_value": -2147483648 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "signed_int64" + ], + "codec": "SNAPPY", + "num_values": 4, + "total_uncompressed_size": 117, + "total_compressed_size": 110, + "data_page_offset": 629, + "dictionary_page_offset": 592, + "statistics": { + "max": 9223372036854776000, + "min": -9223372036854776000, + "null_count": 0, + "max_value": 9223372036854776000, + "min_value": -9223372036854776000 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + } + ], + "total_byte_size": 688, + "num_rows": 4, + "file_offset": 4, + "total_compressed_size": 698, + "ordinal": 0 + } + ], + "key_value_metadata": [ + { + "key": "ARROW:schema", + "value": "/////xgCAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAgAAACwAQAAaAEAADABAAD4AAAAuAAAAHwAAABAAAAABAAAAID+//8AAAECEAAAACAAAAAEAAAAAAAAAAwAAABzaWduZWRfaW50NjQAAAAAXP///wAAAAFAAAAAuP7//wAAAQIQAAAAIAAAAAQAAAAAAAAADAAAAHNpZ25lZF9pbnQzMgAAAACU////AAAAASAAAADw/v//AAABAhAAAAAgAAAABAAAAAAAAAAMAAAAc2lnbmVkX2ludDE2AAAAAMz///8AAAABEAAAACj///8AAAECEAAAACQAAAAEAAAAAAAAAAsAAABzaWduZWRfaW50OAAIAAwACAAHAAgAAAAAAAABCAAAAGT///8AAAECEAAAACAAAAAEAAAAAAAAAA4AAAB1bnNpZ25lZF9pbnQ2NAAAVv///0AAAACY////AAABAhAAAAAgAAAABAAAAAAAAAAOAAAAdW5zaWduZWRfaW50MzIAAIr///8gAAAAzP///wAAAQIQAAAAIAAAAAQAAAAAAAAADgAAAHVuc2lnbmVkX2ludDE2AAC+////EAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAhAAAAAkAAAABAAAAAAAAAANAAAAdW5zaWduZWRfaW50OAAGAAgABAAGAAAACAAAAA==" + } + ], + "created_by": "parquet-cpp-arrow version 19.0.1", + "metadata_length": 1733 +} diff --git a/test/files/signs.parquet b/test/files/signs.parquet new file mode 100644 index 0000000..100e64d Binary files /dev/null and b/test/files/signs.parquet differ diff --git a/test/files/unsigned.json b/test/files/unsigned.json deleted file mode 100644 index 2a4e727..0000000 --- a/test/files/unsigned.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - [0,0,0,0], - [255,65535,4294967295,18446744073709552000] -] diff --git a/test/files/unsigned.metadata.json b/test/files/unsigned.metadata.json deleted file mode 100644 index dc6aed3..0000000 --- a/test/files/unsigned.metadata.json +++ /dev/null @@ -1,218 +0,0 @@ -{ - "version": 1, - "schema": [ - { - "repetition_type": "REQUIRED", - "name": "schema", - "num_children": 4 - }, - { - "type": "INT32", - "repetition_type": "OPTIONAL", - "name": "utiny", - "converted_type": "UINT_8", - "logical_type": { - "type": "INTEGER", - "bitWidth": 8, - "isSigned": false - } - }, - { - "type": "INT32", - "repetition_type": "OPTIONAL", - "name": "usmall", - "converted_type": "UINT_16", - "logical_type": { - "type": "INTEGER", - "bitWidth": 16, - "isSigned": false - } - }, - { - "type": "INT64", - "repetition_type": "OPTIONAL", - "name": "uint" - }, - { - "type": "INT64", - "repetition_type": "OPTIONAL", - "name": "ubig", - "converted_type": "UINT_64", - "logical_type": { - "type": "INTEGER", - "bitWidth": 64, - "isSigned": false - } - } - ], - "num_rows": 2, - "row_groups": [ - { - "columns": [ - { - "file_offset": 72, - "meta_data": { - "type": "INT32", - "encodings": [ - "PLAIN_DICTIONARY", - "PLAIN", - "RLE" - ], - "path_in_schema": [ - "utiny" - ], - "codec": "SNAPPY", - "num_values": 2, - "total_uncompressed_size": 64, - "total_compressed_size": 68, - "data_page_offset": 28, - "dictionary_page_offset": 4, - "statistics": { - "null_count": 0, - "max_value": 255, - "min_value": 0 - }, - "encoding_stats": [ - { - "page_type": "DICTIONARY_PAGE", - "encoding": "PLAIN_DICTIONARY", - "count": 1 - }, - { - "page_type": "DATA_PAGE", - "encoding": "PLAIN_DICTIONARY", - "count": 1 - } - ] - } - }, - { - "file_offset": 207, - "meta_data": { - "type": "INT32", - "encodings": [ - "PLAIN_DICTIONARY", - "PLAIN", - "RLE" - ], - "path_in_schema": [ - "usmall" - ], - "codec": "SNAPPY", - "num_values": 2, - "total_uncompressed_size": 64, - "total_compressed_size": 68, - "data_page_offset": 163, - "dictionary_page_offset": 139, - "statistics": { - "null_count": 0, - "max_value": 65535, - "min_value": 0 - }, - "encoding_stats": [ - { - "page_type": "DICTIONARY_PAGE", - "encoding": "PLAIN_DICTIONARY", - "count": 1 - }, - { - "page_type": "DATA_PAGE", - "encoding": "PLAIN_DICTIONARY", - "count": 1 - } - ] - } - }, - { - "file_offset": 381, - "meta_data": { - "type": "INT64", - "encodings": [ - "PLAIN_DICTIONARY", - "PLAIN", - "RLE" - ], - "path_in_schema": [ - "uint" - ], - "codec": "SNAPPY", - "num_values": 2, - "total_uncompressed_size": 100, - "total_compressed_size": 104, - "data_page_offset": 309, - "dictionary_page_offset": 277, - "statistics": { - "max": 4294967295, - "min": 0, - "null_count": 0, - "max_value": 4294967295, - "min_value": 0 - }, - "encoding_stats": [ - { - "page_type": "DICTIONARY_PAGE", - "encoding": "PLAIN_DICTIONARY", - "count": 1 - }, - { - "page_type": "DATA_PAGE", - "encoding": "PLAIN_DICTIONARY", - "count": 1 - } - ] - } - }, - { - "file_offset": 561, - "meta_data": { - "type": "INT64", - "encodings": [ - "PLAIN_DICTIONARY", - "PLAIN", - "RLE" - ], - "path_in_schema": [ - "ubig" - ], - "codec": "SNAPPY", - "num_values": 2, - "total_uncompressed_size": 80, - "total_compressed_size": 84, - "data_page_offset": 509, - "dictionary_page_offset": 477, - "statistics": { - "null_count": 0, - "max_value": -1, - "min_value": 0 - }, - "encoding_stats": [ - { - "page_type": "DICTIONARY_PAGE", - "encoding": "PLAIN_DICTIONARY", - "count": 1 - }, - { - "page_type": "DATA_PAGE", - "encoding": "PLAIN_DICTIONARY", - "count": 1 - } - ] - } - } - ], - "total_byte_size": 308, - "num_rows": 2, - "file_offset": 4, - "total_compressed_size": 324, - "ordinal": 0 - } - ], - "key_value_metadata": [ - { - "key": "ARROW:schema", - "value": "/////wgBAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAQAAACkAAAAZAAAADQAAAAEAAAAfP///wAAAQIQAAAAGAAAAAQAAAAAAAAABAAAAHViaWcAAAAAbv///0AAAACo////AAABAhAAAAAYAAAABAAAAAAAAAAEAAAAdWludAAAAACa////IAAAANT///8AAAECEAAAABgAAAAEAAAAAAAAAAYAAAB1c21hbGwAAMb///8QAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAECEAAAABwAAAAEAAAAAAAAAAUAAAB1dGlueQAGAAgABAAGAAAACAAAAAAAAAA=" - } - ], - "created_by": "parquet-cpp-arrow version 6.0.1", - "metadata_length": 851 -} diff --git a/test/files/unsigned.parquet b/test/files/unsigned.parquet deleted file mode 100644 index 09d9a2b..0000000 Binary files a/test/files/unsigned.parquet and /dev/null differ