From 17f412c2f592d11909a87e37742817baa5f772a0 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Fri, 24 May 2024 16:48:38 -0700 Subject: [PATCH] Convert logical date units --- src/convert.js | 11 +- src/metadata.js | 3 +- src/types.d.ts | 32 +- test/convert.test.js | 16 + test/files/duckdb4442.json | 21 ++ test/files/duckdb4442.metadata.json | 467 ++++++++++++++++++++++++++++ test/files/duckdb4442.parquet | Bin 0 -> 4430 bytes 7 files changed, 530 insertions(+), 20 deletions(-) create mode 100644 test/files/duckdb4442.json create mode 100644 test/files/duckdb4442.metadata.json create mode 100644 test/files/duckdb4442.parquet diff --git a/src/convert.js b/src/convert.js index 9ebcac7..3ff38ca 100644 --- a/src/convert.js +++ b/src/convert.js @@ -102,14 +102,17 @@ export function convert(data, schemaElement, utf8 = true) { } return arr } - const logicalType = schemaElement.logical_type?.type - if (logicalType === 'FLOAT16') { + if (schemaElement.logical_type?.type === 'FLOAT16') { return Array.from(data).map(parseFloat16) } - if (logicalType === 'TIMESTAMP') { + if (schemaElement.logical_type?.type === 'TIMESTAMP') { + const { unit } = schemaElement.logical_type + let factor = 1n + if (unit === 'MICROS') factor = 1000n + if (unit === 'NANOS') factor = 1000000n const arr = new Array(data.length) for (let i = 0; i < arr.length; i++) { - arr[i] = new Date(Number(data[i])) + arr[i] = new Date(Number(data[i] / factor)) } return arr } diff --git a/src/metadata.js b/src/metadata.js index f108758..f95c602 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -236,12 +236,13 @@ function logicalType(logicalType) { /** * @param {any} unit - * @returns {import("./types.d.ts").TimeUnit | undefined} + * @returns {import("./types.d.ts").TimeUnit} */ function timeUnit(unit) { if (unit.field_1) return 'MILLIS' if (unit.field_2) return 'MICROS' if (unit.field_3) return 'NANOS' + throw new Error('parquet time unit required') } /** diff --git a/src/types.d.ts b/src/types.d.ts index e096242..e2bffad 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -113,28 +113,30 @@ type LogicalIntType = { } export type LogicalType = - { type: LogicalTypeType } | + { type: LogicalTypeSimple } | LogicalDecimalType | LogicalTimeType | LogicalTimestampType | LogicalIntType -export type LogicalTypeType = - 'STRING' | // convertedType UTF8 - 'MAP' | // convertedType MAP - 'LIST' | // convertedType LIST - 'ENUM' | // convertedType ENUM - 'DECIMAL' | // convertedType DECIMAL + precision/scale - 'DATE' | // convertedType DATE +type LogicalTypeSimple = + 'STRING' | + 'MAP' | + 'LIST' | + 'ENUM' | + 'DECIMAL' | + 'DATE' | + 'INTERVAL' | + 'NULL' | + 'JSON' | + 'BSON' | + 'UUID' | + 'FLOAT16' + +export type LogicalTypeType = LogicalTypeSimple | 'TIME' | // convertedType TIME_MILLIS or TIME_MICROS 'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS - 'INTEGER' | // convertedType INT or UINT - 'INTERVAL' | // convertedType INT or UINT - 'NULL' | // no convertedType - 'JSON' | // convertedType JSON - 'BSON' | // convertedType BSON - 'UUID' | // no convertedType - 'FLOAT16' // no convertedType + 'INTEGER' // convertedType INT or UINT export interface RowGroup { columns: ColumnChunk[] diff --git a/test/convert.test.js b/test/convert.test.js index c7f9bc2..1b59b05 100644 --- a/test/convert.test.js +++ b/test/convert.test.js @@ -113,6 +113,13 @@ describe('convert function', () => { expect(convert(data, schemaElement)).toEqual([{ key: true }, { quay: 314 }]) }) + it('converts uint64', () => { + const data = [BigInt(100), BigInt(-100)] + /** @type {SchemaElement} */ + const schemaElement = { name, converted_type: 'UINT_64' } + expect(convert(data, schemaElement)).toEqual(new BigUint64Array([100n, 18446744073709551516n])) + }) + it('converts to float16', () => { const data = [new Uint8Array([0x00, 0x3c]), new Uint8Array([0x00, 0x40])] /** @type {SchemaElement} */ @@ -120,6 +127,15 @@ describe('convert function', () => { expect(convert(data, schemaElement)).toEqual([1, 2]) }) + it('converts timestamp with units', () => { + const data = [1716506900000000n, 1716507000000000n] + /** @type {SchemaElement} */ + const schemaElement = { name, logical_type: { type: 'TIMESTAMP', isAdjustedToUTC: true, unit: 'MICROS' } } + expect(convert(data, schemaElement)).toEqual([ + new Date('2024-05-23T23:28:20.000Z'), new Date('2024-05-23T23:30:00.000Z'), + ]) + }) + it('throws error for BSON conversion', () => { const data = [{}] /** @type {SchemaElement} */ diff --git a/test/files/duckdb4442.json b/test/files/duckdb4442.json new file mode 100644 index 0000000..16711cb --- /dev/null +++ b/test/files/duckdb4442.json @@ -0,0 +1,21 @@ +[ + [ + 12, + 5184, + 1, + 22, + "2011-10-06T22:21:49.580Z", + "outbound", + 323020033, + "{}", + 2100, + 33, + 0, + 7, + 10, + 0, + 1317427200000, + 1317939709580, + 11 + ] +] diff --git a/test/files/duckdb4442.metadata.json b/test/files/duckdb4442.metadata.json new file mode 100644 index 0000000..2a59fd0 --- /dev/null +++ b/test/files/duckdb4442.metadata.json @@ -0,0 +1,467 @@ +{ + "version": 2, + "schema": [ + { + "name": "root", + "num_children": 17 + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "linkback_length" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "agent_call_sid" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "client_sid" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "agent_sid" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "call_date", + "logical_type": { + "type": "TIMESTAMP", + "isAdjustedToUTC": true, + "unit": "NANOS" + } + }, + { + "type": "BYTE_ARRAY", + "repetition_type": "OPTIONAL", + "name": "call_type", + "converted_type": "UTF8", + "logical_type": { + "type": "STRING" + } + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "call_sid" + }, + { + "type": "BYTE_ARRAY", + "repetition_type": "OPTIONAL", + "name": "skills", + "converted_type": "UTF8", + "logical_type": { + "type": "STRING" + } + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "result" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "call_wait_duration" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "transfer_duration" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "wrap_up_duration" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "talk_duration" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "hold_duration" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "call_month_epoch" + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "call_date_epoch" + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "_version" + } + ], + "num_rows": 1, + "row_groups": [ + { + "columns": [ + { + "file_offset": 38, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "linkback_length" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 4 + }, + "offset_index_offset": 1138, + "offset_index_length": 10 + }, + { + "file_offset": 107, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "agent_call_sid" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 73 + }, + "offset_index_offset": 1148, + "offset_index_length": 11 + }, + { + "file_offset": 176, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "client_sid" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 142 + }, + "offset_index_offset": 1159, + "offset_index_length": 11 + }, + { + "file_offset": 241, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "agent_sid" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 207 + }, + "offset_index_offset": 1170, + "offset_index_length": 11 + }, + { + "file_offset": 305, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "call_date" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 271 + }, + "offset_index_offset": 1181, + "offset_index_length": 11 + }, + { + "file_offset": 373, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "call_type" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 36, + "total_compressed_size": 38, + "data_page_offset": 335 + }, + "offset_index_offset": 1192, + "offset_index_length": 11 + }, + { + "file_offset": 437, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "call_sid" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 403 + }, + "offset_index_offset": 1203, + "offset_index_length": 11 + }, + { + "file_offset": 498, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "skills" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 30, + "total_compressed_size": 32, + "data_page_offset": 466 + }, + "offset_index_offset": 1214, + "offset_index_length": 11 + }, + { + "file_offset": 559, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "result" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 525 + }, + "offset_index_offset": 1225, + "offset_index_length": 11 + }, + { + "file_offset": 620, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "call_wait_duration" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 586 + }, + "offset_index_offset": 1236, + "offset_index_length": 11 + }, + { + "file_offset": 693, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "transfer_duration" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 659 + }, + "offset_index_offset": 1247, + "offset_index_length": 11 + }, + { + "file_offset": 765, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "wrap_up_duration" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 731 + }, + "offset_index_offset": 1258, + "offset_index_length": 11 + }, + { + "file_offset": 836, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "talk_duration" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 802 + }, + "offset_index_offset": 1269, + "offset_index_length": 11 + }, + { + "file_offset": 904, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "hold_duration" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 870 + }, + "offset_index_offset": 1280, + "offset_index_length": 11 + }, + { + "file_offset": 972, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "call_month_epoch" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 938 + }, + "offset_index_offset": 1291, + "offset_index_length": 11 + }, + { + "file_offset": 1043, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "call_date_epoch" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 32, + "total_compressed_size": 34, + "data_page_offset": 1009 + }, + "offset_index_offset": 1302, + "offset_index_length": 11 + }, + { + "file_offset": 1109, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "RLE" + ], + "path_in_schema": [ + "_version" + ], + "codec": "SNAPPY", + "num_values": 1, + "total_uncompressed_size": 28, + "total_compressed_size": 30, + "data_page_offset": 1079 + }, + "offset_index_offset": 1313, + "offset_index_length": 11 + } + ], + "total_byte_size": 542, + "num_rows": 1, + "file_offset": 4, + "total_compressed_size": 576, + "ordinal": 0 + } + ], + "key_value_metadata": [ + { + "key": "ARROW:schema", + "value": "/////5wFAAAEAAAA8v///xQAAAAEAAEAAAAKAAsACAAKAAQA+P///wwAAAAIAAgAAAAEABEAAAAcBQAAyAQAAHgEAAAoBAAA0AMAAIwDAAA8AwAA/AIAALACAABYAgAAAAIAAKgBAABUAQAAAAEAAKgAAABUAAAABAAAAOz///84AAAAIAAAABgAAAABAgAAEAASAAQAEAARAAgAAAAMAAAAAAD0////IAAAAAEAAAAIAAkABAAIAAgAAABfdmVyc2lvbgAAAADs////OAAAACAAAAAYAAAAAQIAABAAEgAEABAAEQAIAAAADAAAAAAA9P///0AAAAABAAAACAAJAAQACAAPAAAAY2FsbF9kYXRlX2Vwb2NoAOz///84AAAAIAAAABgAAAABAgAAEAASAAQAEAARAAgAAAAMAAAAAAD0////QAAAAAEAAAAIAAkABAAIABAAAABjYWxsX21vbnRoX2Vwb2NoAAAAAOz///84AAAAIAAAABgAAAABAgAAEAASAAQAEAARAAgAAAAMAAAAAAD0////QAAAAAEAAAAIAAkABAAIAA0AAABob2xkX2R1cmF0aW9uAAAA7P///zgAAAAgAAAAGAAAAAECAAAQABIABAAQABEACAAAAAwAAAAAAPT///9AAAAAAQAAAAgACQAEAAgADQAAAHRhbGtfZHVyYXRpb24AAADs////OAAAACAAAAAYAAAAAQIAABAAEgAEABAAEQAIAAAADAAAAAAA9P///0AAAAABAAAACAAJAAQACAAQAAAAd3JhcF91cF9kdXJhdGlvbgAAAADs////OAAAACAAAAAYAAAAAQIAABAAEgAEABAAEQAIAAAADAAAAAAA9P///0AAAAABAAAACAAJAAQACAARAAAAdHJhbnNmZXJfZHVyYXRpb24AAADs////OAAAACAAAAAYAAAAAQIAABAAEgAEABAAEQAIAAAADAAAAAAA9P///0AAAAABAAAACAAJAAQACAASAAAAY2FsbF93YWl0X2R1cmF0aW9uAADs////OAAAACAAAAAYAAAAAQIAABAAEgAEABAAEQAIAAAADAAAAAAA9P///0AAAAABAAAACAAJAAQACAAGAAAAcmVzdWx0AADs////LAAAACAAAAAYAAAAARQAABAAEgAEABAAEQAIAAAADAAAAAAA/P///wQABAAGAAAAc2tpbGxzAADs////OAAAACAAAAAYAAAAAQIAABAAEgAEABAAEQAIAAAADAAAAAAA9P///0AAAAABAAAACAAJAAQACAAIAAAAY2FsbF9zaWQAAAAA7P///ywAAAAgAAAAGAAAAAEUAAAQABIABAAQABEACAAAAAwAAAAAAPz///8EAAQACQAAAGNhbGxfdHlwZQAAAOz///9AAAAAIAAAABgAAAABCgAAEAASAAQAEAARAAgAAAAMAAAAAAD0////EAAAAAMAAAAIAAoACAAEAAMAAABVVEMACQAAAGNhbGxfZGF0ZQAAAOz///84AAAAIAAAABgAAAABAgAAEAASAAQAEAARAAgAAAAMAAAAAAD0////QAAAAAEAAAAIAAkABAAIAAkAAABhZ2VudF9zaWQAAADs////OAAAACAAAAAYAAAAAQIAABAAEgAEABAAEQAIAAAADAAAAAAA9P///0AAAAABAAAACAAJAAQACAAKAAAAY2xpZW50X3NpZAAA7P///zgAAAAgAAAAGAAAAAECAAAQABIABAAQABEACAAAAAwAAAAAAPT///9AAAAAAQAAAAgACQAEAAgADgAAAGFnZW50X2NhbGxfc2lkAADs////OAAAACAAAAAYAAAAAQIAABAAEgAEABAAEQAIAAAADAAAAAAA9P///0AAAAABAAAACAAJAAQACAAPAAAAbGlua2JhY2tfbGVuZ3RoAA==" + } + ], + "created_by": "Arrow2 - Native Rust implementation of Arrow", + "metadata_length": 3098 +} diff --git a/test/files/duckdb4442.parquet b/test/files/duckdb4442.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1fde26ba8da62ff9bbd875c49b342c2a600253c5 GIT binary patch literal 4430 zcmcIoO=ufO6kf}UEZedzIo`o8I;e%{(zKS>3AKv@9@&-@#c^yk)gJ|+KTE1sS|P10 z*`<(U4xy9~N(sTIU`pvZhc=W_0=?wWLnwtp2_fkrhn!2vq3_MEMnCIpp>YPZJ2U&e z@4b2R&09&^G_oMt#2In+6VW0H>|+%Lmmmxe+1(C7Yoc}Tj9{Caosvt{aweTEr{sLK zpcX|7v5-}=?zIcACRv^7Cz(ta@>MmJP0MnsQOa?Sn?oifLj#eFW#tmG^b*{$m=KTw zf>AAsUi-sg6MNx@XOG`s3r}}aG0Ni4rPVxNz@ZiG-J-{sO2=6{_*a^0Mrl@aqC-ok z#-Z-lG>#D2^d6oviDBs3>Sycs_^WL+g09)$@&}tn#JbHWR?xnk_e;W?{F|B{h|6*3)W9sdA!Qqb5=4JeSjkaK6`gW zT-9f9cX@A(a=_6T@7^2d-3JcdeK^6pk0yEd$rSItaEdGF0`mKEI%KUYiYmSXqcavm zLDXgkDx5aiQ?ay(ZUxfEJm2ySmwSaNLv(Fv}0Wz(D$O?nUT_RAWEp z&}k2hhe(?@913l^U*!5+pJ;XSK@n=xgQ5bh;95FD5r}2ZJ@?Y(-E?=pZu!fY@Cm8F{gw2PX1!Brz4#1m`YAwGSHuP&UH-1CO1#x zLVq9SpkqYBT|;Fi-!OycW8AY-Ef1N_zJC)I7HHtpKHT13q+V560~s?82B$U81x>*C>Vi z4t;kUp0`@~{34|V4d-XLtRKww0YgItoKXC5@zKkGY^_6giGtZYnr$MTKcSh!_)-C& z{*Td=`MiX@Fvcrht{i~VrE}N?G)ywTezXTiA7NZ@kLdA$Lu~6lFq&t0Z5_}pBOlxu z|8A}l-Ooz$UWU=FHxPdXO(8mH33lB9I2dgdfCTS5S_rrxY|EGrAM3JE!8DBVzTWU=-G@BZG7?JR#P^GN&Mk(E*}(Dwe9F0axR?vRF@1nuDb0n# z#Y}apl8lGn^!nhOa6K+2VzRG)?@m0QZ0oa?=s_;l2El0jn;1mz`HDc=@m#&)e9g0Z z1CpUs(tLO`YR@-XfqdB)^l~@`hoWi>a|@<#?JfVi(9oMj8DA2;I0M(Rpvv|_6gnzu)q^v z4|^YEf^su0g^LME-OU7}&E#T4q4dH9VfHMo*Oiv!dC#*2KT!Abo=CHydP