From 8dbb74ac784ed7d770abac0a8ed3e0b788f3633d Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Thu, 15 May 2025 23:44:09 -0700 Subject: [PATCH] Convert logical strings --- src/convert.js | 8 +-- test/files/strings.json | 6 +++ test/files/strings.metadata.json | 90 +++++++++++++++++++++++++++++++ test/files/strings.parquet | Bin 0 -> 417 bytes test/read.utf8.test.js | 54 +++++++++++++++++++ 5 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 test/files/strings.json create mode 100644 test/files/strings.metadata.json create mode 100644 test/files/strings.parquet create mode 100644 test/read.utf8.test.js diff --git a/src/convert.js b/src/convert.js index 98e6d1f..88f5cc5 100644 --- a/src/convert.js +++ b/src/convert.js @@ -7,10 +7,10 @@ const dayMillis = 86400000 // 1 day in milliseconds * @param {DecodedArray | undefined} dictionary * @param {SchemaElement} schemaElement * @param {Encoding} encoding - * @param {boolean | undefined} utf8 decode bytes as utf8? + * @param {boolean} [utf8] decode bytes as utf8? * @returns {DecodedArray} series of rich types */ -export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) { +export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8) { if (dictionary && encoding.endsWith('_DICTIONARY')) { let output = data if (data instanceof Uint8Array && !(dictionary instanceof Uint8Array)) { @@ -31,7 +31,7 @@ export function convertWithDictionary(data, dictionary, schemaElement, encoding, * * @param {DecodedArray} data series of primitive types * @param {SchemaElement} schemaElement - * @param {boolean | undefined} utf8 decode bytes as utf8? + * @param {boolean} [utf8] decode bytes as utf8? * @returns {DecodedArray} series of rich types */ export function convert(data, schemaElement, utf8 = true) { @@ -83,7 +83,7 @@ export function convert(data, schemaElement, utf8 = true) { if (ctype === 'INTERVAL') { throw new Error('parquet interval not supported') } - if (ctype === 'UTF8' || utf8 && type === 'BYTE_ARRAY') { + if (ctype === 'UTF8' || ltype?.type === 'STRING' || utf8 && type === 'BYTE_ARRAY') { const decoder = new TextDecoder() const arr = new Array(data.length) for (let i = 0; i < arr.length; i++) { diff --git a/test/files/strings.json b/test/files/strings.json new file mode 100644 index 0000000..3a8c167 --- /dev/null +++ b/test/files/strings.json @@ -0,0 +1,6 @@ +[ + ["alpha", "alpha", "alpha"], + ["bravo", "bravo", "bravo"], + ["charlie", "charlie", "charlie"], + ["delta", "delta", "delta"] +] diff --git a/test/files/strings.metadata.json b/test/files/strings.metadata.json new file mode 100644 index 0000000..606c1fd --- /dev/null +++ b/test/files/strings.metadata.json @@ -0,0 +1,90 @@ +{ + "version": 2, + "schema": [ + { + "name": "root", + "num_children": 3 + }, + { + "type": "BYTE_ARRAY", + "name": "bytes" + }, + { + "type": "BYTE_ARRAY", + "name": "c_utf8", + "converted_type": "UTF8" + }, + { + "type": "BYTE_ARRAY", + "name": "l_utf8", + "logical_type": { + "type": "STRING" + } + } + ], + "num_rows": 4, + "row_groups": [ + { + "columns": [ + { + "file_offset": 4, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": ["PLAIN"], + "path_in_schema": ["bytes"], + "codec": "UNCOMPRESSED", + "num_values": 4, + "total_uncompressed_size": 62, + "total_compressed_size": 62, + "data_page_offset": 4, + "statistics": { + "null_count": 0, + "max_value": "delta", + "min_value": "alpha" + } + } + }, + { + "file_offset": 66, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": ["PLAIN"], + "path_in_schema": ["c_utf8"], + "codec": "UNCOMPRESSED", + "num_values": 4, + "total_uncompressed_size": 62, + "total_compressed_size": 62, + "data_page_offset": 66, + "statistics": { + "null_count": 0, + "max_value": "delta", + "min_value": "alpha" + } + } + }, + { + "file_offset": 128, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": ["PLAIN"], + "path_in_schema": ["l_utf8"], + "codec": "UNCOMPRESSED", + "num_values": 4, + "total_uncompressed_size": 62, + "total_compressed_size": 62, + "data_page_offset": 128, + "statistics": { + "null_count": 0, + "max_value": "delta", + "min_value": "alpha" + } + } + } + ], + "total_byte_size": 186, + "num_rows": 4 + } + ], + "created_by": "hyparquet", + "metadata_length": 219 +} diff --git a/test/files/strings.parquet b/test/files/strings.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b34d8ff71ecddedd47413a1ade95d6242999dc25 GIT binary patch literal 417 zcmWG=3^EjD6Acgzh!N!wWdLIqQ3fFf24;R%1_p-2oPvx*5F@E5u`Hh*$VkpeEXv7D z1&O7k=9DB3sBRWX9}kwI{QMG8HU?213)ZB{lGI`li!C|6v?R?!l>yAj0dsO>fNmD! zkd%?MQR9#i<&hL+kd%OE5e15i)ri%oaoCtKXt07kCcz5!JW!8X3nNT9)Lgjw7DlWZ q8kpc3Aa=twG%#V)AohhxjDtagGo!K~v8b>#wS?g|(B1%G { + it('default utf8 behavior', async () => { + const file = await asyncBufferFromFile('test/files/strings.parquet') + const rows = await parquetReadObjects({ file }) + expect(rows).toEqual([ + { bytes: 'alpha', c_utf8: 'alpha', l_utf8: 'alpha' }, + { bytes: 'bravo', c_utf8: 'bravo', l_utf8: 'bravo' }, + { bytes: 'charlie', c_utf8: 'charlie', l_utf8: 'charlie' }, + { bytes: 'delta', c_utf8: 'delta', l_utf8: 'delta' }, + ]) + }) + + it('utf8 = true', async () => { + const file = await asyncBufferFromFile('test/files/strings.parquet') + const rows = await parquetReadObjects({ file, utf8: true }) + expect(rows).toEqual([ + { bytes: 'alpha', c_utf8: 'alpha', l_utf8: 'alpha' }, + { bytes: 'bravo', c_utf8: 'bravo', l_utf8: 'bravo' }, + { bytes: 'charlie', c_utf8: 'charlie', l_utf8: 'charlie' }, + { bytes: 'delta', c_utf8: 'delta', l_utf8: 'delta' }, + ]) + }) + + it('utf8 = false', async () => { + const file = await asyncBufferFromFile('test/files/strings.parquet') + const rows = await parquetReadObjects({ file, utf8: false }) + expect(rows).toEqual([ + { + bytes: new Uint8Array([97, 108, 112, 104, 97]), + c_utf8: 'alpha', + l_utf8: 'alpha', + }, + { + bytes: new Uint8Array([98, 114, 97, 118, 111]), + c_utf8: 'bravo', + l_utf8: 'bravo', + }, + { + bytes: new Uint8Array([99, 104, 97, 114, 108, 105, 101]), + c_utf8: 'charlie', + l_utf8: 'charlie', + }, + { + bytes: new Uint8Array([100, 101, 108, 116, 97]), + c_utf8: 'delta', + l_utf8: 'delta', + }, + ]) + }) +})