Convert logical strings

This commit is contained in:
Kenny Daniel 2025-05-15 23:44:09 -07:00
parent b635904239
commit 8dbb74ac78
No known key found for this signature in database
GPG Key ID: FDF16101AF5AFD3A
5 changed files with 154 additions and 4 deletions

@ -7,10 +7,10 @@ const dayMillis = 86400000 // 1 day in milliseconds
* @param {DecodedArray | undefined} dictionary
* @param {SchemaElement} schemaElement
* @param {Encoding} encoding
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @param {boolean} [utf8] decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8) {
if (dictionary && encoding.endsWith('_DICTIONARY')) {
let output = data
if (data instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
@ -31,7 +31,7 @@ export function convertWithDictionary(data, dictionary, schemaElement, encoding,
*
* @param {DecodedArray} data series of primitive types
* @param {SchemaElement} schemaElement
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @param {boolean} [utf8] decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
export function convert(data, schemaElement, utf8 = true) {
@ -83,7 +83,7 @@ export function convert(data, schemaElement, utf8 = true) {
if (ctype === 'INTERVAL') {
throw new Error('parquet interval not supported')
}
if (ctype === 'UTF8' || utf8 && type === 'BYTE_ARRAY') {
if (ctype === 'UTF8' || ltype?.type === 'STRING' || utf8 && type === 'BYTE_ARRAY') {
const decoder = new TextDecoder()
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {

6
test/files/strings.json Normal file

@ -0,0 +1,6 @@
[
["alpha", "alpha", "alpha"],
["bravo", "bravo", "bravo"],
["charlie", "charlie", "charlie"],
["delta", "delta", "delta"]
]

@ -0,0 +1,90 @@
{
"version": 2,
"schema": [
{
"name": "root",
"num_children": 3
},
{
"type": "BYTE_ARRAY",
"name": "bytes"
},
{
"type": "BYTE_ARRAY",
"name": "c_utf8",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"name": "l_utf8",
"logical_type": {
"type": "STRING"
}
}
],
"num_rows": 4,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["PLAIN"],
"path_in_schema": ["bytes"],
"codec": "UNCOMPRESSED",
"num_values": 4,
"total_uncompressed_size": 62,
"total_compressed_size": 62,
"data_page_offset": 4,
"statistics": {
"null_count": 0,
"max_value": "delta",
"min_value": "alpha"
}
}
},
{
"file_offset": 66,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["PLAIN"],
"path_in_schema": ["c_utf8"],
"codec": "UNCOMPRESSED",
"num_values": 4,
"total_uncompressed_size": 62,
"total_compressed_size": 62,
"data_page_offset": 66,
"statistics": {
"null_count": 0,
"max_value": "delta",
"min_value": "alpha"
}
}
},
{
"file_offset": 128,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["PLAIN"],
"path_in_schema": ["l_utf8"],
"codec": "UNCOMPRESSED",
"num_values": 4,
"total_uncompressed_size": 62,
"total_compressed_size": 62,
"data_page_offset": 128,
"statistics": {
"null_count": 0,
"max_value": "delta",
"min_value": "alpha"
}
}
}
],
"total_byte_size": 186,
"num_rows": 4
}
],
"created_by": "hyparquet",
"metadata_length": 219
}

BIN
test/files/strings.parquet Normal file

Binary file not shown.

54
test/read.utf8.test.js Normal file

@ -0,0 +1,54 @@
import { describe, expect, it } from 'vitest'
import { parquetReadObjects } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/utils.js'

describe('parquetRead utf8', () => {
  // Values stored in each of the three BYTE_ARRAY columns of strings.parquet
  const words = ['alpha', 'bravo', 'charlie', 'delta']
  // Expected rows when every BYTE_ARRAY column is decoded as utf8
  const decodedRows = words.map(word => ({ bytes: word, c_utf8: word, l_utf8: word }))

  // Read the shared fixture, forwarding any extra parquet read options
  async function readStrings(options = {}) {
    const file = await asyncBufferFromFile('test/files/strings.parquet')
    return parquetReadObjects({ file, ...options })
  }

  it('default utf8 behavior', async () => {
    // with no option given, plain BYTE_ARRAY columns decode as utf8
    expect(await readStrings()).toEqual(decodedRows)
  })

  it('utf8 = true', async () => {
    // explicit utf8: true matches the default behavior
    expect(await readStrings({ utf8: true })).toEqual(decodedRows)
  })

  it('utf8 = false', async () => {
    // plain BYTE_ARRAY stays raw bytes, but columns marked with the
    // UTF8 converted type or STRING logical type still decode as strings
    const encoder = new TextEncoder()
    const rawRows = words.map(word => ({
      bytes: encoder.encode(word),
      c_utf8: word,
      l_utf8: word,
    }))
    expect(await readStrings({ utf8: false })).toEqual(rawRows)
  })
})