mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-02-21 20:01:33 +00:00
Convert logical strings
This commit is contained in:
parent
b635904239
commit
8dbb74ac78
@ -7,10 +7,10 @@ const dayMillis = 86400000 // 1 day in milliseconds
|
||||
* @param {DecodedArray | undefined} dictionary
|
||||
* @param {SchemaElement} schemaElement
|
||||
* @param {Encoding} encoding
|
||||
* @param {boolean | undefined} utf8 decode bytes as utf8?
|
||||
* @param {boolean} [utf8] decode bytes as utf8?
|
||||
* @returns {DecodedArray} series of rich types
|
||||
*/
|
||||
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
|
||||
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8) {
|
||||
if (dictionary && encoding.endsWith('_DICTIONARY')) {
|
||||
let output = data
|
||||
if (data instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
|
||||
@ -31,7 +31,7 @@ export function convertWithDictionary(data, dictionary, schemaElement, encoding,
|
||||
*
|
||||
* @param {DecodedArray} data series of primitive types
|
||||
* @param {SchemaElement} schemaElement
|
||||
* @param {boolean | undefined} utf8 decode bytes as utf8?
|
||||
* @param {boolean} [utf8] decode bytes as utf8?
|
||||
* @returns {DecodedArray} series of rich types
|
||||
*/
|
||||
export function convert(data, schemaElement, utf8 = true) {
|
||||
@ -83,7 +83,7 @@ export function convert(data, schemaElement, utf8 = true) {
|
||||
if (ctype === 'INTERVAL') {
|
||||
throw new Error('parquet interval not supported')
|
||||
}
|
||||
if (ctype === 'UTF8' || utf8 && type === 'BYTE_ARRAY') {
|
||||
if (ctype === 'UTF8' || ltype?.type === 'STRING' || utf8 && type === 'BYTE_ARRAY') {
|
||||
const decoder = new TextDecoder()
|
||||
const arr = new Array(data.length)
|
||||
for (let i = 0; i < arr.length; i++) {
|
||||
|
||||
6
test/files/strings.json
Normal file
6
test/files/strings.json
Normal file
@ -0,0 +1,6 @@
|
||||
[
|
||||
["alpha", "alpha", "alpha"],
|
||||
["bravo", "bravo", "bravo"],
|
||||
["charlie", "charlie", "charlie"],
|
||||
["delta", "delta", "delta"]
|
||||
]
|
||||
90
test/files/strings.metadata.json
Normal file
90
test/files/strings.metadata.json
Normal file
@ -0,0 +1,90 @@
|
||||
{
|
||||
"version": 2,
|
||||
"schema": [
|
||||
{
|
||||
"name": "root",
|
||||
"num_children": 3
|
||||
},
|
||||
{
|
||||
"type": "BYTE_ARRAY",
|
||||
"name": "bytes"
|
||||
},
|
||||
{
|
||||
"type": "BYTE_ARRAY",
|
||||
"name": "c_utf8",
|
||||
"converted_type": "UTF8"
|
||||
},
|
||||
{
|
||||
"type": "BYTE_ARRAY",
|
||||
"name": "l_utf8",
|
||||
"logical_type": {
|
||||
"type": "STRING"
|
||||
}
|
||||
}
|
||||
],
|
||||
"num_rows": 4,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 4,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": ["PLAIN"],
|
||||
"path_in_schema": ["bytes"],
|
||||
"codec": "UNCOMPRESSED",
|
||||
"num_values": 4,
|
||||
"total_uncompressed_size": 62,
|
||||
"total_compressed_size": 62,
|
||||
"data_page_offset": 4,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "delta",
|
||||
"min_value": "alpha"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 66,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": ["PLAIN"],
|
||||
"path_in_schema": ["c_utf8"],
|
||||
"codec": "UNCOMPRESSED",
|
||||
"num_values": 4,
|
||||
"total_uncompressed_size": 62,
|
||||
"total_compressed_size": 62,
|
||||
"data_page_offset": 66,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "delta",
|
||||
"min_value": "alpha"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 128,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": ["PLAIN"],
|
||||
"path_in_schema": ["l_utf8"],
|
||||
"codec": "UNCOMPRESSED",
|
||||
"num_values": 4,
|
||||
"total_uncompressed_size": 62,
|
||||
"total_compressed_size": 62,
|
||||
"data_page_offset": 128,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "delta",
|
||||
"min_value": "alpha"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 186,
|
||||
"num_rows": 4
|
||||
}
|
||||
],
|
||||
"created_by": "hyparquet",
|
||||
"metadata_length": 219
|
||||
}
|
||||
BIN
test/files/strings.parquet
Normal file
BIN
test/files/strings.parquet
Normal file
Binary file not shown.
54
test/read.utf8.test.js
Normal file
54
test/read.utf8.test.js
Normal file
@ -0,0 +1,54 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetReadObjects } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/utils.js'
|
||||
|
||||
describe('parquetRead utf8', () => {
  // Values stored in each of the four rows; every column repeats the same word.
  const words = ['alpha', 'bravo', 'charlie', 'delta']

  // Expected rows when every column is returned as a string.
  const allStrings = words.map(word => ({ bytes: word, c_utf8: word, l_utf8: word }))

  // Expected rows when the `bytes` column is returned as raw bytes
  // while `c_utf8` and `l_utf8` are still returned as strings.
  const rawBytes = words.map(word => ({
    bytes: Uint8Array.from(word, ch => ch.charCodeAt(0)),
    c_utf8: word,
    l_utf8: word,
  }))

  /**
   * Read all rows of the strings fixture as objects.
   *
   * @param {object} [options] extra options passed to parquetReadObjects alongside the file
   * @returns {Promise<object[]>} the rows read from the fixture
   */
  async function readStrings(options = {}) {
    const file = await asyncBufferFromFile('test/files/strings.parquet')
    return parquetReadObjects({ file, ...options })
  }

  it('default utf8 behavior', async () => {
    expect(await readStrings()).toEqual(allStrings)
  })

  it('utf8 = true', async () => {
    expect(await readStrings({ utf8: true })).toEqual(allStrings)
  })

  it('utf8 = false', async () => {
    expect(await readStrings({ utf8: false })).toEqual(rawBytes)
  })
})
|
||||
Loading…
Reference in New Issue
Block a user