mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-25 23:06:36 +00:00
Fix isRequired and add parquet-testing impala test
This commit is contained in:
parent
5f4e2ffe59
commit
d02c68e883
@ -39,9 +39,7 @@ export function schemaElement(schema, name) {
|
||||
// traverse the tree to find the element
|
||||
for (const part of name) {
|
||||
const child = tree.children.find(child => child.element.name === part)
|
||||
if (!child) {
|
||||
throw new Error(`parquet schema element not found: ${name}`)
|
||||
}
|
||||
if (!child) throw new Error(`parquet schema element not found: ${name}`)
|
||||
tree = child
|
||||
}
|
||||
return tree.element
|
||||
@ -49,13 +47,24 @@ export function schemaElement(schema, name) {
|
||||
|
||||
/**
|
||||
* Check if the schema element with the given name is required.
|
||||
* An element is required if all of its ancestors are required.
|
||||
*
|
||||
* @param {SchemaElement[]} schema
|
||||
* @param {string[]} name path to the element
|
||||
* @returns {boolean} true if the element is required
|
||||
*/
|
||||
export function isRequired(schema, name) {
|
||||
return schemaElement(schema, name).repetition_type === 'REQUIRED'
|
||||
/** @type {SchemaTree | undefined} */
|
||||
let tree = schemaTree(schema, 0)
|
||||
for (let i = 0; i < name.length; i++) {
|
||||
// Find schema child with the given name
|
||||
tree = tree.children.find(child => child.element.name === name[i])
|
||||
if (!tree) throw new Error(`parquet schema element not found: ${name}`)
|
||||
if (tree.element.repetition_type !== 'REQUIRED') {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
34
test/files/nonnullable.impala.json
Normal file
34
test/files/nonnullable.impala.json
Normal file
@ -0,0 +1,34 @@
|
||||
[
|
||||
[
|
||||
8,
|
||||
[],
|
||||
[
|
||||
-1,
|
||||
-2,
|
||||
null
|
||||
],
|
||||
[],
|
||||
[],
|
||||
[
|
||||
null,
|
||||
{
|
||||
"0": 107,
|
||||
"1": 49
|
||||
},
|
||||
null,
|
||||
null
|
||||
],
|
||||
[
|
||||
null,
|
||||
1,
|
||||
null,
|
||||
null
|
||||
],
|
||||
-1,
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[]
|
||||
]
|
||||
]
|
||||
519
test/files/nonnullable.impala.metadata.json
Normal file
519
test/files/nonnullable.impala.metadata.json
Normal file
@ -0,0 +1,519 @@
|
||||
{
|
||||
"version": 1,
|
||||
"created_by": "parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)",
|
||||
"key_value_metadata": [
|
||||
{
|
||||
"key": "parquet.avro.schema",
|
||||
"value": "{\"type\":\"record\",\"name\":\"ComplexTypesTbl\",\"namespace\":\"org.apache.impala\",\"fields\":[{\"name\":\"ID\",\"type\":\"long\"},{\"name\":\"Int_Array\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"int_array_array\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"array\",\"items\":\"int\"}}},{\"name\":\"Int_Map\",\"type\":{\"type\":\"map\",\"values\":\"int\"}},{\"name\":\"int_map_array\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"map\",\"values\":\"int\"}}},{\"name\":\"nested_Struct\",\"type\":{\"type\":\"record\",\"name\":\"r1\",\"fields\":[{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"B\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"c\",\"type\":{\"type\":\"record\",\"name\":\"r2\",\"fields\":[{\"name\":\"D\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"r3\",\"fields\":[{\"name\":\"e\",\"type\":\"int\"},{\"name\":\"f\",\"type\":\"string\"}]}}}}]}},{\"name\":\"G\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"record\",\"name\":\"r4\",\"fields\":[{\"name\":\"h\",\"type\":{\"type\":\"record\",\"name\":\"r5\",\"fields\":[{\"name\":\"i\",\"type\":{\"type\":\"array\",\"items\":\"double\"}}]}}]}}}]}}]}"
|
||||
}
|
||||
],
|
||||
"metadata_length": 2544,
|
||||
"num_rows": 1,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 4,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 4,
|
||||
"encodings": [ 0, 4 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "ID" ],
|
||||
"statistics": {
|
||||
"max": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
"total_uncompressed_size": 49,
|
||||
"type": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 53,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 53,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "Int_Array", "list", "element" ],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
"total_uncompressed_size": 49,
|
||||
"type": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 102,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 102,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 3,
|
||||
"path_in_schema": [
|
||||
"int_array_array",
|
||||
"list",
|
||||
"element",
|
||||
"list",
|
||||
"element"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"null_count": 1
|
||||
},
|
||||
"total_compressed_size": 55,
|
||||
"total_uncompressed_size": 55,
|
||||
"type": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 157,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 157,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "Int_Map", "map", "key" ],
|
||||
"statistics": {
|
||||
"max": "k1",
|
||||
"min": "k1",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 47,
|
||||
"total_uncompressed_size": 47,
|
||||
"type": 6
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 204,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 204,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "Int_Map", "map", "value" ],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
"total_uncompressed_size": 49,
|
||||
"type": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 253,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 253,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 4,
|
||||
"path_in_schema": [
|
||||
"int_map_array",
|
||||
"list",
|
||||
"element",
|
||||
"map",
|
||||
"key"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "k1",
|
||||
"min": "k1",
|
||||
"null_count": 3
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
"total_uncompressed_size": 49,
|
||||
"type": 6
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 302,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 302,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 4,
|
||||
"path_in_schema": [
|
||||
"int_map_array",
|
||||
"list",
|
||||
"element",
|
||||
"map",
|
||||
"value"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "\u0001\u0000\u0000\u0000",
|
||||
"min": "\u0001\u0000\u0000\u0000",
|
||||
"null_count": 3
|
||||
},
|
||||
"total_compressed_size": 51,
|
||||
"total_uncompressed_size": 51,
|
||||
"type": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 353,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 353,
|
||||
"encodings": [ 0, 4 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "nested_Struct", "a" ],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 37,
|
||||
"total_uncompressed_size": 37,
|
||||
"type": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 390,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 390,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "nested_Struct", "B", "list", "element" ],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
"total_uncompressed_size": 49,
|
||||
"type": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 439,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 439,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [
|
||||
"nested_Struct",
|
||||
"c",
|
||||
"D",
|
||||
"list",
|
||||
"element",
|
||||
"list",
|
||||
"element",
|
||||
"e"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 51,
|
||||
"total_uncompressed_size": 51,
|
||||
"type": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 490,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 490,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [
|
||||
"nested_Struct",
|
||||
"c",
|
||||
"D",
|
||||
"list",
|
||||
"element",
|
||||
"list",
|
||||
"element",
|
||||
"f"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "nonnullable",
|
||||
"min": "nonnullable",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 76,
|
||||
"total_uncompressed_size": 76,
|
||||
"type": 6
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 566,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 566,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "nested_Struct", "G", "map", "key" ],
|
||||
"statistics": {
|
||||
"null_count": 1
|
||||
},
|
||||
"total_compressed_size": 33,
|
||||
"total_uncompressed_size": 33,
|
||||
"type": 6
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 599,
|
||||
"meta_data": {
|
||||
"codec": "UNCOMPRESSED",
|
||||
"data_page_offset": 599,
|
||||
"encodings": [ 0, 3 ],
|
||||
"num_values": 1,
|
||||
"path_in_schema": [
|
||||
"nested_Struct",
|
||||
"G",
|
||||
"map",
|
||||
"value",
|
||||
"h",
|
||||
"i",
|
||||
"list",
|
||||
"element"
|
||||
],
|
||||
"statistics": {
|
||||
"null_count": 1
|
||||
},
|
||||
"total_compressed_size": 35,
|
||||
"total_uncompressed_size": 35,
|
||||
"type": 5
|
||||
}
|
||||
}
|
||||
],
|
||||
"num_rows": 1,
|
||||
"total_byte_size": 630
|
||||
}
|
||||
],
|
||||
"schema": [
|
||||
{
|
||||
"name": "org.apache.impala.ComplexTypesTbl",
|
||||
"num_children": 6
|
||||
},
|
||||
{
|
||||
"name": "ID",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 2
|
||||
},
|
||||
{
|
||||
"converted_type": "LIST",
|
||||
"name": "Int_Array",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"name": "element",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 1
|
||||
},
|
||||
{
|
||||
"converted_type": "LIST",
|
||||
"name": "int_array_array",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"converted_type": "LIST",
|
||||
"name": "element",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"name": "element",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 1
|
||||
},
|
||||
{
|
||||
"converted_type": "MAP",
|
||||
"name": "Int_Map",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"converted_type": "MAP_KEY_VALUE",
|
||||
"name": "map",
|
||||
"num_children": 2,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"converted_type": "UTF8",
|
||||
"name": "key",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 6
|
||||
},
|
||||
{
|
||||
"name": "value",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 1
|
||||
},
|
||||
{
|
||||
"converted_type": "LIST",
|
||||
"name": "int_map_array",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"converted_type": "MAP",
|
||||
"name": "element",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"converted_type": "MAP_KEY_VALUE",
|
||||
"name": "map",
|
||||
"num_children": 2,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"converted_type": "UTF8",
|
||||
"name": "key",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 6
|
||||
},
|
||||
{
|
||||
"name": "value",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 1
|
||||
},
|
||||
{
|
||||
"name": "nested_Struct",
|
||||
"num_children": 4,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "a",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 1
|
||||
},
|
||||
{
|
||||
"converted_type": "LIST",
|
||||
"name": "B",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"name": "element",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 1
|
||||
},
|
||||
{
|
||||
"name": "c",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"converted_type": "LIST",
|
||||
"name": "D",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"converted_type": "LIST",
|
||||
"name": "element",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"name": "element",
|
||||
"num_children": 2,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "e",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 1
|
||||
},
|
||||
{
|
||||
"converted_type": "UTF8",
|
||||
"name": "f",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 6
|
||||
},
|
||||
{
|
||||
"converted_type": "MAP",
|
||||
"name": "G",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"converted_type": "MAP_KEY_VALUE",
|
||||
"name": "map",
|
||||
"num_children": 2,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"converted_type": "UTF8",
|
||||
"name": "key",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 6
|
||||
},
|
||||
{
|
||||
"name": "value",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "h",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"converted_type": "LIST",
|
||||
"name": "i",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REQUIRED"
|
||||
},
|
||||
{
|
||||
"name": "list",
|
||||
"num_children": 1,
|
||||
"repetition_type": "REPEATED"
|
||||
},
|
||||
{
|
||||
"name": "element",
|
||||
"repetition_type": "REQUIRED",
|
||||
"type": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
BIN
test/files/nonnullable.impala.parquet
Normal file
BIN
test/files/nonnullable.impala.parquet
Normal file
Binary file not shown.
@ -10,10 +10,10 @@ describe('parquetMetadata', () => {
|
||||
for (const file of files) {
|
||||
if (!file.endsWith('.parquet')) continue
|
||||
const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
|
||||
const result = parquetMetadata(arrayBuffer)
|
||||
const result = toJson(parquetMetadata(arrayBuffer))
|
||||
const base = file.replace('.parquet', '')
|
||||
const expected = fileToJson(`test/files/${base}.metadata.json`)
|
||||
expect(toJson(result)).toEqual(expected)
|
||||
expect(result, JSON.stringify(result, null, 2)).toEqual(expected)
|
||||
}
|
||||
})
|
||||
|
||||
|
||||
@ -4,8 +4,8 @@ import { parquetRead } from '../src/hyparquet.js'
|
||||
import { toJson } from '../src/toJson.js'
|
||||
import { fileToAsyncBuffer, fileToJson } from './helpers.js'
|
||||
|
||||
describe('parquetMetadataAsync', () => {
|
||||
it('should parse metadata from all test files', async () => {
|
||||
describe('parquetRead', () => {
|
||||
it('should parse data from all test files', async () => {
|
||||
const files = fs.readdirSync('test/files')
|
||||
for (const file of files) {
|
||||
if (!file.endsWith('.parquet')) continue
|
||||
|
||||
Loading…
Reference in New Issue
Block a user