Fix isRequired and add parquet-testing impala test

This commit is contained in:
Kenny Daniel 2024-02-16 16:07:09 -08:00
parent 5f4e2ffe59
commit d02c68e883
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
6 changed files with 570 additions and 8 deletions

@ -39,9 +39,7 @@ export function schemaElement(schema, name) {
// traverse the tree to find the element
for (const part of name) {
const child = tree.children.find(child => child.element.name === part)
if (!child) {
throw new Error(`parquet schema element not found: ${name}`)
}
if (!child) throw new Error(`parquet schema element not found: ${name}`)
tree = child
}
return tree.element
@ -49,13 +47,24 @@ export function schemaElement(schema, name) {
/**
 * Check if the schema element at the given path is required.
 *
 * An element counts as required only when it and every ancestor along its
 * path (the schema root itself is not checked) have repetition_type
 * 'REQUIRED'. An empty path therefore yields true.
 *
 * @param {SchemaElement[]} schema
 * @param {string[]} name path to the element
 * @returns {boolean} true if the element is required
 * @throws {Error} if any path segment cannot be found in the schema tree
 */
export function isRequired(schema, name) {
  /** @type {SchemaTree | undefined} */
  let tree = schemaTree(schema, 0)
  for (const part of name) {
    // Descend one level: find the child matching this path segment
    tree = tree.children.find(child => child.element.name === part)
    if (!tree) throw new Error(`parquet schema element not found: ${name}`)
    // A single non-REQUIRED element anywhere on the path makes the leaf nullable
    if (tree.element.repetition_type !== 'REQUIRED') {
      return false
    }
  }
  return true
}
/**

@ -0,0 +1,34 @@
[
[
8,
[],
[
-1,
-2,
null
],
[],
[],
[
null,
{
"0": 107,
"1": 49
},
null,
null
],
[
null,
1,
null,
null
],
-1,
[],
[],
[],
[],
[]
]
]

@ -0,0 +1,519 @@
{
"version": 1,
"created_by": "parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)",
"key_value_metadata": [
{
"key": "parquet.avro.schema",
"value": "{\"type\":\"record\",\"name\":\"ComplexTypesTbl\",\"namespace\":\"org.apache.impala\",\"fields\":[{\"name\":\"ID\",\"type\":\"long\"},{\"name\":\"Int_Array\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"int_array_array\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"array\",\"items\":\"int\"}}},{\"name\":\"Int_Map\",\"type\":{\"type\":\"map\",\"values\":\"int\"}},{\"name\":\"int_map_array\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"map\",\"values\":\"int\"}}},{\"name\":\"nested_Struct\",\"type\":{\"type\":\"record\",\"name\":\"r1\",\"fields\":[{\"name\":\"a\",\"type\":\"int\"},{\"name\":\"B\",\"type\":{\"type\":\"array\",\"items\":\"int\"}},{\"name\":\"c\",\"type\":{\"type\":\"record\",\"name\":\"r2\",\"fields\":[{\"name\":\"D\",\"type\":{\"type\":\"array\",\"items\":{\"type\":\"array\",\"items\":{\"type\":\"record\",\"name\":\"r3\",\"fields\":[{\"name\":\"e\",\"type\":\"int\"},{\"name\":\"f\",\"type\":\"string\"}]}}}}]}},{\"name\":\"G\",\"type\":{\"type\":\"map\",\"values\":{\"type\":\"record\",\"name\":\"r4\",\"fields\":[{\"name\":\"h\",\"type\":{\"type\":\"record\",\"name\":\"r5\",\"fields\":[{\"name\":\"i\",\"type\":{\"type\":\"array\",\"items\":\"double\"}}]}}]}}}]}}]}"
}
],
"metadata_length": 2544,
"num_rows": 1,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 4,
"encodings": [ 0, 4 ],
"num_values": 1,
"path_in_schema": [ "ID" ],
"statistics": {
"max": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"min": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"null_count": 0
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 2
}
},
{
"file_offset": 53,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 53,
"encodings": [ 0, 3 ],
"num_values": 1,
"path_in_schema": [ "Int_Array", "list", "element" ],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"null_count": 0
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 1
}
},
{
"file_offset": 102,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 102,
"encodings": [ 0, 3 ],
"num_values": 3,
"path_in_schema": [
"int_array_array",
"list",
"element",
"list",
"element"
],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"null_count": 1
},
"total_compressed_size": 55,
"total_uncompressed_size": 55,
"type": 1
}
},
{
"file_offset": 157,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 157,
"encodings": [ 0, 3 ],
"num_values": 1,
"path_in_schema": [ "Int_Map", "map", "key" ],
"statistics": {
"max": "k1",
"min": "k1",
"null_count": 0
},
"total_compressed_size": 47,
"total_uncompressed_size": 47,
"type": 6
}
},
{
"file_offset": 204,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 204,
"encodings": [ 0, 3 ],
"num_values": 1,
"path_in_schema": [ "Int_Map", "map", "value" ],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"null_count": 0
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 1
}
},
{
"file_offset": 253,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 253,
"encodings": [ 0, 3 ],
"num_values": 4,
"path_in_schema": [
"int_map_array",
"list",
"element",
"map",
"key"
],
"statistics": {
"max": "k1",
"min": "k1",
"null_count": 3
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 6
}
},
{
"file_offset": 302,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 302,
"encodings": [ 0, 3 ],
"num_values": 4,
"path_in_schema": [
"int_map_array",
"list",
"element",
"map",
"value"
],
"statistics": {
"max": "\u0001\u0000\u0000\u0000",
"min": "\u0001\u0000\u0000\u0000",
"null_count": 3
},
"total_compressed_size": 51,
"total_uncompressed_size": 51,
"type": 1
}
},
{
"file_offset": 353,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 353,
"encodings": [ 0, 4 ],
"num_values": 1,
"path_in_schema": [ "nested_Struct", "a" ],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"null_count": 0
},
"total_compressed_size": 37,
"total_uncompressed_size": 37,
"type": 1
}
},
{
"file_offset": 390,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 390,
"encodings": [ 0, 3 ],
"num_values": 1,
"path_in_schema": [ "nested_Struct", "B", "list", "element" ],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"null_count": 0
},
"total_compressed_size": 49,
"total_uncompressed_size": 49,
"type": 1
}
},
{
"file_offset": 439,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 439,
"encodings": [ 0, 3 ],
"num_values": 1,
"path_in_schema": [
"nested_Struct",
"c",
"D",
"list",
"element",
"list",
"element",
"e"
],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"null_count": 0
},
"total_compressed_size": 51,
"total_uncompressed_size": 51,
"type": 1
}
},
{
"file_offset": 490,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 490,
"encodings": [ 0, 3 ],
"num_values": 1,
"path_in_schema": [
"nested_Struct",
"c",
"D",
"list",
"element",
"list",
"element",
"f"
],
"statistics": {
"max": "nonnullable",
"min": "nonnullable",
"null_count": 0
},
"total_compressed_size": 76,
"total_uncompressed_size": 76,
"type": 6
}
},
{
"file_offset": 566,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 566,
"encodings": [ 0, 3 ],
"num_values": 1,
"path_in_schema": [ "nested_Struct", "G", "map", "key" ],
"statistics": {
"null_count": 1
},
"total_compressed_size": 33,
"total_uncompressed_size": 33,
"type": 6
}
},
{
"file_offset": 599,
"meta_data": {
"codec": "UNCOMPRESSED",
"data_page_offset": 599,
"encodings": [ 0, 3 ],
"num_values": 1,
"path_in_schema": [
"nested_Struct",
"G",
"map",
"value",
"h",
"i",
"list",
"element"
],
"statistics": {
"null_count": 1
},
"total_compressed_size": 35,
"total_uncompressed_size": 35,
"type": 5
}
}
],
"num_rows": 1,
"total_byte_size": 630
}
],
"schema": [
{
"name": "org.apache.impala.ComplexTypesTbl",
"num_children": 6
},
{
"name": "ID",
"repetition_type": "REQUIRED",
"type": 2
},
{
"converted_type": "LIST",
"name": "Int_Array",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "list",
"num_children": 1,
"repetition_type": "REPEATED"
},
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 1
},
{
"converted_type": "LIST",
"name": "int_array_array",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "list",
"num_children": 1,
"repetition_type": "REPEATED"
},
{
"converted_type": "LIST",
"name": "element",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "list",
"num_children": 1,
"repetition_type": "REPEATED"
},
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 1
},
{
"converted_type": "MAP",
"name": "Int_Map",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"converted_type": "MAP_KEY_VALUE",
"name": "map",
"num_children": 2,
"repetition_type": "REPEATED"
},
{
"converted_type": "UTF8",
"name": "key",
"repetition_type": "REQUIRED",
"type": 6
},
{
"name": "value",
"repetition_type": "REQUIRED",
"type": 1
},
{
"converted_type": "LIST",
"name": "int_map_array",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "list",
"num_children": 1,
"repetition_type": "REPEATED"
},
{
"converted_type": "MAP",
"name": "element",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"converted_type": "MAP_KEY_VALUE",
"name": "map",
"num_children": 2,
"repetition_type": "REPEATED"
},
{
"converted_type": "UTF8",
"name": "key",
"repetition_type": "REQUIRED",
"type": 6
},
{
"name": "value",
"repetition_type": "REQUIRED",
"type": 1
},
{
"name": "nested_Struct",
"num_children": 4,
"repetition_type": "REQUIRED"
},
{
"name": "a",
"repetition_type": "REQUIRED",
"type": 1
},
{
"converted_type": "LIST",
"name": "B",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "list",
"num_children": 1,
"repetition_type": "REPEATED"
},
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 1
},
{
"name": "c",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"converted_type": "LIST",
"name": "D",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "list",
"num_children": 1,
"repetition_type": "REPEATED"
},
{
"converted_type": "LIST",
"name": "element",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "list",
"num_children": 1,
"repetition_type": "REPEATED"
},
{
"name": "element",
"num_children": 2,
"repetition_type": "REQUIRED"
},
{
"name": "e",
"repetition_type": "REQUIRED",
"type": 1
},
{
"converted_type": "UTF8",
"name": "f",
"repetition_type": "REQUIRED",
"type": 6
},
{
"converted_type": "MAP",
"name": "G",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"converted_type": "MAP_KEY_VALUE",
"name": "map",
"num_children": 2,
"repetition_type": "REPEATED"
},
{
"converted_type": "UTF8",
"name": "key",
"repetition_type": "REQUIRED",
"type": 6
},
{
"name": "value",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "h",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"converted_type": "LIST",
"name": "i",
"num_children": 1,
"repetition_type": "REQUIRED"
},
{
"name": "list",
"num_children": 1,
"repetition_type": "REPEATED"
},
{
"name": "element",
"repetition_type": "REQUIRED",
"type": 5
}
]
}

Binary file not shown.

@ -10,10 +10,10 @@ describe('parquetMetadata', () => {
for (const file of files) {
if (!file.endsWith('.parquet')) continue
const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
const result = parquetMetadata(arrayBuffer)
const result = toJson(parquetMetadata(arrayBuffer))
const base = file.replace('.parquet', '')
const expected = fileToJson(`test/files/${base}.metadata.json`)
expect(toJson(result)).toEqual(expected)
expect(result, JSON.stringify(result, null, 2)).toEqual(expected)
}
})

@ -4,8 +4,8 @@ import { parquetRead } from '../src/hyparquet.js'
import { toJson } from '../src/toJson.js'
import { fileToAsyncBuffer, fileToJson } from './helpers.js'
describe('parquetMetadataAsync', () => {
it('should parse metadata from all test files', async () => {
describe('parquetRead', () => {
it('should parse data from all test files', async () => {
const files = fs.readdirSync('test/files')
for (const file of files) {
if (!file.endsWith('.parquet')) continue