diff --git a/src/read.js b/src/read.js index 0d2dd99..c2cdc91 100644 --- a/src/read.js +++ b/src/read.js @@ -1,6 +1,7 @@ import { getColumnOffset, readColumn } from './column.js' import { parquetMetadataAsync } from './metadata.js' +import { getColumnName } from './schema.js' /** * Read parquet data rows from a file-like object. @@ -84,7 +85,7 @@ async function readRowGroup(options, rowGroup) { let [groupStartByte, groupEndByte] = [file.byteLength, 0] rowGroup.columns.forEach(({ meta_data: columnMetadata }) => { if (!columnMetadata) throw new Error('parquet column metadata is undefined') - const columnName = columnMetadata.path_in_schema.join('.') + const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema) // skip columns that are not requested if (columns && !columns.includes(columnName)) return @@ -114,7 +115,7 @@ async function readRowGroup(options, rowGroup) { if (!columnMetadata) throw new Error('parquet column metadata is undefined') // skip columns that are not requested - const columnName = columnMetadata.path_in_schema.join('.') + const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema) // skip columns that are not requested if (columns && !columns.includes(columnName)) continue diff --git a/src/schema.js b/src/schema.js index 256d925..c25b535 100644 --- a/src/schema.js +++ b/src/schema.js @@ -118,3 +118,40 @@ export function skipDefinitionBytes(num) { } return byteLength } + +/** + * Get the column name as foo.bar and handle list-like columns. + * @param {SchemaElement[]} schema + * @param {string[]} path + * @returns {string} column name + */ +export function getColumnName(schema, path) { + if (isListLike(schema, path)) { + return path.slice(0, -2).join('.') + } else { + return path.join('.') + } +} + +/** + * Check if a column is list-like. + * + * @param {SchemaElement[]} schemaElements parquet schema elements + * @param {string[]} path column path + * @returns {boolean} true if map-like + */ +function isListLike(schemaElements, path) { + const schema = schemaElement(schemaElements, path.slice(0, -2)) + if (path.length < 3) return false + if (schema.element.converted_type !== 'LIST') return false + if (schema.children.length > 1) return false + + const firstChild = schema.children[0] + if (firstChild.children.length > 1) return false + if (firstChild.element.repetition_type !== 'REPEATED') return false + + const secondChild = firstChild.children[0] + if (secondChild.element.repetition_type !== 'REQUIRED') return false + + return true +} diff --git a/test/read.test.js b/test/read.test.js index 769ea8a..4ff5026 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -59,4 +59,30 @@ describe('parquetRead', () => { }, }) }) + + it('should read a list-like column from a file', async () => { + const asyncBuffer = fileToAsyncBuffer('test/files/datapage_v2.snappy.parquet') + await parquetRead({ + file: asyncBuffer, + columns: ['e'], + onChunk: (rows) => { + expect(toJson(rows)).toEqual({ + columnName: 'e', + columnData: [[1, 2, 3], null, null, [1, 2, 3], [1, 2]], + rowStart: 0, + rowEnd: 5, + }) + }, + onComplete: (rows) => { + /* eslint-disable no-sparse-arrays */ + expect(toJson(rows)).toEqual([ + [,,,, [1, 2, 3]], + [,,,, null], + [,,,, null], + [,,,, [1, 2, 3]], + [,,,, [1, 2]], + ]) + }, + }) + }) })