List-like column names

This commit is contained in:
Kenny Daniel 2024-03-14 16:39:03 -07:00
parent 2d061392b9
commit 76c4278f40
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
3 changed files with 66 additions and 2 deletions

@ -1,6 +1,7 @@
import { getColumnOffset, readColumn } from './column.js'
import { parquetMetadataAsync } from './metadata.js'
import { getColumnName } from './schema.js'
/**
* Read parquet data rows from a file-like object.
@ -84,7 +85,7 @@ async function readRowGroup(options, rowGroup) {
let [groupStartByte, groupEndByte] = [file.byteLength, 0]
rowGroup.columns.forEach(({ meta_data: columnMetadata }) => {
if (!columnMetadata) throw new Error('parquet column metadata is undefined')
const columnName = columnMetadata.path_in_schema.join('.')
const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema)
// skip columns that are not requested
if (columns && !columns.includes(columnName)) return
@ -114,7 +115,7 @@ async function readRowGroup(options, rowGroup) {
if (!columnMetadata) throw new Error('parquet column metadata is undefined')
// skip columns that are not requested
const columnName = columnMetadata.path_in_schema.join('.')
const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema)
// skip columns that are not requested
if (columns && !columns.includes(columnName)) continue

@ -118,3 +118,40 @@ export function skipDefinitionBytes(num) {
}
return byteLength
}
/**
* Get the column name as foo.bar and handle list-like columns.
* @param {SchemaElement[]} schema
* @param {string[]} path
* @returns {string} column name
*/
export function getColumnName(schema, path) {
if (isListLike(schema, path)) {
return path.slice(0, -2).join('.')
} else {
return path.join('.')
}
}
/**
* Check if a column is list-like.
*
* @param {SchemaElement[]} schemaElements parquet schema elements
* @param {string[]} path column path
* @returns {boolean} true if map-like
*/
function isListLike(schemaElements, path) {
const schema = schemaElement(schemaElements, path.slice(0, -2))
if (path.length < 3) return false
if (schema.element.converted_type !== 'LIST') return false
if (schema.children.length > 1) return false
const firstChild = schema.children[0]
if (firstChild.children.length > 1) return false
if (firstChild.element.repetition_type !== 'REPEATED') return false
const secondChild = firstChild.children[0]
if (secondChild.element.repetition_type !== 'REQUIRED') return false
return true
}

@ -59,4 +59,30 @@ describe('parquetRead', () => {
},
})
})
it('should read a list-like column from a file', async () => {
const asyncBuffer = fileToAsyncBuffer('test/files/datapage_v2.snappy.parquet')
await parquetRead({
file: asyncBuffer,
columns: ['e'],
onChunk: (rows) => {
expect(toJson(rows)).toEqual({
columnName: 'e',
columnData: [[1, 2, 3], null, null, [1, 2, 3], [1, 2]],
rowStart: 0,
rowEnd: 5,
})
},
onComplete: (rows) => {
/* eslint-disable no-sparse-arrays */
expect(toJson(rows)).toEqual([
[,,,, [1, 2, 3]],
[,,,, null],
[,,,, null],
[,,,, [1, 2, 3]],
[,,,, [1, 2]],
])
},
})
})
})