mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-07 11:36:37 +00:00
List-like column names
This commit is contained in:
parent
2d061392b9
commit
76c4278f40
@ -1,6 +1,7 @@
|
||||
|
||||
import { getColumnOffset, readColumn } from './column.js'
|
||||
import { parquetMetadataAsync } from './metadata.js'
|
||||
import { getColumnName } from './schema.js'
|
||||
|
||||
/**
|
||||
* Read parquet data rows from a file-like object.
|
||||
@ -84,7 +85,7 @@ async function readRowGroup(options, rowGroup) {
|
||||
let [groupStartByte, groupEndByte] = [file.byteLength, 0]
|
||||
rowGroup.columns.forEach(({ meta_data: columnMetadata }) => {
|
||||
if (!columnMetadata) throw new Error('parquet column metadata is undefined')
|
||||
const columnName = columnMetadata.path_in_schema.join('.')
|
||||
const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema)
|
||||
// skip columns that are not requested
|
||||
if (columns && !columns.includes(columnName)) return
|
||||
|
||||
@ -114,7 +115,7 @@ async function readRowGroup(options, rowGroup) {
|
||||
if (!columnMetadata) throw new Error('parquet column metadata is undefined')
|
||||
|
||||
// skip columns that are not requested
|
||||
const columnName = columnMetadata.path_in_schema.join('.')
|
||||
const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema)
|
||||
// skip columns that are not requested
|
||||
if (columns && !columns.includes(columnName)) continue
|
||||
|
||||
|
||||
@ -118,3 +118,40 @@ export function skipDefinitionBytes(num) {
|
||||
}
|
||||
return byteLength
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the column name as foo.bar and handle list-like columns.
|
||||
* @param {SchemaElement[]} schema
|
||||
* @param {string[]} path
|
||||
* @returns {string} column name
|
||||
*/
|
||||
export function getColumnName(schema, path) {
|
||||
if (isListLike(schema, path)) {
|
||||
return path.slice(0, -2).join('.')
|
||||
} else {
|
||||
return path.join('.')
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a column is list-like.
|
||||
*
|
||||
* @param {SchemaElement[]} schemaElements parquet schema elements
|
||||
* @param {string[]} path column path
|
||||
* @returns {boolean} true if map-like
|
||||
*/
|
||||
function isListLike(schemaElements, path) {
|
||||
const schema = schemaElement(schemaElements, path.slice(0, -2))
|
||||
if (path.length < 3) return false
|
||||
if (schema.element.converted_type !== 'LIST') return false
|
||||
if (schema.children.length > 1) return false
|
||||
|
||||
const firstChild = schema.children[0]
|
||||
if (firstChild.children.length > 1) return false
|
||||
if (firstChild.element.repetition_type !== 'REPEATED') return false
|
||||
|
||||
const secondChild = firstChild.children[0]
|
||||
if (secondChild.element.repetition_type !== 'REQUIRED') return false
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
@ -59,4 +59,30 @@ describe('parquetRead', () => {
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
it('should read a list-like column from a file', async () => {
|
||||
const asyncBuffer = fileToAsyncBuffer('test/files/datapage_v2.snappy.parquet')
|
||||
await parquetRead({
|
||||
file: asyncBuffer,
|
||||
columns: ['e'],
|
||||
onChunk: (rows) => {
|
||||
expect(toJson(rows)).toEqual({
|
||||
columnName: 'e',
|
||||
columnData: [[1, 2, 3], null, null, [1, 2, 3], [1, 2]],
|
||||
rowStart: 0,
|
||||
rowEnd: 5,
|
||||
})
|
||||
},
|
||||
onComplete: (rows) => {
|
||||
/* eslint-disable no-sparse-arrays */
|
||||
expect(toJson(rows)).toEqual([
|
||||
[,,,, [1, 2, 3]],
|
||||
[,,,, null],
|
||||
[,,,, null],
|
||||
[,,,, [1, 2, 3]],
|
||||
[,,,, [1, 2]],
|
||||
])
|
||||
},
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user