mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-05 22:41:55 +00:00
repeated_no_annotation.parquet
This commit is contained in:
parent
b293b77bcd
commit
70387fa345
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,4 +3,5 @@ package-lock.json
|
||||
coverage
|
||||
*.tgz
|
||||
.vscode
|
||||
.DS_Store
|
||||
/*.parquet
|
||||
|
||||
@ -158,7 +158,8 @@ export function assembleNested(subcolumnData, schema, depth = 0) {
|
||||
subcolumnData.delete(child.path.join('.'))
|
||||
}
|
||||
// invert struct by depth
|
||||
const inverted = invertStruct(struct, nextDepth)
|
||||
const invertDepth = schema.element.repetition_type === 'REQUIRED' ? depth : depth + 1
|
||||
const inverted = invertStruct(struct, invertDepth)
|
||||
if (optional) flattenAtDepth(inverted, depth)
|
||||
subcolumnData.set(path, inverted)
|
||||
return
|
||||
|
||||
@ -1,14 +0,0 @@
|
||||
/**
|
||||
* @param {import('./types.d.ts').DataReader} reader
|
||||
* @param {number} nValues
|
||||
* @param {Float32Array | Float64Array} output
|
||||
*/
|
||||
export function byteStreamSplit(reader, nValues, output) {
|
||||
const byteWidth = output instanceof Float32Array ? 4 : 8
|
||||
const bytes = new Uint8Array(output.buffer)
|
||||
for (let b = 0; b < byteWidth; b++) {
|
||||
for (let i = 0; i < nValues; i++) {
|
||||
bytes[i * byteWidth + b] = reader.view.getUint8(reader.offset++)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,5 +1,4 @@
|
||||
import { byteStreamSplit } from './byteStreamSplit.js'
|
||||
import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
|
||||
import { byteStreamSplit, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
|
||||
import { readPlain } from './plain.js'
|
||||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired } from './schema.js'
|
||||
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
import { byteStreamSplit } from './byteStreamSplit.js'
|
||||
import { decompressPage } from './column.js'
|
||||
import { deltaBinaryUnpack, deltaByteArray, deltaLengthByteArray } from './delta.js'
|
||||
import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
|
||||
import { byteStreamSplit, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
|
||||
import { readPlain } from './plain.js'
|
||||
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
|
||||
|
||||
|
||||
@ -125,3 +125,18 @@ function readBitPacked(reader, header, bitWidth, values, seen) {
|
||||
|
||||
return seen
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {DataReader} reader
|
||||
* @param {number} nValues
|
||||
* @param {Float32Array | Float64Array} output
|
||||
*/
|
||||
export function byteStreamSplit(reader, nValues, output) {
|
||||
const byteWidth = output instanceof Float32Array ? 4 : 8
|
||||
const bytes = new Uint8Array(output.buffer)
|
||||
for (let b = 0; b < byteWidth; b++) {
|
||||
for (let i = 0; i < nValues; i++) {
|
||||
bytes[i * byteWidth + b] = reader.view.getUint8(reader.offset++)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -38,9 +38,8 @@ export async function parquetRead(options) {
|
||||
options.metadata ||= await parquetMetadataAsync(options.file)
|
||||
if (!options.metadata) throw new Error('parquet metadata not found')
|
||||
|
||||
const { metadata, onComplete } = options
|
||||
const { metadata, onComplete, rowEnd } = options
|
||||
const rowStart = options.rowStart || 0
|
||||
const rowEnd = options.rowEnd || Number(metadata.num_rows)
|
||||
/** @type {any[][]} */
|
||||
const rowData = []
|
||||
|
||||
@ -50,13 +49,13 @@ export async function parquetRead(options) {
|
||||
// number of rows in this row group
|
||||
const groupRows = Number(rowGroup.num_rows)
|
||||
// if row group overlaps with row range, read it
|
||||
if (groupStart + groupRows >= rowStart && groupStart < rowEnd) {
|
||||
if (groupStart + groupRows >= rowStart && (rowEnd === undefined || groupStart < rowEnd)) {
|
||||
// read row group
|
||||
const groupData = await readRowGroup(options, rowGroup, groupStart)
|
||||
if (onComplete) {
|
||||
// filter to rows in range
|
||||
const start = Math.max(rowStart - groupStart, 0)
|
||||
const end = Math.min(rowEnd - groupStart, groupRows)
|
||||
const end = rowEnd === undefined ? undefined : rowEnd - groupStart
|
||||
concat(rowData, groupData.slice(start, end))
|
||||
}
|
||||
}
|
||||
|
||||
8
test/files/repeated_no_annotation.json
Normal file
8
test/files/repeated_no_annotation.json
Normal file
@ -0,0 +1,8 @@
|
||||
[
|
||||
[1, null],
|
||||
[2, null],
|
||||
[3, {"phone": []}],
|
||||
[4, {"phone": [{"number":5555555555,"kind":null}]}],
|
||||
[5, {"phone": [{"number":1111111111,"kind":"home"}]}],
|
||||
[6, {"phone": [{"number":1111111111,"kind":"home"},{"number":2222222222,"kind":null},{"number":3333333333,"kind":"mobile"}]}]
|
||||
]
|
||||
107
test/files/repeated_no_annotation.metadata.json
Normal file
107
test/files/repeated_no_annotation.metadata.json
Normal file
@ -0,0 +1,107 @@
|
||||
{
|
||||
"version": 1,
|
||||
"schema": [
|
||||
{
|
||||
"name": "user",
|
||||
"num_children": 2
|
||||
},
|
||||
{
|
||||
"type": "INT32",
|
||||
"repetition_type": "REQUIRED",
|
||||
"name": "id"
|
||||
},
|
||||
{
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "phoneNumbers",
|
||||
"num_children": 1
|
||||
},
|
||||
{
|
||||
"repetition_type": "REPEATED",
|
||||
"name": "phone",
|
||||
"num_children": 2
|
||||
},
|
||||
{
|
||||
"type": "INT64",
|
||||
"repetition_type": "REQUIRED",
|
||||
"name": "number"
|
||||
},
|
||||
{
|
||||
"type": "BYTE_ARRAY",
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "kind",
|
||||
"converted_type": "UTF8"
|
||||
}
|
||||
],
|
||||
"num_rows": 0,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 64,
|
||||
"meta_data": {
|
||||
"type": "INT32",
|
||||
"encodings": [
|
||||
"PLAIN",
|
||||
"RLE_DICTIONARY"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"id"
|
||||
],
|
||||
"codec": "UNCOMPRESSED",
|
||||
"num_values": 6,
|
||||
"total_uncompressed_size": 60,
|
||||
"total_compressed_size": 60,
|
||||
"data_page_offset": 42,
|
||||
"dictionary_page_offset": 4
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 173,
|
||||
"meta_data": {
|
||||
"type": "INT64",
|
||||
"encodings": [
|
||||
"PLAIN",
|
||||
"RLE_DICTIONARY"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"phoneNumbers",
|
||||
"phone",
|
||||
"number"
|
||||
],
|
||||
"codec": "UNCOMPRESSED",
|
||||
"num_values": 8,
|
||||
"total_uncompressed_size": 80,
|
||||
"total_compressed_size": 80,
|
||||
"data_page_offset": 139,
|
||||
"dictionary_page_offset": 93
|
||||
}
|
||||
},
|
||||
{
|
||||
"file_offset": 294,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN",
|
||||
"RLE_DICTIONARY"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"phoneNumbers",
|
||||
"phone",
|
||||
"kind"
|
||||
],
|
||||
"codec": "UNCOMPRESSED",
|
||||
"num_values": 8,
|
||||
"total_uncompressed_size": 65,
|
||||
"total_compressed_size": 65,
|
||||
"data_page_offset": 261,
|
||||
"dictionary_page_offset": 229
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 205,
|
||||
"num_rows": 6
|
||||
}
|
||||
],
|
||||
"created_by": "parquet-rs version 0.3.0 (build b45ce7cba2199f22d93269c150d8a83916c69b5e)",
|
||||
"metadata_length": 306
|
||||
}
|
||||
BIN
test/files/repeated_no_annotation.parquet
Normal file
BIN
test/files/repeated_no_annotation.parquet
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user