dict-page-offset-zero.parquet

This commit is contained in:
Kenny Daniel 2024-05-21 22:50:50 -07:00
parent 4f7791354c
commit 5eeb05da40
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 140 additions and 5 deletions

@ -28,14 +28,14 @@ import { concat } from './utils.js'
export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, schemaPath, compressors) {
/** @type {ArrayLike<any> | undefined} */
let dictionary = undefined
let valuesSeen = 0
let seen = 0
/** @type {any[]} */
const rowData = []
const { element } = schemaPath[schemaPath.length - 1]
// column reader:
const reader = { view: new DataView(arrayBuffer, columnOffset), offset: 0 }
while (valuesSeen < rowGroup.num_rows) {
while (seen < rowGroup.num_rows) {
// parse column header
const header = parquetHeader(reader)
if (header.compressed_page_size === undefined) {
@ -58,7 +58,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec, compressors
)
const { definitionLevels, repetitionLevels, dataPage } = readDataPage(page, daph, schemaPath, columnMetadata)
valuesSeen += daph.num_values
seen += daph.num_values
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
// construct output values: skip nulls and construct lists
@ -89,7 +89,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
const { definitionLevels, repetitionLevels, dataPage } = readDataPageV2(
compressedBytes, header, schemaPath, columnMetadata, compressors
)
valuesSeen += daph2.num_values
seen += daph2.num_values
dereferenceDictionary(dictionary, dataPage)
values = convert(dataPage, element)
@ -145,7 +145,7 @@ function dereferenceDictionary(dictionary, dataPage) {
*/
export function getColumnOffset({ dictionary_page_offset, data_page_offset }) {
let columnOffset = dictionary_page_offset
if (dictionary_page_offset === undefined || data_page_offset < dictionary_page_offset) {
if (!dictionary_page_offset || data_page_offset < dictionary_page_offset) {
columnOffset = data_page_offset
}
return Number(columnOffset)

@ -0,0 +1,41 @@
[
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552],
[1552]
]

@ -0,0 +1,94 @@
{
"version": 1,
"schema": [
{
"name": "root",
"num_children": 1
},
{
"type": "INT32",
"repetition_type": "OPTIONAL",
"name": "l_partkey"
}
],
"num_rows": 39,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"type": "INT32",
"encodings": [
"PLAIN",
"BIT_PACKED",
"RLE"
],
"path_in_schema": [
"l_partkey"
],
"codec": "SNAPPY",
"num_values": 39,
"total_uncompressed_size": 180,
"total_compressed_size": 40,
"data_page_offset": 4,
"dictionary_page_offset": 0,
"statistics": {
"max": 1552,
"min": 1552,
"null_count": 0,
"max_value": 1552,
"min_value": 1552
},
"encoding_stats": [
{
"page_type": 0,
"encoding": "PLAIN",
"count": 1
}
],
"bloom_filter_length": [
{
"field_1": {
"field_1": 0,
"field_2": 162,
"field_3": 22,
"field_5": {
"field_1": 39,
"field_2": 0,
"field_3": 3,
"field_4": 4
}
},
"field_2": 22
}
]
},
"offset_index_offset": 67,
"offset_index_length": 10,
"column_index_offset": 44,
"column_index_length": 23,
"crypto_metadata": 23
}
],
"total_byte_size": 180,
"num_rows": 39
}
],
"key_value_metadata": [
{
"key": "is.date.correct",
"value": "true"
},
{
"key": "dremio.arrow.schema.2.1",
"value": "{\n \"fields\" : [ {\n \"name\" : \"l_partkey\",\n \"nullable\" : true,\n \"type\" : {\n \"name\" : \"int\",\n \"bitWidth\" : 32,\n \"isSigned\" : true\n },\n \"children\" : [ ]\n } ]\n}"
},
{
"key": "dremio.version",
"value": "3.2.0-201905102005330382-0598733"
}
],
"created_by": "parquet-mr version 1.12.0-201812210311360288-a86293f (build cec1a483e9dcd545e09170ae787d3dcb13744433)",
"metadata_length": 550
}

Binary file not shown.