Mirror of https://github.com/asadbek064/hyparquet.git (synced 2025-12-05 22:41:55 +00:00)

Commit b8e4496063: Upgrade dataPage to match dictionary type
Parent: c4ad05e580
src/column.js

@@ -1,5 +1,5 @@
 import { assembleLists } from './assemble.js'
-import { convert } from './convert.js'
+import { convert, dereferenceDictionary } from './convert.js'
 import { readDataPage, readDictionaryPage } from './datapage.js'
 import { readDataPageV2 } from './datapageV2.js'
 import { parquetHeader } from './header.js'
@@ -11,6 +11,7 @@ import { concat } from './utils.js'
  * Parse column data from a buffer.
  *
  * @typedef {import('./types.js').ColumnMetaData} ColumnMetaData
+ * @typedef {import('./types.js').DecodedArray} DecodedArray
  * @param {import('./types.js').DataReader} reader
  * @param {import('./types.js').RowGroup} rowGroup row group metadata
  * @param {ColumnMetaData} columnMetadata column metadata
@@ -20,7 +21,7 @@ import { concat } from './utils.js'
  */
 export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compressors, utf8 }) {
   const { element } = schemaPath[schemaPath.length - 1]
-  /** @type {ArrayLike<any> | undefined} */
+  /** @type {DecodedArray | undefined} */
   let dictionary = undefined
   let seen = 0
   /** @type {any[]} */
@@ -49,8 +50,8 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
       // assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))

       // construct output values: skip nulls and construct lists
-      dereferenceDictionary(dictionary, dataPage)
-      values = convert(dataPage, element, utf8)
+      values = dereferenceDictionary(dictionary, dataPage)
+      values = convert(values, element, utf8)
       if (repetitionLevels.length || definitionLevels?.length) {
         // Use repetition levels to construct lists
         const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
@@ -78,8 +79,8 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
       )
       seen += daph2.num_values

-      dereferenceDictionary(dictionary, dataPage)
-      values = convert(dataPage, element, utf8)
+      values = dereferenceDictionary(dictionary, dataPage)
+      values = convert(values, element, utf8)
       if (repetitionLevels.length || definitionLevels?.length) {
         // Use repetition levels to construct lists
         const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
@@ -109,21 +110,6 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
   return rowData
 }

-/**
- * Map data to dictionary values in place.
- *
- * @typedef {import('./types.js').DecodedArray} DecodedArray
- * @param {ArrayLike<any> | undefined} dictionary
- * @param {DecodedArray} dataPage
- */
-function dereferenceDictionary(dictionary, dataPage) {
-  if (dictionary) {
-    for (let i = 0; i < dataPage.length; i++) {
-      dataPage[i] = dictionary[dataPage[i]]
-    }
-  }
-}
-
 /**
  * Find the start byte offset for a column chunk.
  *
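Note: both data-page branches (v1 and v2) now share the same dereference-then-convert sequence. A paraphrased sketch of the new flow, using the variable names from the diff above (this restates the change, it is not additional source):

  // dereferenceDictionary can return a brand-new array when it has to
  // upgrade a Uint8Array index page, so its return value is captured
  // instead of relying on in-place mutation as before
  values = dereferenceDictionary(dictionary, dataPage)
  values = convert(values, element, utf8)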
src/convert.js

@@ -97,3 +97,25 @@ export function parseFloat16(bytes) {
   if (exp === 0x1f) return frac ? NaN : sign * Infinity
   return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
 }
+
+/**
+ * Map data to dictionary values in place.
+ *
+ * @param {DecodedArray | undefined} dictionary
+ * @param {DecodedArray} dataPage
+ * @returns {DecodedArray}
+ */
+export function dereferenceDictionary(dictionary, dataPage) {
+  let output = dataPage
+  if (dictionary) {
+    if (dataPage instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
+      // upgrade dataPage to match dictionary type
+      // @ts-expect-error not my fault typescript doesn't understand constructors
+      output = new dictionary.constructor(dataPage.length)
+    }
+    for (let i = 0; i < dataPage.length; i++) {
+      output[i] = dictionary[dataPage[i]]
+    }
+  }
+  return output
+}
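Why the upgrade matters, as a minimal standalone sketch. The Block/Intersection values are borrowed from the addrtype fixture deleted later in this commit, and the import path is an assumption for illustration:

  import { dereferenceDictionary } from './src/convert.js'

  const dictionary = ['Block', 'Intersection']   // decoded BYTE_ARRAY dictionary
  const dataPage = new Uint8Array([0, 1, 0, 0])  // dictionary indexes for four rows

  // The old in-place version did dataPage[i] = dictionary[dataPage[i]], but
  // assigning a string into a Uint8Array coerces it to a number
  // ('Block' -> NaN -> 0), silently destroying the values. The new version
  // allocates output via dictionary.constructor when the types differ:
  const values = dereferenceDictionary(dictionary, dataPage)
  console.log(values)  // [ 'Block', 'Intersection', 'Block', 'Block' ]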
src/datapage.js

@@ -54,13 +54,11 @@ export function readDataPage(bytes, daph, schemaPath, { type }) {
 }

 /**
  * Read a page containing dictionary data.
  *
  * @param {Uint8Array} bytes raw page data
  * @param {import("./types.d.ts").DictionaryPageHeader} diph dictionary page header
  * @param {ColumnMetaData} columnMetadata
  * @param {number | undefined} typeLength - type_length from schema
- * @returns {ArrayLike<any>} array of values
+ * @returns {DecodedArray}
  */
 export function readDictionaryPage(bytes, diph, columnMetadata, typeLength) {
   const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
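The looser ArrayLike<any> annotation goes away because dereferenceDictionary now needs a writable array that also exposes its constructor. As a rough sketch of what DecodedArray covers (the exact union lives in src/types.d.ts; this list is an assumption, not the repo's definition):

  /** @typedef {Uint8Array | Int32Array | BigInt64Array | Float32Array | Float64Array | any[]} DecodedArray */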
test/convert.test.js

@@ -111,7 +111,8 @@ describe('convert function', () => {
 })

 describe('parseFloat16', () => {
-  it('should convert numbers', () => {
+  it('convert float16 numbers', () => {
+    expect(parseFloat16(undefined)).toBe(undefined)
     expect(parseFloat16(new Uint8Array([0x00, 0xbc]))).toBe(-1)
     expect(parseFloat16(new Uint8Array([0x00, 0x00]))).toBe(0)
     expect(parseFloat16(new Uint8Array([0x00, 0x38]))).toBe(0.5)
@@ -119,22 +120,22 @@ describe('parseFloat16', () => {
     expect(parseFloat16(new Uint8Array([0x00, 0x40]))).toBe(2)
   })

-  it('should convert -0', () => {
+  it('convert float16 -0', () => {
     expect(parseFloat16(new Uint8Array([0x00, 0x80]))).toBe(-0)
     expect(parseFloat16(new Uint8Array([0x00, 0x80]))).not.toBe(0)
   })

-  it('should convert Infinity', () => {
+  it('convert float16 Infinity', () => {
     expect(parseFloat16(new Uint8Array([0x00, 0x7c]))).toBe(Infinity)
     expect(parseFloat16(new Uint8Array([0x00, 0xfc]))).toBe(-Infinity)
   })

-  it('should convert NaN', () => {
+  it('convert float16 NaN', () => {
     expect(parseFloat16(new Uint8Array([0x00, 0x7e]))).toBeNaN()
     expect(parseFloat16(new Uint8Array([0x01, 0x7e]))).toBeNaN()
   })

-  it('should convert a subnormal number', () => {
+  it('convert float16 subnormal number', () => {
     expect(parseFloat16(new Uint8Array([0xff, 0x03])))
       .toBeCloseTo(Math.pow(2, -14) * (1023 / 1024), 5)
   })
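A worked example for one case above, following the parseFloat16 formula visible in the convert.js hunk. The bytes are little-endian, so the second byte is the high byte:

  const u16 = 0x00 | 0x38 << 8                // 0x3800
  const sign = u16 >> 15 ? -1 : 1             // high bit 0, so +1
  const exp = u16 >> 10 & 0x1f                // 0b01110 = 14
  const frac = u16 & 0x3ff                    // 0
  const value = sign * 2 ** (exp - 15) * (1 + frac / 1024)  // 2 ** -1 = 0.5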
Deleted file:
@@ -1,12 +0,0 @@
-[
-  [ "Block" ],
-  [ "Intersection" ],
-  [ "Block" ],
-  [ "Block" ],
-  [ null ],
-  [ "Block" ],
-  [ "Intersection" ],
-  [ "Block" ],
-  [ "Block" ],
-  [ "Intersection" ]
-]
Deleted file:
@@ -1,50 +0,0 @@
-{
-  "version": 1,
-  "created_by": "DuckDB",
-  "metadata_length": 149,
-  "schema": [
-    {
-      "repetition_type": "REQUIRED",
-      "name": "duckdb_schema",
-      "num_children": 1
-    },
-    {
-      "type": "BYTE_ARRAY",
-      "repetition_type": "OPTIONAL",
-      "name": "ADDRTYPE",
-      "converted_type": "UTF8"
-    }
-  ],
-  "num_rows": 10,
-  "row_groups": [
-    {
-      "columns": [
-        {
-          "file_offset": 0,
-          "meta_data": {
-            "type": "BYTE_ARRAY",
-            "encodings": ["PLAIN", "RLE_DICTIONARY"],
-            "path_in_schema": ["ADDRTYPE"],
-            "codec": "SNAPPY",
-            "num_values": 10,
-            "total_uncompressed_size": 78,
-            "total_compressed_size": 82,
-            "data_page_offset": 31,
-            "dictionary_page_offset": 4,
-            "statistics": {
-              "max": "Intersection",
-              "min": "Block",
-              "max_value": "Intersection",
-              "min_value": "Block",
-              "null_count": 1,
-              "distinct_count": 2
-            }
-          }
-        }
-      ],
-      "file_offset": 4,
-      "total_byte_size": 33024,
-      "num_rows": 10
-    }
-  ]
-}
Deleted binary file (not shown).

test/files/plain-dict-uncompressed-checksum.json (new file, 1002 lines): diff suppressed because it is too large.
test/files/plain-dict-uncompressed-checksum.metadata.json (new file, 125 lines)
@@ -0,0 +1,125 @@
+{
+  "version": 1,
+  "schema": [
+    {
+      "name": "m",
+      "num_children": 2
+    },
+    {
+      "type": "INT64",
+      "repetition_type": "REQUIRED",
+      "name": "long_field"
+    },
+    {
+      "type": "BYTE_ARRAY",
+      "repetition_type": "REQUIRED",
+      "name": "binary_field"
+    }
+  ],
+  "num_rows": 1000,
+  "row_groups": [
+    {
+      "columns": [
+        {
+          "file_offset": 31,
+          "meta_data": {
+            "type": "INT64",
+            "encodings": [
+              "PLAIN_DICTIONARY",
+              "BIT_PACKED"
+            ],
+            "path_in_schema": [
+              "long_field"
+            ],
+            "codec": "UNCOMPRESSED",
+            "num_values": 1000,
+            "total_uncompressed_size": 54,
+            "total_compressed_size": 54,
+            "data_page_offset": 31,
+            "dictionary_page_offset": 4,
+            "statistics": {
+              "max": 0,
+              "min": 0,
+              "null_count": 0,
+              "max_value": 0,
+              "min_value": 0
+            },
+            "encoding_stats": [
+              {
+                "page_type": 2,
+                "encoding": "PLAIN_DICTIONARY",
+                "count": 1
+              },
+              {
+                "page_type": 0,
+                "encoding": "PLAIN_DICTIONARY",
+                "count": 1
+              }
+            ]
+          },
+          "offset_index_offset": 262,
+          "offset_index_length": 10,
+          "column_index_offset": 144,
+          "column_index_length": 31,
+          "crypto_metadata": 31
+        },
+        {
+          "file_offset": 117,
+          "meta_data": {
+            "type": "BYTE_ARRAY",
+            "encodings": [
+              "PLAIN_DICTIONARY",
+              "BIT_PACKED"
+            ],
+            "path_in_schema": [
+              "binary_field"
+            ],
+            "codec": "UNCOMPRESSED",
+            "num_values": 1000,
+            "total_uncompressed_size": 86,
+            "total_compressed_size": 86,
+            "data_page_offset": 117,
+            "dictionary_page_offset": 58,
+            "statistics": {
+              "max": "a655fd0e-9949-4059-bcae-fd6a002a4652",
+              "min": "a655fd0e-9949-4059-bcae-fd6a002a4652",
+              "null_count": 0,
+              "max_value": "a655fd0e-9949-4059-bcae-fd6a002a4652",
+              "min_value": "a655fd0e-9949-4059-bcae-fd6a002a4652"
+            },
+            "encoding_stats": [
+              {
+                "page_type": 2,
+                "encoding": "PLAIN_DICTIONARY",
+                "count": 1
+              },
+              {
+                "page_type": 0,
+                "encoding": "PLAIN_DICTIONARY",
+                "count": 1
+              }
+            ]
+          },
+          "offset_index_offset": 272,
+          "offset_index_length": 11,
+          "column_index_offset": 175,
+          "column_index_length": 87,
+          "crypto_metadata": 87
+        }
+      ],
+      "total_byte_size": 140,
+      "num_rows": 1000,
+      "file_offset": 4,
+      "total_compressed_size": 140,
+      "ordinal": 0
+    }
+  ],
+  "key_value_metadata": [
+    {
+      "key": "writer.model.name",
+      "value": "example"
+    }
+  ],
+  "created_by": "parquet-mr version 1.13.0-SNAPSHOT (build 261f7d2679407c833545b56f4c85a4ae8b5c9ed4)",
+  "metadata_length": 525
+}
test/files/plain-dict-uncompressed-checksum.parquet (new binary file, not shown)
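The new fixture can be inspected with the same helpers the test suite already uses. A hedged sketch, with the expected output read off the metadata JSON above:

  import { parquetMetadata } from '../src/hyparquet.js'
  import { readFileToArrayBuffer } from './helpers.js'

  const buffer = await readFileToArrayBuffer('test/files/plain-dict-uncompressed-checksum.parquet')
  const metadata = parquetMetadata(buffer)
  console.log(metadata.row_groups[0].columns[0].meta_data.encodings)
  // ['PLAIN_DICTIONARY', 'BIT_PACKED'] per the metadata above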
test/schema.test.js

@@ -2,14 +2,7 @@ import { describe, expect, it } from 'vitest'
 import { parquetMetadata, parquetSchema } from '../src/hyparquet.js'
 import { readFileToArrayBuffer } from './helpers.js'

-describe('schemaTree', () => {
-  it('parse schema tree from addrtype-missing-value.parquet', async () => {
-    const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
-    const metadata = parquetMetadata(arrayBuffer)
-    const result = parquetSchema(metadata)
-    expect(result).toEqual(addrtypeSchema)
-  })
-
+describe('parquetSchema', () => {
   it('parse schema tree from rowgroups.parquet', async () => {
     const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
     const metadata = parquetMetadata(arrayBuffer)
@@ -18,30 +11,6 @@ describe('schemaTree', () => {
   })
 })

-// Parquet v1 from DuckDB
-const addrtypeSchema = {
-  children: [
-    {
-      children: [],
-      count: 1,
-      element: {
-        converted_type: 'UTF8',
-        name: 'ADDRTYPE',
-        repetition_type: 'OPTIONAL',
-        type: 'BYTE_ARRAY',
-      },
-      path: ['ADDRTYPE'],
-    },
-  ],
-  count: 2,
-  element: {
-    name: 'duckdb_schema',
-    num_children: 1,
-    repetition_type: 'REQUIRED',
-  },
-  path: [],
-}
-
 // Parquet v2 from pandas with 2 row groups
 const rowgroupsSchema = {
   children: [
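For reference, the tree returned by parquetSchema has the { element, children, count, path } shape used by these literals. A small hedged walker (not part of the test suite; printSchema is a hypothetical helper):

  function printSchema(node, depth = 0) {
    // leaf elements carry a physical type; group nodes do not
    const label = node.element.type ?? 'group'
    console.log('  '.repeat(depth) + (node.path.join('.') || node.element.name) + ': ' + label)
    for (const child of node.children) printSchema(child, depth + 1)
  }

Called on a root like the removed addrtypeSchema, it would print duckdb_schema as a group with an ADDRTYPE BYTE_ARRAY leaf beneath it.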