Upgrade dataPage to match dictionary type

This commit is contained in:
Kenny Daniel 2024-05-22 23:45:02 -07:00
parent c4ad05e580
commit b8e4496063
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
11 changed files with 1164 additions and 123 deletions

@ -1,5 +1,5 @@
import { assembleLists } from './assemble.js'
import { convert } from './convert.js'
import { convert, dereferenceDictionary } from './convert.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
import { parquetHeader } from './header.js'
@ -11,6 +11,7 @@ import { concat } from './utils.js'
* Parse column data from a buffer.
*
* @typedef {import('./types.js').ColumnMetaData} ColumnMetaData
* @typedef {import('./types.js').DecodedArray} DecodedArray
* @param {import('./types.js').DataReader} reader
* @param {import('./types.js').RowGroup} rowGroup row group metadata
* @param {ColumnMetaData} columnMetadata column metadata
@ -20,7 +21,7 @@ import { concat } from './utils.js'
*/
export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compressors, utf8 }) {
const { element } = schemaPath[schemaPath.length - 1]
/** @type {ArrayLike<any> | undefined} */
/** @type {DecodedArray | undefined} */
let dictionary = undefined
let seen = 0
/** @type {any[]} */
@ -49,8 +50,8 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
// construct output values: skip nulls and construct lists
dereferenceDictionary(dictionary, dataPage)
values = convert(dataPage, element, utf8)
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
@ -78,8 +79,8 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
)
seen += daph2.num_values
dereferenceDictionary(dictionary, dataPage)
values = convert(dataPage, element, utf8)
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
@ -109,21 +110,6 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
return rowData
}
/**
* Map data to dictionary values in place.
*
* @typedef {import('./types.js').DecodedArray} DecodedArray
* @param {ArrayLike<any> | undefined} dictionary
* @param {DecodedArray} dataPage
*/
function dereferenceDictionary(dictionary, dataPage) {
if (dictionary) {
for (let i = 0; i < dataPage.length; i++) {
dataPage[i] = dictionary[dataPage[i]]
}
}
}
/**
* Find the start byte offset for a column chunk.
*

@ -97,3 +97,25 @@ export function parseFloat16(bytes) {
if (exp === 0x1f) return frac ? NaN : sign * Infinity
return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
}
/**
 * Map data page indexes to dictionary values.
 *
 * When the index page is a typed array of a different type than the
 * dictionary (for example Uint8Array indexes with a Float64Array or string
 * dictionary), writing looked-up values back into the index array would
 * truncate or corrupt them. In that case a new output array of the
 * dictionary's type is allocated. Otherwise the mapping is done in place
 * and the (mutated) dataPage is returned.
 *
 * @param {DecodedArray | undefined} dictionary
 * @param {DecodedArray} dataPage array of dictionary indexes
 * @returns {DecodedArray} dataPage values mapped through the dictionary
 */
export function dereferenceDictionary(dictionary, dataPage) {
  let output = dataPage
  if (dictionary) {
    // upgrade any typed-array index page whose type differs from the
    // dictionary's (not just Uint8Array) so values are never truncated
    if (ArrayBuffer.isView(dataPage) && dataPage.constructor !== dictionary.constructor) {
      // @ts-expect-error not my fault typescript doesn't understand constructors
      output = new dictionary.constructor(dataPage.length)
    }
    for (let i = 0; i < dataPage.length; i++) {
      output[i] = dictionary[dataPage[i]]
    }
  }
  return output
}

@ -54,13 +54,11 @@ export function readDataPage(bytes, daph, schemaPath, { type }) {
}
/**
* Read a page containing dictionary data.
*
* @param {Uint8Array} bytes raw page data
* @param {import("./types.d.ts").DictionaryPageHeader} diph dictionary page header
* @param {ColumnMetaData} columnMetadata
* @param {number | undefined} typeLength - type_length from schema
* @returns {ArrayLike<any>} array of values
* @returns {DecodedArray}
*/
export function readDictionaryPage(bytes, diph, columnMetadata, typeLength) {
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)

@ -111,7 +111,8 @@ describe('convert function', () => {
})
describe('parseFloat16', () => {
it('should convert numbers', () => {
it('convert float16 numbers', () => {
expect(parseFloat16(undefined)).toBe(undefined)
expect(parseFloat16(new Uint8Array([0x00, 0xbc]))).toBe(-1)
expect(parseFloat16(new Uint8Array([0x00, 0x00]))).toBe(0)
expect(parseFloat16(new Uint8Array([0x00, 0x38]))).toBe(0.5)
@ -119,22 +120,22 @@ describe('parseFloat16', () => {
expect(parseFloat16(new Uint8Array([0x00, 0x40]))).toBe(2)
})
it('should convert -0', () => {
it('convert float16 -0', () => {
expect(parseFloat16(new Uint8Array([0x00, 0x80]))).toBe(-0)
expect(parseFloat16(new Uint8Array([0x00, 0x80]))).not.toBe(0)
})
it('should convert Infinity', () => {
it('convert float16 Infinity', () => {
expect(parseFloat16(new Uint8Array([0x00, 0x7c]))).toBe(Infinity)
expect(parseFloat16(new Uint8Array([0x00, 0xfc]))).toBe(-Infinity)
})
it('should convert NaN', () => {
it('convert float16 NaN', () => {
expect(parseFloat16(new Uint8Array([0x00, 0x7e]))).toBeNaN()
expect(parseFloat16(new Uint8Array([0x01, 0x7e]))).toBeNaN()
})
it('should convert a subnormal number', () => {
it('convert float16 subnormal number', () => {
expect(parseFloat16(new Uint8Array([0xff, 0x03])))
.toBeCloseTo(Math.pow(2, -14) * (1023 / 1024), 5)
})

@ -1,12 +0,0 @@
[
[ "Block" ],
[ "Intersection" ],
[ "Block" ],
[ "Block" ],
[ null ],
[ "Block" ],
[ "Intersection" ],
[ "Block" ],
[ "Block" ],
[ "Intersection" ]
]

@ -1,50 +0,0 @@
{
"version": 1,
"created_by": "DuckDB",
"metadata_length": 149,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "duckdb_schema",
"num_children": 1
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "ADDRTYPE",
"converted_type": "UTF8"
}
],
"num_rows": 10,
"row_groups": [
{
"columns": [
{
"file_offset": 0,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["PLAIN", "RLE_DICTIONARY"],
"path_in_schema": ["ADDRTYPE"],
"codec": "SNAPPY",
"num_values": 10,
"total_uncompressed_size": 78,
"total_compressed_size": 82,
"data_page_offset": 31,
"dictionary_page_offset": 4,
"statistics": {
"max": "Intersection",
"min": "Block",
"max_value": "Intersection",
"min_value": "Block",
"null_count": 1,
"distinct_count": 2
}
}
}
],
"file_offset": 4,
"total_byte_size": 33024,
"num_rows": 10
}
]
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,125 @@
{
"version": 1,
"schema": [
{
"name": "m",
"num_children": 2
},
{
"type": "INT64",
"repetition_type": "REQUIRED",
"name": "long_field"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "binary_field"
}
],
"num_rows": 1000,
"row_groups": [
{
"columns": [
{
"file_offset": 31,
"meta_data": {
"type": "INT64",
"encodings": [
"PLAIN_DICTIONARY",
"BIT_PACKED"
],
"path_in_schema": [
"long_field"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 54,
"total_compressed_size": 54,
"data_page_offset": 31,
"dictionary_page_offset": 4,
"statistics": {
"max": 0,
"min": 0,
"null_count": 0,
"max_value": 0,
"min_value": 0
},
"encoding_stats": [
{
"page_type": 2,
"encoding": "PLAIN_DICTIONARY",
"count": 1
},
{
"page_type": 0,
"encoding": "PLAIN_DICTIONARY",
"count": 1
}
]
},
"offset_index_offset": 262,
"offset_index_length": 10,
"column_index_offset": 144,
"column_index_length": 31,
"crypto_metadata": 31
},
{
"file_offset": 117,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN_DICTIONARY",
"BIT_PACKED"
],
"path_in_schema": [
"binary_field"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 86,
"total_compressed_size": 86,
"data_page_offset": 117,
"dictionary_page_offset": 58,
"statistics": {
"max": "a655fd0e-9949-4059-bcae-fd6a002a4652",
"min": "a655fd0e-9949-4059-bcae-fd6a002a4652",
"null_count": 0,
"max_value": "a655fd0e-9949-4059-bcae-fd6a002a4652",
"min_value": "a655fd0e-9949-4059-bcae-fd6a002a4652"
},
"encoding_stats": [
{
"page_type": 2,
"encoding": "PLAIN_DICTIONARY",
"count": 1
},
{
"page_type": 0,
"encoding": "PLAIN_DICTIONARY",
"count": 1
}
]
},
"offset_index_offset": 272,
"offset_index_length": 11,
"column_index_offset": 175,
"column_index_length": 87,
"crypto_metadata": 87
}
],
"total_byte_size": 140,
"num_rows": 1000,
"file_offset": 4,
"total_compressed_size": 140,
"ordinal": 0
}
],
"key_value_metadata": [
{
"key": "writer.model.name",
"value": "example"
}
],
"created_by": "parquet-mr version 1.13.0-SNAPSHOT (build 261f7d2679407c833545b56f4c85a4ae8b5c9ed4)",
"metadata_length": 525
}

Binary file not shown.

@ -2,14 +2,7 @@ import { describe, expect, it } from 'vitest'
import { parquetMetadata, parquetSchema } from '../src/hyparquet.js'
import { readFileToArrayBuffer } from './helpers.js'
describe('schemaTree', () => {
it('parse schema tree from addrtype-missing-value.parquet', async () => {
const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
const metadata = parquetMetadata(arrayBuffer)
const result = parquetSchema(metadata)
expect(result).toEqual(addrtypeSchema)
})
describe('parquetSchema', () => {
it('parse schema tree from rowgroups.parquet', async () => {
const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
const metadata = parquetMetadata(arrayBuffer)
@ -18,30 +11,6 @@ describe('schemaTree', () => {
})
})
// Parquet v1 from DuckDB
const addrtypeSchema = {
children: [
{
children: [],
count: 1,
element: {
converted_type: 'UTF8',
name: 'ADDRTYPE',
repetition_type: 'OPTIONAL',
type: 'BYTE_ARRAY',
},
path: ['ADDRTYPE'],
},
],
count: 2,
element: {
name: 'duckdb_schema',
num_children: 1,
repetition_type: 'REQUIRED',
},
path: [],
}
// Parquet v2 from pandas with 2 row groups
const rowgroupsSchema = {
children: [