Addresses issues with DuckDB's use of delta encodings (#77)

* Addresses issues with DuckDB's use of delta encodings

* Shrunk the size of the test data
This commit is contained in:
mike-iqmo 2025-05-14 19:28:58 -04:00 committed by GitHub
parent 7f31c1e5b6
commit dbf3065f8e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 2607 additions and 0 deletions

@ -48,6 +48,13 @@ export function readDataPage(bytes, daph, { type, element, schemaPath }) {
}
} else if (daph.encoding === 'BYTE_STREAM_SPLIT') {
dataPage = byteStreamSplit(reader, nValues, type, element.type_length)
} else if (daph.encoding === 'DELTA_BINARY_PACKED') {
const int32 = type === 'INT32'
dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues)
deltaBinaryUnpack(reader, nValues, dataPage)
} else if (daph.encoding === 'DELTA_LENGTH_BYTE_ARRAY') {
dataPage = new Array(nValues)
deltaLengthByteArray(reader, nValues, dataPage)
} else {
throw new Error(`parquet unsupported encoding: ${daph.encoding}`)
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,48 @@
{
"version": 1,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "duckdb_schema",
"num_children": 1
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "range",
"converted_type": "INT_64"
}
],
"num_rows": 1250,
"row_groups": [
{
"columns": [
{
"file_offset": 0,
"meta_data": {
"type": "INT64",
"encodings": ["DELTA_BINARY_PACKED"],
"path_in_schema": ["range"],
"codec": "SNAPPY",
"num_values": 1250,
"total_uncompressed_size": 40,
"total_compressed_size": 42,
"data_page_offset": 4,
"statistics": {
"max": 1249,
"min": 0,
"null_count": 0,
"max_value": 1249,
"min_value": 0
}
}
}
],
"total_byte_size": 40,
"num_rows": 1250,
"file_offset": 4
}
],
"created_by": "DuckDB version v1.2.1 (build 8e52ec4395)",
"metadata_length": 169
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,48 @@
{
"version": 1,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "duckdb_schema",
"num_children": 1
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "range_varchar",
"converted_type": "UTF8"
}
],
"num_rows": 1250,
"row_groups": [
{
"columns": [
{
"file_offset": 0,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["DELTA_LENGTH_BYTE_ARRAY"],
"path_in_schema": ["range_varchar"],
"codec": "SNAPPY",
"num_values": 1250,
"total_uncompressed_size": 3996,
"total_compressed_size": 3390,
"data_page_offset": 4,
"statistics": {
"max": "999",
"min": "0",
"null_count": 0,
"max_value": "999",
"min_value": "0"
}
}
}
],
"total_byte_size": 3996,
"num_rows": 1250,
"file_offset": 4
}
],
"created_by": "DuckDB version v1.2.1 (build 8e52ec4395)",
"metadata_length": 164
}

Binary file not shown.