Delta byte array encoding

This commit is contained in:
Kenny Daniel 2024-02-29 15:13:20 -08:00
parent 48d79e6a1d
commit d4341b803e
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
14 changed files with 16543 additions and 6 deletions

@ -181,7 +181,7 @@ Parquet encodings:
- [X] RLE
- [X] BIT_PACKED
- [X] DELTA_BINARY_PACKED
- [ ] DELTA_BYTE_ARRAY
- [X] DELTA_BYTE_ARRAY
- [ ] DELTA_LENGTH_BYTE_ARRAY
- [ ] BYTE_STREAM_SPLIT

@ -1,5 +1,5 @@
import { decompressPage } from './column.js'
import { deltaBinaryUnpack } from './delta.js'
import { deltaBinaryUnpack, deltaByteArray } from './delta.js'
import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import { readPlain } from './plain.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
@ -12,7 +12,7 @@ import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
* @typedef {import("./types.d.ts").Compressors} Compressors
* @typedef {import("./types.d.ts").DataPageHeaderV2} DataPageHeaderV2
* @typedef {import("./types.d.ts").SchemaTree} SchemaTree
* @param {Uint8Array} compressedBytes raw page data (should already be decompressed)
* @param {Uint8Array} compressedBytes raw page data
* @param {import("./types.d.ts").PageHeader} ph page header
* @param {SchemaTree[]} schemaPath
* @param {ColumnMetaData} columnMetadata
@ -46,7 +46,7 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata,
// read values based on encoding
/** @type {import('./types.d.ts').DecodedArray} */
let dataPage = []
let dataPage
const nValues = daph2.num_values - daph2.num_nulls
if (daph2.encoding === 'PLAIN') {
const { type_length } = schemaPath[schemaPath.length - 1].element
@ -67,6 +67,9 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata,
const int32 = columnMetadata.type === 'INT32'
dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues)
deltaBinaryUnpack(pageReader, nValues, dataPage)
} else if (daph2.encoding === 'DELTA_BYTE_ARRAY') {
dataPage = new Array(nValues)
deltaByteArray(pageReader, nValues, dataPage)
} else {
throw new Error(`parquet unsupported encoding: ${daph2.encoding}`)
}

@ -60,3 +60,28 @@ export function deltaBinaryUnpack(reader, nValues, output) {
}
}
}
/**
 * Decode DELTA_BYTE_ARRAY values from the reader into `output`.
 *
 * Two int32 runs are unpacked with deltaBinaryUnpack: first one prefix length
 * per value, then one suffix length per value. The suffix bytes are then read
 * back to back starting at the reader's current offset. Each decoded value is
 * the first `prefix` bytes of the previously decoded value followed by its own
 * suffix bytes.
 *
 * @param {DataReader} reader advanced past the bytes it consumes
 * @param {number} nValues number of values to decode
 * @param {Uint8Array[]} output filled in place, one entry per value
 */
export function deltaByteArray(reader, nValues, output) {
  const prefixLengths = new Int32Array(nValues)
  deltaBinaryUnpack(reader, nValues, prefixLengths)
  const suffixLengths = new Int32Array(nValues)
  deltaBinaryUnpack(reader, nValues, suffixLengths)

  for (let index = 0; index < nValues; index++) {
    const prefixLength = prefixLengths[index]
    const suffixLength = suffixLengths[index]

    // view (not a copy) over this value's suffix bytes in the page buffer
    const suffixBytes = new Uint8Array(
      reader.view.buffer,
      reader.view.byteOffset + reader.offset,
      suffixLength
    )

    if (prefixLength === 0) {
      // nothing shared with the previous value: the suffix is the whole value
      output[index] = suffixBytes
    } else {
      // rebuild the value: leading bytes of the previous value, then the suffix
      const value = new Uint8Array(prefixLength + suffixLength)
      output[index] = value
      value.set(output[index - 1].subarray(0, prefixLength))
      value.set(suffixBytes, prefixLength)
    }

    reader.offset += suffixLength
  }
}

@ -59,7 +59,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
// combine initial fetch with the new slice
const combinedBuffer = new ArrayBuffer(metadataLength + 8)
const combinedView = new Uint8Array(combinedBuffer)
combinedView.set(new Uint8Array(metadataBuffer), 0)
combinedView.set(new Uint8Array(metadataBuffer))
combinedView.set(new Uint8Array(footerBuffer), footerOffset - metadataOffset)
return parquetMetadata(combinedBuffer)
} else {

@ -19,6 +19,7 @@ const CompactType = {
/**
* Parse TCompactProtocol
*
* @typedef {import("./types.d.ts").DataReader} DataReader
* @param {DataReader} reader
* @returns {Record<string, any>}
*/
@ -119,7 +120,6 @@ function readElement(reader, type) {
* 7-bit group with the 0 bit, prefixing the remaining 7-bit groups with the
* 1 bit and encode the resulting bit-string as Little Endian.
*
* @typedef {import("./types.d.ts").DataReader} DataReader
* @param {DataReader} reader
* @returns {number} value
*/

File diff suppressed because it is too large Load Diff

@ -0,0 +1,333 @@
{
"version": 1,
"schema": [
{
"name": "hive_schema",
"num_children": 9
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_customer_id",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_salutation",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_first_name",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_last_name",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_preferred_cust_flag",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_birth_country",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_login",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_email_address",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_last_review_date",
"converted_type": "UTF8"
}
],
"num_rows": 1000,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_customer_id"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 8248,
"total_compressed_size": 8248,
"data_page_offset": 4,
"statistics": {
"null_count": 0,
"max_value": "AAAAAAAAPPCAAAAA",
"min_value": "AAAAAAAAAABAAAAA"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 8252,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_salutation"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 3362,
"total_compressed_size": 3362,
"data_page_offset": 8252,
"statistics": {
"null_count": 30,
"max_value": "Sir",
"min_value": "Dr."
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 11614,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_first_name"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 6595,
"total_compressed_size": 6595,
"data_page_offset": 11614,
"statistics": {
"null_count": 32,
"max_value": "Zachary",
"min_value": "Aaron"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 18209,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_last_name"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 6955,
"total_compressed_size": 6955,
"data_page_offset": 18209,
"statistics": {
"null_count": 24,
"max_value": "Zamora",
"min_value": "Adams"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 25164,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_preferred_cust_flag"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 1220,
"total_compressed_size": 1220,
"data_page_offset": 25164,
"statistics": {
"null_count": 29,
"max_value": "Y",
"min_value": "N"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 26384,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_birth_country"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 9599,
"total_compressed_size": 9599,
"data_page_offset": 26384,
"statistics": {
"null_count": 31,
"max_value": "ZIMBABWE",
"min_value": "AFGHANISTAN"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 35983,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_login"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 42,
"total_compressed_size": 42,
"data_page_offset": 35983,
"statistics": {
"null_count": 1000
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 36025,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_email_address"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 27763,
"total_compressed_size": 27763,
"data_page_offset": 36025,
"statistics": {
"null_count": 31,
"max_value": "Zachary.Parsons@hHmnLrbKsfY.com",
"min_value": "Aaron.Browder@iUpddkHI9z8.org"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 63788,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_last_review_date"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 3511,
"total_compressed_size": 3511,
"data_page_offset": 63788,
"statistics": {
"null_count": 25,
"max_value": "2452648",
"min_value": "2452283"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
}
],
"total_byte_size": 67295,
"num_rows": 1000
}
],
"created_by": "parquet-mr version 1.10.0 (build 031a6654009e3b82020012a18434c582bd74c73a)",
"metadata_length": 1046
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,624 @@
{
"version": 1,
"schema": [
{
"name": "hive_schema",
"num_children": 17
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_customer_sk"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_current_cdemo_sk"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_current_hdemo_sk"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_current_addr_sk"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_first_shipto_date_sk"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_first_sales_date_sk"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_birth_day"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_birth_month"
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c_birth_year"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_customer_id",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_salutation",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_first_name",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_last_name",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_preferred_cust_flag",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_birth_country",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_email_address",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c_last_review_date",
"converted_type": "UTF8"
}
],
"num_rows": 100,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_customer_sk"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 81,
"total_compressed_size": 81,
"data_page_offset": 4,
"statistics": {
"max": 100,
"min": 1,
"null_count": 0,
"max_value": 100,
"min_value": 1
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 85,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_current_cdemo_sk"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 358,
"total_compressed_size": 358,
"data_page_offset": 85,
"statistics": {
"max": 1895444,
"min": 8817,
"null_count": 3,
"max_value": 1895444,
"min_value": 8817
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 443,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_current_hdemo_sk"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 311,
"total_compressed_size": 311,
"data_page_offset": 443,
"statistics": {
"max": 7135,
"min": 37,
"null_count": 2,
"max_value": 7135,
"min_value": 37
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 754,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_current_addr_sk"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 353,
"total_compressed_size": 353,
"data_page_offset": 754,
"statistics": {
"max": 49388,
"min": 571,
"null_count": 0,
"max_value": 49388,
"min_value": 571
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 1107,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_first_shipto_date_sk"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 297,
"total_compressed_size": 297,
"data_page_offset": 1107,
"statistics": {
"max": 2452641,
"min": 2449130,
"null_count": 1,
"max_value": 2452641,
"min_value": 2449130
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 1404,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_first_sales_date_sk"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 297,
"total_compressed_size": 297,
"data_page_offset": 1404,
"statistics": {
"max": 2452611,
"min": 2449010,
"null_count": 1,
"max_value": 2452611,
"min_value": 2449010
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 1701,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_birth_day"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 160,
"total_compressed_size": 160,
"data_page_offset": 1701,
"statistics": {
"max": 30,
"min": 1,
"null_count": 3,
"max_value": 30,
"min_value": 1
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 1861,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_birth_month"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 146,
"total_compressed_size": 146,
"data_page_offset": 1861,
"statistics": {
"max": 12,
"min": 1,
"null_count": 3,
"max_value": 12,
"min_value": 1
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 2007,
"meta_data": {
"type": "INT64",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_birth_year"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 172,
"total_compressed_size": 172,
"data_page_offset": 2007,
"statistics": {
"max": 1991,
"min": 1925,
"null_count": 3,
"max_value": 1991,
"min_value": 1925
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
}
},
{
"file_offset": 2179,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_customer_id"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 976,
"total_compressed_size": 976,
"data_page_offset": 2179,
"statistics": {
"null_count": 0,
"max_value": "AAAAAAAAPFAAAAAA",
"min_value": "AAAAAAAAABAAAAAA"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 3155,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_salutation"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 376,
"total_compressed_size": 376,
"data_page_offset": 3155,
"statistics": {
"null_count": 3,
"max_value": "Sir",
"min_value": "Dr."
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 3531,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_first_name"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 694,
"total_compressed_size": 694,
"data_page_offset": 3531,
"statistics": {
"null_count": 3,
"max_value": "William",
"min_value": "Albert"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 4225,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_last_name"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 777,
"total_compressed_size": 777,
"data_page_offset": 4225,
"statistics": {
"null_count": 1,
"max_value": "Young",
"min_value": "Baker"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 5002,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_preferred_cust_flag"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 156,
"total_compressed_size": 156,
"data_page_offset": 5002,
"statistics": {
"null_count": 4,
"max_value": "Y",
"min_value": "N"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 5158,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_birth_country"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 1111,
"total_compressed_size": 1111,
"data_page_offset": 5158,
"statistics": {
"null_count": 4,
"max_value": "WALLIS AND FUTUNA",
"min_value": "AFGHANISTAN"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 6269,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_email_address"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 2813,
"total_compressed_size": 2813,
"data_page_offset": 6269,
"statistics": {
"null_count": 3,
"max_value": "William.Warner@zegnrzurU.org",
"min_value": "Albert.Brunson@62.com"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
},
{
"file_offset": 9082,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_last_review_date"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 407,
"total_compressed_size": 407,
"data_page_offset": 9082,
"statistics": {
"null_count": 3,
"max_value": "2452644",
"min_value": "2452293"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
}
}
],
"total_byte_size": 9485,
"num_rows": 100
}
],
"created_by": "parquet-mr version 1.10.0 (build 031a6654009e3b82020012a18434c582bd74c73a)",
"metadata_length": 2070
}

Binary file not shown.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,746 @@
{
"version": 1,
"schema": [
{
"name": "spark_schema",
"num_children": 17
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_customer_sk:"
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_current_cdemo_sk:"
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_current_hdemo_sk:"
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_current_addr_sk:"
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_first_shipto_date_sk:"
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_first_sales_date_sk:"
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_birth_day:"
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_birth_month:"
},
{
"type": "INT32",
"repetition_type": "REQUIRED",
"name": "c_birth_year:"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "c_customer_id:",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "c_salutation:",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "c_first_name:",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "c_last_name:",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "c_preferred_cust_flag:",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "c_birth_country:",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "c_email_address:",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "c_last_review_date:",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
}
],
"num_rows": 100,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_customer_sk:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 50,
"total_compressed_size": 50,
"data_page_offset": 4,
"statistics": {
"max": 105,
"min": 1,
"null_count": 0,
"max_value": 105,
"min_value": 1
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9714,
"offset_index_length": 10,
"column_index_offset": 9233,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 54,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_current_cdemo_sk:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 388,
"total_compressed_size": 388,
"data_page_offset": 54,
"statistics": {
"max": 1895444,
"min": 8817,
"null_count": 0,
"max_value": 1895444,
"min_value": 8817
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9724,
"offset_index_length": 11,
"column_index_offset": 9256,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 442,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_current_hdemo_sk:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 261,
"total_compressed_size": 261,
"data_page_offset": 442,
"statistics": {
"max": 7135,
"min": 37,
"null_count": 0,
"max_value": 7135,
"min_value": 37
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9735,
"offset_index_length": 12,
"column_index_offset": 9279,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 703,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_current_addr_sk:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 307,
"total_compressed_size": 307,
"data_page_offset": 703,
"statistics": {
"max": 49388,
"min": 464,
"null_count": 0,
"max_value": 49388,
"min_value": 464
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9747,
"offset_index_length": 12,
"column_index_offset": 9302,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 1010,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_first_shipto_date_sk:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 247,
"total_compressed_size": 247,
"data_page_offset": 1010,
"statistics": {
"max": 2452641,
"min": 2449130,
"null_count": 0,
"max_value": 2452641,
"min_value": 2449130
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9759,
"offset_index_length": 12,
"column_index_offset": 9325,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 1257,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_first_sales_date_sk:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 247,
"total_compressed_size": 247,
"data_page_offset": 1257,
"statistics": {
"max": 2452611,
"min": 2449100,
"null_count": 0,
"max_value": 2452611,
"min_value": 2449100
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9771,
"offset_index_length": 12,
"column_index_offset": 9348,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 1504,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_birth_day:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 131,
"total_compressed_size": 131,
"data_page_offset": 1504,
"statistics": {
"max": 30,
"min": 1,
"null_count": 0,
"max_value": 30,
"min_value": 1
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9783,
"offset_index_length": 12,
"column_index_offset": 9371,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 1635,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_birth_month:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 115,
"total_compressed_size": 115,
"data_page_offset": 1635,
"statistics": {
"max": 12,
"min": 1,
"null_count": 0,
"max_value": 12,
"min_value": 1
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9795,
"offset_index_length": 12,
"column_index_offset": 9394,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 1750,
"meta_data": {
"type": "INT32",
"encodings": [
"DELTA_BINARY_PACKED"
],
"path_in_schema": [
"c_birth_year:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 144,
"total_compressed_size": 144,
"data_page_offset": 1750,
"statistics": {
"max": 1991,
"min": 1925,
"null_count": 0,
"max_value": 1991,
"min_value": 1925
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BINARY_PACKED",
"count": 1
}
]
},
"offset_index_offset": 9807,
"offset_index_length": 12,
"column_index_offset": 9417,
"column_index_length": 23,
"crypto_metadata": 23
},
{
"file_offset": 1894,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_customer_id:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 933,
"total_compressed_size": 933,
"data_page_offset": 1894,
"statistics": {
"null_count": 0,
"max_value": "AAAAAAAAPFAAAAAA",
"min_value": "AAAAAAAAABAAAAAA"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
},
"offset_index_offset": 9819,
"offset_index_length": 12,
"column_index_offset": 9440,
"column_index_length": 47,
"crypto_metadata": 47
},
{
"file_offset": 2827,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_salutation:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 378,
"total_compressed_size": 378,
"data_page_offset": 2827,
"statistics": {
"null_count": 0,
"max_value": "Sir",
"min_value": "Dr."
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
},
"offset_index_offset": 9831,
"offset_index_length": 12,
"column_index_offset": 9487,
"column_index_length": 21,
"crypto_metadata": 21
},
{
"file_offset": 3205,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_first_name:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 707,
"total_compressed_size": 707,
"data_page_offset": 3205,
"statistics": {
"null_count": 0,
"max_value": "William",
"min_value": "Albert"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
},
"offset_index_offset": 9843,
"offset_index_length": 12,
"column_index_offset": 9508,
"column_index_length": 28,
"crypto_metadata": 28
},
{
"file_offset": 3912,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_last_name:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 751,
"total_compressed_size": 751,
"data_page_offset": 3912,
"statistics": {
"null_count": 0,
"max_value": "Young",
"min_value": "Baker"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
},
"offset_index_offset": 9855,
"offset_index_length": 12,
"column_index_offset": 9536,
"column_index_length": 25,
"crypto_metadata": 25
},
{
"file_offset": 4663,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_preferred_cust_flag:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 154,
"total_compressed_size": 154,
"data_page_offset": 4663,
"statistics": {
"null_count": 0,
"max_value": "Y",
"min_value": "N"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
},
"offset_index_offset": 9867,
"offset_index_length": 12,
"column_index_offset": 9561,
"column_index_length": 17,
"crypto_metadata": 17
},
{
"file_offset": 4817,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_birth_country:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 1154,
"total_compressed_size": 1154,
"data_page_offset": 4817,
"statistics": {
"null_count": 0,
"max_value": "WALLIS AND FUTUNA",
"min_value": "AFGHANISTAN"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
},
"offset_index_offset": 9879,
"offset_index_length": 12,
"column_index_offset": 9578,
"column_index_length": 43,
"crypto_metadata": 43
},
{
"file_offset": 5971,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_email_address:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 2857,
"total_compressed_size": 2857,
"data_page_offset": 5971,
"statistics": {
"null_count": 0,
"max_value": "William.Warner@zegnrzurU.org",
"min_value": "Albert.Brunson@62.com"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
},
"offset_index_offset": 9891,
"offset_index_length": 12,
"column_index_offset": 9621,
"column_index_length": 64,
"crypto_metadata": 64
},
{
"file_offset": 8828,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"DELTA_BYTE_ARRAY"
],
"path_in_schema": [
"c_last_review_date:"
],
"codec": "UNCOMPRESSED",
"num_values": 100,
"total_uncompressed_size": 405,
"total_compressed_size": 405,
"data_page_offset": 8828,
"statistics": {
"null_count": 0,
"max_value": "2452644",
"min_value": "2452293"
},
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_BYTE_ARRAY",
"count": 1
}
]
},
"offset_index_offset": 9903,
"offset_index_length": 13,
"column_index_offset": 9685,
"column_index_length": 29,
"crypto_metadata": 29
}
],
"total_byte_size": 9229,
"num_rows": 100,
"file_offset": 4,
"total_compressed_size": 9229,
"ordinal": 0
}
],
"key_value_metadata": [
{
"key": "org.apache.spark.version",
"value": "3.2.0"
},
{
"key": "org.apache.spark.sql.parquet.row.metadata",
"value": "{\"type\":\"struct\",\"fields\":[{\"name\":\"c_customer_sk:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_current_cdemo_sk:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_current_hdemo_sk:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_current_addr_sk:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_first_shipto_date_sk:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_first_sales_date_sk:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_birth_day:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_birth_month:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_birth_year:\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_customer_id:\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_salutation:\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_first_name:\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_last_name:\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_preferred_cust_flag:\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_birth_country:\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_email_address:\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}},{\"name\":\"c_last_review_date:\",\"type\":\"string\",\"nullable\":false,\"metadata\":{}}]}"
}
],
"created_by": "parquet-mr version 1.12.1 (build 2a5c06c58fa987f85aa22170be14d927d5ff6e7d)",
"metadata_length": 3604
}

Binary file not shown.