Use hyparquet-compressors for tests (brotli, lz4, zstd)

This commit is contained in:
Kenny Daniel 2024-05-20 02:07:40 -07:00
parent 7f282d6f2c
commit da72c06ac2
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
14 changed files with 7603 additions and 12 deletions

@ -34,7 +34,7 @@
"eslint-plugin-import": "2.29.1",
"eslint-plugin-jsdoc": "48.2.5",
"http-server": "14.1.1",
"hysnappy": "0.3.1",
"hyparquet-compressors": "0.1.2",
"typescript": "5.4.5",
"vitest": "1.6.0"
}

@ -0,0 +1,22 @@
[
[
1593604800,
[97, 98, 99],
42
],
[
1593604800,
[100, 101, 102],
7.7
],
[
1593604801,
[97, 98, 99],
42.125
],
[
1593604801,
[100, 101, 102],
7.7
]
]

@ -0,0 +1,164 @@
{
"version": 2,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "schema",
"num_children": 3
},
{
"type": "INT64",
"repetition_type": "OPTIONAL",
"name": "c0"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "c1"
},
{
"type": "DOUBLE",
"repetition_type": "OPTIONAL",
"name": "v11"
}
],
"num_rows": 4,
"row_groups": [
{
"columns": [
{
"file_offset": 112,
"meta_data": {
"type": "INT64",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"c0"
],
"codec": "BROTLI",
"num_values": 4,
"total_uncompressed_size": 100,
"total_compressed_size": 108,
"data_page_offset": 38,
"dictionary_page_offset": 4,
"statistics": {
"max": 1593604801,
"min": 1593604800,
"null_count": 0,
"max_value": 1593604801,
"min_value": 1593604800
},
"encoding_stats": [
{
"page_type": 2,
"encoding": "PLAIN",
"count": 1
},
{
"page_type": 0,
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 280,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"c1"
],
"codec": "BROTLI",
"num_values": 4,
"total_uncompressed_size": 68,
"total_compressed_size": 76,
"data_page_offset": 236,
"dictionary_page_offset": 204,
"statistics": {
"null_count": 0,
"max_value": "def",
"min_value": "abc"
},
"encoding_stats": [
{
"page_type": 2,
"encoding": "PLAIN",
"count": 1
},
{
"page_type": 0,
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
},
{
"file_offset": 460,
"meta_data": {
"type": "DOUBLE",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"v11"
],
"codec": "BROTLI",
"num_values": 4,
"total_uncompressed_size": 109,
"total_compressed_size": 116,
"data_page_offset": 385,
"dictionary_page_offset": 344,
"statistics": {
"max": 42.125,
"min": 7.7,
"null_count": 0,
"max_value": 42.125,
"min_value": 7.7
},
"encoding_stats": [
{
"page_type": 2,
"encoding": "PLAIN",
"count": 1
},
{
"page_type": 0,
"encoding": "RLE_DICTIONARY",
"count": 1
}
]
}
}
],
"total_byte_size": 277,
"num_rows": 4,
"file_offset": 4,
"total_compressed_size": 300,
"ordinal": 0
}
],
"key_value_metadata": [
{
"key": "pandas",
"value": "{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"stop\": 4, \"step\": 1}], \"column_indexes\": [{\"name\": null, \"field_name\": null, \"pandas_type\": \"unicode\", \"numpy_type\": \"object\", \"metadata\": {\"encoding\": \"UTF-8\"}}], \"columns\": [{\"name\": \"c0\", \"field_name\": \"c0\", \"pandas_type\": \"int64\", \"numpy_type\": \"int64\", \"metadata\": null}, {\"name\": \"c1\", \"field_name\": \"c1\", \"pandas_type\": \"bytes\", \"numpy_type\": \"object\", \"metadata\": null}, {\"name\": \"v11\", \"field_name\": \"v11\", \"pandas_type\": \"float64\", \"numpy_type\": \"float64\", \"metadata\": null}], \"creator\": {\"library\": \"pyarrow\", \"version\": \"15.0.0\"}, \"pandas_version\": \"2.1.4\"}"
},
{
"key": "ARROW:schema",
"value": "/////4gDAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAALACAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAIAAAAEAAAAAYAAABwYW5kYXMAAHkCAAB7ImluZGV4X2NvbHVtbnMiOiBbeyJraW5kIjogInJhbmdlIiwgIm5hbWUiOiBudWxsLCAic3RhcnQiOiAwLCAic3RvcCI6IDQsICJzdGVwIjogMX1dLCAiY29sdW1uX2luZGV4ZXMiOiBbeyJuYW1lIjogbnVsbCwgImZpZWxkX25hbWUiOiBudWxsLCAicGFuZGFzX3R5cGUiOiAidW5pY29kZSIsICJudW1weV90eXBlIjogIm9iamVjdCIsICJtZXRhZGF0YSI6IHsiZW5jb2RpbmciOiAiVVRGLTgifX1dLCAiY29sdW1ucyI6IFt7Im5hbWUiOiAiYzAiLCAiZmllbGRfbmFtZSI6ICJjMCIsICJwYW5kYXNfdHlwZSI6ICJpbnQ2NCIsICJudW1weV90eXBlIjogImludDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJjMSIsICJmaWVsZF9uYW1lIjogImMxIiwgInBhbmRhc190eXBlIjogImJ5dGVzIiwgIm51bXB5X3R5cGUiOiAib2JqZWN0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJ2MTEiLCAiZmllbGRfbmFtZSI6ICJ2MTEiLCAicGFuZGFzX3R5cGUiOiAiZmxvYXQ2NCIsICJudW1weV90eXBlIjogImZsb2F0NjQiLCAibWV0YWRhdGEiOiBudWxsfV0sICJjcmVhdG9yIjogeyJsaWJyYXJ5IjogInB5YXJyb3ciLCAidmVyc2lvbiI6ICIxNS4wLjAifSwgInBhbmRhc192ZXJzaW9uIjogIjIuMS40In0AAAADAAAAdAAAADgAAAAEAAAAqP///wAAAQMQAAAAHAAAAAQAAAAAAAAAAwAAAHYxMQAAAAYACAAGAAYAAAAAAAIA2P///wAAAQQQAAAAGAAAAAQAAAAAAAAAAgAAAGMxAAAEAAQABAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAhAAAAAcAAAABAAAAAAAAAACAAAAYzAAAAgADAAIAAcACAAAAAAAAAFAAAAA"
}
],
"created_by": "parquet-cpp-arrow version 15.0.0",
"metadata_length": 2242
}

Binary file not shown.

@ -0,0 +1,22 @@
[
[
1593604800,
[97, 98, 99],
42
],
[
1593604800,
[100, 101, 102],
7.7
],
[
1593604801,
[97, 98, 99],
42.125
],
[
1593604801,
[100, 101, 102],
7.7
]
]

@ -0,0 +1,145 @@
{
"created_by": "parquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)",
"metadata_length": 376,
"num_rows": 4,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"codec": "LZ4",
"data_page_offset": 4,
"encoding_stats": [
{
"count": 1,
"encoding": "PLAIN_DICTIONARY",
"page_type": 2
},
{
"count": 1,
"encoding": "PLAIN_DICTIONARY",
"page_type": 0
}
],
"encodings": [
"BIT_PACKED",
"PLAIN_DICTIONARY"
],
"num_values": 4,
"path_in_schema": [
"c0"
],
"statistics": {
"max": 1593604801,
"max_value": 1593604801,
"min": 1593604800,
"min_value": 1593604800,
"null_count": 0
},
"total_compressed_size": 112,
"total_uncompressed_size": 93,
"type": "INT64"
}
},
{
"file_offset": 116,
"meta_data": {
"codec": "LZ4",
"data_page_offset": 116,
"encoding_stats": [
{
"count": 1,
"encoding": "PLAIN_DICTIONARY",
"page_type": 2
},
{
"count": 1,
"encoding": "PLAIN_DICTIONARY",
"page_type": 0
}
],
"encodings": [
"BIT_PACKED",
"PLAIN_DICTIONARY"
],
"num_values": 4,
"path_in_schema": [
"c1"
],
"statistics": {
"max_value": "def",
"min_value": "abc",
"null_count": 0
},
"total_compressed_size": 79,
"total_uncompressed_size": 61,
"type": "BYTE_ARRAY"
}
},
{
"file_offset": 195,
"meta_data": {
"codec": "LZ4",
"data_page_offset": 195,
"encoding_stats": [
{
"count": 1,
"encoding": "PLAIN_DICTIONARY",
"page_type": 2
},
{
"count": 1,
"encoding": "PLAIN_DICTIONARY",
"page_type": 0
}
],
"encodings": [
"BIT_PACKED",
"PLAIN_DICTIONARY",
"RLE"
],
"num_values": 4,
"path_in_schema": [
"v11"
],
"statistics": {
"max": 42.125,
"max_value": 42.125,
"min": 7.7,
"min_value": 7.7,
"null_count": 0
},
"total_compressed_size": 123,
"total_uncompressed_size": 108,
"type": "DOUBLE"
}
}
],
"num_rows": 4,
"total_byte_size": 262
}
],
"schema": [
{
"name": "",
"num_children": 3
},
{
"name": "c0",
"repetition_type": "REQUIRED",
"type": "INT64"
},
{
"name": "c1",
"repetition_type": "REQUIRED",
"type": "BYTE_ARRAY"
},
{
"name": "v11",
"repetition_type": "OPTIONAL",
"type": "DOUBLE"
}
],
"version": 1
}

Binary file not shown.

@ -0,0 +1,22 @@
[
[
1593604800,
[97, 98, 99],
42
],
[
1593604800,
[100, 101, 102],
7.7
],
[
1593604801,
[97, 98, 99],
42.125
],
[
1593604801,
[100, 101, 102],
7.7
]
]

@ -0,0 +1,133 @@
{
"created_by": "parquet-cpp version 1.5.1-SNAPSHOT",
"metadata_length": 330,
"num_rows": 4,
"row_groups": [
{
"columns": [
{
"file_offset": 89,
"meta_data": {
"codec": "LZ4_RAW",
"data_page_offset": 4,
"encoding_stats": [
{
"count": 1,
"encoding": "PLAIN",
"page_type": 0
}
],
"encodings": [
"PLAIN",
"RLE"
],
"num_values": 4,
"path_in_schema": [
"c0"
],
"statistics": {
"max": 1593604801,
"max_value": 1593604801,
"min": 1593604800,
"min_value": 1593604800,
"null_count": 0
},
"total_compressed_size": 85,
"total_uncompressed_size": 93,
"type": "INT64"
}
},
{
"file_offset": 229,
"meta_data": {
"codec": "LZ4_RAW",
"data_page_offset": 171,
"encoding_stats": [
{
"count": 1,
"encoding": "PLAIN",
"page_type": 0
}
],
"encodings": [
"PLAIN",
"RLE"
],
"num_values": 4,
"path_in_schema": [
"c1"
],
"statistics": {
"max_value": "def",
"min_value": "abc",
"null_count": 0
},
"total_compressed_size": 58,
"total_uncompressed_size": 59,
"type": "BYTE_ARRAY"
}
},
{
"file_offset": 375,
"meta_data": {
"codec": "LZ4_RAW",
"data_page_offset": 280,
"encoding_stats": [
{
"count": 1,
"encoding": "PLAIN",
"page_type": 0
}
],
"encodings": [
"PLAIN",
"RLE"
],
"num_values": 4,
"path_in_schema": [
"v11"
],
"statistics": {
"max": 42.125,
"max_value": 42.125,
"min": 7.7,
"min_value": 7.7,
"null_count": 0
},
"total_compressed_size": 95,
"total_uncompressed_size": 99,
"type": "DOUBLE"
}
}
],
"file_offset": 89,
"num_rows": 4,
"ordinal": 0,
"total_byte_size": 251,
"total_compressed_size": 238
}
],
"schema": [
{
"name": "schema",
"num_children": 3,
"repetition_type": "REQUIRED"
},
{
"name": "c0",
"repetition_type": "REQUIRED",
"type": "INT64"
},
{
"name": "c1",
"repetition_type": "REQUIRED",
"type": "BYTE_ARRAY"
},
{
"name": "v11",
"repetition_type": "OPTIONAL",
"type": "DOUBLE"
}
],
"version": 1
}

Binary file not shown.

@ -0,0 +1,292 @@
[
[
{
"count": 495,
"max": 190407175004000,
"mean": 190406671229999,
"min": 190406409000602,
"sum": 94251302258849570,
"variance": 0
},
{
"count": 495,
"max": 742,
"mean": 416,
"min": 115,
"sum": 206195,
"variance": 10374
},
{
"count": 495,
"max": 32150509,
"mean": 2401239,
"min": 737,
"sum": 1188613496,
"variance": 12977533288261
},
{
"count": 495,
"max": 35195000,
"mean": 3519838,
"min": 1000,
"sum": 1742320297,
"variance": 24581100553044
},
{
"count": 0,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 0,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 351,
"max": 658.63,
"mean": 57.76452991452993,
"min": 0,
"sum": 20275.350000000006,
"variance": 6310.500499135526
},
{
"count": 336,
"max": 312.16,
"mean": 49.40901785714286,
"min": 0,
"sum": 16601.43,
"variance": 3214.842695450431
},
{
"count": 366,
"max": 74136,
"mean": 6162.133196721318,
"min": 0,
"sum": 2255340.7500000023,
"variance": 104255249.59826614
},
{
"count": 334,
"max": 523800,
"mean": 19484.146706586827,
"min": 0,
"sum": 6507705,
"variance": 3563198650.906335
},
{
"count": 10,
"max": 16085,
"mean": 6698.8,
"min": 2628,
"sum": 66988,
"variance": 28540252.400000002
},
{
"count": 4,
"max": 1985,
"mean": 1285,
"min": 288,
"sum": 5140,
"variance": 509875.3333333333
},
{
"count": 348,
"max": 523800,
"mean": 18907.566091954024,
"min": 0,
"sum": 6579833,
"variance": 3428378496.7881336
},
{
"count": 495,
"max": 1,
"mean": 0,
"min": 0,
"sum": 452,
"variance": 0
},
{
"count": 352,
"max": 64749.63000000001,
"mean": 4951.024888352274,
"min": 0,
"sum": 1742760.7607000005,
"variance": 81195383.98823886
},
{
"count": 352,
"max": 2.8000000000000003,
"mean": 0.14237550619122732,
"min": 0,
"sum": 50.11617817931202,
"variance": 0.07516922114035923
},
{
"count": 339,
"max": 658063,
"mean": 1991.3067846607655,
"min": 0,
"sum": 675052.9999999995,
"variance": 1277234044.0126908
},
{
"count": 337,
"max": 300.7,
"mean": 65.32364985163204,
"min": 0,
"sum": 22014.069999999996,
"variance": 3904.805190507992
},
{
"count": 441,
"max": 18169,
"mean": 1528.077097505669,
"min": 0,
"sum": 673882,
"variance": 6122348.621315204
},
{
"count": 352,
"max": 121968,
"mean": 6201.877049689864,
"min": 0,
"sum": 2183060.721490832,
"variance": 142631612.6463931
},
{
"count": 495,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 495,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 495,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 495,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 495,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 495,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 81,
"max": 100,
"mean": 1.2345679012345678,
"min": 0,
"sum": 100,
"variance": 123.45679012345684
},
{
"count": 101,
"max": 4172.084000000002,
"mean": 42.29786138613863,
"min": 0,
"sum": 4272.084000000002,
"variance": 172355.84886194076
},
{
"count": 495,
"max": 1,
"mean": 0,
"min": 0,
"sum": 208,
"variance": 0
},
{
"count": 495,
"max": 1,
"mean": 0,
"min": 0,
"sum": 44,
"variance": 0
},
{
"count": 495,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
},
{
"count": 366,
"max": 121968,
"mean": 8685.222814207653,
"min": 0,
"sum": 3178791.550000001,
"variance": 243347757.98270744
},
{
"count": 364,
"max": 121968,
"mean": 8841.174394454862,
"min": 0,
"sum": 3218187.4795815693,
"variance": 244563632.41811454
},
{
"count": 495,
"max": 1,
"mean": 1,
"min": 1,
"sum": 495,
"variance": 0
},
{
"count": 495,
"max": 1608822900000000000,
"mean": 0,
"min": 1608822900000000000,
"sum": 0,
"variance": 0
},
{
"count": 495,
"max": 0,
"mean": 0,
"min": 0,
"sum": 0,
"variance": 0
}
]
]

File diff suppressed because it is too large Load Diff

Binary file not shown.

@ -1,20 +1,10 @@
import fs from 'fs'
import { compressors } from 'hyparquet-compressors'
import { describe, expect, it } from 'vitest'
import { gunzipSync } from 'zlib'
import { parquetRead } from '../src/hyparquet.js'
import { toJson } from '../src/utils.js'
import { fileToAsyncBuffer, fileToJson } from './helpers.js'
/**
 * Decompressor table used by the tests; the only codec defined here is GZIP.
 *
 * @type {import('../src/types.js').Compressors}
 */
const compressors = {
  /**
   * Inflate a gzip-compressed byte sequence.
   *
   * @param {Uint8Array} input gzip-compressed bytes
   * @param {number} outputLength length in bytes of the returned view
   * @returns {Uint8Array} view of `outputLength` bytes starting at the
   *   beginning of the inflated data (no copy is made)
   */
  GZIP(input, outputLength) {
    // gunzipSync returns a Buffer; re-wrap its backing memory as a plain
    // Uint8Array that starts at the same offset.
    const { buffer, byteOffset } = gunzipSync(input)
    return new Uint8Array(buffer, byteOffset, outputLength)
  },
}
describe('parquetRead test files', () => {
const files = fs.readdirSync('test/files').filter(f => f.endsWith('.parquet'))