diff --git a/test/files/page_indexed.column_indexes.json b/test/files/page_indexed.column_indexes.json index 510fd9a..603c52d 100644 --- a/test/files/page_indexed.column_indexes.json +++ b/test/files/page_indexed.column_indexes.json @@ -2,107 +2,33 @@ [ { "boundary_order": "ASCENDING", - "max_values": [ - "good", - "good", - "good", - "good", - "good", - "good", - "good", - "good", - "good", - "good" - ], - "min_values": [ - "bad", - "bad", - "bad", - "bad", - "bad", - "bad", - "bad", - "bad", - "bad", - "bad" - ], - "null_counts": [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 - ], - "null_pages": [ - false, - false, - false, - false, - false, - false, - false, - false, - false, - false - ] + "max_values": [9, 19, 29, 39, 49, 59, 69, 79, 89, 99], + "min_values": [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], + "null_counts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "null_pages": [false, false, false, false, false, false, false, false, false, false] + }, + { + "boundary_order": "ASCENDING", + "max_values": ["good", "good", "good", "good", "good", "good", "good", "good", "good", "good"], + "min_values": ["bad", "bad", "bad", "bad", "bad", "bad", "bad", "bad", "bad", "bad"], + "null_counts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "null_pages": [false, false, false, false, false, false, false, false, false, false] } ], [ + { + "boundary_order": "ASCENDING", + "max_values": [109, 119, 129, 139, 149, 159, 169, 179, 189, 199], + "min_values": [100, 110, 120, 130, 140, 150, 160, 170, 180, 190], + "null_counts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "null_pages": [false, false, false, false, false, false, false, false, false, false] + }, { "boundary_order": "UNORDERED", - "max_values": [ - "good", - "bad", - "good", - "bad", - "good", - "bad", - "good", - "good", - "bad", - "good" - ], - "min_values": [ - "bad", - "bad", - "bad", - "bad", - "bad", - "bad", - "bad", - "bad", - "bad", - "bad" - ], - "null_counts": [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0 - ], - "null_pages": [ - false, - false, - false, - false, - false, - false, - false, - false, - false, - false - ] + "max_values": ["good", "bad", "good", "bad", "good", "bad", "good", "good", "bad", "good"], + "min_values": ["bad", "bad", "bad", "bad", "bad", "bad", "bad", "bad", "bad", "bad"], + "null_counts": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "null_pages": [false, false, false, false, false, false, false, false, false, false] } ] ] diff --git a/test/files/page_indexed.json b/test/files/page_indexed.json index 5bc4b2c..cd746a0 100644 --- a/test/files/page_indexed.json +++ b/test/files/page_indexed.json @@ -1,202 +1,202 @@ [ - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["good"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"], - ["bad"] + [0, "bad"], + [1, "bad"], + [2, "bad"], + [3, "bad"], + [4, "bad"], + [5, "bad"], + [6, "good"], + [7, "bad"], + [8, "bad"], + [9, "bad"], + [10, "good"], + [11, "bad"], + [12, "bad"], + [13, "bad"], + [14, "bad"], + [15, "bad"], + [16, "bad"], + [17, "bad"], + [18, "bad"], + [19, "bad"], + [20, "bad"], + [21, "bad"], + [22, "bad"], + [23, "bad"], + [24, "bad"], + [25, "bad"], + [26, "bad"], + [27, "bad"], + [28, "bad"], + [29, "good"], + [30, "bad"], + [31, "bad"], + [32, "good"], + [33, "bad"], + [34, "bad"], + [35, "bad"], + [36, "bad"], + [37, "good"], + [38, "bad"], + [39, "bad"], + [40, "bad"], + [41, "bad"], + [42, "good"], + [43, "bad"], + [44, "bad"], + [45, "bad"], + [46, "bad"], + [47, "bad"], + [48, "bad"], + [49, "bad"], + [50, "bad"], + [51, "bad"], + [52, "bad"], + [53, "bad"], + [54, "bad"], + [55, "bad"], + [56, "good"], + [57, "bad"], + [58, "good"], + [59, "bad"], + [60, "bad"], + [61, "bad"], + [62, "bad"], + [63, "bad"], + [64, "bad"], + [65, "bad"], + [66, "bad"], + [67, "bad"], + [68, "good"], + [69, "bad"], + [70, "bad"], + [71, "bad"], + [72, "good"], + [73, "bad"], + [74, "bad"], + [75, "bad"], + [76, "bad"], + [77, "good"], + [78, "bad"], + [79, "bad"], + [80, "bad"], + [81, "bad"], + [82, "bad"], + [83, "good"], + [84, "bad"], + [85, "bad"], + [86, "bad"], + [87, "bad"], + [88, "bad"], + [89, "bad"], + [90, "bad"], + [91, "bad"], + [92, "bad"], + [93, "bad"], + [94, "bad"], + [95, "bad"], + [96, "bad"], + [97, "bad"], + [98, "good"], + [99, "bad"], + [100, "good"], + [101, "bad"], + [102, "bad"], + [103, "bad"], + [104, "bad"], + [105, "bad"], + [106, "bad"], + [107, "bad"], + [108, "bad"], + [109, "good"], + [110, "bad"], + [111, "bad"], + [112, "bad"], + [113, "bad"], + [114, "bad"], + [115, "bad"], + [116, "bad"], + [117, "bad"], + [118, "bad"], + [119, "bad"], + [120, "bad"], + [121, "bad"], + [122, "bad"], + [123, "bad"], + [124, "bad"], + [125, "bad"], + [126, "bad"], + [127, "bad"], + [128, "good"], + [129, "bad"], + [130, "bad"], + [131, "bad"], + [132, "bad"], + [133, "bad"], + [134, "bad"], + [135, "bad"], + [136, "bad"], + [137, "bad"], + [138, "bad"], + [139, "bad"], + [140, "bad"], + [141, "bad"], + [142, "bad"], + [143, "bad"], + [144, "bad"], + [145, "good"], + [146, "bad"], + [147, "bad"], + [148, "good"], + [149, "bad"], + [150, "bad"], + [151, "bad"], + [152, "bad"], + [153, "bad"], + [154, "bad"], + [155, "bad"], + [156, "bad"], + [157, "bad"], + [158, "bad"], + [159, "bad"], + [160, "bad"], + [161, "bad"], + [162, "bad"], + [163, "bad"], + [164, "good"], + [165, "bad"], + [166, "bad"], + [167, "bad"], + [168, "good"], + [169, "bad"], + [170, "bad"], + [171, "good"], + [172, "bad"], + [173, "bad"], + [174, "bad"], + [175, "bad"], + [176, "bad"], + [177, "bad"], + [178, "bad"], + [179, "bad"], + [180, "bad"], + [181, "bad"], + [182, "bad"], + [183, "bad"], + [184, "bad"], + [185, "bad"], + [186, "bad"], + [187, "bad"], + [188, "bad"], + [189, "bad"], + [190, "good"], + [191, "bad"], + [192, "bad"], + [193, "bad"], + [194, "bad"], + [195, "bad"], + [196, "bad"], + [197, "bad"], + [198, "bad"], + [199, "bad"] ] diff --git a/test/files/page_indexed.metadata.json b/test/files/page_indexed.metadata.json index 39b6586..e90f87c 100644 --- a/test/files/page_indexed.metadata.json +++ b/test/files/page_indexed.metadata.json @@ -4,12 +4,17 @@ { "repetition_type": "REQUIRED", "name": "schema", - "num_children": 1 + "num_children": 2 + }, + { + "type": "INT64", + "repetition_type": "OPTIONAL", + "name": "row" }, { "type": "BYTE_ARRAY", "repetition_type": "OPTIONAL", - "name": "col", + "name": "quality", "converted_type": "UTF8", "logical_type": { "type": "STRING" @@ -21,7 +26,50 @@ { "columns": [ { - "file_offset": 338, + "file_offset": 0, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "row" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 1197, + "total_compressed_size": 828, + "data_page_offset": 432, + "dictionary_page_offset": 4, + "statistics": { + "max": 99, + "min": 0, + "null_count": 0, + "max_value": 99, + "min_value": 0 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 10 + } + ] + }, + "offset_index_offset": 2986, + "offset_index_length": 86, + "column_index_offset": 2326, + "column_index_length": 211 + }, + { + "file_offset": 0, "meta_data": { "type": "BYTE_ARRAY", "encodings": [ @@ -30,14 +78,14 @@ "RLE_DICTIONARY" ], "path_in_schema": [ - "col" + "quality" ], "codec": "SNAPPY", "num_values": 100, "total_uncompressed_size": 312, "total_compressed_size": 334, - "data_page_offset": 35, - "dictionary_page_offset": 4, + "data_page_offset": 863, + "dictionary_page_offset": 832, "statistics": { "null_count": 0, "max_value": "good", @@ -56,22 +104,65 @@ } ] }, - "offset_index_offset": 1036, - "offset_index_length": 85, - "column_index_offset": 798, + "offset_index_offset": 3072, + "offset_index_length": 86, + "column_index_offset": 2537, "column_index_length": 121 } ], - "total_byte_size": 312, + "total_byte_size": 1509, "num_rows": 100, "file_offset": 4, - "total_compressed_size": 334, + "total_compressed_size": 1162, "ordinal": 0 }, { "columns": [ { - "file_offset": 731, + "file_offset": 0, + "meta_data": { + "type": "INT64", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "row" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 1197, + "total_compressed_size": 832, + "data_page_offset": 1598, + "dictionary_page_offset": 1166, + "statistics": { + "max": 199, + "min": 100, + "null_count": 0, + "max_value": 199, + "min_value": 100 + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 10 + } + ] + }, + "offset_index_offset": 3158, + "offset_index_length": 86, + "column_index_offset": 2658, + "column_index_length": 211 + }, + { + "file_offset": 0, "meta_data": { "type": "BYTE_ARRAY", "encodings": [ @@ -80,14 +171,14 @@ "RLE_DICTIONARY" ], "path_in_schema": [ - "col" + "quality" ], "codec": "SNAPPY", "num_values": 100, "total_uncompressed_size": 306, "total_compressed_size": 328, - "data_page_offset": 434, - "dictionary_page_offset": 403, + "data_page_offset": 2029, + "dictionary_page_offset": 1998, "statistics": { "null_count": 0, "max_value": "good", @@ -106,25 +197,25 @@ } ] }, - "offset_index_offset": 1121, + "offset_index_offset": 3244, "offset_index_length": 86, - "column_index_offset": 919, + "column_index_offset": 2869, "column_index_length": 117 } ], - "total_byte_size": 306, + "total_byte_size": 1503, "num_rows": 100, - "file_offset": 403, - "total_compressed_size": 328, + "file_offset": 1166, + "total_compressed_size": 1160, "ordinal": 1 } ], "key_value_metadata": [ { "key": "ARROW:schema", - "value": "/////3AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEFEAAAABgAAAAEAAAAAAAAAAMAAABjb2wABAAEAAQAAAAAAAAA" + "value": "/////6gAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABEAAAABAAAANT///8AAAEFEAAAABwAAAAEAAAAAAAAAAcAAABxdWFsaXR5AAQABAAEAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAECEAAAABwAAAAEAAAAAAAAAAMAAAByb3cACAAMAAgABwAIAAAAAAAAAUAAAAA=" } ], - "created_by": "parquet-cpp-arrow version 15.0.0", - "metadata_length": 447 + "created_by": "parquet-cpp-arrow version 19.0.1", + "metadata_length": 761 } diff --git a/test/files/page_indexed.offset_indexes.json b/test/files/page_indexed.offset_indexes.json index 1ac7ee4..e036172 100644 --- a/test/files/page_indexed.offset_indexes.json +++ b/test/files/page_indexed.offset_indexes.json @@ -1,112 +1,220 @@ [ [ + { + "page_locations": [ + { + "compressed_page_size": 36, + "first_row_index": 0, + "offset": 432 + }, + { + "compressed_page_size": 38, + "first_row_index": 10, + "offset": 468 + }, + { + "compressed_page_size": 38, + "first_row_index": 20, + "offset": 506 + }, + { + "compressed_page_size": 40, + "first_row_index": 30, + "offset": 544 + }, + { + "compressed_page_size": 40, + "first_row_index": 40, + "offset": 584 + }, + { + "compressed_page_size": 40, + "first_row_index": 50, + "offset": 624 + }, + { + "compressed_page_size": 42, + "first_row_index": 60, + "offset": 664 + }, + { + "compressed_page_size": 42, + "first_row_index": 70, + "offset": 706 + }, + { + "compressed_page_size": 42, + "first_row_index": 80, + "offset": 748 + }, + { + "compressed_page_size": 42, + "first_row_index": 90, + "offset": 790 + } + ] + }, { "page_locations": [ { "compressed_page_size": 30, "first_row_index": 0, - "offset": 35 + "offset": 863 }, { "compressed_page_size": 30, "first_row_index": 10, - "offset": 65 + "offset": 893 }, { "compressed_page_size": 31, "first_row_index": 20, - "offset": 95 + "offset": 923 }, { "compressed_page_size": 30, "first_row_index": 30, - "offset": 126 + "offset": 954 }, { "compressed_page_size": 30, "first_row_index": 40, - "offset": 156 + "offset": 984 }, { "compressed_page_size": 30, "first_row_index": 50, - "offset": 186 + "offset": 1014 }, { "compressed_page_size": 31, "first_row_index": 60, - "offset": 216 + "offset": 1044 }, { "compressed_page_size": 30, "first_row_index": 70, - "offset": 247 + "offset": 1075 }, { "compressed_page_size": 30, "first_row_index": 80, - "offset": 277 + "offset": 1105 }, { "compressed_page_size": 31, "first_row_index": 90, - "offset": 307 + "offset": 1135 } ] } ], [ + { + "page_locations": [ + { + "compressed_page_size": 36, + "first_row_index": 0, + "offset": 1598 + }, + { + "compressed_page_size": 38, + "first_row_index": 10, + "offset": 1634 + }, + { + "compressed_page_size": 38, + "first_row_index": 20, + "offset": 1672 + }, + { + "compressed_page_size": 40, + "first_row_index": 30, + "offset": 1710 + }, + { + "compressed_page_size": 40, + "first_row_index": 40, + "offset": 1750 + }, + { + "compressed_page_size": 40, + "first_row_index": 50, + "offset": 1790 + }, + { + "compressed_page_size": 42, + "first_row_index": 60, + "offset": 1830 + }, + { + "compressed_page_size": 42, + "first_row_index": 70, + "offset": 1872 + }, + { + "compressed_page_size": 42, + "first_row_index": 80, + "offset": 1914 + }, + { + "compressed_page_size": 42, + "first_row_index": 90, + "offset": 1956 + } + ] + }, { "page_locations": [ { "compressed_page_size": 30, "first_row_index": 0, - "offset": 434 + "offset": 2029 }, { "compressed_page_size": 29, "first_row_index": 10, - "offset": 464 + "offset": 2059 }, { "compressed_page_size": 31, "first_row_index": 20, - "offset": 493 + "offset": 2088 }, { "compressed_page_size": 29, "first_row_index": 30, - "offset": 524 + "offset": 2119 }, { "compressed_page_size": 30, "first_row_index": 40, - "offset": 553 + "offset": 2148 }, { "compressed_page_size": 29, "first_row_index": 50, - "offset": 583 + "offset": 2178 }, { "compressed_page_size": 30, "first_row_index": 60, - "offset": 612 + "offset": 2207 }, { "compressed_page_size": 30, "first_row_index": 70, - "offset": 642 + "offset": 2237 }, { "compressed_page_size": 29, "first_row_index": 80, - "offset": 672 + "offset": 2267 }, { "compressed_page_size": 30, "first_row_index": 90, - "offset": 701 + "offset": 2296 } ] } diff --git a/test/files/page_indexed.parquet b/test/files/page_indexed.parquet index fbb176c..4c03a8a 100644 Binary files a/test/files/page_indexed.parquet and b/test/files/page_indexed.parquet differ diff --git a/test/read.test.js b/test/read.test.js index b80ea05..5540102 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -178,13 +178,13 @@ describe('parquetRead', () => { rowStart: 90, rowEnd: 91, }) - expect(rows).toEqual([{ col: 'bad' }]) - expect(convertWithDictionary).toHaveBeenCalledTimes(2) + expect(rows).toEqual([{ row: 90n, quality: 'bad' }]) + expect(convertWithDictionary).toHaveBeenCalledTimes(4) }) it('reads individual pages', async () => { - const file = await asyncBufferFromFile('test/files/page_indexed.parquet') - /** @type {import('../src/types.js').ColumnData[]} */ + const file = countingBuffer(await asyncBufferFromFile('test/files/page_indexed.parquet')) + /** @type {ColumnData[]} */ const pages = [] await parquetRead({ @@ -196,7 +196,13 @@ describe('parquetRead', () => { expect(pages).toEqual([ { - columnName: 'col', + columnName: 'row', + columnData: Array.from({ length: 100 }, (_, i) => BigInt(i)), + rowStart: 0, + rowEnd: 100, + }, + { + columnName: 'quality', columnData: [ 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', @@ -213,7 +219,13 @@ describe('parquetRead', () => { rowEnd: 100, }, { - columnName: 'col', + columnName: 'row', + columnData: Array.from({ length: 100 }, (_, i) => BigInt(i + 100)), + rowStart: 100, + rowEnd: 200, + }, + { + columnName: 'quality', columnData: [ 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', @@ -230,5 +242,25 @@ describe('parquetRead', () => { rowEnd: 200, }, ]) + expect(file.fetches).toBe(3) // 1 metadata, 2 rowgroups }) }) + +/** + * Wraps an AsyncBuffer to count the number of fetches made + * + * @import {AsyncBuffer, ColumnData} from '../src/types.js' + * @param {AsyncBuffer} asyncBuffer + * @returns {AsyncBuffer & {fetches: number}} + */ + +function countingBuffer(asyncBuffer) { + return { + ...asyncBuffer, + fetches: 0, + slice(start, end) { + this.fetches++ + return asyncBuffer.slice(start, end) + }, + } +}