diff --git a/package.json b/package.json index d49af32..4faaf79 100644 --- a/package.json +++ b/package.json @@ -45,10 +45,10 @@ "devDependencies": { "@types/node": "22.14.0", "@vitest/coverage-v8": "3.1.1", - "eslint": "9.23.0", + "eslint": "9.24.0", "eslint-plugin-jsdoc": "50.6.9", "hyparquet-compressors": "1.1.1", - "typescript": "5.8.2", + "typescript": "5.8.3", "typescript-eslint": "8.29.0", "vitest": "3.1.1" } diff --git a/src/read.js b/src/read.js index d2fff05..ccb3bdc 100644 --- a/src/read.js +++ b/src/read.js @@ -22,13 +22,14 @@ export async function parquetRead(options) { if (!options.file || !(options.file.byteLength >= 0)) { throw new Error('parquetRead expected file AsyncBuffer') } + const rowStart = options.rowStart || 0 + if (rowStart < 0) throw new Error('parquetRead rowStart must be positive') // load metadata if not provided options.metadata ||= await parquetMetadataAsync(options.file) if (!options.metadata) throw new Error('parquet metadata not found') const { metadata, onComplete, rowEnd } = options - const rowStart = options.rowStart || 0 /** @type {any[][]} */ const rowData = [] diff --git a/src/utils.js b/src/utils.js index 22c4498..9a11350 100644 --- a/src/utils.js +++ b/src/utils.js @@ -2,8 +2,6 @@ import { defaultInitialFetchSize } from './metadata.js' /** * Replace bigint, date, etc with legal JSON types. - * When parsing parquet files, bigints are used to represent 64-bit integers. - * However, JSON does not support bigints, so it's helpful to convert to numbers. 
* * @param {any} obj object to convert * @returns {unknown} converted object diff --git a/test/files/page_indexed.column_indexes.json b/test/files/page_indexed.column_indexes.json new file mode 100644 index 0000000..510fd9a --- /dev/null +++ b/test/files/page_indexed.column_indexes.json @@ -0,0 +1,108 @@ +[ + [ + { + "boundary_order": "ASCENDING", + "max_values": [ + "good", + "good", + "good", + "good", + "good", + "good", + "good", + "good", + "good", + "good" + ], + "min_values": [ + "bad", + "bad", + "bad", + "bad", + "bad", + "bad", + "bad", + "bad", + "bad", + "bad" + ], + "null_counts": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "null_pages": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ] + } + ], + [ + { + "boundary_order": "UNORDERED", + "max_values": [ + "good", + "bad", + "good", + "bad", + "good", + "bad", + "good", + "good", + "bad", + "good" + ], + "min_values": [ + "bad", + "bad", + "bad", + "bad", + "bad", + "bad", + "bad", + "bad", + "bad", + "bad" + ], + "null_counts": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ], + "null_pages": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ] + } + ] +] diff --git a/test/files/page_indexed.json b/test/files/page_indexed.json new file mode 100644 index 0000000..5bc4b2c --- /dev/null +++ b/test/files/page_indexed.json @@ -0,0 +1,202 @@ +[ + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + 
["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["good"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"], + ["bad"] +] diff --git a/test/files/page_indexed.metadata.json b/test/files/page_indexed.metadata.json new file mode 100644 index 0000000..39b6586 --- /dev/null +++ b/test/files/page_indexed.metadata.json @@ -0,0 +1,130 @@ +{ + "version": 2, + "schema": [ + { + "repetition_type": "REQUIRED", + "name": "schema", + "num_children": 1 + }, + { + "type": 
"BYTE_ARRAY", + "repetition_type": "OPTIONAL", + "name": "col", + "converted_type": "UTF8", + "logical_type": { + "type": "STRING" + } + } + ], + "num_rows": 200, + "row_groups": [ + { + "columns": [ + { + "file_offset": 338, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "col" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 312, + "total_compressed_size": 334, + "data_page_offset": 35, + "dictionary_page_offset": 4, + "statistics": { + "null_count": 0, + "max_value": "good", + "min_value": "bad" + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 10 + } + ] + }, + "offset_index_offset": 1036, + "offset_index_length": 85, + "column_index_offset": 798, + "column_index_length": 121 + } + ], + "total_byte_size": 312, + "num_rows": 100, + "file_offset": 4, + "total_compressed_size": 334, + "ordinal": 0 + }, + { + "columns": [ + { + "file_offset": 731, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "col" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 306, + "total_compressed_size": 328, + "data_page_offset": 434, + "dictionary_page_offset": 403, + "statistics": { + "null_count": 0, + "max_value": "good", + "min_value": "bad" + }, + "encoding_stats": [ + { + "page_type": "DICTIONARY_PAGE", + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": "DATA_PAGE", + "encoding": "RLE_DICTIONARY", + "count": 10 + } + ] + }, + "offset_index_offset": 1121, + "offset_index_length": 86, + "column_index_offset": 919, + "column_index_length": 117 + } + ], + "total_byte_size": 306, + "num_rows": 100, + "file_offset": 403, + "total_compressed_size": 328, + "ordinal": 1 + } + ], + "key_value_metadata": [ + { + "key": "ARROW:schema", + "value": 
"/////3AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEFEAAAABgAAAAEAAAAAAAAAAMAAABjb2wABAAEAAQAAAAAAAAA" + } + ], + "created_by": "parquet-cpp-arrow version 15.0.0", + "metadata_length": 447 +} diff --git a/test/files/page_indexed.offset_indexes.json b/test/files/page_indexed.offset_indexes.json new file mode 100644 index 0000000..1ac7ee4 --- /dev/null +++ b/test/files/page_indexed.offset_indexes.json @@ -0,0 +1,114 @@ +[ + [ + { + "page_locations": [ + { + "compressed_page_size": 30, + "first_row_index": 0, + "offset": 35 + }, + { + "compressed_page_size": 30, + "first_row_index": 10, + "offset": 65 + }, + { + "compressed_page_size": 31, + "first_row_index": 20, + "offset": 95 + }, + { + "compressed_page_size": 30, + "first_row_index": 30, + "offset": 126 + }, + { + "compressed_page_size": 30, + "first_row_index": 40, + "offset": 156 + }, + { + "compressed_page_size": 30, + "first_row_index": 50, + "offset": 186 + }, + { + "compressed_page_size": 31, + "first_row_index": 60, + "offset": 216 + }, + { + "compressed_page_size": 30, + "first_row_index": 70, + "offset": 247 + }, + { + "compressed_page_size": 30, + "first_row_index": 80, + "offset": 277 + }, + { + "compressed_page_size": 31, + "first_row_index": 90, + "offset": 307 + } + ] + } + ], + [ + { + "page_locations": [ + { + "compressed_page_size": 30, + "first_row_index": 0, + "offset": 434 + }, + { + "compressed_page_size": 29, + "first_row_index": 10, + "offset": 464 + }, + { + "compressed_page_size": 31, + "first_row_index": 20, + "offset": 493 + }, + { + "compressed_page_size": 29, + "first_row_index": 30, + "offset": 524 + }, + { + "compressed_page_size": 30, + "first_row_index": 40, + "offset": 553 + }, + { + "compressed_page_size": 29, + "first_row_index": 50, + "offset": 583 + }, + { + "compressed_page_size": 30, + "first_row_index": 60, + "offset": 612 + }, + { + "compressed_page_size": 30, + "first_row_index": 70, + "offset": 642 + }, + { + 
"compressed_page_size": 29, + "first_row_index": 80, + "offset": 672 + }, + { + "compressed_page_size": 30, + "first_row_index": 90, + "offset": 701 + } + ] + } + ] +] diff --git a/test/files/page_indexed.parquet b/test/files/page_indexed.parquet new file mode 100644 index 0000000..fbb176c Binary files /dev/null and b/test/files/page_indexed.parquet differ diff --git a/test/read.test.js b/test/read.test.js index 4ad9a34..cea3d85 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -1,7 +1,10 @@ -import { describe, expect, it } from 'vitest' -import { parquetRead, parquetReadObjects } from '../src/hyparquet.js' +import { describe, expect, it, vi } from 'vitest' +import { convertWithDictionary } from '../src/convert.js' +import { parquetMetadataAsync, parquetRead, parquetReadObjects } from '../src/hyparquet.js' import { asyncBufferFromFile } from '../src/utils.js' +vi.mock('../src/convert.js', { spy: true }) + describe('parquetRead', () => { it('throws error for undefined file', async () => { // @ts-expect-error testing invalid input @@ -164,4 +167,18 @@ describe('parquetRead', () => { { a: 'abc', b: 5, c: 2, d: true, e: [1, 2] }, ]) }) + + it('skips converting unnecessary pages', async () => { + const file = await asyncBufferFromFile('test/files/page_indexed.parquet') + const metadata = await parquetMetadataAsync(file) + vi.mocked(convertWithDictionary).mockClear() + const rows = await parquetReadObjects({ + file, + metadata, + rowStart: 90, + rowEnd: 91, + }) + expect(rows).toEqual([{ col: 'bad' }]) + expect(convertWithDictionary).toHaveBeenCalledTimes(10) + }) }) diff --git a/test/readFiles.test.js b/test/readFiles.test.js index 3e9f505..260ea6b 100644 --- a/test/readFiles.test.js +++ b/test/readFiles.test.js @@ -1,7 +1,7 @@ import fs from 'fs' import { compressors } from 'hyparquet-compressors' import { describe, expect, it } from 'vitest' -import { parquetRead } from '../src/hyparquet.js' +import { parquetMetadataAsync, parquetRead } from 
'../src/hyparquet.js' import { asyncBufferFromFile, toJson } from '../src/utils.js' import { fileToJson } from './helpers.js' @@ -17,10 +17,30 @@ describe('parquetRead test files', () => { onComplete(rows) { const base = filename.replace('.parquet', '') const expected = fileToJson(`test/files/${base}.json`) - // stringify and parse to make legal json + // stringify and parse to make legal json (NaN, -0, etc) expect(JSON.parse(JSON.stringify(toJson(rows)))).toEqual(expected) }, }) }) + + it(`read the last row from ${filename}`, async () => { + // this exercises some of the page-skipping optimizations + const file = await asyncBufferFromFile(`test/files/${filename}`) + const metadata = await parquetMetadataAsync(file) + let numRows = Number(metadata.num_rows) + // repeated_no_annotation has wrong num_rows in metadata: + if (filename === 'repeated_no_annotation.parquet') numRows = 6 + await parquetRead({ + file, + compressors, + rowStart: numRows - 1, + rowEnd: numRows, + onComplete(rows) { + const base = filename.replace('.parquet', '') + const expected = [fileToJson(`test/files/${base}.json`).at(-1)] + expect(toJson(rows)).toEqual(expected) + }, + }) + }) }) })