Test for reading the last row of files

This commit is contained in:
Kenny Daniel 2025-04-06 21:21:24 -07:00
parent 8c4c7456b4
commit ba74d58dd3
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
10 changed files with 599 additions and 9 deletions

@ -45,10 +45,10 @@
"devDependencies": {
"@types/node": "22.14.0",
"@vitest/coverage-v8": "3.1.1",
"eslint": "9.23.0",
"eslint": "9.24.0",
"eslint-plugin-jsdoc": "50.6.9",
"hyparquet-compressors": "1.1.1",
"typescript": "5.8.2",
"typescript": "5.8.3",
"typescript-eslint": "8.29.0",
"vitest": "3.1.1"
}

@ -22,13 +22,14 @@ export async function parquetRead(options) {
if (!options.file || !(options.file.byteLength >= 0)) {
throw new Error('parquetRead expected file AsyncBuffer')
}
const rowStart = options.rowStart || 0
if (rowStart < 0) throw new Error('parquetRead rowStart must be postive')
// load metadata if not provided
options.metadata ||= await parquetMetadataAsync(options.file)
if (!options.metadata) throw new Error('parquet metadata not found')
const { metadata, onComplete, rowEnd } = options
const rowStart = options.rowStart || 0
/** @type {any[][]} */
const rowData = []

@ -2,8 +2,6 @@ import { defaultInitialFetchSize } from './metadata.js'
/**
* Replace bigint, date, etc with legal JSON types.
* When parsing parquet files, bigints are used to represent 64-bit integers.
* However, JSON does not support bigints, so it's helpful to convert to numbers.
*
* @param {any} obj object to convert
* @returns {unknown} converted object

@ -0,0 +1,108 @@
[
[
{
"boundary_order": "ASCENDING",
"max_values": [
"good",
"good",
"good",
"good",
"good",
"good",
"good",
"good",
"good",
"good"
],
"min_values": [
"bad",
"bad",
"bad",
"bad",
"bad",
"bad",
"bad",
"bad",
"bad",
"bad"
],
"null_counts": [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
],
"null_pages": [
false,
false,
false,
false,
false,
false,
false,
false,
false,
false
]
}
],
[
{
"boundary_order": "UNORDERED",
"max_values": [
"good",
"bad",
"good",
"bad",
"good",
"bad",
"good",
"good",
"bad",
"good"
],
"min_values": [
"bad",
"bad",
"bad",
"bad",
"bad",
"bad",
"bad",
"bad",
"bad",
"bad"
],
"null_counts": [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
],
"null_pages": [
false,
false,
false,
false,
false,
false,
false,
false,
false,
false
]
}
]
]

@ -0,0 +1,202 @@
[
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["good"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"],
["bad"]
]

@ -0,0 +1,130 @@
{
"version": 2,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "schema",
"num_children": 1
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "col",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
}
],
"num_rows": 200,
"row_groups": [
{
"columns": [
{
"file_offset": 338,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"col"
],
"codec": "SNAPPY",
"num_values": 100,
"total_uncompressed_size": 312,
"total_compressed_size": 334,
"data_page_offset": 35,
"dictionary_page_offset": 4,
"statistics": {
"null_count": 0,
"max_value": "good",
"min_value": "bad"
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 10
}
]
},
"offset_index_offset": 1036,
"offset_index_length": 85,
"column_index_offset": 798,
"column_index_length": 121
}
],
"total_byte_size": 312,
"num_rows": 100,
"file_offset": 4,
"total_compressed_size": 334,
"ordinal": 0
},
{
"columns": [
{
"file_offset": 731,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"col"
],
"codec": "SNAPPY",
"num_values": 100,
"total_uncompressed_size": 306,
"total_compressed_size": 328,
"data_page_offset": 434,
"dictionary_page_offset": 403,
"statistics": {
"null_count": 0,
"max_value": "good",
"min_value": "bad"
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 10
}
]
},
"offset_index_offset": 1121,
"offset_index_length": 86,
"column_index_offset": 919,
"column_index_length": 117
}
],
"total_byte_size": 306,
"num_rows": 100,
"file_offset": 403,
"total_compressed_size": 328,
"ordinal": 1
}
],
"key_value_metadata": [
{
"key": "ARROW:schema",
"value": "/////3AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEFEAAAABgAAAAEAAAAAAAAAAMAAABjb2wABAAEAAQAAAAAAAAA"
}
],
"created_by": "parquet-cpp-arrow version 15.0.0",
"metadata_length": 447
}

@ -0,0 +1,114 @@
[
[
{
"page_locations": [
{
"compressed_page_size": 30,
"first_row_index": 0,
"offset": 35
},
{
"compressed_page_size": 30,
"first_row_index": 10,
"offset": 65
},
{
"compressed_page_size": 31,
"first_row_index": 20,
"offset": 95
},
{
"compressed_page_size": 30,
"first_row_index": 30,
"offset": 126
},
{
"compressed_page_size": 30,
"first_row_index": 40,
"offset": 156
},
{
"compressed_page_size": 30,
"first_row_index": 50,
"offset": 186
},
{
"compressed_page_size": 31,
"first_row_index": 60,
"offset": 216
},
{
"compressed_page_size": 30,
"first_row_index": 70,
"offset": 247
},
{
"compressed_page_size": 30,
"first_row_index": 80,
"offset": 277
},
{
"compressed_page_size": 31,
"first_row_index": 90,
"offset": 307
}
]
}
],
[
{
"page_locations": [
{
"compressed_page_size": 30,
"first_row_index": 0,
"offset": 434
},
{
"compressed_page_size": 29,
"first_row_index": 10,
"offset": 464
},
{
"compressed_page_size": 31,
"first_row_index": 20,
"offset": 493
},
{
"compressed_page_size": 29,
"first_row_index": 30,
"offset": 524
},
{
"compressed_page_size": 30,
"first_row_index": 40,
"offset": 553
},
{
"compressed_page_size": 29,
"first_row_index": 50,
"offset": 583
},
{
"compressed_page_size": 30,
"first_row_index": 60,
"offset": 612
},
{
"compressed_page_size": 30,
"first_row_index": 70,
"offset": 642
},
{
"compressed_page_size": 29,
"first_row_index": 80,
"offset": 672
},
{
"compressed_page_size": 30,
"first_row_index": 90,
"offset": 701
}
]
}
]
]

Binary file not shown.

@ -1,7 +1,10 @@
import { describe, expect, it } from 'vitest'
import { parquetRead, parquetReadObjects } from '../src/hyparquet.js'
import { describe, expect, it, vi } from 'vitest'
import { convertWithDictionary } from '../src/convert.js'
import { parquetMetadataAsync, parquetRead, parquetReadObjects } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/utils.js'
vi.mock('../src/convert.js', { spy: true })
describe('parquetRead', () => {
it('throws error for undefined file', async () => {
// @ts-expect-error testing invalid input
@ -164,4 +167,18 @@ describe('parquetRead', () => {
{ a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
])
})
it('skips converting unnecessary pages', async () => {
  // Open the page-indexed fixture and parse its footer once up front.
  const buffer = await asyncBufferFromFile('test/files/page_indexed.parquet')
  const footer = await parquetMetadataAsync(buffer)

  // Reset the spy so only conversions triggered by the read below are counted.
  vi.mocked(convertWithDictionary).mockClear()

  // Ask for the single row at index 90; the result below shows rowEnd is exclusive.
  const readOptions = { file: buffer, metadata: footer, rowStart: 90, rowEnd: 91 }
  const result = await parquetReadObjects(readOptions)

  expect(result).toEqual([{ col: 'bad' }])
  // NOTE(review): 10 presumably reflects how many dictionary conversions remain
  // after page skipping for this fixture — confirm against the reader.
  expect(convertWithDictionary).toHaveBeenCalledTimes(10)
})
})

@ -1,7 +1,7 @@
import fs from 'fs'
import { compressors } from 'hyparquet-compressors'
import { describe, expect, it } from 'vitest'
import { parquetRead } from '../src/hyparquet.js'
import { parquetMetadataAsync, parquetRead } from '../src/hyparquet.js'
import { asyncBufferFromFile, toJson } from '../src/utils.js'
import { fileToJson } from './helpers.js'
@ -17,10 +17,30 @@ describe('parquetRead test files', () => {
onComplete(rows) {
const base = filename.replace('.parquet', '')
const expected = fileToJson(`test/files/${base}.json`)
// stringify and parse to make legal json
// stringify and parse to make legal json (NaN, -0, etc)
expect(JSON.parse(JSON.stringify(toJson(rows)))).toEqual(expected)
},
})
})
it(`read the last row from ${filename}`, async () => {
  // this exercises some of the page-skipping optimizations
  const file = await asyncBufferFromFile(`test/files/${filename}`)
  const metadata = await parquetMetadataAsync(file)
  let numRows = Number(metadata.num_rows)
  // repeated_no_annotation has wrong num_rows in metadata:
  if (filename === 'repeated_no_annotation.parquet') numRows = 6
  await parquetRead({
    file,
    // reuse the footer parsed above; parquetRead only loads metadata when none is given
    metadata,
    compressors,
    // request exactly one row: the final one
    rowStart: numRows - 1,
    rowEnd: numRows,
    onComplete(rows) {
      const base = filename.replace('.parquet', '')
      // expected output is the last entry of the fixture's full JSON dump
      const expected = [fileToJson(`test/files/${base}.json`).at(-1)]
      expect(toJson(rows)).toEqual(expected)
    },
  })
})
})
})