mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-03 18:26:38 +00:00
Test for reading the last row of files
This commit is contained in:
parent
8c4c7456b4
commit
ba74d58dd3
@ -45,10 +45,10 @@
|
||||
"devDependencies": {
|
||||
"@types/node": "22.14.0",
|
||||
"@vitest/coverage-v8": "3.1.1",
|
||||
"eslint": "9.23.0",
|
||||
"eslint": "9.24.0",
|
||||
"eslint-plugin-jsdoc": "50.6.9",
|
||||
"hyparquet-compressors": "1.1.1",
|
||||
"typescript": "5.8.2",
|
||||
"typescript": "5.8.3",
|
||||
"typescript-eslint": "8.29.0",
|
||||
"vitest": "3.1.1"
|
||||
}
|
||||
|
||||
@ -22,13 +22,14 @@ export async function parquetRead(options) {
|
||||
if (!options.file || !(options.file.byteLength >= 0)) {
|
||||
throw new Error('parquetRead expected file AsyncBuffer')
|
||||
}
|
||||
const rowStart = options.rowStart || 0
|
||||
if (rowStart < 0) throw new Error('parquetRead rowStart must be postive')
|
||||
|
||||
// load metadata if not provided
|
||||
options.metadata ||= await parquetMetadataAsync(options.file)
|
||||
if (!options.metadata) throw new Error('parquet metadata not found')
|
||||
|
||||
const { metadata, onComplete, rowEnd } = options
|
||||
const rowStart = options.rowStart || 0
|
||||
/** @type {any[][]} */
|
||||
const rowData = []
|
||||
|
||||
|
||||
@ -2,8 +2,6 @@ import { defaultInitialFetchSize } from './metadata.js'
|
||||
|
||||
/**
|
||||
* Replace bigint, date, etc with legal JSON types.
|
||||
* When parsing parquet files, bigints are used to represent 64-bit integers.
|
||||
* However, JSON does not support bigints, so it's helpful to convert to numbers.
|
||||
*
|
||||
* @param {any} obj object to convert
|
||||
* @returns {unknown} converted object
|
||||
|
||||
108
test/files/page_indexed.column_indexes.json
Normal file
108
test/files/page_indexed.column_indexes.json
Normal file
@ -0,0 +1,108 @@
|
||||
[
|
||||
[
|
||||
{
|
||||
"boundary_order": "ASCENDING",
|
||||
"max_values": [
|
||||
"good",
|
||||
"good",
|
||||
"good",
|
||||
"good",
|
||||
"good",
|
||||
"good",
|
||||
"good",
|
||||
"good",
|
||||
"good",
|
||||
"good"
|
||||
],
|
||||
"min_values": [
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad"
|
||||
],
|
||||
"null_counts": [
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
],
|
||||
"null_pages": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
]
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"boundary_order": "UNORDERED",
|
||||
"max_values": [
|
||||
"good",
|
||||
"bad",
|
||||
"good",
|
||||
"bad",
|
||||
"good",
|
||||
"bad",
|
||||
"good",
|
||||
"good",
|
||||
"bad",
|
||||
"good"
|
||||
],
|
||||
"min_values": [
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad",
|
||||
"bad"
|
||||
],
|
||||
"null_counts": [
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
],
|
||||
"null_pages": [
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false
|
||||
]
|
||||
}
|
||||
]
|
||||
]
|
||||
202
test/files/page_indexed.json
Normal file
202
test/files/page_indexed.json
Normal file
@ -0,0 +1,202 @@
|
||||
[
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["good"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"],
|
||||
["bad"]
|
||||
]
|
||||
130
test/files/page_indexed.metadata.json
Normal file
130
test/files/page_indexed.metadata.json
Normal file
@ -0,0 +1,130 @@
|
||||
{
|
||||
"version": 2,
|
||||
"schema": [
|
||||
{
|
||||
"repetition_type": "REQUIRED",
|
||||
"name": "schema",
|
||||
"num_children": 1
|
||||
},
|
||||
{
|
||||
"type": "BYTE_ARRAY",
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "col",
|
||||
"converted_type": "UTF8",
|
||||
"logical_type": {
|
||||
"type": "STRING"
|
||||
}
|
||||
}
|
||||
],
|
||||
"num_rows": 200,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 338,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN",
|
||||
"RLE",
|
||||
"RLE_DICTIONARY"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"col"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 100,
|
||||
"total_uncompressed_size": 312,
|
||||
"total_compressed_size": 334,
|
||||
"data_page_offset": 35,
|
||||
"dictionary_page_offset": 4,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "good",
|
||||
"min_value": "bad"
|
||||
},
|
||||
"encoding_stats": [
|
||||
{
|
||||
"page_type": "DICTIONARY_PAGE",
|
||||
"encoding": "PLAIN",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"page_type": "DATA_PAGE",
|
||||
"encoding": "RLE_DICTIONARY",
|
||||
"count": 10
|
||||
}
|
||||
]
|
||||
},
|
||||
"offset_index_offset": 1036,
|
||||
"offset_index_length": 85,
|
||||
"column_index_offset": 798,
|
||||
"column_index_length": 121
|
||||
}
|
||||
],
|
||||
"total_byte_size": 312,
|
||||
"num_rows": 100,
|
||||
"file_offset": 4,
|
||||
"total_compressed_size": 334,
|
||||
"ordinal": 0
|
||||
},
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 731,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN",
|
||||
"RLE",
|
||||
"RLE_DICTIONARY"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"col"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 100,
|
||||
"total_uncompressed_size": 306,
|
||||
"total_compressed_size": 328,
|
||||
"data_page_offset": 434,
|
||||
"dictionary_page_offset": 403,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "good",
|
||||
"min_value": "bad"
|
||||
},
|
||||
"encoding_stats": [
|
||||
{
|
||||
"page_type": "DICTIONARY_PAGE",
|
||||
"encoding": "PLAIN",
|
||||
"count": 1
|
||||
},
|
||||
{
|
||||
"page_type": "DATA_PAGE",
|
||||
"encoding": "RLE_DICTIONARY",
|
||||
"count": 10
|
||||
}
|
||||
]
|
||||
},
|
||||
"offset_index_offset": 1121,
|
||||
"offset_index_length": 86,
|
||||
"column_index_offset": 919,
|
||||
"column_index_length": 117
|
||||
}
|
||||
],
|
||||
"total_byte_size": 306,
|
||||
"num_rows": 100,
|
||||
"file_offset": 403,
|
||||
"total_compressed_size": 328,
|
||||
"ordinal": 1
|
||||
}
|
||||
],
|
||||
"key_value_metadata": [
|
||||
{
|
||||
"key": "ARROW:schema",
|
||||
"value": "/////3AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEFEAAAABgAAAAEAAAAAAAAAAMAAABjb2wABAAEAAQAAAAAAAAA"
|
||||
}
|
||||
],
|
||||
"created_by": "parquet-cpp-arrow version 15.0.0",
|
||||
"metadata_length": 447
|
||||
}
|
||||
114
test/files/page_indexed.offset_indexes.json
Normal file
114
test/files/page_indexed.offset_indexes.json
Normal file
@ -0,0 +1,114 @@
|
||||
[
|
||||
[
|
||||
{
|
||||
"page_locations": [
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 0,
|
||||
"offset": 35
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 10,
|
||||
"offset": 65
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 31,
|
||||
"first_row_index": 20,
|
||||
"offset": 95
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 30,
|
||||
"offset": 126
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 40,
|
||||
"offset": 156
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 50,
|
||||
"offset": 186
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 31,
|
||||
"first_row_index": 60,
|
||||
"offset": 216
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 70,
|
||||
"offset": 247
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 80,
|
||||
"offset": 277
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 31,
|
||||
"first_row_index": 90,
|
||||
"offset": 307
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
[
|
||||
{
|
||||
"page_locations": [
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 0,
|
||||
"offset": 434
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 29,
|
||||
"first_row_index": 10,
|
||||
"offset": 464
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 31,
|
||||
"first_row_index": 20,
|
||||
"offset": 493
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 29,
|
||||
"first_row_index": 30,
|
||||
"offset": 524
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 40,
|
||||
"offset": 553
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 29,
|
||||
"first_row_index": 50,
|
||||
"offset": 583
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 60,
|
||||
"offset": 612
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 70,
|
||||
"offset": 642
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 29,
|
||||
"first_row_index": 80,
|
||||
"offset": 672
|
||||
},
|
||||
{
|
||||
"compressed_page_size": 30,
|
||||
"first_row_index": 90,
|
||||
"offset": 701
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
]
|
||||
BIN
test/files/page_indexed.parquet
Normal file
BIN
test/files/page_indexed.parquet
Normal file
Binary file not shown.
@ -1,7 +1,10 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetRead, parquetReadObjects } from '../src/hyparquet.js'
|
||||
import { describe, expect, it, vi } from 'vitest'
|
||||
import { convertWithDictionary } from '../src/convert.js'
|
||||
import { parquetMetadataAsync, parquetRead, parquetReadObjects } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile } from '../src/utils.js'
|
||||
|
||||
vi.mock('../src/convert.js', { spy: true })
|
||||
|
||||
describe('parquetRead', () => {
|
||||
it('throws error for undefined file', async () => {
|
||||
// @ts-expect-error testing invalid input
|
||||
@ -164,4 +167,18 @@ describe('parquetRead', () => {
|
||||
{ a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
|
||||
])
|
||||
})
|
||||
|
||||
it('skips converting unnecessary pages', async () => {
|
||||
const file = await asyncBufferFromFile('test/files/page_indexed.parquet')
|
||||
const metadata = await parquetMetadataAsync(file)
|
||||
vi.mocked(convertWithDictionary).mockClear()
|
||||
const rows = await parquetReadObjects({
|
||||
file,
|
||||
metadata,
|
||||
rowStart: 90,
|
||||
rowEnd: 91,
|
||||
})
|
||||
expect(rows).toEqual([{ col: 'bad' }])
|
||||
expect(convertWithDictionary).toHaveBeenCalledTimes(10)
|
||||
})
|
||||
})
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import fs from 'fs'
|
||||
import { compressors } from 'hyparquet-compressors'
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetRead } from '../src/hyparquet.js'
|
||||
import { parquetMetadataAsync, parquetRead } from '../src/hyparquet.js'
|
||||
import { asyncBufferFromFile, toJson } from '../src/utils.js'
|
||||
import { fileToJson } from './helpers.js'
|
||||
|
||||
@ -17,10 +17,30 @@ describe('parquetRead test files', () => {
|
||||
onComplete(rows) {
|
||||
const base = filename.replace('.parquet', '')
|
||||
const expected = fileToJson(`test/files/${base}.json`)
|
||||
// stringify and parse to make legal json
|
||||
// stringify and parse to make legal json (NaN, -0, etc)
|
||||
expect(JSON.parse(JSON.stringify(toJson(rows)))).toEqual(expected)
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
it(`read the last row from ${filename}`, async () => {
|
||||
// this exercises some of the page-skipping optimizations
|
||||
const file = await asyncBufferFromFile(`test/files/${filename}`)
|
||||
const metadata = await parquetMetadataAsync(file)
|
||||
let numRows = Number(metadata.num_rows)
|
||||
// repeated_no_annotation has wrong num_rows in metadata:
|
||||
if (filename === 'repeated_no_annotation.parquet') numRows = 6
|
||||
await parquetRead({
|
||||
file,
|
||||
compressors,
|
||||
rowStart: numRows - 1,
|
||||
rowEnd: numRows,
|
||||
onComplete(rows) {
|
||||
const base = filename.replace('.parquet', '')
|
||||
const expected = [fileToJson(`test/files/${base}.json`).at(-1)]
|
||||
expect(toJson(rows)).toEqual(expected)
|
||||
},
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user