Fix plan row boundaries

This commit is contained in:
Kenny Daniel 2025-11-21 00:11:07 -08:00
parent 0d1fd452aa
commit c3a42b5bc9
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
6 changed files with 307 additions and 7 deletions

@ -55,12 +55,12 @@
"test": "vitest run"
},
"devDependencies": {
"@types/node": "24.10.0",
"@vitest/coverage-v8": "4.0.6",
"@types/node": "24.10.1",
"@vitest/coverage-v8": "4.0.12",
"eslint": "9.39.1",
"eslint-plugin-jsdoc": "61.1.12",
"eslint-plugin-jsdoc": "61.4.0",
"hyparquet-compressors": "1.1.1",
"typescript": "5.9.3",
"vitest": "4.0.6"
"vitest": "4.0.12"
}
}

@ -26,7 +26,7 @@ export function parquetPlan({ metadata, rowStart = 0, rowEnd = Infinity, columns
const groupRows = Number(rowGroup.num_rows)
const groupEnd = groupStart + groupRows
// if row group overlaps with row range, add it to the plan
if (groupRows > 0 && groupEnd >= rowStart && groupStart < rowEnd) {
if (groupRows > 0 && groupEnd > rowStart && groupStart < rowEnd) {
/** @type {ByteRange[]} */
const ranges = []
// loop through each column chunk

70
test/files/alpha.json Normal file

@ -0,0 +1,70 @@
[
["aa"], ["ab"], ["ac"], ["ad"], ["ae"], ["af"], ["ag"], ["ah"], ["ai"], ["aj"],
["ak"], ["al"], ["am"], ["an"], ["ao"], ["ap"], ["aq"], ["ar"], ["as"], ["at"],
["au"], ["av"], ["aw"], ["ax"], ["ay"], ["az"], ["ba"], ["bb"], ["bc"], ["bd"],
["be"], ["bf"], ["bg"], ["bh"], ["bi"], ["bj"], ["bk"], ["bl"], ["bm"], ["bn"],
["bo"], ["bp"], ["bq"], ["br"], ["bs"], ["bt"], ["bu"], ["bv"], ["bw"], ["bx"],
["by"], ["bz"], ["ca"], ["cb"], ["cc"], ["cd"], ["ce"], ["cf"], ["cg"], ["ch"],
["ci"], ["cj"], ["ck"], ["cl"], ["cm"], ["cn"], ["co"], ["cp"], ["cq"], ["cr"],
["cs"], ["ct"], ["cu"], ["cv"], ["cw"], ["cx"], ["cy"], ["cz"], ["da"], ["db"],
["dc"], ["dd"], ["de"], ["df"], ["dg"], ["dh"], ["di"], ["dj"], ["dk"], ["dl"],
["dm"], ["dn"], ["do"], ["dp"], ["dq"], ["dr"], ["ds"], ["dt"], ["du"], ["dv"],
["dw"], ["dx"], ["dy"], ["dz"], ["ea"], ["eb"], ["ec"], ["ed"], ["ee"], ["ef"],
["eg"], ["eh"], ["ei"], ["ej"], ["ek"], ["el"], ["em"], ["en"], ["eo"], ["ep"],
["eq"], ["er"], ["es"], ["et"], ["eu"], ["ev"], ["ew"], ["ex"], ["ey"], ["ez"],
["fa"], ["fb"], ["fc"], ["fd"], ["fe"], ["ff"], ["fg"], ["fh"], ["fi"], ["fj"],
["fk"], ["fl"], ["fm"], ["fn"], ["fo"], ["fp"], ["fq"], ["fr"], ["fs"], ["ft"],
["fu"], ["fv"], ["fw"], ["fx"], ["fy"], ["fz"], ["ga"], ["gb"], ["gc"], ["gd"],
["ge"], ["gf"], ["gg"], ["gh"], ["gi"], ["gj"], ["gk"], ["gl"], ["gm"], ["gn"],
["go"], ["gp"], ["gq"], ["gr"], ["gs"], ["gt"], ["gu"], ["gv"], ["gw"], ["gx"],
["gy"], ["gz"], ["ha"], ["hb"], ["hc"], ["hd"], ["he"], ["hf"], ["hg"], ["hh"],
["hi"], ["hj"], ["hk"], ["hl"], ["hm"], ["hn"], ["ho"], ["hp"], ["hq"], ["hr"],
["hs"], ["ht"], ["hu"], ["hv"], ["hw"], ["hx"], ["hy"], ["hz"], ["ia"], ["ib"],
["ic"], ["id"], ["ie"], ["if"], ["ig"], ["ih"], ["ii"], ["ij"], ["ik"], ["il"],
["im"], ["in"], ["io"], ["ip"], ["iq"], ["ir"], ["is"], ["it"], ["iu"], ["iv"],
["iw"], ["ix"], ["iy"], ["iz"], ["ja"], ["jb"], ["jc"], ["jd"], ["je"], ["jf"],
["jg"], ["jh"], ["ji"], ["jj"], ["jk"], ["jl"], ["jm"], ["jn"], ["jo"], ["jp"],
["jq"], ["jr"], ["js"], ["jt"], ["ju"], ["jv"], ["jw"], ["jx"], ["jy"], ["jz"],
["ka"], ["kb"], ["kc"], ["kd"], ["ke"], ["kf"], ["kg"], ["kh"], ["ki"], ["kj"],
["kk"], ["kl"], ["km"], ["kn"], ["ko"], ["kp"], ["kq"], ["kr"], ["ks"], ["kt"],
["ku"], ["kv"], ["kw"], ["kx"], ["ky"], ["kz"], ["la"], ["lb"], ["lc"], ["ld"],
["le"], ["lf"], ["lg"], ["lh"], ["li"], ["lj"], ["lk"], ["ll"], ["lm"], ["ln"],
["lo"], ["lp"], ["lq"], ["lr"], ["ls"], ["lt"], ["lu"], ["lv"], ["lw"], ["lx"],
["ly"], ["lz"], ["ma"], ["mb"], ["mc"], ["md"], ["me"], ["mf"], ["mg"], ["mh"],
["mi"], ["mj"], ["mk"], ["ml"], ["mm"], ["mn"], ["mo"], ["mp"], ["mq"], ["mr"],
["ms"], ["mt"], ["mu"], ["mv"], ["mw"], ["mx"], ["my"], ["mz"], ["na"], ["nb"],
["nc"], ["nd"], ["ne"], ["nf"], ["ng"], ["nh"], ["ni"], ["nj"], ["nk"], ["nl"],
["nm"], ["nn"], ["no"], ["np"], ["nq"], ["nr"], ["ns"], ["nt"], ["nu"], ["nv"],
["nw"], ["nx"], ["ny"], ["nz"], ["oa"], ["ob"], ["oc"], ["od"], ["oe"], ["of"],
["og"], ["oh"], ["oi"], ["oj"], ["ok"], ["ol"], ["om"], ["on"], ["oo"], ["op"],
["oq"], ["or"], ["os"], ["ot"], ["ou"], ["ov"], ["ow"], ["ox"], ["oy"], ["oz"],
["pa"], ["pb"], ["pc"], ["pd"], ["pe"], ["pf"], ["pg"], ["ph"], ["pi"], ["pj"],
["pk"], ["pl"], ["pm"], ["pn"], ["po"], ["pp"], ["pq"], ["pr"], ["ps"], ["pt"],
["pu"], ["pv"], ["pw"], ["px"], ["py"], ["pz"], ["qa"], ["qb"], ["qc"], ["qd"],
["qe"], ["qf"], ["qg"], ["qh"], ["qi"], ["qj"], ["qk"], ["ql"], ["qm"], ["qn"],
["qo"], ["qp"], ["qq"], ["qr"], ["qs"], ["qt"], ["qu"], ["qv"], ["qw"], ["qx"],
["qy"], ["qz"], ["ra"], ["rb"], ["rc"], ["rd"], ["re"], ["rf"], ["rg"], ["rh"],
["ri"], ["rj"], ["rk"], ["rl"], ["rm"], ["rn"], ["ro"], ["rp"], ["rq"], ["rr"],
["rs"], ["rt"], ["ru"], ["rv"], ["rw"], ["rx"], ["ry"], ["rz"], ["sa"], ["sb"],
["sc"], ["sd"], ["se"], ["sf"], ["sg"], ["sh"], ["si"], ["sj"], ["sk"], ["sl"],
["sm"], ["sn"], ["so"], ["sp"], ["sq"], ["sr"], ["ss"], ["st"], ["su"], ["sv"],
["sw"], ["sx"], ["sy"], ["sz"], ["ta"], ["tb"], ["tc"], ["td"], ["te"], ["tf"],
["tg"], ["th"], ["ti"], ["tj"], ["tk"], ["tl"], ["tm"], ["tn"], ["to"], ["tp"],
["tq"], ["tr"], ["ts"], ["tt"], ["tu"], ["tv"], ["tw"], ["tx"], ["ty"], ["tz"],
["ua"], ["ub"], ["uc"], ["ud"], ["ue"], ["uf"], ["ug"], ["uh"], ["ui"], ["uj"],
["uk"], ["ul"], ["um"], ["un"], ["uo"], ["up"], ["uq"], ["ur"], ["us"], ["ut"],
["uu"], ["uv"], ["uw"], ["ux"], ["uy"], ["uz"], ["va"], ["vb"], ["vc"], ["vd"],
["ve"], ["vf"], ["vg"], ["vh"], ["vi"], ["vj"], ["vk"], ["vl"], ["vm"], ["vn"],
["vo"], ["vp"], ["vq"], ["vr"], ["vs"], ["vt"], ["vu"], ["vv"], ["vw"], ["vx"],
["vy"], ["vz"], ["wa"], ["wb"], ["wc"], ["wd"], ["we"], ["wf"], ["wg"], ["wh"],
["wi"], ["wj"], ["wk"], ["wl"], ["wm"], ["wn"], ["wo"], ["wp"], ["wq"], ["wr"],
["ws"], ["wt"], ["wu"], ["wv"], ["ww"], ["wx"], ["wy"], ["wz"], ["xa"], ["xb"],
["xc"], ["xd"], ["xe"], ["xf"], ["xg"], ["xh"], ["xi"], ["xj"], ["xk"], ["xl"],
["xm"], ["xn"], ["xo"], ["xp"], ["xq"], ["xr"], ["xs"], ["xt"], ["xu"], ["xv"],
["xw"], ["xx"], ["xy"], ["xz"], ["ya"], ["yb"], ["yc"], ["yd"], ["ye"], ["yf"],
["yg"], ["yh"], ["yi"], ["yj"], ["yk"], ["yl"], ["ym"], ["yn"], ["yo"], ["yp"],
["yq"], ["yr"], ["ys"], ["yt"], ["yu"], ["yv"], ["yw"], ["yx"], ["yy"], ["yz"],
["za"], ["zb"], ["zc"], ["zd"], ["ze"], ["zf"], ["zg"], ["zh"], ["zi"], ["zj"],
["zk"], ["zl"], ["zm"], ["zn"], ["zo"], ["zp"], ["zq"], ["zr"], ["zs"], ["zt"],
["zu"], ["zv"], ["zw"], ["zx"], ["zy"], ["zz"]
]

@ -0,0 +1,216 @@
{
"version": 2,
"schema": [
{
"name": "root",
"num_children": 1
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "id",
"converted_type": "UTF8"
}
],
"num_rows": 676,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN"
],
"path_in_schema": [
"id"
],
"codec": "SNAPPY",
"num_values": 100,
"total_uncompressed_size": 440,
"total_compressed_size": 440,
"data_page_offset": 4,
"statistics": {
"null_count": 0,
"max_value": "dv",
"min_value": "aa"
}
}
}
],
"total_byte_size": 440,
"num_rows": 100
},
{
"columns": [
{
"file_offset": 444,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN"
],
"path_in_schema": [
"id"
],
"codec": "SNAPPY",
"num_values": 100,
"total_uncompressed_size": 441,
"total_compressed_size": 441,
"data_page_offset": 444,
"statistics": {
"null_count": 0,
"max_value": "hr",
"min_value": "dw"
}
}
}
],
"total_byte_size": 441,
"num_rows": 100
},
{
"columns": [
{
"file_offset": 885,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN"
],
"path_in_schema": [
"id"
],
"codec": "SNAPPY",
"num_values": 100,
"total_uncompressed_size": 437,
"total_compressed_size": 437,
"data_page_offset": 885,
"statistics": {
"null_count": 0,
"max_value": "ln",
"min_value": "hs"
}
}
}
],
"total_byte_size": 437,
"num_rows": 100
},
{
"columns": [
{
"file_offset": 1322,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN"
],
"path_in_schema": [
"id"
],
"codec": "SNAPPY",
"num_values": 100,
"total_uncompressed_size": 438,
"total_compressed_size": 438,
"data_page_offset": 1322,
"statistics": {
"null_count": 0,
"max_value": "pj",
"min_value": "lo"
}
}
}
],
"total_byte_size": 438,
"num_rows": 100
},
{
"columns": [
{
"file_offset": 1760,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN"
],
"path_in_schema": [
"id"
],
"codec": "SNAPPY",
"num_values": 100,
"total_uncompressed_size": 440,
"total_compressed_size": 440,
"data_page_offset": 1760,
"statistics": {
"null_count": 0,
"max_value": "tf",
"min_value": "pk"
}
}
}
],
"total_byte_size": 440,
"num_rows": 100
},
{
"columns": [
{
"file_offset": 2200,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN"
],
"path_in_schema": [
"id"
],
"codec": "SNAPPY",
"num_values": 100,
"total_uncompressed_size": 435,
"total_compressed_size": 435,
"data_page_offset": 2200,
"statistics": {
"null_count": 0,
"max_value": "xb",
"min_value": "tg"
}
}
}
],
"total_byte_size": 435,
"num_rows": 100
},
{
"columns": [
{
"file_offset": 2635,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN"
],
"path_in_schema": [
"id"
],
"codec": "SNAPPY",
"num_values": 76,
"total_uncompressed_size": 335,
"total_compressed_size": 335,
"data_page_offset": 2635,
"statistics": {
"null_count": 0,
"max_value": "zz",
"min_value": "xc"
}
}
}
],
"total_byte_size": 335,
"num_rows": 76
}
],
"created_by": "hyparquet",
"metadata_length": 396
}

BIN
test/files/alpha.parquet Normal file

Binary file not shown.

@ -20,7 +20,7 @@ describe('parquetRead', () => {
.rejects.toThrow('parquet expected AsyncBuffer')
})
it('filter by row', async () => {
it('read row range', async () => {
const file = await asyncBufferFromFile('test/files/rowgroups.parquet')
await parquetRead({
file,
@ -32,7 +32,7 @@ describe('parquetRead', () => {
})
})
it('filter by row overestimate', async () => {
it('row range overestimate', async () => {
const file = await asyncBufferFromFile('test/files/rowgroups.parquet')
await parquetRead({
file,
@ -183,6 +183,20 @@ describe('parquetRead', () => {
expect(convertWithDictionary).toHaveBeenCalledTimes(4)
})
// Regression test for row-range boundaries: a read whose rowStart falls exactly
// on a row group boundary should fetch only the row group containing the
// requested rows, not the preceding group that ends at rowStart.
it('reads only required row groups on the boundary', async () => {
const originalFile = await asyncBufferFromFile('test/files/alpha.parquet')
// Metadata is read from the unwrapped buffer, before countingBuffer wraps it,
// so (presumably) the metadata reads are not included in the counts below.
const metadata = await parquetMetadataAsync(originalFile)
const file = countingBuffer(originalFile)
await parquetReadObjects({
file,
metadata,
// NOTE(review): assumes alpha.parquet uses 100-row row groups, so that
// rows 100..199 are exactly the 2nd row group — confirm against the fixture metadata
rowStart: 100,
rowEnd: 200,
})
// exactly one fetch is expected: the single row group covering the range
expect(file.fetches).toBe(1) // 1 rowgroup
// expected byte count for that one fetch
expect(file.bytes).toBe(441) // bytes for 2nd rowgroup
})
it('reads individual pages', async () => {
const file = countingBuffer(await asyncBufferFromFile('test/files/page_indexed.parquet'))
/** @type {import('../src/types.js').ColumnData[]} */