diff --git a/package.json b/package.json index 1fcb425..f4617da 100644 --- a/package.json +++ b/package.json @@ -55,12 +55,12 @@ "test": "vitest run" }, "devDependencies": { - "@types/node": "24.10.0", - "@vitest/coverage-v8": "4.0.6", + "@types/node": "24.10.1", + "@vitest/coverage-v8": "4.0.12", "eslint": "9.39.1", - "eslint-plugin-jsdoc": "61.1.12", + "eslint-plugin-jsdoc": "61.4.0", "hyparquet-compressors": "1.1.1", "typescript": "5.9.3", - "vitest": "4.0.6" + "vitest": "4.0.12" } } diff --git a/src/plan.js b/src/plan.js index c8d9d1b..5304559 100644 --- a/src/plan.js +++ b/src/plan.js @@ -26,7 +26,7 @@ export function parquetPlan({ metadata, rowStart = 0, rowEnd = Infinity, columns const groupRows = Number(rowGroup.num_rows) const groupEnd = groupStart + groupRows // if row group overlaps with row range, add it to the plan - if (groupRows > 0 && groupEnd >= rowStart && groupStart < rowEnd) { + if (groupRows > 0 && groupEnd > rowStart && groupStart < rowEnd) { /** @type {ByteRange[]} */ const ranges = [] // loop through each column chunk diff --git a/test/files/alpha.json b/test/files/alpha.json new file mode 100644 index 0000000..d6b5daa --- /dev/null +++ b/test/files/alpha.json @@ -0,0 +1,70 @@ +[ + ["aa"], ["ab"], ["ac"], ["ad"], ["ae"], ["af"], ["ag"], ["ah"], ["ai"], ["aj"], + ["ak"], ["al"], ["am"], ["an"], ["ao"], ["ap"], ["aq"], ["ar"], ["as"], ["at"], + ["au"], ["av"], ["aw"], ["ax"], ["ay"], ["az"], ["ba"], ["bb"], ["bc"], ["bd"], + ["be"], ["bf"], ["bg"], ["bh"], ["bi"], ["bj"], ["bk"], ["bl"], ["bm"], ["bn"], + ["bo"], ["bp"], ["bq"], ["br"], ["bs"], ["bt"], ["bu"], ["bv"], ["bw"], ["bx"], + ["by"], ["bz"], ["ca"], ["cb"], ["cc"], ["cd"], ["ce"], ["cf"], ["cg"], ["ch"], + ["ci"], ["cj"], ["ck"], ["cl"], ["cm"], ["cn"], ["co"], ["cp"], ["cq"], ["cr"], + ["cs"], ["ct"], ["cu"], ["cv"], ["cw"], ["cx"], ["cy"], ["cz"], ["da"], ["db"], + ["dc"], ["dd"], ["de"], ["df"], ["dg"], ["dh"], ["di"], ["dj"], ["dk"], ["dl"], + ["dm"], ["dn"], ["do"], ["dp"], ["dq"], ["dr"], ["ds"], ["dt"], ["du"], ["dv"], + ["dw"], ["dx"], ["dy"], ["dz"], ["ea"], ["eb"], ["ec"], ["ed"], ["ee"], ["ef"], + ["eg"], ["eh"], ["ei"], ["ej"], ["ek"], ["el"], ["em"], ["en"], ["eo"], ["ep"], + ["eq"], ["er"], ["es"], ["et"], ["eu"], ["ev"], ["ew"], ["ex"], ["ey"], ["ez"], + ["fa"], ["fb"], ["fc"], ["fd"], ["fe"], ["ff"], ["fg"], ["fh"], ["fi"], ["fj"], + ["fk"], ["fl"], ["fm"], ["fn"], ["fo"], ["fp"], ["fq"], ["fr"], ["fs"], ["ft"], + ["fu"], ["fv"], ["fw"], ["fx"], ["fy"], ["fz"], ["ga"], ["gb"], ["gc"], ["gd"], + ["ge"], ["gf"], ["gg"], ["gh"], ["gi"], ["gj"], ["gk"], ["gl"], ["gm"], ["gn"], + ["go"], ["gp"], ["gq"], ["gr"], ["gs"], ["gt"], ["gu"], ["gv"], ["gw"], ["gx"], + ["gy"], ["gz"], ["ha"], ["hb"], ["hc"], ["hd"], ["he"], ["hf"], ["hg"], ["hh"], + ["hi"], ["hj"], ["hk"], ["hl"], ["hm"], ["hn"], ["ho"], ["hp"], ["hq"], ["hr"], + ["hs"], ["ht"], ["hu"], ["hv"], ["hw"], ["hx"], ["hy"], ["hz"], ["ia"], ["ib"], + ["ic"], ["id"], ["ie"], ["if"], ["ig"], ["ih"], ["ii"], ["ij"], ["ik"], ["il"], + ["im"], ["in"], ["io"], ["ip"], ["iq"], ["ir"], ["is"], ["it"], ["iu"], ["iv"], + ["iw"], ["ix"], ["iy"], ["iz"], ["ja"], ["jb"], ["jc"], ["jd"], ["je"], ["jf"], + ["jg"], ["jh"], ["ji"], ["jj"], ["jk"], ["jl"], ["jm"], ["jn"], ["jo"], ["jp"], + ["jq"], ["jr"], ["js"], ["jt"], ["ju"], ["jv"], ["jw"], ["jx"], ["jy"], ["jz"], + ["ka"], ["kb"], ["kc"], ["kd"], ["ke"], ["kf"], ["kg"], ["kh"], ["ki"], ["kj"], + ["kk"], ["kl"], ["km"], ["kn"], ["ko"], ["kp"], ["kq"], ["kr"], ["ks"], ["kt"], + ["ku"], ["kv"], ["kw"], ["kx"], ["ky"], ["kz"], ["la"], ["lb"], ["lc"], ["ld"], + ["le"], ["lf"], ["lg"], ["lh"], ["li"], ["lj"], ["lk"], ["ll"], ["lm"], ["ln"], + ["lo"], ["lp"], ["lq"], ["lr"], ["ls"], ["lt"], ["lu"], ["lv"], ["lw"], ["lx"], + ["ly"], ["lz"], ["ma"], ["mb"], ["mc"], ["md"], ["me"], ["mf"], ["mg"], ["mh"], + ["mi"], ["mj"], ["mk"], ["ml"], ["mm"], ["mn"], ["mo"], ["mp"], ["mq"], ["mr"], + ["ms"], ["mt"], ["mu"], ["mv"], ["mw"], ["mx"], ["my"], ["mz"], ["na"], ["nb"], + ["nc"], ["nd"], ["ne"], ["nf"], ["ng"], ["nh"], ["ni"], ["nj"], ["nk"], ["nl"], + ["nm"], ["nn"], ["no"], ["np"], ["nq"], ["nr"], ["ns"], ["nt"], ["nu"], ["nv"], + ["nw"], ["nx"], ["ny"], ["nz"], ["oa"], ["ob"], ["oc"], ["od"], ["oe"], ["of"], + ["og"], ["oh"], ["oi"], ["oj"], ["ok"], ["ol"], ["om"], ["on"], ["oo"], ["op"], + ["oq"], ["or"], ["os"], ["ot"], ["ou"], ["ov"], ["ow"], ["ox"], ["oy"], ["oz"], + ["pa"], ["pb"], ["pc"], ["pd"], ["pe"], ["pf"], ["pg"], ["ph"], ["pi"], ["pj"], + ["pk"], ["pl"], ["pm"], ["pn"], ["po"], ["pp"], ["pq"], ["pr"], ["ps"], ["pt"], + ["pu"], ["pv"], ["pw"], ["px"], ["py"], ["pz"], ["qa"], ["qb"], ["qc"], ["qd"], + ["qe"], ["qf"], ["qg"], ["qh"], ["qi"], ["qj"], ["qk"], ["ql"], ["qm"], ["qn"], + ["qo"], ["qp"], ["qq"], ["qr"], ["qs"], ["qt"], ["qu"], ["qv"], ["qw"], ["qx"], + ["qy"], ["qz"], ["ra"], ["rb"], ["rc"], ["rd"], ["re"], ["rf"], ["rg"], ["rh"], + ["ri"], ["rj"], ["rk"], ["rl"], ["rm"], ["rn"], ["ro"], ["rp"], ["rq"], ["rr"], + ["rs"], ["rt"], ["ru"], ["rv"], ["rw"], ["rx"], ["ry"], ["rz"], ["sa"], ["sb"], + ["sc"], ["sd"], ["se"], ["sf"], ["sg"], ["sh"], ["si"], ["sj"], ["sk"], ["sl"], + ["sm"], ["sn"], ["so"], ["sp"], ["sq"], ["sr"], ["ss"], ["st"], ["su"], ["sv"], + ["sw"], ["sx"], ["sy"], ["sz"], ["ta"], ["tb"], ["tc"], ["td"], ["te"], ["tf"], + ["tg"], ["th"], ["ti"], ["tj"], ["tk"], ["tl"], ["tm"], ["tn"], ["to"], ["tp"], + ["tq"], ["tr"], ["ts"], ["tt"], ["tu"], ["tv"], ["tw"], ["tx"], ["ty"], ["tz"], + ["ua"], ["ub"], ["uc"], ["ud"], ["ue"], ["uf"], ["ug"], ["uh"], ["ui"], ["uj"], + ["uk"], ["ul"], ["um"], ["un"], ["uo"], ["up"], ["uq"], ["ur"], ["us"], ["ut"], + ["uu"], ["uv"], ["uw"], ["ux"], ["uy"], ["uz"], ["va"], ["vb"], ["vc"], ["vd"], + ["ve"], ["vf"], ["vg"], ["vh"], ["vi"], ["vj"], ["vk"], ["vl"], ["vm"], ["vn"], + ["vo"], ["vp"], ["vq"], ["vr"], ["vs"], ["vt"], ["vu"], ["vv"], ["vw"], ["vx"], + ["vy"], ["vz"], ["wa"], ["wb"], ["wc"], ["wd"], ["we"], ["wf"], ["wg"], ["wh"], + ["wi"], ["wj"], ["wk"], ["wl"], ["wm"], ["wn"], ["wo"], ["wp"], ["wq"], ["wr"], + ["ws"], ["wt"], ["wu"], ["wv"], ["ww"], ["wx"], ["wy"], ["wz"], ["xa"], ["xb"], + ["xc"], ["xd"], ["xe"], ["xf"], ["xg"], ["xh"], ["xi"], ["xj"], ["xk"], ["xl"], + ["xm"], ["xn"], ["xo"], ["xp"], ["xq"], ["xr"], ["xs"], ["xt"], ["xu"], ["xv"], + ["xw"], ["xx"], ["xy"], ["xz"], ["ya"], ["yb"], ["yc"], ["yd"], ["ye"], ["yf"], + ["yg"], ["yh"], ["yi"], ["yj"], ["yk"], ["yl"], ["ym"], ["yn"], ["yo"], ["yp"], + ["yq"], ["yr"], ["ys"], ["yt"], ["yu"], ["yv"], ["yw"], ["yx"], ["yy"], ["yz"], + ["za"], ["zb"], ["zc"], ["zd"], ["ze"], ["zf"], ["zg"], ["zh"], ["zi"], ["zj"], + ["zk"], ["zl"], ["zm"], ["zn"], ["zo"], ["zp"], ["zq"], ["zr"], ["zs"], ["zt"], + ["zu"], ["zv"], ["zw"], ["zx"], ["zy"], ["zz"] +] diff --git a/test/files/alpha.metadata.json b/test/files/alpha.metadata.json new file mode 100644 index 0000000..bd4e64a --- /dev/null +++ b/test/files/alpha.metadata.json @@ -0,0 +1,216 @@ +{ + "version": 2, + "schema": [ + { + "name": "root", + "num_children": 1 + }, + { + "type": "BYTE_ARRAY", + "repetition_type": "OPTIONAL", + "name": "id", + "converted_type": "UTF8" + } + ], + "num_rows": 676, + "row_groups": [ + { + "columns": [ + { + "file_offset": 4, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN" + ], + "path_in_schema": [ + "id" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 440, + "total_compressed_size": 440, + "data_page_offset": 4, + "statistics": { + "null_count": 0, + "max_value": "dv", + "min_value": "aa" + } + } + } + ], + "total_byte_size": 440, + "num_rows": 100 + }, + { + "columns": [ + { + "file_offset": 444, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN" + ], + "path_in_schema": [ + "id" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 441, + "total_compressed_size": 441, + "data_page_offset": 444, + "statistics": { + "null_count": 0, + "max_value": "hr", + "min_value": "dw" + } + } + } + ], + "total_byte_size": 441, + "num_rows": 100 + }, + { + "columns": [ + { + "file_offset": 885, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN" + ], + "path_in_schema": [ + "id" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 437, + "total_compressed_size": 437, + "data_page_offset": 885, + "statistics": { + "null_count": 0, + "max_value": "ln", + "min_value": "hs" + } + } + } + ], + "total_byte_size": 437, + "num_rows": 100 + }, + { + "columns": [ + { + "file_offset": 1322, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN" + ], + "path_in_schema": [ + "id" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 438, + "total_compressed_size": 438, + "data_page_offset": 1322, + "statistics": { + "null_count": 0, + "max_value": "pj", + "min_value": "lo" + } + } + } + ], + "total_byte_size": 438, + "num_rows": 100 + }, + { + "columns": [ + { + "file_offset": 1760, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN" + ], + "path_in_schema": [ + "id" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 440, + "total_compressed_size": 440, + "data_page_offset": 1760, + "statistics": { + "null_count": 0, + "max_value": "tf", + "min_value": "pk" + } + } + } + ], + "total_byte_size": 440, + "num_rows": 100 + }, + { + "columns": [ + { + "file_offset": 2200, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN" + ], + "path_in_schema": [ + "id" + ], + "codec": "SNAPPY", + "num_values": 100, + "total_uncompressed_size": 435, + "total_compressed_size": 435, + "data_page_offset": 2200, + "statistics": { + "null_count": 0, + "max_value": "xb", + "min_value": "tg" + } + } + } + ], + "total_byte_size": 435, + "num_rows": 100 + }, + { + "columns": [ + { + "file_offset": 2635, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "PLAIN" + ], + "path_in_schema": [ + "id" + ], + "codec": "SNAPPY", + "num_values": 76, + "total_uncompressed_size": 335, + "total_compressed_size": 335, + "data_page_offset": 2635, + "statistics": { + "null_count": 0, + "max_value": "zz", + "min_value": "xc" + } + } + } + ], + "total_byte_size": 335, + "num_rows": 76 + } + ], + "created_by": "hyparquet", + "metadata_length": 396 +} diff --git a/test/files/alpha.parquet b/test/files/alpha.parquet new file mode 100644 index 0000000..b06e10c Binary files /dev/null and b/test/files/alpha.parquet differ diff --git a/test/read.test.js b/test/read.test.js index 85e0892..5e691cc 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -20,7 +20,7 @@ describe('parquetRead', () => { .rejects.toThrow('parquet expected AsyncBuffer') }) - it('filter by row', async () => { + it('read row range', async () => { const file = await asyncBufferFromFile('test/files/rowgroups.parquet') await parquetRead({ file, @@ -32,7 +32,7 @@ describe('parquetRead', () => { }) }) - it('filter by row overestimate', async () => { + it('row range overestimate', async () => { const file = await asyncBufferFromFile('test/files/rowgroups.parquet') await parquetRead({ file, @@ -183,6 +183,20 @@ describe('parquetRead', () => { expect(convertWithDictionary).toHaveBeenCalledTimes(4) }) + it('reads only required row groups on the boundary', async () => { + const originalFile = await asyncBufferFromFile('test/files/alpha.parquet') + const metadata = await parquetMetadataAsync(originalFile) + const file = countingBuffer(originalFile) + await parquetReadObjects({ + file, + metadata, + rowStart: 100, + rowEnd: 200, + }) + expect(file.fetches).toBe(1) // 1 rowgroup + expect(file.bytes).toBe(441) // bytes for 2nd rowgroup + }) + it('reads individual pages', async () => { const file = countingBuffer(await asyncBufferFromFile('test/files/page_indexed.parquet')) /** @type {import('../src/types.js').ColumnData[]} */