mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-05 22:41:55 +00:00
Fix plan row boundaries
This commit is contained in:
parent
0d1fd452aa
commit
c3a42b5bc9
@ -55,12 +55,12 @@
|
||||
"test": "vitest run"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "24.10.0",
|
||||
"@vitest/coverage-v8": "4.0.6",
|
||||
"@types/node": "24.10.1",
|
||||
"@vitest/coverage-v8": "4.0.12",
|
||||
"eslint": "9.39.1",
|
||||
"eslint-plugin-jsdoc": "61.1.12",
|
||||
"eslint-plugin-jsdoc": "61.4.0",
|
||||
"hyparquet-compressors": "1.1.1",
|
||||
"typescript": "5.9.3",
|
||||
"vitest": "4.0.6"
|
||||
"vitest": "4.0.12"
|
||||
}
|
||||
}
|
||||
|
||||
@ -26,7 +26,7 @@ export function parquetPlan({ metadata, rowStart = 0, rowEnd = Infinity, columns
|
||||
const groupRows = Number(rowGroup.num_rows)
|
||||
const groupEnd = groupStart + groupRows
|
||||
// if row group overlaps with row range, add it to the plan
|
||||
if (groupRows > 0 && groupEnd >= rowStart && groupStart < rowEnd) {
|
||||
if (groupRows > 0 && groupEnd > rowStart && groupStart < rowEnd) {
|
||||
/** @type {ByteRange[]} */
|
||||
const ranges = []
|
||||
// loop through each column chunk
|
||||
|
||||
70
test/files/alpha.json
Normal file
70
test/files/alpha.json
Normal file
@ -0,0 +1,70 @@
|
||||
[
|
||||
["aa"], ["ab"], ["ac"], ["ad"], ["ae"], ["af"], ["ag"], ["ah"], ["ai"], ["aj"],
|
||||
["ak"], ["al"], ["am"], ["an"], ["ao"], ["ap"], ["aq"], ["ar"], ["as"], ["at"],
|
||||
["au"], ["av"], ["aw"], ["ax"], ["ay"], ["az"], ["ba"], ["bb"], ["bc"], ["bd"],
|
||||
["be"], ["bf"], ["bg"], ["bh"], ["bi"], ["bj"], ["bk"], ["bl"], ["bm"], ["bn"],
|
||||
["bo"], ["bp"], ["bq"], ["br"], ["bs"], ["bt"], ["bu"], ["bv"], ["bw"], ["bx"],
|
||||
["by"], ["bz"], ["ca"], ["cb"], ["cc"], ["cd"], ["ce"], ["cf"], ["cg"], ["ch"],
|
||||
["ci"], ["cj"], ["ck"], ["cl"], ["cm"], ["cn"], ["co"], ["cp"], ["cq"], ["cr"],
|
||||
["cs"], ["ct"], ["cu"], ["cv"], ["cw"], ["cx"], ["cy"], ["cz"], ["da"], ["db"],
|
||||
["dc"], ["dd"], ["de"], ["df"], ["dg"], ["dh"], ["di"], ["dj"], ["dk"], ["dl"],
|
||||
["dm"], ["dn"], ["do"], ["dp"], ["dq"], ["dr"], ["ds"], ["dt"], ["du"], ["dv"],
|
||||
["dw"], ["dx"], ["dy"], ["dz"], ["ea"], ["eb"], ["ec"], ["ed"], ["ee"], ["ef"],
|
||||
["eg"], ["eh"], ["ei"], ["ej"], ["ek"], ["el"], ["em"], ["en"], ["eo"], ["ep"],
|
||||
["eq"], ["er"], ["es"], ["et"], ["eu"], ["ev"], ["ew"], ["ex"], ["ey"], ["ez"],
|
||||
["fa"], ["fb"], ["fc"], ["fd"], ["fe"], ["ff"], ["fg"], ["fh"], ["fi"], ["fj"],
|
||||
["fk"], ["fl"], ["fm"], ["fn"], ["fo"], ["fp"], ["fq"], ["fr"], ["fs"], ["ft"],
|
||||
["fu"], ["fv"], ["fw"], ["fx"], ["fy"], ["fz"], ["ga"], ["gb"], ["gc"], ["gd"],
|
||||
["ge"], ["gf"], ["gg"], ["gh"], ["gi"], ["gj"], ["gk"], ["gl"], ["gm"], ["gn"],
|
||||
["go"], ["gp"], ["gq"], ["gr"], ["gs"], ["gt"], ["gu"], ["gv"], ["gw"], ["gx"],
|
||||
["gy"], ["gz"], ["ha"], ["hb"], ["hc"], ["hd"], ["he"], ["hf"], ["hg"], ["hh"],
|
||||
["hi"], ["hj"], ["hk"], ["hl"], ["hm"], ["hn"], ["ho"], ["hp"], ["hq"], ["hr"],
|
||||
["hs"], ["ht"], ["hu"], ["hv"], ["hw"], ["hx"], ["hy"], ["hz"], ["ia"], ["ib"],
|
||||
["ic"], ["id"], ["ie"], ["if"], ["ig"], ["ih"], ["ii"], ["ij"], ["ik"], ["il"],
|
||||
["im"], ["in"], ["io"], ["ip"], ["iq"], ["ir"], ["is"], ["it"], ["iu"], ["iv"],
|
||||
["iw"], ["ix"], ["iy"], ["iz"], ["ja"], ["jb"], ["jc"], ["jd"], ["je"], ["jf"],
|
||||
["jg"], ["jh"], ["ji"], ["jj"], ["jk"], ["jl"], ["jm"], ["jn"], ["jo"], ["jp"],
|
||||
["jq"], ["jr"], ["js"], ["jt"], ["ju"], ["jv"], ["jw"], ["jx"], ["jy"], ["jz"],
|
||||
["ka"], ["kb"], ["kc"], ["kd"], ["ke"], ["kf"], ["kg"], ["kh"], ["ki"], ["kj"],
|
||||
["kk"], ["kl"], ["km"], ["kn"], ["ko"], ["kp"], ["kq"], ["kr"], ["ks"], ["kt"],
|
||||
["ku"], ["kv"], ["kw"], ["kx"], ["ky"], ["kz"], ["la"], ["lb"], ["lc"], ["ld"],
|
||||
["le"], ["lf"], ["lg"], ["lh"], ["li"], ["lj"], ["lk"], ["ll"], ["lm"], ["ln"],
|
||||
["lo"], ["lp"], ["lq"], ["lr"], ["ls"], ["lt"], ["lu"], ["lv"], ["lw"], ["lx"],
|
||||
["ly"], ["lz"], ["ma"], ["mb"], ["mc"], ["md"], ["me"], ["mf"], ["mg"], ["mh"],
|
||||
["mi"], ["mj"], ["mk"], ["ml"], ["mm"], ["mn"], ["mo"], ["mp"], ["mq"], ["mr"],
|
||||
["ms"], ["mt"], ["mu"], ["mv"], ["mw"], ["mx"], ["my"], ["mz"], ["na"], ["nb"],
|
||||
["nc"], ["nd"], ["ne"], ["nf"], ["ng"], ["nh"], ["ni"], ["nj"], ["nk"], ["nl"],
|
||||
["nm"], ["nn"], ["no"], ["np"], ["nq"], ["nr"], ["ns"], ["nt"], ["nu"], ["nv"],
|
||||
["nw"], ["nx"], ["ny"], ["nz"], ["oa"], ["ob"], ["oc"], ["od"], ["oe"], ["of"],
|
||||
["og"], ["oh"], ["oi"], ["oj"], ["ok"], ["ol"], ["om"], ["on"], ["oo"], ["op"],
|
||||
["oq"], ["or"], ["os"], ["ot"], ["ou"], ["ov"], ["ow"], ["ox"], ["oy"], ["oz"],
|
||||
["pa"], ["pb"], ["pc"], ["pd"], ["pe"], ["pf"], ["pg"], ["ph"], ["pi"], ["pj"],
|
||||
["pk"], ["pl"], ["pm"], ["pn"], ["po"], ["pp"], ["pq"], ["pr"], ["ps"], ["pt"],
|
||||
["pu"], ["pv"], ["pw"], ["px"], ["py"], ["pz"], ["qa"], ["qb"], ["qc"], ["qd"],
|
||||
["qe"], ["qf"], ["qg"], ["qh"], ["qi"], ["qj"], ["qk"], ["ql"], ["qm"], ["qn"],
|
||||
["qo"], ["qp"], ["qq"], ["qr"], ["qs"], ["qt"], ["qu"], ["qv"], ["qw"], ["qx"],
|
||||
["qy"], ["qz"], ["ra"], ["rb"], ["rc"], ["rd"], ["re"], ["rf"], ["rg"], ["rh"],
|
||||
["ri"], ["rj"], ["rk"], ["rl"], ["rm"], ["rn"], ["ro"], ["rp"], ["rq"], ["rr"],
|
||||
["rs"], ["rt"], ["ru"], ["rv"], ["rw"], ["rx"], ["ry"], ["rz"], ["sa"], ["sb"],
|
||||
["sc"], ["sd"], ["se"], ["sf"], ["sg"], ["sh"], ["si"], ["sj"], ["sk"], ["sl"],
|
||||
["sm"], ["sn"], ["so"], ["sp"], ["sq"], ["sr"], ["ss"], ["st"], ["su"], ["sv"],
|
||||
["sw"], ["sx"], ["sy"], ["sz"], ["ta"], ["tb"], ["tc"], ["td"], ["te"], ["tf"],
|
||||
["tg"], ["th"], ["ti"], ["tj"], ["tk"], ["tl"], ["tm"], ["tn"], ["to"], ["tp"],
|
||||
["tq"], ["tr"], ["ts"], ["tt"], ["tu"], ["tv"], ["tw"], ["tx"], ["ty"], ["tz"],
|
||||
["ua"], ["ub"], ["uc"], ["ud"], ["ue"], ["uf"], ["ug"], ["uh"], ["ui"], ["uj"],
|
||||
["uk"], ["ul"], ["um"], ["un"], ["uo"], ["up"], ["uq"], ["ur"], ["us"], ["ut"],
|
||||
["uu"], ["uv"], ["uw"], ["ux"], ["uy"], ["uz"], ["va"], ["vb"], ["vc"], ["vd"],
|
||||
["ve"], ["vf"], ["vg"], ["vh"], ["vi"], ["vj"], ["vk"], ["vl"], ["vm"], ["vn"],
|
||||
["vo"], ["vp"], ["vq"], ["vr"], ["vs"], ["vt"], ["vu"], ["vv"], ["vw"], ["vx"],
|
||||
["vy"], ["vz"], ["wa"], ["wb"], ["wc"], ["wd"], ["we"], ["wf"], ["wg"], ["wh"],
|
||||
["wi"], ["wj"], ["wk"], ["wl"], ["wm"], ["wn"], ["wo"], ["wp"], ["wq"], ["wr"],
|
||||
["ws"], ["wt"], ["wu"], ["wv"], ["ww"], ["wx"], ["wy"], ["wz"], ["xa"], ["xb"],
|
||||
["xc"], ["xd"], ["xe"], ["xf"], ["xg"], ["xh"], ["xi"], ["xj"], ["xk"], ["xl"],
|
||||
["xm"], ["xn"], ["xo"], ["xp"], ["xq"], ["xr"], ["xs"], ["xt"], ["xu"], ["xv"],
|
||||
["xw"], ["xx"], ["xy"], ["xz"], ["ya"], ["yb"], ["yc"], ["yd"], ["ye"], ["yf"],
|
||||
["yg"], ["yh"], ["yi"], ["yj"], ["yk"], ["yl"], ["ym"], ["yn"], ["yo"], ["yp"],
|
||||
["yq"], ["yr"], ["ys"], ["yt"], ["yu"], ["yv"], ["yw"], ["yx"], ["yy"], ["yz"],
|
||||
["za"], ["zb"], ["zc"], ["zd"], ["ze"], ["zf"], ["zg"], ["zh"], ["zi"], ["zj"],
|
||||
["zk"], ["zl"], ["zm"], ["zn"], ["zo"], ["zp"], ["zq"], ["zr"], ["zs"], ["zt"],
|
||||
["zu"], ["zv"], ["zw"], ["zx"], ["zy"], ["zz"]
|
||||
]
|
||||
216
test/files/alpha.metadata.json
Normal file
216
test/files/alpha.metadata.json
Normal file
@ -0,0 +1,216 @@
|
||||
{
|
||||
"version": 2,
|
||||
"schema": [
|
||||
{
|
||||
"name": "root",
|
||||
"num_children": 1
|
||||
},
|
||||
{
|
||||
"type": "BYTE_ARRAY",
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "id",
|
||||
"converted_type": "UTF8"
|
||||
}
|
||||
],
|
||||
"num_rows": 676,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 4,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"id"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 100,
|
||||
"total_uncompressed_size": 440,
|
||||
"total_compressed_size": 440,
|
||||
"data_page_offset": 4,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "dv",
|
||||
"min_value": "aa"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 440,
|
||||
"num_rows": 100
|
||||
},
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 444,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"id"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 100,
|
||||
"total_uncompressed_size": 441,
|
||||
"total_compressed_size": 441,
|
||||
"data_page_offset": 444,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "hr",
|
||||
"min_value": "dw"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 441,
|
||||
"num_rows": 100
|
||||
},
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 885,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"id"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 100,
|
||||
"total_uncompressed_size": 437,
|
||||
"total_compressed_size": 437,
|
||||
"data_page_offset": 885,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "ln",
|
||||
"min_value": "hs"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 437,
|
||||
"num_rows": 100
|
||||
},
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 1322,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"id"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 100,
|
||||
"total_uncompressed_size": 438,
|
||||
"total_compressed_size": 438,
|
||||
"data_page_offset": 1322,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "pj",
|
||||
"min_value": "lo"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 438,
|
||||
"num_rows": 100
|
||||
},
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 1760,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"id"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 100,
|
||||
"total_uncompressed_size": 440,
|
||||
"total_compressed_size": 440,
|
||||
"data_page_offset": 1760,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "tf",
|
||||
"min_value": "pk"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 440,
|
||||
"num_rows": 100
|
||||
},
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 2200,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"id"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 100,
|
||||
"total_uncompressed_size": 435,
|
||||
"total_compressed_size": 435,
|
||||
"data_page_offset": 2200,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "xb",
|
||||
"min_value": "tg"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 435,
|
||||
"num_rows": 100
|
||||
},
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 2635,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": [
|
||||
"PLAIN"
|
||||
],
|
||||
"path_in_schema": [
|
||||
"id"
|
||||
],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 76,
|
||||
"total_uncompressed_size": 335,
|
||||
"total_compressed_size": 335,
|
||||
"data_page_offset": 2635,
|
||||
"statistics": {
|
||||
"null_count": 0,
|
||||
"max_value": "zz",
|
||||
"min_value": "xc"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 335,
|
||||
"num_rows": 76
|
||||
}
|
||||
],
|
||||
"created_by": "hyparquet",
|
||||
"metadata_length": 396
|
||||
}
|
||||
BIN
test/files/alpha.parquet
Normal file
BIN
test/files/alpha.parquet
Normal file
Binary file not shown.
@ -20,7 +20,7 @@ describe('parquetRead', () => {
|
||||
.rejects.toThrow('parquet expected AsyncBuffer')
|
||||
})
|
||||
|
||||
it('filter by row', async () => {
|
||||
it('read row range', async () => {
|
||||
const file = await asyncBufferFromFile('test/files/rowgroups.parquet')
|
||||
await parquetRead({
|
||||
file,
|
||||
@ -32,7 +32,7 @@ describe('parquetRead', () => {
|
||||
})
|
||||
})
|
||||
|
||||
it('filter by row overestimate', async () => {
|
||||
it('row range overestimate', async () => {
|
||||
const file = await asyncBufferFromFile('test/files/rowgroups.parquet')
|
||||
await parquetRead({
|
||||
file,
|
||||
@ -183,6 +183,20 @@ describe('parquetRead', () => {
|
||||
expect(convertWithDictionary).toHaveBeenCalledTimes(4)
|
||||
})
|
||||
|
||||
it('reads only required row groups on the boundary', async () => {
|
||||
const originalFile = await asyncBufferFromFile('test/files/alpha.parquet')
|
||||
const metadata = await parquetMetadataAsync(originalFile)
|
||||
const file = countingBuffer(originalFile)
|
||||
await parquetReadObjects({
|
||||
file,
|
||||
metadata,
|
||||
rowStart: 100,
|
||||
rowEnd: 200,
|
||||
})
|
||||
expect(file.fetches).toBe(1) // 1 rowgroup
|
||||
expect(file.bytes).toBe(441) // bytes for 2nd rowgroup
|
||||
})
|
||||
|
||||
it('reads individual pages', async () => {
|
||||
const file = countingBuffer(await asyncBufferFromFile('test/files/page_indexed.parquet'))
|
||||
/** @type {import('../src/types.js').ColumnData[]} */
|
||||
|
||||
Loading…
Reference in New Issue
Block a user