import { decompressPage } from './column.js'
import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import { readPlain } from './plain.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { readVarInt, readZigZag } from './thrift.js'

/**
 * Read a data page from the given Uint8Array.
 *
 * @typedef {import("./types.d.ts").DataPage} DataPage
 * @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData
 * @typedef {import("./types.d.ts").Compressors} Compressors
 * @typedef {import("./types.d.ts").DataPageHeaderV2} DataPageHeaderV2
 * @typedef {import("./types.d.ts").SchemaTree} SchemaTree
 * @param {Uint8Array} compressedBytes raw page data (levels are stored uncompressed; values may still be compressed)
 * @param {import("./types.d.ts").PageHeader} ph page header
 * @param {SchemaTree[]} schemaPath
 * @param {ColumnMetaData} columnMetadata
 * @param {Compressors | undefined} compressors
 * @returns {DataPage} definition levels, repetition levels, and array of values
 */
export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, compressors) {
  const view = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength)
  const reader = { view, offset: 0 }
  /** @type {any} */
  let values = []
  const daph2 = ph.data_page_header_v2
  if (!daph2) throw new Error('parquet data page header v2 is undefined')

  // repetition levels
  const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schemaPath)
  if (reader.offset !== daph2.repetition_levels_byte_length) {
    throw new Error(`parquet repetition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length}`)
  }

  // definition levels
  const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
  const definitionLevels = readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel)
  if (reader.offset !== daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) {
    throw new Error(`parquet definition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length}`)
  }

  const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length
  // read values based on encoding
  const nValues = daph2.num_values - daph2.num_nulls
  if (daph2.encoding === 'PLAIN') {
    const { element } = schemaPath[schemaPath.length - 1]
    const utf8 = element.converted_type === 'UTF8'
    let page = compressedBytes.slice(reader.offset)
    if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
      page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)
    }
    const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
    const pageReader = { view: pageView, offset: 0 }
    values = readPlain(pageReader, columnMetadata.type, nValues, utf8)
  } else if (daph2.encoding === 'RLE') {
    // RLE-encoded data pages are boolean, bit packed with width 1
    const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
    const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
    const bitWidth = 1
    if (daph2.num_nulls) {
      throw new Error('parquet RLE encoding with nulls not supported')
    } else {
      // skip the 4-byte length prefix
      const pageReader = { view: pageView, offset: 4 }
      values = new Array(nValues)
      readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, values)
    }
  } else if (
    daph2.encoding === 'PLAIN_DICTIONARY' ||
    daph2.encoding === 'RLE_DICTIONARY'
  ) {
    compressedBytes = compressedBytes.subarray(reader.offset)
    const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
    const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
    // dictionary indexes are prefixed with one byte giving their bit width
    const bitWidth = pageView.getUint8(0)
    const pageReader = { view: pageView, offset: 1 }
    values = new Array(nValues)
    readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, values)
  } else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
    if (daph2.num_nulls) throw new Error('parquet delta-int not supported')
    const codec = daph2.is_compressed ? columnMetadata.codec : 'UNCOMPRESSED'
    const page = decompressPage(compressedBytes, uncompressedPageSize, codec, compressors)
    deltaBinaryUnpack(page, nValues, values)
  } else {
    throw new Error(`parquet unsupported encoding: ${daph2.encoding}`)
  }

  return { definitionLevels, repetitionLevels, value: values }
}
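
// Usage sketch (assumes the caller has already parsed the thrift page header
// and sliced out this page's bytes; the variable names are illustrative):
//   const { definitionLevels, repetitionLevels, value } =
//     readDataPageV2(pageBytes, pageHeader, schemaPath, columnMetadata, undefined)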

/**
 * Read the repetition levels from this page, if any.
 *
 * @typedef {import("./types.d.ts").DataReader} DataReader
 * @param {DataReader} reader data view for the page
 * @param {DataPageHeaderV2} daph2 data page header
 * @param {SchemaTree[]} schemaPath
 * @returns {any[]} repetition levels
 */
export function readRepetitionLevelsV2(reader, daph2, schemaPath) {
  const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
  if (!maxRepetitionLevel) return []

  const bitWidth = widthFromMaxInt(maxRepetitionLevel)
  // there is one repetition level per value slot, including nulls
  const values = new Array(daph2.num_values)
  readRleBitPackedHybrid(
    reader, bitWidth, daph2.repetition_levels_byte_length, values
  )
  return values
}
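
// For example (illustrative, not from this file): a column nested in one
// repeated group has maxRepetitionLevel = 1, so widthFromMaxInt(1) = 1 and
// each repetition level is RLE/bit-packed into a single bit.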

/**
 * Read the definition levels from this page, if any.
 *
 * @param {DataReader} reader data view for the page
 * @param {DataPageHeaderV2} daph2 data page header v2
 * @param {number} maxDefinitionLevel
 * @returns {number[] | undefined} definition levels, or undefined if the column is required
 */
function readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) {
  if (maxDefinitionLevel) {
    // unlike V1, the byte length of the levels is known from the header
    const bitWidth = widthFromMaxInt(maxDefinitionLevel)
    const values = new Array(daph2.num_values)
    readRleBitPackedHybrid(reader, bitWidth, daph2.definition_levels_byte_length, values)
    return values
  }
}
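
// For example (illustrative, not from this file): a flat optional column has
// maxDefinitionLevel = 1, where level 1 means the value is present and 0 means
// null. A column nested three optional levels deep has maxDefinitionLevel = 3,
// and widthFromMaxInt(3) = 2, so each level packs into 2 bits.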

/**
 * Unpack the delta binary packed encoding into the values array.
 *
 * @param {Uint8Array} page page data
 * @param {number} nValues number of values to read
 * @param {any[]} values array to write to
 */
function deltaBinaryUnpack(page, nValues, values) {
  const dataView = new DataView(page.buffer, page.byteOffset, page.byteLength)
  // header: <block size> <miniblocks per block> <total value count> <first value>
  const [blockSize, index1] = readVarInt(dataView, 0)
  const [miniblockPerBlock, index2] = readVarInt(dataView, index1)
  const [count, index3] = readVarInt(dataView, index2) // total value count, should equal nValues
  let [value, offset] = readZigZag(dataView, index3)
  const valuesPerMiniblock = blockSize / miniblockPerBlock

  // the first value is stored in the header
  let valueIndex = 0
  if (valueIndex < nValues) values[valueIndex++] = value

  while (valueIndex < nValues) {
    // each block stores a min delta, then one bit width byte per miniblock
    const [minDelta, index4] = readZigZag(dataView, offset)
    offset = index4
    const bitWidths = new Uint8Array(miniblockPerBlock)
    for (let i = 0; i < miniblockPerBlock; i++, offset++) {
      bitWidths[i] = page[offset]
    }

    for (let i = 0; i < miniblockPerBlock && valueIndex < nValues; i++) {
      const bitWidth = bitWidths[i]
      if (bitWidth) {
        // deltas are stored relative to minDelta, bit packed least-significant
        // bit first; only works for small bit widths in 32-bit JS bitwise math
        const mask = (1 << bitWidth) - 1
        let data = 0
        let bits = 0
        for (let j = 0; j < valuesPerMiniblock && valueIndex < nValues; j++) {
          while (bits < bitWidth) {
            // fails when data gets too large
            data |= dataView.getUint8(offset++) << bits
            bits += 8
          }
          value += minDelta + (data & mask)
          values[valueIndex++] = value
          data >>>= bitWidth
          bits -= bitWidth
        }
      } else {
        // bit width 0: every delta in this miniblock equals minDelta
        for (let j = 0; j < valuesPerMiniblock && valueIndex < nValues; j++) {
          value += minDelta
          values[valueIndex++] = value
        }
      }
    }
  }
}
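
// Illustrative layout (assumed example, not from this file): a page encoding
// [7, 5, 3, 1] might store header blockSize=128, miniblockPerBlock=4, count=4,
// firstValue=7 (zigzag), then one block with minDelta=-2 (zigzag) and four bit
// width bytes. Since every delta equals minDelta, the stored deltas are all 0,
// each miniblock bit width is 0, and the values are rebuilt from minDelta alone.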