mirror of
https://github.com/asadbek064/hyparquet-writer.git
synced 2025-12-05 23:31:54 +00:00
RLE encoding for booleans
This commit is contained in:
parent
263dae7101
commit
5ecc4ff52e
@ -53,8 +53,9 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
|
||||
values = unconvert(schemaElement, values)
|
||||
|
||||
// write data page
|
||||
writeDataPageV2(writer, values, type, schemaPath, 'PLAIN', compressed)
|
||||
encodings.push('PLAIN')
|
||||
const encoding = type === 'BOOLEAN' && values.length > 16 ? 'RLE' : 'PLAIN'
|
||||
writeDataPageV2(writer, values, type, schemaPath, encoding, compressed)
|
||||
encodings.push(encoding)
|
||||
}
|
||||
|
||||
return {
|
||||
|
||||
@ -27,7 +27,11 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
|
||||
|
||||
// write page data to temp buffer
|
||||
const page = new ByteWriter()
|
||||
if (encoding === 'RLE_DICTIONARY') {
|
||||
if (encoding === 'RLE') {
|
||||
if (type !== 'BOOLEAN') throw new Error('RLE encoding only supported for BOOLEAN type')
|
||||
page.appendUint32(nonnull.length) // prepend length
|
||||
writeRleBitPackedHybrid(page, nonnull, 1)
|
||||
} else if (encoding === 'PLAIN_DICTIONARY' || encoding === 'RLE_DICTIONARY') {
|
||||
// find max bitwidth
|
||||
let maxValue = 0
|
||||
for (const v of values) if (v > maxValue) maxValue = v
|
||||
|
||||
@ -10,8 +10,8 @@ export const exampleData = [
|
||||
]
|
||||
|
||||
/**
|
||||
* @import {FileMetaData, LogicalType} from 'hyparquet'
|
||||
* @import {ColumnData, ThriftObject} from '../src/types.js'
|
||||
* @import {FileMetaData} from 'hyparquet'
|
||||
* @import {ColumnData} from '../src/types.js'
|
||||
* @type {FileMetaData}
|
||||
*/
|
||||
export const exampleMetadata = {
|
||||
|
||||
@ -38,14 +38,24 @@ describe('parquetWriteBuffer', () => {
|
||||
expect(file.byteLength).toBe(162)
|
||||
})
|
||||
|
||||
it('serializes booleans as RLE', async () => {
|
||||
const data = Array(100).fill(true)
|
||||
const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data }] })
|
||||
expect(file.byteLength).toBe(131)
|
||||
const metadata = parquetMetadata(file)
|
||||
expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['RLE'])
|
||||
const result = await parquetReadObjects({ file })
|
||||
expect(result).toEqual(data.map(bool => ({ bool })))
|
||||
})
|
||||
|
||||
it('efficiently serializes sparse booleans', async () => {
|
||||
const bool = Array(10000).fill(null)
|
||||
bool[10] = true
|
||||
bool[100] = false
|
||||
bool[500] = true
|
||||
bool[9999] = false
|
||||
const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data: bool }] })
|
||||
expect(file.byteLength).toBe(154)
|
||||
const data = Array(10000).fill(null)
|
||||
data[10] = true
|
||||
data[100] = false
|
||||
data[500] = true
|
||||
data[9999] = false
|
||||
const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data }] })
|
||||
expect(file.byteLength).toBe(159)
|
||||
const metadata = parquetMetadata(file)
|
||||
expect(metadata.metadata_length).toBe(92)
|
||||
const result = await parquetReadObjects({ file })
|
||||
|
||||
Loading…
Reference in New Issue
Block a user