RLE encoding for booleans

This commit is contained in:
Kenny Daniel 2025-04-20 18:47:53 -07:00
parent 263dae7101
commit 5ecc4ff52e
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 27 additions and 12 deletions

@@ -53,8 +53,9 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
values = unconvert(schemaElement, values)
// write data page
writeDataPageV2(writer, values, type, schemaPath, 'PLAIN', compressed)
encodings.push('PLAIN')
const encoding = type === 'BOOLEAN' && values.length > 16 ? 'RLE' : 'PLAIN'
writeDataPageV2(writer, values, type, schemaPath, encoding, compressed)
encodings.push(encoding)
}
return {

@@ -27,7 +27,11 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
// write page data to temp buffer
const page = new ByteWriter()
if (encoding === 'RLE_DICTIONARY') {
if (encoding === 'RLE') {
if (type !== 'BOOLEAN') throw new Error('RLE encoding only supported for BOOLEAN type')
page.appendUint32(nonnull.length) // prepend length
writeRleBitPackedHybrid(page, nonnull, 1)
} else if (encoding === 'PLAIN_DICTIONARY' || encoding === 'RLE_DICTIONARY') {
// find max bitwidth
let maxValue = 0
for (const v of values) if (v > maxValue) maxValue = v

@@ -10,8 +10,8 @@ export const exampleData = [
]
/**
* @import {FileMetaData, LogicalType} from 'hyparquet'
* @import {ColumnData, ThriftObject} from '../src/types.js'
* @import {FileMetaData} from 'hyparquet'
* @import {ColumnData} from '../src/types.js'
* @type {FileMetaData}
*/
export const exampleMetadata = {

@@ -38,14 +38,24 @@ describe('parquetWriteBuffer', () => {
expect(file.byteLength).toBe(162)
})
// Checks that a column of 100 `true` values is written with RLE encoding
// and that the written file reads back to the same rows.
it('serializes booleans as RLE', async () => {
// 100 values is above the `values.length > 16` cutoff that writeColumn uses
// to choose 'RLE' over 'PLAIN' (presumably the column type is inferred as
// BOOLEAN from the data — confirm against the writer's type inference).
const data = Array(100).fill(true)
const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data }] })
// Pin the exact file size so a change in the encoded output shows up here.
expect(file.byteLength).toBe(131)
const metadata = parquetMetadata(file)
// The column chunk metadata should list RLE as its only encoding.
expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['RLE'])
// Round-trip: each input value should come back as a `{ bool }` row object.
const result = await parquetReadObjects({ file })
expect(result).toEqual(data.map(bool => ({ bool })))
})
it('efficiently serializes sparse booleans', async () => {
const bool = Array(10000).fill(null)
bool[10] = true
bool[100] = false
bool[500] = true
bool[9999] = false
const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data: bool }] })
expect(file.byteLength).toBe(154)
const data = Array(10000).fill(null)
data[10] = true
data[100] = false
data[500] = true
data[9999] = false
const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data }] })
expect(file.byteLength).toBe(159)
const metadata = parquetMetadata(file)
expect(metadata.metadata_length).toBe(92)
const result = await parquetReadObjects({ file })