diff --git a/src/column.js b/src/column.js
index 3197448..9de7b66 100644
--- a/src/column.js
+++ b/src/column.js
@@ -53,8 +53,9 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
     values = unconvert(schemaElement, values)
 
     // write data page
-    writeDataPageV2(writer, values, type, schemaPath, 'PLAIN', compressed)
-    encodings.push('PLAIN')
+    const encoding = type === 'BOOLEAN' && values.length > 16 ? 'RLE' : 'PLAIN'
+    writeDataPageV2(writer, values, type, schemaPath, encoding, compressed)
+    encodings.push(encoding)
   }
 
   return {
diff --git a/src/datapage.js b/src/datapage.js
index fa47406..f4246f8 100644
--- a/src/datapage.js
+++ b/src/datapage.js
@@ -27,7 +27,11 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
 
   // write page data to temp buffer
   const page = new ByteWriter()
-  if (encoding === 'RLE_DICTIONARY') {
+  if (encoding === 'RLE') {
+    if (type !== 'BOOLEAN') throw new Error('RLE encoding only supported for BOOLEAN type')
+    page.appendUint32(nonnull.length) // prepend length
+    writeRleBitPackedHybrid(page, nonnull, 1)
+  } else if (encoding === 'PLAIN_DICTIONARY' || encoding === 'RLE_DICTIONARY') {
     // find max bitwidth
     let maxValue = 0
     for (const v of values) if (v > maxValue) maxValue = v
diff --git a/test/example.js b/test/example.js
index 866aadc..6e57a7a 100644
--- a/test/example.js
+++ b/test/example.js
@@ -10,8 +10,8 @@ export const exampleData = [
 ]
 
 /**
- * @import {FileMetaData, LogicalType} from 'hyparquet'
- * @import {ColumnData, ThriftObject} from '../src/types.js'
+ * @import {FileMetaData} from 'hyparquet'
+ * @import {ColumnData} from '../src/types.js'
  * @type {FileMetaData}
  */
 export const exampleMetadata = {
diff --git a/test/write.buffer.test.js b/test/write.buffer.test.js
index a8f2882..24139cf 100644
--- a/test/write.buffer.test.js
+++ b/test/write.buffer.test.js
@@ -38,14 +38,24 @@ describe('parquetWriteBuffer', () => {
     expect(file.byteLength).toBe(162)
   })
 
+  it('serializes booleans as RLE', async () => {
+    const data = Array(100).fill(true)
+    const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data }] })
+    expect(file.byteLength).toBe(131)
+    const metadata = parquetMetadata(file)
+    expect(metadata.row_groups[0].columns[0].meta_data?.encodings).toEqual(['RLE'])
+    const result = await parquetReadObjects({ file })
+    expect(result).toEqual(data.map(bool => ({ bool })))
+  })
+
   it('efficiently serializes sparse booleans', async () => {
-    const bool = Array(10000).fill(null)
-    bool[10] = true
-    bool[100] = false
-    bool[500] = true
-    bool[9999] = false
-    const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data: bool }] })
-    expect(file.byteLength).toBe(154)
+    const data = Array(10000).fill(null)
+    data[10] = true
+    data[100] = false
+    data[500] = true
+    data[9999] = false
+    const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data }] })
+    expect(file.byteLength).toBe(159)
     const metadata = parquetMetadata(file)
     expect(metadata.metadata_length).toBe(92)
     const result = await parquetReadObjects({ file })