import { unconvert } from './unconvert.js'
import { writeRleBitPackedHybrid } from './encoding.js'
import { writePlain } from './plain.js'
import { snappyCompress } from './snappy.js'
import { ByteWriter } from './bytewriter.js'
import { writeLevels, writePageHeader } from './datapage.js'

/**
 * Write a column chunk: an optional dictionary page followed by a data page,
 * and return the column chunk metadata.
 *
 * @param {Writer} writer
 * @param {SchemaElement[]} schemaPath
 * @param {DecodedArray} values
 * @param {boolean} compressed
 * @param {boolean} stats
 * @returns {ColumnMetaData}
 */
export function writeColumn(writer, schemaPath, values, compressed, stats) {
  const schemaElement = schemaPath[schemaPath.length - 1]
  const { type } = schemaElement
  if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
  const offsetStart = writer.offset
  const num_values = values.length

  // compute statistics before values are replaced by dictionary indices
  const statistics = stats ? getStatistics(values) : undefined

  // write repetition and definition levels to temp buffer
  const levels = new ByteWriter()
  const { definition_levels_byte_length, repetition_levels_byte_length, num_nulls }
    = writeLevels(levels, schemaPath, values)

  // dictionary encoding
  let dictionary_page_offset = undefined
  /** @type {DecodedArray | undefined} */
  const dictionary = useDictionary(values, type)
  if (dictionary) {
    dictionary_page_offset = BigInt(writer.offset)

    // replace non-null values with dictionary indices
    // (nulls are represented in the definition levels, not in page data)
    const indexes = new Int32Array(num_values - num_nulls)
    let j = 0
    for (let i = 0; i < values.length; i++) {
      const index = dictionary.indexOf(values[i])
      if (index >= 0) indexes[j++] = index
    }
    values = indexes

    // write unconverted dictionary page
    const unconverted = unconvert(schemaElement, dictionary)
    writeDictionaryPage(writer, unconverted, type, compressed)
  } else {
    // unconvert type and filter out nulls
    values = unconvert(schemaElement, values)
      .filter(v => v !== null && v !== undefined)
  }

  // write page data to temp buffer
  const page = new ByteWriter()
  /** @type {import('hyparquet').Encoding} */
  const encoding = dictionary ? 'RLE_DICTIONARY' : 'PLAIN'
  if (dictionary) {
    // RLE_DICTIONARY page data is the index bit width, then RLE/bit-packed indices
    const bitWidth = Math.ceil(Math.log2(dictionary.length))
    page.appendUint8(bitWidth)
    writeRleBitPackedHybrid(page, values)
  } else {
    writePlain(page, values, type)
  }

  // compress page data
  let compressedPage = page
  if (compressed) {
    compressedPage = new ByteWriter()
    snappyCompress(compressedPage, new Uint8Array(page.getBuffer()))
  }

  // write page header
  const data_page_offset = BigInt(writer.offset)
  /** @type {PageHeader} */
  const header = {
    type: 'DATA_PAGE_V2',
    uncompressed_page_size: levels.offset + page.offset,
    compressed_page_size: levels.offset + compressedPage.offset,
    data_page_header_v2: {
      num_values,
      num_nulls,
      num_rows: num_values, // assumes one value per row (no repeated fields)
      encoding,
      definition_levels_byte_length,
      repetition_levels_byte_length,
      is_compressed: compressed,
    },
  }
  writePageHeader(writer, header)

  // write levels
  writer.appendBuffer(levels.getBuffer())

  // write page data
  writer.appendBuffer(compressedPage.getBuffer())

  return {
    type,
    encodings: [encoding],
    path_in_schema: schemaPath.slice(1).map(s => s.name),
    codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
    num_values: BigInt(num_values),
    total_compressed_size: BigInt(writer.offset - offsetStart),
    // approximation: bytes written, not adding back what compression saved
    total_uncompressed_size: BigInt(writer.offset - offsetStart),
    data_page_offset,
    dictionary_page_offset,
    statistics,
  }
}
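
// Illustrative usage sketch (not part of the module): writing one optional
// INT32 column into an in-memory buffer. The schema element shapes below are
// assumptions about how callers build a schema path.
//
//   const writer = new ByteWriter()
//   const schemaPath = [
//     { name: 'root', num_children: 1 },
//     { name: 'id', type: 'INT32', repetition_type: 'OPTIONAL' },
//   ]
//   const meta = writeColumn(writer, schemaPath, [1, 2, null, 2], true, true)
//   // meta.codec === 'SNAPPY', meta.num_values === 4n,
//   // and meta.statistics.null_count === 1n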

/**
 * Decide whether a column should be dictionary-encoded.
 * Returns the dictionary of unique non-null values, or undefined to
 * fall back to plain encoding.
 *
 * @param {DecodedArray} values
 * @param {ParquetType} type
 * @returns {any[] | undefined}
 */
function useDictionary(values, type) {
  if (type === 'BOOLEAN') return
  const unique = new Set(values)
  unique.delete(undefined)
  unique.delete(null)
  // require enough values, at least one non-null value,
  // and fewer distinct non-null values than total values
  if (values.length > 10 && unique.size > 0 && unique.size < values.length) {
    // TODO: sort by frequency
    return Array.from(unique)
  }
}
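
// Sketch of the heuristic above: repeated values earn a dictionary,
// all-unique values do not (thresholds as coded, not tuned).
//
//   useDictionary(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'a', 'b', 'a', 'b'], 'BYTE_ARRAY')
//   // => ['a', 'b'] (11 values, 2 unique)
//   useDictionary([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'INT32')
//   // => undefined (no repeats to exploit)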

/**
 * @param {Writer} writer
 * @param {DecodedArray} dictionary
 * @param {ParquetType} type
 * @param {boolean} compressed
 */
function writeDictionaryPage(writer, dictionary, type, compressed) {
  const dictionaryPage = new ByteWriter()
  writePlain(dictionaryPage, dictionary, type)

  // compress dictionary page data
  let compressedDictionaryPage = dictionaryPage
  if (compressed) {
    compressedDictionaryPage = new ByteWriter()
    snappyCompress(compressedDictionaryPage, new Uint8Array(dictionaryPage.getBuffer()))
  }

  // write dictionary page header
  /** @type {PageHeader} */
  const dictionaryHeader = {
    type: 'DICTIONARY_PAGE',
    uncompressed_page_size: dictionaryPage.offset,
    compressed_page_size: compressedDictionaryPage.offset,
    dictionary_page_header: {
      num_values: dictionary.length,
      encoding: 'PLAIN',
    },
  }
  writePageHeader(writer, dictionaryHeader)
  writer.appendBuffer(compressedDictionaryPage.getBuffer())
}
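
// Byte-layout sketch (illustrative, assuming compressed = false): the writer
// receives a thrift-encoded PageHeader followed by the raw page bytes.
//
//   const writer = new ByteWriter()
//   writeDictionaryPage(writer, ['a', 'b'], 'BYTE_ARRAY', false)
//   // writer buffer: [PageHeader: DICTIONARY_PAGE, PLAIN][PLAIN values 'a', 'b']
//   // the data page written afterwards stores indices into this dictionary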

/**
 * @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType, SchemaElement, Statistics} from 'hyparquet'
 * @import {Writer} from '../src/types.js'
 * @param {DecodedArray} values
 * @returns {Statistics}
 */
function getStatistics(values) {
  let min_value = undefined
  let max_value = undefined
  let null_count = 0n
  for (const value of values) {
    if (value === null || value === undefined) {
      null_count++
      continue
    }
    if (min_value === undefined || value < min_value) {
      min_value = value
    }
    if (max_value === undefined || value > max_value) {
      max_value = value
    }
  }
  return { min_value, max_value, null_count }
}
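
// Example (illustrative): min/max skip nulls, null_count is a bigint.
//
//   getStatistics([3, null, 1, 4, undefined, 1, 5])
//   // => { min_value: 1, max_value: 5, null_count: 2n }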