// mirror of https://github.com/asadbek064/hyparquet-writer.git (synced 2026-01-04)
|
|
import { Encoding, PageType } from 'hyparquet/src/constants.js'
|
||
|
|
import { writeRleBitPackedHybrid } from './encoding.js'
|
||
|
|
import { writePlain } from './plain.js'
|
||
|
|
import { serializeTCompactProtocol } from './thrift.js'
|
||
|
|
import { Writer } from './writer.js'
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType} from 'hyparquet/src/types.js'
|
||
|
|
* @param {Writer} writer
|
||
|
|
* @param {string} columnName
|
||
|
|
* @param {DecodedArray} values
|
||
|
|
* @param {ParquetType} type
|
||
|
|
* @returns {ColumnMetaData}
|
||
|
|
*/
|
||
|
|
export function writeColumn(writer, columnName, values, type) {
|
||
|
|
// Get data stats
|
||
|
|
const num_nulls = values.filter(v => v === null).length
|
||
|
|
const offsetStart = writer.offset
|
||
|
|
|
||
|
|
// Write page to temp buffer
|
||
|
|
const page = new Writer()
|
||
|
|
|
||
|
|
/** @type {import('hyparquet/src/types.js').Encoding} */
|
||
|
|
const encoding = 'PLAIN'
|
||
|
|
|
||
|
|
// TODO: repetition levels
|
||
|
|
const maxRepetitionLevel = 0
|
||
|
|
let repetition_levels_byte_length = 0
|
||
|
|
if (maxRepetitionLevel) {
|
||
|
|
repetition_levels_byte_length = writeRleBitPackedHybrid(page, [])
|
||
|
|
}
|
||
|
|
|
||
|
|
// TODO: definition levels
|
||
|
|
const maxDefinitionLevel = 0
|
||
|
|
let definition_levels_byte_length = 0
|
||
|
|
if (maxDefinitionLevel) {
|
||
|
|
definition_levels_byte_length = writeRleBitPackedHybrid(page, [])
|
||
|
|
}
|
||
|
|
|
||
|
|
// write page data (TODO: compressed)
|
||
|
|
const { uncompressed_page_size, compressed_page_size } = writePageData(page, values, type)
|
||
|
|
|
||
|
|
// write page header
|
||
|
|
/** @type {PageHeader} */
|
||
|
|
const header = {
|
||
|
|
type: 'DATA_PAGE_V2',
|
||
|
|
uncompressed_page_size,
|
||
|
|
compressed_page_size,
|
||
|
|
data_page_header_v2: {
|
||
|
|
num_values: values.length,
|
||
|
|
num_nulls,
|
||
|
|
num_rows: values.length,
|
||
|
|
encoding,
|
||
|
|
definition_levels_byte_length,
|
||
|
|
repetition_levels_byte_length,
|
||
|
|
is_compressed: false,
|
||
|
|
},
|
||
|
|
}
|
||
|
|
writePageHeader(writer, header)
|
||
|
|
|
||
|
|
// write page data
|
||
|
|
writer.appendBuffer(page.getBuffer())
|
||
|
|
|
||
|
|
return {
|
||
|
|
type,
|
||
|
|
encodings: ['PLAIN'],
|
||
|
|
path_in_schema: [columnName],
|
||
|
|
codec: 'UNCOMPRESSED',
|
||
|
|
num_values: BigInt(values.length),
|
||
|
|
total_compressed_size: BigInt(writer.offset - offsetStart),
|
||
|
|
total_uncompressed_size: BigInt(writer.offset - offsetStart),
|
||
|
|
data_page_offset: BigInt(offsetStart),
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
 * Deduce a ParquetType from the JS values.
 *
 * Fixes over the naive version:
 *  - an empty array has no determinable type, so it throws instead of
 *    vacuously matching the first `every()` and typing the column BOOLEAN
 *  - integers outside the signed 32-bit range cannot be stored as INT32
 *    (they would overflow on write), so they fall through to DOUBLE
 *
 * @param {DecodedArray} values
 * @returns {ParquetType}
 * @throws {Error} if values is empty, or of mixed/unsupported types
 */
export function getParquetTypeForValues(values) {
  if (!values.length) throw new Error('Cannot determine parquet type for empty values')
  if (values.every(v => typeof v === 'boolean')) return 'BOOLEAN'
  if (values.every(v => typeof v === 'bigint')) return 'INT64'
  // INT32 only when every integer fits in a signed 32-bit word
  if (values.every(v => Number.isInteger(v) && v >= -2147483648 && v <= 2147483647)) return 'INT32'
  if (values.every(v => typeof v === 'number')) return 'DOUBLE'
  if (values.every(v => typeof v === 'string')) return 'BYTE_ARRAY'
  throw new Error(`Cannot determine parquet type for: ${values}`)
}
|
|
/**
 * Serialize a PageHeader struct with the thrift compact protocol.
 *
 * @param {Writer} writer
 * @param {PageHeader} header
 */
function writePageHeader(writer, header) {
  const v2 = header.data_page_header_v2
  // Passes v2's falsiness through unchanged so an absent DataPageHeaderV2
  // field is skipped by the serializer.
  const dataPageHeaderV2 = v2 && {
    field_1: v2.num_values,
    field_2: v2.num_nulls,
    field_3: v2.num_rows,
    field_4: Encoding.indexOf(v2.encoding),
    field_5: v2.definition_levels_byte_length,
    field_6: v2.repetition_levels_byte_length,
    // thrift default is true, so the field is only written when false
    field_7: v2.is_compressed ? undefined : false,
  }
  serializeTCompactProtocol(writer, {
    field_1: PageType.indexOf(header.type),
    field_2: header.uncompressed_page_size,
    field_3: header.compressed_page_size,
    field_8: dataPageHeaderV2,
  })
}
||
|
|
/**
 * Append the values in PLAIN encoding and report the page size.
 * No compression codec is applied, so both reported sizes are equal.
 *
 * @param {Writer} writer
 * @param {DecodedArray} values
 * @param {ParquetType} type
 * @returns {{ uncompressed_page_size: number, compressed_page_size: number }}
 */
function writePageData(writer, values, type) {
  const before = writer.offset
  writePlain(writer, values, type)
  const byteLength = writer.offset - before
  return {
    uncompressed_page_size: byteLength,
    compressed_page_size: byteLength,
  }
}