Repeated types not yet supported

This commit is contained in:
Kenny Daniel 2025-04-21 22:47:43 -07:00
parent b11b92ffb9
commit 842ff4c15e
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
3 changed files with 15 additions and 13 deletions

@@ -13,9 +13,9 @@ import { writeDataPageV2, writePageHeader } from './datapage.js'
* @returns {ColumnMetaData}
*/
export function writeColumn(writer, schemaPath, values, compressed, stats) {
const schemaElement = schemaPath[schemaPath.length - 1]
const { type, type_length } = schemaElement
if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
const element = schemaPath[schemaPath.length - 1]
const { type, type_length } = element
if (!type) throw new Error(`column ${element.name} cannot determine type`)
const offsetStart = writer.offset
const num_values = values.length
/** @type {Encoding[]} */
@@ -41,20 +41,20 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
}
// write unconverted dictionary page
const unconverted = unconvert(schemaElement, dictionary)
const unconverted = unconvert(element, dictionary)
writeDictionaryPage(writer, unconverted, type, type_length, compressed)
// write data page with dictionary indexes
data_page_offset = BigInt(writer.offset)
writeDataPageV2(writer, indexes, type, schemaPath, 'RLE_DICTIONARY', compressed)
writeDataPageV2(writer, indexes, schemaPath, 'RLE_DICTIONARY', compressed)
encodings.push('RLE_DICTIONARY')
} else {
// unconvert values from rich types to simple
values = unconvert(schemaElement, values)
values = unconvert(element, values)
// write data page
const encoding = type === 'BOOLEAN' && values.length > 16 ? 'RLE' : 'PLAIN'
writeDataPageV2(writer, values, type, schemaPath, encoding, compressed)
writeDataPageV2(writer, values, schemaPath, encoding, compressed)
encodings.push(encoding)
}

@@ -10,13 +10,15 @@ import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
* @import {Writer} from '../src/types.js'
* @param {Writer} writer
* @param {DecodedArray} values
* @param {ParquetType} type
* @param {SchemaElement[]} schemaPath
* @param {import('hyparquet').Encoding} encoding
* @param {boolean} compressed
*/
export function writeDataPageV2(writer, values, type, schemaPath, encoding, compressed) {
const fixedLength = schemaPath.at(-1)?.type_length
export function writeDataPageV2(writer, values, schemaPath, encoding, compressed) {
const { name, type, type_length, repetition_type } = schemaPath[schemaPath.length - 1]
if (!type) throw new Error(`column ${name} cannot determine type`)
if (repetition_type === 'REPEATED') throw new Error(`column ${name} repeated types not supported`)
// write levels to temp buffer
const levels = new ByteWriter()
@@ -39,7 +41,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
page.appendUint8(bitWidth) // prepend bitWidth
writeRleBitPackedHybrid(page, nonnull, bitWidth)
} else {
writePlain(page, nonnull, type, fixedLength)
writePlain(page, nonnull, type, type_length)
}
// compress page data
@@ -108,7 +110,7 @@ export function writePageHeader(writer, header) {
}
/**
* @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
* @import {DecodedArray, PageHeader, SchemaElement} from 'hyparquet'
* @param {Writer} writer
* @param {SchemaElement[]} schemaPath
* @param {DecodedArray} values

@@ -35,7 +35,7 @@ export function unconvert(element, values) {
if (ctype === 'UTF8') {
if (!Array.isArray(values)) throw new Error('strings must be an array')
const encoder = new TextEncoder()
return values.map(v => encoder.encode(v))
return values.map(v => typeof v === 'string' ? encoder.encode(v) : v)
}
if (ltype?.type === 'FLOAT16') {
if (type !== 'FIXED_LEN_BYTE_ARRAY') throw new Error('FLOAT16 must be FIXED_LEN_BYTE_ARRAY type')