From 842ff4c15ea988b9a8b99c05ca17894e7ad1f1d6 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Mon, 21 Apr 2025 22:47:43 -0700 Subject: [PATCH] Repeated types not yet supported --- src/column.js | 14 +++++++------- src/datapage.js | 12 +++++++----- src/unconvert.js | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/column.js b/src/column.js index 8006fd1..a781290 100644 --- a/src/column.js +++ b/src/column.js @@ -13,9 +13,9 @@ import { writeDataPageV2, writePageHeader } from './datapage.js' * @returns {ColumnMetaData} */ export function writeColumn(writer, schemaPath, values, compressed, stats) { - const schemaElement = schemaPath[schemaPath.length - 1] - const { type, type_length } = schemaElement - if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`) + const element = schemaPath[schemaPath.length - 1] + const { type, type_length } = element + if (!type) throw new Error(`column ${element.name} cannot determine type`) const offsetStart = writer.offset const num_values = values.length /** @type {Encoding[]} */ @@ -41,20 +41,20 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) { } // write unconverted dictionary page - const unconverted = unconvert(schemaElement, dictionary) + const unconverted = unconvert(element, dictionary) writeDictionaryPage(writer, unconverted, type, type_length, compressed) // write data page with dictionary indexes data_page_offset = BigInt(writer.offset) - writeDataPageV2(writer, indexes, type, schemaPath, 'RLE_DICTIONARY', compressed) + writeDataPageV2(writer, indexes, schemaPath, 'RLE_DICTIONARY', compressed) encodings.push('RLE_DICTIONARY') } else { // unconvert values from rich types to simple - values = unconvert(schemaElement, values) + values = unconvert(element, values) // write data page const encoding = type === 'BOOLEAN' && values.length > 16 ? 'RLE' : 'PLAIN' - writeDataPageV2(writer, values, type, schemaPath, encoding, compressed) + writeDataPageV2(writer, values, schemaPath, encoding, compressed) encodings.push(encoding) } diff --git a/src/datapage.js b/src/datapage.js index df4883e..bb4704d 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -10,13 +10,15 @@ import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js' * @import {Writer} from '../src/types.js' * @param {Writer} writer * @param {DecodedArray} values - * @param {ParquetType} type * @param {SchemaElement[]} schemaPath * @param {import('hyparquet').Encoding} encoding * @param {boolean} compressed */ -export function writeDataPageV2(writer, values, type, schemaPath, encoding, compressed) { - const fixedLength = schemaPath.at(-1)?.type_length +export function writeDataPageV2(writer, values, schemaPath, encoding, compressed) { + const { name, type, type_length, repetition_type } = schemaPath[schemaPath.length - 1] + + if (!type) throw new Error(`column ${name} cannot determine type`) + if (repetition_type === 'REPEATED') throw new Error(`column ${name} repeated types not supported`) // write levels to temp buffer const levels = new ByteWriter() @@ -39,7 +41,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp page.appendUint8(bitWidth) // prepend bitWidth writeRleBitPackedHybrid(page, nonnull, bitWidth) } else { - writePlain(page, nonnull, type, fixedLength) + writePlain(page, nonnull, type, type_length) } // compress page data @@ -108,7 +110,7 @@ export function writePageHeader(writer, header) { } /** - * @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet' + * @import {DecodedArray, PageHeader, SchemaElement} from 'hyparquet' * @param {Writer} writer * @param {SchemaElement[]} schemaPath * @param {DecodedArray} values diff --git a/src/unconvert.js b/src/unconvert.js index c0c6bf0..839e98b 100644 --- a/src/unconvert.js +++ b/src/unconvert.js @@ -35,7 +35,7 @@ export function unconvert(element, values) { if (ctype === 'UTF8') { if (!Array.isArray(values)) throw new Error('strings must be an array') const encoder = new TextEncoder() - return values.map(v => encoder.encode(v)) + return values.map(v => typeof v === 'string' ? encoder.encode(v) : v) } if (ltype?.type === 'FLOAT16') { if (type !== 'FIXED_LEN_BYTE_ARRAY') throw new Error('FLOAT16 must be FIXED_LEN_BYTE_ARRAY type')