diff --git a/.gitignore b/.gitignore
index 0e5a8a8..77485ad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 coverage
 node_modules
 package-lock.json
-dist
 *.tgz
+.DS_Store
+/types
diff --git a/README.md b/README.md
index cd50f63..07d1134 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,21 @@
 # Hyparquet Writer
 
+[![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions)
 [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)
 ![coverage](https://img.shields.io/badge/Coverage-96-darkred)
+[![dependencies](https://img.shields.io/badge/Dependencies-0-blueviolet)](https://www.npmjs.com/package/hyparquet?activeTab=dependencies)
 
 ## Usage
 
-```javascript
-import { writeParquet } from 'hyparquet-writer'
+Call `parquetWrite` with a list of columns. Each column is an object with a `name` and a `data` field, where `data` is an array of same-type values.
 
-const arrayBuffer = writeParquet({
-  name: ['Alice', 'Bob', 'Charlie'],
-  age: [25, 30, 35],
-})
+```javascript
+import { parquetWrite } from 'hyparquet-writer'
+
+const arrayBuffer = parquetWrite([
+  { name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
+  { name: 'age', data: [25, 30, 35] },
+])
 ```
 
 ## References
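The new column-oriented API above can be exercised end to end. A minimal round-trip sketch (illustration only, not part of the patch; it assumes hyparquet's `parquetReadObjects`, which the tests below also use, and the column names `id` and `tag` are made up):

```javascript
import { parquetWrite } from 'hyparquet-writer'
import { parquetReadObjects } from 'hyparquet'

// each column is { name, data } where data holds same-type values
const file = parquetWrite([
  { name: 'id', data: [1, 2, 3] },        // detected as INT32
  { name: 'tag', data: ['x', 'y', 'z'] }, // BYTE_ARRAY with converted_type UTF8
])

// read the ArrayBuffer back into row objects (top-level await, ESM)
const rows = await parquetReadObjects({ file })
// rows => [{ id: 1, tag: 'x' }, { id: 2, tag: 'y' }, { id: 3, tag: 'z' }]
```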
diff --git a/package.json b/package.json
index 06d63b8..0ddf1ff 100644
--- a/package.json
+++ b/package.json
@@ -6,7 +6,8 @@
   "homepage": "https://hyperparam.app",
   "keywords": [
     "hyparquet",
-    "parquet"
+    "parquet",
+    "thrift"
   ],
   "license": "MIT",
   "repository": {
@@ -15,14 +16,27 @@
   },
   "main": "src/index.js",
   "files": [
-    "src"
+    "src",
+    "types"
   ],
   "type": "module",
   "types": "src/index.d.ts",
+  "exports": {
+    ".": {
+      "import": "./src/index.js",
+      "types": "./types/index.d.ts"
+    },
+    "./src/*.js": {
+      "import": "./src/*.js",
+      "types": "./types/*.d.ts"
+    }
+  },
   "scripts": {
+    "build:types": "tsc -p ./tsconfig.build.json",
     "coverage": "vitest run --coverage",
     "lint": "eslint",
     "lint:fix": "eslint --fix",
+    "prepare": "npm run build:types",
     "test": "vitest run"
   },
   "devDependencies": {
diff --git a/src/convert.js b/src/convert.js
index 74e4787..6973990 100644
--- a/src/convert.js
+++ b/src/convert.js
@@ -13,9 +13,14 @@ export function unconvert(schemaElement, values) {
     return values.map(v => v.getTime())
   }
   if (ctype === 'JSON') {
-    const encoder = new TextEncoder()
     if (!Array.isArray(values)) throw new Error('JSON must be an array')
+    const encoder = new TextEncoder()
     return values.map(v => encoder.encode(JSON.stringify(v)))
   }
+  if (ctype === 'UTF8') {
+    if (!Array.isArray(values)) throw new Error('strings must be an array')
+    const encoder = new TextEncoder()
+    return values.map(v => encoder.encode(v))
+  }
   return values
 }
diff --git a/src/plain.js b/src/plain.js
index c8be649..a2f42ef 100644
--- a/src/plain.js
+++ b/src/plain.js
@@ -84,10 +84,9 @@ function writePlainDouble(writer, values) {
  * @param {DecodedArray} values
  */
 function writePlainByteArray(writer, values) {
-  const encoder = new TextEncoder()
   for (const value of values) {
-    const bytes = typeof value === 'string' ? encoder.encode(value) : value
-    writer.appendUint32(bytes.length)
-    writer.appendBytes(bytes)
+    if (!(value instanceof Uint8Array)) throw new Error('BYTE_ARRAY must be Uint8Array')
+    writer.appendUint32(value.length)
+    writer.appendBytes(value)
   }
 }
diff --git a/src/schema.js b/src/schema.js
index bc68035..62e5859 100644
--- a/src/schema.js
+++ b/src/schema.js
@@ -29,7 +29,13 @@ export function getSchemaElementForValues(name, values) {
     else if (typeof value === 'bigint') valueType = 'INT64'
     else if (Number.isInteger(value)) valueType = 'INT32'
     else if (typeof value === 'number') valueType = 'DOUBLE'
-    else if (typeof value === 'string') valueType = 'BYTE_ARRAY'
+    else if (value instanceof Uint8Array) valueType = 'BYTE_ARRAY'
+    else if (typeof value === 'string') {
+      valueType = 'BYTE_ARRAY'
+      // make sure they are all strings
+      if (type && !converted_type) throw new Error('mixed types not supported')
+      converted_type = 'UTF8'
+    }
     else if (value instanceof Date) {
       valueType = 'INT64'
       // make sure they are all dates
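To illustrate the detection rules added in src/schema.js, a hypothetical snippet follows. The deep import path relies on the `./src/*.js` exports map added in package.json above, and the exact output shapes are inferred from the diff and the metadata test below, so treat this as a sketch:

```javascript
import { getSchemaElementForValues } from 'hyparquet-writer/src/schema.js'

// plain strings become BYTE_ARRAY tagged with converted_type UTF8
getSchemaElementForValues('str', ['a', 'b'])
// ~> { name: 'str', type: 'BYTE_ARRAY', converted_type: 'UTF8', repetition_type: 'REQUIRED' }

// Uint8Array values become BYTE_ARRAY with no converted_type (raw bytes)
getSchemaElementForValues('bytes', [Uint8Array.of(1, 2)])
// ~> { name: 'bytes', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED' }

// mixing a non-string type with strings in one column now throws:
// getSchemaElementForValues('mixed', [1, 'boom'])  // Error: mixed types not supported
```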
diff --git a/src/thrift.js b/src/thrift.js
index 0124965..ab626e9 100644
--- a/src/thrift.js
+++ b/src/thrift.js
@@ -21,7 +21,7 @@ const CompactType = {
  *
  * Expects keys named like "field_1", "field_2", etc. in ascending order.
  *
- * @import {Writer} from './types.js'
+ * @import {Writer} from '../src/types.js'
 * @param {Writer} writer
  * @param {Record<string, any>} data
  */
diff --git a/src/types.d.ts b/src/types.d.ts
index b3dabb5..f10648b 100644
--- a/src/types.d.ts
+++ b/src/types.d.ts
@@ -1,3 +1,9 @@
+import { DecodedArray, ParquetType } from "hyparquet"
+
+export interface ColumnData {
+  name: string
+  data: DecodedArray
+}
 
 export interface Writer {
   buffer: ArrayBuffer
diff --git a/src/write.js b/src/write.js
index 93e5587..f4a146e 100644
--- a/src/write.js
+++ b/src/write.js
@@ -7,17 +7,17 @@ import { getSchemaElementForValues } from './schema.js'
  * Write data as parquet to an ArrayBuffer
  *
  * @import {ColumnChunk, DecodedArray, FileMetaData, SchemaElement, SchemaTree} from 'hyparquet'
- * @param {Record<string, DecodedArray>} columnData
+ * @import {ColumnData} from '../src/types.js'
+ * @param {ColumnData[]} columnData
  * @returns {ArrayBuffer}
  */
 export function parquetWrite(columnData) {
   const writer = new Writer()
 
   // Check if all columns have the same length
-  const columnNames = Object.keys(columnData)
-  const num_rows = columnNames.length ? BigInt(columnData[columnNames[0]].length) : 0n
-  for (const name of columnNames) {
-    if (BigInt(columnData[name].length) !== num_rows) {
+  const num_rows = columnData.length ? BigInt(columnData[0].data.length) : 0n
+  for (const { data } of columnData) {
+    if (BigInt(data.length) !== num_rows) {
       throw new Error('parquetWrite: all columns must have the same length')
     }
   }
@@ -29,7 +29,7 @@ export function parquetWrite(columnData) {
   /** @type {SchemaElement[]} */
   const schema = [{
     name: 'root',
-    num_children: columnNames.length,
+    num_children: columnData.length,
   }]
 
   // row group columns
@@ -37,9 +37,9 @@ export function parquetWrite(columnData) {
   /** @type {ColumnChunk[]} */
   const columns = []
 
   // Write columns
-  for (const name of columnNames) {
-    const values = columnData[name]
-    const schemaElement = getSchemaElementForValues(name, values)
+  for (const { name, data } of columnData) {
+    // auto-detect type
+    const schemaElement = getSchemaElementForValues(name, data)
     if (!schemaElement.type) throw new Error(`column ${name} cannot determine type`)
     const file_offset = BigInt(writer.offset)
     /** @type {SchemaElement[]} */
@@ -47,7 +47,7 @@ export function parquetWrite(columnData) {
       schema[0],
       schemaElement,
     ]
-    const meta_data = writeColumn(writer, schemaPath, values)
+    const meta_data = writeColumn(writer, schemaPath, data)
 
     // save metadata
     schema.push(schemaElement)
diff --git a/src/writer.js b/src/writer.js
index e0e1749..b83d593 100644
--- a/src/writer.js
+++ b/src/writer.js
@@ -2,8 +2,7 @@
 /**
  * Self-expanding buffer view
  *
- * @import {Writer} from './types.js'
- * @returns {Writer}
+ * @returns {import('../src/types.js').Writer}
  */
 export function Writer() {
   this.buffer = new ArrayBuffer(1024)
diff --git a/test/metadata.test.js b/test/metadata.test.js
index 582c1d5..fa224e3 100644
--- a/test/metadata.test.js
+++ b/test/metadata.test.js
@@ -16,7 +16,7 @@ export const exampleMetadata = {
     { name: 'int', type: 'INT32', repetition_type: 'REQUIRED' },
     { name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' },
     { name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' },
-    { name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED' },
+    { name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED', converted_type: 'UTF8' },
     { name: 'nullable', type: 'BOOLEAN', repetition_type: 'OPTIONAL' },
   ],
   num_rows: 4n,
@@ -110,7 +110,7 @@ export const exampleMetadata = {
     total_byte_size: 236n,
     num_rows: 4n,
   }],
-  metadata_length: 336,
+  metadata_length: 338,
 }
 
 describe('writeMetadata', () => {
diff --git a/test/plain.test.js b/test/plain.test.js
index 0e9b0c7..78113f7 100644
--- a/test/plain.test.js
+++ b/test/plain.test.js
@@ -62,7 +62,10 @@ describe('writePlain', () => {
   it('writes BYTE_ARRAY', () => {
     const writer = new Writer()
     const strings = ['a', 'b', 'c', 'd']
-    writePlain(writer, strings, 'BYTE_ARRAY')
+    // strings must be pre-converted to Uint8Array
+    const encoder = new TextEncoder()
+    const bytes = strings.map(s => encoder.encode(s))
+    writePlain(writer, bytes, 'BYTE_ARRAY')
 
     let offset = 0
     for (const s of strings) {
diff --git a/test/write.test.js b/test/write.test.js
index 1afd009..c973b61 100644
--- a/test/write.test.js
+++ b/test/write.test.js
@@ -6,32 +6,33 @@ import { exampleMetadata } from './metadata.test.js'
 /**
  * Utility to encode a parquet file and then read it back into a JS object.
  *
- * @param {Record<string, any[]>} columnData
+ * @import {ColumnData} from '../src/types.js'
+ * @param {ColumnData[]} columnData
  * @returns {Promise<Record<string, any>[]>}
  */
 async function roundTripDeserialize(columnData) {
   const file = parquetWrite(columnData)
-  return await parquetReadObjects({ file })
+  return await parquetReadObjects({ file, utf8: false })
 }
 
-const data = {
-  bool: [true, false, true, false], // BOOLEAN
-  int: [0, 127, 0x7fff, 0x7fffffff], // INT32
-  bigint: [0n, 127n, 0x7fffn, 0x7fffffffffffffffn], // INT64
-  double: [0, 0.0001, 123.456, 1e100], // DOUBLE
-  string: ['a', 'b', 'c', 'd'], // BYTE_ARRAY
-  nullable: [true, false, null, null], // BOOLEAN nullable
-}
+const basicData = [
+  { name: 'bool', data: [true, false, true, false] },
+  { name: 'int', data: [0, 127, 0x7fff, 0x7fffffff] },
+  { name: 'bigint', data: [0n, 127n, 0x7fffn, 0x7fffffffffffffffn] },
+  { name: 'double', data: [0, 0.0001, 123.456, 1e100] },
+  { name: 'string', data: ['a', 'b', 'c', 'd'] },
+  { name: 'nullable', data: [true, false, null, null] },
+]
 
 describe('parquetWrite', () => {
   it('writes expected metadata', () => {
-    const file = parquetWrite(data)
+    const file = parquetWrite(basicData)
     const metadata = parquetMetadata(file)
     expect(metadata).toEqual(exampleMetadata)
   })
 
   it('serializes basic types', async () => {
-    const result = await roundTripDeserialize(data)
+    const result = await roundTripDeserialize(basicData)
     expect(result).toEqual([
       { bool: true, int: 0, bigint: 0n, double: 0, string: 'a', nullable: true },
       { bool: false, int: 127, bigint: 127n, double: 0.0001, string: 'b', nullable: false },
@@ -46,16 +47,17 @@ it('writes efficient sparse booleans', async () => {
     const bool = new Array(10000).fill(null)
     bool[100] = false
     bool[500] = true
     bool[9999] = false
-    const buffer = parquetWrite({ bool })
+    const buffer = parquetWrite([{ name: 'bool', data: bool }])
     expect(buffer.byteLength).toBe(1399)
     const metadata = parquetMetadata(buffer)
     expect(metadata.metadata_length).toBe(89)
   })
 
   it('serializes list types', async () => {
-    const result = await roundTripDeserialize({
-      list: [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
-    })
+    const result = await roundTripDeserialize([{
+      name: 'list',
+      data: [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
+    }])
     expect(result).toEqual([
       { list: [1, 2, 3] },
       { list: [4, 5, 6] },
@@ -65,9 +67,10 @@
   })
 
   it('serializes object types', async () => {
-    const result = await roundTripDeserialize({
-      obj: [{ a: 1, b: 2 }, { a: 3, b: 4 }, { a: 5, b: 6 }, { a: 7, b: 8 }],
-    })
+    const result = await roundTripDeserialize([{
+      name: 'obj',
+      data: [{ a: 1, b: 2 }, { a: 3, b: 4 }, { a: 5, b: 6 }, { a: 7, b: 8 }],
+    }])
     expect(result).toEqual([
       { obj: { a: 1, b: 2 } },
       { obj: { a: 3, b: 4 } },
@@ -77,9 +80,10 @@
   })
 
   it('serializes date types', async () => {
-    const result = await roundTripDeserialize({
-      date: [new Date(0), new Date(100000), new Date(200000), new Date(300000)],
-    })
+    const result = await roundTripDeserialize([{
+      name: 'date',
+      data: [new Date(0), new Date(100000), new Date(200000), new Date(300000)],
+    }])
     expect(result).toEqual([
       { date: new Date(0) },
       { date: new Date(100000) },
@@ -88,8 +92,21 @@
     ])
   })
 
+  it('serializes byte array types', async () => {
+    const result = await roundTripDeserialize([{
+      name: 'bytes',
+      data: [Uint8Array.of(1, 2, 3), Uint8Array.of(4, 5, 6), Uint8Array.of(7, 8, 9), Uint8Array.of(10, 11, 12)],
+    }])
+    expect(result).toEqual([
+      { bytes: Uint8Array.of(1, 2, 3) },
+      { bytes: Uint8Array.of(4, 5, 6) },
+      { bytes: Uint8Array.of(7, 8, 9) },
+      { bytes: Uint8Array.of(10, 11, 12) },
+    ])
+  })
+
   it('throws for mixed types', () => {
-    expect(() => parquetWrite({ mixed: [1, 2, 3, 'boom'] }))
-      .toThrow('parquet cannot write mixed types: INT32 and BYTE_ARRAY')
+    expect(() => parquetWrite([{ name: 'mixed', data: [1, 2, 3, 'boom'] }]))
+      .toThrow('mixed types not supported')
   })
 })
diff --git a/tsconfig.build.json b/tsconfig.build.json
new file mode 100644
index 0000000..51c713e
--- /dev/null
+++ b/tsconfig.build.json
@@ -0,0 +1,11 @@
+{
+  "extends": "./tsconfig.json",
+  "compilerOptions": {
+    "noEmit": false,
+    "declaration": true,
+    "emitDeclarationOnly": true,
+    "outDir": "types",
+    "declarationMap": true
+  },
+  "include": ["src"]
+}
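Taken together, the patch changes how binary columns are handled. A hedged summary sketch (illustration only, not part of the patch; the error strings are quoted from plain.js and schema.js above):

```javascript
import { parquetWrite } from 'hyparquet-writer'

// raw bytes must now be passed as Uint8Array and are written as plain BYTE_ARRAY;
// anything else reaching the plain writer throws 'BYTE_ARRAY must be Uint8Array'
const withBytes = parquetWrite([
  { name: 'payload', data: [Uint8Array.of(1, 2), Uint8Array.of(3, 4)] },
])

// strings are auto-tagged converted_type UTF8 and encoded via TextEncoder in unconvert
const withStrings = parquetWrite([
  { name: 'label', data: ['a', 'b'] },
])

// mixing numbers and strings in one column throws 'mixed types not supported':
// parquetWrite([{ name: 'bad', data: [1, 'boom'] }])
```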