From de03425587cfd6043c33b8ba93876c240b528a66 Mon Sep 17 00:00:00 2001
From: Kenny Daniel
Date: Thu, 17 Apr 2025 01:09:43 -0700
Subject: [PATCH] Don't write file_path, DuckDB doesn't like it

The parquet spec uses ColumnChunk.file_path to point at an external
file holding the column data. Writing the column name there misleads
readers that honor the field, such as DuckDB, so stop emitting it and
let readers assume the data lives in the same file.
---
 src/parquet-writer.js     |  3 +--
 test/example.js           |  9 +--------
 test/metadata.test.js     |  2 +-
 test/write.buffer.test.js | 14 +++++++-------
 4 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/src/parquet-writer.js b/src/parquet-writer.js
index ed198db..3fbede6 100644
--- a/src/parquet-writer.js
+++ b/src/parquet-writer.js
@@ -48,7 +48,7 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
 
   // write columns
   for (let j = 0; j < columnData.length; j++) {
-    const { name, data } = columnData[j]
+    const { data } = columnData[j]
     const schemaPath = [this.schema[0], this.schema[j + 1]]
     const groupData = data.slice(groupStartIndex, groupStartIndex + groupSize)
     const file_offset = BigInt(this.writer.offset)
@@ -56,7 +56,6 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
 
     // save column chunk metadata
     columns.push({
-      file_path: name,
       file_offset,
       meta_data,
     })
diff --git a/test/example.js b/test/example.js
index 4433488..866aadc 100644
--- a/test/example.js
+++ b/test/example.js
@@ -31,7 +31,6 @@ export const exampleMetadata = {
   row_groups: [{
     columns: [
       {
-        file_path: 'bool',
         file_offset: 4n,
         meta_data: {
           type: 'BOOLEAN',
@@ -50,7 +49,6 @@ export const exampleMetadata = {
         },
       },
       {
-        file_path: 'int',
         file_offset: 28n,
         meta_data: {
           type: 'INT32',
@@ -69,7 +67,6 @@ export const exampleMetadata = {
         },
       },
       {
-        file_path: 'bigint',
         file_offset: 67n,
         meta_data: {
           type: 'INT64',
@@ -88,7 +85,6 @@ export const exampleMetadata = {
         },
       },
       {
-        file_path: 'float',
         file_offset: 110n,
         meta_data: {
           type: 'FLOAT',
@@ -107,7 +103,6 @@ export const exampleMetadata = {
         },
       },
       {
-        file_path: 'double',
         file_offset: 149n,
         meta_data: {
           type: 'DOUBLE',
@@ -126,7 +121,6 @@ export const exampleMetadata = {
         },
       },
       {
-        file_path: 'string',
         file_offset: 200n,
         meta_data: {
           type: 'BYTE_ARRAY',
@@ -145,7 +139,6 @@ export const exampleMetadata = {
         },
      },
       {
-        file_path: 'nullable',
         file_offset: 242n,
         meta_data: {
           type: 'BOOLEAN',
@@ -167,5 +160,5 @@ export const exampleMetadata = {
     total_byte_size: 264n,
     num_rows: 4n,
   }],
-  metadata_length: 497,
+  metadata_length: 445,
 }
diff --git a/test/metadata.test.js b/test/metadata.test.js
index 527b493..9f4d009 100644
--- a/test/metadata.test.js
+++ b/test/metadata.test.js
@@ -24,7 +24,7 @@ describe('writeMetadata', () => {
         { key: 'key1', value: 'value1' },
         { key: 'key2', value: 'value2' },
       ],
-      metadata_length: 529,
+      metadata_length: 477,
     }
 
     writeMetadata(writer, withKvMetadata)
diff --git a/test/write.buffer.test.js b/test/write.buffer.test.js
index c7b4864..c2f600e 100644
--- a/test/write.buffer.test.js
+++ b/test/write.buffer.test.js
@@ -39,9 +39,9 @@ describe('parquetWriteBuffer', () => {
     bool[500] = true
     bool[9999] = false
     const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data: bool }] })
-    expect(file.byteLength).toBe(160)
+    expect(file.byteLength).toBe(154)
     const metadata = parquetMetadata(file)
-    expect(metadata.metadata_length).toBe(98)
+    expect(metadata.metadata_length).toBe(92)
     const result = await parquetReadObjects({ file })
     expect(result.length).toBe(10000)
     expect(result[0]).toEqual({ bool: null })
@@ -55,14 +55,14 @@ describe('parquetWriteBuffer', () => {
   it('efficiently serializes long string', () => {
     const str = 'a'.repeat(10000)
     const file = parquetWriteBuffer({ columnData: [{ name: 'string', data: [str] }] })
-    expect(file.byteLength).toBe(646)
+    expect(file.byteLength).toBe(638)
   })
 
   it('less efficiently serializes string without compression', () => {
     const str = 'a'.repeat(10000)
     const columnData = [{ name: 'string', data: [str] }]
     const file = parquetWriteBuffer({ columnData, compressed: false })
-    expect(file.byteLength).toBe(10176)
+    expect(file.byteLength).toBe(10168)
   })
 
   it('efficiently serializes column with few distinct values', async () => {
@@ -69,8 +69,8 @@ describe('parquetWriteBuffer', () => {
     const data = new Array(100000)
       .fill('aaaa', 0, 50000)
       .fill('bbbb', 50000, 100000)
     const file = parquetWriteBuffer({ columnData: [{ name: 'string', data }], statistics: false })
-    expect(file.byteLength).toBe(178)
+    expect(file.byteLength).toBe(170)
     // round trip
     const result = await parquetReadObjects({ file })
     expect(result.length).toBe(100000)
@@ -81,8 +81,8 @@ describe('parquetWriteBuffer', () => {
   it('writes statistics when enabled', () => {
     const withStats = parquetWriteBuffer({ columnData: exampleData, statistics: true })
     const noStats = parquetWriteBuffer({ columnData: exampleData, statistics: false })
-    expect(withStats.byteLength).toBe(773)
-    expect(noStats.byteLength).toBe(663)
+    expect(withStats.byteLength).toBe(721)
+    expect(noStats.byteLength).toBe(611)
   })
 
   it('serializes list types', async () => {