Don't write file_path; DuckDB doesn't like it

This commit is contained in:
Kenny Daniel 2025-04-17 01:09:43 -07:00
parent 0e680e2706
commit de03425587
No known key found for this signature in database
GPG Key ID: FDF16101AF5AFD3A
4 changed files with 10 additions and 18 deletions

@@ -48,7 +48,7 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
// write columns
for (let j = 0; j < columnData.length; j++) {
const { name, data } = columnData[j]
const { data } = columnData[j]
const schemaPath = [this.schema[0], this.schema[j + 1]]
const groupData = data.slice(groupStartIndex, groupStartIndex + groupSize)
const file_offset = BigInt(this.writer.offset)
@@ -56,7 +56,6 @@ ParquetWriter.prototype.write = function({ columnData, rowGroupSize = 100000 })
// save column chunk metadata
columns.push({
file_path: name,
file_offset,
meta_data,
})

@@ -31,7 +31,6 @@ export const exampleMetadata = {
row_groups: [{
columns: [
{
file_path: 'bool',
file_offset: 4n,
meta_data: {
type: 'BOOLEAN',
@@ -50,7 +49,6 @@ export const exampleMetadata = {
},
},
{
file_path: 'int',
file_offset: 28n,
meta_data: {
type: 'INT32',
@@ -69,7 +67,6 @@ export const exampleMetadata = {
},
},
{
file_path: 'bigint',
file_offset: 67n,
meta_data: {
type: 'INT64',
@@ -88,7 +85,6 @@ export const exampleMetadata = {
},
},
{
file_path: 'float',
file_offset: 110n,
meta_data: {
type: 'FLOAT',
@@ -107,7 +103,6 @@ export const exampleMetadata = {
},
},
{
file_path: 'double',
file_offset: 149n,
meta_data: {
type: 'DOUBLE',
@@ -126,7 +121,6 @@ export const exampleMetadata = {
},
},
{
file_path: 'string',
file_offset: 200n,
meta_data: {
type: 'BYTE_ARRAY',
@@ -145,7 +139,6 @@ export const exampleMetadata = {
},
},
{
file_path: 'nullable',
file_offset: 242n,
meta_data: {
type: 'BOOLEAN',
@@ -167,5 +160,5 @@ export const exampleMetadata = {
total_byte_size: 264n,
num_rows: 4n,
}],
metadata_length: 497,
metadata_length: 445,
}

@@ -24,7 +24,7 @@ describe('writeMetadata', () => {
{ key: 'key1', value: 'value1' },
{ key: 'key2', value: 'value2' },
],
metadata_length: 529,
metadata_length: 477,
}
writeMetadata(writer, withKvMetadata)

@@ -39,9 +39,9 @@ describe('parquetWriteBuffer', () => {
bool[500] = true
bool[9999] = false
const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data: bool }] })
expect(file.byteLength).toBe(160)
expect(file.byteLength).toBe(154)
const metadata = parquetMetadata(file)
expect(metadata.metadata_length).toBe(98)
expect(metadata.metadata_length).toBe(92)
const result = await parquetReadObjects({ file })
expect(result.length).toBe(10000)
expect(result[0]).toEqual({ bool: null })
@@ -55,14 +55,14 @@ describe('parquetWriteBuffer', () => {
it('efficiently serializes long string', () => {
const str = 'a'.repeat(10000)
const file = parquetWriteBuffer({ columnData: [{ name: 'string', data: [str] }] })
expect(file.byteLength).toBe(646)
expect(file.byteLength).toBe(638)
})
it('less efficiently serializes string without compression', () => {
const str = 'a'.repeat(10000)
const columnData = [{ name: 'string', data: [str] }]
const file = parquetWriteBuffer({ columnData, compressed: false })
expect(file.byteLength).toBe(10176)
expect(file.byteLength).toBe(10168)
})
it('efficiently serializes column with few distinct values', async () => {
@@ -70,7 +70,7 @@ describe('parquetWriteBuffer', () => {
.fill('aaaa', 0, 50000)
.fill('bbbb', 50000, 100000)
const file = parquetWriteBuffer({ columnData: [{ name: 'string', data }], statistics: false })
expect(file.byteLength).toBe(178)
expect(file.byteLength).toBe(170)
// round trip
const result = await parquetReadObjects({ file })
expect(result.length).toBe(100000)
@@ -81,8 +81,8 @@ describe('parquetWriteBuffer', () => {
it('writes statistics when enabled', () => {
const withStats = parquetWriteBuffer({ columnData: exampleData, statistics: true })
const noStats = parquetWriteBuffer({ columnData: exampleData, statistics: false })
expect(withStats.byteLength).toBe(773)
expect(noStats.byteLength).toBe(663)
expect(withStats.byteLength).toBe(721)
expect(noStats.byteLength).toBe(611)
})
it('serializes list types', async () => {