mirror of
https://github.com/asadbek064/hyparquet-writer.git
synced 2025-12-05 23:31:54 +00:00
Move example data to test/example.js
This commit is contained in:
parent
6226d50734
commit
f6740aba3f
171
test/example.js
Normal file
171
test/example.js
Normal file
@ -0,0 +1,171 @@
|
||||
/** @type {ColumnData[]} */
|
||||
export const exampleData = [
|
||||
{ name: 'bool', data: [true, false, true, false] },
|
||||
{ name: 'int', data: [0, 127, 0x7fff, 0x7fffffff] },
|
||||
{ name: 'bigint', data: [0n, 127n, 0x7fffn, 0x7fffffffffffffffn] },
|
||||
{ name: 'float', data: [0, 0.0001, 123.456, 1e100], type: 'FLOAT', repetition_type: 'REQUIRED' },
|
||||
{ name: 'double', data: [0, 0.0001, 123.456, 1e100] },
|
||||
{ name: 'string', data: ['a', 'b', 'c', 'd'] },
|
||||
{ name: 'nullable', data: [true, false, null, null] },
|
||||
]
|
||||
|
||||
/**
|
||||
* @import {FileMetaData, LogicalType} from 'hyparquet'
|
||||
* @import {ColumnData, ThriftObject} from '../src/types.js'
|
||||
* @type {FileMetaData}
|
||||
*/
|
||||
export const exampleMetadata = {
|
||||
version: 2,
|
||||
created_by: 'hyparquet',
|
||||
schema: [
|
||||
{ name: 'root', num_children: 7 },
|
||||
{ name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' },
|
||||
{ name: 'int', type: 'INT32', repetition_type: 'REQUIRED' },
|
||||
{ name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' },
|
||||
{ name: 'float', type: 'FLOAT', repetition_type: 'REQUIRED' },
|
||||
{ name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' },
|
||||
{ name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED', converted_type: 'UTF8' },
|
||||
{ name: 'nullable', type: 'BOOLEAN', repetition_type: 'OPTIONAL' },
|
||||
],
|
||||
num_rows: 4n,
|
||||
row_groups: [{
|
||||
columns: [
|
||||
{
|
||||
file_path: 'bool',
|
||||
file_offset: 4n,
|
||||
meta_data: {
|
||||
type: 'BOOLEAN',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['bool'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 24n,
|
||||
total_compressed_size: 24n,
|
||||
data_page_offset: 4n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: false,
|
||||
max_value: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'int',
|
||||
file_offset: 28n,
|
||||
meta_data: {
|
||||
type: 'INT32',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['int'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 39n,
|
||||
total_compressed_size: 39n,
|
||||
data_page_offset: 28n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 0,
|
||||
max_value: 0x7fffffff,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'bigint',
|
||||
file_offset: 67n,
|
||||
meta_data: {
|
||||
type: 'INT64',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['bigint'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 43n,
|
||||
total_compressed_size: 43n,
|
||||
data_page_offset: 67n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 0n,
|
||||
max_value: 0x7fffffffffffffffn,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'float',
|
||||
file_offset: 110n,
|
||||
meta_data: {
|
||||
type: 'FLOAT',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['float'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 39n,
|
||||
total_compressed_size: 39n,
|
||||
data_page_offset: 110n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 0,
|
||||
max_value: Infinity,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'double',
|
||||
file_offset: 149n,
|
||||
meta_data: {
|
||||
type: 'DOUBLE',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['double'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 51n,
|
||||
total_compressed_size: 51n,
|
||||
data_page_offset: 149n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 0,
|
||||
max_value: 1e100,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'string',
|
||||
file_offset: 200n,
|
||||
meta_data: {
|
||||
type: 'BYTE_ARRAY',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['string'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 42n,
|
||||
total_compressed_size: 42n,
|
||||
data_page_offset: 200n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 'a',
|
||||
max_value: 'd',
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'nullable',
|
||||
file_offset: 242n,
|
||||
meta_data: {
|
||||
type: 'BOOLEAN',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['nullable'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 26n,
|
||||
total_compressed_size: 26n,
|
||||
data_page_offset: 242n,
|
||||
statistics: {
|
||||
null_count: 2n,
|
||||
min_value: false,
|
||||
max_value: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
total_byte_size: 264n,
|
||||
num_rows: 4n,
|
||||
}],
|
||||
metadata_length: 497,
|
||||
}
|
||||
@ -2,167 +2,12 @@ import { parquetMetadata } from 'hyparquet'
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { ByteWriter } from '../src/bytewriter.js'
|
||||
import { logicalType, writeMetadata } from '../src/metadata.js'
|
||||
import { exampleMetadata } from './example.js'
|
||||
|
||||
/**
|
||||
* @import {FileMetaData, LogicalType} from 'hyparquet'
|
||||
* @import {ThriftObject} from '../src/types.js'
|
||||
* @type {FileMetaData}
|
||||
*/
|
||||
export const exampleMetadata = {
|
||||
version: 2,
|
||||
created_by: 'hyparquet',
|
||||
schema: [
|
||||
{ name: 'root', num_children: 7 },
|
||||
{ name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' },
|
||||
{ name: 'int', type: 'INT32', repetition_type: 'REQUIRED' },
|
||||
{ name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' },
|
||||
{ name: 'float', type: 'FLOAT', repetition_type: 'REQUIRED' },
|
||||
{ name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' },
|
||||
{ name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED', converted_type: 'UTF8' },
|
||||
{ name: 'nullable', type: 'BOOLEAN', repetition_type: 'OPTIONAL' },
|
||||
],
|
||||
num_rows: 4n,
|
||||
row_groups: [{
|
||||
columns: [
|
||||
{
|
||||
file_path: 'bool',
|
||||
file_offset: 4n,
|
||||
meta_data: {
|
||||
type: 'BOOLEAN',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['bool'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 24n,
|
||||
total_compressed_size: 24n,
|
||||
data_page_offset: 4n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: false,
|
||||
max_value: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'int',
|
||||
file_offset: 28n,
|
||||
meta_data: {
|
||||
type: 'INT32',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['int'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 39n,
|
||||
total_compressed_size: 39n,
|
||||
data_page_offset: 28n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 0,
|
||||
max_value: 0x7fffffff,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'bigint',
|
||||
file_offset: 67n,
|
||||
meta_data: {
|
||||
type: 'INT64',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['bigint'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 43n,
|
||||
total_compressed_size: 43n,
|
||||
data_page_offset: 67n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 0n,
|
||||
max_value: 0x7fffffffffffffffn,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'float',
|
||||
file_offset: 110n,
|
||||
meta_data: {
|
||||
type: 'FLOAT',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['float'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 39n,
|
||||
total_compressed_size: 39n,
|
||||
data_page_offset: 110n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 0,
|
||||
max_value: Infinity,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'double',
|
||||
file_offset: 149n,
|
||||
meta_data: {
|
||||
type: 'DOUBLE',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['double'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 51n,
|
||||
total_compressed_size: 51n,
|
||||
data_page_offset: 149n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 0,
|
||||
max_value: 1e100,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'string',
|
||||
file_offset: 200n,
|
||||
meta_data: {
|
||||
type: 'BYTE_ARRAY',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['string'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 42n,
|
||||
total_compressed_size: 42n,
|
||||
data_page_offset: 200n,
|
||||
statistics: {
|
||||
null_count: 0n,
|
||||
min_value: 'a',
|
||||
max_value: 'd',
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'nullable',
|
||||
file_offset: 242n,
|
||||
meta_data: {
|
||||
type: 'BOOLEAN',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['nullable'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 26n,
|
||||
total_compressed_size: 26n,
|
||||
data_page_offset: 242n,
|
||||
statistics: {
|
||||
null_count: 2n,
|
||||
min_value: false,
|
||||
max_value: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
total_byte_size: 264n,
|
||||
num_rows: 4n,
|
||||
}],
|
||||
metadata_length: 497,
|
||||
}
|
||||
|
||||
describe('writeMetadata', () => {
|
||||
it('writes metadata and parses in hyparquet', () => {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import { parquetMetadata, parquetReadObjects } from 'hyparquet'
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetWriteBuffer } from '../src/index.js'
|
||||
import { exampleMetadata } from './metadata.test.js'
|
||||
import { exampleData, exampleMetadata } from './example.js'
|
||||
|
||||
/**
|
||||
* Utility to encode a parquet file and then read it back into a JS object.
|
||||
@ -15,26 +15,15 @@ async function roundTripDeserialize(columnData) {
|
||||
return await parquetReadObjects({ file, utf8: false })
|
||||
}
|
||||
|
||||
/** @type {ColumnData[]} */
|
||||
export const basicData = [
|
||||
{ name: 'bool', data: [true, false, true, false] },
|
||||
{ name: 'int', data: [0, 127, 0x7fff, 0x7fffffff] },
|
||||
{ name: 'bigint', data: [0n, 127n, 0x7fffn, 0x7fffffffffffffffn] },
|
||||
{ name: 'float', data: [0, 0.0001, 123.456, 1e100], type: 'FLOAT', repetition_type: 'REQUIRED' },
|
||||
{ name: 'double', data: [0, 0.0001, 123.456, 1e100] },
|
||||
{ name: 'string', data: ['a', 'b', 'c', 'd'] },
|
||||
{ name: 'nullable', data: [true, false, null, null] },
|
||||
]
|
||||
|
||||
describe('parquetWriteBuffer', () => {
|
||||
it('writes expected metadata', () => {
|
||||
const file = parquetWriteBuffer({ columnData: basicData })
|
||||
const file = parquetWriteBuffer({ columnData: exampleData })
|
||||
const metadata = parquetMetadata(file)
|
||||
expect(metadata).toEqual(exampleMetadata)
|
||||
})
|
||||
|
||||
it('serializes basic types', async () => {
|
||||
const result = await roundTripDeserialize(basicData)
|
||||
const result = await roundTripDeserialize(exampleData)
|
||||
expect(result).toEqual([
|
||||
{ bool: true, int: 0, bigint: 0n, float: 0, double: 0, string: 'a', nullable: true },
|
||||
{ bool: false, int: 127, bigint: 127n, float: 0.00009999999747378752, double: 0.0001, string: 'b', nullable: false },
|
||||
@ -90,8 +79,8 @@ describe('parquetWriteBuffer', () => {
|
||||
})
|
||||
|
||||
it('writes statistics when enabled', () => {
|
||||
const withStats = parquetWriteBuffer({ columnData: basicData, statistics: true })
|
||||
const noStats = parquetWriteBuffer({ columnData: basicData, statistics: false })
|
||||
const withStats = parquetWriteBuffer({ columnData: exampleData, statistics: true })
|
||||
const noStats = parquetWriteBuffer({ columnData: exampleData, statistics: false })
|
||||
expect(withStats.byteLength).toBe(773)
|
||||
expect(noStats.byteLength).toBe(663)
|
||||
})
|
||||
|
||||
@ -1,9 +1,8 @@
|
||||
import fs from 'fs'
|
||||
import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects } from 'hyparquet'
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest'
|
||||
import { exampleMetadata } from './metadata.test.js'
|
||||
import { parquetWriteFile } from '../src/index.js'
|
||||
import { basicData } from './write.buffer.test.js'
|
||||
import { exampleData, exampleMetadata } from './example.js'
|
||||
|
||||
const filedir = 'data/'
|
||||
const filename = 'data/write.file.parquet'
|
||||
@ -24,7 +23,7 @@ describe('parquetWriteFile', () => {
|
||||
})
|
||||
|
||||
it('writes parquet file', async () => {
|
||||
parquetWriteFile({ filename, columnData: basicData })
|
||||
parquetWriteFile({ filename, columnData: exampleData })
|
||||
|
||||
// check parquet metadata
|
||||
const file = await asyncBufferFromFile(filename)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user