diff --git a/.eslintrc.json b/.eslintrc.json index 7a7655b..1269690 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -10,7 +10,7 @@ "@typescript-eslint/no-explicit-any": "warn", "@typescript-eslint/no-unused-vars": "warn", "arrow-spacing": "error", - "camelcase": "error", + "camelcase": "off", "comma-spacing": "error", "comma-dangle": ["error", "always-multiline"], "eol-last": "error", @@ -32,7 +32,7 @@ "jsdoc/check-tag-names": "error", "jsdoc/no-types": "error", "jsdoc/sort-tags": "error", - "no-constant-condition": "warn", + "no-constant-condition": "off", "no-multi-spaces": "error", "no-trailing-spaces": "error", "no-var": "error", diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f892bfb..26ab6b2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,7 @@ on: push: jobs: - hyllama: + hyparquet: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 diff --git a/package.json b/package.json index bcacc84..bbdc603 100644 --- a/package.json +++ b/package.json @@ -9,7 +9,7 @@ "license": "MIT", "repository": { "type": "git", - "url": "https://github.com/hyparam/hyllama" + "url": "https://github.com/hyparam/hyparquet" }, "main": "dist/hyparquet.js", "files": [ @@ -24,12 +24,13 @@ "test": "vitest run" }, "devDependencies": { - "@typescript-eslint/eslint-plugin": "6.16.0", - "@vitest/coverage-v8": "1.1.0", + "@types/node": "20.10.6", + "@typescript-eslint/eslint-plugin": "6.17.0", + "@vitest/coverage-v8": "1.1.1", "eslint": "8.56.0", "eslint-plugin-import": "2.29.1", - "eslint-plugin-jsdoc": "46.9.1", + "eslint-plugin-jsdoc": "48.0.2", "typescript": "5.3.3", - "vitest": "1.1.0" + "vitest": "1.1.1" } } diff --git a/src/hyparquet.ts b/src/hyparquet.ts index d53fbed..45a5869 100644 --- a/src/hyparquet.ts +++ b/src/hyparquet.ts @@ -1,4 +1,4 @@ -import { parquetMetadata } from './metadata' +import { parquetMetadata } from './metadata.js' /** * Read parquet data rows from a file diff --git a/src/metadata.ts b/src/metadata.ts 
index f6d90ec..716b662 100644 --- a/src/metadata.ts +++ b/src/metadata.ts @@ -1,12 +1,11 @@ -import { deserializeTCompactProtocol } from './thrift' -import type { FileMetaData, SchemaElement } from './types' +import { deserializeTCompactProtocol } from './thrift.js' +import type { FileMetaData, SchemaElement } from './types.ts' /** * Read parquet header, metadata, and schema information from a file * @param arrayBuffer parquet file contents * @returns metadata object */ -/* eslint-disable camelcase */ export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData { // DataView for easier manipulation of the buffer const view = new DataView(arrayBuffer) @@ -110,3 +109,25 @@ export function schemaElement(schema: SchemaElement[], name: string[]): any { } return element } + +/** + * Recursively replace bigints with numbers, copying arrays and objects; bigints beyond Number.MAX_SAFE_INTEGER lose precision. + * @param obj value to convert (bigint, array, object, null, or any other primitive) + * @returns converted copy with every bigint replaced by a number; null and other primitives are returned unchanged + */ +export function castBigInts(obj: any): any { + if (typeof obj === 'bigint') { + return Number(obj) + } else if (Array.isArray(obj)) { + return obj.map(castBigInts) + } else if (obj !== null && typeof obj === 'object') { + // typeof null is 'object', so null is excluded here and falls through to the final branch + const newObj: Record<string, unknown> = {} + for (const key of Object.keys(obj)) { + newObj[key] = castBigInts(obj[key]) + } + return newObj + } else { + return obj + } +} diff --git a/src/thrift.ts b/src/thrift.ts index def536d..a246965 100644 --- a/src/thrift.ts +++ b/src/thrift.ts @@ -1,4 +1,4 @@ -import { Decoded } from './types' +import type { Decoded } from './types.ts' // TCompactProtocol types const CompactType = { diff --git a/test/files/addrtype-missing-value.parquet b/test/files/addrtype-missing-value.parquet new file mode 100644 index 0000000..55e92be Binary files /dev/null and b/test/files/addrtype-missing-value.parquet differ diff --git a/test/metadata.test.ts b/test/metadata.test.ts new file mode 100644 index 0000000..9cd470d --- /dev/null +++ b/test/metadata.test.ts @@ -0,0 +1,67 @@ +import { promises as fs } from 'fs' +import { describe, expect, it } from 'vitest' +import { castBigInts, parquetMetadata } from '../src/metadata' + +// Helper function to
read .parquet file into ArrayBuffer +async function readFileToArrayBuffer(filePath: string): Promise<ArrayBuffer> { + const buffer = await fs.readFile(filePath) + return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) +} + +describe('parquetMetadata', () => { + it('should correctly decode metadata from addrtype-missing-value.parquet', async () => { + const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet') + const result = parquetMetadata(arrayBuffer) + + const expectedMetadata = { + version: 1, + schema: [ + { repetition_type: 0, name: 'duckdb_schema', num_children: 1 }, + { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 }, + ], + num_rows: 10, + row_groups: [ + { + columns: [ + { + file_offset: 0, + meta_data: { + type: 6, + encodings: [0, 8], + path_in_schema: ['ADDRTYPE'], + codec: 1, + num_values: 10, + total_uncompressed_size: 78, + total_compressed_size: 82, + data_page_offset: 31, + dictionary_page_offset: 4, + statistics: { + max: 'Intersection', + min: 'Block', + null_count: 1, + distinct_count: 2, + }, + }, + }, + ], + total_byte_size: 33024, + num_rows: 10, + }, + ], + created_by: 'DuckDB', + } + + const casted = castBigInts(result) + expect(casted).toEqual(expectedMetadata) + }) + + it('should throw an error for a too short file', () => { + const arrayBuffer = new ArrayBuffer(0) + expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short') + }) + + it('should throw an error for invalid magic number', () => { + const arrayBuffer = new ArrayBuffer(8) + expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number') + }) +}) diff --git a/tsconfig.json b/tsconfig.json index ac53ee9..4b50dee 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -2,6 +2,7 @@ "compilerOptions": { "declaration": true, "lib": ["esnext", "dom"], + "module": "nodenext", "outDir": "dist", "sourceMap": true, "target": "esnext",