From bfb1d74bf8a0d24eafc533922bea3089f5441d10 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sun, 26 Oct 2025 14:30:49 -0700 Subject: [PATCH] Geospatial stats (#13) --- README.md | 10 ++- package.json | 8 +- src/column.js | 25 +++--- src/geospatial.js | 149 ++++++++++++++++++++++++++++++++++ src/metadata.js | 13 +++ src/wkb.js | 17 +--- test/geospatial.test.js | 93 +++++++++++++++++++++ test/metadata.test.js | 88 ++++++++++++++++++++ test/write.geospatial.test.js | 61 ++++++++++++++ 9 files changed, 430 insertions(+), 34 deletions(-) create mode 100644 src/geospatial.js create mode 100644 test/geospatial.test.js create mode 100644 test/write.geospatial.test.js diff --git a/README.md b/README.md index 3f682ea..e4b427d 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![minzipped](https://img.shields.io/bundlephobia/minzip/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer) [![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions) [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) -![coverage](https://img.shields.io/badge/Coverage-96-darkred) +![coverage](https://img.shields.io/badge/Coverage-94-darkred) [![dependencies](https://img.shields.io/badge/Dependencies-1-blueviolet)](https://www.npmjs.com/package/hyparquet-writer?activeTab=dependencies) Hyparquet Writer is a JavaScript library for writing [Apache Parquet](https://parquet.apache.org) files. It is designed to be lightweight, fast and store data very efficiently. It is a companion to the [hyparquet](https://github.com/hyparam/hyparquet) library, which is a JavaScript library for reading parquet files. @@ -30,8 +30,10 @@ const arrayBuffer = parquetWriteBuffer({ }) ``` -Note: if `type` is not provided, the type will be guessed from the data. The supported types are a superset of the parquet types: +Note: if `type` is not provided, the type will be guessed from the data. The supported `BasicType` are a superset of the parquet primitive types: +| Basic Type | Equivalent Schema Element | +|------|----------------| | `BOOLEAN` | `{ type: 'BOOLEAN' }` | | `INT32` | `{ type: 'INT32' }` | | `INT64` | `{ type: 'INT64' }` | @@ -43,10 +45,12 @@ Note: if `type` is not provided, the type will be guessed from the data. The sup | `TIMESTAMP` | `{ type: 'INT64', converted_type: 'TIMESTAMP_MILLIS' }` | | `UUID` | `{ type: 'FIXED_LEN_BYTE_ARRAY', type_length: 16, logical_type: { type: 'UUID' } }` | | `FLOAT16` | `{ type: 'FIXED_LEN_BYTE_ARRAY', type_length: 2, logical_type: { type: 'FLOAT16' } }` | +| `GEOMETRY` | `{ type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' } }` | +| `GEOGRAPHY` | `{ type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' } }` | More types are supported but require defining the `schema` explicitly. See the [advanced usage](#advanced-usage) section for more details. -### Node.js Write to Local Parquet File +### Write to Local Parquet File (nodejs) To write a local parquet file in node.js use `parquetWriteFile` with arguments `filename` and `columnData`: diff --git a/package.json b/package.json index f5f31e9..4c9c76c 100644 --- a/package.json +++ b/package.json @@ -52,15 +52,15 @@ "test": "vitest run" }, "dependencies": { - "hyparquet": "1.20.0" + "hyparquet": "1.20.1" }, "devDependencies": { "@babel/eslint-parser": "7.28.5", "@types/node": "24.9.1", - "@vitest/coverage-v8": "4.0.2", + "@vitest/coverage-v8": "4.0.3", "eslint": "9.38.0", - "eslint-plugin-jsdoc": "61.1.7", + "eslint-plugin-jsdoc": "61.1.9", "typescript": "5.9.3", - "vitest": "4.0.2" + "vitest": "4.0.3" } } diff --git a/src/column.js b/src/column.js index 6a7a483..41dcecf 100644 --- a/src/column.js +++ b/src/column.js @@ -1,6 +1,7 @@ import { ByteWriter } from './bytewriter.js' import { writeDataPageV2, writePageHeader } from './datapage.js' import { encodeListValues } from './dremel.js' +import { geospatialStatistics } from './geospatial.js' import { writePlain } from './plain.js' import { snappyCompress } from './snappy.js' import { unconvert } from './unconvert.js' @@ -32,8 +33,11 @@ export function writeColumn(writer, column, values, stats) { /** @type {Encoding[]} */ const encodings = [] + const isGeospatial = element?.logical_type?.type === 'GEOMETRY' || element?.logical_type?.type === 'GEOGRAPHY' + // Compute statistics - const statistics = stats ? getStatistics(values, element) : undefined + const statistics = stats ? getStatistics(values) : undefined + const geospatial_statistics = stats && isGeospatial ? geospatialStatistics(values) : undefined // dictionary encoding let dictionary_page_offset @@ -80,6 +84,7 @@ export function writeColumn(writer, column, values, stats) { data_page_offset, dictionary_page_offset, statistics, + geospatial_statistics, } } @@ -135,14 +140,9 @@ function writeDictionaryPage(writer, column, dictionary) { * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet' * @import {ColumnEncoder, ListValues, Writer} from '../src/types.js' * @param {DecodedArray} values - * @param {SchemaElement} element - * @returns {Statistics | undefined} + * @returns {Statistics} */ -function getStatistics(values, element) { - const ltype = element?.logical_type?.type - const isGeospatial = ltype === 'GEOMETRY' || ltype === 'GEOGRAPHY' - if (isGeospatial) return - +function getStatistics(values) { let min_value = undefined let max_value = undefined let null_count = 0n @@ -151,12 +151,9 @@ function getStatistics(values, element) { null_count++ continue } - if (min_value === undefined || value < min_value) { - min_value = value - } - if (max_value === undefined || value > max_value) { - max_value = value - } + if (typeof value === 'object') continue // skip objects + if (min_value === undefined || value < min_value) min_value = value + if (max_value === undefined || value > max_value) max_value = value } return { min_value, max_value, null_count } } diff --git a/src/geospatial.js b/src/geospatial.js new file mode 100644 index 0000000..c522929 --- /dev/null +++ b/src/geospatial.js @@ -0,0 +1,149 @@ +/** + * Compute geospatial statistics for GEOMETRY and GEOGRAPHY columns. + * + * @import {BoundingBox, DecodedArray, Geometry, GeospatialStatistics} from 'hyparquet/src/types.js' + * @param {DecodedArray} values + * @returns {GeospatialStatistics | undefined} + */ +export function geospatialStatistics(values) { + /** @type {Set} */ + const typeCodes = new Set() + /** @type {BoundingBox | undefined} */ + let bbox + + for (const value of values) { + if (value === null || value === undefined) continue + if (typeof value !== 'object') { + throw new Error('geospatial column expects GeoJSON geometries') + } + bbox = extendBoundsFromGeometry(bbox, value) + typeCodes.add(geometryTypeCodeWithDimension(value)) + } + + if (typeCodes.size || bbox) { + return { + bbox, + // Geospatial type codes of all instances, or an empty list if not known + geospatial_types: typeCodes.size ? Array.from(typeCodes).sort((a, b) => a - b) : [], + } + } +} + +/** + * @param {BoundingBox | undefined} bbox + * @param {Geometry} geometry + * @returns {BoundingBox | undefined} + */ +function extendBoundsFromGeometry(bbox, geometry) { + if (geometry.type === 'GeometryCollection') { + for (const child of geometry.geometries || []) { + bbox = extendBoundsFromGeometry(bbox, child) + } + return bbox + } + return extendBoundsFromCoordinates(bbox, geometry.coordinates) +} + +/** + * @param {BoundingBox | undefined} bbox + * @param {any[]} coordinates + * @returns {BoundingBox | undefined} + */ +function extendBoundsFromCoordinates(bbox, coordinates) { + if (typeof coordinates[0] === 'number') { + return grow(bbox, coordinates) + } + for (const child of coordinates) { + bbox = extendBoundsFromCoordinates(bbox, child) + } + return bbox +} + +/** + * Initialize or expand bbox with a single position [x,y,(z),(m)]. + * @param {BoundingBox | undefined} bbox + * @param {number[]} position + * @returns {BoundingBox | undefined} + */ +function grow(bbox, position) { + const x = position[0] + const y = position[1] + if (!Number.isFinite(x) || !Number.isFinite(y)) return bbox + + if (!bbox) { + bbox = { xmin: x, ymin: y, xmax: x, ymax: y } + } else { + updateAxis(bbox, 'xmin', 'xmax', x) + updateAxis(bbox, 'ymin', 'ymax', y) + } + + if (position.length > 2) updateAxis(bbox, 'zmin', 'zmax', position[2]) + if (position.length > 3) updateAxis(bbox, 'mmin', 'mmax', position[3]) + return bbox +} + +/** + * @param {BoundingBox} bbox + * @param {'xmin' | 'ymin' | 'zmin' | 'mmin'} minKey + * @param {'xmax' | 'ymax' | 'zmax' | 'mmax'} maxKey + * @param {number | undefined} value + */ +function updateAxis(bbox, minKey, maxKey, value) { + if (value === undefined || !Number.isFinite(value)) return + if (bbox[minKey] === undefined || value < bbox[minKey]) bbox[minKey] = value + if (bbox[maxKey] === undefined || value > bbox[maxKey]) bbox[maxKey] = value +} + +/** + * @param {Geometry} geometry + * @returns {number} + */ +function geometryTypeCodeWithDimension(geometry) { + const base = geometryTypeCodes[geometry.type] + if (base === undefined) throw new Error(`unknown geometry type: ${geometry.type}`) + const dim = inferGeometryDimensions(geometry) + if (dim === 2) return base + if (dim === 3) return base + 1000 + if (dim === 4) return base + 3000 + throw new Error(`unsupported geometry dimensions: ${dim}`) +} + +const geometryTypeCodes = { + Point: 1, + LineString: 2, + Polygon: 3, + MultiPoint: 4, + MultiLineString: 5, + MultiPolygon: 6, + GeometryCollection: 7, +} + +/** + * Determine the maximum coordinate dimensions for the geometry. + * @param {Geometry} geometry + * @returns {number} + */ +function inferGeometryDimensions(geometry) { + if (geometry.type === 'GeometryCollection') { + let maxDim = 0 + for (const child of geometry.geometries || []) { + maxDim = Math.max(maxDim, inferGeometryDimensions(child)) + } + return maxDim || 2 + } + return inferCoordinateDimensions(geometry.coordinates) +} + +/** + * @param {any[]} value + * @returns {number} + */ +function inferCoordinateDimensions(value) { + if (!value.length) return 2 + if (typeof value[0] === 'number') return value.length + let maxDim = 0 + for (const item of value) { + maxDim = Math.max(maxDim, inferCoordinateDimensions(item)) + } + return maxDim || 2 +} diff --git a/src/metadata.js b/src/metadata.js index 7083451..11cc8df 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -61,6 +61,19 @@ export function writeMetadata(writer, metadata) { field_2: c.meta_data.size_statistics.repetition_level_histogram, field_3: c.meta_data.size_statistics.definition_level_histogram, }, + field_17: c.meta_data.geospatial_statistics && { + field_1: c.meta_data.geospatial_statistics.bbox && { + field_1: c.meta_data.geospatial_statistics.bbox.xmin, + field_2: c.meta_data.geospatial_statistics.bbox.xmax, + field_3: c.meta_data.geospatial_statistics.bbox.ymin, + field_4: c.meta_data.geospatial_statistics.bbox.ymax, + field_5: c.meta_data.geospatial_statistics.bbox.zmin, + field_6: c.meta_data.geospatial_statistics.bbox.zmax, + field_7: c.meta_data.geospatial_statistics.bbox.mmin, + field_8: c.meta_data.geospatial_statistics.bbox.mmax, + }, + field_2: c.meta_data.geospatial_statistics.geospatial_types, + }, }, field_4: c.offset_index_offset, field_5: c.offset_index_length, diff --git a/src/wkb.js b/src/wkb.js index 7019624..a6e3948 100644 --- a/src/wkb.js +++ b/src/wkb.js @@ -35,7 +35,10 @@ function writeGeometry(writer, geometry) { } else if (geometry.type === 'LineString') { writeLine(writer, geometry.coordinates, dim) } else if (geometry.type === 'Polygon') { - writePolygon(writer, geometry.coordinates, dim) + writer.appendUint32(geometry.coordinates.length) + for (const ring of geometry.coordinates) { + writeLine(writer, ring, dim) + } } else if (geometry.type === 'MultiPoint') { writer.appendUint32(geometry.coordinates.length) for (const coordinates of geometry.coordinates) { @@ -87,18 +90,6 @@ function writeLine(writer, coordinates, dim) { } } -/** - * @param {ByteWriter} writer - * @param {Position[][]} rings - * @param {number} dimensions - */ -function writePolygon(writer, rings, dimensions) { - writer.appendUint32(rings.length) - for (const ring of rings) { - writeLine(writer, ring, dimensions) - } -} - /** * @param {Geometry['type']} type * @returns {number} diff --git a/test/geospatial.test.js b/test/geospatial.test.js new file mode 100644 index 0000000..3987fbb --- /dev/null +++ b/test/geospatial.test.js @@ -0,0 +1,93 @@ +import { describe, expect, it } from 'vitest' +import { geospatialStatistics } from '../src/geospatial.js' + +describe('geospatialStatistics', () => { + it('computes bounding boxes and geospatial type codes for nested inputs', () => { + const result = geospatialStatistics([ + null, + undefined, + { type: 'Point', coordinates: [1, 2] }, + { + type: 'LineString', + coordinates: [ + [5, -1, 10], + [0, 3, -5], + [2, 2, undefined], + [6, 1, Infinity], + ], + }, + { + type: 'Polygon', + coordinates: [ + [ + [9, 9, 1, 5], + [9, 10, 3, 5], + [8, 9, -4, 8], + [7, 8, Infinity, Infinity], + ], + ], + }, + { + type: 'MultiPoint', + coordinates: [ + [-5, -5, 0, -10], + [4, 4, 12, undefined], + ], + }, + { type: 'MultiPolygon', coordinates: [] }, + { + type: 'MultiLineString', + coordinates: [ + [ + [ + [Infinity, 0], + ], + ], + ], + }, + { + type: 'GeometryCollection', + geometries: [ + { type: 'Point', coordinates: [2, -3, 7, 9] }, + { type: 'MultiPoint', coordinates: [[60, 10, 0, 11], [3, 6]] }, + ], + }, + { type: 'GeometryCollection', geometries: [] }, + ]) + + expect(result).toEqual({ + bbox: { + xmin: -5, + xmax: 60, + ymin: -5, + ymax: 10, + zmin: -5, + zmax: 12, + mmin: -10, + mmax: 11, + }, + geospatial_types: [1, 5, 6, 7, 1002, 3003, 3004, 3007], + }) + }) + + it('omits geospatial statistics when only null-like values are present', () => { + const result = geospatialStatistics([null, undefined, null]) + expect(result).toBeUndefined() + }) + + it('tracks type codes even when coordinates are empty', () => { + const result = geospatialStatistics([ + { type: 'Point', coordinates: [] }, + ]) + expect(result).toEqual({ + bbox: undefined, + geospatial_types: [1], + }) + }) + + it('throws on invalid value types and geometry definitions', () => { + expect(() => geospatialStatistics(['oops'])).toThrow('geospatial column expects GeoJSON geometries') + expect(() => geospatialStatistics([{ type: 'Unknown', coordinates: [] }])).toThrow('unknown geometry type: Unknown') + expect(() => geospatialStatistics([{ type: 'Point', coordinates: [0, 0, 0, 0, 0] }])).toThrow('unsupported geometry dimensions: 5') + }) +}) diff --git a/test/metadata.test.js b/test/metadata.test.js index 9f4d009..25217df 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -36,6 +36,94 @@ describe('writeMetadata', () => { expect(outputMetadata).toEqual(withKvMetadata) }) + + it('writes extended column metadata fields', () => { + const writer = new ByteWriter() + writer.appendUint32(0x31524150) + + /** @type {FileMetaData} */ + const extendedMetadata = { + version: 2, + created_by: 'hyparquet', + schema: [ + { name: 'root', num_children: 1 }, + { + name: 'geo', + type: 'BYTE_ARRAY', + repetition_type: 'REQUIRED', + logical_type: { type: 'GEOGRAPHY', crs: 'EPSG:4326', algorithm: 'KARNEY' }, + }, + ], + num_rows: 1n, + row_groups: [{ + columns: [{ + file_path: 'part-0.parquet', + file_offset: 4n, + meta_data: { + type: 'BYTE_ARRAY', + encodings: ['PLAIN', 'RLE'], + path_in_schema: [], + codec: 'SNAPPY', + num_values: 1n, + total_uncompressed_size: 10n, + total_compressed_size: 8n, + key_value_metadata: [{ key: 'chunk', value: 'value' }], + data_page_offset: 4n, + index_page_offset: 12n, + dictionary_page_offset: 20n, + statistics: { + null_count: 0n, + min_value: 'a', + max_value: 'z', + }, + encoding_stats: [{ page_type: 'DATA_PAGE', encoding: 'PLAIN', count: 1 }], + bloom_filter_offset: 30n, + bloom_filter_length: 4, + size_statistics: { + unencoded_byte_array_data_bytes: 5n, + repetition_level_histogram: [1n, 0n], + definition_level_histogram: [2n, 0n], + }, + geospatial_statistics: { + bbox: { + xmin: 0, + xmax: 10, + ymin: -5, + ymax: 5, + zmin: 1, + zmax: 2, + mmin: 3, + mmax: 4, + }, + geospatial_types: [0, 1], + }, + }, + offset_index_offset: 40n, + offset_index_length: 16, + column_index_offset: 60n, + column_index_length: 24, + encrypted_column_metadata: new Uint8Array([7, 8, 9]), + }], + total_byte_size: 64n, + num_rows: 1n, + sorting_columns: [{ + column_idx: 0, + descending: true, + nulls_first: false, + }], + file_offset: 4n, + total_compressed_size: 8n, + }], + key_value_metadata: [{ key: 'meta', value: 'data' }], + metadata_length: 223, + } + + writeMetadata(writer, extendedMetadata) + writer.appendUint32(0x31524150) + + const outputMetadata = parquetMetadata(writer.getBuffer()) + expect(outputMetadata).toEqual(extendedMetadata) + }) }) describe('logicalType', () => { diff --git a/test/write.geospatial.test.js b/test/write.geospatial.test.js new file mode 100644 index 0000000..5362f57 --- /dev/null +++ b/test/write.geospatial.test.js @@ -0,0 +1,61 @@ +import { parquetMetadata } from 'hyparquet' +import { describe, expect, it } from 'vitest' +import { parquetWriteBuffer } from '../src/index.js' + +/** + * @import {ColumnSource} from '../src/types.js' + */ + +describe('geospatial statistics', () => { + it('writes geospatial statistics into column metadata', () => { + /** @type {ColumnSource[]} */ + const columnData = [{ + name: 'geometry', + type: 'GEOMETRY', + data: [ + { type: 'Point', coordinates: [10, 5, 100, 2] }, + null, + { + type: 'LineString', + coordinates: [ + [-20, -10, 50, 5], + [40, 30, 75, -5], + ], + }, + { + type: 'GeometryCollection', + geometries: [ + { type: 'Point', coordinates: [5, 15] }, + { + type: 'MultiPoint', + coordinates: [ + [0, -5], + [60, 10], + ], + }, + ], + }, + ], + }] + + const buffer = parquetWriteBuffer({ columnData }) + const metadata = parquetMetadata(buffer) + const columnMeta = metadata.row_groups[0].columns[0].meta_data + + expect(columnMeta?.statistics).toEqual({ null_count: 1n }) + expect(columnMeta?.geospatial_statistics).toEqual({ + bbox: { + xmin: -20, + xmax: 60, + ymin: -10, + ymax: 30, + zmin: 50, + zmax: 100, + mmin: -5, + mmax: 5, + }, + // sort numerically not by string order + geospatial_types: [7, 3001, 3002], + }) + }) +})