Geospatial stats (#13)

This commit is contained in:
Kenny Daniel 2025-10-26 14:30:49 -07:00 committed by GitHub
parent 3a2e0203aa
commit bfb1d74bf8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 430 additions and 34 deletions

@ -6,7 +6,7 @@
[![minzipped](https://img.shields.io/bundlephobia/minzip/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer)
[![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions)
[![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)
![coverage](https://img.shields.io/badge/Coverage-96-darkred)
![coverage](https://img.shields.io/badge/Coverage-94-darkred)
[![dependencies](https://img.shields.io/badge/Dependencies-1-blueviolet)](https://www.npmjs.com/package/hyparquet-writer?activeTab=dependencies)
Hyparquet Writer is a JavaScript library for writing [Apache Parquet](https://parquet.apache.org) files. It is designed to be lightweight, fast and store data very efficiently. It is a companion to the [hyparquet](https://github.com/hyparam/hyparquet) library, which is a JavaScript library for reading parquet files.
@ -30,8 +30,10 @@ const arrayBuffer = parquetWriteBuffer({
})
```
Note: if `type` is not provided, the type will be guessed from the data. The supported types are a superset of the parquet types:
Note: if `type` is not provided, the type will be guessed from the data. The supported `BasicType` values are a superset of the parquet primitive types:
| Basic Type | Equivalent Schema Element |
|------|----------------|
| `BOOLEAN` | `{ type: 'BOOLEAN' }` |
| `INT32` | `{ type: 'INT32' }` |
| `INT64` | `{ type: 'INT64' }` |
@ -43,10 +45,12 @@ Note: if `type` is not provided, the type will be guessed from the data. The sup
| `TIMESTAMP` | `{ type: 'INT64', converted_type: 'TIMESTAMP_MILLIS' }` |
| `UUID` | `{ type: 'FIXED_LEN_BYTE_ARRAY', type_length: 16, logical_type: { type: 'UUID' } }` |
| `FLOAT16` | `{ type: 'FIXED_LEN_BYTE_ARRAY', type_length: 2, logical_type: { type: 'FLOAT16' } }` |
| `GEOMETRY` | `{ type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' } }` |
| `GEOGRAPHY` | `{ type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' } }` |
More types are supported but require defining the `schema` explicitly. See the [advanced usage](#advanced-usage) section for more details.
### Node.js Write to Local Parquet File
### Write to Local Parquet File (nodejs)
To write a local parquet file in node.js use `parquetWriteFile` with arguments `filename` and `columnData`:

@ -52,15 +52,15 @@
"test": "vitest run"
},
"dependencies": {
"hyparquet": "1.20.0"
"hyparquet": "1.20.1"
},
"devDependencies": {
"@babel/eslint-parser": "7.28.5",
"@types/node": "24.9.1",
"@vitest/coverage-v8": "4.0.2",
"@vitest/coverage-v8": "4.0.3",
"eslint": "9.38.0",
"eslint-plugin-jsdoc": "61.1.7",
"eslint-plugin-jsdoc": "61.1.9",
"typescript": "5.9.3",
"vitest": "4.0.2"
"vitest": "4.0.3"
}
}

@ -1,6 +1,7 @@
import { ByteWriter } from './bytewriter.js'
import { writeDataPageV2, writePageHeader } from './datapage.js'
import { encodeListValues } from './dremel.js'
import { geospatialStatistics } from './geospatial.js'
import { writePlain } from './plain.js'
import { snappyCompress } from './snappy.js'
import { unconvert } from './unconvert.js'
@ -32,8 +33,11 @@ export function writeColumn(writer, column, values, stats) {
/** @type {Encoding[]} */
const encodings = []
const isGeospatial = element?.logical_type?.type === 'GEOMETRY' || element?.logical_type?.type === 'GEOGRAPHY'
// Compute statistics
const statistics = stats ? getStatistics(values, element) : undefined
const statistics = stats ? getStatistics(values) : undefined
const geospatial_statistics = stats && isGeospatial ? geospatialStatistics(values) : undefined
// dictionary encoding
let dictionary_page_offset
@ -80,6 +84,7 @@ export function writeColumn(writer, column, values, stats) {
data_page_offset,
dictionary_page_offset,
statistics,
geospatial_statistics,
}
}
@ -135,14 +140,9 @@ function writeDictionaryPage(writer, column, dictionary) {
* @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
* @import {ColumnEncoder, ListValues, Writer} from '../src/types.js'
* @param {DecodedArray} values
* @param {SchemaElement} element
* @returns {Statistics | undefined}
* @returns {Statistics}
*/
function getStatistics(values, element) {
const ltype = element?.logical_type?.type
const isGeospatial = ltype === 'GEOMETRY' || ltype === 'GEOGRAPHY'
if (isGeospatial) return
function getStatistics(values) {
let min_value = undefined
let max_value = undefined
let null_count = 0n
@ -151,12 +151,9 @@ function getStatistics(values, element) {
null_count++
continue
}
if (min_value === undefined || value < min_value) {
min_value = value
}
if (max_value === undefined || value > max_value) {
max_value = value
}
if (typeof value === 'object') continue // skip objects
if (min_value === undefined || value < min_value) min_value = value
if (max_value === undefined || value > max_value) max_value = value
}
return { min_value, max_value, null_count }
}

149
src/geospatial.js Normal file

@ -0,0 +1,149 @@
/**
 * Compute geospatial statistics for GEOMETRY and GEOGRAPHY columns.
 *
 * Scans a column of GeoJSON geometry objects and accumulates the overall
 * bounding box plus the set of geometry type codes encountered (base code,
 * offset by +1000 for XYZ and +3000 for XYZM coordinates).
 *
 * @import {BoundingBox, DecodedArray, Geometry, GeospatialStatistics} from 'hyparquet/src/types.js'
 * @param {DecodedArray} values column values (GeoJSON geometries, null, or undefined)
 * @returns {GeospatialStatistics | undefined} undefined when the column holds no geometries
 */
export function geospatialStatistics(values) {
  /** @type {BoundingBox | undefined} */
  let bounds
  /** @type {Set<number>} */
  const codes = new Set()
  for (const geometry of values) {
    if (geometry === null || geometry === undefined) continue
    if (typeof geometry !== 'object') {
      throw new Error('geospatial column expects GeoJSON geometries')
    }
    bounds = extendBoundsFromGeometry(bounds, geometry)
    codes.add(geometryTypeCodeWithDimension(geometry))
  }
  // no geometries at all: omit the statistics entirely
  if (!codes.size && !bounds) return undefined
  // geospatial type codes of all instances (sorted numerically), or an empty list if not known
  const geospatial_types = [...codes].sort((a, b) => a - b)
  return { bbox: bounds, geospatial_types }
}

/**
 * Fold one geometry into a running bounding box.
 *
 * @param {BoundingBox | undefined} bbox
 * @param {Geometry} geometry
 * @returns {BoundingBox | undefined}
 */
function extendBoundsFromGeometry(bbox, geometry) {
  if (geometry.type !== 'GeometryCollection') {
    return extendBoundsFromCoordinates(bbox, geometry.coordinates)
  }
  // collections recurse into their members (list may be absent or empty)
  for (const member of geometry.geometries || []) {
    bbox = extendBoundsFromGeometry(bbox, member)
  }
  return bbox
}

/**
 * Fold an arbitrarily nested coordinate array into a running bounding box.
 *
 * @param {BoundingBox | undefined} bbox
 * @param {any[]} coordinates
 * @returns {BoundingBox | undefined}
 */
function extendBoundsFromCoordinates(bbox, coordinates) {
  // a position is a flat array of numbers; anything else nests one level deeper
  if (typeof coordinates[0] !== 'number') {
    for (const nested of coordinates) {
      bbox = extendBoundsFromCoordinates(bbox, nested)
    }
    return bbox
  }
  return grow(bbox, coordinates)
}

/**
 * Initialize or expand bbox with a single position [x,y,(z),(m)].
 * Positions with a non-finite x or y are ignored outright.
 *
 * @param {BoundingBox | undefined} bbox
 * @param {number[]} position
 * @returns {BoundingBox | undefined}
 */
function grow(bbox, position) {
  const [x, y] = position
  if (!Number.isFinite(x) || !Number.isFinite(y)) return bbox
  if (bbox) {
    updateAxis(bbox, 'xmin', 'xmax', x)
    updateAxis(bbox, 'ymin', 'ymax', y)
  } else {
    bbox = { xmin: x, ymin: y, xmax: x, ymax: y }
  }
  // z and m are optional; updateAxis ignores missing or non-finite ordinates
  updateAxis(bbox, 'zmin', 'zmax', position[2])
  updateAxis(bbox, 'mmin', 'mmax', position[3])
  return bbox
}

/**
 * Widen one min/max axis pair of the bbox in place.
 *
 * @param {BoundingBox} bbox
 * @param {'xmin' | 'ymin' | 'zmin' | 'mmin'} minKey
 * @param {'xmax' | 'ymax' | 'zmax' | 'mmax'} maxKey
 * @param {number | undefined} value
 */
function updateAxis(bbox, minKey, maxKey, value) {
  // Number.isFinite rejects undefined, NaN and ±Infinity alike
  if (!Number.isFinite(value)) return
  const lo = bbox[minKey]
  if (lo === undefined || value < lo) bbox[minKey] = value
  const hi = bbox[maxKey]
  if (hi === undefined || value > hi) bbox[maxKey] = value
}

/**
 * Geometry type code including the coordinate-dimension offset.
 *
 * @param {Geometry} geometry
 * @returns {number}
 */
function geometryTypeCodeWithDimension(geometry) {
  const base = geometryTypeCodes[geometry.type]
  if (base === undefined) throw new Error(`unknown geometry type: ${geometry.type}`)
  const dim = inferGeometryDimensions(geometry)
  switch (dim) {
  case 2: return base // XY
  case 3: return base + 1000 // XYZ
  case 4: return base + 3000 // XYZM
  default: throw new Error(`unsupported geometry dimensions: ${dim}`)
  }
}

// base geometry type codes keyed by GeoJSON type name
const geometryTypeCodes = {
  Point: 1,
  LineString: 2,
  Polygon: 3,
  MultiPoint: 4,
  MultiLineString: 5,
  MultiPolygon: 6,
  GeometryCollection: 7,
}

/**
 * Determine the maximum coordinate dimensions for the geometry.
 *
 * @param {Geometry} geometry
 * @returns {number} 2, 3 or 4 (empty collections default to 2)
 */
function inferGeometryDimensions(geometry) {
  if (geometry.type !== 'GeometryCollection') {
    return inferCoordinateDimensions(geometry.coordinates)
  }
  const dims = (geometry.geometries || []).map(inferGeometryDimensions)
  // empty collections default to plain XY
  return Math.max(0, ...dims) || 2
}

/**
 * Maximum position length found anywhere in a nested coordinate array.
 *
 * @param {any[]} value
 * @returns {number} defaults to 2 when the array is empty
 */
function inferCoordinateDimensions(value) {
  if (!value.length) return 2
  if (typeof value[0] === 'number') return value.length
  let deepest = 0
  for (const nested of value) {
    const dim = inferCoordinateDimensions(nested)
    if (dim > deepest) deepest = dim
  }
  return deepest || 2
}

@ -61,6 +61,19 @@ export function writeMetadata(writer, metadata) {
field_2: c.meta_data.size_statistics.repetition_level_histogram,
field_3: c.meta_data.size_statistics.definition_level_histogram,
},
field_17: c.meta_data.geospatial_statistics && {
field_1: c.meta_data.geospatial_statistics.bbox && {
field_1: c.meta_data.geospatial_statistics.bbox.xmin,
field_2: c.meta_data.geospatial_statistics.bbox.xmax,
field_3: c.meta_data.geospatial_statistics.bbox.ymin,
field_4: c.meta_data.geospatial_statistics.bbox.ymax,
field_5: c.meta_data.geospatial_statistics.bbox.zmin,
field_6: c.meta_data.geospatial_statistics.bbox.zmax,
field_7: c.meta_data.geospatial_statistics.bbox.mmin,
field_8: c.meta_data.geospatial_statistics.bbox.mmax,
},
field_2: c.meta_data.geospatial_statistics.geospatial_types,
},
},
field_4: c.offset_index_offset,
field_5: c.offset_index_length,

@ -35,7 +35,10 @@ function writeGeometry(writer, geometry) {
} else if (geometry.type === 'LineString') {
writeLine(writer, geometry.coordinates, dim)
} else if (geometry.type === 'Polygon') {
writePolygon(writer, geometry.coordinates, dim)
writer.appendUint32(geometry.coordinates.length)
for (const ring of geometry.coordinates) {
writeLine(writer, ring, dim)
}
} else if (geometry.type === 'MultiPoint') {
writer.appendUint32(geometry.coordinates.length)
for (const coordinates of geometry.coordinates) {
@ -87,18 +90,6 @@ function writeLine(writer, coordinates, dim) {
}
}
/**
* @param {ByteWriter} writer
* @param {Position[][]} rings
* @param {number} dimensions
*/
function writePolygon(writer, rings, dimensions) {
writer.appendUint32(rings.length)
for (const ring of rings) {
writeLine(writer, ring, dimensions)
}
}
/**
* @param {Geometry['type']} type
* @returns {number}

93
test/geospatial.test.js Normal file

@ -0,0 +1,93 @@
import { describe, expect, it } from 'vitest'
import { geospatialStatistics } from '../src/geospatial.js'
// Unit tests for geospatialStatistics: bounding-box accumulation and
// geometry type codes (base code +1000 for XYZ, +3000 for XYZM).
describe('geospatialStatistics', () => {
it('computes bounding boxes and geospatial type codes for nested inputs', () => {
const result = geospatialStatistics([
// null-like values are skipped entirely
null,
undefined,
{ type: 'Point', coordinates: [1, 2] },
{
type: 'LineString',
coordinates: [
[5, -1, 10],
[0, 3, -5],
// non-finite / missing z ordinates must not affect the z bounds
[2, 2, undefined],
[6, 1, Infinity],
],
},
{
type: 'Polygon',
coordinates: [
[
[9, 9, 1, 5],
[9, 10, 3, 5],
[8, 9, -4, 8],
// positions with non-finite z/m only contribute x/y
[7, 8, Infinity, Infinity],
],
],
},
{
type: 'MultiPoint',
coordinates: [
[-5, -5, 0, -10],
[4, 4, 12, undefined],
],
},
{ type: 'MultiPolygon', coordinates: [] },
{
type: 'MultiLineString',
coordinates: [
[
[
// a position with non-finite x is ignored for the bbox
[Infinity, 0],
],
],
],
},
{
type: 'GeometryCollection',
geometries: [
{ type: 'Point', coordinates: [2, -3, 7, 9] },
// mixed-dimension children: the collection takes the max (XYZM)
{ type: 'MultiPoint', coordinates: [[60, 10, 0, 11], [3, 6]] },
],
},
{ type: 'GeometryCollection', geometries: [] },
])
expect(result).toEqual({
bbox: {
xmin: -5,
xmax: 60,
ymin: -5,
ymax: 10,
zmin: -5,
zmax: 12,
mmin: -10,
mmax: 11,
},
// sorted numerically: 2D codes first, then +1000 (XYZ), then +3000 (XYZM)
geospatial_types: [1, 5, 6, 7, 1002, 3003, 3004, 3007],
})
})
it('omits geospatial statistics when only null-like values are present', () => {
const result = geospatialStatistics([null, undefined, null])
expect(result).toBeUndefined()
})
it('tracks type codes even when coordinates are empty', () => {
const result = geospatialStatistics([
{ type: 'Point', coordinates: [] },
])
// bbox stays undefined because no positions were seen
expect(result).toEqual({
bbox: undefined,
geospatial_types: [1],
})
})
it('throws on invalid value types and geometry definitions', () => {
expect(() => geospatialStatistics(['oops'])).toThrow('geospatial column expects GeoJSON geometries')
expect(() => geospatialStatistics([{ type: 'Unknown', coordinates: [] }])).toThrow('unknown geometry type: Unknown')
expect(() => geospatialStatistics([{ type: 'Point', coordinates: [0, 0, 0, 0, 0] }])).toThrow('unsupported geometry dimensions: 5')
})
})

@ -36,6 +36,94 @@ describe('writeMetadata', () => {
expect(outputMetadata).toEqual(withKvMetadata)
})
it('writes extended column metadata fields', () => {
const writer = new ByteWriter()
// 0x31524150 is the little-endian 'PAR1' magic framing the footer
writer.appendUint32(0x31524150)
/** @type {FileMetaData} */
const extendedMetadata = {
version: 2,
created_by: 'hyparquet',
schema: [
{ name: 'root', num_children: 1 },
{
name: 'geo',
type: 'BYTE_ARRAY',
repetition_type: 'REQUIRED',
logical_type: { type: 'GEOGRAPHY', crs: 'EPSG:4326', algorithm: 'KARNEY' },
},
],
num_rows: 1n,
row_groups: [{
columns: [{
file_path: 'part-0.parquet',
file_offset: 4n,
// exercises every optional ColumnMetaData field, including the new
// geospatial_statistics (bbox + geospatial_types)
meta_data: {
type: 'BYTE_ARRAY',
encodings: ['PLAIN', 'RLE'],
path_in_schema: [],
codec: 'SNAPPY',
num_values: 1n,
total_uncompressed_size: 10n,
total_compressed_size: 8n,
key_value_metadata: [{ key: 'chunk', value: 'value' }],
data_page_offset: 4n,
index_page_offset: 12n,
dictionary_page_offset: 20n,
statistics: {
null_count: 0n,
min_value: 'a',
max_value: 'z',
},
encoding_stats: [{ page_type: 'DATA_PAGE', encoding: 'PLAIN', count: 1 }],
bloom_filter_offset: 30n,
bloom_filter_length: 4,
size_statistics: {
unencoded_byte_array_data_bytes: 5n,
repetition_level_histogram: [1n, 0n],
definition_level_histogram: [2n, 0n],
},
geospatial_statistics: {
bbox: {
xmin: 0,
xmax: 10,
ymin: -5,
ymax: 5,
zmin: 1,
zmax: 2,
mmin: 3,
mmax: 4,
},
geospatial_types: [0, 1],
},
},
offset_index_offset: 40n,
offset_index_length: 16,
column_index_offset: 60n,
column_index_length: 24,
encrypted_column_metadata: new Uint8Array([7, 8, 9]),
}],
total_byte_size: 64n,
num_rows: 1n,
sorting_columns: [{
column_idx: 0,
descending: true,
nulls_first: false,
}],
file_offset: 4n,
total_compressed_size: 8n,
}],
key_value_metadata: [{ key: 'meta', value: 'data' }],
// NOTE(review): presumably the serialized footer length — verified
// implicitly by the roundtrip equality below
metadata_length: 223,
}
writeMetadata(writer, extendedMetadata)
writer.appendUint32(0x31524150)
// roundtrip: hyparquet must parse back exactly what was written
const outputMetadata = parquetMetadata(writer.getBuffer())
expect(outputMetadata).toEqual(extendedMetadata)
})
})
describe('logicalType', () => {

@ -0,0 +1,61 @@
import { parquetMetadata } from 'hyparquet'
import { describe, expect, it } from 'vitest'
import { parquetWriteBuffer } from '../src/index.js'
/**
 * @import {ColumnSource} from '../src/types.js'
 */
// End-to-end test: write a GEOMETRY column and confirm the geospatial
// statistics survive a full write/parse roundtrip through hyparquet.
describe('geospatial statistics', () => {
it('writes geospatial statistics into column metadata', () => {
/** @type {ColumnSource[]} */
const columnData = [{
name: 'geometry',
type: 'GEOMETRY',
data: [
{ type: 'Point', coordinates: [10, 5, 100, 2] },
null,
{
type: 'LineString',
coordinates: [
[-20, -10, 50, 5],
[40, 30, 75, -5],
],
},
{
type: 'GeometryCollection',
geometries: [
{ type: 'Point', coordinates: [5, 15] },
{
type: 'MultiPoint',
coordinates: [
[0, -5],
[60, 10],
],
},
],
},
],
}]
const buffer = parquetWriteBuffer({ columnData })
const metadata = parquetMetadata(buffer)
const columnMeta = metadata.row_groups[0].columns[0].meta_data
// geospatial columns get no min/max values, only the null count
expect(columnMeta?.statistics).toEqual({ null_count: 1n })
expect(columnMeta?.geospatial_statistics).toEqual({
bbox: {
xmin: -20,
xmax: 60,
ymin: -10,
ymax: 30,
zmin: 50,
zmax: 100,
mmin: -5,
mmax: 5,
},
// sort numerically not by string order
geospatial_types: [7, 3001, 3002],
})
})
})