diff --git a/src/column.js b/src/column.js index 874c7cd..6a7a483 100644 --- a/src/column.js +++ b/src/column.js @@ -33,7 +33,7 @@ export function writeColumn(writer, column, values, stats) { const encodings = [] // Compute statistics - const statistics = stats ? getStatistics(values) : undefined + const statistics = stats ? getStatistics(values, element) : undefined // dictionary encoding let dictionary_page_offset @@ -135,9 +135,14 @@ function writeDictionaryPage(writer, column, dictionary) { * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet' * @import {ColumnEncoder, ListValues, Writer} from '../src/types.js' * @param {DecodedArray} values - * @returns {Statistics} + * @param {SchemaElement} element + * @returns {Statistics | undefined} */ -function getStatistics(values) { +function getStatistics(values, element) { + const ltype = element?.logical_type?.type + const isGeospatial = ltype === 'GEOMETRY' || ltype === 'GEOGRAPHY' + if (isGeospatial) return + let min_value = undefined let max_value = undefined let null_count = 0n diff --git a/src/metadata.js b/src/metadata.js index 0e3dbbc..7083451 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -144,8 +144,13 @@ export function logicalType(type) { if (type.type === 'UUID') return { field_14: {} } if (type.type === 'FLOAT16') return { field_15: {} } if (type.type === 'VARIANT') return { field_16: {} } - if (type.type === 'GEOMETRY') return { field_17: {} } - if (type.type === 'GEOGRAPHY') return { field_18: {} } + if (type.type === 'GEOMETRY') return { field_17: { + field_1: type.crs, + } } + if (type.type === 'GEOGRAPHY') return { field_18: { + field_1: type.crs, + field_2: type.algorithm && edgeAlgorithm[type.algorithm], + } } } /** @@ -157,3 +162,15 @@ function timeUnit(unit) { if (unit === 'MICROS') return { field_2: {} } return { field_1: {} } } + +/** + * @import {EdgeInterpolationAlgorithm} from 'hyparquet/src/types.js' + * @type {Record} + */ 
+const edgeAlgorithm = { + SPHERICAL: 0, + VINCENTY: 1, + THOMAS: 2, + ANDOYER: 3, + KARNEY: 4, +} diff --git a/src/schema.js b/src/schema.js index bb5c8da..0f2ec1c 100644 --- a/src/schema.js +++ b/src/schema.js @@ -66,6 +66,12 @@ function basicTypeToSchemaElement(name, type, nullable) { if (type === 'FLOAT16') { return { name, type: 'FIXED_LEN_BYTE_ARRAY', type_length: 2, logical_type: { type: 'FLOAT16' }, repetition_type } } + if (type === 'GEOMETRY') { + return { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' }, repetition_type } + } + if (type === 'GEOGRAPHY') { + return { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' }, repetition_type } + } return { name, type, repetition_type } } diff --git a/src/types.d.ts b/src/types.d.ts index e7e7402..5a1afa0 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -12,7 +12,9 @@ export type BasicType = 'JSON' | 'TIMESTAMP' | 'UUID' | - 'FLOAT16' + 'FLOAT16' | + 'GEOMETRY' | + 'GEOGRAPHY' export interface ParquetWriteOptions { writer: Writer diff --git a/src/unconvert.js b/src/unconvert.js index 730e49a..a10cdbc 100644 --- a/src/unconvert.js +++ b/src/unconvert.js @@ -1,3 +1,5 @@ +import { geojsonToWkb } from './wkb.js' + const dayMillis = 86400000 // 1 day in milliseconds /** @@ -48,6 +50,10 @@ export function unconvert(element, values) { if (element.type_length !== 16) throw new Error('UUID expected type_length to be 16 bytes') return values.map(unconvertUuid) } + if (ltype?.type === 'GEOMETRY' || ltype?.type === 'GEOGRAPHY') { + if (!Array.isArray(values)) throw new Error('geometry must be an array') + return values.map(v => v && geojsonToWkb(v)) + } return values } diff --git a/src/wkb.js b/src/wkb.js new file mode 100644 index 0000000..7019624 --- /dev/null +++ b/src/wkb.js @@ -0,0 +1,147 @@ +import { ByteWriter } from './bytewriter.js' + +/** + * Serialize a GeoJSON geometry into ISO WKB. 
/**
 * Serialize a GeoJSON geometry into ISO WKB.
 *
 * The output is always little-endian. The coordinate dimension is inferred
 * from the geometry's positions: 2 coordinates produce the plain type code,
 * 3 add 1000 to it and 4 add 3000.
 *
 * @import {Geometry, Position} from 'hyparquet/src/types.js'
 * @param {Geometry} geometry
 * @returns {Uint8Array}
 * @throws {Error} if the geometry type is unknown, a position has fewer than
 *   2 or more than 4 coordinates, or a position is shorter than the dimension
 *   inferred for its geometry
 */
export function geojsonToWkb(geometry) {
  const writer = new ByteWriter()
  writeGeometry(writer, geometry)
  return new Uint8Array(writer.getBuffer())
}

/**
 * Write one geometry (header followed by payload) to the writer.
 *
 * Members of Multi* geometries and of geometry collections are written by
 * recursive calls, so every member carries its own byte-order marker and
 * type code, and its dimension is inferred from its own coordinates.
 *
 * @param {ByteWriter} writer
 * @param {Geometry} geometry
 */
function writeGeometry(writer, geometry) {
  const typeCode = geometryTypeCode(geometry.type)

  // infer dimensions
  const dim = inferGeometryDimensions(geometry)
  // Validate before writing anything: a position needs at least x and y, and
  // only the plain, +1000 and +3000 type code variants are emitted below.
  // Without the lower bound a one-coordinate position would be written with a
  // 2D type code but only a single float64.
  if (dim < 2 || dim > 4) throw new Error(`unsupported geometry dimensions: ${dim}`)
  let flag = 0
  if (dim === 3) flag = 1
  else if (dim === 4) flag = 3

  writer.appendUint8(1) // little endian
  writer.appendUint32(typeCode + flag * 1000)

  if (geometry.type === 'Point') {
    writePosition(writer, geometry.coordinates, dim)
  } else if (geometry.type === 'LineString') {
    writeLine(writer, geometry.coordinates, dim)
  } else if (geometry.type === 'Polygon') {
    writePolygon(writer, geometry.coordinates, dim)
  } else if (geometry.type === 'MultiPoint') {
    writer.appendUint32(geometry.coordinates.length)
    for (const coordinates of geometry.coordinates) {
      writeGeometry(writer, { type: 'Point', coordinates })
    }
  } else if (geometry.type === 'MultiLineString') {
    writer.appendUint32(geometry.coordinates.length)
    for (const coordinates of geometry.coordinates) {
      writeGeometry(writer, { type: 'LineString', coordinates })
    }
  } else if (geometry.type === 'MultiPolygon') {
    writer.appendUint32(geometry.coordinates.length)
    for (const coordinates of geometry.coordinates) {
      writeGeometry(writer, { type: 'Polygon', coordinates })
    }
  } else if (geometry.type === 'GeometryCollection') {
    writer.appendUint32(geometry.geometries.length)
    for (const child of geometry.geometries) {
      writeGeometry(writer, child)
    }
  } else {
    // defensive: geometryTypeCode above already throws for unknown types
    throw new Error('unsupported geometry type')
  }
}

/**
 * Write the first `dim` coordinates of a position as float64 values.
 *
 * @param {ByteWriter} writer
 * @param {Position} position
 * @param {number} dim number of coordinates to write
 * @throws {Error} if the position has fewer than `dim` coordinates
 */
function writePosition(writer, position, dim) {
  if (position.length < dim) {
    throw new Error('geometry position dimensions mismatch')
  }
  for (let i = 0; i < dim; i++) {
    writer.appendFloat64(position[i])
  }
}

/**
 * Write a uint32 position count followed by each position.
 *
 * @param {ByteWriter} writer
 * @param {Position[]} coordinates
 * @param {number} dim number of coordinates written per position
 */
function writeLine(writer, coordinates, dim) {
  writer.appendUint32(coordinates.length)
  for (const position of coordinates) {
    writePosition(writer, position, dim)
  }
}

/**
 * Write a uint32 ring count followed by each ring, encoded like a line.
 *
 * @param {ByteWriter} writer
 * @param {Position[][]} rings
 * @param {number} dim number of coordinates written per position
 */
function writePolygon(writer, rings, dim) {
  writer.appendUint32(rings.length)
  for (const ring of rings) {
    writeLine(writer, ring, dim)
  }
}

/**
 * Map a GeoJSON geometry type name to its base (2D) WKB type code.
 *
 * @param {Geometry['type']} type
 * @returns {number}
 * @throws {Error} if the type name is not recognized
 */
function geometryTypeCode(type) {
  if (type === 'Point') return 1
  if (type === 'LineString') return 2
  if (type === 'Polygon') return 3
  if (type === 'MultiPoint') return 4
  if (type === 'MultiLineString') return 5
  if (type === 'MultiPolygon') return 6
  if (type === 'GeometryCollection') return 7
  throw new Error(`unknown geometry type: ${type}`)
}

/**
 * Determine the maximum coordinate dimensions for the geometry.
 *
 * For a geometry collection this is the maximum over its members, or 2 when
 * that maximum is 0 (for example an empty collection).
 *
 * @param {Geometry} geometry
 * @returns {number}
 */
function inferGeometryDimensions(geometry) {
  if (geometry.type === 'GeometryCollection') {
    let maxDim = 0
    for (const child of geometry.geometries) {
      maxDim = Math.max(maxDim, inferGeometryDimensions(child))
    }
    return maxDim || 2
  }
  return inferCoordinateDimensions(geometry.coordinates)
}

/**
 * Determine the maximum position length inside a nested coordinates array.
 *
 * Non-arrays and empty arrays count as 2. An array whose first element is a
 * number is treated as a position and its length is returned. Any other array
 * yields the maximum over its items.
 *
 * @param {any} value
 * @returns {number}
 */
function inferCoordinateDimensions(value) {
  if (!Array.isArray(value)) return 2
  if (!value.length) return 2
  if (typeof value[0] === 'number') return value.length
  let maxDim = 0
  for (const item of value) {
    maxDim = Math.max(maxDim, inferCoordinateDimensions(item))
  }
  return maxDim || 2
}
geometries: [ + { type: 'Point', coordinates: [30, 10, 5] }, + { type: 'LineString', coordinates: [[30, 10, 5], [40, 40, 5], [20, 40, 5], [10, 20, 5]] }, + ], + } + const decoded = decode(geojsonToWkb(geometry)) + expect(decoded).toEqual(geometry) + }) +}) + +/** + * Decode WKB using the hyparquet reader for verification. + * + * @param {Uint8Array} wkb + * @returns {Geometry} + */ +function decode(wkb) { + const view = new DataView(wkb.buffer, wkb.byteOffset, wkb.byteLength) + const reader = { view, offset: 0 } + return wkbToGeojson(reader) +}