Add well-known-binary decoder for geometry and geography (#131)

This commit is contained in:
Kenny Daniel 2025-09-30 11:45:39 -07:00 committed by GitHub
parent cc6cc86ba4
commit d701904253
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 1851 additions and 31 deletions

@ -1,5 +1,7 @@
import { wkbToGeojson } from './wkb.js'
/**
* @import {ColumnDecoder, DecodedArray, Encoding, ParquetParsers} from '../src/types.d.ts'
* @import {ColumnDecoder, DecodedArray, Encoding, ParquetParsers} from '../src/types.js'
*/
const decoder = new TextDecoder()
@ -19,12 +21,17 @@ export const DEFAULT_PARSERS = {
return new Date(Number(nanos / 1000000n))
},
dateFromDays(days) {
const dayInMillis = 86400000
return new Date(days * dayInMillis)
return new Date(days * 86400000)
},
stringFromBytes(bytes) {
return bytes && decoder.decode(bytes)
},
geometryFromBytes(bytes) {
return bytes && wkbToGeojson({ view: new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength), offset: 0 })
},
geographyFromBytes(bytes) {
return bytes && wkbToGeojson({ view: new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength), offset: 0 })
},
}
/**
@ -76,35 +83,18 @@ export function convert(data, columnDecoder) {
return arr
}
if (!ctype && type === 'INT96') {
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = parsers.timestampFromNanoseconds(parseInt96Nanos(data[i]))
}
return arr
return Array.from(data).map(v => parsers.timestampFromNanoseconds(parseInt96Nanos(v)))
}
if (ctype === 'DATE') {
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = parsers.dateFromDays(data[i])
}
return arr
return Array.from(data).map(v => parsers.dateFromDays(v))
}
if (ctype === 'TIMESTAMP_MILLIS') {
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = parsers.timestampFromMilliseconds(data[i])
}
return arr
return Array.from(data).map(v => parsers.timestampFromMilliseconds(v))
}
if (ctype === 'TIMESTAMP_MICROS') {
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = parsers.timestampFromMicroseconds(data[i])
}
return arr
return Array.from(data).map(v => parsers.timestampFromMicroseconds(v))
}
if (ctype === 'JSON') {
const decoder = new TextDecoder()
return data.map(v => JSON.parse(decoder.decode(v)))
}
if (ctype === 'BSON') {
@ -113,13 +103,14 @@ export function convert(data, columnDecoder) {
if (ctype === 'INTERVAL') {
throw new Error('parquet interval not supported')
}
if (ltype?.type === 'GEOMETRY') {
return data.map(v => parsers.geometryFromBytes(v))
}
if (ltype?.type === 'GEOGRAPHY') {
return data.map(v => parsers.geographyFromBytes(v))
}
if (ctype === 'UTF8' || ltype?.type === 'STRING' || utf8 && type === 'BYTE_ARRAY') {
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
const value = data[i]
arr[i] = value instanceof Uint8Array ? parsers.stringFromBytes(value) : value
}
return arr
return data.map(v => parsers.stringFromBytes(v))
}
if (ctype === 'UINT_64' || ltype?.type === 'INTEGER' && ltype.bitWidth === 64 && !ltype.isSigned) {
if (data instanceof BigInt64Array) {

64
src/types.d.ts vendored

@ -8,6 +8,8 @@ export interface ParquetParsers {
timestampFromNanoseconds(nanos: bigint): any
dateFromDays(days: number): any
stringFromBytes(bytes: Uint8Array): any
geometryFromBytes(bytes: Uint8Array): any
geographyFromBytes(bytes: Uint8Array): any
}
/**
@ -432,3 +434,65 @@ export interface AsyncRowGroup {
groupRows: number
asyncColumns: AsyncColumn[]
}
/**
* Geometry types based on the GeoJSON specification (RFC 7946).
*
* Union of every geometry shape declared in this file. Each member carries
* a distinct string literal in its `type` property, so a value can be
* narrowed to one member by checking `type`.
*/
export type Geometry =
| Point
| MultiPoint
| LineString
| MultiLineString
| Polygon
| MultiPolygon
| GeometryCollection
/**
* Position is an array of at least two numbers.
* The order should be [longitude, latitude], optionally followed by
* further values (e.g. altitude).
* Note: the `number[]` type itself does not enforce the minimum length.
*/
export type Position = number[]
/**
* A single position.
*/
export interface Point {
type: 'Point'
coordinates: Position
}
/**
* A list of positions. Each element is the position of one point.
*/
export interface MultiPoint {
type: 'MultiPoint'
coordinates: Position[]
}
/**
* A list of positions forming one line.
*/
export interface LineString {
type: 'LineString'
coordinates: Position[]
}
/**
* A list of lines.
* Each element of `coordinates` is the position list of one LineString.
*/
export interface MultiLineString {
type: 'MultiLineString'
coordinates: Position[][]
}
/**
* A list of rings.
* Each element of `coordinates` is a linear ring, given as a position list.
*/
export interface Polygon {
type: 'Polygon'
coordinates: Position[][]
}
/**
* A list of polygons.
* Each element of `coordinates` is the ring list of one Polygon.
*/
export interface MultiPolygon {
type: 'MultiPolygon'
coordinates: Position[][][]
}
/**
* A list of geometries of any type, including nested collections.
* Unlike the other geometry types it has `geometries` instead of `coordinates`.
*/
export interface GeometryCollection {
type: 'GeometryCollection'
geometries: Geometry[]
}

125
src/wkb.js Normal file

@ -0,0 +1,125 @@
/**
 * Decode one WKB (Well-Known Binary) geometry into a GeoJSON-style object.
 *
 * Reads the geometry header at `reader.offset`, then the payload for that
 * geometry type, advancing `reader.offset` past everything consumed.
 * Every member of a Multi* geometry or GeometryCollection carries its own
 * header, which is read before the member's payload.
 *
 * @import {DataReader, Geometry} from '../src/types.js'
 * @param {DataReader} reader
 * @returns {Geometry} geometry object
 * @throws {Error} when the base geometry type is not one of 1 through 7
 */
export function wkbToGeojson(reader) {
  const header = getFlags(reader)
  switch (header.type) {
    case 1: // Point
      return { type: 'Point', coordinates: readPosition(reader, header) }
    case 2: // LineString
      return { type: 'LineString', coordinates: readLine(reader, header) }
    case 3: // Polygon
      return { type: 'Polygon', coordinates: readPolygon(reader, header) }
    case 4: { // MultiPoint
      const coordinates = []
      while (coordinates.length < header.count) {
        coordinates.push(readPosition(reader, getFlags(reader)))
      }
      return { type: 'MultiPoint', coordinates }
    }
    case 5: { // MultiLineString
      const coordinates = []
      while (coordinates.length < header.count) {
        coordinates.push(readLine(reader, getFlags(reader)))
      }
      return { type: 'MultiLineString', coordinates }
    }
    case 6: { // MultiPolygon
      const coordinates = []
      while (coordinates.length < header.count) {
        coordinates.push(readPolygon(reader, getFlags(reader)))
      }
      return { type: 'MultiPolygon', coordinates }
    }
    case 7: { // GeometryCollection
      const geometries = []
      while (geometries.length < header.count) {
        // Members may be of any type, so decode each one recursively.
        geometries.push(wkbToGeojson(reader))
      }
      return { type: 'GeometryCollection', geometries }
    }
    default:
      throw new Error(`Unsupported geometry type: ${header.type}`)
  }
}
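/**
* Decoded header of a single WKB geometry.
*
* @typedef {object} WkbFlags
* @property {boolean} littleEndian true when the byte-order marker byte is 1
* @property {number} type base geometry type (raw type code modulo 1000)
* @property {number} dim number of coordinates per position (2 to 4)
* @property {number} count element count read from the header for types 2 through 7, otherwise 0
*/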
/**
* Extract ISO WKB flags and base geometry type.
*
* @param {DataReader} reader
* @returns {WkbFlags}
*/
function getFlags(reader) {
const { view } = reader
const littleEndian = view.getUint8(reader.offset++) === 1
const rawType = view.getUint32(reader.offset, littleEndian)
reader.offset += 4
const type = rawType % 1000
const flags = Math.floor(rawType / 1000)
let count = 0
if (type > 1 && type <= 7) {
count = view.getUint32(reader.offset, littleEndian)
reader.offset += 4
}
// XY, XYZ, XYM, XYZM
let dim = 2
if (flags) dim++
if (flags === 3) dim++
return { littleEndian, type, dim, count }
}
/**
 * Read one position: `flags.dim` consecutive float64 values starting at
 * `reader.offset`, in the byte order given by `flags.littleEndian`.
 * Advances `reader.offset` by 8 bytes per value read.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[]}
 */
function readPosition(reader, flags) {
  const { dim, littleEndian } = flags
  const position = []
  while (position.length < dim) {
    position.push(reader.view.getFloat64(reader.offset, littleEndian))
    reader.offset += 8
  }
  return position
}
/**
 * Read `flags.count` consecutive positions starting at `reader.offset`.
 * Every position uses the dimension and byte order from `flags`.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[][]}
 */
function readLine(reader, flags) {
  const line = []
  while (line.length < flags.count) {
    line.push(readPosition(reader, flags))
  }
  return line
}
/**
 * Read `flags.count` rings starting at `reader.offset`.
 * Each ring is a uint32 point count followed by that many positions; the
 * ring inherits byte order and dimension from `flags`.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[][][]}
 */
function readPolygon(reader, flags) {
  const polygon = []
  while (polygon.length < flags.count) {
    const pointCount = reader.view.getUint32(reader.offset, flags.littleEndian)
    reader.offset += 4
    // Same header as the polygon, but counting points instead of rings.
    const ringFlags = { ...flags, count: pointCount }
    polygon.push(readLine(reader, ringFlags))
  }
  return polygon
}

@ -38,6 +38,32 @@ describe('convert function', () => {
])
})
// A BYTE_ARRAY column with logical type GEOMETRY is decoded from WKB to a geometry object.
// `name` and `parsers` come from the enclosing test file (not visible in this hunk).
it('decodes geometry logical type with default parser', () => {
// Little-endian WKB Point: byte order 1, type 1, then x = 102 and y = 0.5 as float64
const pointWkb = new Uint8Array([
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 128, 89, 64, 0, 0, 0, 0, 0, 0, 224,
63,
])
const data = [pointWkb]
/** @type {SchemaElement} */
const element = { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' } }
expect(convert(data, { element, parsers })).toEqual([
{ type: 'Point', coordinates: [102, 0.5] },
])
})
// A BYTE_ARRAY column with logical type GEOGRAPHY is decoded from WKB to a geometry object.
// `name` and `parsers` come from the enclosing test file (not visible in this hunk).
it('decodes geography logical type with default parser', () => {
// Little-endian WKB Point: byte order 1, type 1, then x = 102 and y = 0.5 as float64
const pointWkb = new Uint8Array([
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 128, 89, 64, 0, 0, 0, 0, 0, 0, 224,
63,
])
const data = [pointWkb]
/** @type {SchemaElement} */
const element = { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' } }
expect(convert(data, { element, parsers })).toEqual([
{ type: 'Point', coordinates: [102, 0.5] },
])
})
it('converts numbers to DECIMAL', () => {
const data = [100, 200]
/** @type {SchemaElement} */
@ -236,13 +262,53 @@ describe('convert function', () => {
parsers: {
...parsers,
stringFromBytes(/** @type {Uint8Array} */ bytes) {
return `custom-${new TextDecoder().decode(bytes)}`
return bytes && `custom-${new TextDecoder().decode(bytes)}`
},
},
}
expect(convert(data, columnParser)).toEqual(['custom-foo', undefined])
})
// A caller-supplied geometryFromBytes replaces the built-in WKB decoding for GEOMETRY columns.
it('respects custom parsers - geometryFromBytes', () => {
// Little-endian WKB Point (102, 0.5); the custom parser below ignores its input
const pointWkb = new Uint8Array([
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 128, 89, 64, 0, 0, 0, 0, 0, 0, 224,
63,
])
const data = [pointWkb]
/** @type {SchemaElement} */
const element = { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOMETRY' } }
/** @type {Pick<ColumnDecoder, "element" | "utf8" | "parsers">} */
const columnParser = {
element,
parsers: {
...parsers,
// Override only the geometry parser; every other parser is kept as-is
geometryFromBytes: () => 'custom-geometry',
},
}
expect(convert(data, columnParser)).toEqual(['custom-geometry'])
})
// A caller-supplied geographyFromBytes replaces the built-in WKB decoding for GEOGRAPHY columns.
it('respects custom parsers - geographyFromBytes', () => {
// Little-endian WKB Point (102, 0.5); the custom parser below ignores its input
const pointWkb = new Uint8Array([
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 128, 89, 64, 0, 0, 0, 0, 0, 0, 224,
63,
])
const data = [pointWkb]
/** @type {SchemaElement} */
const element = { name, type: 'BYTE_ARRAY', logical_type: { type: 'GEOGRAPHY' } }
/** @type {Pick<ColumnDecoder, "element" | "utf8" | "parsers">} */
const columnParser = {
element,
parsers: {
...parsers,
// Override only the geography parser; every other parser is kept as-is
geographyFromBytes: () => 'custom-geojson',
},
}
expect(convert(data, columnParser)).toEqual(['custom-geojson'])
})
})
describe('parseFloat16', () => {

1142
test/files/geospatial.json Normal file

File diff suppressed because it is too large Load Diff

@ -0,0 +1,221 @@
{
"version": 2,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "schema",
"num_children": 3
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "group",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "wkt",
"converted_type": "UTF8",
"logical_type": {
"type": "STRING"
}
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "geometry",
"logical_type": {
"type": "GEOMETRY"
}
}
],
"num_rows": 28,
"row_groups": [
{
"columns": [
{
"file_offset": 0,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"group"
],
"codec": "UNCOMPRESSED",
"num_values": 28,
"total_uncompressed_size": 61,
"total_compressed_size": 61,
"data_page_offset": 25,
"dictionary_page_offset": 4,
"statistics": {
"null_count": 0,
"max_value": "all",
"min_value": "all"
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
],
"size_statistics": {
"unencoded_byte_array_data_bytes": 84,
"repetition_level_histogram": [],
"definition_level_histogram": [
0,
28
]
}
}
},
{
"file_offset": 0,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"wkt"
],
"codec": "UNCOMPRESSED",
"num_values": 28,
"total_uncompressed_size": 2841,
"total_compressed_size": 2841,
"data_page_offset": 2536,
"dictionary_page_offset": 65,
"statistics": {
"null_count": 0,
"max_value": "POLYGON ZM ((30 10 40 300, 40 40 80 1600, 20 40 60 800, 10 20 30 200, 30 10 40 300))",
"min_value": "GEOMETRYCOLLECTION (POINT (30 10), LINESTRING (30 10, 10 30, 40 40), POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10)), MULTIPOINT ((30 10)), MULTILINESTRING ((30 10, 10 30, 40 40)), MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10))))"
},
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
],
"size_statistics": {
"unencoded_byte_array_data_bytes": 2343,
"repetition_level_histogram": [],
"definition_level_histogram": [
0,
28
]
}
}
},
{
"file_offset": 0,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN",
"RLE",
"RLE_DICTIONARY"
],
"path_in_schema": [
"geometry"
],
"codec": "UNCOMPRESSED",
"num_values": 28,
"total_uncompressed_size": 4315,
"total_compressed_size": 4315,
"data_page_offset": 7174,
"dictionary_page_offset": 2906,
"encoding_stats": [
{
"page_type": "DICTIONARY_PAGE",
"encoding": "PLAIN",
"count": 1
},
{
"page_type": "DATA_PAGE",
"encoding": "RLE_DICTIONARY",
"count": 1
}
],
"size_statistics": {
"unencoded_byte_array_data_bytes": 4140,
"repetition_level_histogram": [],
"definition_level_histogram": [
0,
28
]
},
"geospatial_statistics": {
"bbox": {
"xmin": 10,
"xmax": 40,
"ymin": 10,
"ymax": 40,
"zmin": 30,
"zmax": 80,
"mmin": 200,
"mmax": 1600
},
"geospatial_types": [
1,
2,
3,
4,
5,
6,
7,
1001,
1002,
1003,
1004,
1005,
1006,
1007,
2001,
2002,
2003,
2004,
2005,
2006,
2007,
3001,
3002,
3003,
3004,
3005,
3006,
3007
]
}
}
}
],
"total_byte_size": 7217,
"num_rows": 28,
"file_offset": 4,
"total_compressed_size": 7217
}
],
"created_by": "parquet-cpp-arrow version 21.0.0",
"metadata_length": 787
}

Binary file not shown.

211
test/wkb.test.js Normal file

@ -0,0 +1,211 @@
import { describe, expect, it } from 'vitest'
import { wkbToGeojson } from '../src/wkb.js'
/**
 * Wrap a byte array in the reader shape that wkbToGeojson consumes.
 * The view covers exactly the bytes of `buffer` (honouring its byteOffset
 * and byteLength), and reading starts at offset 0.
 *
 * @param {Uint8Array} buffer
 * @returns {import('../src/types.js').DataReader}
 */
function makeReader(buffer) {
  const { buffer: arrayBuffer, byteOffset, byteLength } = buffer
  return {
    view: new DataView(arrayBuffer, byteOffset, byteLength),
    offset: 0,
  }
}
// Each case feeds a hand-written WKB byte sequence to wkbToGeojson.
// Layout of a geometry: 1 byte order marker (1 = little-endian), a uint32 type,
// a uint32 count for every type except Point, then float64 coordinates.
describe('wkbToGeojson', () => {
it('decodes little-endian Point', () => {
// byte order 1, type 1, x = 102, y = 0.5
const buffer = new Uint8Array([
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 128, 89, 64, 0, 0, 0, 0, 0, 0, 224,
63,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'Point',
coordinates: [102, 0.5],
})
})
it('decodes big-endian LineString', () => {
// byte order 0, type 2, 2 points, then four big-endian float64 values
const buffer = new Uint8Array([
0, 0, 0, 0, 2, 0, 0, 0, 2, 63, 248, 0, 0, 0, 0, 0, 0, 192, 12, 0,
0, 0, 0, 0, 0, 64, 17, 0, 0, 0, 0, 0, 0, 64, 23, 0, 0, 0, 0, 0,
0,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'LineString',
coordinates: [
[1.5, -3.5],
[4.25, 5.75],
],
})
})
it('decodes little-endian Polygon', () => {
// byte order 1, type 3, 1 ring, ring has 4 points
const buffer = new Uint8Array([
1, 3, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240, 63, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240, 63, 0, 0, 0, 0, 0, 0, 240,
63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'Polygon',
coordinates: [
[
[0, 0],
[1, 0],
[1, 1],
[0, 0],
],
],
})
})
it('decodes little-endian MultiLineString', () => {
// outer header: type 5, 2 members; each member is a full LineString with its own header
const buffer = new Uint8Array([
1, 5, 0, 0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0,
0, 0, 0, 0, 240, 63, 0, 0, 0, 0, 0, 0, 240, 63, 0, 0, 0, 0, 0, 0,
0, 64, 0, 0, 0, 0, 0, 0, 0, 64, 1, 2, 0, 0, 0, 2, 0, 0, 0, 0,
0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 0,
0, 16, 64, 0, 0, 0, 0, 0, 0, 16, 64,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'MultiLineString',
coordinates: [
[
[1, 1],
[2, 2],
],
[
[3, 3],
[4, 4],
],
],
})
})
it('decodes mixed-endian MultiPoint', () => {
// outer header and first point are little-endian; the second point is big-endian
const buffer = new Uint8Array([
1, 4, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 64, 0, 0, 0, 0, 0, 0, 8, 64, 0, 0, 0, 0, 1, 191, 240, 0, 0, 0,
0, 0, 0, 63, 224, 0, 0, 0, 0, 0, 0,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'MultiPoint',
coordinates: [
[2, 3],
[-1, 0.5],
],
})
})
it('decodes nested MultiPolygon', () => {
// little-endian outer header (type 6, 1 member) wrapping a big-endian Polygon
// with 1 ring of 4 points
const buffer = new Uint8Array([
1, 6, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0,
0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 0,
0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'MultiPolygon',
coordinates: [
[
[
[0, 0],
[0, 2],
[2, 2],
[0, 0],
],
],
],
})
})
it('decodes GeometryCollection', () => {
// outer header: type 7, 2 members; a little-endian Point followed by a
// big-endian LineString with 2 points
const buffer = new Uint8Array([
1, 7, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 240,
63, 0, 0, 0, 0, 0, 0, 0, 64, 0, 0, 0, 0, 2, 0, 0, 0, 2, 64, 8, 0,
0, 0, 0, 0, 0, 64, 16, 0, 0, 0, 0, 0, 0, 64, 20, 0, 0, 0, 0, 0, 0,
64, 24, 0, 0, 0, 0, 0, 0,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'GeometryCollection',
geometries: [
{ type: 'Point', coordinates: [1, 2] },
{
type: 'LineString',
coordinates: [
[3, 4],
[5, 6],
],
},
],
})
})
it('throws on unsupported geometry type', () => {
// type 99 is outside the supported range 1 through 7
const buffer = new Uint8Array([
1, 99, 0, 0, 0,
])
expect(() => wkbToGeojson(makeReader(buffer))).toThrowError('Unsupported geometry type: 99')
})
it('decodes ISO WKB Point with Z/M flags', () => {
// type bytes 185, 11, 0, 0 are little-endian 3001: Point (1) + 3000 (Z and M)
const buffer = new Uint8Array([
1,
185, 11, 0, 0,
0, 0, 0, 0, 0, 0, 240, 63,
0, 0, 0, 0, 0, 0, 0, 64,
0, 0, 0, 0, 0, 0, 8, 64,
0, 0, 0, 0, 0, 0, 16, 64,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'Point',
coordinates: [1, 2, 3, 4],
})
})
it('decodes point encoded with dimensional offsets', () => {
// same type code 3001 as above, with different coordinate values
const buffer = new Uint8Array([
1, 185, 11, 0, 0, 0, 0, 0, 0, 0, 0, 20, 64, 0, 0, 0, 0, 0,
0, 24, 64, 0, 0, 0, 0, 0, 0, 28, 64, 0, 0, 0, 0, 0, 0, 32, 64,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'Point',
coordinates: [5, 6, 7, 8],
})
})
it('decodes point with M-only dimensional offset', () => {
// type bytes 209, 7, 0, 0 are little-endian 2001: Point (1) + 2000 (M), 3 values per position
const buffer = new Uint8Array([
1, 209, 7, 0, 0, 0, 0, 0, 0, 0, 0, 34, 64, 0, 0, 0, 0, 0,
0, 36, 64, 0, 0, 0, 0, 0, 0, 38, 64,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'Point',
coordinates: [9, 10, 11],
})
})
it('decodes point with Z-only dimensional offset', () => {
// type bytes 233, 3, 0, 0 are little-endian 1001: Point (1) + 1000 (Z), 3 values per position
const buffer = new Uint8Array([
1, 233, 3, 0, 0, 0, 0, 0, 0, 0, 0, 40, 64, 0, 0, 0, 0, 0,
0, 42, 64, 0, 0, 0, 0, 0, 0, 44, 64,
])
expect(wkbToGeojson(makeReader(buffer))).toEqual({
type: 'Point',
coordinates: [12, 13, 14],
})
})
})