Skip to content

Commit

Permalink
Include proj:geometry column in GeoParquet metadata when writing (#56)
Browse files Browse the repository at this point in the history
* Include proj:geometry column in GeoParquet metadata

* fix test

* fix type check
  • Loading branch information
kylebarron authored Jun 4, 2024
1 parent cf0699b commit ab8768c
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 5 deletions.
27 changes: 22 additions & 5 deletions stac_geoparquet/arrow/_to_parquet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
from pathlib import Path
from typing import Any, Iterable, Optional, Union
from typing import Any, Dict, Iterable, Optional, Union

import pyarrow as pa
import pyarrow.parquet as pq
Expand Down Expand Up @@ -35,7 +35,9 @@ def parse_stac_ndjson_to_parquet(
input_path, chunk_size=chunk_size, schema=schema
)
first_batch = next(batches_iter)
schema = first_batch.schema.with_metadata(_create_geoparquet_metadata())
schema = first_batch.schema.with_metadata(
_create_geoparquet_metadata(pa.Table.from_batches([first_batch]))
)
with pq.ParquetWriter(output_path, schema, **kwargs) as writer:
writer.write_batch(first_batch)
for batch in batches_iter:
Expand All @@ -52,13 +54,13 @@ def to_parquet(table: pa.Table, where: Any, **kwargs: Any) -> None:
where: The destination for saving.
"""
metadata = table.schema.metadata or {}
metadata.update(_create_geoparquet_metadata())
metadata.update(_create_geoparquet_metadata(table))
table = table.replace_schema_metadata(metadata)

pq.write_table(table, where, **kwargs)


def _create_geoparquet_metadata() -> dict[bytes, bytes]:
def _create_geoparquet_metadata(table: pa.Table) -> dict[bytes, bytes]:
# TODO: include bbox of geometries
column_meta = {
"encoding": "WKB",
Expand All @@ -75,9 +77,24 @@ def _create_geoparquet_metadata() -> dict[bytes, bytes]:
}
},
}
geo_meta = {
geo_meta: Dict[str, Any] = {
"version": "1.1.0-dev",
"columns": {"geometry": column_meta},
"primary_column": "geometry",
}

if "proj:geometry" in table.schema.names:
# Note we don't include proj:bbox as a covering here for a couple different
# reasons. For one, it's very common for the projected geometries to have a
# different CRS in each row, so having statistics for proj:bbox wouldn't be
# useful. Additionally, because of this we leave proj:bbox as a list instead of
# a struct.
geo_meta["columns"]["proj:geometry"] = {
"encoding": "WKB",
"geometry_types": [],
# Note that we have to set CRS to `null` to signify that the CRS is unknown.
# If the CRS key is missing, it gets inferred as WGS84.
"crs": None,
}

return {b"geo": json.dumps(geo_meta).encode("utf-8")}
23 changes: 23 additions & 0 deletions tests/test_arrow.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import json
from io import BytesIO
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
import pytest

from stac_geoparquet.arrow import (
parse_stac_items_to_arrow,
parse_stac_ndjson_to_arrow,
stac_table_to_items,
stac_table_to_ndjson,
to_parquet,
)

from .json_equals import assert_json_value_equal
Expand Down Expand Up @@ -94,3 +97,23 @@ def test_from_arrow_deprecated():
import stac_geoparquet.from_arrow

stac_geoparquet.from_arrow.stac_table_to_items


def test_to_parquet_two_geometry_columns():
    """
    When writing STAC Items that have a proj:geometry field, there should be two
    geometry columns listed in the GeoParquet metadata.
    """
    # Fixture item that carries a projected geometry alongside "geometry".
    item_path = HERE / "data" / "3dep-lidar-copc-pc.json"
    with open(item_path) as src:
        stac_items = json.load(src)

    record_batches = parse_stac_items_to_arrow(stac_items)
    table = pa.Table.from_batches(record_batches)

    # Round-trip through an in-memory Parquet file, then read its key-value
    # metadata back without materializing the data pages.
    with BytesIO() as buf:
        to_parquet(table, buf)
        buf.seek(0)
        parquet_metadata = pq.read_metadata(buf)

    geo = json.loads(parquet_metadata.metadata[b"geo"])
    assert geo["primary_column"] == "geometry"
    assert "geometry" in geo["columns"].keys()
    assert "proj:geometry" in geo["columns"].keys()

0 comments on commit ab8768c

Please sign in to comment.