Skip to content

Commit

Permalink
Include proj:geometry column in GeoParquet metadata when writing (#56)
Browse files Browse the repository at this point in the history
* Include proj:geometry column in GeoParquet metadata

* fix test

* fix type check
  • Loading branch information
kylebarron authored Jun 4, 2024
1 parent cf0699b commit ab8768c
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 5 deletions.
27 changes: 22 additions & 5 deletions stac_geoparquet/arrow/_to_parquet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
from pathlib import Path
from typing import Any, Iterable, Optional, Union
from typing import Any, Dict, Iterable, Optional, Union

import pyarrow as pa
import pyarrow.parquet as pq
Expand Down Expand Up @@ -35,7 +35,9 @@ def parse_stac_ndjson_to_parquet(
input_path, chunk_size=chunk_size, schema=schema
)
first_batch = next(batches_iter)
schema = first_batch.schema.with_metadata(_create_geoparquet_metadata())
schema = first_batch.schema.with_metadata(
_create_geoparquet_metadata(pa.Table.from_batches([first_batch]))
)
with pq.ParquetWriter(output_path, schema, **kwargs) as writer:
writer.write_batch(first_batch)
for batch in batches_iter:
Expand All @@ -52,13 +54,13 @@ def to_parquet(table: pa.Table, where: Any, **kwargs: Any) -> None:
where: The destination for saving.
"""
metadata = table.schema.metadata or {}
metadata.update(_create_geoparquet_metadata())
metadata.update(_create_geoparquet_metadata(table))
table = table.replace_schema_metadata(metadata)

pq.write_table(table, where, **kwargs)


def _create_geoparquet_metadata() -> dict[bytes, bytes]:
def _create_geoparquet_metadata(table: pa.Table) -> dict[bytes, bytes]:
# TODO: include bbox of geometries
column_meta = {
"encoding": "WKB",
Expand All @@ -75,9 +77,24 @@ def _create_geoparquet_metadata() -> dict[bytes, bytes]:
}
},
}
geo_meta = {
geo_meta: Dict[str, Any] = {
"version": "1.1.0-dev",
"columns": {"geometry": column_meta},
"primary_column": "geometry",
}

if "proj:geometry" in table.schema.names:
# Note we don't include proj:bbox as a covering here for a couple different
# reasons. For one, it's very common for the projected geometries to have a
# different CRS in each row, so having statistics for proj:bbox wouldn't be
# useful. Additionally, because of this we leave proj:bbox as a list instead of
# a struct.
geo_meta["columns"]["proj:geometry"] = {
"encoding": "WKB",
"geometry_types": [],
# Note that we have to set CRS to `null` to signify that the CRS is unknown.
# If the CRS key is missing, it gets inferred as WGS84.
"crs": None,
}

return {b"geo": json.dumps(geo_meta).encode("utf-8")}
23 changes: 23 additions & 0 deletions tests/test_arrow.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import json
from io import BytesIO
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq
import pytest

from stac_geoparquet.arrow import (
parse_stac_items_to_arrow,
parse_stac_ndjson_to_arrow,
stac_table_to_items,
stac_table_to_ndjson,
to_parquet,
)

from .json_equals import assert_json_value_equal
Expand Down Expand Up @@ -94,3 +97,23 @@ def test_from_arrow_deprecated():
import stac_geoparquet.from_arrow

stac_geoparquet.from_arrow.stac_table_to_items


def test_to_parquet_two_geometry_columns():
    """
    When writing STAC Items that have a proj:geometry field, there should be two
    geometry columns listed in the GeoParquet metadata.
    """
    # Fixture item that carries a projected geometry alongside "geometry".
    item_path = HERE / "data" / "3dep-lidar-copc-pc.json"
    with open(item_path) as src:
        stac_items = json.load(src)

    record_batches = parse_stac_items_to_arrow(stac_items)
    table = pa.Table.from_batches(record_batches)

    # Round-trip through an in-memory Parquet file, then read its key-value
    # metadata back without materializing the data pages.
    with BytesIO() as buf:
        to_parquet(table, buf)
        buf.seek(0)
        parquet_metadata = pq.read_metadata(buf)

    geo = json.loads(parquet_metadata.metadata[b"geo"])
    assert geo["primary_column"] == "geometry"
    assert "geometry" in geo["columns"].keys()
    assert "proj:geometry" in geo["columns"].keys()

0 comments on commit ab8768c

Please sign in to comment.