From 0944b44c738ec172ff8e30a2e83d9759b34633cd Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Thu, 24 Oct 2024 23:49:41 +0200 Subject: [PATCH] Let pyarrow cast strings to dates (#80) * let pyarrow cast strings to dates * use consistent test collection name structure --- stac_geoparquet/arrow/_to_arrow.py | 5 +- tests/data/umbra-sar.json | 224 +++++++++++++++++++++++++++++ tests/test_arrow.py | 42 +++--- 3 files changed, 248 insertions(+), 23 deletions(-) create mode 100644 tests/data/umbra-sar.json diff --git a/stac_geoparquet/arrow/_to_arrow.py b/stac_geoparquet/arrow/_to_arrow.py index 38e1511..cccdff2 100644 --- a/stac_geoparquet/arrow/_to_arrow.py +++ b/stac_geoparquet/arrow/_to_arrow.py @@ -1,6 +1,5 @@ """Convert STAC data into Arrow tables""" -import ciso8601 import numpy as np import orjson import pyarrow as pa @@ -78,9 +77,7 @@ def convert_timestamp_columns( def _convert_single_timestamp_column(column: pa.Array) -> pa.TimestampArray: """Convert an individual timestamp column from string to a Timestamp type""" - return pa.array( - [ciso8601.parse_rfc3339(str(t)) for t in column], pa.timestamp("us", tz="UTC") - ) + return pa.array(column, pa.timestamp("us", tz="UTC")) def _is_bbox_3d(bbox_col: pa.Array) -> bool: diff --git a/tests/data/umbra-sar.json b/tests/data/umbra-sar.json new file mode 100644 index 0000000..395e5f9 --- /dev/null +++ b/tests/data/umbra-sar.json @@ -0,0 +1,224 @@ +[ +{ + "type": "Feature", + "stac_version": "1.0.0", + "stac_extensions": [ + "https://stac-extensions.github.io/view/v1.0.0/schema.json", + "https://stac-extensions.github.io/sar/v1.0.0/schema.json" + ], + "id": "52f2317f-091b-4f90-b385-08c93655e089", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -79.54925151958344, + 8.974791994120121, + 14.312313693418984 + ], + [ + -79.58251694872273, + 9.005616405406611, + 14.318863303708582 + ], + [ + -79.61351789024071, + 8.972540909838038, + 14.312558707495423 + ], + [ + -79.58025285722145, + 8.941719234834665, + 14.318864497230194 + ], + [ + -79.54925151958344, + 8.974791994120121, + 14.312313693418984 + ] + ] + ] + }, + "bbox": [ + -79.61351789024071, + 8.941719234834665, + -79.54925151958344, + 9.005616405406611 + ], + "properties": { + "created": "2024-09-10T10:00:00.425293+00:00", + "updated": "2024-09-10T10:00:00.425300+00:00", + "platform": "Umbra-08", + "end_datetime": "2024-09-10T03:32:32.903484+00:00", + "umbra:task_id": "ba1ca3b0-f458-4cd9-8e99-52d2d899d5dd", + "start_datetime": "2024-09-10T03:32:23+00:00", + "sar:product_type": "GEC", + "sar:looks_azimuth": 2, + "sar:polarizations": [ + "VV" + ], + "umbra:collect_ids": [ + "7cfa17f0-9b69-4686-949e-5604d24beb3c" + ], + "sar:frequency_band": "X", + "sar:instrument_mode": "SPOTLIGHT", + "sar:resolution_range": 0.5, + "view:incidence_angle": 23.750572204589844, + "sar:resolution_azimuth": 0.25, + "umbra:open-data-catalog": true, + "umbra:squint_angle_degrees": 120.06241607666016, + "umbra:grazing_angle_degrees": 66.24942779541016, + "umbra:slant_range_kilometers": 567.7435913085938, + "umbra:target_azimuth_angle_degrees": 138.5594940185547, + "umbra:squint_angle_engineering_degrees": -30.062416076660156, + "umbra:squint_angle_exploitation_degrees": 59.937583923339844, + "umbra:squint_angle_degrees_off_broadside": 59.937583923339844, + "datetime": null + }, + "links": [ + { + "rel": "collection", + "href": "https://api.canopy.umbra.space/archive/collections/umbra-sar", + "type": "application/json" + }, + { + "rel": "parent", + "href": "https://api.canopy.umbra.space/archive/collections/umbra-sar", + "type": "application/json" + }, + { + "rel": "root", + "href": "https://api.canopy.umbra.space/archive/", + "type": "application/json", + "title": "stac-fastapi" + }, + { + "rel": "self", + "href": "https://api.canopy.umbra.space/archive/collections/umbra-sar/items/52f2317f-091b-4f90-b385-08c93655e089", + "type": "application/geo+json" + } + ], + "assets": { + "thumbnail": { + "href": "https://api.canopy.umbra.space/archive/thumbnail/52f2317f-091b-4f90-b385-08c93655e089", + "type": "image/png", + "title": "Thumbnail for 52f2317f-091b-4f90-b385-08c93655e089", + "description": "Low-resolution preview PNG thumbnail for 52f2317f-091b-4f90-b385-08c93655e089", + "roles": [ + "thumbnail" + ] + } + }, + "collection": "umbra-sar" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "stac_extensions": [ + "https://stac-extensions.github.io/view/v1.0.0/schema.json", + "https://stac-extensions.github.io/sar/v1.0.0/schema.json" + ], + "id": "192f767c-20f8-4b42-8ea2-d1f60fdaace1", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -79.55942755490392, + 8.960346708857864, + 0 + ], + [ + -79.56798414722924, + 8.995497807881366, + 0 + ], + [ + -79.60333954966426, + 8.986988377465938, + 0 + ], + [ + -79.59477975406185, + 8.951838081249301, + 0 + ], + [ + -79.55942755490392, + 8.960346708857864, + 0 + ] + ] + ] + }, + "bbox": [ + -79.60333954966426, + 8.951838081249301, + -79.55942755490392, + 8.995497807881366 + ], + "properties": { + "platform": "Umbra-05", + "end_datetime": "2023-02-01T02:17:12.594980+00:00", + "umbra:task_id": "fa8af008-6dc6-4382-8f5a-205f5a6af209", + "start_datetime": "2023-02-01T02:17:08.851006+00:00", + "sar:product_type": "GEC", + "sar:looks_azimuth": 1, + "sar:polarizations": [ + "VV" + ], + "umbra:collect_ids": [ + "1b454eab-7958-4755-bb6b-797bde214e8d" + ], + "sar:frequency_band": "X", + "sar:instrument_mode": "SPOTLIGHT", + "sar:resolution_range": 0.5, + "view:incidence_angle": 59.1085205078125, + "sar:resolution_azimuth": 0.5, + "umbra:open-data-catalog": true, + "umbra:squint_angle_degrees": 180.2089385986328, + "umbra:grazing_angle_degrees": 30.8914794921875, + "umbra:slant_range_kilometers": 939.5191650390625, + "umbra:target_azimuth_angle_degrees": 77.42530059814453, + "umbra:squint_angle_engineering_degrees": -90.20893859863281, + "umbra:squint_angle_exploitation_degrees": -0.2089385986328125, + "umbra:squint_angle_degrees_off_broadside": 0.2089385986328125, + "datetime": null + }, + "links": [ + { + "rel": "collection", + "href": "https://api.canopy.umbra.space/archive/collections/umbra-sar", + "type": "application/json" + }, + { + "rel": "parent", + "href": "https://api.canopy.umbra.space/archive/collections/umbra-sar", + "type": "application/json" + }, + { + "rel": "root", + "href": "https://api.canopy.umbra.space/archive/", + "type": "application/json", + "title": "stac-fastapi" + }, + { + "rel": "self", + "href": "https://api.canopy.umbra.space/archive/collections/umbra-sar/items/192f767c-20f8-4b42-8ea2-d1f60fdaace1", + "type": "application/geo+json" + } + ], + "assets": { + "thumbnail": { + "href": "https://api.canopy.umbra.space/archive/thumbnail/192f767c-20f8-4b42-8ea2-d1f60fdaace1", + "type": "image/png", + "title": "Thumbnail for 192f767c-20f8-4b42-8ea2-d1f60fdaace1", + "description": "512x512 PNG thumbnail for 192f767c-20f8-4b42-8ea2-d1f60fdaace1", + "roles": [ + "thumbnail" + ] + } + }, + "collection": "umbra-sar" + } +] \ No newline at end of file diff --git a/tests/test_arrow.py b/tests/test_arrow.py index e9f4151..edfed4b 100644 --- a/tests/test_arrow.py +++ b/tests/test_arrow.py @@ -21,18 +21,21 @@ HERE = Path(__file__).parent TEST_COLLECTIONS = [ - "3dep-lidar-copc", - "3dep-lidar-dsm", - "cop-dem-glo-30", - "io-lulc-annual-v02", - "io-lulc", - "landsat-c2-l1", - "landsat-c2-l2", - "naip", - "planet-nicfi-analytic", - "sentinel-1-rtc", - "sentinel-2-l2a", - "us-census", + # Microsoft Planetary Computer + "3dep-lidar-copc-pc", + "3dep-lidar-dsm-pc", + "cop-dem-glo-30-pc", + "io-lulc-annual-v02-pc", + "io-lulc-pc", + "landsat-c2-l1-pc", + "landsat-c2-l2-pc", + "naip-pc", + "planet-nicfi-analytic-pc", + "sentinel-1-rtc-pc", + "sentinel-2-l2a-pc", + "us-census-pc", + # Other + "umbra-sar", ] CHUNK_SIZES = [2, DEFAULT_JSON_CHUNK_SIZE] @@ -42,7 +45,7 @@ "collection_id,chunk_size", itertools.product(TEST_COLLECTIONS, CHUNK_SIZES) ) def test_round_trip_read_write(collection_id: str, chunk_size: int): - with open(HERE / "data" / f"{collection_id}-pc.json") as f: + with open(HERE / "data" / f"{collection_id}.json") as f: items = json.load(f) table = parse_stac_items_to_arrow(items, chunk_size=chunk_size).read_all() @@ -59,7 +62,7 @@ def test_round_trip_write_read_ndjson( collection_id: str, chunk_size: int, tmp_path: Path ): # First load into a STAC-GeoParquet table - path = HERE / "data" / f"{collection_id}-pc.json" + path = HERE / "data" / f"{collection_id}.json" table = parse_stac_ndjson_to_arrow(path, chunk_size=chunk_size).read_all() # Then write to disk @@ -78,8 +81,8 @@ def test_round_trip_write_read_ndjson( def test_table_contains_geoarrow_metadata(): - collection_id = "naip" - with open(HERE / "data" / f"{collection_id}-pc.json") as f: + collection_id = "naip-pc" + with open(HERE / "data" / f"{collection_id}.json") as f: items = json.load(f) table = parse_stac_items_to_arrow(items).read_all() @@ -93,11 +96,11 @@ def test_table_contains_geoarrow_metadata(): @pytest.mark.parametrize("collection_id", TEST_COLLECTIONS) def test_parse_json_to_arrow(collection_id: str): - path = HERE / "data" / f"{collection_id}-pc.json" + path = HERE / "data" / f"{collection_id}.json" table = pa.Table.from_batches(parse_stac_ndjson_to_arrow(path)) items_result = list(stac_table_to_items(table)) - with open(HERE / "data" / f"{collection_id}-pc.json") as f: + with open(HERE / "data" / f"{collection_id}.json") as f: items = json.load(f) for result, expected in zip(items_result, items): @@ -122,7 +125,8 @@ def test_to_parquet_two_geometry_columns(): When writing STAC Items that have a proj:geometry field, there should be two geometry columns listed in the GeoParquet metadata. """ - with open(HERE / "data" / "3dep-lidar-copc-pc.json") as f: + collection_id = "3dep-lidar-copc-pc" + with open(HERE / "data" / f"{collection_id}.json") as f: items = json.load(f) table = parse_stac_items_to_arrow(items).read_all()