Skip to content

Commit

Permalink
Let pyarrow cast strings to dates (#80)
Browse files Browse the repository at this point in the history
* let pyarrow cast strings to dates

* use consistent test collection name structure
  • Loading branch information
scottyhq authored Oct 24, 2024
1 parent 4b00f5b commit 0944b44
Show file tree
Hide file tree
Showing 3 changed files with 248 additions and 23 deletions.
5 changes: 1 addition & 4 deletions stac_geoparquet/arrow/_to_arrow.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Convert STAC data into Arrow tables"""

- import ciso8601
import numpy as np
import orjson
import pyarrow as pa
@@ -78,9 +77,7 @@ def convert_timestamp_columns(

def _convert_single_timestamp_column(column: pa.Array) -> pa.TimestampArray:
"""Convert an individual timestamp column from string to a Timestamp type"""
- return pa.array(
- [ciso8601.parse_rfc3339(str(t)) for t in column], pa.timestamp("us", tz="UTC")
- )
+ return pa.array(column, pa.timestamp("us", tz="UTC"))


def _is_bbox_3d(bbox_col: pa.Array) -> bool:
Expand Down
224 changes: 224 additions & 0 deletions tests/data/umbra-sar.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
[
{
"type": "Feature",
"stac_version": "1.0.0",
"stac_extensions": [
"https://stac-extensions.github.io/view/v1.0.0/schema.json",
"https://stac-extensions.github.io/sar/v1.0.0/schema.json"
],
"id": "52f2317f-091b-4f90-b385-08c93655e089",
"geometry": {
"type": "Polygon",
"coordinates": [
[
[
-79.54925151958344,
8.974791994120121,
14.312313693418984
],
[
-79.58251694872273,
9.005616405406611,
14.318863303708582
],
[
-79.61351789024071,
8.972540909838038,
14.312558707495423
],
[
-79.58025285722145,
8.941719234834665,
14.318864497230194
],
[
-79.54925151958344,
8.974791994120121,
14.312313693418984
]
]
]
},
"bbox": [
-79.61351789024071,
8.941719234834665,
-79.54925151958344,
9.005616405406611
],
"properties": {
"created": "2024-09-10T10:00:00.425293+00:00",
"updated": "2024-09-10T10:00:00.425300+00:00",
"platform": "Umbra-08",
"end_datetime": "2024-09-10T03:32:32.903484+00:00",
"umbra:task_id": "ba1ca3b0-f458-4cd9-8e99-52d2d899d5dd",
"start_datetime": "2024-09-10T03:32:23+00:00",
"sar:product_type": "GEC",
"sar:looks_azimuth": 2,
"sar:polarizations": [
"VV"
],
"umbra:collect_ids": [
"7cfa17f0-9b69-4686-949e-5604d24beb3c"
],
"sar:frequency_band": "X",
"sar:instrument_mode": "SPOTLIGHT",
"sar:resolution_range": 0.5,
"view:incidence_angle": 23.750572204589844,
"sar:resolution_azimuth": 0.25,
"umbra:open-data-catalog": true,
"umbra:squint_angle_degrees": 120.06241607666016,
"umbra:grazing_angle_degrees": 66.24942779541016,
"umbra:slant_range_kilometers": 567.7435913085938,
"umbra:target_azimuth_angle_degrees": 138.5594940185547,
"umbra:squint_angle_engineering_degrees": -30.062416076660156,
"umbra:squint_angle_exploitation_degrees": 59.937583923339844,
"umbra:squint_angle_degrees_off_broadside": 59.937583923339844,
"datetime": null
},
"links": [
{
"rel": "collection",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar",
"type": "application/json"
},
{
"rel": "parent",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar",
"type": "application/json"
},
{
"rel": "root",
"href": "https://api.canopy.umbra.space/archive/",
"type": "application/json",
"title": "stac-fastapi"
},
{
"rel": "self",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar/items/52f2317f-091b-4f90-b385-08c93655e089",
"type": "application/geo+json"
}
],
"assets": {
"thumbnail": {
"href": "https://api.canopy.umbra.space/archive/thumbnail/52f2317f-091b-4f90-b385-08c93655e089",
"type": "image/png",
"title": "Thumbnail for 52f2317f-091b-4f90-b385-08c93655e089",
"description": "Low-resolution preview PNG thumbnail for 52f2317f-091b-4f90-b385-08c93655e089",
"roles": [
"thumbnail"
]
}
},
"collection": "umbra-sar"
},
{
"type": "Feature",
"stac_version": "1.0.0",
"stac_extensions": [
"https://stac-extensions.github.io/view/v1.0.0/schema.json",
"https://stac-extensions.github.io/sar/v1.0.0/schema.json"
],
"id": "192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"geometry": {
"type": "Polygon",
"coordinates": [
[
[
-79.55942755490392,
8.960346708857864,
0
],
[
-79.56798414722924,
8.995497807881366,
0
],
[
-79.60333954966426,
8.986988377465938,
0
],
[
-79.59477975406185,
8.951838081249301,
0
],
[
-79.55942755490392,
8.960346708857864,
0
]
]
]
},
"bbox": [
-79.60333954966426,
8.951838081249301,
-79.55942755490392,
8.995497807881366
],
"properties": {
"platform": "Umbra-05",
"end_datetime": "2023-02-01T02:17:12.594980+00:00",
"umbra:task_id": "fa8af008-6dc6-4382-8f5a-205f5a6af209",
"start_datetime": "2023-02-01T02:17:08.851006+00:00",
"sar:product_type": "GEC",
"sar:looks_azimuth": 1,
"sar:polarizations": [
"VV"
],
"umbra:collect_ids": [
"1b454eab-7958-4755-bb6b-797bde214e8d"
],
"sar:frequency_band": "X",
"sar:instrument_mode": "SPOTLIGHT",
"sar:resolution_range": 0.5,
"view:incidence_angle": 59.1085205078125,
"sar:resolution_azimuth": 0.5,
"umbra:open-data-catalog": true,
"umbra:squint_angle_degrees": 180.2089385986328,
"umbra:grazing_angle_degrees": 30.8914794921875,
"umbra:slant_range_kilometers": 939.5191650390625,
"umbra:target_azimuth_angle_degrees": 77.42530059814453,
"umbra:squint_angle_engineering_degrees": -90.20893859863281,
"umbra:squint_angle_exploitation_degrees": -0.2089385986328125,
"umbra:squint_angle_degrees_off_broadside": 0.2089385986328125,
"datetime": null
},
"links": [
{
"rel": "collection",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar",
"type": "application/json"
},
{
"rel": "parent",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar",
"type": "application/json"
},
{
"rel": "root",
"href": "https://api.canopy.umbra.space/archive/",
"type": "application/json",
"title": "stac-fastapi"
},
{
"rel": "self",
"href": "https://api.canopy.umbra.space/archive/collections/umbra-sar/items/192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"type": "application/geo+json"
}
],
"assets": {
"thumbnail": {
"href": "https://api.canopy.umbra.space/archive/thumbnail/192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"type": "image/png",
"title": "Thumbnail for 192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"description": "512x512 PNG thumbnail for 192f767c-20f8-4b42-8ea2-d1f60fdaace1",
"roles": [
"thumbnail"
]
}
},
"collection": "umbra-sar"
}
]
42 changes: 23 additions & 19 deletions tests/test_arrow.py
Original file line number Diff line number Diff line change
@@ -21,18 +21,21 @@
HERE = Path(__file__).parent

TEST_COLLECTIONS = [
- "3dep-lidar-copc",
- "3dep-lidar-dsm",
- "cop-dem-glo-30",
- "io-lulc-annual-v02",
- "io-lulc",
- "landsat-c2-l1",
- "landsat-c2-l2",
- "naip",
- "planet-nicfi-analytic",
- "sentinel-1-rtc",
- "sentinel-2-l2a",
- "us-census",
+ # Microsoft Planetary Computer
+ "3dep-lidar-copc-pc",
+ "3dep-lidar-dsm-pc",
+ "cop-dem-glo-30-pc",
+ "io-lulc-annual-v02-pc",
+ "io-lulc-pc",
+ "landsat-c2-l1-pc",
+ "landsat-c2-l2-pc",
+ "naip-pc",
+ "planet-nicfi-analytic-pc",
+ "sentinel-1-rtc-pc",
+ "sentinel-2-l2a-pc",
+ "us-census-pc",
+ # Other
+ "umbra-sar",
]

CHUNK_SIZES = [2, DEFAULT_JSON_CHUNK_SIZE]
@@ -42,7 +45,7 @@
"collection_id,chunk_size", itertools.product(TEST_COLLECTIONS, CHUNK_SIZES)
)
def test_round_trip_read_write(collection_id: str, chunk_size: int):
- with open(HERE / "data" / f"{collection_id}-pc.json") as f:
+ with open(HERE / "data" / f"{collection_id}.json") as f:
items = json.load(f)

table = parse_stac_items_to_arrow(items, chunk_size=chunk_size).read_all()
@@ -59,7 +62,7 @@ def test_round_trip_write_read_ndjson(
collection_id: str, chunk_size: int, tmp_path: Path
):
# First load into a STAC-GeoParquet table
- path = HERE / "data" / f"{collection_id}-pc.json"
+ path = HERE / "data" / f"{collection_id}.json"
table = parse_stac_ndjson_to_arrow(path, chunk_size=chunk_size).read_all()

# Then write to disk
@@ -78,8 +81,8 @@ def test_round_trip_write_read_ndjson(


def test_table_contains_geoarrow_metadata():
- collection_id = "naip"
- with open(HERE / "data" / f"{collection_id}-pc.json") as f:
+ collection_id = "naip-pc"
+ with open(HERE / "data" / f"{collection_id}.json") as f:
items = json.load(f)

table = parse_stac_items_to_arrow(items).read_all()
@@ -93,11 +96,11 @@ def test_table_contains_geoarrow_metadata():

@pytest.mark.parametrize("collection_id", TEST_COLLECTIONS)
def test_parse_json_to_arrow(collection_id: str):
- path = HERE / "data" / f"{collection_id}-pc.json"
+ path = HERE / "data" / f"{collection_id}.json"
table = pa.Table.from_batches(parse_stac_ndjson_to_arrow(path))
items_result = list(stac_table_to_items(table))

- with open(HERE / "data" / f"{collection_id}-pc.json") as f:
+ with open(HERE / "data" / f"{collection_id}.json") as f:
items = json.load(f)

for result, expected in zip(items_result, items):
@@ -122,7 +125,8 @@ def test_to_parquet_two_geometry_columns():
When writing STAC Items that have a proj:geometry field, there should be two
geometry columns listed in the GeoParquet metadata.
"""
- with open(HERE / "data" / "3dep-lidar-copc-pc.json") as f:
+ collection_id = "3dep-lidar-copc-pc"
+ with open(HERE / "data" / f"{collection_id}.json") as f:
items = json.load(f)

table = parse_stac_items_to_arrow(items).read_all()
Expand Down

0 comments on commit 0944b44

Please sign in to comment.