Skip to content

Commit

Permalink
replace the image metadata with file listing as per #4
Browse files Browse the repository at this point in the history
  • Loading branch information
metazool committed Jul 1, 2024
1 parent ebf9546 commit 334f552
Showing 1 changed file with 16 additions and 17 deletions.
33 changes: 16 additions & 17 deletions scripts/intake_metadata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Convert the metadata into format usable with `intake`,
"""Heavy-handed approach to create image metadata in usable with `intake`,
for trial use with `scivision`:
https://scivision.readthedocs.io/en/latest/api.html#scivision.io.reader.load_dataset
https://intake.readthedocs.io/en/latest/catalog.html#yaml-format
Expand All @@ -10,40 +10,39 @@

from cyto_ml.data.intake import intake_yaml
from cyto_ml.data.s3 import s3_endpoint
from s3fs import S3FileSystem
import pandas as pd
import os


def load_metadata(path: str):
return pd.read_csv(f"{os.environ['ENDPOINT']}/{path}")
def image_index(endpoint: "S3FileSystem", location: str) -> pd.DataFrame:
    """Build an index of image files held in an object store bucket.

    Args:
        endpoint: filesystem interface to the object store
            (an ``s3fs.S3FileSystem`` from ``s3_endpoint()``)
        location: bucket or prefix to list (e.g. ``"untagged-images"``)

    Returns:
        DataFrame with a single ``Filename`` column of full HTTP(S) URLs,
        one row per object found under ``location``.
    """
    index = endpoint.ls(location)
    # NOTE(review): s3fs `ls` returns keys already prefixed with the
    # bucket/prefix (e.g. "untagged-images/img.tif"), so interpolating the
    # raw entry after the prefix would duplicate it — take the basename.
    # Use `location` rather than a hard-coded bucket name so the function
    # works for any prefix it is asked to list.
    return pd.DataFrame(
        [
            f"{os.environ['ENDPOINT']}/{location}/{os.path.basename(entry)}"
            for entry in index
        ],
        columns=["Filename"],
    )


if __name__ == "__main__":
metadata = load_metadata("metadata/metadata.csv")

# rewrite it to add the full s3 image path
metadata["Filename"] = metadata["Filename"].apply(
lambda x: f"{os.environ['ENDPOINT']}/untagged-images/{x}"
)

fs = s3_endpoint()
metadata = image_index(fs, "untagged-images")

# Option to use a CSV as an index, rather than return the files
catalog = "metadata/catalog.csv"
with fs.open(catalog, "w") as out:
out.write(metadata.to_csv())
out.write(metadata.to_csv(index=False))

cat_url = f"{os.environ['ENDPOINT']}/{catalog}"

with fs.open("metadata/intake.yml", "w") as out:
# Do we use a CSV driver and include the metadata?
# out.write(write_yaml(f"{os.environ['ENDPOINT']}/{catalog}"))

# All the scivision examples have image collections in a single zipfile
# This format throws an s3 error on the directory listing -
# unsure if this is a permissions issue, or you just can't use a wildcard
cat_wildcard = f"{os.environ['ENDPOINT']}/untagged-images/*.tif" # .replace('https://', 's3://')

# Create a testing record for a single file
cat_test = cat_wildcard.replace("*", "19_10_Tank22_1")
# See the issue here: https://github.com/NERC-CEH/plankton_ml/issues/3
# About data improvements needed before a better way to read a bucket into s3
cat_test = f"{os.environ['ENDPOINT']}/untagged-images/19_10_Tank22_1.tif"

# Our options for the whole collection look like:
# * a tiny http server that creates a zip, but assumes the images have more metadata
Expand Down

0 comments on commit 334f552

Please sign in to comment.