Skip to content

Commit

Permalink
replace the image metadata with file listing as per #4
Browse files Browse the repository at this point in the history
  • Loading branch information
metazool committed Jul 1, 2024
1 parent ebf9546 commit 334f552
Showing 1 changed file with 16 additions and 17 deletions.
33 changes: 16 additions & 17 deletions scripts/intake_metadata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Convert the metadata into format usable with `intake`,
"""Heavy-handed approach to create image metadata in usable with `intake`,
for trial use with `scivision`:
https://scivision.readthedocs.io/en/latest/api.html#scivision.io.reader.load_dataset
https://intake.readthedocs.io/en/latest/catalog.html#yaml-format
Expand All @@ -10,40 +10,39 @@

from cyto_ml.data.intake import intake_yaml
from cyto_ml.data.s3 import s3_endpoint
from s3fs import S3FileSystem
import pandas as pd
import os


def load_metadata(path: str):
return pd.read_csv(f"{os.environ['ENDPOINT']}/{path}")
def image_index(endpoint: "S3FileSystem", location: str) -> pd.DataFrame:
    """Build an index of image files held in an object store bucket.

    Args:
        endpoint: filesystem interface to the object store
            (an ``s3fs.S3FileSystem`` from ``s3_endpoint()``)
        location: bucket or prefix to list (e.g. ``"untagged-images"``)

    Returns:
        DataFrame with a single ``Filename`` column of full HTTP(S) URLs,
        one row per object found under ``location``.
    """
    index = endpoint.ls(location)
    # NOTE(review): s3fs `ls` returns keys already prefixed with the
    # bucket/prefix (e.g. "untagged-images/img.tif"), so interpolating the
    # raw entry after the prefix would duplicate it — take the basename.
    # Use `location` rather than a hard-coded bucket name so the function
    # works for any prefix it is asked to list.
    return pd.DataFrame(
        [
            f"{os.environ['ENDPOINT']}/{location}/{os.path.basename(entry)}"
            for entry in index
        ],
        columns=["Filename"],
    )


if __name__ == "__main__":
metadata = load_metadata("metadata/metadata.csv")

# rewrite it to add the full s3 image path
metadata["Filename"] = metadata["Filename"].apply(
lambda x: f"{os.environ['ENDPOINT']}/untagged-images/{x}"
)

fs = s3_endpoint()
metadata = image_index(fs, "untagged-images")

# Option to use a CSV as an index, rather than return the files
catalog = "metadata/catalog.csv"
with fs.open(catalog, "w") as out:
out.write(metadata.to_csv())
out.write(metadata.to_csv(index=False))

cat_url = f"{os.environ['ENDPOINT']}/{catalog}"

with fs.open("metadata/intake.yml", "w") as out:
# Do we use a CSV driver and include the metadata?
# out.write(write_yaml(f"{os.environ['ENDPOINT']}/{catalog}"))

# All the scivision examples have image collections in a single zipfile
# This format throws an s3 error on the directory listing -
# unsure if this is a permissions issue, or you just can't use a wildcard
cat_wildcard = f"{os.environ['ENDPOINT']}/untagged-images/*.tif" # .replace('https://', 's3://')

# Create a testing record for a single file
cat_test = cat_wildcard.replace("*", "19_10_Tank22_1")
# See the issue here: https://github.com/NERC-CEH/plankton_ml/issues/3
# About data improvements needed before a better way to read a bucket into s3
cat_test = f"{os.environ['ENDPOINT']}/untagged-images/19_10_Tank22_1.tif"

# Our options for the whole collection look like:
# * a tiny http server that creates a zip, but assumes the images have more metadata
Expand Down

0 comments on commit 334f552

Please sign in to comment.