From 9b588a5a5a1f00ee0b64768942902450ac4b1ee0 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Thu, 20 Jul 2023 18:33:54 -0400 Subject: [PATCH 1/3] Allow parsing less specific dates --- nmdc_server/ingest/biosample.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/nmdc_server/ingest/biosample.py b/nmdc_server/ingest/biosample.py index 3361d0e1..fef2bdd0 100644 --- a/nmdc_server/ingest/biosample.py +++ b/nmdc_server/ingest/biosample.py @@ -46,6 +46,28 @@ def coerce_date(cls, v): return datetime.strptime(v, "%d-%b-%y %I.%M.%S.%f000 %p").isoformat() return v + @validator("collection_date", pre=True) + def coerce_collection_date(cls, value): + # { "has_raw_value": ... } + raw_value = value["has_raw_value"] + if isinstance(raw_value, str) and date_fmt.match(raw_value): + return datetime.strptime(raw_value, "%d-%b-%y %I.%M.%S.%f000 %p").isoformat() + try: + dt = datetime.strptime(raw_value, "%Y-%m-%d").isoformat() + return dt + except ValueError: + try: + raw_value = raw_value + "-01" + dt = datetime.strptime(raw_value, "%Y-%m-%d").isoformat() + return dt + except ValueError: + try: + raw_value = raw_value + "-01" + dt = datetime.strptime(raw_value, "%Y-%m-%d").isoformat() + return dt + except ValueError: + return None + def load_biosample(db: Session, obj: Dict[str, Any], omics_processing: Collection): logger = get_logger(__name__) @@ -77,6 +99,11 @@ def load_biosample(db: Session, obj: Dict[str, Any], omics_processing: Collectio biosample = Biosample(**obj) + collection_date_pre_validation = obj.get("collection_date", {}).get("has_raw_value", None) + collection_date_post_validation = biosample.collection_date + if collection_date_pre_validation and not collection_date_post_validation: + logger.error(f"Failed to parse collection_date for biosample: {biosample.id}") + # Merge other ambiguously named alternate identifier columns # TODO remove the hack to filter out gold from the alternate IDs biosample.alternate_identifiers += filter( From 
5ab58a6db14bd25943301132d32146f1cab12825 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Fri, 21 Jul 2023 10:24:02 -0400 Subject: [PATCH 2/3] Pass raw value of collection date through Sometimes the raw value of collection date doesn't need to be changed. Our validator catches specific exceptions, but should otherwise leave the value alone. --- nmdc_server/ingest/biosample.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/nmdc_server/ingest/biosample.py b/nmdc_server/ingest/biosample.py index fef2bdd0..590bcd01 100644 --- a/nmdc_server/ingest/biosample.py +++ b/nmdc_server/ingest/biosample.py @@ -66,7 +66,10 @@ def coerce_collection_date(cls, value): dt = datetime.strptime(raw_value, "%Y-%m-%d").isoformat() return dt except ValueError: - return None + # The raw value may be parseable by pydantic. + # If not, we will see a validation error in the + # ingest output + return raw_value def load_biosample(db: Session, obj: Dict[str, Any], omics_processing: Collection): @@ -99,11 +102,6 @@ def load_biosample(db: Session, obj: Dict[str, Any], omics_processing: Collectio biosample = Biosample(**obj) - collection_date_pre_validation = obj.get("collection_date", {}).get("has_raw_value", None) - collection_date_post_validation = biosample.collection_date - if collection_date_pre_validation and not collection_date_post_validation: - logger.error(f"Failed to parse collection_date for biosample: {biosample.id}") - # Merge other ambiguously named alternate identifier columns # TODO remove the hack to filter out gold from the alternate IDs biosample.alternate_identifiers += filter( From 703a73146817d9e6df4351cec9f663ac64641242 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Fri, 21 Jul 2023 13:44:33 -0400 Subject: [PATCH 3/3] Check for all expected patterns Testing was done on production data 7/21/23 to determine patterns to check. 
--- nmdc_server/ingest/biosample.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nmdc_server/ingest/biosample.py b/nmdc_server/ingest/biosample.py index 590bcd01..4f85e671 100644 --- a/nmdc_server/ingest/biosample.py +++ b/nmdc_server/ingest/biosample.py @@ -50,6 +50,26 @@ def coerce_date(cls, v): def coerce_collection_date(cls, value): # { "has_raw_value": ... } raw_value = value["has_raw_value"] + expected_formats = [ + "%d-%b-%y %I.%M.%S.%f000 %p", + "%y-%m-%dT%I:%M:%S", + "%y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%I:%M:%S", + "%Y-%m-%dT%H:%M:%S", + "%Y-%m-%dT%H:%M:%S%z", + "%y-%m-%d %I:%M:%S", + "%y-%m-%d %H:%M:%S", + "%Y-%m-%d %I:%M:%S", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M:%S%z", + "%Y-%m-%dT%H:%MZ", + ] + for date_format in expected_formats: + try: + dt = datetime.strptime(raw_value, date_format).isoformat() + return dt + except ValueError: + continue if isinstance(raw_value, str) and date_fmt.match(raw_value): return datetime.strptime(raw_value, "%d-%b-%y %I.%M.%S.%f000 %p").isoformat() try: