From fb37b4765e2097788d48a62d2a5fcc26eba86a7e Mon Sep 17 00:00:00 2001 From: naglepuff Date: Thu, 16 May 2024 18:13:37 -0400 Subject: [PATCH 01/28] Add GH action to build berkeley schema images Triggers on pushes to protected branch `berkeley-schema-migration`. Builds images and tags them as related to the migration effort. --- .github/workflows/berkeley-image.yml | 52 ++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/workflows/berkeley-image.yml diff --git a/.github/workflows/berkeley-image.yml b/.github/workflows/berkeley-image.yml new file mode 100644 index 00000000..f225b000 --- /dev/null +++ b/.github/workflows/berkeley-image.yml @@ -0,0 +1,52 @@ +name: build-berkeley-image + +on: + push: + branches: + - main + +env: + IS_ORIGINAL_REPO: ${{ github.repository == 'microbiomedata/nmdc-server' }} + +jobs: + build: + runs-on: ubuntu-latest + + strategy: + matrix: + image: [server, client, worker] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Login to Github Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + ghcr.io/microbiomedata/nmdc-server/${{ matrix.image }} + flavor: | + latest=false + prefix=berkeley + tags: | + type=ref,event=branch + type=raw,value=berkeley + + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: ${{ matrix.image == 'client' && 'web' || '.' 
}} + push: ${{ env.IS_ORIGINAL_REPO }} + file: ${{ matrix.image == 'worker' && 'Dockerfile.worker' || matrix.image == 'client' && 'web/Dockerfile' || 'Dockerfile' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} From 69941cd766293ca2683f7585c5840ccf5299baf5 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Fri, 17 May 2024 10:28:38 -0400 Subject: [PATCH 02/28] Fix bad configuration --- .github/workflows/berkeley-image.yml | 70 ++++++++++++++-------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/.github/workflows/berkeley-image.yml b/.github/workflows/berkeley-image.yml index f225b000..bf461742 100644 --- a/.github/workflows/berkeley-image.yml +++ b/.github/workflows/berkeley-image.yml @@ -3,7 +3,7 @@ name: build-berkeley-image on: push: branches: - - main + - berkeley-schema-migration env: IS_ORIGINAL_REPO: ${{ github.repository == 'microbiomedata/nmdc-server' }} @@ -16,37 +16,37 @@ jobs: matrix: image: [server, client, worker] - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Login to Github Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: | - ghcr.io/microbiomedata/nmdc-server/${{ matrix.image }} - flavor: | - latest=false - prefix=berkeley - tags: | - type=ref,event=branch - type=raw,value=berkeley - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ${{ matrix.image == 'client' && 'web' || '.' 
}} - push: ${{ env.IS_ORIGINAL_REPO }} - file: ${{ matrix.image == 'worker' && 'Dockerfile.worker' || matrix.image == 'client' && 'web/Dockerfile' || 'Dockerfile' }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Login to Github Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: | + ghcr.io/microbiomedata/nmdc-server/${{ matrix.image }} + flavor: | + latest=false + prefix=berkeley + tags: | + type=ref,event=branch + type=raw,value=berkeley + + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: ${{ matrix.image == 'client' && 'web' || '.' }} + push: ${{ env.IS_ORIGINAL_REPO }} + file: ${{ matrix.image == 'worker' && 'Dockerfile.worker' || matrix.image == 'client' && 'web/Dockerfile' || 'Dockerfile' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} From 3300c8406458714dc014f1dd8fe2d5ea831e8922 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Mon, 17 Jun 2024 16:02:51 -0400 Subject: [PATCH 03/28] Remove redundant addition of `berkeley` to tag --- .github/workflows/berkeley-image.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/berkeley-image.yml b/.github/workflows/berkeley-image.yml index bf461742..9aa108b0 100644 --- a/.github/workflows/berkeley-image.yml +++ b/.github/workflows/berkeley-image.yml @@ -40,7 +40,6 @@ jobs: prefix=berkeley tags: | type=ref,event=branch - type=raw,value=berkeley - name: Build and push uses: docker/build-push-action@v5 From d252581bcbad9fc69ecd9c9392c3cd6001ca729f Mon Sep 17 00:00:00 2001 From: naglepuff Date: Mon, 22 Jul 2024 10:46:08 -0400 Subject: [PATCH 04/28] Fix berkeley image tag Previously we were using prefix + the branch name. 
Since the intent of the workflow is to run on only a specific branch, that was redundant. --- .github/workflows/berkeley-image.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/berkeley-image.yml b/.github/workflows/berkeley-image.yml index 9aa108b0..8baf5574 100644 --- a/.github/workflows/berkeley-image.yml +++ b/.github/workflows/berkeley-image.yml @@ -37,9 +37,8 @@ jobs: ghcr.io/microbiomedata/nmdc-server/${{ matrix.image }} flavor: | latest=false - prefix=berkeley tags: | - type=ref,event=branch + type=raw,value=berkeley - name: Build and push uses: docker/build-push-action@v5 From eec7c85a813093adcab35b7d9d6fdac31f0f0a6f Mon Sep 17 00:00:00 2001 From: naglepuff Date: Fri, 5 Jul 2024 17:20:02 -0400 Subject: [PATCH 05/28] Only pass necessary info to `upsert_doi` New slots were added to NMDC schema that breaks our current usage of the function. --- nmdc_server/ingest/study.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/nmdc_server/ingest/study.py b/nmdc_server/ingest/study.py index b77ce4dc..57179130 100644 --- a/nmdc_server/ingest/study.py +++ b/nmdc_server/ingest/study.py @@ -77,7 +77,12 @@ def load(db: Session, cursor: Cursor): doi["doi_value"] = transform_doi(doi.pop("doi_value")) for doi in dois: - upsert_doi(db, **doi) + upsert_doi( + db, + doi_value=doi["doi_value"], + doi_category=doi["doi_category"], + doi_provider=doi.get("doi_provider", ""), + ) new_study = create_study(db, Study(**obj)) if dois: From ef81fa76e50909966c682bf913e9235ec58f5d15 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Fri, 5 Jul 2024 17:23:39 -0400 Subject: [PATCH 06/28] Update connection from sample to study NMDC schema changed the slot that links a Biosample to a study. Previously the relationship was contained in the `part_of` slot on class `Biosample`. That slot has been renamed to `associated_studies`. 
Also, temporarily disable the backup link through omics_processing, as that relationship has changed in a more complex way. --- nmdc_server/ingest/biosample.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nmdc_server/ingest/biosample.py b/nmdc_server/ingest/biosample.py index 4f85e671..c0eb5cc4 100644 --- a/nmdc_server/ingest/biosample.py +++ b/nmdc_server/ingest/biosample.py @@ -108,9 +108,10 @@ def load_biosample(db: Session, obj: Dict[str, Any], omics_processing: Collectio if env_medium: obj["env_medium_id"] = env_medium.id - omics_processing_record = omics_processing.find_one({"has_input": obj["id"]}) - part_of = obj.pop("part_of", None) + part_of = obj.pop("associated_studies", None) if part_of is None: + # omics_processing_record = omics_processing.find_one({"has_input": obj["id"]}) + omics_processing_record = None if omics_processing_record is None: logger.error(f"Could not determine study for biosample {obj['id']}") return From e4efd5b1237555fcdafaf31b2dd3ceb35da87f1d Mon Sep 17 00:00:00 2001 From: naglepuff Date: Fri, 5 Jul 2024 17:49:17 -0400 Subject: [PATCH 07/28] Don't use omics_processing during sample ingest This isn't necessary since `associated_studies` is required and has cardinality of 1..*. 
--- nmdc_server/ingest/all.py | 1 - nmdc_server/ingest/biosample.py | 14 +++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/nmdc_server/ingest/all.py b/nmdc_server/ingest/all.py index 499cbdf5..c8f987ef 100644 --- a/nmdc_server/ingest/all.py +++ b/nmdc_server/ingest/all.py @@ -90,7 +90,6 @@ def load(db: Session, function_limit=None, skip_annotation=False): biosample.load( db, cursor, - omics_processing=mongodb["omics_processing_set"], ) db.commit() diff --git a/nmdc_server/ingest/biosample.py b/nmdc_server/ingest/biosample.py index c0eb5cc4..7cca0c26 100644 --- a/nmdc_server/ingest/biosample.py +++ b/nmdc_server/ingest/biosample.py @@ -92,7 +92,7 @@ def coerce_collection_date(cls, value): return raw_value -def load_biosample(db: Session, obj: Dict[str, Any], omics_processing: Collection): +def load_biosample(db: Session, obj: Dict[str, Any]): logger = get_logger(__name__) env_broad_scale_id = obj.pop("env_broad_scale", {}).get("term", {}).get("id", "") env_broad_scale = db.query(models.EnvoTerm).get(env_broad_scale_id.replace("_", ":")) @@ -110,12 +110,8 @@ def load_biosample(db: Session, obj: Dict[str, Any], omics_processing: Collectio part_of = obj.pop("associated_studies", None) if part_of is None: - # omics_processing_record = omics_processing.find_one({"has_input": obj["id"]}) - omics_processing_record = None - if omics_processing_record is None: - logger.error(f"Could not determine study for biosample {obj['id']}") - return - part_of = omics_processing_record["part_of"] + logger.error(f"Could not determine study for biosample {obj['id']}") + return obj["study_id"] = part_of[0] depth_obj = obj.get("depth", {}) @@ -144,11 +140,11 @@ def load_biosample(db: Session, obj: Dict[str, Any], omics_processing: Collectio db.add(models.Biosample(**biosample.dict())) -def load(db: Session, cursor: Cursor, omics_processing: Collection): +def load(db: Session, cursor: Cursor): logger = get_logger(__name__) for obj in cursor: try: - 
load_biosample(db, obj, omics_processing) + load_biosample(db, obj) except Exception as err: logger.error(f"Error parsing biosample: {err}") logger.error(json.dumps(obj, indent=2, default=str)) From 3a529a3726ed44ab151f006913962d7f5a46b090 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Wed, 10 Jul 2024 17:00:15 -0400 Subject: [PATCH 08/28] Update ingest for data_generation objects Formerly known as omics_procecssing records, there were some small schema tweaks that needed to be reflected in ingest. Note how the process of obtaining input biosamples is simpler since we now only need to query one collection of processes. This is a result of the change in database structure that puts related objects into the same collection. --- nmdc_server/ingest/all.py | 2 +- nmdc_server/ingest/omics_processing.py | 47 ++++++-------------------- 2 files changed, 12 insertions(+), 37 deletions(-) diff --git a/nmdc_server/ingest/all.py b/nmdc_server/ingest/all.py index c8f987ef..1436c12e 100644 --- a/nmdc_server/ingest/all.py +++ b/nmdc_server/ingest/all.py @@ -96,7 +96,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading omics processing...") omics_processing.load( db, - mongodb["omics_processing_set"].find(), + mongodb["data_generation_set"].find(), mongodb, ) db.commit() diff --git a/nmdc_server/ingest/omics_processing.py b/nmdc_server/ingest/omics_processing.py index 8e3ba930..98667115 100644 --- a/nmdc_server/ingest/omics_processing.py +++ b/nmdc_server/ingest/omics_processing.py @@ -20,28 +20,13 @@ date_fmt = re.compile(r"\d\d-[A-Z]+-\d\d \d\d\.\d\d\.\d\d\.\d+ [AP]M") -process_types = [ - "pooling", - "extraction", - "library_preparation", -] - - -collections = { - "biosample": "biosample_set", - "processed_sample": "processed_sample_set", - "extraction": "extraction_set", - "library_preparation": "library_preparation_set", - "pooling": "pooling_set", -} - - omics_types = { "metagenome": "Metagenome", - "metabolomics": "Metabolomics", - 
"proteomics": "Proteomics", + "metabolome": "Metabolomics", + "metaproteome": "Proteomics", "metatranscriptome": "Metatranscriptome", "organic matter characterization": "Organic Matter Characterization", + "nom": "Organic Matter Characterization", } @@ -65,17 +50,11 @@ def is_biosample(object_id, biosample_collection): def find_parent_process(output_id: str, mongodb: Database) -> Optional[dict[str, Any]]: """Given a ProcessedSample ID, find the process (e.g. Extraction) that created it.""" - output_found = False - collections_left = True - while not output_found and collections_left: - for name in process_types: - collection: Collection = mongodb[collections[name]] - query = collection.find({"has_output": output_id}, no_cursor_timeout=True) - result_list = list(query) - if len(result_list): - output_found = True - return result_list[0] - collections_left = False + material_processing_collection: Collection = mongodb["material_processing_set"] + query = material_processing_collection.find({"has_output": output_id}, no_cursor_timeout=True) + result_list = list(query) + if len(result_list): + return result_list[0] return None @@ -117,10 +96,6 @@ def load_omics_processing(db: Session, obj: Dict[str, Any], mongodb: Database, l biosample_input_ids: set[str] = set() for input_id in input_ids: biosample_input_ids.union(get_biosample_input_ids(input_id, mongodb, biosample_input_ids)) - if len(biosample_input_ids) > 1: - logger.error("Processed sample input detected") - logger.error(obj["id"]) - logger.error(biosample_input_ids) obj["biosample_inputs"] = [] biosample_input_objects = [] @@ -133,9 +108,9 @@ def load_omics_processing(db: Session, obj: Dict[str, Any], mongodb: Database, l biosample_input_objects.append(biosample_object) data_objects = obj.pop("has_output", []) - obj["study_id"] = obj.pop("part_of", [None])[0] - raw_omics_type: str = obj["omics_type"]["has_raw_value"] - obj["omics_type"]["has_raw_value"] = omics_types[raw_omics_type.lower()] + obj["study_id"] = 
obj.pop("associated_studies", [None])[0] + obj["analyte_category"] = omics_types[obj["analyte_category"].lower()] + obj["omics_type"] = omics_types[obj["analyte_category"].lower()] omics_processing = models.OmicsProcessing(**OmicsProcessing(**obj).dict()) for biosample_object in biosample_input_objects: From 148d828069a96aa9ac3f35f86a3a2482c8f43141 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Thu, 11 Jul 2024 10:16:21 -0400 Subject: [PATCH 09/28] Use subset of collection for pipeline ingest --- nmdc_server/ingest/all.py | 42 ++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/nmdc_server/ingest/all.py b/nmdc_server/ingest/all.py index 1436c12e..aa3485b3 100644 --- a/nmdc_server/ingest/all.py +++ b/nmdc_server/ingest/all.py @@ -3,6 +3,7 @@ import click from pymongo import MongoClient from pymongo.collection import Collection +from pymongo.cursor import Cursor from sqlalchemy.orm import Session from nmdc_server import models @@ -101,10 +102,21 @@ def load(db: Session, function_limit=None, skip_annotation=False): ) db.commit() + """ + nmdc:ReadQcAnalysis + nmdc:MagsAnalysis + nmdc:MetabolomicsAnalysis + nmdc:MetagenomeSequencing + nmdc:ReadBasedTaxonomyAnalysis + nmdc:MetagenomeAssembly + nmdc:MetagenomeAnnotation + nmdc:NomAnalysis + """ + logger.info("Loading metabolomics analysis...") pipeline.load( db, - mongodb["metabolomics_analysis_activity_set"].find(), + mongodb["workflow_execution_set"].find({"type": "nmdc:MetabolomicsAnalysis"}), pipeline.load_metabolomics_analysis, WorkflowActivityTypeEnum.metabolomics_analysis.value, ) @@ -113,7 +125,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading read based analysis...") pipeline.load( db, - mongodb["read_based_taxonomy_analysis_activity_set"].find(), + mongodb["workflow_execution_set"].find({"type": "nmdc:ReadBasedTaxonomyAnalysis"}), pipeline.load_read_based_analysis, WorkflowActivityTypeEnum.read_based_analysis.value, ) 
@@ -122,7 +134,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metatranscriptome expression analyses...") pipeline.load( db, - mongodb["metatranscriptome_expression_analysis_set"].find(), + mongodb["workflow_execution_set"].find({"type": "nmdc:MetatranscriptomeAnalysis"}), pipeline.load_metatranscriptome, WorkflowActivityTypeEnum.metatranscriptome.value, ) @@ -138,7 +150,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading NOM analysis...") pipeline.load( db, - mongodb["nom_analysis_activity_set"].find(), + mongodb["workflow_execution_set"].find({"type": "nmdc:NomAnalysis"}), pipeline.load_nom_analysis, WorkflowActivityTypeEnum.nom_analysis.value, ) @@ -147,7 +159,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading MAGs...") pipeline.load( db, - mongodb["mags_activity_set"].find(), + mongodb["workflow_execution_set"].find({"type": "nmdc:MagsAnalysis"}), pipeline.load_mags, WorkflowActivityTypeEnum.mags_analysis.value, ) @@ -162,13 +174,14 @@ def load(db: Session, function_limit=None, skip_annotation=False): # This has historically been fast, but it is only for the progress bar. # It can be removed if it becomes slow. 
- count = mongodb["metagenome_annotation_activity_set"].estimated_document_count() - iterator = paginate_cursor( - mongodb["metagenome_annotation_activity_set"], - page_size=1, # prevent cursor from timing out - no_cursor_timeout=True, + annotation_activities = list( + mongodb["workflow_execution_set"].find( + {"type": "nmdc:MetagenomeAnnotation"}, batch_size=100 + ) ) - with click.progressbar(iterator, length=count) as bar: + # TODO test this and make sure it works as expected + # this undoes the pagination that existed before + with click.progressbar(annotation_activities, length=len(annotation_activities)) as bar: pipeline.load( db, bar, @@ -204,7 +217,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading read qc...") pipeline.load( db, - mongodb["read_qc_analysis_activity_set"].find(), + mongodb["workflow_execution_set"].find({"type": "nmdc:ReadQcAnalysis"}), pipeline.load_reads_qc, WorkflowActivityTypeEnum.reads_qc.value, ) @@ -214,7 +227,8 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metaproteomic analysis...") pipeline.load( db, - mongodb["metaproteomics_analysis_activity_set"].find( + mongodb["workflow_execution_set"].find( + {"type": "nmdc:MetaproteomicAnalysis"}, no_cursor_timeout=True, ), pipeline.load_mp_analysis, @@ -231,7 +245,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metagenome assembly...") pipeline.load( db, - mongodb["metagenome_assembly_set"].find(), + mongodb["workflow_execution_set"].find({"type": "nmdc:MetagenomeAssembly"}), pipeline.load_mg_assembly, WorkflowActivityTypeEnum.metagenome_assembly.value, ) From fde9d5adb424eaf9e109318cd5ea56bc5d5ca36f Mon Sep 17 00:00:00 2001 From: naglepuff Date: Thu, 11 Jul 2024 14:00:57 -0400 Subject: [PATCH 10/28] Add default value for `used` Note that this should be updated in the SQL schema to be optional/nullable. 
--- nmdc_server/ingest/all.py | 2 +- nmdc_server/ingest/omics_processing.py | 2 ++ nmdc_server/schemas.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nmdc_server/ingest/all.py b/nmdc_server/ingest/all.py index aa3485b3..cd7d867d 100644 --- a/nmdc_server/ingest/all.py +++ b/nmdc_server/ingest/all.py @@ -228,7 +228,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): pipeline.load( db, mongodb["workflow_execution_set"].find( - {"type": "nmdc:MetaproteomicAnalysis"}, + {"type": "nmdc:MetaproteomicsAnalysis"}, no_cursor_timeout=True, ), pipeline.load_mp_analysis, diff --git a/nmdc_server/ingest/omics_processing.py b/nmdc_server/ingest/omics_processing.py index 98667115..016e1293 100644 --- a/nmdc_server/ingest/omics_processing.py +++ b/nmdc_server/ingest/omics_processing.py @@ -23,7 +23,9 @@ omics_types = { "metagenome": "Metagenome", "metabolome": "Metabolomics", + "metabolomics": "Metabolomics", "metaproteome": "Proteomics", + "proteomics": "Proteomics", "metatranscriptome": "Metatranscriptome", "organic matter characterization": "Organic Matter Characterization", "nom": "Organic Matter Characterization", diff --git a/nmdc_server/schemas.py b/nmdc_server/schemas.py index 8888c07e..61b15fa7 100644 --- a/nmdc_server/schemas.py +++ b/nmdc_server/schemas.py @@ -566,7 +566,7 @@ class MAGsAnalysis(PipelineStep): class NOMAnalysisBase(PipelineStepBase): type: str = WorkflowActivityTypeEnum.nom_analysis.value - used: str + used: str = "" class NOMAnalysis(PipelineStep): @@ -591,7 +591,7 @@ class Metatranscriptome(PipelineStep): class MetabolomicsAnalysisBase(PipelineStepBase): type: str = WorkflowActivityTypeEnum.metabolomics_analysis.value - used: str + used: str = "" has_calibration: str From 6794e28951f6a9c52ee29da8bd256972ce6d56a2 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Mon, 15 Jul 2024 14:24:10 -0400 Subject: [PATCH 11/28] Extract instrument name from instrument collection Note that in the future we might want to make 
instrument a fully fledged model in our database. --- nmdc_server/ingest/omics_processing.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nmdc_server/ingest/omics_processing.py b/nmdc_server/ingest/omics_processing.py index 016e1293..7fe82c7c 100644 --- a/nmdc_server/ingest/omics_processing.py +++ b/nmdc_server/ingest/omics_processing.py @@ -114,6 +114,13 @@ def load_omics_processing(db: Session, obj: Dict[str, Any], mongodb: Database, l obj["analyte_category"] = omics_types[obj["analyte_category"].lower()] obj["omics_type"] = omics_types[obj["analyte_category"].lower()] + # Get instrument name + instrument_id = obj.pop("instrument_used", []) + if instrument_id: + instrument = mongodb["instrument_set"].find_one({"id": instrument_id[0]}) + if instrument: + obj["instrument_name"] = instrument["name"] + omics_processing = models.OmicsProcessing(**OmicsProcessing(**obj).dict()) for biosample_object in biosample_input_objects: # mypy thinks that omics_processing.biosample_inputs is of type Biosample From d7d2219cf19d1094d805f18478d56d482d2ca7e7 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Mon, 15 Jul 2024 17:10:33 -0400 Subject: [PATCH 12/28] Update nmdc workflow names enum --- nmdc_server/data_object_filters.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/nmdc_server/data_object_filters.py b/nmdc_server/data_object_filters.py index faaae9fd..e3da4902 100644 --- a/nmdc_server/data_object_filters.py +++ b/nmdc_server/data_object_filters.py @@ -28,18 +28,16 @@ def get_local_data_url(url: Optional[str]) -> Optional[str]: class WorkflowActivityTypeEnum(Enum): - reads_qc = "nmdc:ReadQCAnalysisActivity" + mags_analysis = "nmdc:MagsAnalysis" + metabolomics_analysis = "nmdc:MetabolomicsAnalysis" metagenome_assembly = "nmdc:MetagenomeAssembly" - metagenome_annotation = "nmdc:MetagenomeAnnotation" # TODO name out of date, fix - metatranscriptome_assembly = "nmdc:MetatranscriptomeAssembly" - metatranscriptome_annotation = 
"nmdc:MetatranscriptomeAnnotation" # TODO name out of date, fix - metaproteomic_analysis = "nmdc:MetaProteomicAnalysis" - mags_analysis = "nmdc:MAGsAnalysisActivity" - read_based_analysis = "nmdc:ReadbasedAnalysis" # TODO name out of date, fix - nom_analysis = "nmdc:NomAnalysisActivity" - metabolomics_analysis = "nmdc:MetabolomicsAnalysisActivity" + metagenome_annotation = "nmdc:MetagenomeAnnotation" + metaproteomic_analysis = "nmdc:MetaproteomicAnalysis" + metatranscriptome = "nmdc:MetatranscriptomeAnalysis" + nom_analysis = "nmdc:NomAnalysis" raw_data = "nmdc:RawData" - metatranscriptome = "nmdc:metaT" + read_based_analysis = "nmdc:ReadBasedTaxonomyAnalysis" + reads_qc = "nmdc:ReadQcAnalysis" @property def model(self): From cdada36c76ab9d3a396d4e6ce096c82a07745fd9 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Wed, 31 Jul 2024 15:09:08 -0400 Subject: [PATCH 13/28] Update metatranscriptome workflow ingest --- nmdc_server/data_object_filters.py | 2 ++ nmdc_server/ingest/all.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/nmdc_server/data_object_filters.py b/nmdc_server/data_object_filters.py index e3da4902..8b7e1992 100644 --- a/nmdc_server/data_object_filters.py +++ b/nmdc_server/data_object_filters.py @@ -34,6 +34,8 @@ class WorkflowActivityTypeEnum(Enum): metagenome_annotation = "nmdc:MetagenomeAnnotation" metaproteomic_analysis = "nmdc:MetaproteomicAnalysis" metatranscriptome = "nmdc:MetatranscriptomeAnalysis" + metatranscriptome_assembly = "nmdc:MetatranscriptomeAssembly" + metatranscriptome_annotation = "nmdc:MetatranscriptomeAnnotation" nom_analysis = "nmdc:NomAnalysis" raw_data = "nmdc:RawData" read_based_analysis = "nmdc:ReadBasedTaxonomyAnalysis" diff --git a/nmdc_server/ingest/all.py b/nmdc_server/ingest/all.py index cd7d867d..d35b3e57 100644 --- a/nmdc_server/ingest/all.py +++ b/nmdc_server/ingest/all.py @@ -142,7 +142,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metatranscriptome 
assemblies...") pipeline.load( db, - mongodb["metatranscriptome_assembly_set"].find(), + mongodb["workflow_execution_set"].find({"type": "nmdc:MetatranscriptomeAssembly"}), pipeline.load_mt_assembly, WorkflowActivityTypeEnum.metatranscriptome_assembly.value, ) @@ -203,7 +203,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metatranscriptome annotation...") pipeline.load( db, - mongodb["metatranscriptome_annotation_set"].find(), + mongodb["workflow_execution_set"].find({"type": "MetatranscriptomeAnnotation"}), pipeline.load_mt_annotation, WorkflowActivityTypeEnum.metatranscriptome_annotation.value, annotations=mongodb["functional_annotation_agg"], From 12767b0acc23c0eee973ed4ab47b55d7161d19b9 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Thu, 1 Aug 2024 14:50:06 -0400 Subject: [PATCH 14/28] Remove unnecessary comment --- nmdc_server/ingest/all.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/nmdc_server/ingest/all.py b/nmdc_server/ingest/all.py index d35b3e57..d173a75e 100644 --- a/nmdc_server/ingest/all.py +++ b/nmdc_server/ingest/all.py @@ -102,17 +102,6 @@ def load(db: Session, function_limit=None, skip_annotation=False): ) db.commit() - """ - nmdc:ReadQcAnalysis - nmdc:MagsAnalysis - nmdc:MetabolomicsAnalysis - nmdc:MetagenomeSequencing - nmdc:ReadBasedTaxonomyAnalysis - nmdc:MetagenomeAssembly - nmdc:MetagenomeAnnotation - nmdc:NomAnalysis - """ - logger.info("Loading metabolomics analysis...") pipeline.load( db, From 43d5b9e8db8d6165c671689701264ad6767c52a8 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Thu, 1 Aug 2024 14:52:14 -0400 Subject: [PATCH 15/28] Remove unused imports --- nmdc_server/ingest/all.py | 1 - nmdc_server/ingest/biosample.py | 1 - 2 files changed, 2 deletions(-) diff --git a/nmdc_server/ingest/all.py b/nmdc_server/ingest/all.py index d173a75e..d4c8056d 100644 --- a/nmdc_server/ingest/all.py +++ b/nmdc_server/ingest/all.py @@ -3,7 +3,6 @@ import click from pymongo import 
MongoClient from pymongo.collection import Collection -from pymongo.cursor import Cursor from sqlalchemy.orm import Session from nmdc_server import models diff --git a/nmdc_server/ingest/biosample.py b/nmdc_server/ingest/biosample.py index 7cca0c26..621640dc 100644 --- a/nmdc_server/ingest/biosample.py +++ b/nmdc_server/ingest/biosample.py @@ -4,7 +4,6 @@ from typing import Any, Dict from pydantic import root_validator, validator -from pymongo.collection import Collection from pymongo.cursor import Cursor from sqlalchemy.orm import Session From a9bccf8533b8647129415556e58c0298a6afd30a Mon Sep 17 00:00:00 2001 From: naglepuff Date: Tue, 6 Aug 2024 17:07:37 -0400 Subject: [PATCH 16/28] Reduce use of raw string values during ingest --- nmdc_server/ingest/all.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/nmdc_server/ingest/all.py b/nmdc_server/ingest/all.py index d4c8056d..f6f42166 100644 --- a/nmdc_server/ingest/all.py +++ b/nmdc_server/ingest/all.py @@ -101,10 +101,12 @@ def load(db: Session, function_limit=None, skip_annotation=False): ) db.commit() + workflow_set = "workflow_execution_set" + logger.info("Loading metabolomics analysis...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "nmdc:MetabolomicsAnalysis"}), + mongodb[workflow_set].find({"type": WorkflowActivityTypeEnum.metabolomics_analysis.value}), pipeline.load_metabolomics_analysis, WorkflowActivityTypeEnum.metabolomics_analysis.value, ) @@ -113,7 +115,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading read based analysis...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "nmdc:ReadBasedTaxonomyAnalysis"}), + mongodb[workflow_set].find({"type": WorkflowActivityTypeEnum.read_based_analysis.value}), pipeline.load_read_based_analysis, WorkflowActivityTypeEnum.read_based_analysis.value, ) @@ -122,7 +124,7 @@ def load(db: Session, function_limit=None, 
skip_annotation=False): logger.info("Loading metatranscriptome expression analyses...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "nmdc:MetatranscriptomeAnalysis"}), + mongodb[workflow_set].find({"type": WorkflowActivityTypeEnum.metatranscriptome.value}), pipeline.load_metatranscriptome, WorkflowActivityTypeEnum.metatranscriptome.value, ) @@ -130,7 +132,9 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metatranscriptome assemblies...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "nmdc:MetatranscriptomeAssembly"}), + mongodb[workflow_set].find( + {"type": WorkflowActivityTypeEnum.metatranscriptome_assembly.value} + ), pipeline.load_mt_assembly, WorkflowActivityTypeEnum.metatranscriptome_assembly.value, ) @@ -138,7 +142,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading NOM analysis...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "nmdc:NomAnalysis"}), + mongodb[workflow_set].find({"type": WorkflowActivityTypeEnum.nom_analysis.value}), pipeline.load_nom_analysis, WorkflowActivityTypeEnum.nom_analysis.value, ) @@ -147,7 +151,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading MAGs...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "nmdc:MagsAnalysis"}), + mongodb[workflow_set].find({"type": WorkflowActivityTypeEnum.mags_analysis.value}), pipeline.load_mags, WorkflowActivityTypeEnum.mags_analysis.value, ) @@ -163,9 +167,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): # This has historically been fast, but it is only for the progress bar. # It can be removed if it becomes slow. 
annotation_activities = list( - mongodb["workflow_execution_set"].find( - {"type": "nmdc:MetagenomeAnnotation"}, batch_size=100 - ) + mongodb[workflow_set].find({"type": "nmdc:MetagenomeAnnotation"}, batch_size=100) ) # TODO test this and make sure it works as expected # this undoes the pagination that existed before @@ -191,7 +193,9 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metatranscriptome annotation...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "MetatranscriptomeAnnotation"}), + mongodb[workflow_set].find( + {"type": WorkflowActivityTypeEnum.metatranscriptome_annotation.value} + ), pipeline.load_mt_annotation, WorkflowActivityTypeEnum.metatranscriptome_annotation.value, annotations=mongodb["functional_annotation_agg"], @@ -205,7 +209,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading read qc...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "nmdc:ReadQcAnalysis"}), + mongodb[workflow_set].find({"type": WorkflowActivityTypeEnum.reads_qc.value}), pipeline.load_reads_qc, WorkflowActivityTypeEnum.reads_qc.value, ) @@ -215,7 +219,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metaproteomic analysis...") pipeline.load( db, - mongodb["workflow_execution_set"].find( + mongodb[workflow_set].find( {"type": "nmdc:MetaproteomicsAnalysis"}, no_cursor_timeout=True, ), @@ -233,7 +237,7 @@ def load(db: Session, function_limit=None, skip_annotation=False): logger.info("Loading metagenome assembly...") pipeline.load( db, - mongodb["workflow_execution_set"].find({"type": "nmdc:MetagenomeAssembly"}), + mongodb[workflow_set].find({"type": WorkflowActivityTypeEnum.metagenome_assembly.value}), pipeline.load_mg_assembly, WorkflowActivityTypeEnum.metagenome_assembly.value, ) From 8e9d032b34ae0b7c1753f019f0db9308a27f658f Mon Sep 17 00:00:00 2001 From: naglepuff Date: Tue, 6 Aug 2024 17:09:45 -0400 
Subject: [PATCH 17/28] Rename variable for clarity --- nmdc_server/ingest/biosample.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nmdc_server/ingest/biosample.py b/nmdc_server/ingest/biosample.py index 621640dc..a570621f 100644 --- a/nmdc_server/ingest/biosample.py +++ b/nmdc_server/ingest/biosample.py @@ -107,12 +107,12 @@ def load_biosample(db: Session, obj: Dict[str, Any]): if env_medium: obj["env_medium_id"] = env_medium.id - part_of = obj.pop("associated_studies", None) - if part_of is None: + associated_studies = obj.pop("associated_studies", None) + if associated_studies is None: logger.error(f"Could not determine study for biosample {obj['id']}") return - obj["study_id"] = part_of[0] + obj["study_id"] = associated_studies[0] depth_obj = obj.get("depth", {}) obj["depth"] = extract_quantity(depth_obj, "biosample", "depth") From 8d5af2ab82b6a31aa5e539f5d79899bbd0382b20 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Fri, 9 Aug 2024 10:11:08 -0400 Subject: [PATCH 18/28] Replace instances of omics with data generation --- web/src/components/FacetedSearch.vue | 2 +- web/src/encoding.ts | 12 ++++++------ web/src/views/Search/SearchLayout.vue | 2 +- web/src/views/Search/SearchSidebar.vue | 8 ++++---- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/web/src/components/FacetedSearch.vue b/web/src/components/FacetedSearch.vue index 7b44c541..892ead48 100644 --- a/web/src/components/FacetedSearch.vue +++ b/web/src/components/FacetedSearch.vue @@ -11,7 +11,7 @@ const groupOrders = [ 'sample', 'gold ecosystems', 'mixs environmental triad', - 'omics processing', + 'data generation', ]; export interface SearchFacet { diff --git a/web/src/encoding.ts b/web/src/encoding.ts index e6860784..a0a0f366 100644 --- a/web/src/encoding.ts +++ b/web/src/encoding.ts @@ -100,9 +100,9 @@ const types: Record = { }, omics_processing: { icon: 'mdi-file-table-box-multiple-outline', - heading: 'Omics Types', + heading: 'Data Types', name: 
'omics_processing', - plural: 'Omics Processing', + plural: 'Data Generations', visible: true, schemaName: 'OmicsProcessing', }, @@ -263,15 +263,15 @@ const fields: Record = { }, instrument_name: { name: 'Instrument Name', - group: 'Omics Processing', + group: 'Data Generation', }, omics_type: { - name: 'Omics Type', - group: 'Omics Processing', + name: 'Data Type', + group: 'Data Generation', }, processing_institution: { name: 'Processing Institution', - group: 'Omics Processing', + group: 'Data Generation', }, /* GOLD ecosystem type */ ecosystem: { diff --git a/web/src/views/Search/SearchLayout.vue b/web/src/views/Search/SearchLayout.vue index bc120fb7..bdb5da0b 100644 --- a/web/src/views/Search/SearchLayout.vue +++ b/web/src/views/Search/SearchLayout.vue @@ -229,7 +229,7 @@ export default defineComponent({ height="30px" > - Omics + Data Type Environment diff --git a/web/src/views/Search/SearchSidebar.vue b/web/src/views/Search/SearchSidebar.vue index a4b79faf..b1fa273e 100644 --- a/web/src/views/Search/SearchSidebar.vue +++ b/web/src/views/Search/SearchSidebar.vue @@ -76,21 +76,21 @@ const FunctionSearchFacets: SearchFacet[] = [ table: 'study', group: 'Study', }, - /** Omics Processing */ + /** Data Generation */ { field: 'instrument_name', table: 'omics_processing', - group: 'Omics Processing', + group: 'Data Generation', }, { field: 'omics_type', table: 'omics_processing', - group: 'Omics Processing', + group: 'Data Generation', }, { field: 'processing_institution', table: 'omics_processing', - group: 'Omics Processing', + group: 'Data Generation', }, ]; From f030cbc862ddfcb448637bed84d8b83703208213 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Mon, 26 Aug 2024 11:07:24 -0400 Subject: [PATCH 19/28] Update to berkeley schema --- pyproject.toml | 2 +- web/package.json | 2 +- web/yarn.lock | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6caa0bff..9e69a55f 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "ipython==7.31.1", "itsdangerous==2.0.1", "mypy<0.920", - "nmdc-schema==10.7.0", + "nmdc-schema==11.0.0rc20", "nmdc-submission-schema==10.7.0", "pint==0.18", "psycopg2==2.9.3", diff --git a/web/package.json b/web/package.json index 51e23c6a..2070cc68 100644 --- a/web/package.json +++ b/web/package.json @@ -34,7 +34,7 @@ "linkify-it": "^4.0.1", "lodash": "^4.17.21", "moment": "^2.29.4", - "nmdc-schema": "https://github.com/microbiomedata/nmdc-schema#v10.7.0", + "nmdc-schema": "https://github.com/microbiomedata/berkeley-schema-fy24", "popper.js": "1.16.1", "protobufjs": "^6.11.3", "serialize-javascript": "^6.0.0", diff --git a/web/yarn.lock b/web/yarn.lock index 0f57a074..30a33316 100644 --- a/web/yarn.lock +++ b/web/yarn.lock @@ -7662,9 +7662,9 @@ nice-try@^1.0.4: resolved "https://registry.yarnpkg.com/nice-try/-/nice-try-1.0.5.tgz#a3378a7696ce7d223e88fc9b764bd7ef1089e366" integrity sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ== -"nmdc-schema@https://github.com/microbiomedata/nmdc-schema#v10.7.0": +"nmdc-schema@https://github.com/microbiomedata/berkeley-schema-fy24": version "0.0.0" - resolved "https://github.com/microbiomedata/nmdc-schema#ad1440736c1e088705a8b09f88727c0e195abfb3" + resolved "https://github.com/microbiomedata/berkeley-schema-fy24#ecaefcf0e9fecfc2c5290736ed0d35e7024ce21e" no-case@^2.2.0: version "2.3.2" From 4ba5eab3b97b3c1a3120433019234c7eeaf7feef Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Thu, 29 Aug 2024 13:21:39 -0700 Subject: [PATCH 20/28] Upgrade to nmdc-submission-schema v11.0.0rc1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9e69a55f..b7a25c1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "itsdangerous==2.0.1", "mypy<0.920", "nmdc-schema==11.0.0rc20", - "nmdc-submission-schema==10.7.0", + 
"nmdc-submission-schema==11.0.0rc1", "pint==0.18", "psycopg2==2.9.3", "pydantic==1.10.2", From 4908d7c76b2b9f86e32b25a3f8740d55f3b9df2c Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Thu, 29 Aug 2024 14:15:44 -0700 Subject: [PATCH 21/28] Handle case where slot annotations are missing --- web/src/components/FacetedSearch.vue | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/src/components/FacetedSearch.vue b/web/src/components/FacetedSearch.vue index c61cdc92..9964bdc1 100644 --- a/web/src/components/FacetedSearch.vue +++ b/web/src/components/FacetedSearch.vue @@ -83,7 +83,7 @@ export default Vue.extend({ goldDescription() { // @ts-ignore const schema = NmdcSchema.slots.gold_path_field; - return schema.annotations.tooltip.value || ''; + return schema.annotations?.tooltip?.value || ''; }, }, methods: { From df146c4c1cafae508f4aa2ab4f30f77c817b16ce Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Thu, 29 Aug 2024 14:16:14 -0700 Subject: [PATCH 22/28] Update ProcessingInstitutionEnum name --- .../views/SubmissionPortal/Components/SubmissionContextForm.vue | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/src/views/SubmissionPortal/Components/SubmissionContextForm.vue b/web/src/views/SubmissionPortal/Components/SubmissionContextForm.vue index 83b4fc21..70edfaf3 100644 --- a/web/src/views/SubmissionPortal/Components/SubmissionContextForm.vue +++ b/web/src/views/SubmissionPortal/Components/SubmissionContextForm.vue @@ -24,7 +24,7 @@ export default defineComponent({ components: { SubmissionContextShippingForm, SubmissionDocsLink, SubmissionPermissionBanner }, setup() { const formRef = ref(); - const facilityEnum = Object.keys(NmdcSchema.enums.processing_institution_enum.permissible_values).filter( + const facilityEnum = Object.keys(NmdcSchema.enums.ProcessingInstitutionEnum.permissible_values).filter( (facility: string) => ['EMSL', 'JGI'].includes(facility), ); const projectAwardValidationRules = () => [(v: string) => { From 
35c7a4ffcbafe001418d7f73d99bdcffc8e06bed Mon Sep 17 00:00:00 2001 From: naglepuff Date: Tue, 1 Oct 2024 17:33:04 -0400 Subject: [PATCH 23/28] Update public-facing API routes for data generation --- nmdc_server/api.py | 39 ++++++++++++++++++++++----------------- web/src/data/api.ts | 10 ++++++---- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/nmdc_server/api.py b/nmdc_server/api.py index 26437194..fbece825 100644 --- a/nmdc_server/api.py +++ b/nmdc_server/api.py @@ -411,13 +411,18 @@ async def get_study_image(study_id: str, db: Session = Depends(get_db)): return StreamingResponse(BytesIO(image), media_type="image/jpeg") -# omics_processing +# data_generation +# Note the intermingling of the terms "data generation" and "omics processing." +# The Berkeley schema (NMDC schema v11) did away with the phrase "omics processing." +# As a result, public-facing uses of "omics processing" should be replaced with +# "data generation." +# Future work should go into a more thorough conversion of omics processing to data generation. 
@router.post( - "/omics_processing/search", + "/data_generation/search", response_model=query.OmicsProcessingSearchResponse, - tags=["omics_processing"], - name="Search for omics processings", - description="Faceted search of omics_processing data.", + tags=["data_generation"], + name="Search for data generations", + description="Faceted search of data_generation data.", ) async def search_omics_processing( query: query.SearchQuery = query.SearchQuery(), @@ -428,9 +433,9 @@ async def search_omics_processing( @router.post( - "/omics_processing/facet", + "/data_generation/facet", response_model=query.FacetResponse, - tags=["omics_processing"], + tags=["data_generation"], name="Get all values of an attribute", ) async def facet_omics_processing(query: query.FacetQuery, db: Session = Depends(get_db)): @@ -438,9 +443,9 @@ async def facet_omics_processing(query: query.FacetQuery, db: Session = Depends( @router.post( - "/omics_processing/binned_facet", + "/data_generation/binned_facet", response_model=query.BinnedFacetResponse, - tags=["omics_processing"], + tags=["data_generation"], name="Get all values of a non-string attribute with binning", ) async def binned_facet_omics_processing( @@ -450,26 +455,26 @@ async def binned_facet_omics_processing( @router.get( - "/omics_processing/{omics_processing_id}", + "/data_generation/{data_generation_id}", response_model=schemas.OmicsProcessing, - tags=["omics_processing"], + tags=["data_generation"], ) -async def get_omics_processing(omics_processing_id: str, db: Session = Depends(get_db)): - db_omics_processing = crud.get_omics_processing(db, omics_processing_id) +async def get_omics_processing(data_generation_id: str, db: Session = Depends(get_db)): + db_omics_processing = crud.get_omics_processing(db, data_generation_id) if db_omics_processing is None: raise HTTPException(status_code=404, detail="OmicsProcessing not found") return db_omics_processing @router.get( - "/omics_processing/{omics_processing_id}/outputs", + 
"/data_generation/{data_generation_id}/outputs", response_model=List[schemas.DataObject], - tags=["omics_processing"], + tags=["data_generation"], ) async def list_omics_processing_data_objects( - omics_processing_id: str, db: Session = Depends(get_db) + data_generation_id: str, db: Session = Depends(get_db) ): - return crud.list_omics_processing_data_objects(db, omics_processing_id).all() + return crud.list_omics_processing_data_objects(db, data_generation_id).all() # data object diff --git a/web/src/data/api.ts b/web/src/data/api.ts index b8a5c5e6..56050e45 100644 --- a/web/src/data/api.ts +++ b/web/src/data/api.ts @@ -402,7 +402,7 @@ async function searchStudy(params: SearchParams) { } async function searchOmicsProcessing(params: SearchParams) { - return _search('omics_processing', params); + return _search('data_generation', params); } async function searchReadsQC(params: SearchParams) { @@ -483,7 +483,7 @@ async function getFacetSummary( field: string, conditions: Condition[], ): Promise { - const path = type; + const path = type === 'omics_processing' ? 'data_generation' : type; const { data } = await client.post<{ facets: Record }>(`${path}/facet`, { conditions, attribute: field, }); @@ -504,7 +504,8 @@ async function getBinnedFacet( numBins: number, resolution: 'day' | 'week' | 'month' | 'year' = 'month', ) { - const { data } = await client.post>(`${table}/binned_facet`, { + const path = table === 'omics_processing' ? 'data_generation' : table; + const { data } = await client.post>(`${path}/binned_facet`, { attribute, conditions, resolution, @@ -588,7 +589,8 @@ async function getDataObjectList( 'metaproteomic_analysis', ]; if (supportedTypes.indexOf(type) >= 0) { - const { data } = await client.get(`${type}/${parentId}/outputs`); + const path = type === 'omics_processing' ? 
'data_generation' : type; + const { data } = await client.get(`${path}/${parentId}/outputs`); return data; } return []; From 542e3bbd6f163264b0a6cc2d81fa7ad0a81b522f Mon Sep 17 00:00:00 2001 From: naglepuff Date: Wed, 2 Oct 2024 13:41:48 -0400 Subject: [PATCH 24/28] Update test with the new endpoint name --- tests/test_app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_app.py b/tests/test_app.py index eb9a6673..69d886d0 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -153,7 +153,7 @@ def test_get_environmental_aggregation(db: Session, client: TestClient): @pytest.mark.parametrize( "endpoint", [ - "omics_processing", + "data_generation", ], ) def test_list_data_objects(db: Session, client: TestClient, endpoint: str): From 036f918d114a944f838d3977fe36ec34fea59eac Mon Sep 17 00:00:00 2001 From: naglepuff Date: Thu, 3 Oct 2024 12:48:20 -0400 Subject: [PATCH 25/28] Translate new protocol slot to postgres column --- nmdc_server/ingest/study.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/nmdc_server/ingest/study.py b/nmdc_server/ingest/study.py index 0cd98006..a51f221c 100644 --- a/nmdc_server/ingest/study.py +++ b/nmdc_server/ingest/study.py @@ -86,6 +86,14 @@ def load(db: Session, cursor: Cursor): doi_provider=doi.get("doi_provider", ""), ) + protocols = obj.pop("protocol_link", []) + relevant_protocols = [] + if protocols: + for protocol in protocols: + if "url" in protocol: + relevant_protocols.append(protocol["url"]) + obj["relevant_protocols"] = relevant_protocols + new_study = create_study(db, Study(**obj)) if dois: for doi in dois: From 50bd664038ae0f93ebc24b83e0170e1cf1922c52 Mon Sep 17 00:00:00 2001 From: naglepuff Date: Thu, 3 Oct 2024 13:44:39 -0400 Subject: [PATCH 26/28] Simplify protocol url extraction --- nmdc_server/ingest/study.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/nmdc_server/ingest/study.py b/nmdc_server/ingest/study.py index a51f221c..b0ab065e 100644 --- 
a/nmdc_server/ingest/study.py +++ b/nmdc_server/ingest/study.py @@ -86,13 +86,9 @@ def load(db: Session, cursor: Cursor): doi_provider=doi.get("doi_provider", ""), ) - protocols = obj.pop("protocol_link", []) - relevant_protocols = [] - if protocols: - for protocol in protocols: - if "url" in protocol: - relevant_protocols.append(protocol["url"]) - obj["relevant_protocols"] = relevant_protocols + protocol_links = obj.pop("protocol_link", None) + if protocol_links: + obj["relevant_protocols"] = [p["url"] for p in protocol_links if "url" in p] new_study = create_study(db, Study(**obj)) if dois: From 8ab870f575fef156c0367ba52f1d2dd018e02d9b Mon Sep 17 00:00:00 2001 From: naglepuff Date: Mon, 7 Oct 2024 14:30:01 -0400 Subject: [PATCH 27/28] Remove custom berkeley image GA action We don't need this image once the Berkeley branch is merged into main. --- .github/workflows/berkeley-image.yml | 50 ---------------------------- 1 file changed, 50 deletions(-) delete mode 100644 .github/workflows/berkeley-image.yml diff --git a/.github/workflows/berkeley-image.yml b/.github/workflows/berkeley-image.yml deleted file mode 100644 index 8baf5574..00000000 --- a/.github/workflows/berkeley-image.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: build-berkeley-image - -on: - push: - branches: - - berkeley-schema-migration - -env: - IS_ORIGINAL_REPO: ${{ github.repository == 'microbiomedata/nmdc-server' }} - -jobs: - build: - runs-on: ubuntu-latest - - strategy: - matrix: - image: [server, client, worker] - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Login to Github Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: | - ghcr.io/microbiomedata/nmdc-server/${{ matrix.image }} - flavor: | - latest=false - tags: | - type=raw,value=berkeley - - - name: Build 
and push - uses: docker/build-push-action@v5 - with: - context: ${{ matrix.image == 'client' && 'web' || '.' }} - push: ${{ env.IS_ORIGINAL_REPO }} - file: ${{ matrix.image == 'worker' && 'Dockerfile.worker' || matrix.image == 'client' && 'web/Dockerfile' || 'Dockerfile' }} - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} From 3fc4284ada48acba4a804c541d2eaadd1e461aa2 Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Tue, 8 Oct 2024 13:36:28 -0700 Subject: [PATCH 28/28] Upgrade to nmdc-schema 11.0.1 --- pyproject.toml | 4 ++-- web/package.json | 2 +- web/yarn.lock | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2e7119c1..d41e497b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,8 +23,8 @@ dependencies = [ "ipython==8.10.0", "itsdangerous==2.0.1", "mypy<0.920", - "nmdc-schema==11.0.0rc20", - "nmdc-submission-schema==11.0.0rc1", + "nmdc-schema==11.0.1", + "nmdc-submission-schema==11.0.0", "nmdc-geoloc-tools==0.1.1", "pint==0.18", "psycopg2==2.9.3", diff --git a/web/package.json b/web/package.json index f1fd5551..3aded21c 100644 --- a/web/package.json +++ b/web/package.json @@ -34,7 +34,7 @@ "linkify-it": "^4.0.1", "lodash": "^4.17.21", "moment": "^2.29.4", - "nmdc-schema": "https://github.com/microbiomedata/berkeley-schema-fy24", + "nmdc-schema": "https://github.com/microbiomedata/nmdc-schema#v11.0.1", "popper.js": "1.16.1", "protobufjs": "^6.11.3", "serialize-javascript": "^6.0.0", diff --git a/web/yarn.lock b/web/yarn.lock index 0d4a74e8..7611d0e6 100644 --- a/web/yarn.lock +++ b/web/yarn.lock @@ -7762,9 +7762,9 @@ nice-try@^1.0.4: resolved "https://registry.yarnpkg.com/nice-try/-/nice-try-1.0.5.tgz#a3378a7696ce7d223e88fc9b764bd7ef1089e366" integrity sha512-1nh45deeb5olNY7eX82BkPO7SSxR5SSYJiPTrTdFUVYwAl8CKMA5N9PjTYkHiRjisVcxcQ1HXdLhx2qxxJzLNQ== -"nmdc-schema@https://github.com/microbiomedata/berkeley-schema-fy24": 
+"nmdc-schema@https://github.com/microbiomedata/nmdc-schema#v11.0.1": version "0.0.0" - resolved "https://github.com/microbiomedata/berkeley-schema-fy24#ecaefcf0e9fecfc2c5290736ed0d35e7024ce21e" + resolved "https://github.com/microbiomedata/nmdc-schema#65510b8efa1398671392dc98934e98aa3952e541" no-case@^2.2.0: version "2.3.2"