dfci · curlup · Oct 18, 2021 · Feb 17, 2022 · Mar 11, 2022 · Apr 1, 2022
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,37 @@
+# Use an official Python 3.8 runtime as a parent image
+FROM python:3.8-slim-buster
+
+# Install bash and bash-completion first so this layer stays cached when only
+# the application source changes; the apt lists are removed in the same layer
+# so they never end up in the image
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends bash bash-completion \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set the working directory in the container to /app
+WORKDIR /app
+
+# Copy the current directory contents into the container at /app
+# (plain COPY: no archive extraction or remote fetch is needed)
+COPY . /app
+
+# Install the package with the dependencies from setup.py, then pin pymongo to
+# 3.8.0. bson and pymongo are uninstalled first so the reinstall starts clean
+# (presumably a standalone bson package would shadow pymongo's own bson module)
+RUN pip install --no-cache-dir . \
+    && pip uninstall -y bson pymongo \
+    && pip install --no-cache-dir pymongo==3.8.0
+
+# Document the port the container is expected to serve on
+EXPOSE 80
+
+# Define environment variables
+# NOTE(review): SECRETS_JSON points into /app, which COPY fills from the build
+# context; keep secrets.json out of the context (.dockerignore) and mount it at
+# runtime instead of baking it into the image
+ENV NAME=MatchEngineV2 \
+    SECRETS_JSON=/app/secrets.json
+
+# Keep the container alive without starting the app, so commands can be run in
+# it with `docker exec`
+CMD ["tail", "-f", "/dev/null"]
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -0,0 +1,12 @@
+version: '3'  # Compose file format version
+services:
+  app:  # the only service defined here, built from the repository's Dockerfile
+    build:
+      context: .  # build context is the directory containing this file
+      dockerfile: Dockerfile
+    ports:
+      - "8000:80"  # publish container port 80 (the port the Dockerfile EXPOSEs) on host port 8000
+    volumes:
+      - .:/app  # bind-mount this directory over /app, the Dockerfile's WORKDIR and copy target
+    environment:
+      - NAME=MatchEngineV2  # same value the Dockerfile sets through ENV NAME
diff --git a/matchengine/config/dfci_config.json b/matchengine/config/dfci_config.json
@@ -3,7 +3,7 @@
   "trial_identifier": "protocol_no",
   "match_trial_link_id": "protocol_no",
   "trial_status_key": {
-    "key_name": null,
+    "key_name": "summary",
     "open_to_accrual_values": ["open to accrual"]
   },
   "ctml_collection_mappings": {
@@ -16,6 +16,10 @@
           "sample_key": "BIRTH_DATE_INT",
           "sample_value": "age_range_to_date_int_query"
         },
+        "AGE_EXPRESSION": {
+          "sample_key": "AGE",
+          "sample_value": "age_expression_query"
+        },
         "ONCOTREE_PRIMARY_DIAGNOSIS": {
           "sample_key": "ONCOTREE_PRIMARY_DIAGNOSIS_NAME",
           "sample_value": "external_file_mapping",
@@ -30,13 +34,16 @@
           "sample_value": "tmb_range_to_query"
         },
         "HER2_STATUS": {
-          "ignore": true
+          "sample_key": "HER2_STATUS",
+          "sample_value": "true_false_map"
         },
         "PR_STATUS": {
-          "ignore": true
+          "sample_key": "PR_STATUS",
+          "sample_value": "true_false_map"
         },
         "ER_STATUS": {
-          "ignore": true
+          "sample_key": "ER_STATUS",
+          "sample_value": "true_false_map"
         },
         "DISEASE_STATUS": {
           "ignore": true
@@ -114,6 +121,14 @@
         "FUSION_PARTNER_HUGO_SYMBOL": {
           "sample_key": "FUSION_PARTNER_HUGO_SYMBOL",
           "sample_value": "nomap"
+        },
+        "MOLECULAR_FUNCTION": {
+          "sample_key": "MUTATION_EFFECT",
+          "sample_value": "molecular_function_map"
+        },
+        "MATCH_ALL": {
+          "sample_key": "MATCH_ALL",
+          "sample_value": "genomic_dummy_map"
         }
       }
     },
@@ -157,7 +172,9 @@
       "UVA_STATUS",
       "LEFT_PARTNER_GENE",
       "RIGHT_PARTNER_GENE",
-      "STRUCTURAL_VARIANT_TYPE"
+      "STRUCTURAL_VARIANT_TYPE",
+      "MOLECULAR_FUNCTION",
+      "MUTATION_EFFECT"
     ],
     "prior_treatments": [
       "DRUG"
@@ -169,13 +186,20 @@
       "MRN",
       "ONCOTREE_PRIMARY_DIAGNOSIS_NAME",
       "TUMOR_MUTATIONAL_BURDEN_PER_MEGABASE",
-      "VITAL_STATUS"
+      "PATIENT_ID",
+      "VITAL_STATUS",
+      "AGE",
+      "HER2_STATUS",
+      "PR_STATUS",
+      "ER_STATUS",
+      "STUDY_ID"
     ],
     "trial": [
       "protocol_no",
       "nct_id",
       "treatment_list",
       "status",
+      "short_title",
       "_summary"
     ]
   },
@@ -275,7 +299,8 @@
       "UVA_STATUS",
       "LEFT_PARTNER_GENE",
       "RIGHT_PARTNER_GENE",
-      "TRUE_HUGO_SYMBOL"
+      "TRUE_HUGO_SYMBOL",
+      "MOLECULAR_FUNCTION"
     ],
     "clinical": [
       "GENDER",
@@ -284,7 +309,12 @@
       "ONCOTREE_PRIMARY_DIAGNOSIS_NAME",
       "TUMOR_MUTATIONAL_BURDEN_PER_MEGABASE",
       "VITAL_STATUS",
-      "BIRTH_DATE_INT"
+      "BIRTH_DATE_INT",
+      "AGE",
+      "HER2_STATUS",
+      "PR_STATUS",
+      "ER_STATUS",
+      "STUDY_ID"
     ],
     "trial_match": [
       "hash",

diff --git a/matchengine/internals/engine.py b/matchengine/internals/engine.py
@@ -143,6 +143,7 @@ def __init__(
             db_secrets_class: str = None,
             report_all_clinical_reasons: bool = False,
             ignore_run_log: bool = False,
+            ignore_report_date: bool = False,
             skip_run_log_entry: bool = False,
             trial_match_collection: str = "trial_match",
             drop: bool = False,
@@ -161,6 +162,7 @@ def __init__(
         self.run_id = uuid.uuid4()
         self.run_log_entries = dict()
         self.ignore_run_log = ignore_run_log
+        self.ignore_report_date = ignore_report_date
         self.skip_run_log_entry = skip_run_log_entry
         self.clinical_run_log_entries = dict()
         self._protocol_nos_param = list(protocol_nos) if protocol_nos is not None else protocol_nos
@@ -514,7 +516,7 @@ async def _async_get_matches_for_trial(self, protocol_no: str) -> Dict[str, List
                         # check if node has any age criteria, to know to check for newly qualifying patients
                         # or patients aging out
                         for k, v in criteria.get('clinical', dict()).items():
-                            if k.lower() == 'age_numerical':
+                            if k.lower() == 'age_numerical' or k.lower() == 'age_expression':
                                 age_criteria.add(v)
                 if self.debug:
                     log.info(f"Query: {query}")
@@ -565,7 +567,7 @@ def _get_clinical_data(self):
         query: Dict = {}
         if self.sample_ids is not None:
             query.update({"SAMPLE_ID": {"$in": list(self.sample_ids)}})
-        projection = {'_id': 1, 'SAMPLE_ID': 1, 'VITAL_STATUS': 1, 'BIRTH_DATE_INT': 1}
+        projection = {'_id': 1, 'SAMPLE_ID': 1, 'VITAL_STATUS': 1, 'BIRTH_DATE_INT': 1, 'AGE': 1}
         if not self.ignore_run_log:
             projection.update({'_updated': 1, 'run_history': 1})
         projection.update({
@@ -598,11 +600,13 @@ def get_clinical_deceased(self) -> Set[ClinicalID]:
                 in self._clinical_data.items()
                 if clinical_data['VITAL_STATUS'] == 'deceased'}
 
+    # use the BIRTH_DATE_INT field, otherwise fall back to the AGE field (an age, not a YYYYMMDD int)
     def get_clinical_birth_dates(self) -> Dict[ClinicalID, int]:
-        return {clinical_id: clinical_data['BIRTH_DATE_INT']
-                for clinical_id, clinical_data
-                in self._clinical_data.items()
-                }
+        """Return one entry per clinical document: BIRTH_DATE_INT when present, else AGE (KeyError if neither)."""
+        return {
+            clinical_id: doc['BIRTH_DATE_INT'] if 'BIRTH_DATE_INT' in doc else doc['AGE']
+            for clinical_id, doc in self._clinical_data.items()
+        }
 
     def get_clinical_ids_from_sample_ids(self) -> Dict[ClinicalID, str]:
         """
@@ -613,7 +617,8 @@ def get_clinical_ids_from_sample_ids(self) -> Dict[ClinicalID, str]:
                     self._clinical_data.items()}
         else:
             return {clinical_id: clinical_data['SAMPLE_ID'] for clinical_id, clinical_data in
-                    self._clinical_data.items() if clinical_data['VITAL_STATUS'] == 'alive'}
+                    self._clinical_data.items() if (clinical_data['VITAL_STATUS'] is not None and clinical_data['VITAL_STATUS'].lower() == 'alive')}
+
 
     def get_trials(self) -> Dict[str, Trial]:
         """

diff --git a/matchengine/internals/load.py b/matchengine/internals/load.py
@@ -5,6 +5,7 @@
 import os
 from argparse import Namespace
 from contextlib import ExitStack
+from typing import List
 
 import yaml
 from bson import json_util
@@ -14,6 +15,12 @@
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger('matchengine')
 
+def load_from_variable(data, data_format='json'):
+    """Open a read-write "matchminer" connection and, for data_format 'json', insert the in-memory trials."""
+    with MongoDBConnection(read_only=False, db="matchminer", async_init=False) as connection:
+        log.info('Adding trial(s) to mongo...')
+        if data_format == 'json':
+            load_from_memory(connection, data)
 
 def load(args: Namespace):
     """
@@ -56,16 +63,32 @@ def load(args: Namespace):
 def load_trials(db_rw, args: Namespace):
-    if args.trial_format == 'json':
-        load_trials_json(args, db_rw)
-    elif args.trial_format == 'yaml':
-        load_trials_yaml(args, db_rw)
+    """Run the trial loader for args.trial_format: 'json', 'yml', or 'yaml' (the spelling used before 'yml')."""
+    loaders = {'json': load_trials_json, 'yml': load_trials_yaml, 'yaml': load_trials_yaml}
+    if args.trial_format in loaders:  # any other format is ignored, as before
+        loaders[args.trial_format](args, db_rw)
 
 
 def load_trials_yaml(args: Namespace, db_rw):
     if os.path.isdir(args.trial):
-        load_dir(args, db_rw, "yaml", args.trial, 'trial')
+        load_dir(args, db_rw, "yml", args.trial, 'trial')  # args.trial is a directory: pass it to load_dir with file type "yml"
     else:
-        load_file(db_rw, 'yaml', args.trial, 'trial')
-
+        load_file(db_rw, 'yml', args.trial, 'trial')  # anything else is passed to load_file as a single "yml" file
+
+def load_from_memory(db_rw, json_list: List[dict]):
+    """Insert trial dicts into db_rw.trial; a lone dict is treated as a one-item list, list items are skipped."""
+    for data in ([json_list] if isinstance(json_list, dict) else json_list):
+        if is_valid_single_json_dict(data):
+            if 'BIRTH_DATE' in data:  # keep the converted date and add its YYYYMMDD integer form
+                data['BIRTH_DATE'] = convert_birthdate(data['BIRTH_DATE'])
+                data['BIRTH_DATE_INT'] = int(data['BIRTH_DATE'].strftime('%Y%m%d'))
+            if 'AGE' in data:
+                data['AGE'] = int(data['AGE'])
+            db_rw.trial.insert_one(data)
+
+def is_valid_single_json_dict(json_dict: dict):
+    """Return False when the parsed JSON value is a list (an array of objects), True otherwise.
+
+    Only lists are rejected; the value is not checked to actually be a dict."""
+    return not isinstance(json_dict, list)
 
 def load_trials_json(args: Namespace, db_rw):
     # load a directory of json files
@@ -108,6 +131,21 @@ def load_trials_json(args: Namespace, db_rw):
 ########################
 # patient data loading
 ########################
+
+def load_clinical_via_api(file_path: str):
+    """Load the CSV at file_path into the 'clinical' collection through a read-write "matchminer" connection."""
+    with MongoDBConnection(read_only=False, db="matchminer", async_init=False) as connection:
+        load_file(connection, 'csv', file_path, 'clinical')
+
+def load_genomic_via_api(file_path: str):
+    """Load the genomic CSV at file_path and link it to clinical docs; RuntimeError if no clinical docs exist."""
+    with MongoDBConnection(read_only=False, db="matchminer", async_init=False) as db_rw, \
+            MongoDBConnection(read_only=True, db="matchminer", async_init=False) as db_ro:
+        if db_ro.clinical.find_one({}, {"_id": 1}) is None:  # existence check without reading the whole collection
+            raise RuntimeError("No clinical documents in db. Please load clinical documents before loading genomic.")
+        load_file(db_rw, 'csv', file_path, 'genomic')
+        map_clinical_to_genomic(db_rw, db_ro)
+
 def load_clinical(db_rw, args: Namespace):
     if args.patient_format == 'json':
 
@@ -135,7 +173,7 @@ def load_genomic(db_rw, db_ro, args: Namespace, ):
 
 
 def map_clinical_to_genomic(db_rw, db_ro):
-    """Ensure that all genomic docs are linked to their corresponding clinical docs by _id"""
+    """Ensure that all genomic docs are linked to their corresponding clinical docs by _id""" 
     clinical_docs = list(db_ro.clinical.find({}, {"_id": 1, "SAMPLE_ID": 1}))
     clinical_dict = dict(zip([i['SAMPLE_ID'] for i in clinical_docs], [i['_id'] for i in clinical_docs]))
 
@@ -169,10 +207,12 @@ def load_file(db_rw, filetype: str, path: str, collection: str):
                     if key == 'BIRTH_DATE':
                         row[key] = convert_birthdate(row[key])
                         row['BIRTH_DATE_INT'] = int(row[key].strftime('%Y%m%d'))
+                    if key == 'AGE':
+                        row[key] = int(row[key])
                 db_rw[collection].insert_one(row)
         else:
             raw_file_data = file_handle.read()
-            if filetype == 'yaml':
+            if filetype == 'yml':
                 data = yaml.safe_load_all(raw_file_data)
                 db_rw[collection].insert_many(data)
             elif filetype == 'json':
@@ -182,6 +222,8 @@ def load_file(db_rw, filetype: str, path: str, collection: str):
                         if key == 'BIRTH_DATE':
                             data[key] = convert_birthdate(data[key])
                             data['BIRTH_DATE_INT'] = int(data[key].strftime('%Y%m%d'))
+                        if key == 'AGE':
+                            data[key] = int(data[key])
                     db_rw[collection].insert_one(data)
 
 

diff --git a/matchengine/internals/query_transform.py b/matchengine/internals/query_transform.py
@@ -82,6 +82,26 @@ def age_range_to_date_int_query(self, **kwargs):
         query_date = current_date + (- relativedelta(years=years, months=months))
         return QueryTransformerResult({sample_key: {operator_map[operator]: int(query_date.strftime('%Y%m%d'))}}, False)
 
+    # straight comparison of a year value to the age field
+    def age_expression_query(self, **kwargs):
+        """Turn an age expression such as ">=18" into a comparison on sample_key.
+
+        Whitespace is ignored, so ">= 18" is read like ">=18". Only the whole-year part is
+        compared: a fraction ("0.083" is how one month is curated) is dropped and a value
+        without digits counts as 0. An unrecognised or missing operator raises KeyError.
+        """
+        sample_key = kwargs['sample_key']
+        trial_value = kwargs['trial_value']
+        operator_map = {"==": "$eq", "<=": "$lte", ">=": "$gte", ">": "$gt", "<": "$lt"}
+        # everything that is not a digit, a dot or whitespace is taken to be the operator
+        operator = ''.join(c for c in trial_value if not (c.isdigit() or c == '.' or c.isspace()))
+        numeric = ''.join(c for c in trial_value if c.isdigit() or c == '.')
+        # "18.5" -> "18", ".5" -> "" (treated as 0)
+        whole_years = numeric.split('.')[0]
+        years = int(whole_years) if whole_years.isdigit() else 0
+        # NOTE(review): truncation makes ">=18.5" behave like ">=18" -- confirm this is intended
+        return QueryTransformerResult({sample_key: {operator_map[operator]: years}}, False)
+
     def nomap(self, **kwargs):
         trial_path = kwargs['trial_path']
         trial_key = kwargs['trial_key']