Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] Pugh lab main [just to compare] #17

Draft
wants to merge 29 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1007557
fix loading genomic data flag, yml format, and matching clinical vita…
mickey-ng Oct 18, 2021
64bebc5
Add extra fields to match and fields to resulting match document
mickey-ng Feb 17, 2022
d121066
Add extra solid tumor mapping, make oncotree diagnosis query case ins…
mickey-ng Mar 11, 2022
763737f
add mutation effect to match document
mickey-ng Apr 1, 2022
718627d
add molecular function mapping, fix bug with oncotree diagnosis case …
mickey-ng Apr 1, 2022
52f57c3
all true false mapping, and query to get all genomic results
mickey-ng Apr 24, 2023
9e62654
fix valid clinical reason subset bug without using pop
mickey-ng Apr 24, 2023
4994641
fix case insensitive oncotree diagnosis match with regex to strict match
mickey-ng Apr 27, 2023
ec98770
rewrite case insensitive match to use or instead of in to pair with r…
mickey-ng May 19, 2023
fa8c8e8
make oncotree primary diagnosis query ignore NA in data
mickey-ng Jun 1, 2023
a376538
CTM-217 remove unknown from inactivating mapping
mickey-ng Jun 6, 2023
526e169
dockerize
artonio Jul 4, 2023
2d5d5af
add functions to load trial from var
artonio Jul 6, 2023
a881e01
add functions to load clinical and genomic via api
artonio Jul 6, 2023
fd14bea
added map_clinical_to_genomic
artonio Jul 12, 2023
e6e7611
add value match for wildtype when recording result
mickey-ng Jul 20, 2023
ece0b8e
Merge pull request #1 from pughlab/genomic_match_all
mickey-ng Jul 21, 2023
042df3a
fix to pass study id from clinical to results
mickey-ng Jul 21, 2023
cb3f772
CTM-289: fix structural variation matching without report date
mickey-ng Aug 2, 2023
a19aae9
Merge pull request #2 from pughlab/genomic_match_all
artonio Aug 3, 2023
b616ade
CTM-293 add short title to result db
mickey-ng Aug 4, 2023
99f5f19
CTM-303 update oncotree definition
mickey-ng Aug 16, 2023
11a84d3
test submodule commit
mickey-ng Aug 23, 2023
887cded
test upload
mickey-ng Aug 30, 2023
a2141f6
CTM-284 age expression query
mickey-ng Sep 5, 2023
510d1b3
test update
mickey-ng Sep 5, 2023
7ded6f2
fix age expression to use correct field in clinical data
mickey-ng Sep 5, 2023
860240a
fix age loading as string instead of int
mickey-ng Sep 5, 2023
71ea8dd
fix age matching query transform
mickey-ng Sep 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Use an official Python 3.8 runtime as a parent image
FROM python:3.8-slim-buster

# Set the working directory in the container to /app
WORKDIR /app

# Add the current directory contents into the container at /app
ADD . /app

# Install the project found in /app together with the dependencies it declares
RUN pip install .

# Install bash and bash-completion
RUN apt-get update && apt-get install -y bash bash-completion

# Declare that the container listens on port 80 (EXPOSE is documentation only;
# the port is actually published by the runtime, e.g. a compose `ports:` entry)
EXPOSE 80

# Define environment variables
ENV NAME MatchEngineV2
ENV SECRETS_JSON /app/secrets.json

# Remove whatever bson/pymongo the install above brought in, then pin pymongo 3.8.0.
# NOTE(review): presumably the standalone `bson` package clashes with the bson
# module bundled inside pymongo - confirm before reordering these steps.
RUN pip uninstall bson -y
RUN pip uninstall pymongo -y
RUN pip install pymongo==3.8.0

# Keep the container alive without starting any application process;
# commands are expected to be run inside it separately
CMD tail -f /dev/null
12 changes: 12 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
version: '3'
services:
app:
build:
context: .
dockerfile: Dockerfile
ports:
- "8000:80"
volumes:
- .:/app
environment:
- NAME=MatchEngineV2
46 changes: 38 additions & 8 deletions matchengine/config/dfci_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"trial_identifier": "protocol_no",
"match_trial_link_id": "protocol_no",
"trial_status_key": {
"key_name": null,
"key_name": "summary",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of the changes to this file can be incorporated without issues. The one exception is the "trial_status_key," which determines how we decide if trials are open or closed; that may be something we need to keep separate for PMATCH.

"open_to_accrual_values": ["open to accrual"]
},
"ctml_collection_mappings": {
Expand All @@ -16,6 +16,10 @@
"sample_key": "BIRTH_DATE_INT",
"sample_value": "age_range_to_date_int_query"
},
"AGE_EXPRESSION": {
"sample_key": "AGE",
"sample_value": "age_expression_query"
},
"ONCOTREE_PRIMARY_DIAGNOSIS": {
"sample_key": "ONCOTREE_PRIMARY_DIAGNOSIS_NAME",
"sample_value": "external_file_mapping",
Expand All @@ -30,13 +34,16 @@
"sample_value": "tmb_range_to_query"
},
"HER2_STATUS": {
"ignore": true
"sample_key": "HER2_STATUS",
"sample_value": "true_false_map"
},
"PR_STATUS": {
"ignore": true
"sample_key": "PR_STATUS",
"sample_value": "true_false_map"
},
"ER_STATUS": {
"ignore": true
"sample_key": "ER_STATUS",
"sample_value": "true_false_map"
},
"DISEASE_STATUS": {
"ignore": true
Expand Down Expand Up @@ -114,6 +121,14 @@
"FUSION_PARTNER_HUGO_SYMBOL": {
"sample_key": "FUSION_PARTNER_HUGO_SYMBOL",
"sample_value": "nomap"
},
"MOLECULAR_FUNCTION": {
"sample_key": "MUTATION_EFFECT",
"sample_value": "molecular_function_map"
},
"MATCH_ALL": {
"sample_key": "MATCH_ALL",
"sample_value": "genomic_dummy_map"
}
}
},
Expand Down Expand Up @@ -157,7 +172,9 @@
"UVA_STATUS",
"LEFT_PARTNER_GENE",
"RIGHT_PARTNER_GENE",
"STRUCTURAL_VARIANT_TYPE"
"STRUCTURAL_VARIANT_TYPE",
"MOLECULAR_FUNCTION",
"MUTATION_EFFECT"
],
"prior_treatments": [
"DRUG"
Expand All @@ -169,13 +186,20 @@
"MRN",
"ONCOTREE_PRIMARY_DIAGNOSIS_NAME",
"TUMOR_MUTATIONAL_BURDEN_PER_MEGABASE",
"VITAL_STATUS"
"PATIENT_ID",
"VITAL_STATUS",
"AGE",
"HER2_STATUS",
"PR_STATUS",
"ER_STATUS",
"STUDY_ID"
],
"trial": [
"protocol_no",
"nct_id",
"treatment_list",
"status",
"short_title",
"_summary"
]
},
Expand Down Expand Up @@ -275,7 +299,8 @@
"UVA_STATUS",
"LEFT_PARTNER_GENE",
"RIGHT_PARTNER_GENE",
"TRUE_HUGO_SYMBOL"
"TRUE_HUGO_SYMBOL",
"MOLECULAR_FUNCTION"
],
"clinical": [
"GENDER",
Expand All @@ -284,7 +309,12 @@
"ONCOTREE_PRIMARY_DIAGNOSIS_NAME",
"TUMOR_MUTATIONAL_BURDEN_PER_MEGABASE",
"VITAL_STATUS",
"BIRTH_DATE_INT"
"BIRTH_DATE_INT",
"AGE",
"HER2_STATUS",
"PR_STATUS",
"ER_STATUS",
"STUDY_ID"
],
"trial_match": [
"hash",
Expand Down
19 changes: 12 additions & 7 deletions matchengine/internals/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def __init__(
db_secrets_class: str = None,
report_all_clinical_reasons: bool = False,
ignore_run_log: bool = False,
ignore_report_date: bool = False,
skip_run_log_entry: bool = False,
trial_match_collection: str = "trial_match",
drop: bool = False,
Expand All @@ -161,6 +162,7 @@ def __init__(
self.run_id = uuid.uuid4()
self.run_log_entries = dict()
self.ignore_run_log = ignore_run_log
self.ignore_report_date = ignore_report_date
self.skip_run_log_entry = skip_run_log_entry
self.clinical_run_log_entries = dict()
self._protocol_nos_param = list(protocol_nos) if protocol_nos is not None else protocol_nos
Expand Down Expand Up @@ -514,7 +516,7 @@ async def _async_get_matches_for_trial(self, protocol_no: str) -> Dict[str, List
# check if node has any age criteria, to know to check for newly qualifying patients
# or patients aging out
for k, v in criteria.get('clinical', dict()).items():
if k.lower() == 'age_numerical':
if k.lower() == 'age_numerical' or k.lower() == 'age_expression':
age_criteria.add(v)
if self.debug:
log.info(f"Query: {query}")
Expand Down Expand Up @@ -565,7 +567,7 @@ def _get_clinical_data(self):
query: Dict = {}
if self.sample_ids is not None:
query.update({"SAMPLE_ID": {"$in": list(self.sample_ids)}})
projection = {'_id': 1, 'SAMPLE_ID': 1, 'VITAL_STATUS': 1, 'BIRTH_DATE_INT': 1}
projection = {'_id': 1, 'SAMPLE_ID': 1, 'VITAL_STATUS': 1, 'BIRTH_DATE_INT': 1, 'AGE': 1}
if not self.ignore_run_log:
projection.update({'_updated': 1, 'run_history': 1})
projection.update({
Expand Down Expand Up @@ -598,11 +600,13 @@ def get_clinical_deceased(self) -> Set[ClinicalID]:
in self._clinical_data.items()
if clinical_data['VITAL_STATUS'] == 'deceased'}

def get_clinical_birth_dates(self) -> Dict[ClinicalID, int]:
    """
    Map every loaded clinical document to its birth-date value.

    BIRTH_DATE_INT is used when the document has it; otherwise the AGE field
    is returned in its place. All entries of ``self._clinical_data`` are
    reported (an empty dict is returned when there are none).

    :return: dict of clinical id -> BIRTH_DATE_INT, or AGE as a fallback
    :raises KeyError: if a document has neither BIRTH_DATE_INT nor AGE
    """
    birth_dates = {}
    for clinical_id, clinical_data in self._clinical_data.items():
        # Accumulate instead of returning inside the loop, which would stop
        # after the first patient and yield None for an empty data set.
        if 'BIRTH_DATE_INT' in clinical_data:
            birth_dates[clinical_id] = clinical_data['BIRTH_DATE_INT']
        else:
            birth_dates[clinical_id] = clinical_data['AGE']
    return birth_dates

def get_clinical_ids_from_sample_ids(self) -> Dict[ClinicalID, str]:
"""
Expand All @@ -613,7 +617,8 @@ def get_clinical_ids_from_sample_ids(self) -> Dict[ClinicalID, str]:
self._clinical_data.items()}
else:
return {clinical_id: clinical_data['SAMPLE_ID'] for clinical_id, clinical_data in
self._clinical_data.items() if clinical_data['VITAL_STATUS'] == 'alive'}
self._clinical_data.items() if (clinical_data['VITAL_STATUS'] is not None and clinical_data['VITAL_STATUS'].lower() == 'alive')}
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch, we should have that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new ME version moves this logic around a bit. Now it ignores users with VITAL_STATUS "deceased," rather than including only users with VITAL_STATUS "alive" (i.e. everyone is alive by default).



def get_trials(self) -> Dict[str, Trial]:
"""
Expand Down
54 changes: 48 additions & 6 deletions matchengine/internals/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
from argparse import Namespace
from contextlib import ExitStack
from typing import List

import yaml
from bson import json_util
Expand All @@ -14,6 +15,12 @@
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('matchengine')

def load_from_variable(data, data_format='json'):
    """
    Load trial data that is already in memory into the "matchminer" database.

    A read/write MongoDB connection is opened for the duration of the call.
    Only ``data_format == 'json'`` triggers a load; any other format opens
    the connection, logs, and returns without writing anything.

    :param data: passed straight to load_from_memory
    :param data_format: format tag of ``data``; defaults to 'json'
    """
    with MongoDBConnection(read_only=False, db="matchminer", async_init=False) as db_rw:
        log.info('Adding trial(s) to mongo...')
        if data_format != 'json':
            return
        load_from_memory(db_rw, data)

def load(args: Namespace):
"""
Expand Down Expand Up @@ -56,16 +63,32 @@ def load(args: Namespace):
def load_trials(db_rw, args: Namespace):
    """
    Dispatch trial loading according to ``args.trial_format``.

    Both 'yml' and the previously used 'yaml' spelling select the YAML
    loader, so existing callers keep working. Any other format is ignored.

    :param db_rw: read/write database handle forwarded to the loader
    :param args: parsed command-line arguments; ``trial_format`` is read here
    """
    if args.trial_format == 'json':
        load_trials_json(args, db_rw)
    elif args.trial_format in ('yaml', 'yml'):
        load_trials_yaml(args, db_rw)


def load_trials_yaml(args: Namespace, db_rw):
    """
    Load YAML trial data from ``args.trial`` into the 'trial' collection.

    ``args.trial`` may be a directory (handled by load_dir) or a single file
    (handled by load_file). Each is loaded exactly once, using the 'yml'
    file-type tag.

    :param args: parsed command-line arguments; ``trial`` is the path to load
    :param db_rw: read/write database handle
    """
    # NOTE(review): 'yml' is assumed to be the tag load_dir/load_file expect
    # after the yaml -> yml rename - confirm against their definitions.
    if os.path.isdir(args.trial):
        load_dir(args, db_rw, "yml", args.trial, 'trial')
    else:
        load_file(db_rw, 'yml', args.trial, 'trial')

def load_from_memory(db_rw, json_list: List[dict]):
    """
    Insert each in-memory document into the ``trial`` collection.

    Entries rejected by is_valid_single_json_dict are skipped. Before a
    document is inserted, a BIRTH_DATE field is converted with
    convert_birthdate and mirrored into BIRTH_DATE_INT (YYYYMMDD as an int),
    and an AGE field is coerced to int. Documents are modified in place.

    :param db_rw: read/write database handle exposing a ``trial`` collection
    :param json_list: documents to insert
    """
    for document in json_list:
        if not is_valid_single_json_dict(document):
            continue
        # Walk a snapshot of the keys: BIRTH_DATE_INT may be added on the way.
        for field in tuple(document.keys()):
            if field == 'BIRTH_DATE':
                birth_date = convert_birthdate(document[field])
                document[field] = birth_date
                document['BIRTH_DATE_INT'] = int(birth_date.strftime('%Y%m%d'))
            if field == 'AGE':
                document[field] = int(document[field])
        db_rw.trial.insert_one(document)

def is_valid_single_json_dict(json_dict: dict):
    """
    Report whether a parsed JSON value is a single object rather than an array.

    :param json_dict: an already-parsed JSON value
    :return: False when the value is a list (or a list subclass), True otherwise
    """
    # isinstance also rejects list subclasses, which an exact class
    # comparison would let through.
    return not isinstance(json_dict, list)

def load_trials_json(args: Namespace, db_rw):
# load a directory of json files
Expand Down Expand Up @@ -108,6 +131,21 @@ def load_trials_json(args: Namespace, db_rw):
########################
# patient data loading
########################

def load_clinical_via_api(file_path: str):
    """
    Load a clinical CSV file into the 'clinical' collection.

    Opens a read/write connection to the "matchminer" database for the
    duration of the call and hands the file to load_file.

    :param file_path: path of the CSV file to load
    """
    connection = MongoDBConnection(read_only=False, db="matchminer", async_init=False)
    with connection as db_rw:
        load_file(db_rw, 'csv', file_path, 'clinical')

def load_genomic_via_api(file_path: str):
    """
    Load a genomic CSV file and link its rows to existing clinical documents.

    Opens a read/write and a read-only connection to the "matchminer"
    database, loads the file into the 'genomic' collection, then runs
    map_clinical_to_genomic.

    :param file_path: path of the CSV file to load
    :raises RuntimeError: if the clinical collection is empty
    """
    with ExitStack() as stack:
        db_rw = stack.enter_context(MongoDBConnection(read_only=False, db="matchminer", async_init=False))
        db_ro = stack.enter_context(MongoDBConnection(read_only=True, db="matchminer", async_init=False))
        # Fetch at most one document (id only) to test for emptiness instead
        # of materialising the whole clinical collection in memory.
        if db_ro.clinical.find_one({}, {"_id": 1}) is None:
            raise RuntimeError("No clinical documents in db. Please load clinical documents before loading genomic.")
        load_file(db_rw, 'csv', file_path, 'genomic')
        map_clinical_to_genomic(db_rw, db_ro)

def load_clinical(db_rw, args: Namespace):
if args.patient_format == 'json':

Expand Down Expand Up @@ -135,7 +173,7 @@ def load_genomic(db_rw, db_ro, args: Namespace, ):


def map_clinical_to_genomic(db_rw, db_ro):
"""Ensure that all genomic docs are linked to their corresponding clinical docs by _id"""
"""Ensure that all genomic docs are linked to their corresponding clinical docs by _id"""
clinical_docs = list(db_ro.clinical.find({}, {"_id": 1, "SAMPLE_ID": 1}))
clinical_dict = dict(zip([i['SAMPLE_ID'] for i in clinical_docs], [i['_id'] for i in clinical_docs]))

Expand Down Expand Up @@ -169,10 +207,12 @@ def load_file(db_rw, filetype: str, path: str, collection: str):
if key == 'BIRTH_DATE':
row[key] = convert_birthdate(row[key])
row['BIRTH_DATE_INT'] = int(row[key].strftime('%Y%m%d'))
if key == 'AGE':
row[key] = int(row[key])
db_rw[collection].insert_one(row)
else:
raw_file_data = file_handle.read()
if filetype == 'yaml':
if filetype == 'yml':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just a bugfix that we can incorporate.

data = yaml.safe_load_all(raw_file_data)
db_rw[collection].insert_many(data)
elif filetype == 'json':
Expand All @@ -182,6 +222,8 @@ def load_file(db_rw, filetype: str, path: str, collection: str):
if key == 'BIRTH_DATE':
data[key] = convert_birthdate(data[key])
data['BIRTH_DATE_INT'] = int(data[key].strftime('%Y%m%d'))
if key == 'AGE':
data[key] = int(data[key])
db_rw[collection].insert_one(data)


Expand Down
20 changes: 20 additions & 0 deletions matchengine/internals/query_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,26 @@ def age_range_to_date_int_query(self, **kwargs):
query_date = current_date + (- relativedelta(years=years, months=months))
return QueryTransformerResult({sample_key: {operator_map[operator]: int(query_date.strftime('%Y%m%d'))}}, False)

# Direct comparison of a curated age expression against the age field.
def age_expression_query(self, **kwargs):
    """
    Translate a curated age expression such as ">=18" into a query on the
    sample's age field.

    The expression is split into a comparison operator and a number.
    Whitespace is ignored, so ">= 18" is accepted. Only the whole-year part
    of the number is used: any fractional part is discarded and a missing
    integer part counts as 0 (e.g. "<.5" compares against 0).

    :param kwargs: must contain ``sample_key`` (field to query) and
        ``trial_value`` (the curated expression)
    :return: QueryTransformerResult wrapping ``{sample_key: {<op>: years}}``,
        with False as its second argument
    :raises KeyError: if the operator is not one of ==, <=, >=, >, <
    """
    sample_key = kwargs['sample_key']
    trial_value = kwargs['trial_value']
    operator_map = {
        "==": "$eq",
        "<=": "$lte",
        ">=": "$gte",
        ">": "$gt",
        "<": "$lt"
    }
    # Everything that is neither part of the number nor whitespace is the operator.
    operator = ''.join(
        ch for ch in trial_value
        if not ch.isdigit() and ch != '.' and not ch.isspace()
    )
    numeric = ''.join(ch for ch in trial_value if ch.isdigit() or ch == '.')
    if numeric.startswith('.'):
        numeric = '0' + numeric
    # Keep only the integer part; fractions of a year are not compared.
    whole_years = numeric.split('.')[0]
    years = int(whole_years) if whole_years.isdigit() else 0
    return QueryTransformerResult({sample_key: {operator_map[operator]: years}}, False)

def nomap(self, **kwargs):
trial_path = kwargs['trial_path']
trial_key = kwargs['trial_key']
Expand Down
Loading