Commit 922a3da

Merge pull request #56 from mskcc/develop

Release version 0.4.0

nikhil authored Nov 18, 2022
2 parents 80a428e + 38f8429
Showing 8 changed files with 183 additions and 49 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -73,6 +73,10 @@ To access other endpoints, export the environment variable `BEAGLE_ENDPOINT`.
```
beaglecli run latest-info --request-id requests.txt --completed --output-metadata-only --max-pages
```
+- Return cleaned output metadata for a given request id from the files API
+```
+beaglecli files list --metadata=igoRequestId:13167_C --file-type fastq --all --packaged
+```
Note: Use `requests.txt` as a template for providing multiple request ids

#### Troubleshooting
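Based on the new `apps/cleaning` module added in this commit, the cleaned metadata is written as `<igoRequestId>.csv` and `<igoRequestId>.json` in the working directory; a hypothetical session (the output listing is illustrative only):
```
$ beaglecli files list --metadata=igoRequestId:13167_C --file-type fastq --all --packaged
$ ls
13167_C.csv  13167_C.json
```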
43 changes: 25 additions & 18 deletions apps/access/__init__.py
@@ -18,40 +18,43 @@
def access_commands(arguments, config):
    print('Running ACCESS')

-    request_id, sample_id, apps = get_arguments(arguments)
+    request_id, sample_id, apps, show_all_runs = get_arguments(arguments)
    tags = '{"cmoSampleIds":"%s"}' % sample_id if sample_id else '{"igoRequestId":"%s"}' % request_id
    if arguments.get('link'):
        for (app, app_version) in apps:
            (app_name, directory) = FLAG_TO_APPS[app]
-            operator_run = get_operator_run(app_name, app_version, tags, config)
+            operator_run = get_operator_run(app_name, app_version, tags, config, show_all_runs)
            if operator_run:
                if arguments.get('--single-dir'):
                    if app == "bams":
-                        link_bams_to_single_dir(operator_run, app, request_id, sample_id, arguments, config)
+                        link_bams_to_single_dir(operator_run, app, request_id, sample_id, arguments, config, show_all_runs)
                    else:
                        print("Apps other than bams not supported at this time")
                else:
-                    link_app(operator_run, directory, request_id, sample_id, arguments, config)
+                    link_app(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs)

    if arguments.get('link-patient'):
        for (app, app_version) in apps:
            (app_name, directory) = FLAG_TO_APPS[app]
-            operator_run = get_operator_run(app_name, app_version, tags, config)
+            operator_run = get_operator_run(app_name, app_version, tags, config, show_all_runs)
            if operator_run:
                if(app == "bams"):
-                    link_bams_by_patient_id(operator_run, "bams", request_id, sample_id, arguments, config)
+                    link_bams_by_patient_id(operator_run, "bams", request_id, sample_id, arguments, config, show_all_runs)
                else:
                    link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments,
-                                                               config)
+                                                               config, show_all_runs)

-def get_operator_run(app_name, app_version=None, tags=None, config=None):
+def get_operator_run(app_name, app_version=None, tags=None, config=None, show_all_runs=False):
    latest_operator_run = {
        "tags": tags,
        "status": "COMPLETED",
        "page_size": 1,
        "app_name": app_name
    }

+    if show_all_runs:
+        latest_operator_run.pop("status")
+
    if app_version:
        latest_operator_run["app_version"] = app_version

@@ -75,6 +78,7 @@ def get_arguments(arguments):
    request_id = arguments.get('--request-id')
    sample_id = arguments.get('--sample-id')
    app_tags = arguments.get('--apps')
+    show_all_runs = arguments.get('--all-runs') or False
    if request_id:
        request_id = request_id[0]

@@ -86,16 +90,19 @@ def get_arguments(arguments):
        else:
            apps.append((r[0], None))

-    return request_id, sample_id, apps
+    return request_id, sample_id, apps, show_all_runs


-def get_runs(operator_run_id, config):
+def get_runs(operator_run_id, config, show_all_runs):
    run_params = {
        "operator_run": operator_run_id,
        "page_size": 1000,
        "status": "COMPLETED"
    }

+    if show_all_runs:
+        run_params.pop("status")
+
    response = requests.get(urljoin(config['beagle_endpoint'], config['api']['run']),
                            headers={'Authorization': 'Bearer %s' % config['token']}, params=run_params)

@@ -117,7 +124,7 @@ def get_files_by_run_id(run_id, config):
def get_file_path(file):
    return file["location"][7:]

-def link_app(operator_run, directory, request_id, sample_id, arguments, config):
+def link_app(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
    version = arguments.get("--dir-version") or operator_run["app_version"]
    should_delete = arguments.get("--delete") or False

@@ -126,7 +133,7 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config):
    path = path_without_version / version
    path.mkdir(parents=True, exist_ok=True, mode=0o755)

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
    if not runs:
        return

@@ -156,13 +163,13 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config):
    return "Completed"


-def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config):
+def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
    version = arguments.get("--dir-version") or operator_run["app_version"]
    should_delete = arguments.get("--delete") or False

    path = Path("./") / directory

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
    if not runs:
        return

@@ -199,12 +206,12 @@ def link_single_sample_workflows_by_patient_id(operator_run, directory, request_

    return "Completed"

-def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config):
+def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
    version = arguments.get("--dir-version") or operator_run["app_version"]

    path = Path("./") / directory / ("Project_" + request_id)

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)

    if not runs:
        return
@@ -253,13 +260,13 @@ def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, argu

    return "Completed"

-def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config):
+def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
    version = arguments.get("--dir-version") or operator_run["app_version"]
    should_delete = arguments.get("--delete") or False

    path = Path("./") / directory

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)

    if not runs:
        return
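Every call site above threads the new `show_all_runs` flag into the query builders, where it simply removes the `"status": "COMPLETED"` filter (assuming the Beagle API treats an absent `status` as "any"). A minimal standalone sketch of the pattern, with `build_run_params` as a hypothetical helper name:
```
# Sketch of the --all-runs pattern used throughout this diff: build the
# Beagle query params, then drop the COMPLETED filter when all runs are wanted.
def build_run_params(operator_run_id, show_all_runs=False):
    run_params = {
        "operator_run": operator_run_id,
        "page_size": 1000,
        "status": "COMPLETED",  # default: completed runs only
    }
    if show_all_runs:
        run_params.pop("status")  # no status filter -> every run is returned
    return run_params

assert "status" not in build_run_params("some-run-id", show_all_runs=True)
```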
82 changes: 82 additions & 0 deletions apps/cleaning/__init__.py
@@ -0,0 +1,82 @@
import json
import pandas as pd
from collections import defaultdict
from urllib.parse import urljoin
from pathlib import Path

C_NAMES = [
    "igoRequestId",
    "cmoSampleName",
    "sampleName",
    "sampleClass",
    "cmoPatientId",
    "investigatorSampleId",
    "oncotreeCode",
    "tumorOrNormal",
    "tissueLocation",
    "sampleOrigin",
    "preservation",
    "collectionYear",
    "sex",
    "species",
    "tubeId",
    "cfDNA2dBarcode",
    "baitSet",
    "qcReports",
    "barcodeId",
    "barcodeIndex",
    "libraryIgoId",
    "libraryVolume",
    "libraryConcentrationNgul",
    "dnaInputNg",
    "captureConcentrationNm",
    "captureInputNg",
    "captureName"
]


def _clean_json(data):
    # Collect the metadata record for each file in the API response.
    # TODO: add further checks for format changes.
    results = [result['metadata'] for result in data['results']]
    df = pd.DataFrame(results)
    # Check that all expected columns are present.
    if not set(C_NAMES).issubset(df.columns):
        raise ValueError('missing column names expected in file metadata. Format has changed, or JSON is badly formed.')
    # Subset to the important columns.
    df = df[C_NAMES]
    # Normalize list-valued columns by keeping their first element
    # (guard against there being none, which would break pd.concat).
    bf_list = [i for i in df.columns if isinstance(df[i][0], list)]
    if bf_list:
        cleaned_columns = [df[column].apply(lambda x: x[0] if isinstance(x, list) else x) for column in bf_list]
        df[bf_list] = pd.concat(cleaned_columns, axis=1)
    # Flatten dict-valued columns into their own top-level columns.
    bf_dict = [i for i in df.columns if isinstance(df[i][0], dict)]
    if bf_dict:
        normalized_columns = [pd.json_normalize(df[column]) for column in bf_dict]
        normalized_df = pd.concat(normalized_columns, axis=1)
        df = df.drop(columns=bf_dict)
        df = pd.concat([df, normalized_df], axis=1)
    df = df.drop_duplicates()
    return df


def _write_output(data, out_data):
    # Name the outputs after the request id of the first record.
    out_name = data['results'][0]['metadata']['igoRequestId']
    # csv (drop duplicated columns first)
    out_data = out_data.loc[:, ~out_data.columns.duplicated()].copy()
    out_data.to_csv('{out_name}.csv'.format(out_name=out_name), index=False)
    # json
    out_data = out_data.to_dict('records')
    with open('{out_name}.json'.format(out_name=out_name), 'w') as fout:
        json.dump(out_data, fout, indent=4)


def clean_json_comands(results):
    # parse the raw JSON string returned by the files API
    datain = json.loads(results)
    # clean json
    dataout = _clean_json(datain)
    # write out
    _write_output(datain, dataout)
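
A hypothetical invocation of the new module (the metadata values are invented for illustration; the real input is the files API response as a JSON string):
```
import json
from apps.cleaning import clean_json_comands, C_NAMES

# One fake file record with a placeholder for every expected column;
# the request id matches the README example.
fake_response = {
    "results": [
        {"metadata": {**{name: "placeholder" for name in C_NAMES},
                      "igoRequestId": "13167_C"}}
    ]
}

clean_json_comands(json.dumps(fake_response))
# Writes 13167_C.csv and 13167_C.json, named after igoRequestId.
```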
