diff --git a/README.md b/README.md index 1d8afe2..c02fadc 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,10 @@ To access other endpoints, export the environment variable `BEAGLE_ENDPOINT`. ``` beaglecli run latest-info --request-id requests.txt --completed --output-metadata-only --max-pages ``` +- Return and clean output metadata for a given request id from files api + ``` + beaglecli files list --metadata=igoRequestId:13167_C --file-type fastq --all --packaged + ``` Note: Use `requests.txt` as a template for providing a multiple request ids #### Troubleshooting diff --git a/apps/access/__init__.py b/apps/access/__init__.py index 32c492d..59262f0 100644 --- a/apps/access/__init__.py +++ b/apps/access/__init__.py @@ -18,33 +18,33 @@ def access_commands(arguments, config): print('Running ACCESS') - request_id, sample_id, apps = get_arguments(arguments) + request_id, sample_id, apps, show_all_runs = get_arguments(arguments) tags = '{"cmoSampleIds":"%s"}' % sample_id if sample_id else '{"igoRequestId":"%s"}' % request_id if arguments.get('link'): for (app, app_version) in apps: (app_name, directory) = FLAG_TO_APPS[app] - operator_run = get_operator_run(app_name, app_version, tags, config) + operator_run = get_operator_run(app_name, app_version, tags, config, show_all_runs) if operator_run: if arguments.get('--single-dir'): if app == "bams": - link_bams_to_single_dir(operator_run, app, request_id, sample_id, arguments, config) + link_bams_to_single_dir(operator_run, app, request_id, sample_id, arguments, config, show_all_runs) else: print("Apps other than bams not supported at this time") else: - link_app(operator_run, directory, request_id, sample_id, arguments, config) + link_app(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs) if arguments.get('link-patient'): for (app, app_version) in apps: (app_name, directory) = FLAG_TO_APPS[app] - operator_run = get_operator_run(app_name, app_version, tags, config) + operator_run = get_operator_run(app_name, app_version, tags, config, show_all_runs) if operator_run: if(app == "bams"): - link_bams_by_patient_id(operator_run, "bams", request_id, sample_id, arguments, config) + link_bams_by_patient_id(operator_run, "bams", request_id, sample_id, arguments, config, show_all_runs) else: link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, - config) + config, show_all_runs) -def get_operator_run(app_name, app_version=None, tags=None, config=None): +def get_operator_run(app_name, app_version=None, tags=None, config=None, show_all_runs=False): latest_operator_run = { "tags": tags, "status": "COMPLETED", @@ -52,6 +52,9 @@ def get_operator_run(app_name, app_version=None, tags=None, config=None): "app_name": app_name } + if show_all_runs: + latest_operator_run.pop("status") + if app_version: latest_operator_run["app_version"] = app_version @@ -75,6 +78,7 @@ def get_arguments(arguments): request_id = arguments.get('--request-id') sample_id = arguments.get('--sample-id') app_tags = arguments.get('--apps') + show_all_runs = arguments.get('--all-runs') or False if request_id: request_id = request_id[0] @@ -86,16 +90,19 @@ def get_arguments(arguments): else: apps.append((r[0], None)) - return request_id, sample_id, apps + return request_id, sample_id, apps, show_all_runs -def get_runs(operator_run_id, config): +def get_runs(operator_run_id, config, show_all_runs): run_params = { "operator_run": operator_run_id, "page_size": 1000, "status": "COMPLETED" } + if show_all_runs: + run_params.pop("status") + response = requests.get(urljoin(config['beagle_endpoint'], config['api']['run']), headers={'Authorization': 'Bearer %s' % config['token']}, params=run_params) @@ -117,7 +124,7 @@ def get_files_by_run_id(run_id, config): def get_file_path(file): return file["location"][7:] -def link_app(operator_run, directory, request_id, sample_id, arguments, config): +def link_app(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs): version = arguments.get("--dir-version") or operator_run["app_version"] should_delete = arguments.get("--delete") or False @@ -126,7 +133,7 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config): path = path_without_version / version path.mkdir(parents=True, exist_ok=True, mode=0o755) - runs = get_runs(operator_run["id"], config) + runs = get_runs(operator_run["id"], config, show_all_runs) if not runs: return @@ -156,13 +163,13 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config): return "Completed" -def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config): +def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs): version = arguments.get("--dir-version") or operator_run["app_version"] should_delete = arguments.get("--delete") or False path = Path("./") / directory - runs = get_runs(operator_run["id"], config) + runs = get_runs(operator_run["id"], config, show_all_runs) if not runs: return @@ -199,12 +206,12 @@ def link_single_sample_workflows_by_patient_id(operator_run, directory, request_ return "Completed" -def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config): +def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs): version = arguments.get("--dir-version") or operator_run["app_version"] path = Path("./") / directory / ("Project_" + request_id) - runs = get_runs(operator_run["id"], config) + runs = get_runs(operator_run["id"], config, show_all_runs) if not runs: return @@ -253,13 +260,13 @@ def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, argu return "Completed" -def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config): +def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs): version = arguments.get("--dir-version") or operator_run["app_version"] should_delete = arguments.get("--delete") or False path = Path("./") / directory - runs = get_runs(operator_run["id"], config) + runs = get_runs(operator_run["id"], config, show_all_runs) if not runs: return diff --git a/apps/cleaning/__init__.py b/apps/cleaning/__init__.py new file mode 100644 index 0000000..ce836c3 --- /dev/null +++ b/apps/cleaning/__init__.py @@ -0,0 +1,82 @@ +import json +import pandas as pd +from collections import defaultdict +from urllib.parse import urljoin +from pathlib import Path + +C_NAMES = [ + "igoRequestId", + "cmoSampleName", + "sampleName", + "sampleClass", + "cmoPatientId", + "investigatorSampleId", + "oncotreeCode", + "tumorOrNormal", + "tissueLocation", + "sampleOrigin", + "preservation", + "collectionYear", + "sex", + "species", + "tubeId", + "cfDNA2dBarcode", + "baitSet", + "qcReports", + "barcodeId", + "barcodeIndex", + "libraryIgoId", + "libraryVolume", + "libraryConcentrationNgul", + "dnaInputNg", + "captureConcentrationNm", + "captureInputNg", + "captureName" +] + + +def _clean_json(data): + # Iterating through the json + # list + # should do other check for format changes + results = [] + for i in range(0,len(data["results"])): + results.append(data['results'][i]['metadata']) + df = pd.DataFrame(results) + # check all columns are present + if not set(C_NAMES).issubset(df.columns): + ValueError('missing column names expected in file metadata. Format has changed, or JSON is badly formed.') + # subset to important columns + df = df[C_NAMES] + # normalize columns + bf_list = [i for i in df.columns if isinstance(df[i][0],list)] + cleaned_columns = [df[column].apply(lambda x: x[0] if isinstance(x, list) else x) for column in bf_list] + df[bf_list] = pd.concat(cleaned_columns, axis=1) + bf_dict = [i for i in df.columns if isinstance(df[i][0],dict)] + normalized_columns = [pd.json_normalize(df[column]) for column in bf_dict] + normalized_df = pd.concat(normalized_columns, axis=1) + df = df.drop(columns=bf_dict) + df = pd.concat([df, normalized_df], axis=1) + df = df.drop_duplicates() + return df + +def _write_output(data, out_data): + # csv + out_name = data['results'][0]['metadata']['igoRequestId'] + out_data = out_data.loc[:,~out_data.columns.duplicated()].copy() + out_data.to_csv('{out_name}.csv'.format(out_name=out_name), index=False) + # json + out_data = out_data.to_dict('records') + with open('{out_name}.json'.format(out_name=out_name), 'w') as fout: + json.dump(out_data , fout, indent=4) + + + +def clean_json_comands(results): + # get args + datain=json.loads(results) + # clean json + dataout = _clean_json(datain) + # write out + _write_output(datain, dataout) + \ No newline at end of file diff --git a/apps/cmoch/__init__.py b/apps/cmoch/__init__.py index d94693a..a872030 100644 --- a/apps/cmoch/__init__.py +++ b/apps/cmoch/__init__.py @@ -7,46 +7,46 @@ import requests FLAG_TO_APPS = { - "bams": ("Access CMO-CH", "bams"), + "bams": ("cmo-ch nucleo", "bams"), } def cmoch_commands(arguments, config): print('Running CMOCH') - request_id, sample_id, apps = get_arguments(arguments) + request_id, sample_id, apps, show_all_runs = get_arguments(arguments) tags = '{"cmoSampleId":"%s"}' % sample_id if sample_id else '{"igoRequestId":"%s"}' % request_id if arguments.get('link'): for (app, app_version) in apps: (app_name, directory) = FLAG_TO_APPS[app] operator_run = get_operator_run( - app_name, app_version, tags, config) + app_name, app_version, tags, config, show_all_runs) if operator_run: if arguments.get('--single-dir'): if app == "bams": link_bams_to_single_dir( - operator_run, app, request_id, sample_id, arguments, config) + operator_run, app, request_id, sample_id, arguments, config, show_all_runs) else: print("Apps other than bams not supported at this time") else: link_app(operator_run, directory, request_id, - sample_id, arguments, config) + sample_id, arguments, config, show_all_runs) if arguments.get('link-patient'): for (app, app_version) in apps: (app_name, directory) = FLAG_TO_APPS[app] operator_run = get_operator_run( - app_name, app_version, tags, config) + app_name, app_version, tags, config, show_all_runs) if operator_run: if(app == "bams"): link_bams_by_patient_id( - operator_run, "bams", request_id, sample_id, arguments, config) + operator_run, "bams", request_id, sample_id, arguments, config, show_all_runs) else: link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, - config) + config, show_all_runs) -def get_operator_run(app_name, app_version=None, tags=None, config=None): +def get_operator_run(app_name, app_version=None, tags=None, config=None, show_all_runs=False): latest_operator_run = { "tags": tags, "status": "COMPLETED", @@ -54,6 +54,9 @@ def get_operator_run(app_name, app_version=None, tags=None, config=None): "app_name": app_name } + if show_all_runs: + latest_operator_run.pop("status") + if app_version: latest_operator_run["app_version"] = app_version @@ -75,6 +78,7 @@ def get_arguments(arguments): request_id = arguments.get('--request-id') sample_id = arguments.get('--sample-id') app_tags = arguments.get('--apps') + show_all_runs = arguments.get('--all-runs') or False if request_id: request_id = request_id[0] @@ -86,16 +90,19 @@ def get_arguments(arguments): else: apps.append((r[0], None)) - return request_id, sample_id, apps + return request_id, sample_id, apps, show_all_runs -def get_runs(operator_run_id, config): +def get_runs(operator_run_id, config, show_all_runs): run_params = { "operator_run": operator_run_id, "page_size": 1000, "status": "COMPLETED" } + if show_all_runs: + run_params.pop("status") + response = requests.get(urljoin(config['beagle_endpoint'], config['api']['run']), headers={'Authorization': 'Bearer %s' % config['token']}, params=run_params) @@ -120,7 +127,7 @@ def get_file_path(file): return file["location"][7:] -def link_app(operator_run, directory, request_id, sample_id, arguments, config): +def link_app(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs): version = arguments.get("--dir-version") or operator_run["app_version"] should_delete = arguments.get("--delete") or False @@ -129,7 +136,7 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config): path = path_without_version / version path.mkdir(parents=True, exist_ok=True, mode=0o755) - runs = get_runs(operator_run["id"], config) + runs = get_runs(operator_run["id"], config, show_all_runs) if not runs: return @@ -161,13 +168,13 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config): return "Completed" -def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config): +def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs): version = arguments.get("--dir-version") or operator_run["app_version"] should_delete = arguments.get("--delete") or False path = Path("./") / directory - runs = get_runs(operator_run["id"], config) + runs = get_runs(operator_run["id"], config, show_all_runs) if not runs: return @@ -208,12 +215,12 @@ def link_single_sample_workflows_by_patient_id(operator_run, directory, request_ return "Completed" -def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config): +def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs): version = arguments.get("--dir-version") or operator_run["app_version"] path = Path("./") / directory / ("Project_" + request_id) - runs = get_runs(operator_run["id"], config) + runs = get_runs(operator_run["id"], config, show_all_runs) if not runs: return @@ -264,13 +271,13 @@ def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, argu return "Completed" -def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config): +def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs): version = arguments.get("--dir-version") or operator_run["app_version"] should_delete = arguments.get("--delete") or False path = Path("./") / directory - runs = get_runs(operator_run["id"], config) + runs = get_runs(operator_run["id"], config, show_all_runs) if not runs: return diff --git a/beaglecli b/beaglecli index c37b282..76510cd 100755 --- a/beaglecli +++ b/beaglecli @@ -31,6 +31,7 @@ import csv from apps.access import access_commands from apps.cmoch import cmoch_commands from apps.lims import lims_commands +from apps.cleaning import clean_json_comands BEAGLE_ENDPOINT = os.environ.get('BEAGLE_ENDPOINT', 'http://voyager:5001') @@ -42,7 +43,7 @@ CONFIG_TEMPLATE = { 'token': '', 'refresh': '', 'next': None, - 'prev': None, + 'prev': None } @@ -72,7 +73,7 @@ Usage: beaglecli files create [--metadata-path=] [--size=] beaglecli files update [--file-path=] [--file-type=] [--file-group=] [--metadata-path=] [--size=] beaglecli files patch [--file-path=] [--file-type=] [--file-group=] [--metadata=]... [--size=] - beaglecli files list [--page-size=] [--path=]... [--metadata=]... [--file-group=]... [--file-name=]... [--filename-regex=] [--file-type=]... + beaglecli files list [--page-size=] [--path=]... [--metadata=]... [--file-group=]... [--file-name=]... [--filename-regex=] [--file-type=]... [--all]... [--packaged]... beaglecli files delete --file-id=... beaglecli sample create beaglecli sample list [--sample-id=] @@ -84,8 +85,8 @@ Usage: beaglecli file-group create beaglecli file-group list [--page-size=] beaglecli etl delete --job-id=... - beaglecli run list [--page-size=] [--request-id=]... [--tags=]... [--apps="apps"]... [--job-groups=]... [--jira-ids=]... - beaglecli run latest-info [--request-id= ] [--job-group=] [--apps="apps"]... [--jira-id=] [--output-file=] [--completed][--page-size=] [--output-metadata-only] [--max-pages] + beaglecli run list [--page-size=] [--request-id=]... [--tags=]... [--apps="apps"]... [--job-groups=]... [--jira-ids=]... [--all]... + beaglecli run latest-info [--request-id= ] [--job-group=] [--apps="apps"]... [--jira-id=] [--output-file=] [--completed][--page-size=] [--output-metadata-only] [--max-pages] [--all]... beaglecli run get beaglecli run submit-request --pipeline= [--request-ids=] [--job-group-id=] [--for-each=] beaglecli run submit-runs --pipelines=... --versions=...[--run-file=] [--run-ids=]... [--job-group-id=] [--for-each=] @@ -93,10 +94,10 @@ Usage: beaglecli tempo-mpgen beaglecli tempo-mpgen override --normals= --tumors= beaglecli lims metadata [--request-id=] - beaglecli access link [--single-dir] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete] - beaglecli access link-patient [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete] - beaglecli cmoch link [--single-dir] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete] - beaglecli cmoch link-patient [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete] + beaglecli access link [--single-dir] [--all-runs] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete] + beaglecli access link-patient [--all-runs] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete] + beaglecli cmoch link [--single-dir] [--all-runs] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete] + beaglecli cmoch link-patient [--all-runs] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete] beaglecli --version Options: @@ -584,6 +585,8 @@ def _list_files(arguments, config): filename_regex = arguments.get('--filename-regex') page_size = arguments.get('--page-size') file_type = arguments.get('--file-type') + all_pages = arguments.get('--all') + packaged = arguments.get('--packaged') params = dict() params['path'] = paths params['metadata'] = metadata @@ -591,11 +594,15 @@ def _list_files(arguments, config): params['file_name'] = file_name params['filename_regex'] = filename_regex params['file_type'] = file_type - if page_size: - params['page_size'] = page_size + if all_pages: + count_params = params + count_params['count'] = True + params['page_size'] = requests.get(urljoin(BEAGLE_ENDPOINT, API['files']), headers={'Authorization': 'Bearer %s' % config.token}, params=count_params).json()['count'] response = requests.get(urljoin(BEAGLE_ENDPOINT, API['files']), headers={ 'Authorization': 'Bearer %s' % config.token}, params=params) response_json = json.dumps(response.json(), indent=4) + if packaged: + clean_json_comands(response_json) _set_next_and_prev(config, response.json()) return response_json @@ -952,7 +959,6 @@ def _redact_sample(arguments, config): response_json = json.dumps(response.json(), indent=4) return response_json - if __name__ == '__main__': config = Config.load() authenticate_command(config) diff --git a/column_names.txt b/column_names.txt new file mode 100644 index 0000000..a156f0a --- /dev/null +++ b/column_names.txt @@ -0,0 +1,27 @@ +igoId, +cmoSampleName, +sampleName, +cmoSampleClass, +cmoPatientId, +investigatorSampleId, +oncoTreeCode, +tumorOrNormal, +tissueLocation, +sampleOrigin, +preservation, +collectionYear, +sex, +species, +tubeId, +cfDNA2dBarcode, +baitSet, +qcReports, +barcodeId, +barcodeIndex, +libraryIgoId, +libraryVolume, +libraryConcentrationNgul, +dnaInputNg, +captureConcentrationNm, +captureInputNg, +captureName \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ac03762..6cf06d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ docopt==0.6.2 requests==2.22.0 +pandas \ No newline at end of file diff --git a/setup.py b/setup.py index ad90de0..4ea87c0 100755 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name='beaglecli', - version='0.3.0', + version='0.4.0', scripts=['beaglecli'] , description="Beagle API command line tool", url="https://github.com/mskcc/beagle_cli",