From f4d853c69f8e06766107e8e89fc42a35b00b8352 Mon Sep 17 00:00:00 2001
From: buehlere
Date: Tue, 11 Oct 2022 11:25:16 -0400
Subject: [PATCH 01/11] init feature commit

:heavy_check_mark: add count to the beaglecli config. This allows the max page
number to be returned if --all is specified on list commands
:heavy_check_mark: add cleanup script to get the metadata formatted nicely
:heavy_check_mark: add column_names file for check and docs update
---
 README.md        |  4 +++
 beaglecli        | 26 ++++++++++-------
 clean_up.py      | 75 ++++++++++++++++++++++++++++++++++++++++++++++++
 column_names.txt | 62 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+), 10 deletions(-)
 create mode 100644 clean_up.py
 create mode 100644 column_names.txt

diff --git a/README.md b/README.md
index 1d8afe2..912a779 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,10 @@ To access other endpoints, export the environment variable `BEAGLE_ENDPOINT`.
   ```
   beaglecli run latest-info --request-id requests.txt --completed --output-metadata-only --max-pages
   ```
+- Return and clean output meatadata for a given request id from files api
+  ```
+  beaglecli files list --metadata=igoRequestId:13167_C --file-type fastq --all | python3 clean_up.py
+  ```
 Note: Use `requests.txt` as a template for providing a multiple request ids

 #### Troubleshooting
diff --git a/beaglecli b/beaglecli
index c37b282..11a1b25 100755
--- a/beaglecli
+++ b/beaglecli
@@ -43,6 +43,7 @@ CONFIG_TEMPLATE = {
     'refresh': '',
     'next': None,
     'prev': None,
+    'count': None
 }
@@ -72,19 +73,19 @@ Usage:
   beaglecli files create [--metadata-path=] [--size=]
   beaglecli files update [--file-path=] [--file-type=] [--file-group=] [--metadata-path=] [--size=]
   beaglecli files patch [--file-path=] [--file-type=] [--file-group=] [--metadata=]... [--size=]
-  beaglecli files list [--page-size=] [--path=]... [--metadata=]... [--file-group=]... [--file-name=]... [--filename-regex=] [--file-type=]...
+  beaglecli files list [--page-size=] [--path=]... [--metadata=]... [--file-group=]... [--file-name=]... [--filename-regex=] [--file-type=]... [--all]...
   beaglecli files delete --file-id=...
   beaglecli sample create
-  beaglecli sample list [--sample-id=]
+  beaglecli sample list [--sample-id=] [--all]...
   beaglecli sample redact [--value=]
   beaglecli storage create
-  beaglecli storage list
+  beaglecli storage list [--all]...
   beaglecli file-types create
   beaglecli file-types list
   beaglecli file-group create
-  beaglecli file-group list [--page-size=]
+  beaglecli file-group list [--page-size=] [--all]...
   beaglecli etl delete --job-id=...
-  beaglecli run list [--page-size=] [--request-id=]... [--tags=]... [--apps="apps"]... [--job-groups=]... [--jira-ids=]...
+  beaglecli run list [--page-size=] [--request-id=]... [--tags=]... [--apps="apps"]... [--job-groups=]... [--jira-ids=]... [--all]...
   beaglecli run latest-info [--request-id= ] [--job-group=] [--apps="apps"]... [--jira-id=] [--output-file=] [--completed][--page-size=] [--output-metadata-only] [--max-pages]
   beaglecli run get
   beaglecli run submit-request --pipeline= [--request-ids=] [--job-group-id=] [--for-each=]
@@ -110,11 +111,12 @@ CONFIG_LOCATION = os.path.join(expanduser("~"), '.beagle.conf')

 class Config(object):

-    def __init__(self, token, refresh, next, prev):
+    def __init__(self, token, refresh, next, prev, count):
         self.token = token
         self.refresh = refresh
         self.next = next
         self.prev = prev
+        self.count = count

     @classmethod
     def load(cls):
@@ -123,7 +125,7 @@ class Config(object):
                 config = cls(**json.load(config))
         else:
             with open(CONFIG_LOCATION, 'w') as config:
-                config = cls('', '', None, None)
+                config = cls('', '', None, None, None)
                 config.dump()
         return config
@@ -134,10 +136,10 @@ class Config(object):
     def dump(self):
         with open(CONFIG_LOCATION, 'w') as f:
             json.dump({'token': self.token, 'refresh': self.refresh,
-                       'next': self.next, 'prev': self.prev}, f)
+                       'next': self.next, 'prev': self.prev, 'count':self.count}, f)

     def __repr__(self):
-        return 'token: %s, next: %s, prev: %s' % (self.token, self.next, self.prev)
+        return 'token: %s, next: %s, prev: %s, count: %s' % (self.token, self.next, self.prev, self.count)


 # Commands
@@ -613,6 +615,7 @@ def _list_sample(arguments, config):
 def _set_next_and_prev(config, value):
     config.set('prev', value.get('previous'))
     config.set('next', value.get('next'))
+    config.set('count', value.get('count'))


 def next(config):
@@ -952,11 +955,14 @@ def _redact_sample(arguments, config):
     response_json = json.dumps(response.json(), indent=4)
     return response_json

-
 if __name__ == '__main__':
     config = Config.load()
     authenticate_command(config)
     arguments = docopt(USAGE, version='Beagle API 0.2.0')
+    # the '--all' flag controls whether '--page-size=count' to return all results for a query
+    # list will exit correclty since using querying with '--page-size=count' ensures 'next' and 'prev' are 'null'
+    if arguments.get('--all'):
+        arguments['--page-size'] = config.count
     result = command(arguments, config)
     print(result)
     if arguments.get('list'):
diff --git a/clean_up.py b/clean_up.py
new file mode 100644
index 0000000..628fad6
--- /dev/null
+++ b/clean_up.py
@@ -0,0 +1,75 @@
+import json
+import argparse
+import sys
+import pandas as pd
+import csv
+
+
+def _collect_args():
+    # Create the parser
+    parser = argparse.ArgumentParser()
+    # Add an argument group
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        '--input-file', '-i',
+        type=argparse.FileType('r'),
+        default=sys.stdin,
+        help='Input file name containing a valid JSON.')
+    group.add_argument(
+        'json',
+        nargs='?',
+        type=str,
+        help='Input string containing a valid JSON.')
+    # Parse the argument
+    args = parser.parse_args()
+    data = args.json or args.input_file.read()
+    datain=json.loads(data)
+    #sys.stdin = open("/dev/tty")
+    return datain
+
+def _read_column_control():
+    # read in column_names for check
+    with open('column_names.txt', newline='') as f:
+        reader = csv.reader(f)
+        c_names = []
+        for name in reader:
+            c_names.append(name[0])
+    return c_names
+
+def _clean_json(data):
+    # Iterating through the json
+    # list
+    # should do other check for format changes
+    results = []
+    for i in range(0,len(data["results"])):
+        results.append(data['results'][i]['metadata'])
+    df = pd.DataFrame(results)
+    c_names = _read_column_control()
+    # check all columns are present
+    if not set(c_names).issubset(df.columns):
+        ValueError('missing column names expected in file metadata. Format has changed, or JSON is badly formed.')
+    # should do check for all columns
+    df = df.drop('R', axis=1)
+    df['flowCellLanes'] = [','.join(map(str, l)) for l in df['flowCellLanes']]
+    bf_list = [i for i in df.columns if isinstance(df[i][0],list)]
+    cleaned_columns = [df[column].apply(lambda x: x[0]) for column in bf_list]
+    df[bf_list] = pd.concat(cleaned_columns, axis=1)
+    bf_dict = [i for i in df.columns if isinstance(df[i][0],dict)]
+    normalized_columns = [pd.json_normalize(df[column]) for column in bf_dict]
+    normalized_df = pd.concat(normalized_columns, axis=1)
+    df = df.drop(columns=bf_dict)
+    df = pd.concat([df, normalized_df], axis=1)
+    df = df.drop_duplicates()
+    return df
+
+def _write_output(data, out_data):
+    out_name = data['results'][0]['metadata']['igoRequestId']
+    out_data.to_csv('{out_name}.csv'.format(out_name=out_name), index=False)
+
+if __name__ == '__main__':
+    # get args
+    data = _collect_args()
+    # clean json
+    out_data = _clean_json(data)
+    # write data out
+    _write_output(data, out_data)
diff --git a/column_names.txt b/column_names.txt
new file mode 100644
index 0000000..8790e26
--- /dev/null
+++ b/column_names.txt
@@ -0,0 +1,62 @@
+R,
+sex,
+ciTag,
+runId,
+tubeId,
+baitSet,
+piEmail,
+runDate,
+runMode,
+species,
+platform,
+barcodeId,
+genePanel,
+primaryId,
+qcReports,
+datasource,
+dnaInputNg,
+flowCellId,
+importDate,
+readLength,
+sampleName,
+sampleType,
+captureName,
+igoComplete,
+labHeadName,
+sampleClass,
+barcodeIndex,
+cmoInfoIgoId,
+cmoPatientId,
+igoProjectId,
+igoRequestId,
+labHeadEmail,
+libraryIgoId,
+oncotreeCode,
+preservation,
+sampleOrigin,
+cmoSampleName,
+flowCellLanes,
+libraryVolume,
+sampleAliases,
+smileSampleId,
+tumorOrNormal,
+captureInputNg,
+cfDNA2dBarcode,
+collectionYear,
+patientAliases,
+qcAccessEmails,
+smilePatientId,
+tissueLocation,
+dataAnalystName,
+dataAccessEmails,
+dataAnalystEmail,
+investigatorName,
+sequencingCenter,
+cmoSampleIdFields,
+investigatorEmail,
+otherContactEmails,
+projectManagerName,
+additionalProperties,
+investigatorSampleId,
+captureConcentrationNm,
+libraryConcentrationNgul
\ No newline at end of file

From 7adf68052b4d799511ccbf73f866b03654ce3523 Mon Sep 17 00:00:00 2001
From: buehlere
Date: Tue, 11 Oct 2022 12:38:45 -0400
Subject: [PATCH 02/11] Update clean_up.py

:heavy_check_mark: adding shebang
---
 clean_up.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clean_up.py b/clean_up.py
index 628fad6..0d5ae21 100644
--- a/clean_up.py
+++ b/clean_up.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
 import json
 import argparse
 import sys
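For orientation before the next patch: patch 01's `--all` support is indirect. A list command stores the `count` reported by the API in `~/.beagle.conf` (via `_set_next_and_prev`), and a later `--all` invocation replays the query with `--page-size` set to that cached count, so the whole result set arrives in one page and `next`/`prev` come back null. A minimal sketch of the idea, separate from the series itself (the helper names here are illustrative, not repo functions):

    import json
    import os

    CONFIG_LOCATION = os.path.join(os.path.expanduser("~"), '.beagle.conf')

    def remember_count(response_json):
        # Mirrors what _set_next_and_prev does after every list response:
        # cache the pagination state, including the total result count.
        with open(CONFIG_LOCATION) as f:
            conf = json.load(f)
        conf['count'] = response_json.get('count')
        with open(CONFIG_LOCATION, 'w') as f:
            json.dump(conf, f)

    def page_size_for_all():
        # A later `... list --all` run reuses the cached count as the page
        # size, which is exactly what `arguments['--page-size'] = config.count`
        # does in the __main__ block above.
        with open(CONFIG_LOCATION) as f:
            return json.load(f).get('count')

Note the implication: `--all` only behaves as intended after a previous query has populated `count`. Patch 09 below replaces this with an explicit count request per query.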
From f6f01c620f07fc4bd45897cc79110bc3490f0f7a Mon Sep 17 00:00:00 2001
From: Nikhil Kumar
Date: Mon, 17 Oct 2022 18:53:34 -0400
Subject: [PATCH 03/11] Added parameter to show all runs while linking

---
 apps/access/__init__.py | 43 ++++++++++++++++++++++++-----------------
 apps/cmoch/__init__.py  | 43 ++++++++++++++++++++++++-----------------
 beaglecli               |  8 ++++----
 3 files changed, 54 insertions(+), 40 deletions(-)

diff --git a/apps/access/__init__.py b/apps/access/__init__.py
index b1465e9..b72b265 100644
--- a/apps/access/__init__.py
+++ b/apps/access/__init__.py
@@ -18,33 +18,33 @@ def access_commands(arguments, config):
     print('Running ACCESS')
-    request_id, sample_id, apps = get_arguments(arguments)
+    request_id, sample_id, apps, show_all_runs = get_arguments(arguments)
     tags = '{"cmoSampleIds":"%s"}' % sample_id if sample_id else '{"igoRequestId":"%s"}' % request_id
     if arguments.get('link'):
         for (app, app_version) in apps:
             (app_name, directory) = FLAG_TO_APPS[app]
-            operator_run = get_operator_run(app_name, app_version, tags, config)
+            operator_run = get_operator_run(app_name, app_version, tags, config, show_all_runs)
             if operator_run:
                 if arguments.get('--single-dir'):
                     if app == "bams":
-                        link_bams_to_single_dir(operator_run, app, request_id, sample_id, arguments, config)
+                        link_bams_to_single_dir(operator_run, app, request_id, sample_id, arguments, config, show_all_runs)
                     else:
                         print("Apps other than bams not supported at this time")
                 else:
-                    link_app(operator_run, directory, request_id, sample_id, arguments, config)
+                    link_app(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs)
     if arguments.get('link-patient'):
         for (app, app_version) in apps:
             (app_name, directory) = FLAG_TO_APPS[app]
-            operator_run = get_operator_run(app_name, app_version, tags, config)
+            operator_run = get_operator_run(app_name, app_version, tags, config, show_all_runs)
             if operator_run:
                 if(app == "bams"):
-                    link_bams_by_patient_id(operator_run, "bams", request_id, sample_id, arguments, config)
+                    link_bams_by_patient_id(operator_run, "bams", request_id, sample_id, arguments, config, show_all_runs)
                 else:
                     link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments,
-                                                               config)
+                                                               config, show_all_runs)


-def get_operator_run(app_name, app_version=None, tags=None, config=None):
+def get_operator_run(app_name, app_version=None, tags=None, config=None, show_all_runs=False):
     latest_operator_run = {
         "tags": tags,
         "status": "COMPLETED",
@@ -52,6 +52,9 @@ def get_operator_run(app_name, app_version=None, tags=None, config=None):
         "app_name": app_name
     }

+    if show_all_runs:
+        latest_operator_run.pop("status")
+
     if app_version:
         latest_operator_run["app_version"] = app_version

@@ -71,6 +74,7 @@ def get_arguments(arguments):
     request_id = arguments.get('--request-id')
     sample_id = arguments.get('--sample-id')
     app_tags = arguments.get('--apps')
+    show_all_runs = arguments.get('--all-runs') or False

     if request_id:
         request_id = request_id[0]
@@ -82,16 +86,19 @@ def get_arguments(arguments):
         else:
             apps.append((r[0], None))

-    return request_id, sample_id, apps
+    return request_id, sample_id, apps, show_all_runs


-def get_runs(operator_run_id, config):
+def get_runs(operator_run_id, config, show_all_runs):
     run_params = {
         "operator_run": operator_run_id,
         "page_size": 1000,
         "status": "COMPLETED"
     }

+    if show_all_runs:
+        run_params.pop("status")
+
     response = requests.get(urljoin(config['beagle_endpoint'], config['api']['run']),
                             headers={'Authorization': 'Bearer %s' % config['token']}, params=run_params)

@@ -113,7 +120,7 @@ def get_files_by_run_id(run_id, config):
 def get_file_path(file):
     return file["location"][7:]

-def link_app(operator_run, directory, request_id, sample_id, arguments, config):
+def link_app(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
     version = arguments.get("--dir-version") or operator_run["app_version"]
     should_delete = arguments.get("--delete") or False

@@ -122,7 +129,7 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config):
     path = path_without_version / version
     path.mkdir(parents=True, exist_ok=True, mode=0o755)

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
     if not runs:
         return

@@ -152,13 +159,13 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config):
     return "Completed"


-def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config):
+def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
     version = arguments.get("--dir-version") or operator_run["app_version"]
     should_delete = arguments.get("--delete") or False

     path = Path("./") / directory

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
     if not runs:
         return

@@ -195,12 +202,12 @@ def link_single_sample_workflows_by_patient_id(operator_run, request_
     return "Completed"


-def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config):
+def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
     version = arguments.get("--dir-version") or operator_run["app_version"]

     path = Path("./") / directory / ("Project_" + request_id)

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
     if not runs:
         return

@@ -249,13 +256,13 @@ def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, argu
     return "Completed"


-def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config):
+def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
     version = arguments.get("--dir-version") or operator_run["app_version"]
     should_delete = arguments.get("--delete") or False

     path = Path("./") / directory

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
     if not runs:
         return
diff --git a/apps/cmoch/__init__.py b/apps/cmoch/__init__.py
index d94693a..8be8548 100644
--- a/apps/cmoch/__init__.py
+++ b/apps/cmoch/__init__.py
@@ -14,39 +14,39 @@ def cmoch_commands(arguments, config):
     print('Running CMOCH')
-    request_id, sample_id, apps = get_arguments(arguments)
+    request_id, sample_id, apps, show_all_runs = get_arguments(arguments)
     tags = '{"cmoSampleId":"%s"}' % sample_id if sample_id else '{"igoRequestId":"%s"}' % request_id
     if arguments.get('link'):
         for (app, app_version) in apps:
             (app_name, directory) = FLAG_TO_APPS[app]
             operator_run = get_operator_run(
-                app_name, app_version, tags, config)
+                app_name, app_version, tags, config, show_all_runs)
             if operator_run:
                 if arguments.get('--single-dir'):
                     if app == "bams":
                         link_bams_to_single_dir(
-                            operator_run, app, request_id, sample_id, arguments, config)
+                            operator_run, app, request_id, sample_id, arguments, config, show_all_runs)
                     else:
                         print("Apps other than bams not supported at this time")
                 else:
                     link_app(operator_run, directory, request_id,
-                             sample_id, arguments, config)
+                             sample_id, arguments, config, show_all_runs)
     if arguments.get('link-patient'):
         for (app, app_version) in apps:
             (app_name, directory) = FLAG_TO_APPS[app]
             operator_run = get_operator_run(
-                app_name, app_version, tags, config)
+                app_name, app_version, tags, config, show_all_runs)
             if operator_run:
                 if(app == "bams"):
                     link_bams_by_patient_id(
-                        operator_run, "bams", request_id, sample_id, arguments, config)
+                        operator_run, "bams", request_id, sample_id, arguments, config, show_all_runs)
                 else:
                     link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments,
-                                                               config)
+                                                               config, show_all_runs)


-def get_operator_run(app_name, app_version=None, tags=None, config=None):
+def get_operator_run(app_name, app_version=None, tags=None, config=None, show_all_runs=False):
     latest_operator_run = {
         "tags": tags,
         "status": "COMPLETED",
@@ -54,6 +54,9 @@ def get_operator_run(app_name, app_version=None, tags=None, config=None):
         "app_name": app_name
     }

+    if show_all_runs:
+        latest_operator_run.pop("status")
+
     if app_version:
         latest_operator_run["app_version"] = app_version

@@ -75,6 +78,7 @@ def get_arguments(arguments):
     request_id = arguments.get('--request-id')
     sample_id = arguments.get('--sample-id')
     app_tags = arguments.get('--apps')
+    show_all_runs = arguments.get('--all-runs') or False

     if request_id:
         request_id = request_id[0]
@@ -86,16 +90,19 @@ def get_arguments(arguments):
         else:
             apps.append((r[0], None))

-    return request_id, sample_id, apps
+    return request_id, sample_id, apps, show_all_runs


-def get_runs(operator_run_id, config):
+def get_runs(operator_run_id, config, show_all_runs):
     run_params = {
         "operator_run": operator_run_id,
         "page_size": 1000,
         "status": "COMPLETED"
     }

+    if show_all_runs:
+        run_params.pop("status")
+
     response = requests.get(urljoin(config['beagle_endpoint'], config['api']['run']),
                             headers={'Authorization': 'Bearer %s' % config['token']}, params=run_params)

@@ -120,7 +127,7 @@ def get_file_path(file):
     return file["location"][7:]


-def link_app(operator_run, directory, request_id, sample_id, arguments, config):
+def link_app(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
     version = arguments.get("--dir-version") or operator_run["app_version"]
     should_delete = arguments.get("--delete") or False

@@ -129,7 +136,7 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config):
     path = path_without_version / version
     path.mkdir(parents=True, exist_ok=True, mode=0o755)

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
     if not runs:
         return

@@ -161,13 +168,13 @@ def link_app(operator_run, directory, request_id, sample_id, arguments, config):
     return "Completed"


-def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config):
+def link_single_sample_workflows_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
     version = arguments.get("--dir-version") or operator_run["app_version"]
     should_delete = arguments.get("--delete") or False

     path = Path("./") / directory

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
     if not runs:
         return

@@ -208,12 +215,12 @@ def link_single_sample_workflows_by_patient_id(operator_run, directory, request_
     return "Completed"


-def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config):
+def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
     version = arguments.get("--dir-version") or operator_run["app_version"]

     path = Path("./") / directory / ("Project_" + request_id)

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
     if not runs:
         return

@@ -264,13 +271,13 @@ def link_bams_to_single_dir(operator_run, directory, request_id, sample_id, argu
     return "Completed"


-def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config):
+def link_bams_by_patient_id(operator_run, directory, request_id, sample_id, arguments, config, show_all_runs):
     version = arguments.get("--dir-version") or operator_run["app_version"]
     should_delete = arguments.get("--delete") or False

     path = Path("./") / directory

-    runs = get_runs(operator_run["id"], config)
+    runs = get_runs(operator_run["id"], config, show_all_runs)
     if not runs:
         return
diff --git a/beaglecli b/beaglecli
index c37b282..58d355e 100755
--- a/beaglecli
+++ b/beaglecli
@@ -93,10 +93,10 @@ Usage:
   beaglecli tempo-mpgen
   beaglecli tempo-mpgen override --normals= --tumors=
   beaglecli lims metadata [--request-id=]
-  beaglecli access link [--single-dir] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete]
-  beaglecli access link-patient [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete]
-  beaglecli cmoch link [--single-dir] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete]
-  beaglecli cmoch link-patient [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete]
+  beaglecli access link [--single-dir] [--all-runs] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete]
+  beaglecli access link-patient [--all-runs] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete]
+  beaglecli cmoch link [--single-dir] [--all-runs] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete]
+  beaglecli cmoch link-patient [--all-runs] [--request-id=] [--sample-id=] [--dir-version=] [--apps=]... [--delete]
   beaglecli --version

 Options:

From 602028bbb4895f8fffb8acffccbe326781151b50 Mon Sep 17 00:00:00 2001
From: buehlere
Date: Tue, 18 Oct 2022 10:57:03 -0400
Subject: [PATCH 04/11] Update clean_up.py

:heavy_check_mark: adding test metadata output; might want to remove this.
---
 clean_up.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/clean_up.py b/clean_up.py
index 0d5ae21..40633fa 100644
--- a/clean_up.py
+++ b/clean_up.py
@@ -62,12 +62,22 @@ def _clean_json(data):
     df = df.drop(columns=bf_dict)
     df = pd.concat([df, normalized_df], axis=1)
     df = df.drop_duplicates()
-    return df
+    return df 

 def _write_output(data, out_data):
     out_name = data['results'][0]['metadata']['igoRequestId']
+    out_data = out_data.loc[:,~out_data.columns.duplicated()].copy()
     out_data.to_csv('{out_name}.csv'.format(out_name=out_name), index=False)
-
+    test_json_names = ['igoId', 'cmoSampleName', 'sampleName', 'cmoSampleClass', 'cmoPatientId', 'investigatorSampleId', 'oncoTreeCode', 'tumorOrNormal', 'tissueLocation', 'specimenType', 'sampleOrigin', 'preservation', 'collectionYear', 'sex', 'species', 'tubeId', 'cfDNA2dBarcode', 'baitSet', 'qcReports', 'barcodeId', 'barcodeIndex', 'libraryIgoId', 'libraryVolume', 'libraryConcentrationNgul', 'dnaInputNg', 'captureConcentrationNm', 'captureInputNg', 'captureName']
+
+    out_data['specimenType'] = None
+    out_data['qcReports'] = [[] for _ in range(len(out_data))]
+    out_data = out_data.rename(columns={'oncotreeCode': 'oncoTreeCode', 'igoRequestId': 'igoId', 'sampleClass':'cmoSampleClass'})
+    out_data = out_data[test_json_names]
+    #sys.stdin = open("/dev/tty")
+    out_data = out_data.to_dict('records')
+    with open('{out_name}.json'.format(out_name=out_name), 'w') as fout:
+        json.dump(out_data , fout, indent=4)
 if __name__ == '__main__':
     # get args
     data = _collect_args()
@@ -75,3 +85,4 @@ def _write_output(data, out_data):
     # clean json
     out_data = _clean_json(data)
     # write data out
     _write_output(data, out_data)
+    
\ No newline at end of file
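The `--all-runs` flag added in patch 03 above works by omission: both `get_operator_run` and `get_runs` express the completed-only restriction as a `"status": "COMPLETED"` query parameter, so showing every run is just a matter of not sending that key. A condensed restatement of the diff's filter-building, collapsed into one standalone function (no new behavior):

    def run_query_params(operator_run_id, show_all_runs=False):
        # Same shape as the run_params dict built in get_runs.
        params = {
            "operator_run": operator_run_id,
            "page_size": 1000,
            "status": "COMPLETED",
        }
        if show_all_runs:
            # Dropping the key removes the server-side status filter,
            # so runs in any state are returned as well.
            params.pop("status")
        return params

    assert "status" not in run_query_params("1234", show_all_runs=True)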
From f0594749e3af2e70fd81d3348e9b32224ebbfedc Mon Sep 17 00:00:00 2001
From: buehlere
Date: Tue, 8 Nov 2022 11:48:37 -0500
Subject: [PATCH 05/11] update cleanup script

---
 beaglecli        |  2 +-
 clean_up.py      | 34 +++++++++++-----------
 column_names.txt | 73 +++++++++++++-----------------------------------
 3 files changed, 37 insertions(+), 72 deletions(-)

diff --git a/beaglecli b/beaglecli
index 11a1b25..cd9533a 100755
--- a/beaglecli
+++ b/beaglecli
@@ -86,7 +86,7 @@ Usage:
   beaglecli file-group list [--page-size=] [--all]...
   beaglecli etl delete --job-id=...
   beaglecli run list [--page-size=] [--request-id=]... [--tags=]... [--apps="apps"]... [--job-groups=]... [--jira-ids=]... [--all]...
-  beaglecli run latest-info [--request-id= ] [--job-group=] [--apps="apps"]... [--jira-id=] [--output-file=] [--completed][--page-size=] [--output-metadata-only] [--max-pages]
+  beaglecli run latest-info [--request-id= ] [--job-group=] [--apps="apps"]... [--jira-id=] [--output-file=] [--completed][--page-size=] [--output-metadata-only] [--max-pages] [--all]...
   beaglecli run get
   beaglecli run submit-request --pipeline= [--request-ids=] [--job-group-id=] [--for-each=]
   beaglecli run submit-runs --pipelines=... --versions=...[--run-file=] [--run-ids=]... [--job-group-id=] [--for-each=]
diff --git a/clean_up.py b/clean_up.py
index 40633fa..b642362 100644
--- a/clean_up.py
+++ b/clean_up.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+from importlib.metadata import metadata
 import json
 import argparse
 import sys
@@ -22,12 +23,11 @@ def _collect_args():
         nargs='?',
         type=str,
         help='Input string containing a valid JSON.')
-    # Parse the argument
     args = parser.parse_args()
+    # load json
     data = args.json or args.input_file.read()
     datain=json.loads(data)
-    #sys.stdin = open("/dev/tty")
-    return datain
+    return datain, args

 def _read_column_control():
     # read in column_names for check
@@ -50,11 +50,13 @@ def _clean_json(data):
     # check all columns are present
     if not set(c_names).issubset(df.columns):
         ValueError('missing column names expected in file metadata. Format has changed, or JSON is badly formed.')
-    # should do check for all columns
-    df = df.drop('R', axis=1)
-    df['flowCellLanes'] = [','.join(map(str, l)) for l in df['flowCellLanes']]
+    # rename some columns
+    df = df.rename(columns={'oncotreeCode': 'oncoTreeCode', 'igoRequestId': 'igoId', 'sampleClass':'cmoSampleClass'})
+    # subset to important columns
+    df = df[c_names]
+    # normalize columns
     bf_list = [i for i in df.columns if isinstance(df[i][0],list)]
-    cleaned_columns = [df[column].apply(lambda x: x[0]) for column in bf_list]
+    cleaned_columns = [df[column].apply(lambda x: x[0] if isinstance(x, list) else x) for column in bf_list]
     df[bf_list] = pd.concat(cleaned_columns, axis=1)
     bf_dict = [i for i in df.columns if isinstance(df[i][0],dict)]
     normalized_columns = [pd.json_normalize(df[column]) for column in bf_dict]
@@ -65,24 +67,22 @@ def _clean_json(data):
     return df

 def _write_output(data, out_data):
+    # csv
     out_name = data['results'][0]['metadata']['igoRequestId']
     out_data = out_data.loc[:,~out_data.columns.duplicated()].copy()
     out_data.to_csv('{out_name}.csv'.format(out_name=out_name), index=False)
-    test_json_names = ['igoId', 'cmoSampleName', 'sampleName', 'cmoSampleClass', 'cmoPatientId', 'investigatorSampleId', 'oncoTreeCode', 'tumorOrNormal', 'tissueLocation', 'specimenType', 'sampleOrigin', 'preservation', 'collectionYear', 'sex', 'species', 'tubeId', 'cfDNA2dBarcode', 'baitSet', 'qcReports', 'barcodeId', 'barcodeIndex', 'libraryIgoId', 'libraryVolume', 'libraryConcentrationNgul', 'dnaInputNg', 'captureConcentrationNm', 'captureInputNg', 'captureName']
-
-    out_data['specimenType'] = None
-    out_data['qcReports'] = [[] for _ in range(len(out_data))]
-    out_data = out_data.rename(columns={'oncotreeCode': 'oncoTreeCode', 'igoRequestId': 'igoId', 'sampleClass':'cmoSampleClass'})
-    out_data = out_data[test_json_names]
-    #sys.stdin = open("/dev/tty")
+    # json
     out_data = out_data.to_dict('records')
     with open('{out_name}.json'.format(out_name=out_name), 'w') as fout:
         json.dump(out_data , fout, indent=4)
+
+
+
 if __name__ == '__main__':
     # get args
-    data = _collect_args()
-    # clean json
+    data, args = _collect_args()
+    # clean json 
     out_data = _clean_json(data)
-    # write data out
+    # write out
     _write_output(data, out_data)
 
\ No newline at end of file
diff --git a/column_names.txt b/column_names.txt
index 8790e26..a156f0a 100644
--- a/column_names.txt
+++ b/column_names.txt
@@ -1,62 +1,27 @@
-R,
+igoId,
+cmoSampleName,
+sampleName,
+cmoSampleClass,
+cmoPatientId,
+investigatorSampleId,
+oncoTreeCode,
+tumorOrNormal,
+tissueLocation,
+sampleOrigin,
+preservation,
+collectionYear,
 sex,
-ciTag,
-runId,
+species,
 tubeId,
+cfDNA2dBarcode,
 baitSet,
-piEmail,
-runDate,
-runMode,
-species,
-platform,
-barcodeId,
-genePanel,
-primaryId,
 qcReports,
-datasource,
-dnaInputNg,
-flowCellId,
-importDate,
-readLength,
-sampleName,
-sampleType,
-captureName,
-igoComplete,
-labHeadName,
-sampleClass,
+barcodeId,
 barcodeIndex,
-cmoInfoIgoId,
-cmoPatientId,
-igoProjectId,
-igoRequestId,
-labHeadEmail,
 libraryIgoId,
-oncotreeCode,
-preservation,
-sampleOrigin,
-cmoSampleName,
-flowCellLanes,
 libraryVolume,
-sampleAliases,
-smileSampleId,
-tumorOrNormal,
-captureInputNg,
-cfDNA2dBarcode,
-collectionYear,
-patientAliases,
-qcAccessEmails,
-smilePatientId,
-tissueLocation,
-dataAnalystName,
-dataAccessEmails,
-dataAnalystEmail,
-investigatorName,
-sequencingCenter,
-cmoSampleIdFields,
-investigatorEmail,
-otherContactEmails,
-projectManagerName,
-additionalProperties,
-investigatorSampleId,
+libraryConcentrationNgul,
+dnaInputNg,
 captureConcentrationNm,
-libraryConcentrationNgul
\ No newline at end of file
+captureInputNg,
+captureName
\ No newline at end of file

From 5dd371a16abc43c5c3b7a058c002c704200b1aeb Mon Sep 17 00:00:00 2001
From: buehlere
Date: Mon, 14 Nov 2022 16:40:40 -0500
Subject: [PATCH 06/11] Update __init__.py

:heavy_check_mark: update app name to reflect change in Voyager
---
 apps/cmoch/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/cmoch/__init__.py b/apps/cmoch/__init__.py
index 8be8548..a872030 100644
--- a/apps/cmoch/__init__.py
+++ b/apps/cmoch/__init__.py
@@ -7,7 +7,7 @@ import requests


 FLAG_TO_APPS = {
-    "bams": ("Access CMO-CH", "bams"),
+    "bams": ("cmo-ch nucleo", "bams"),
 }

From 1b60a50b90a009b2befa21544de6dd5991040881 Mon Sep 17 00:00:00 2001
From: Nikhil Kumar
Date: Wed, 16 Nov 2022 13:42:43 -0500
Subject: [PATCH 07/11] Updated version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ad90de0..4ea87c0 100755
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setuptools.setup(
     name='beaglecli',
-    version='0.3.0',
+    version='0.4.0',
     scripts=['beaglecli'] ,
     description="Beagle API command line tool",
     url="https://github.com/mskcc/beagle_cli",

From 6bd42cb4171417bad3a1ea82cdc4b524f5cb0e1d Mon Sep 17 00:00:00 2001
From: buehlere
Date: Thu, 17 Nov 2022 11:35:53 -0500
Subject: [PATCH 08/11] changing clean_up.py to app

:heavy_check_mark: keep the cleaning script within the apps dir and import it as a function
---
 clean_up.py => apps/cleaning/__init__.py | 81 +++++++++++-------------
 1 file changed, 38 insertions(+), 43 deletions(-)
 rename clean_up.py => apps/cleaning/__init__.py (54%)

diff --git a/clean_up.py b/apps/cleaning/__init__.py
similarity index 54%
rename from clean_up.py
rename to apps/cleaning/__init__.py
index b642362..b6b9789 100644
--- a/clean_up.py
+++ b/apps/cleaning/__init__.py
@@ -1,42 +1,40 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
 from importlib.metadata import metadata
 import json
-import argparse
-import sys
 import pandas as pd
-import csv
+from collections import defaultdict
+from urllib.parse import urljoin
+from pathlib import Path
+
+C_NAMES = [
+    "igoRequestId",
+    "cmoSampleName",
+    "sampleName",
+    "sampleClass",
+    "cmoPatientId",
+    "investigatorSampleId",
+    "oncotreeCode",
+    "tumorOrNormal",
+    "tissueLocation",
+    "sampleOrigin",
+    "preservation",
+    "collectionYear",
+    "sex",
+    "species",
+    "tubeId",
+    "cfDNA2dBarcode",
+    "baitSet",
+    "qcReports",
+    "barcodeId",
+    "barcodeIndex",
+    "libraryIgoId",
+    "libraryVolume",
+    "libraryConcentrationNgul",
+    "dnaInputNg",
+    "captureConcentrationNm",
+    "captureInputNg",
+    "captureName"
+]


-def _collect_args():
-    # Create the parser
-    parser = argparse.ArgumentParser()
-    # Add an argument group
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument(
-        '--input-file', '-i',
-        type=argparse.FileType('r'),
-        default=sys.stdin,
-        help='Input file name containing a valid JSON.')
-    group.add_argument(
-        'json',
-        nargs='?',
-        type=str,
-        help='Input string containing a valid JSON.')
-    args = parser.parse_args()
-    # load json
-    data = args.json or args.input_file.read()
-    datain=json.loads(data)
-    return datain, args
-
-def _read_column_control():
-    # read in column_names for check
-    with open('column_names.txt', newline='') as f:
-        reader = csv.reader(f)
-        c_names = []
-        for name in reader:
-            c_names.append(name[0])
-    return c_names

 def _clean_json(data):
     # Iterating through the json
@@ -46,14 +44,11 @@ def _clean_json(data):
     for i in range(0,len(data["results"])):
         results.append(data['results'][i]['metadata'])
     df = pd.DataFrame(results)
-    c_names = _read_column_control()
     # check all columns are present
-    if not set(c_names).issubset(df.columns):
+    if not set(C_NAMES).issubset(df.columns):
         ValueError('missing column names expected in file metadata. Format has changed, or JSON is badly formed.')
-    # rename some columns
-    df = df.rename(columns={'oncotreeCode': 'oncoTreeCode', 'igoRequestId': 'igoId', 'sampleClass':'cmoSampleClass'})
     # subset to important columns
-    df = df[c_names]
+    df = df[C_NAMES]
     # normalize columns
     bf_list = [i for i in df.columns if isinstance(df[i][0],list)]
@@ -78,11 +73,11 @@ def _write_output(data, out_data):
-if __name__ == '__main__':
+def clean_json_comands(results):
     # get args
-    data, args = _collect_args()
+    datain=json.loads(results)
     # clean json
-    out_data = _clean_json(data)
+    dataout = _clean_json(datain)
     # write out
-    _write_output(data, out_data)
+    _write_output(datain, dataout)
 
\ No newline at end of file
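Patches 05 and 08 settle the cleaning logic: `_clean_json` builds a DataFrame from the per-file `metadata` dicts, keeps the `C_NAMES` subset, reduces list-valued columns to their first element, and explodes dict-valued columns with `pd.json_normalize`. A toy run of the same two normalization steps on made-up metadata (the field values here are illustrative, not the real schema):

    import pandas as pd

    records = [
        {"primaryId": "S1", "flowCellLanes": [1, 2],
         "cmoSampleIdFields": {"naToExtract": "DNA"}},
        {"primaryId": "S2", "flowCellLanes": [3],
         "cmoSampleIdFields": {"naToExtract": "RNA"}},
    ]
    df = pd.DataFrame(records)

    # List-valued columns are reduced to their first element.
    bf_list = [c for c in df.columns if isinstance(df[c][0], list)]
    for c in bf_list:
        df[c] = df[c].apply(lambda x: x[0] if isinstance(x, list) else x)

    # Dict-valued columns are flattened into their own columns and re-joined.
    bf_dict = [c for c in df.columns if isinstance(df[c][0], dict)]
    normalized = pd.concat(
        [pd.json_normalize(df[c].tolist()) for c in bf_dict], axis=1)
    df = pd.concat([df.drop(columns=bf_dict), normalized], axis=1)

    print(df)
    #   primaryId  flowCellLanes naToExtract
    # 0        S1              1         DNA
    # 1        S2              3         RNA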
From 0683d81d826e98b69a8510fff03d8e55266991d6 Mon Sep 17 00:00:00 2001
From: buehlere
Date: Thu, 17 Nov 2022 11:36:17 -0500
Subject: [PATCH 09/11] rmv config change / new cleaning cmds

---
 beaglecli | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/beaglecli b/beaglecli
index a2b2e92..76510cd 100755
--- a/beaglecli
+++ b/beaglecli
@@ -31,6 +31,7 @@ import csv

 from apps.access import access_commands
 from apps.cmoch import cmoch_commands
 from apps.lims import lims_commands
+from apps.cleaning import clean_json_comands


 BEAGLE_ENDPOINT = os.environ.get('BEAGLE_ENDPOINT', 'http://voyager:5001')
@@ -42,8 +43,7 @@ CONFIG_TEMPLATE = {
     'token': '',
     'refresh': '',
     'next': None,
-    'prev': None,
-    'count': None
+    'prev': None
 }
@@ -73,17 +73,17 @@ Usage:
   beaglecli files create [--metadata-path=] [--size=]
   beaglecli files update [--file-path=] [--file-type=] [--file-group=] [--metadata-path=] [--size=]
   beaglecli files patch [--file-path=] [--file-type=] [--file-group=] [--metadata=]... [--size=]
-  beaglecli files list [--page-size=] [--path=]... [--metadata=]... [--file-group=]... [--file-name=]... [--filename-regex=] [--file-type=]... [--all]...
+  beaglecli files list [--page-size=] [--path=]... [--metadata=]... [--file-group=]... [--file-name=]... [--filename-regex=] [--file-type=]... [--all]... [--packaged]...
   beaglecli files delete --file-id=...
   beaglecli sample create
-  beaglecli sample list [--sample-id=] [--all]...
+  beaglecli sample list [--sample-id=]
   beaglecli sample redact [--value=]
   beaglecli storage create
-  beaglecli storage list [--all]...
+  beaglecli storage list
   beaglecli file-types create
   beaglecli file-types list
   beaglecli file-group create
-  beaglecli file-group list [--page-size=] [--all]...
+  beaglecli file-group list [--page-size=]
   beaglecli etl delete --job-id=...
   beaglecli run list [--page-size=] [--request-id=]... [--tags=]... [--apps="apps"]... [--job-groups=]... [--jira-ids=]... [--all]...
   beaglecli run latest-info [--request-id= ] [--job-group=] [--apps="apps"]... [--jira-id=] [--output-file=] [--completed][--page-size=] [--output-metadata-only] [--max-pages] [--all]...
@@ -111,12 +111,11 @@ CONFIG_LOCATION = os.path.join(expanduser("~"), '.beagle.conf')

 class Config(object):

-    def __init__(self, token, refresh, next, prev, count):
+    def __init__(self, token, refresh, next, prev):
         self.token = token
         self.refresh = refresh
         self.next = next
         self.prev = prev
-        self.count = count

     @classmethod
     def load(cls):
@@ -125,7 +124,7 @@ class Config(object):
                 config = cls(**json.load(config))
         else:
             with open(CONFIG_LOCATION, 'w') as config:
-                config = cls('', '', None, None, None)
+                config = cls('', '', None, None)
                 config.dump()
         return config
@@ -136,10 +135,10 @@ class Config(object):
     def dump(self):
         with open(CONFIG_LOCATION, 'w') as f:
             json.dump({'token': self.token, 'refresh': self.refresh,
-                       'next': self.next, 'prev': self.prev, 'count':self.count}, f)
+                       'next': self.next, 'prev': self.prev}, f)

     def __repr__(self):
-        return 'token: %s, next: %s, prev: %s, count: %s' % (self.token, self.next, self.prev, self.count)
+        return 'token: %s, next: %s, prev: %s' % (self.token, self.next, self.prev)


 # Commands
@@ -586,6 +585,8 @@ def _list_files(arguments, config):
     filename_regex = arguments.get('--filename-regex')
     page_size = arguments.get('--page-size')
     file_type = arguments.get('--file-type')
+    all_pages = arguments.get('--all')
+    packaged = arguments.get('--packaged')
     params = dict()
     params['path'] = paths
     params['metadata'] = metadata
@@ -593,11 +594,15 @@ def _list_files(arguments, config):
     params['file_name'] = file_name
     params['filename_regex'] = filename_regex
     params['file_type'] = file_type
-    if page_size:
-        params['page_size'] = page_size
+    if all_pages:
+        count_params = params
+        count_params['count'] = True
+        params['page_size'] = requests.get(urljoin(BEAGLE_ENDPOINT, API['files']), headers={'Authorization': 'Bearer %s' % config.token}, params=count_params).json()['count']
     response = requests.get(urljoin(BEAGLE_ENDPOINT, API['files']), headers={
                             'Authorization': 'Bearer %s' % config.token}, params=params)
     response_json = json.dumps(response.json(), indent=4)
+    if packaged:
+        clean_json_comands(response_json)
     _set_next_and_prev(config, response.json())
     return response_json
@@ -615,7 +620,6 @@ def _set_next_and_prev(config, value):
     config.set('prev', value.get('previous'))
     config.set('next', value.get('next'))
-    config.set('count', value.get('count'))


 def next(config):
@@ -959,10 +963,6 @@ if __name__ == '__main__':
     config = Config.load()
     authenticate_command(config)
     arguments = docopt(USAGE, version='Beagle API 0.2.0')
-    # the '--all' flag controls whether '--page-size=count' to return all results for a query
-    # list will exit correclty since using querying with '--page-size=count' ensures 'next' and 'prev' are 'null'
-    if arguments.get('--all'):
-        arguments['--page-size'] = config.count
     result = command(arguments, config)
     print(result)
     if arguments.get('list'):
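Patch 09 makes `--all` self-contained for `files list`: instead of trusting a count cached by an earlier query, `_list_files` now issues one request with `count=True` to learn the total, then repeats the query with `page_size` set to that total. The pattern, reduced to a sketch (endpoint and token are placeholders, not repo values):

    import requests

    def fetch_all(endpoint, token, params):
        headers = {'Authorization': 'Bearer %s' % token}
        # First request: ask the API how many files match the query.
        total = requests.get(
            endpoint, headers=headers,
            params={**params, 'count': True}).json()['count']
        # Second request: size the page to the total so everything comes
        # back at once and 'next'/'prev' are null.
        return requests.get(
            endpoint, headers=headers,
            params={**params, 'page_size': total}).json()

One wrinkle in the diff worth noting: `count_params = params` aliases the dict rather than copying it, so the `count` key set for the first request is still present on the second; the sketch above sidesteps that with dict unpacking.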
From e583df792a37468c8d37f1df568bac20edf153be Mon Sep 17 00:00:00 2001
From: EricWilliam Buehler
Date: Fri, 18 Nov 2022 12:09:35 -0500
Subject: [PATCH 10/11] update reqs

---
 apps/cleaning/__init__.py | 1 -
 requirements.txt          | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/cleaning/__init__.py b/apps/cleaning/__init__.py
index b6b9789..ce836c3 100644
--- a/apps/cleaning/__init__.py
+++ b/apps/cleaning/__init__.py
@@ -1,4 +1,3 @@
-from importlib.metadata import metadata
 import json
 import pandas as pd
 from collections import defaultdict
diff --git a/requirements.txt b/requirements.txt
index ac03762..6cf06d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 docopt==0.6.2
 requests==2.22.0
+pandas
\ No newline at end of file

From 38f84291b8a759ec218498da28ea9e02a5b6cfb3 Mon Sep 17 00:00:00 2001
From: buehlere
Date: Fri, 18 Nov 2022 15:37:43 -0500
Subject: [PATCH 11/11] Update README.md

:heavy_check_mark: update README for release
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 912a779..c02fadc 100644
--- a/README.md
+++ b/README.md
@@ -73,9 +73,9 @@ To access other endpoints, export the environment variable `BEAGLE_ENDPOINT`.
   ```
   beaglecli run latest-info --request-id requests.txt --completed --output-metadata-only --max-pages
   ```
-- Return and clean output meatadata for a given request id from files api
+- Return and clean output metadata for a given request id from files api
   ```
-  beaglecli files list --metadata=igoRequestId:13167_C --file-type fastq --all | python3 clean_up.py
+  beaglecli files list --metadata=igoRequestId:13167_C --file-type fastq --all --packaged
   ```
 Note: Use `requests.txt` as a template for providing a multiple request ids
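After patch 11, the released flow is `beaglecli files list ... --all --packaged`, which (per `_write_output`) drops a `<igoRequestId>.csv` and a `<igoRequestId>.json` in the working directory. A quick way to inspect the packaged JSON for the README's example request id (the file name simply follows the igoRequestId; `13167_C` is only the README example):

    import json

    with open('13167_C.json') as f:
        records = json.load(f)

    print('samples:', len(records))
    print('fields per sample:', sorted(records[0]))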