diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6e0afc6e6..208d4d52f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,7 +21,7 @@ jobs: runs-on: ubuntu-18.04 strategy: matrix: - python_version: [2.7, 3.6] + python_version: [3.6] # Steps represent a sequence of tasks that will be executed as part of the job steps: @@ -48,8 +48,3 @@ jobs: if: ${{ matrix.python_version == '3.6' }} run: | invoke test --no-flake - - - name: QA (awsf) - if: ${{ matrix.python_version == '2.7' }} - run: | - ./tests/awsf/test.sh diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/LICENSE.txt b/LICENSE.txt old mode 100644 new mode 100755 diff --git a/MANIFEST.in b/MANIFEST.in old mode 100644 new mode 100755 index fb6d6bce6..f2f823268 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,4 @@ include LICENSE.txt include requirements.txt include requirements-test.txt include tibanna/lambdas/* +include awsf3/create_ami_userdata diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/awsf/awstat b/awsf/awstat deleted file mode 100755 index 26201a961..000000000 --- a/awsf/awstat +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/python - -import sys -import subprocess -import json -import csv - -JOB_LIST_FILE="./job_list" -JOB_LIST_HEADER=['job_id', 'instance_id', 'instance_type', 'public_ip', 'tag', 'start_time', 'outbucket'] -LOGDIR="./logs" - - -def check_status (instance_id): - instance_desc_command = "aws ec2 describe-instances --instance-id={instance_id}".format(instance_id=instance_id) - try: - instance_desc_logstr=subprocess.check_output(instance_desc_command.split(' ')) # capturing stdout from the launch command - instance_desc_log=json.loads(instance_desc_logstr) - return str(instance_desc_log['Reservations'][0]['Instances'][0]['State']['Name']) - except: # instance doesn't exist any more - return 'depricated' - - -def check_success_error (jobid, outbucket, instance_status): - if instance_status=='depricated' or instance_status=='terminated' or instance_status=='shutting-down': - instance_success_command = "aws s3 ls s3://{outbucket}/{jobid}.success".format(outbucket=outbucket, jobid=jobid) - try: - subprocess.check_output(instance_success_command.split(' ')) - return 'success' - except: - return 'fail' - else: - instance_error_command = "aws s3 ls s3://{outbucket}/{jobid}.error".format(outbucket=outbucket, jobid=jobid) - try: - subprocess.check_output(instance_error_command.split(' ')) - return 'error' - except: - return 'unknown' - - -def print_log (jobid, logdir, outbucket): - instance_log_command1 = "aws s3 cp s3://{outbucket}/{jobid}.log {logdir}".format(outbucket=outbucket, jobid=jobid, logdir=logdir) - instance_log_command2 = "cat {logdir}/{jobid}.log".format(jobid=jobid, logdir=logdir) - try: - subprocess.check_output(instance_log_command1.split(' ')) - instance_log = subprocess.check_output(instance_log_command2.split(' ')) - print(instance_log) - except: - print("instance log not available.") - - -def print_postrun_json (jobid, postrun_jsondir, outbucket): - instance_postrun_json_command1 = "aws s3 cp s3://{outbucket}/{jobid}.postrun.json {postrun_jsondir}".format(outbucket=outbucket, jobid=jobid, postrun_jsondir=postrun_jsondir) - instance_postrun_json_command2 = "cat {postrun_jsondir}/{jobid}.postrun.json".format(jobid=jobid, postrun_jsondir=postrun_jsondir) - try: - subprocess.check_output(instance_postrun_json_command1.split(' ')) - instance_postrun_json = 
subprocess.check_output(instance_postrun_json_command2.split(' ')) - print(instance_postrun_json) - except: - print("instance postrun.json not available.") - - -def run(specified_job_id, is_long, is_postrun_json): - - with open(JOB_LIST_FILE, 'r') as f: - jobs=csv.reader(f, delimiter='\t') - if next(jobs)!=JOB_LIST_HEADER: - sys.exit("job_list file doesn't contain proper header. Aborting.\n") - else: - nJobs=0 - for j0 in jobs: - j=j0[0:6] - instance_id = j[1] - job_id = j[0] - out_bucket = j0[6] - - if specified_job_id and job_id != specified_job_id: - continue - - # get job status for the instance - instance_status = check_status (instance_id) - j.append(instance_status) - - # adding whether the job was a success (if it was a success, there will be a file jobid.success in the output bucket.) - instance_success = check_success_error (job_id, out_bucket, instance_status) - j.append(instance_success) - - print('\t'.join(j)) - - ## print out log - if is_long == True: - print_log(job_id, LOGDIR, out_bucket) - - ## print out postrun json - if is_postrun_json == True: - print_postrun_json(job_id, LOGDIR, out_bucket) - - nJobs += 1 - - if nJobs==0: - print("No jobs.\n"); - - - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Arguments") - parser.add_argument("-l", "--log", help="Print out log as well.", action="store_true") - parser.add_argument("-p", "--postrun_json", help="Print out postrun.json as well.", action="store_true") - parser.add_argument("-j", "--job_id", help="Look at only the specified job_id.") - args = parser.parse_args() - - run(args.job_id, args.log, args.postrun_json) - - diff --git a/awsf/awsub b/awsf/awsub deleted file mode 100755 index e27e1df84..000000000 --- a/awsf/awsub +++ /dev/null @@ -1,334 +0,0 @@ -#!/usr/bin/python -import json -import sys -import time -import random -import string -import os -import subprocess -import argparse - -## random string generator -def randomword(length): - return ''.join(random.choice(string.lowercase+string.uppercase+string.digits) for i in range(length)) - -def create_jobid (): - return randomword(12) # date+random_string - -def get_start_time (): - return time.strftime("%Y%m%d-%H:%M:%S-%Z") - -def create_json_filename (jobid, json_dir): - return json_dir + '/' + jobid + '.run.json' - -## run command and check the output -## return value is [True/False, output_string(stdout)] -## If the command failed, the first value will be False and the output string will be null. -def run_command_out_check (command): - with open(os.devnull, 'w') as shutup: - try: - res=subprocess.check_output(command.split(" "), stderr=shutup) - return([True, res]) - except subprocess.CalledProcessError: - return([False, '']) - - -def launch_and_get_instance_id (launch_command, jobid): - try: - instance_launch_logstr=subprocess.check_output(launch_command, shell=True) # capturing stdout from the launch command - except: - sys.exit("failed to launch instance for job {jobid}".format(jobid=jobid)) - instance_launch_log=json.loads(instance_launch_logstr) - return instance_launch_log['Instances'][0]['InstanceId'] - - - -def read_config( CONFIG_FILE, CONFIG_KEYS): - - ## 1. 
read .workflow.config.json file and get some variables - with open( CONFIG_FILE , 'r') as f: - cfg=json.load(f) - - # checking all the necessary keys exist - for k in CONFIG_KEYS: - if not cfg.has_key(k): - sys.exit("The config file doesn't have key {}".format(k)) - - return cfg - - - -def create_json( a, json_dir, jobid, copy_to_s3 ): # a is the final_args dictionary. json_dir is the output directory for the json file - - ## create jobid here - if not jobid: - jobid = create_jobid() - - ## start time - start_time = get_start_time() - - ## pre is a dictionary to be printed as a pre-run json file. - pre = { 'JOBID': jobid, - 'App': { - 'App_name': a['app_name'], - 'App_version': a['app_version'], - 'cwl_url': a['cwl_directory'], - 'main_cwl': a['cwl'], - 'other_cwl_files': a['cwl_children'] - }, - 'Input': { - 'Input_files_data': {}, ## fill in later (below) - 'Input_files_reference': {}, ## fill in later (below) - 'Input_parameters': a['input_parameters'] - }, - 'Output': { - 'output_bucket_directory': a['output_bucket_directory'] - }, - 'Instance_type': a['instance_type'], - 'EBS_SIZE': a['storage_size'], - 'EBS_TYPE': a['storage_type'], - 'EBS_IOPS': a['storage_iops'], - "AMI_ID": "ami-78c13615", - "start_time" : start_time - } - - # fill in input_files and input_reference_files (restructured) - for item, value in a['input_files'].iteritems(): - pre['Input']['Input_files_data'][item]={'class':'File', 'dir':a['input_files_directory'], 'path':value} - for item, value in a['input_reference_files'].iteritems(): - pre['Input']['Input_files_reference'][item]={'class':'File', 'dir':a['input_reference_files_directory'], 'path':value} - - # wrap - pre={ 'Job': pre } - - ## writing to a json file - json_filename = create_json_filename(jobid, json_dir) - try: - os.stat(json_dir) - except: - os.makedirs(json_dir) - - ## write to new json file - with open(json_filename, 'w') as json_new_f: - json.dump(pre, json_new_f, indent=4, sort_keys=True) - - - # copy the json file to the s3 bucket - if copy_to_s3==True: - command = "aws s3 cp ./{json_dir}/{jobid}.run.json s3://{json_bucket}/{jobid}.run.json".format(json_bucket=json_bucket, jobid=jobid, json_dir=json_dir) - run_command_out_check (command) - - ## print & retur JOBID - print("jobid={}".format(jobid)) - return(jobid) - - - - -def launch_instance (par, jobid, shutdown_min ): - - ## Create a userdata script to pass to the instance. The userdata script is run_workflow.$JOBID.sh. - command="./create_run_workflow.sh {jobid} {dir} {shutdown_min}".format(jobid=jobid, dir=par['userdata_dir'], shutdown_min = shutdown_min) - run_command_out_check (command) - - - ## launch an instance - print "launching an instance..." 
- - # creating a launch command - Userdata_file = "{dir}/run_workflow.{jobid}.sh".format(jobid=jobid, dir=par['userdata_dir']) - launch_command = "aws ec2 run-instances --image-id {ami} --instance-type {instance_type} --instance-initiated-shutdown-behavior terminate --count 1 --enable-api-termination --iam-instance-profile Arn={arn} --user-data={userdata}".format(ami=par['worker_ami_id'], instance_type=par['instance_type'], arn=par['s3_access_arn'], userdata='file://'+Userdata_file) - if par['keyname'] != '': - launch_command += " --key-name {keyname}".format(keyname=par['keyname']) - if par['EBS_optimized']: - launch_command += " --ebs-optimized" - if par['storage_iops']: ## io1 type, specify iops - launch_command += " --block-device-mappings DeviceName=/dev/sdb, Ebs=\"{{VolumeSize={EBS_SIZE}, VolumeType={EBS_TYPE}, Iops={EBS_IOPS}, DeleteOnTermination=true}}\"".format(EBS_SIZE=par['storage_size'], EBS_TYPE=par['storage_type'], EBS_IOPS=par['storage_iops']) - else: ## gp type or other type? do not specify iops - launch_command += " --block-device-mappings DeviceName=/dev/sdb, Ebs=\"{{VolumeSize={EBS_SIZE}, VolumeType={EBS_TYPE}, DeleteOnTermination=true}}\"".format(EBS_SIZE=par['storage_size'], EBS_TYPE=par['storage_type']) - - - # launch instance and get id - instance_id=launch_and_get_instance_id (launch_command, jobid) - - # get public IP for the instance (This may not happen immediately) - instance_desc_command = "aws ec2 describe-instances --instance-id={instance_id}".format(instance_id=instance_id) - try_again=True - while try_again: ## keep trying until you get the result. - time.sleep(1) # wait for one second before trying again. - try: - instance_desc_logstr= run_command_out_check (instance_desc_command) # sometimes you don't get a description immediately - instance_desc_log=json.loads(instance_desc_logstr[1]) - instance_ip = instance_desc_log['Reservations'][0]['Instances'][0]['PublicIpAddress'] # sometimes you get a description but PublicIP is not available yet - try_again=False - except: - try_again=True - - print("instance_id={}, instance_ip={}".format(instance_id, instance_ip)) - ## 5. 
Add to the job list - with open(par['job_list_file'], 'a') as fo: - fo.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(jobid, instance_id, par['instance_type'], instance_ip, par['job_tag'], get_start_time(), par['outbucket'])) - - - - -## main -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument("-c", "--cwl", help="main cwl file name") - parser.add_argument("-cd", "--cwl_directory", help="the url and subdirectories for the main cwl file (override config)") - parser.add_argument("-co", "--cwl_children", help="names of the other cwl files used by main cwl file, delimiated by comma") - parser.add_argument("-a", "--app_name", help="name of the app") - parser.add_argument("-av", "--app_version", help="version of the app") - parser.add_argument("-i", "--input_files", help="input files in json format (parametername:filename)") - parser.add_argument("-ir", "--input_reference_files", help="input reference files in json format (parametername:filename)") - parser.add_argument("-ip", "--input_parameters", help="input parameters in json format (parametername:value)") - parser.add_argument("-id", "--input_files_directory", help="bucket name and subdirectory for input files") - parser.add_argument("-ird", "--input_reference_files_directory", help="bucket name and subdirectory for input reference files (override config)") - parser.add_argument("-o", "--output_bucket_directory", help="bucket name and subdirectory for output files and logs (override config)") - parser.add_argument("-t", "--instance_type", help="EC2 instance type (default set in config)") - parser.add_argument("-s", "--storage_size", help="EBS storage size in GB (default set in config)") - parser.add_argument("-sT", "--storage_type", help="EBS storage type (available values: gp2, io1, st1, sc1, standard (default: io1)") - parser.add_argument("-IO", "--storage_iops", help="EBS storage IOPS (default set in config)") - parser.add_argument("-NO", "--not_EBS_optimized", help="Use this flag if the instance type is not EBS-optimized (default: EBS-optimized)", action="store_true") - parser.add_argument("-jd", "--json_dir", help="Local directory in which the output json file will be written (default set in config)") - parser.add_argument("-J", "--job_id", help="Manually assign job ID as specififed (default: randomly generated)") - parser.add_argument("-m", "--shutdown_min", help="Number of minutes before shutdown after the jobs are finished. 
(default now)") - parser.add_argument("-u", "--copy_to_s3", help="Upload or copy the json file to S3 bucket json_bucket", action="store_true") - parser.add_argument("-e", "--launch_instance", help="Launch instance based on the json file.", action="store_true") - - - args = parser.parse_args() - - ## default variables - CONFIG_FILE=".tibanna.config" - CONFIG_KEYS=["reference_S3_bucket", "output_S3_bucket", "s3_access_arn", "keyname", "worker_ami_id", "default_instance_type", "default_ebs_size", "default_ebs_type", "ebs_iops", "userdata_dir", "json_dir", "json_bucket", "cwl_url", "job_list_file"] - cfg=read_config(CONFIG_FILE, CONFIG_KEYS) - - ## parameters that will go into the json file - final_args={ - 'cwl': '', ## required - 'cwl_directory': cfg['cwl_url'], - 'cwl_children': [], - 'app_name': '', - 'app_version': '', - 'input_files': {}, - 'input_reference_files': {}, - 'input_parameters': {}, - 'input_files_directory': '', ## required if input_files is not null - 'input_reference_files_directory': cfg['reference_S3_bucket'], - 'output_bucket_directory': cfg['output_S3_bucket'], - 'instance_type': cfg['default_instance_type'], - 'storage_size': cfg['default_ebs_size'], - 'storage_type': cfg['default_ebs_type'], - 'storage_iops': cfg['ebs_iops'] - } - # local directory in which the json file will be first created. - json_dir=cfg['json_dir'] - # bucket name to which the json file will be sent. - json_bucket=cfg['json_bucket'] - - # parameters needed to launch an instance - par={ - 's3_access_arn': cfg['s3_access_arn'], - 'worker_ami_id': cfg['worker_ami_id'], - 'keyname': cfg['keyname'], - 'userdata_dir': cfg['userdata_dir'], - 'instance_type': cfg['default_instance_type'], # redundant with final_args - 'storage_size': cfg['default_ebs_size'], # redudant with final_args - 'storage_type': cfg['default_ebs_type'], # redudant with final_args - 'storage_iops': cfg['ebs_iops'], # redundant with final_args - 'EBS_optimized': True, - 'job_list_file': cfg['job_list_file'], - 'job_tag': '', # app_name in final_args - 'outbucket': cfg['output_S3_bucket'] # redundant with output_bucket_directory in final_args - } - - shutdown_min='now' - - - if args.cwl: - final_args['cwl']=args.cwl - else: - sys.exit("cwl field is required") - - if args.cwl_directory: - final_args['cwl_directory']=args.cwl_directory - - if args.cwl_children: - final_args['cwl_children']=args.cwl_children.split(',') - - if args.app_name: - final_args['app_name']=args.app_name - - if args.app_version: - final_args['app_version']=args.app_version - - if args.input_files: - print args.input_files ##debugging - final_args['input_files']=json.loads(args.input_files) - - if args.input_reference_files: - final_args['input_reference_files']=json.loads(args.input_reference_files) - - if args.input_parameters: - final_args['input_parameters']=json.loads(args.input_parameters) - - if args.input_files_directory: - final_args['input_files_directory'] = args.input_files_directory - elif bool(final_args['input_files']): - sys.exit("input_files_directory must be provided if input_files is provided.") - - if args.input_reference_files_directory: - final_args['input_reference_files_directory'] = args.input_reference_files_directory - - if args.output_bucket_directory: - final_args['output_bucket_directory']=args.output_bucket_directory - - if args.instance_type: - final_args['instance_type']=args.instance_type - - if args.storage_size: - final_args['storage_size']=int(args.storage_size) - - if args.storage_type: - 
final_args['storage_type']=args.storage_type - - if args.storage_iops: - final_args['storage_iops']=int(args.storage_iops) - - if args.not_EBS_optimized: - par['EBS_optimized']=False - - if args.json_dir: - json_dir=args.json_dir - - if args.shutdown_min: - shutdown_min=args.shutdown_min - - - # make sure these parameters are consistent between par and a. - par['instance_type'] =final_args['instance_type'] - par['storage_size']= final_args['storage_size'] - par['storage_type']= final_args['storage_type'] - par['storage_iops']= final_args['storage_iops'] - par['job_tag']=final_args['app_name'] - par['outbucket']=final_args['output_bucket_directory'] - - # EBS type - if par['storage_type'] != 'io1': - par['storage_iops']='' - final_args['storage_iops']='' - - - # create json and copy to s3 - jobid=create_json(final_args, json_dir, args.job_id, args.copy_to_s3 ) - - # launch instance and execute workflow - if args.launch_instance: - launch_instance(par, jobid, shutdown_min) - - - - diff --git a/awsf/create_run_workflow.sh b/awsf/create_run_workflow.sh deleted file mode 100755 index c0e76e6f3..000000000 --- a/awsf/create_run_workflow.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -JOBID=$1 -USERDATA_DIR=$2 -SHUTDOWN_MIN=$3 -[!-d $USERDATA_DIR] || mkdir -p $USERDATA_DIR -RUN_WORKFLOW_FILE=$USERDATA_DIR/run_workflow.$JOBID.sh -SCRIPT_URL=https://raw.githubusercontent.com/hms-dbmi/tibanna/master/ -echo "#!/bin/bash" > $RUN_WORKFLOW_FILE -echo "JOBID=$JOBID" >> $RUN_WORKFLOW_FILE -echo "RUN_SCRIPT=aws_run_workflow.sh" >> $RUN_WORKFLOW_FILE -echo "SCRIPT_URL=$SCRIPT_URL" >> $RUN_WORKFLOW_FILE -echo "wget \$SCRIPT_URL/\$RUN_SCRIPT" >> $RUN_WORKFLOW_FILE -echo "chmod +x \$RUN_SCRIPT" >> $RUN_WORKFLOW_FILE -echo "source \$RUN_SCRIPT \$JOBID" \$SHUTDOWN_MIN >> $RUN_WORKFLOW_FILE - diff --git a/awsf3-docker/Dockerfile b/awsf3-docker/Dockerfile new file mode 100755 index 000000000..e11a36001 --- /dev/null +++ b/awsf3-docker/Dockerfile @@ -0,0 +1,103 @@ +FROM ubuntu:20.04 +MAINTAINER Soo Lee (duplexa@gmail.com) + + +# general updates & installing necessary Linux components +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +RUN apt update -y && apt upgrade -y && apt install -y \ + apt-transport-https \ + bzip2 \ + ca-certificates \ + cron \ + curl \ + fuse \ + gcc \ + g++ \ + git \ + less \ + locales \ + make \ + python3 \ + python3-pip \ + time \ + unzip \ + vim \ + wget \ + software-properties-common \ + build-essential \ + libssl-dev \ + libwww-perl \ + libdatetime-perl \ + uuid-dev \ + libgpgme11-dev \ + squashfs-tools \ + libseccomp-dev \ + pkg-config \ + openjdk-8-jre-headless \ + nodejs + +RUN ln -s /usr/bin/python3.8 /usr/bin/python +RUN ln -s /usr/bin/pip3 /usr/bin/pip + +WORKDIR /usr/local/bin + +# docker inside docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \ + && apt-key fingerprint 0EBFCD88 \ + && add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" +RUN apt-get update -y \ + && apt-cache policy docker-ce \ + && apt-get install -y docker-ce + +# singularity +RUN wget https://golang.org/dl/go1.15.3.linux-amd64.tar.gz && \ + tar -xzf go1.15.3.linux-amd64.tar.gz && \ + rm go1.15.3.linux-amd64.tar.gz +RUN export SINGULARITY_VERSION=3.3.0 && \ + export PATH=/usr/local/bin/go/bin/:$PATH && \ + wget https://github.com/sylabs/singularity/releases/download/v${SINGULARITY_VERSION}/singularity-${SINGULARITY_VERSION}.tar.gz && \ + tar -xzf singularity-${SINGULARITY_VERSION}.tar.gz && \ + rm 
singularity-${SINGULARITY_VERSION}.tar.gz && \ + cd singularity && \ + ./mconfig && \ + make -C ./builddir && \ + make -C ./builddir install && \ + cd .. && \ + rm -rf go && \ + mv singularity/singularity singularity2 && \ + rm -rf singularity && \ + mv singularity2 singularity + +# goofys +RUN curl -O -L http://bit.ly/goofys-latest && chmod +x goofys-latest + +# python packages +RUN pip install boto3==1.15 awscli==1.18.152 botocore==1.18.11 +RUN pip install psutil==5.7.3 +RUN pip install schema-salad==7.0.20200811075006 cwltool==3.0.20201017180608 +RUN pip install ec2metadata==2.0.1 + +# cromwell for WDL 1.0 +RUN wget https://github.com/broadinstitute/cromwell/releases/download/53.1/cromwell-53.1.jar && \ + ln -s cromwell-53.1.jar cromwell.jar +# Old cromwell for WDL draft-2 +RUN wget https://github.com/broadinstitute/cromwell/releases/download/31/cromwell-31.jar +RUN wget https://github.com/broadinstitute/cromwell/blob/develop/LICENSE.txt # cromwell license + +# awsf scripts +COPY run.sh . +COPY cron.sh . +RUN chmod +x run.sh cron.sh +ARG version +RUN pip install tibanna==$version + +# Move default docker daemon location to mounted EBS +COPY daemon.json /etc/docker/daemon.json + + +# supporting UTF-8 +RUN locale-gen "en_US.UTF-8" && update-locale LC_ALL="en_US.UTF-8" +ENV LC_ALL=en_US.UTF-8 + +CMD ["bash"] diff --git a/awsf3-docker/cron.sh b/awsf3-docker/cron.sh new file mode 100644 index 000000000..8d08aaa1d --- /dev/null +++ b/awsf3-docker/cron.sh @@ -0,0 +1,36 @@ +#!/bin/bash +shopt -s extglob + +printHelpAndExit() { + echo "Usage: ${0##*/} -l LOGBUCKET -L LOGFILE -t TOPFILE -T TOPLATESTFILE" + echo "-l LOGBUCKET : bucket for sending log file (required)" + echo "-L LOGFILE : path of log file (required)" + echo "-t TOPFILE : path of top file (required)" + echo "-T TOPLATESTFILE : path of top_latest file (required)" + exit "$1" +} +while getopts "l:L:t:T:" opt; do + case $opt in + l) export LOGBUCKET=$OPTARG;; # bucket for sending log file + L) export LOGFILE=$OPTARG;; # path of log file + t) export TOPFILE=$OPTARG;; # path of top file + T) export TOPLATESTFILE=$OPTARG;; # path of top_latest file + h) printHelpAndExit 0;; + [?]) printHelpAndExit 1;; + esac +done + +# function that executes a command and collecting log +extp(){ $@ > $TOPLATESTFILE; cat $TOPLATESTFILE >> $TOPFILE; } ## usage: extp command + +# function that sends log to s3 +send_top(){ /usr/local/bin/aws s3 cp $TOPFILE s3://$LOGBUCKET; /usr/local/bin/aws s3 cp $TOPLATESTFILE s3://$LOGBUCKET; } ## usage: send_top (no argument) +send_log(){ /usr/local/bin/aws s3 cp $LOGFILE s3://$LOGBUCKET; } ## usage: send_log (no argument) + +# add margin and timestamp to a command +stamp_command() { echo; echo -n 'Timestamp: '; date +%F-%H:%M:%S; $@; echo; } + +extp stamp_command top -b -n 1 -i -c -w512 +send_top +send_log + diff --git a/awsf3-docker/daemon.json b/awsf3-docker/daemon.json new file mode 100644 index 000000000..31c800b37 --- /dev/null +++ b/awsf3-docker/daemon.json @@ -0,0 +1,3 @@ +{ + "data-root": "/mnt/data1/docker" +} diff --git a/awsf3-docker/run.sh b/awsf3-docker/run.sh new file mode 100755 index 000000000..c4732711d --- /dev/null +++ b/awsf3-docker/run.sh @@ -0,0 +1,314 @@ +#!/bin/bash +shopt -s extglob +export SINGULARITY_OPTION= +export STATUS=0 +export LOGBUCKET= + +printHelpAndExit() { + echo "Usage: ${0##*/} -i JOBID -l LOGBUCKET -f EBS_DEVICE [-S STATUS] [-g]" + echo "-i JOBID : awsem job id (required)" + echo "-l LOGBUCKET : bucket for sending log file (required)" + echo "-f EBS_DEVICE : file system 
(/dev/xxxx) for data EBS" + echo "-S STATUS: inherited status environment variable, if any" + echo "-g : use singularity" + exit "$1" +} +while getopts "i:l:f:S:g" opt; do + case $opt in + i) export JOBID=$OPTARG;; + l) export LOGBUCKET=$OPTARG;; # bucket for sending log file + f) export EBS_DEVICE=$OPTARG;; # file system (/dev/xxxx) for data EBS + S) export STATUS=$OPTARG;; # inherited STATUS env + g) export SINGULARITY_OPTION=--singularity;; # use singularity + h) printHelpAndExit 0;; + [?]) printHelpAndExit 1;; + esac +done + +export RUN_JSON_FILE_NAME=$JOBID.run.json +export POSTRUN_JSON_FILE_NAME=$JOBID.postrun.json +export EBS_DIR=/data1 ## WARNING: also hardcoded in aws_decode_run_json.py +export LOCAL_OUTDIR=$EBS_DIR/out +export LOCAL_INPUT_DIR=$EBS_DIR/input ## WARNING: also hardcoded in aws_decode_run_json.py +export LOCAL_WF_TMPDIR=$EBS_DIR/tmp +export MD5FILE=$JOBID.md5sum.txt +export INPUT_YML_FILE=inputs.yml +export DOWNLOAD_COMMAND_FILE=download_command_list.txt +export MOUNT_COMMAND_FILE=mount_command_list.txt +export ENV_FILE=env_command_list.txt +export LOGFILE=$LOCAL_OUTDIR/$JOBID.log +export LOGJSONFILE=$LOCAL_OUTDIR/$JOBID.log.json +export ERRFILE=$LOCAL_OUTDIR/$JOBID.error # if this is found on s3, that means something went wrong. +export TOPFILE=$LOCAL_OUTDIR/$JOBID.top # now top command output goes to a separate file +export TOPLATESTFILE=$LOCAL_OUTDIR/$JOBID.top_latest # this one includes only the latest top command output +export INSTANCE_ID=$(ec2metadata --instance-id|cut -d' ' -f2) +export INSTANCE_REGION=$(ec2metadata --availability-zone | sed 's/[a-z]$//') +export AWS_ACCOUNT_ID=$(aws sts get-caller-identity| grep Account | sed 's/[^0-9]//g') +export AWS_REGION=$INSTANCE_REGION # this is for importing awsf3 package which imports tibanna package + +# function that executes a command and collecting log +exl(){ $@ >> $LOGFILE 2>> $LOGFILE; handle_error $?; } ## usage: exl command ## ERRCODE has the error code for the command. if something is wrong, send error to s3. +exlj(){ $@ >> $LOGJSONFILE 2>> $LOGFILE; $ERRCODE=$?; cat $LOGJSONFILE >> $LOGFILE; handle_error $ERRCODE; } ## usage: exl command ## ERRCODE has the error code for the command. if something is wrong, send error to s3. This one separates stdout to json as well. +exle(){ $@ >> /dev/null 2>> $LOGFILE; handle_error $?; } ## usage: exle command ## ERRCODE has the error code for the command. if something is wrong, send error to s3. This one eats stdout. Useful for downloading/uploading files to/from s3, because it writes progress to stdout. +exlo(){ $@ 2>> /dev/null >> $LOGFILE; handle_error $?; } ## usage: exlo command ## ERRCODE has the error code for the command. if something is wrong, send error to s3. This one eats stderr. Useful for hiding long errors or credentials. + +# function that sends log to s3 (it requires LOGBUCKET to be defined, which is done by sourcing $ENV_FILE.) +send_log(){ aws s3 cp $LOGFILE s3://$LOGBUCKET &>/dev/null; } ## usage: send_log (no argument) + +# function that sends error file to s3 to notify something went wrong. 
+send_error(){ touch $ERRFILE; aws s3 cp $ERRFILE s3://$LOGBUCKET; } ## usage: send_error (no argument) + +# function that handles errors - this function calls send_error and send_log +handle_error() { ERRCODE=$1; export STATUS+=,$ERRCODE; if [ "$ERRCODE" -ne 0 ]; then send_error; send_log; exit $ERRCODE; fi; } ## usage: handle_error + + +# make sure log bucket is defined +if [ -z "$LOGBUCKET" ]; then + exl echo "Error: log bucket not defined" + send_error 1; +fi + + +# EBS_DIR cannot be directly mounted to docker container since it's already a mount point for EBS, +# so mount /mnt/data1/ instead and create a symlink. +ln -s /mnt/$EBS_DIR $EBS_DIR + +# Transferring profile info +ln -s /home/ubuntu/.aws /root/.aws + +# log the first message from the container +exl echo +exl echo "## AWSF Docker container created" +exl echo "## instance id: $INSTANCE_ID" +exl echo "## instance region: $INSTANCE_REGION" + +# docker start +exl echo +exl echo "## Starting docker in the AWSF container" +exl service docker start + + +# versions of various tools +exl echo +exl echo "## $(docker --version)" +exl echo "## $(python --version)" +exl echo "## $(pip --version | cut -f1,2 -d' ')" +exl echo "## tibanna awsf3 version $(tibanna --version | cut -f2 -d' ')" +exl echo "## cwltool version $(cwltool --version | cut -f2 -d' ')" +exl echo "## cromwell version $(java -jar /usr/local/bin/cromwell-31.jar --version | cut -f2 -d ' ') for WDL draft2" +exl echo "## cromwell version $(java -jar /usr/local/bin/cromwell.jar --version | cut -f2 -d ' ') for WDL v1.0" +exl echo "## $(singularity --version)" + + +# getting run.json file +exl echo +exl echo "## Downloading and parsing run.json file" +exl cd /home/ubuntu/ +exl aws s3 cp s3://$LOGBUCKET/$RUN_JSON_FILE_NAME . +exl chmod -R +x . +exl awsf3 decode_run_json -i $RUN_JSON_FILE_NAME + + +### add instance ID and file system to postrun json and upload to s3 +exl awsf3 update_postrun_json_init -i $RUN_JSON_FILE_NAME -o $POSTRUN_JSON_FILE_NAME +exl awsf3 upload_postrun_json -i $POSTRUN_JSON_FILE_NAME + + +# setting additional env variables including LANGUAGE and language-related envs. +exl source $ENV_FILE + + +# create subdirectories +if [[ $LANGUAGE == 'wdl' || $LANGUAGE == 'wdl_v1' || $LANGUAGE == 'wdl_draft2' ]] +then + export LOCAL_WFDIR=$EBS_DIR/wdl +elif [[ $LANGUAGE == 'snakemake' ]] +then + export LOCAL_WFDIR=$EBS_DIR/snakemake +elif [[ $LANGUAGE == 'shell' ]] +then + export LOCAL_WFDIR=$EBS_DIR/shell +else + export LOCAL_WFDIR=$EBS_DIR/cwl +fi +exl mkdir -p $LOCAL_WFDIR +send_log + + +### download cwl from github or any other url. 
+exl echo +exl echo "## Downloading workflow files" +exl awsf3 download_workflow + + +### log into ECR if necessary +exl echo +exl echo "## Logging into ECR" +exlo docker login --username AWS --password $(aws ecr get-login-password --region $INSTANCE_REGION) $AWS_ACCOUNT_ID.dkr.ecr.$INSTANCE_REGION.amazonaws.com; +send_log + + +### download data & reference files from s3 +exl echo +exl echo "## Downloading data & reference files from S3" +exl date +exl mkdir -p $LOCAL_INPUT_DIR +exl cat $DOWNLOAD_COMMAND_FILE +exle source $DOWNLOAD_COMMAND_FILE +exl date +send_log + + +### mount input buckets +exl echo +exl echo "## Mounting input S3 buckets" +exl cat $MOUNT_COMMAND_FILE +exle source $MOUNT_COMMAND_FILE +send_log + + +### just some more logging +exl echo +exl echo "## Current file system status" +exl df -h +exl echo +exl ls -lh $EBS_DIR +exl echo +exl ls -lhR $LOCAL_INPUT_DIR +send_log + + +# set up cronjob for top command +exl echo +exl echo "## Setting up and starting cron job for top commands" +exl service cron start +echo "*/1 * * * * /usr/local/bin/cron.sh -l $LOGBUCKET -L $LOGFILE -t $TOPFILE -T $TOPLATESTFILE" | crontab - + + +### run command +exl echo +exl echo "## Running CWL/WDL/Snakemake/Shell commands" +exl echo +if [[ $LANGUAGE == 'wdl' ]] +then + exl echo "## workflow language: $LANGUAGE (wdl_draft2)" +else + exl echo "## workflow language: $LANGUAGE" +fi +exl echo "## $(docker info | grep 'Operating System')" +exl echo "## $(docker info | grep 'Docker Root Dir')" +exl echo "## $(docker info | grep 'CPUs')" +exl echo "## $(docker info | grep 'Total Memory')" +exl echo +send_log +cwd0=$(pwd) +cd $LOCAL_WFDIR +mkdir -p $LOCAL_WF_TMPDIR +if [[ $LANGUAGE == 'wdl_v1' ]] +then + exl java -jar /usr/local/bin/cromwell.jar run $MAIN_WDL -i $cwd0/$INPUT_YML_FILE -m $LOGJSONFILE + handle_error $? +elif [[ $LANGUAGE == 'wdl_draft2' || $LANGUAGE == 'wdl' ]] # 'wdl' defaults to 'wdl_draft2' for backward compatibility +then + exl java -jar /usr/local/bin/cromwell-31.jar run $MAIN_WDL -i $cwd0/$INPUT_YML_FILE -m $LOGJSONFILE + handle_error $? +elif [[ $LANGUAGE == 'snakemake' ]] +then + exl echo "running $COMMAND in docker image $CONTAINER_IMAGE..." + docker run --privileged -v $EBS_DIR:$EBS_DIR:rw -w $LOCAL_WFDIR $DOCKER_ENV_OPTION $CONTAINER_IMAGE sh -c "$COMMAND" >> $LOGFILE 2>> $LOGFILE; + handle_error $? +elif [[ $LANGUAGE == 'shell' ]] +then + exl echo "running $COMMAND in docker image $CONTAINER_IMAGE..." + exl echo "docker run --privileged -v $EBS_DIR:$EBS_DIR:rw -w $LOCAL_WFDIR $DOCKER_ENV_OPTION $CONTAINER_IMAGE sh -c \"$COMMAND\"" + docker run --privileged -v $EBS_DIR:$EBS_DIR:rw -w $LOCAL_WFDIR $DOCKER_ENV_OPTION $CONTAINER_IMAGE sh -c "$COMMAND" >> $LOGFILE 2>> $LOGFILE; + handle_error $? +else + if [[ $LANGUAGE == 'cwl_draft3' ]] + then + exl echo + exl echo "Error: CWL draft3 is no longer supported. Please switch to v1" + handle_error 1 + fi + exlj cwltool --enable-dev --non-strict --no-read-only --no-match-user --outdir $LOCAL_OUTDIR --tmp-outdir-prefix $LOCAL_WF_TMPDIR --tmpdir-prefix $LOCAL_WF_TMPDIR $PRESERVED_ENV_OPTION $SINGULARITY_OPTION $MAIN_CWL $cwd0/$INPUT_YML_FILE + handle_error $? 
+fi +cd $cwd0 +exl echo +exl echo "Finished running the command/workflow" +send_log + +### copy output files to s3 +exl echo +exl echo "## Calculating md5sum of output files" +exl date +md5sum $LOCAL_OUTDIR/* | grep -v "$JOBID" >> $MD5FILE ; ## calculate md5sum for output files (except log file, to avoid confusion) +exl cat $MD5FILE +mv $MD5FILE $LOCAL_OUTDIR +exl date ## done time +send_log + +exl echo +exl echo "## Current file system status" +exl df -h +exl echo +exl ls -lhtrR $LOCAL_OUTDIR/ +exl echo +exl ls -lhtr $EBS_DIR/ +exl echo +exl ls -lhtrR $LOCAL_INPUT_DIR/ +send_log + +# more comprehensive log for wdl +if [[ $LANGUAGE == 'wdl' || $LANGUAGE == 'wdl_v1' || $LANGUAGE == 'wdl_draft2' ]] +then + exl echo + exl echo "## Uploading WDL log files to S3" + cwd0=$(pwd) + cd $LOCAL_WFDIR + find . -type f -name 'stdout' -or -name 'stderr' -or -name 'script' -or \ +-name '*.qc' -or -name '*.txt' -or -name '*.log' -or -name '*.png' -or -name '*.pdf' \ +| xargs tar -zcvf debug.tar.gz + exle aws s3 cp debug.tar.gz s3://$LOGBUCKET/$JOBID.debug.tar.gz + cd $cwd0 +fi + +exl echo +exl echo "## Uploading output files to S3" +if [[ $LANGUAGE == 'snakemake' || $LANGUAGE == 'shell' ]] +then + # no log json file is produced + export LOGJSON_OPTION= +else + export LOGJSON_OPTION="-e $LOGJSONFILE" +fi +exl awsf3 update_postrun_json_upload_output -i $POSTRUN_JSON_FILE_NAME $LOGJSON_OPTION -m $LOCAL_OUTDIR/$MD5FILE -o $POSTRUN_JSON_FILE_NAME -L $LANGUAGE +exl awsf3 upload_postrun_json -i $POSTRUN_JSON_FILE_NAME +send_log + +### updating status +exl echo +exl echo "## Checking the job status (0 means success)" +## if STATUS is 21,0,0,1 JOB_STATUS is 21,0,0,1. If STATUS is 0,0,0,0,0,0, JOB_STATUS is 0. +if [ $(echo $STATUS| sed 's/0//g' | sed 's/,//g') ]; then export JOB_STATUS=$STATUS ; else export JOB_STATUS=0; fi +exl echo "JOB_STATUS=$JOB_STATUS" +# This env variable (JOB_STATUS) will be read by aws_update_run_json.py and the result will go into $POSTRUN_JSON_FILE_NAME. + +# update & upload postrun json +exl echo +exl echo "## Updating postrun json file with status, time stamp, input & output size" +# create a postrun.json file that contains the information in the run.json file and additional information (status, stop_time) +export INPUTSIZE=$(du -csh /data1/input| tail -1 | cut -f1) +export TEMPSIZE=$(du -csh /data1/tmp*| tail -1 | cut -f1) +export OUTPUTSIZE=$(du -csh /data1/out| tail -1 | cut -f1) +exl awsf3 update_postrun_json_final -i $POSTRUN_JSON_FILE_NAME -o $POSTRUN_JSON_FILE_NAME -l $LOGFILE +exl awsf3 upload_postrun_json -i $POSTRUN_JSON_FILE_NAME + +# send the final log +exl echo +exl echo "Done" +exl date +send_log + +# send success message +if [ ! -z $JOB_STATUS -a $JOB_STATUS == 0 ]; then touch $JOBID.success; aws s3 cp $JOBID.success s3://$LOGBUCKET/; fi diff --git a/awsf/__init__.py b/awsf3/__init__.py old mode 100644 new mode 100755 similarity index 100% rename from awsf/__init__.py rename to awsf3/__init__.py diff --git a/awsf3/__main__.py b/awsf3/__main__.py new file mode 100755 index 000000000..1932fc493 --- /dev/null +++ b/awsf3/__main__.py @@ -0,0 +1,126 @@ +""" +CLI for tibanna awsf3 package +""" + +# -*- coding: utf-8 -*- +import argparse +import inspect +from tibanna._version import __version__ # for now use the same version as tibanna +from . 
import utils + + +PACKAGE_NAME = 'awsf3' + + +class Subcommands(object): + + def __init__(self): + pass + + @property + def descriptions(self): + return { + 'decode_run_json': 'decode run json', + 'download_workflow': 'download workflow files', + 'update_postrun_json_init': 'update json json with instance ID and file system', + 'upload_postrun_json': 'upload postrun json file', + 'update_postrun_json_upload_output': 'update json json with output paths/target/md5 and upload outupt', + 'update_postrun_json_final': 'update postrun json with status, time stamp etc' + } + + @property + def args(self): + return { + 'decode_run_json': + [{'flag': ["-i", "--input-run-json"], 'help': "input run json file"}], + 'download_workflow': + [], + 'update_postrun_json_init': + [{'flag': ["-i", "--input-json"], 'help': "input run/postrun json file"}, + {'flag': ["-o", "--output-json"], 'help': "output postrun json file"}], + 'update_postrun_json_upload_output': + [{'flag': ["-i", "--input-json"], 'help': "input run/postrun json file"}, + {'flag': ["-e", "--execution-metadata-file"], + 'help': "execution metadata file (output json of cwltool / cromwell)"}, + {'flag': ["-m", "--md5file"], 'help': "text file storing md5 values for output files"}, + {'flag': ["-o", "--output-json"], 'help': "output postrun json file"}, + {'flag': ["-L", "--language"], 'help': "language", 'default': "cwl_v1"}], + 'upload_postrun_json': + [{'flag': ["-i", "--input-json"], 'help': "input postrun json file to upload to s3"}], + 'update_postrun_json_final': + [{'flag': ["-i", "--input-json"], 'help': "input run/postrun json file"}, + {'flag': ["-o", "--output-json"], 'help': "output postrun json file"}, + {'flag': ["-l", "--logfile"], 'help': "Tibanna awsem log file"}], + } + + +def decode_run_json(input_run_json): + utils.decode_run_json(input_run_json) + + +def download_workflow(): + utils.download_workflow() + + +def update_postrun_json_init(input_json, output_json): + utils.update_postrun_json_init(input_json, output_json) + + +def update_postrun_json_upload_output(input_json, execution_metadata_file, md5file, output_json, language): + utils.update_postrun_json_upload_output(input_json, execution_metadata_file, md5file, output_json, language) + + +def upload_postrun_json(input_json): + utils.upload_postrun_json(input_json) + + +def update_postrun_json_final(input_json, output_json, logfile): + utils.update_postrun_json_final(input_json, output_json, logfile) + + +def main(Subcommands=Subcommands): + """ + Execute the program from the command line + """ + scs = Subcommands() + + # the primary parser is used for awsf -v or -h + primary_parser = argparse.ArgumentParser(prog=PACKAGE_NAME, add_help=False) + primary_parser.add_argument('-v', '--version', action='version', + version='%(prog)s ' + __version__) + # the secondary parser is used for the specific run mode + secondary_parser = argparse.ArgumentParser(prog=PACKAGE_NAME, parents=[primary_parser]) + subparsers = secondary_parser.add_subparsers( + title=PACKAGE_NAME + ' subcommands', + description='choose one of the following subcommands to run ' + PACKAGE_NAME, + dest='subcommand', + metavar='subcommand: {%s}' % ', '.join(scs.descriptions.keys()) + ) + subparsers.required = True + + def add_arg(name, flag, **kwargs): + subparser[name].add_argument(flag[0], flag[1], **kwargs) + + def add_args(name, argdictlist): + for argdict in argdictlist: + add_arg(name, **argdict) + + subparser = dict() + for sc, desc in scs.descriptions.items(): + subparser[sc] = subparsers.add_parser(sc, 
help=desc, description=desc) + if sc in scs.args: + add_args(sc, scs.args[sc]) + + # two step argument parsing + # first check for top level -v or -h (i.e. `tibanna -v`) + (primary_namespace, remaining) = primary_parser.parse_known_args() + # get subcommand-specific args + args = secondary_parser.parse_args(args=remaining, namespace=primary_namespace) + subcommandf = eval(args.subcommand) + sc_args = [getattr(args, sc_arg) for sc_arg in inspect.getargspec(subcommandf).args] + # run subcommand + subcommandf(*sc_args) + + +if __name__ == '__main__': + main() diff --git a/awsf3/aws_run_workflow_generic.sh b/awsf3/aws_run_workflow_generic.sh new file mode 100755 index 000000000..0ebb718c6 --- /dev/null +++ b/awsf3/aws_run_workflow_generic.sh @@ -0,0 +1,176 @@ +#!/bin/bash +shopt -s extglob +export TIBANNA_VERSION= +export AWSF_IMAGE= +export SHUTDOWN_MIN=now +export PASSWORD= +export ACCESS_KEY= +export SECRET_KEY= +export REGION= +export SINGULARITY_OPTION_TO_PASS= + +printHelpAndExit() { + echo "Usage: ${0##*/} -i JOBID -l LOGBUCKET -V VERSION -A AWSF_IMAGE [-m SHUTDOWN_MIN] [-p PASSWORD] [-a ACCESS_KEY] [-s SECRET_KEY] [-r REGION] [-g]" + echo "-i JOBID : awsem job id (required)" + echo "-l LOGBUCKET : bucket for sending log file (required)" + echo "-V TIBANNA_VERSION : tibanna version (used in the run_task lambda that launched this instance)" + echo "-A AWSF_IMAGE : docker image name for awsf3 (e.g. 4dn-dcic/tibanna-awsf3:1.0.0)" + echo "-m SHUTDOWN_MIN : Possibly user can specify SHUTDOWN_MIN to hold it for a while for debugging. (default 'now')" + echo "-p PASSWORD : Password for ssh connection for user ec2-user (if not set, no password-based ssh)" + echo "-a ACCESS_KEY : access key for certain s3 bucket access (if not set, use IAM permission only)" + echo "-s SECRET_KEY : secret key for certian s3 bucket access (if not set, use IAM permission only)" + echo "-r REGION : region for the profile set for certain s3 bucket access (if not set, use IAM permission only)" + echo "-g : use singularity" + exit "$1" +} +while getopts "i:m:l:p:a:s:r:gV:A:" opt; do + case $opt in + i) export JOBID=$OPTARG;; + l) export LOGBUCKET=$OPTARG;; # bucket for sending log file + V) export TIBANNA_VERSION=$OPTARG;; # version of tibanna used in the run_task lambda that launched this instance + A) export AWSF_IMAGE=$OPTARG;; # docker image name for awsf3 (e.g. 4dn-dcic/tibanna-awsf3:1.0.0) + m) export SHUTDOWN_MIN=$OPTARG;; # Possibly user can specify SHUTDOWN_MIN to hold it for a while for debugging. + p) export PASSWORD=$OPTARG ;; # Password for ssh connection for user ec2-user + a) export ACCESS_KEY=$OPTARG;; # access key for certain s3 bucket access + s) export SECRET_KEY=$OPTARG;; # secret key for certian s3 bucket access + r) export REGION=$OPTARG;; # region for the profile set for certian s3 bucket access + g) export SINGULARITY_OPTION_TO_PASS=-g;; # use singularity + h) printHelpAndExit 0;; + [?]) printHelpAndExit 1;; + esac +done + +export EBS_DIR=/data1 ## WARNING: also hardcoded in aws_decode_run_json.py +export LOCAL_OUTDIR=$EBS_DIR/out +export LOGFILE1=templog___ # log before mounting ebs +export LOGFILE2=$LOCAL_OUTDIR/$JOBID.log +export STATUS=0 +export ERRFILE=$LOCAL_OUTDIR/$JOBID.error # if this is found on s3, that means something went wrong. 
+export INSTANCE_REGION=$(ec2metadata --availability-zone | sed 's/[a-z]$//') + + +# function that executes a command and collecting log +exl(){ $@ >> $LOGFILE 2>> $LOGFILE; handle_error $?; } ## usage: exl command ## ERRCODE has the error code for the command. if something is wrong, send error to s3. + +# function that sends log to s3 (it requires LOGBUCKET to be defined, which is done by sourcing $ENV_FILE.) +send_log(){ aws s3 cp $LOGFILE s3://$LOGBUCKET &>/dev/null; } ## usage: send_log (no argument) + +# function that sends error file to s3 to notify something went wrong. +send_error(){ touch $ERRFILE; aws s3 cp $ERRFILE s3://$LOGBUCKET; } ## usage: send_error (no argument) + +# function that handles errors - this function calls send_error and send_log +handle_error() { ERRCODE=$1; STATUS+=,$ERRCODE; if [ "$ERRCODE" -ne 0 ]; then send_error; send_log; shutdown -h $SHUTDOWN_MIN; fi; } ## usage: handle_error + +### start with a log under the home directory for ubuntu. Later this will be moved to the output directory, once the ebs is mounted. +export LOGFILE=$LOGFILE1 +cd /home/ubuntu/ +touch $LOGFILE + + +# make sure log bucket is defined +if [ -z "$LOGBUCKET" ]; then + exl echo "Error: log bucket not defined"; # just add this message to the log file, which may help debugging by ssh + shutdown -h $SHUTDOWN_MIN; +fi +# tibanna version and awsf image should also be defined +if [ -z "$TIBANNA_VERSION" ]; then + exl echo "Error: tibanna lambda version is not defined"; + handle_error; +fi +if [ -z "$AWSF_IMAGE" ]; then + exl echo "Error: awsf docker image is not defined"; + handle_error; +fi + + +### send job start message to S3 +touch $JOBID.job_started +aws s3 cp $JOBID.job_started s3://$LOGBUCKET/$JOBID.job_started + +### start logging +### env +exl echo "## job id: $JOBID" +exl echo "## instance type: $(ec2metadata --instance-type)" +exl echo "## instance id: $(ec2metadata --instance-id)" +exl echo "## instance region: $INSTANCE_REGION" +exl echo "## tibanna lambda version: $TIBANNA_VERSION" +exl echo "## awsf image: $AWSF_IMAGE" +exl echo "## ami id: $(ec2metadata --ami-id)" +exl echo "## availability zone: $(ec2metadata --availability-zone)" +exl echo "## security groups: $(ec2metadata --security-groups)" +exl echo "## log bucket: $LOGBUCKET" +exl echo "## shutdown min: $SHUTDOWN_MIN" +exl echo +exl echo "## Starting..." +exl date + + +### sshd configure for password recognition +exl echo +exl echo "## Configuring and starting ssh" +if [ ! -z $PASSWORD ]; then + echo -ne "$PASSWORD\n$PASSWORD\n" | sudo passwd ubuntu + sed 's/PasswordAuthentication no/PasswordAuthentication yes/g' /etc/ssh/sshd_config | sed 's/#PasswordAuthentication no/PasswordAuthentication yes/g' > tmpp + mv tmpp /etc/ssh/sshd_config + exl service ssh restart +fi + + +### mount the EBS volume to the EBS_DIR (This has changed and only works with the new ubuntu 20.04 AMI) +exl echo +exl echo "## Mounting EBS" +exl lsblk $TMPLOGFILE +exl export ROOT_EBS=$(lsblk -o PKNAME | tail +2 | awk '$1!=""') +exl export EBS_DEVICE=/dev/$(lsblk -o TYPE,KNAME | tail +2 | grep disk | grep -v $ROOT_EBS | cut -f2 -d' ') +exl mkfs -t ext4 $EBS_DEVICE # creating a file system +exl mkdir /mnt/$EBS_DIR +exl mount $EBS_DEVICE /mnt/$EBS_DIR # mount +exl ln -s /mnt/$EBS_DIR $EBS_DIR +exl chown -R ubuntu $EBS_DIR +exl chmod -R +x $EBS_DIR +exl echo "Mounting finished." 
+exl echo "Data EBS file system: $EBS_DEVICE" + + +### create local outdir under the mounted ebs directory and move log file into that output directory +exl mkdir -p $LOCAL_OUTDIR +mv $LOGFILE1 $LOGFILE2 +export LOGFILE=$LOGFILE2 + + +# set up cronjojb for cloudwatch metrics for memory, disk space and CPU utilization +exl echo +exl echo "## Turning on cloudwatch metrics for memory and disk space" +cwd0=$(pwd) +cd ~ +apt install -y unzip libwww-perl libdatetime-perl +curl https://aws-cloudwatch.s3.amazonaws.com/downloads/CloudWatchMonitoringScripts-1.2.2.zip -O +unzip CloudWatchMonitoringScripts-1.2.2.zip && rm CloudWatchMonitoringScripts-1.2.2.zip && cd aws-scripts-mon +echo "*/1 * * * * ~/aws-scripts-mon/mon-put-instance-data.pl --mem-util --mem-used --mem-avail --disk-space-util --disk-space-used --disk-path=/data1/ --from-cron" > cloudwatch.jobs +echo "*/1 * * * * ~/aws-scripts-mon/mon-put-instance-data.pl --disk-space-util --disk-space-used --disk-path=/ --from-cron" >> cloudwatch.jobs +cat cloudwatch.jobs | crontab - +cd $cwd0 + +# set additional profile +if [ -z $REGION ]; then + export REGION=$INSTANCE_REGION +fi +if [ ! -z $ACCESS_KEY -a ! -z $SECRET_KEY -a ! -z $REGION ]; then + echo -ne "$ACCESS_KEY\n$SECRET_KEY\n$REGION\njson" | aws configure --profile user1 +fi + +# send log before starting docker +exl echo +exl echo "## Running dockerized awsf scripts" +send_log + +# run dockerized awsf scripts +docker run --privileged --net host -v /home/ubuntu/:/home/ubuntu/:rw -v /mnt/:/mnt/:rw $AWSF_IMAGE run.sh -i $JOBID -l $LOGBUCKET -f $EBS_DEVICE -S $STATUS $SINGULARITY_OPTION_TO_PASS +handle_error $? + +### self-terminate +# (option 1) ## This is the easiest if the 'shutdown behavior' set to 'terminate' for the instance at launch. +shutdown -h $SHUTDOWN_MIN +# (option 2) ## This works only if the instance is given a proper permission (This is more standard but I never actually got it to work) +#id=$(ec2-metadata -i|cut -d' ' -f2) +#aws ec2 terminate-instances --instance-ids $id diff --git a/awsf3/log.py b/awsf3/log.py new file mode 100644 index 000000000..2da2f80be --- /dev/null +++ b/awsf3/log.py @@ -0,0 +1,30 @@ +def read_logfile_by_line(logfile): + """generator function that yields the log file content line by line""" + with open(logfile, 'r') as f: + for line in f: + yield line + yield None + + +def parse_commands(log_content): + """ + parse cwl commands from the line-by-line generator of log file content and + returns the commands as a list of command line lists, each corresponding to a step run. 
+ """ + command_list = [] + command = [] + in_command = False + line = next(log_content) + while(line): + line = line.strip('\n') + if '[job' in line and line.endswith('docker \\'): + line = 'docker \\' # remove the other stuff + in_command = True + if in_command: + command.append(line.strip('\\').rstrip(' ')) + if not line.endswith('\\'): + in_command = False + command_list.append(command) + command = [] + line = next(log_content) + return(command_list) diff --git a/awsf3/target.py b/awsf3/target.py new file mode 100755 index 000000000..1990afd62 --- /dev/null +++ b/awsf3/target.py @@ -0,0 +1,239 @@ +import re +import os +import boto3 +import copy +from zipfile import ZipFile +from io import BytesIO +import mimetypes + + +class Target(object): + """Class handling output_target and secondary_output_target""" + + # source_directory = '/data1/out/' + + def __init__(self, output_bucket): + self.source = '' + self.bucket = output_bucket + self.dest = '' + self.unzip = False + self.s3 = None # boto3 client + + @property + def source_name(self): + return re.sub('^/data1/((shell|out)/)*', '', self.source) + # return self.source.replace(self.source_directory, '') + + @property + def is_valid(self): + if self.source and self.dest and self.bucket: + return True + else: + return False + + def is_custom_target(self, target_key): + if target_key.startswith('file://'): + return True + else: + return False + + def parse_custom_target(self, target_key, target_value): + """takes a key-value pair from output_target, parses the content. + This function only handles custom cases where the key starts with file:// + (not valid CWL/WDL targets)""" + if self.is_custom_target(target_key): + self.source = target_key.replace('file://', '') + if not target_value: + raise Exception("output_target missing for target %s" % target_key) + self.parse_target_value(target_value) + + def parse_cwl_target(self, target_key, target_value, prj_output_files): + """takes a key-value pair from output_target, parses the content. + prj_output_files is a dictionary that contains {: """ + if not self.is_custom_target(target_key): + self.source = prj_output_files[target_key].path + if target_value: + self.parse_target_value(target_value) + else: + self.dest = self.source_name # do not change file name + + def parse_target_value(self, target_value): + """target value can be a dictionary with following keys: object_key, bucket_name, object_prefix, unzip. + or it can be a string that refers to the object_key or in the format of s3:///. 
+ This function changes attributes bucket, dest, unzip.""" + if isinstance(target_value, dict): + if 'unzip' in target_value and target_value['unzip'] is True: + if 'object_prefix' not in target_value: + raise Exception("object_prefix must be given with unzip=True") + self.unzip = True + if 'bucket_name' in target_value: # this allows using different output buckets + self.bucket = target_value['bucket_name'] + if 'object_prefix' in target_value: + if 'object_key' in target_value: + raise Exception("Specify either object_key or object_prefix, but not both in output_target") + if not target_value['object_prefix'].endswith('/'): + target_value['object_prefix'] += '/' + self.dest = target_value['object_prefix'] + if 'object_key' in target_value: + if target_value['object_key'].endswith('/'): + raise Exception("object_key cannot end with '/' - please use object_prefix instead") + self.dest = target_value['object_key'] + elif isinstance(target_value, str): + if target_value.startswith('s3://'): # this allows using different output buckets + output_path = re.sub('^s3://', '', target_value) + self.bucket = output_path.split('/')[0] + self.dest = re.sub('^' + self.bucket + '/', '', output_path) + else: + self.dest = target_value # change file name to what's specified in output_target + + def as_dict(self): + d = copy.deepcopy(self.__dict__) + for attr in self.exclude_from_dict: + del d[attr] + return d + + @property + def exclude_from_dict(self): + return ['s3'] + + def unzip_source(self): + if not self.unzip: + raise Exception("Unzip error: unzip=True is not set") + with open(self.source, 'rb') as zf: + body = zf.read() + z = ZipFile(BytesIO(body)) + for content_file_name in z.namelist(): + if content_file_name.endswith('/'): # only copy files + continue + yield {'name': content_file_name, 'content': z.open(content_file_name).read()} + yield None + + def upload_to_s3(self): + """upload target to s3, source can be either a file or a directory.""" + if not self.is_valid: + raise Exception('Upload Error: source / dest must be specified first') + if not self.s3: + self.s3 = boto3.client('s3') + err_msg = "failed to upload output file %s to %s. %s" + if os.path.isdir(self.source): + print("source " + self.source + " is a directory") + print("uploading output directory %s to %s in bucket %s" % (self.source, self.dest, self.bucket)) + if self.unzip: + print("Warning: unzip option is ignored because the source is a directory.") + source = self.source.rstrip('/') + for root, dirs, files in os.walk(source): + for f in files: + source_f = os.path.join(root, f) + if root == source: + dest_f = os.path.join(self.dest, f) + else: + dest_subdir = re.sub('^' + source + '/', '', root) + dest_f = os.path.join(self.dest, dest_subdir, f) + print("source_f=" + source_f) + print("dest_f=" + dest_f) + try: + self.s3.upload_file(source_f, self.bucket, dest_f) + except Exception as e: + raise Exception(err_msg % (source_f, self.bucket + '/' + dest_f, str(e))) + elif self.unzip: + # unzip the content files to S3 + try: + zip_content = self.unzip_source() + except: + print("Unzipping failed: source " + self.source + " may not be a zip file") + print("source " + self.source + " is a zip file. 
Unzipping..") + arcfile = next(zip_content) + while(arcfile): + # decide on content type + content_type = mimetypes.guess_type(arcfile['name'])[0] + if not content_type: + content_type = 'binary/octet-stream' + # upload to S3 + put_object_args = {'Bucket': self.bucket, + 'Key': self.dest + arcfile['name'], + 'Body': arcfile['content'], + 'ContentType': content_type} + try: + print("Putting object %s to %s in bucket %s" % (arcfile['name'], self.dest + arcfile['name'], self.bucket)) + self.s3.put_object(**put_object_args) + except Exception as e: + raise Exception("failed to put unzipped content %s for file %s. %s" % (arcfile['name'], self.source, str(e))) + arcfile = next(zip_content) + else: + print("source " + self.source + " is an ordinary file.") + if self.dest.endswith('/'): + # self.dest is a prefix + dest = os.path.join(self.dest, self.source_name) + print("uploading output source %s to %s in bucket %s" % (self.source, dest, self.bucket)) + try: + self.s3.upload_file(self.source, self.bucket, dest) + except Exception as e: + raise Exception(err_msg % (self.source, self.bucket + '/' + dest, str(e))) + else: + try: + print("uploading output source %s to %s in bucket %s" % (self.source, self.dest, self.bucket)) + self.s3.upload_file(self.source, self.bucket, self.dest) + except Exception as e: + raise Exception(err_msg % (self.source, self.bucket + '/' + self.dest, str(e))) + + +class SecondaryTarget(Target): + def is_matched(self, source_path): + if not self.dest: + raise Exception("first calculate dest (destination) to check matching.") + # check the last three letters between dest and source_path + if self.dest[-3:] == source_path[-3:]: + return True + else: + return False + + def parse_custom_target(self, target_key, target_value): + raise Exception("Function disabled") + + def parse_cwl_target(self, target_key, target_value, prj_output_files): + raise Exception("Function disabled") + + +class SecondaryTargetList(object): + def __init__(self, output_bucket): + self.n = 0 # size of the list (i.e. number of secondary targets) + self.secondary_targets = [] # list of SecondaryTarget objects + self.bucket = output_bucket + + def parse_target_values(self, target_values): + self.n = len(target_values) # size of the list (i.e. number of secondary targets) + self.secondary_targets = [SecondaryTarget(self.bucket) for i in range(self.n)] + for st, tv in zip(self.secondary_targets, target_values): + st.parse_target_value(tv) + + def reorder_by_source(self, source_paths): + if len(source_paths) < self.n: + raise Exception("Not enough source_paths for secondary targets " + + "(%d vs %d)" % (len(source_paths), self.n)) + n_assigned = 0 + reordered_secondary_targets = [] + for sp in source_paths: + matched = False + for st in self.secondary_targets: + if st.is_matched(sp): + st.source = sp + reordered_secondary_targets.append(st) + n_assigned += 1 + matched = True + break + if not matched: + # if no matching target is defined, use the source name + additional_st = SecondaryTarget(self.bucket) + additional_st.source = sp + additional_st.dest = additional_st.source_name + reordered_secondary_targets.append(additional_st) + n_assigned += 1 + self.n += 1 + + if n_assigned != self.n: + raise Exception("Error: Not all secondary output targets are being uploaded!" 
+ + "{} vs {}".format(n_assigned, self.n)) + self.secondary_targets = reordered_secondary_targets + + def as_dict(self): + return [st.as_dict() for st in self.secondary_targets] diff --git a/awsf3/utils.py b/awsf3/utils.py new file mode 100644 index 000000000..d2ef71ffa --- /dev/null +++ b/awsf3/utils.py @@ -0,0 +1,453 @@ +import json +import os +import subprocess +import boto3 +import re +import time +from tibanna.awsem import ( + AwsemRunJson, + AwsemPostRunJson, + AwsemPostRunJsonOutput +) +from tibanna.nnested_array import ( + run_on_nested_arrays2, + flatten, + create_dim +) +from .target import Target, SecondaryTargetList +from . import log + + +downloadlist_filename = "download_command_list.txt" +mountlist_filename = "mount_command_list.txt" +input_yml_filename = "inputs.yml" +env_filename = "env_command_list.txt" +INPUT_DIR = "/data1/input" # data are downloaded to this directory +INPUT_MOUNT_DIR_PREFIX = "/data1/input-mounted-" # data are mounted to this directory + bucket name + + +def decode_run_json(input_json_file): + """reads a run json file and creates three text files: + download command list file (commands to download input files from s3) + input yml file (for cwl/wdl/snakemake run) + env list file (environment variables to be sourced) + """ + # read json file + with open(input_json_file, 'r') as f: + runjson = AwsemRunJson(**json.load(f)) + runjson_input = runjson.Job.Input + language = runjson.Job.App.language + + # create a download command list file from the information in json + create_download_command_list(downloadlist_filename, runjson_input) + + # create a bucket-mounting command list file + create_mount_command_list(mountlist_filename, runjson_input) + + # create an input yml file to be used on awsem + if language in ['wdl', 'wdl_v1', 'wdl_draft2']: # wdl + create_input_for_wdl(input_yml_filename, runjson_input) + elif language == 'snakemake': # snakemake + create_input_for_snakemake(input_yml_filename, runjson_input) + else: # cwl + create_input_for_cwl(input_yml_filename, runjson_input) + + # create a file that defines environmental variables + create_env_def_file(env_filename, runjson, language) + + +def create_mount_command_list(mountlist_filename, runjson_input): + buckets_to_be_mounted = set() + for category in ["Input_files_data", "Secondary_files_data"]: + for inkey, v in getattr(runjson_input, category).items(): + if v.mount: + buckets_to_be_mounted.add(v.dir_) + with open(mountlist_filename, 'w') as f: + for b in sorted(buckets_to_be_mounted): + f.write("mkdir -p %s\n" % (INPUT_MOUNT_DIR_PREFIX + b)) + f.write("goofys-latest -f %s %s &\n" % (b, INPUT_MOUNT_DIR_PREFIX + b)) + + +def create_download_command_list(downloadlist_filename, runjson_input): + """create a download command list file from the information in json""" + with open(downloadlist_filename, 'w') as f: + for category in ["Input_files_data", "Secondary_files_data"]: + for inkey, v in getattr(runjson_input, category).items(): + if v.mount: # do not download if it will be mounted + continue + if inkey.startswith('file://'): + target = inkey.replace('file://', '') + print("key %s will be downloaded to target %s" % (v.path, inkey)) + run_on_nested_arrays2(v.path, target, add_download_cmd, data_bucket=v.dir_, + profile=v.profile, f=f, unzip=v.unzip) + else: + target_template = INPUT_DIR + "/%s" + if not v.rename or len(flatten(v.rename)) == 0: + rename = create_dim(v.path, empty=True) + else: + rename = v.rename + run_on_nested_arrays2(v.path, rename, add_download_cmd, data_bucket=v.dir_, + 
profile=v.profile, f=f, unzip=v.unzip, target_template=target_template)
+
+
+def add_download_cmd(data_file, rename, data_bucket, profile, f, unzip, target_template='%s'):
+    if data_file:
+        if not rename:
+            rename = data_file
+        target = target_template % rename
+        cmd = create_download_cmd(data_bucket, data_file, target, profile, unzip)
+        f.write(cmd + '\n')
+
+
+def determine_key_type(bucket, key, profile):
+    """Return values : 'File', 'Folder' or 'Does not exist'"""
+    if profile:
+        s3 = boto3.Session(profile_name=profile).client('s3')
+    else:
+        s3 = boto3.client('s3')
+    if not key:
+        raise Exception("Cannot determine key type - no key is specified")
+    if not bucket:
+        raise Exception("Cannot determine key type - no bucket is specified")
+    if key.endswith('/'):
+        key = key.rstrip('/')
+    res = s3.list_objects_v2(Bucket=bucket, Prefix=key + '/')
+    if 'KeyCount' not in res:
+        raise Exception("Cannot determine key type - no response from S3")
+    if res['KeyCount'] == 0:
+        res2 = s3.list_objects_v2(Bucket=bucket, Prefix=key)
+        if 'KeyCount' not in res2:
+            raise Exception("Cannot determine key type - no response from S3")
+        elif res2['KeyCount'] == 0:
+            return 'Does not exist'  # key does not exist
+        # The file itself may be a prefix of another file (e.g. abc.vcf.gz vs abc.vcf.gz.tbi)
+        # but it doesn't matter.
+        else:
+            return 'File'
+    else:
+        # data_file is a folder
+        return 'Folder'
+
+
+def create_download_cmd(data_bucket, data_file, target, profile, unzip=''):
+    profile_flag = ' --profile ' + profile if profile else ''
+    format_list = [data_bucket, data_file, target, profile_flag]
+    key_type = determine_key_type(data_bucket, data_file, profile)
+    if key_type == 'Does not exist':
+        raise Exception("Cannot download file s3://%s/%s - file does not exist." % (data_bucket, data_file))
+    elif key_type == 'File':
+        download_cmd = 'aws s3 cp s3://{0}/{1} {2}{3}'.format(*format_list)
+        if unzip == 'gz':
+            unzip_cmd = 'gunzip {2}'
+        elif unzip == 'bz2':
+            unzip_cmd = 'bzip2 -d {2}'
+        else:
+            unzip_cmd = ''
+        cmd = download_cmd + '; ' + unzip_cmd
+        return cmd.format(*format_list)
+    else:  # key_type == 'Folder':
+        download_cmd = 'aws s3 cp --recursive s3://{0}/{1} {2}{3}'.format(*format_list)
+        if unzip == 'gz':
+            unzip_cmd = 'for f in `find {2} -type f`; do if [[ $f =~ \\.gz$ ]]; then gunzip $f; fi; done;'
+        elif unzip == 'bz2':
+            unzip_cmd = 'for f in `find {2} -type f`; do if [[ $f =~ \\.bz2$ ]]; then bzip2 -d $f; fi; done;'
+        else:
+            unzip_cmd = ''
+        cmd = download_cmd + '; ' + unzip_cmd
+        return cmd.format(*format_list)
+
+
+# create an input yml file for cwl-runner
+def create_input_for_cwl(input_yml_filename, runjson_input):
+    yml = runjson_input.as_dict_as_cwl_input(INPUT_DIR, INPUT_MOUNT_DIR_PREFIX)
+    with open(input_yml_filename, 'w') as f_yml:
+        json.dump(yml, f_yml, indent=4, sort_keys=True)
+
+
+def create_input_for_wdl(input_yml_filename, runjson_input):
+    yml = runjson_input.as_dict_as_wdl_input(INPUT_DIR, INPUT_MOUNT_DIR_PREFIX)
+    with open(input_yml_filename, 'w') as f_yml:
+        json.dump(yml, f_yml, indent=4, sort_keys=True)
+
+
+def create_input_for_snakemake(input_yml_filename, runjson_input):
+    pass  # for now assume no input yml
+
+
+# create a file that defines environmental variables
+def create_env_def_file(env_filename, runjson, language):
+    # I have to use these variables after this script finishes running.
+    # I didn't use os.environ + os.system('bash') because that would remove the other
+    # env variables set before this script started running.
+ app = runjson.Job.App + with open(env_filename, 'w') as f_env: + f_env.write("export LANGUAGE={}\n".format(app.language)) + if language in ['wdl', 'wdl_v1', 'wdl_draft2']: + f_env.write("export WDL_URL={}\n".format(app.wdl_url)) + f_env.write("export MAIN_WDL={}\n".format(app.main_wdl)) + f_env.write("export WDL_FILES=\"{}\"\n".format(' '.join(app.other_wdl_files.split(',')))) + elif language == 'snakemake': + f_env.write("export SNAKEMAKE_URL={}\n".format(app.snakemake_url)) + f_env.write("export MAIN_SNAKEMAKE={}\n".format(app.main_snakemake)) + f_env.write("export SNAKEMAKE_FILES=\"{}\"\n".format(' '.join(app.other_snakemake_files.split(',')))) + f_env.write("export COMMAND=\"{}\"\n".format(app.command.replace("\"", "\\\""))) + f_env.write("export CONTAINER_IMAGE={}\n".format(app.container_image)) + elif language == 'shell': + f_env.write("export COMMAND=\"{}\"\n".format(app.command.replace("\"", "\\\""))) + f_env.write("export CONTAINER_IMAGE={}\n".format(app.container_image)) + else: # cwl + f_env.write("export CWL_URL={}\n".format(app.cwl_url)) + f_env.write("export MAIN_CWL={}\n".format(app.main_cwl)) + f_env.write("export CWL_FILES=\"{}\"\n".format(' '.join(app.other_cwl_files.split(',')))) + # other env variables + env_preserv_str = '' + docker_env_str = '' + if runjson.Job.Input.Env: + for ev, val in sorted(runjson.Job.Input.Env.items()): + f_env.write("export {}={}\n".format(ev, val)) + env_preserv_str = env_preserv_str + "--preserve-environment " + ev + " " + docker_env_str = docker_env_str + "-e " + ev + " " + f_env.write("export PRESERVED_ENV_OPTION=\"{}\"\n".format(env_preserv_str)) + f_env.write("export DOCKER_ENV_OPTION=\"{}\"\n".format(docker_env_str)) + + +def download_workflow(): + language = os.environ.get('LANGUAGE') + if language == 'shell': + return + local_wfdir = os.environ.get('LOCAL_WFDIR') + subprocess.call(['mkdir', '-p', local_wfdir]) + + if language in ['wdl', 'wdl_v1', 'wdl_draft2']: + main_wf = os.environ.get('MAIN_WDL', '') + wf_files = os.environ.get('WDL_FILES', '') + wf_url = os.environ.get('WDL_URL') + elif language == 'snakemake': + main_wf = os.environ.get('MAIN_SNAKEMAKE', '') + wf_files = os.environ.get('SNAKEMAKE_FILES', '') + wf_url = os.environ.get('SNAKEMAKE_URL') + else: + main_wf = os.environ.get('MAIN_CWL', '') + wf_files = os.environ.get('CWL_FILES', '') + wf_url = os.environ.get('CWL_URL') + # turn into a list + if not wf_files: + wf_files = [] + elif ' ' in wf_files: + wf_files = wf_files.split(' ') + else: + wf_files = [wf_files] + wf_files.append(main_wf) + wf_url = wf_url.rstrip('/') + + print("main workflow file: %s" % main_wf) + print("workflow files: " + str(wf_files)) + + s3 = boto3.client('s3') + for wf_file in wf_files: + target = "%s/%s" % (local_wfdir, wf_file) + source = "%s/%s" % (wf_url, wf_file) + if wf_url.startswith('http'): + subprocess.call(["wget", "-O" + target, source]) + elif wf_url.startswith('s3'): + wf_loc = wf_url.replace('s3://', '') + bucket_name = wf_loc.split('/')[0] + if len(wf_loc.split('/')) > 1: + subdirectory = wf_loc.replace(bucket_name + '/', '') + key = subdirectory + '/' + wf_file + else: + key = wf_file + print("downloading key %s from bucket %s to target %s" % (key, bucket_name, target)) + if '/' in target: + targetdir = re.sub('[^/]+$', '', target) + subprocess.call(["mkdir", "-p", targetdir]) + s3.download_file(Bucket=bucket_name, Key=key, Filename=target) + + +def read_md5file(md5file): + with open(md5file, 'r') as md5_f: + md5dict = dict() + for line in md5_f: + a = line.split() + path = 
a[1] + md5sum = a[0] + md5dict[path] = md5sum + return md5dict + + +def create_output_files_dict(language='cwl', execution_metadata=None, md5dict=None): + """create a dictionary that contains 'path', 'secondaryFiles', 'md5sum' with argnames as keys. + For snakemake and shell, returns an empty dictionary (execution_metadata not required). + secondaryFiles is added only if the language is cwl. + execution_metadata is a dictionary read from wdl/cwl execution log json file. + md5dict is a dictionary with key=file path, value=md5sum (optional).""" + if language in ['cwl', 'cwl_v1', 'wdl'] and not execution_metadata: + raise Exception("execution_metadata is required for cwl/wdl.") + out_meta = dict() + if language in ['wdl', 'wdl_v1', 'wdl_draft2']: + for argname, outfile in execution_metadata['outputs'].items(): + if outfile: + out_meta[argname] = {'path': outfile} + elif language == 'snakemake' or language == 'shell': + out_meta = {} + else: # cwl, cwl_v1 + # read cwl output json file + out_meta = execution_metadata + + # add md5 + if not md5dict: + md5dict = {} + for of, ofv in out_meta.items(): + if ofv['path'] in md5dict: + ofv['md5sum'] = md5dict[ofv['path']] + if 'secondaryFiles' in ofv: + for sf in ofv['secondaryFiles']: + if sf['path'] in md5dict: + sf['md5sum'] = md5dict[sf['path']] + + return out_meta + + +def read_postrun_json(jsonfile): + # read old json file + with open(jsonfile, 'r') as json_f: + prj = AwsemPostRunJson(**json.load(json_f)) + return prj + + +def format_postrun_json(prj): + return json.dumps(prj.as_dict(), indent=4, sort_keys=True) + + +def write_postrun_json(jsonfile, prj): + with open(jsonfile, 'w') as f: + f.write(format_postrun_json(prj)) + + +def update_postrun_json_init(json_old, json_new): + """Update postrun json with just instance ID and filesystem""" + # read old json file + prj = read_postrun_json(json_old) + + # simply add instance ID and file system + prj.Job.instance_id = os.getenv('INSTANCE_ID') + prj.Job.filesystem = os.getenv('EBS_DEVICE') + + # write to new json file + write_postrun_json(json_new, prj) + + +def update_postrun_json_upload_output(json_old, execution_metadata_file, md5file, json_new, language='cwl_v1'): + """Update postrun json with output files""" + # read old json file and prepare postrunjson skeleton + prj = read_postrun_json(json_old) + + # read md5 file + md5dict = read_md5file(md5file) + + # read execution metadata file + if execution_metadata_file: + with open(execution_metadata_file, 'r') as f: + execution_metadata = json.load(f) + else: + execution_metadata = None + output_files = create_output_files_dict(language, execution_metadata, md5dict) + + # create output files for postrun json + prj.Job.Output.add_output_files(output_files) + + # upload output to S3 (this also updates postrun json) + upload_output(prj) + + # write to new json file + write_postrun_json(json_new, prj) + + +def upload_output(prj): + # parsing output_target and uploading output files to output target + upload_to_output_target(prj.Job.Output) + + +def upload_to_output_target(prj_out): + # parsing output_target and uploading output files to output target + output_bucket = prj_out.output_bucket_directory + output_argnames = prj_out.output_files.keys() + output_target = prj_out.alt_output_target(output_argnames) + + for k in output_target: + target = Target(output_bucket) + + # 'file://' output targets + if target.is_custom_target(k): + print("processing custom (path-based) target %s" % k) + target.parse_custom_target(k, output_target[k]) + if 
target.is_valid:
+                print("Target is valid. Uploading..")
+                target.upload_to_s3()
+            else:
+                raise Exception("Invalid target %s -> %s: failed to upload" % (k, output_target[k]))
+        else:
+            # legitimate CWL/WDL output targets
+            print("processing argument-based target %s" % k)
+            target.parse_cwl_target(k, output_target.get(k, ''), prj_out.output_files)
+            if target.is_valid:
+                print("Target is valid. Uploading..")
+                target.upload_to_s3()
+                prj_out.output_files[k].add_target(target.dest)
+
+                # upload secondary files
+                secondary_output_files = prj_out.output_files[k].secondaryFiles
+                if secondary_output_files:
+                    stlist = SecondaryTargetList(output_bucket)
+                    stlist.parse_target_values(prj_out.secondary_output_target.get(k, []))
+                    stlist.reorder_by_source([sf.path for sf in secondary_output_files])
+                    for st in stlist.secondary_targets:
+                        st.upload_to_s3()
+                    for i, sf in enumerate(secondary_output_files):
+                        sf.add_target(stlist.secondary_targets[i].dest)
+            else:
+                raise Exception("Failed to upload to output target %s" % k)
+
+
+def save_total_sizes():
+    os.environ['INPUTSIZE'] = subprocess.getoutput('du -csh /data1/input| tail -1 | cut -f1')
+    os.environ['TEMPSIZE'] = subprocess.getoutput('du -csh /data1/tmp*| tail -1 | cut -f1')
+    os.environ['OUTPUTSIZE'] = subprocess.getoutput('du -csh /data1/out| tail -1 | cut -f1')
+
+
+def update_postrun_json_final(json_old, json_new, logfile=None):
+    """Update postrun json with status, time stamps, parsed commands,
+    input/tmp/output sizes"""
+    prj = read_postrun_json(json_old)
+
+    postrun_json_final(prj, logfile=logfile)
+
+    # write to new json file
+    write_postrun_json(json_new, prj)
+
+
+def postrun_json_final(prj, logfile=None):
+    # add commands
+    if logfile:
+        print("parsing commands from log file...")
+        log_content = log.read_logfile_by_line(logfile)
+        prj.update(commands=log.parse_commands(log_content))
+    # add end time, status, input/tmp/output sizes
+    prj_job = prj.Job
+    prj_job.update(end_time=time.strftime("%Y%m%d-%H:%M:%S-%Z"))
+    prj_job.update(status=os.getenv('JOB_STATUS'))
+    prj_job.update(total_input_size=os.getenv('INPUTSIZE'))
+    prj_job.update(total_tmp_size=os.getenv('TEMPSIZE'))
+    prj_job.update(total_output_size=os.getenv('OUTPUTSIZE'))
+
+
+def upload_postrun_json(jsonfile):
+    prj = read_postrun_json(jsonfile)
+    bucket = prj.Job.Log.log_bucket_directory
+    dest = prj.Job.JOBID + '.postrun.json'
+    if prj.config.public_postrun_json:
+        acl = 'public-read'
+    else:
+        acl = 'private'
+    s3 = boto3.client('s3')
+    s3.put_object(ACL=acl, Body=format_postrun_json(prj).encode('utf-8'), Bucket=bucket, Key=dest)
diff --git a/cromwell/LICENSE.txt b/cromwell/LICENSE.txt old mode 100644 new mode 100755 diff --git a/cromwell/README b/cromwell/README old mode 100644 new mode 100755 diff --git a/docs/4dn_pipelines.rst b/docs/4dn_pipelines.rst old mode 100644 new mode 100755 diff --git a/docs/Makefile b/docs/Makefile old mode 100644 new mode 100755 diff --git a/docs/ami.rst b/docs/ami.rst old mode 100644 new mode 100755 diff --git a/docs/api.rst b/docs/api.rst old mode 100644 new mode 100755 index a15ecaefd..017727a34 --- a/docs/api.rst +++ b/docs/api.rst @@ -21,11 +21,9 @@ Example API().run_workflow(input_json='myrun.json') # json file or dictionary object -Basic_commands -++++++++++++++ +Admin only commands ++++++++++++++++++++ -Admin only -########## The following commands require admin previlege to one's AWS account. @@ -152,8 +150,48 @@ To remove Tibanna components on AWS. verbose= Verbose if True.
(default False) -Non-admin -######### + +setup_tibanna_env +----------------- + +- Advanced user only + +To set up environment on AWS without deploying tibanna, use `tibanna setup_tibanna_env`. + + +:: + + API().setup_tibanna_env(...) + + +**Options** + +:: + + usergroup_tag= an identifier for a usergroup that shares + a tibanna permission + + no_randomize If set True, Tibanna does not add a random + number to generate a usergroup name (e.g. the + usergroup name used will be identical to the + one specified using the ``usergrou_tag`` option. + By default, a random number will be added at the + end (e.g. default_2721). Default False. + + buckets= A comma-delimited list of bucket names - the + buckets to which Tibanna needs access to + through IAM role (input, output, log). + + do_not_delete_public_access_block If set True, Tibanna does not delete public + access block from the specified buckets + (this way postrunjson and metrics reports will + not be public). Default False. + + + + +Non-admin commands +++++++++++++++++++ The following commands can be used by a non-admin user, as long as the user belongs to the right user group. @@ -183,6 +221,30 @@ To run workflow out (default 3) +run_batch_workflows +------------------- + +To run multiple workflows in a batch. This function does not open browser and job ids are +always automatically assigned. This function is available for Tibanna versions >= ``1.0.0``. + +:: + + API().run_batch_workflows(input_json_list=, ...) + + +**Options** + +:: + + sfn= An example step function name may be + 'tibanna_unicorn_defaut_3978'. If not specified, default + value is taken from environmental variable + TIBANNA_DEFAULT_STEP_FUNCTION_NAME. + sleep= Number of seconds between submission, to avoid drop- + out (default 3) + + + stat ---- @@ -208,6 +270,8 @@ To check status of workflows, n= print out only the first n lines + job_ids= filter by a list of job ids. This option is + available only for version >= ``1.0.0``. The output is a table (an example below) @@ -234,37 +298,36 @@ To check the log or postrun json (summary) of a workflow run :: - sfn= By default, TIBANNA_DEFAULT_STEP_FUNCTION_NAME (environmental variable). - Not necessary to rerun by ``exec-arn``. - Specify this to rerun by ``job-id`` instead of ``exec-arn`` on a non-default step function. - An example step function name may be 'tibanna_unicorn_defaut_3978'. - - postrunjson The postrunjson option streams out a postrun json file instead of a log file. + postrunjson= The postrunjson option streams out a postrun json file instead of a log file. A postrun json file is available only after the run finishes. It contains the summary of the job including input, output, EC2 config and Cloudwatch metrics on memory/CPU/disk space. + runjson= prints out run json instead, which is the json file tibanna sends to the instance + before the run starts. (new in ``1.0.0``) + + top= prints out top file (log file containing top command output) instead. This top file + contains all the top batch command output at a 1-minute interval. (new in ``1.0.0``) + + top_latest= prints out the latest content of the top file. This one contains only the latest + top command output (latest 1-minute interval). (new in ``1.0.0``) + rerun ----- -To rerun a failed job with the same input json +To rerun a failed job with the same input json on a specific step function. :: - API().rerun(exec_arn=|job_id=|exec_name=, ...) + API().rerun(exec_arn=|job_id=, sfn=, ...) 
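For illustration, a concrete rerun call might look like the following (a minimal sketch; the job id, step function name and instance type below are placeholder values, and ``instance_type`` is one of the override options listed next):

::

    API().rerun(job_id='jLeL6vMbhL63',
                sfn='tibanna_unicorn_defaut_3978',
                instance_type='t3.large')
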
**Options** :: - sfn= By default, TIBANNA_DEFAULT_STEP_FUNCTION_NAME (environmental variable). - Not necessary to rerun by ``exec-arn``. - Specify this to rerun by ``job-id`` instead of ``exec-arn`` on a non-default step function. - An example step function name may be 'tibanna_unicorn_defaut_3978'. - instance_type= Override instance type for the rerun shutdown_min= Override shutdown minutes for the rerun @@ -383,7 +446,6 @@ The following message is printed out https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default3537:fastqc_85ba7f41-daf5-4f82-946f-06d31d0cd293 JOBID jLeL6vMbhL63 submitted EXECUTION ARN = arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default3537:fastqc_85ba7f41-daf5-4f82-946f-06d31d0cd293 - Couldn't get a file descriptor referring to the console To kill this job, use the execution arn in the above message ('EXECUTION_ARN') (it can also be found on the Step Function Console) @@ -393,6 +455,12 @@ To kill this job, use the execution arn in the above message ('EXECUTION_ARN') ( API().kill(exec_arn='arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default3537:fastqc_85ba7f41-daf5-4f82-946f-06d31d0cd293') +or + +:: + + API().kill(job_id='jLeL6vMbhL63') + kill_all @@ -497,93 +565,3 @@ To retrieve the cost and update the metrics report file created with plot_metric update_tsv This flag specify to update with cost the tsv file that stores metrics information on the S3 bucket - - -Admin only -########## - -setup_tibanna_env ------------------ - -- Advanced user only - -To set up environment on AWS without deploying tibanna, use `tibanna setup_tibanna_env`. - - -:: - - API().setup_tibanna_env(...) - - -**Options** - -:: - - usergroup_tag= an identifier for a usergroup that shares - a tibanna permission - - no_randomize If set True, Tibanna does not add a random - number to generate a usergroup name (e.g. the - usergroup name used will be identical to the - one specified using the ``usergrou_tag`` option. - By default, a random number will be added at the - end (e.g. default_2721). Default False. - - buckets= A comma-delimited list of bucket names - the - buckets to which Tibanna needs access to - through IAM role (input, output, log). - - do_not_delete_public_access_block If set True, Tibanna does not delete public - access block from the specified buckets - (this way postrunjson and metrics reports will - not be public). Default False. - - -Additional commands for tibanna_4dn -+++++++++++++++++++++++++++++++++++ - - -``tibanna_4dn`` is a 4dn extension of ``tibanna``. All the subcommands of ``tibanna`` can also be used by ``tibanna_4dn``. In addition, ``tibanna_4dn`` supports additional 4dn-specific subcommands. - - -General Usage - -:: - - from tibanna_4dn.core import API - API().method(...) - - -In ``tibanna_4dn``, ``TIBANNA_DEFAULT_STEP_FUNCTION_NAME`` is set to `tibanna_pony` unless specified by the user. - - - -deploy_pony ------------ - - -This function deploys tibanna pony (4dn extension of tibanna). -You need the following environmental variables set on your local machine from which you're deploying a pony. 
- -:: - - export S3_ENCRYPT_KEY=<4dn_s3_encryption_key> - -To create an instance of tibanna (step function + lambdas) - -:: - - API().deploy_pony(suffix=, usergroup=) - # (use suffix for development version) - # example : dev - # : a AWS user group that share permission to tibanna and the associated buckets given by the setup_tibanna_env command. - - -example - -:: - - API().deploy_pony(suffix='dev2') - - -The above command will create a step function named tibanna_pony_dev2 that uses a set of lambdas with suffix _dev2, and deploys these lambdas. diff --git a/docs/commands.rst b/docs/commands.rst old mode 100644 new mode 100755 index 6a131f372..c42aeb482 --- a/docs/commands.rst +++ b/docs/commands.rst @@ -23,12 +23,8 @@ To check Tibanna version, tibanna -v - -Basic_commands -++++++++++++++ - -Admin only -########## +Admin only commands ++++++++++++++++++++ The following commands require admin previlege to one's AWS account. @@ -160,8 +156,46 @@ To remove Tibanna components on AWS. -Non-admin -######### + +setup_tibanna_env +----------------- + +- Advanced user only + +To set up environment on AWS without deploying tibanna, use `tibanna setup_tibanna_env`. + + +:: + + tibanna setup_tibanna_env + + +**Options** + +:: + + -g|--usergroup-tag= an identifier for a usergroup that shares + a tibanna permission + + -R|--no-randomize do not add a random number to generate a + usergroup name (e.g. the usergroup name used + will be identical to the one specified using + the ``--usergrou-tag`` option. By default, + a random number will be added at the end + (e.g. default_2721). + + -b|--buckets= A comma-delimited list of bucket names - the + buckets to which Tibanna needs access to + through IAM role (input, output, log). + + -P|--do-not-delete-public-access-block Do not delete public access block from buckets + (this way postrunjson and metrics reports will + not be public) + + + +Non-admin user commands ++++++++++++++++++++++++ The following commands can be used by a non-admin user, as long as the user belongs to the right user group. @@ -190,6 +224,29 @@ To run workflow out (default 3) +run_batch_workflows +------------------- + +To run multiple workflows in a batch. This command does not open browser and job ids are +always automatically assigned. This function is available for Tibanna versions >= ``1.0.0``. + +:: + + tibanna run_batch_workflows -i [] [...] [] + +**Options** + +:: + + -s|--sfn= An example step function name may be + 'tibanna_unicorn_defaut_3978'. If not specified, default + value is taken from environmental variable + TIBANNA_DEFAULT_STEP_FUNCTION_NAME. + -S SLEEP, --sleep SLEEP Number of seconds between submission, to avoid drop- + out (default 3) + + + stat ---- @@ -204,19 +261,23 @@ To check status of workflows, :: - -t|--status= filter by run status (all runs if not specified). - Status must be one of the following values: - RUNNING|SUCCEEDED|FAILED|TIMED_OUT|ABORTED + -t|--status= filter by run status (all runs if not specified). + Status must be one of the following values: + RUNNING|SUCCEEDED|FAILED|TIMED_OUT|ABORTED - -s|--sfn= An example step function name may be - 'tibanna_unicorn_defaut_3978'. If not specified, default - value is taken from environmental variable - TIBANNA_DEFAULT_STEP_FUNCTION_NAME. If the environmental - variable is not set, it uses name 'tibanna_pony' (4dn - default, works only for 4dn). + -s|--sfn= An example step function name may be + 'tibanna_unicorn_defaut_3978'. 
If not specified, default + value is taken from environmental variable + TIBANNA_DEFAULT_STEP_FUNCTION_NAME. If the environmental + variable is not set, it uses name 'tibanna_pony' (4dn + default, works only for 4dn). - -n|--nlines print out only the first n lines + -n|--nlines= print out only the first n lines + -j|--job-ids [] ... job ids of the specific jobs to display, separated by + space. This option cannot be combined with + --nlines(-n), --status(-t) or --sfn(-s). This option is + available only for version >= ``1.0.0``. The output is a table (an example below) @@ -229,6 +290,7 @@ The output is a table (an example below) UlkvH3gbBBA2 FAILED repliseq-parta 2018-08-09 18:26 2018-08-09 19:01 j7hvisheBV27 SUCCEEDED bwa-mem 2018-08-09 18:44 2018-08-09 18:59 + log --- @@ -238,42 +300,48 @@ To check the log or postrun json (summary) of a workflow run tibanna log --exec-arn=|--job-id= [] +or + +:: + + tibanna log --exec-name= --sfn= [] + **Options** :: - -s|--sfn= By default, TIBANNA_DEFAULT_STEP_FUNCTION_NAME (environmental variable). - Not necessary to rerun by ``exec-arn``. - Specify this to rerun by ``job-id`` instead of ``exec-arn`` on a non-default step function. - An example step function name may be 'tibanna_unicorn_defaut_3978'. + -p|--postrunjson The -p option streams out a postrun json file instead of a log file. + A postrun json file is available only after the run finishes. + It contains the summary of the job including input, output, EC2 config and + Cloudwatch metrics on memory/CPU/disk space. + + -r|--runjson print out run json instead, which is the json file tibanna sends to the instance + before the run starts. (new in ``1.0.0``) - -p|--postrunjson The -p option streams out a postrun json file instead of a log file. - A postrun json file is available only after the run finishes. - It contains the summary of the job including input, output, EC2 config and - Cloudwatch metrics on memory/CPU/disk space. + -t|--top prints out top file (log file containing top command + output) instead. This top file contains all the top batch command output + at a 1-minute interval. (new in ``1.0.0``) + + -T|--top-latest prints out the latest content of the top file. This one contains only the latest + top command output (latest 1-minute interval). (new in ``1.0.0``) rerun ----- -To rerun a failed job with the same input json +To rerun a failed job with the same input json on a specific step function. :: - tibanna rerun --exec-arn=|--job-id=|--exec-name= [] + tibanna rerun --exec-arn=|--job-id= --sfn= [] **Options** :: - -s|--sfn= By default, TIBANNA_DEFAULT_STEP_FUNCTION_NAME (environmental variable). - Not necessary to rerun by ``exec-arn``. - Specify this to rerun by ``job-id`` instead of ``exec-arn`` on a non-default step function. - An example step function name may be 'tibanna_unicorn_defaut_3978'. - -i|--instance-type= Override instance type for the rerun -d|--shutdown-min= Override shutdown minutes for the rerun @@ -361,16 +429,10 @@ To kill a specific job through its execution arn or a jobid :: - tibanna kill --exec-arn= - -or - -:: - - tibanna kill --job-id= --sfn= + tibanna kill --exec-arn=|--job-id= -If ``jobid`` is specified but not ``stepfunctionname``, then by default it assumes ``TIBANNA_DEFAULT_STEP_FUNCTION_NAME``. If the job id is not found in the executions on the default or specified step function, then only the EC2 instance will be terminated and the step function status may still be RUNNING. 
+If the execution id or job id is not found in the current RUNNING executions (e.g. the execution has already been aborted), then only the EC2 instance will be terminated. @@ -393,7 +455,6 @@ The following message is printed out https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default3537:fastqc_85ba7f41-daf5-4f82-946f-06d31d0cd293 JOBID jLeL6vMbhL63 submitted EXECUTION ARN = arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default3537:fastqc_85ba7f41-daf5-4f82-946f-06d31d0cd293 - Couldn't get a file descriptor referring to the console To kill this job, use the execution arn in the above message ('EXECUTION_ARN') (it can also be found on the Step Function Console) @@ -403,6 +464,12 @@ To kill this job, use the execution arn in the above message ('EXECUTION_ARN') ( $ tibanna kill --exec-arn=arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default3537:fastqc_85ba7f41-daf5-4f82-946f-06d31d0cd293 +or + +:: + + $ tibanna kill --job-id jLeL6vMbhL63 + kill_all @@ -501,83 +568,3 @@ To retrieve the cost and update the metrics report file created with plot_metric information on the S3 bucket -Admin only -########## - -setup_tibanna_env ------------------ - -- Advanced user only - -To set up environment on AWS without deploying tibanna, use `tibanna setup_tibanna_env`. - - -:: - - tibanna setup_tibanna_env - - -**Options** - -:: - - -g|--usergroup-tag= an identifier for a usergroup that shares - a tibanna permission - - -R|--no-randomize do not add a random number to generate a - usergroup name (e.g. the usergroup name used - will be identical to the one specified using - the ``--usergrou-tag`` option. By default, - a random number will be added at the end - (e.g. default_2721). - - -b|--buckets= A comma-delimited list of bucket names - the - buckets to which Tibanna needs access to - through IAM role (input, output, log). - - -P|--do-not-delete-public-access-block Do not delete public access block from buckets - (this way postrunjson and metrics reports will - not be public) - - -Additional commands for tibanna_4dn -+++++++++++++++++++++++++++++++++++ - - -``tibanna_4dn`` is a 4dn extension of ``tibanna``. All the subcommands of ``tibanna`` can also be used by ``tibanna_4dn``. In addition, ``tibanna_4dn`` supports additional 4dn-specific subcommands. - - -:: - - tibanna_4dn - - -deploy_pony ------------ - - -This function deploys tibanna pony (4dn extension of tibanna). -You need the following environmental variables set on your local machine from which you're deploying a pony. - -:: - - export S3_ENCRYPT_KEY=<4dn_s3_encryption_key> - -To create an instance of tibanna (step function + lambdas) - -:: - - tibanna_4dn deploy_pony [--suffix=] [--usergroup=] - # (use suffix for development version) - # example : dev - # : a AWS user group that share permission to tibanna and the associated buckets given by the `tibanna setup_tibanna_env` command.. - - -example - -:: - - tibanna_4dn deploy_pony --suffix=dev2 - - -The above command will create a step function named tibanna_pony_dev2 that uses a set of lambdas with suffix _dev2, and deploys these lambdas. 
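To illustrate how the batch-related additions to the command documentation above fit together, here is a minimal sketch using the Python API (assuming the ``API`` class from ``tibanna.core``; the input json file names, step function name and job ids are placeholders):

::

    from tibanna.core import API

    api = API()
    # submit several runs in one batch; job ids are assigned automatically
    api.run_batch_workflows(input_json_list=['run1.json', 'run2.json', 'run3.json'],
                            sfn='tibanna_unicorn_defaut_3978')
    # later, check the status of only these specific jobs (version >= 1.0.0)
    api.stat(job_ids=['jLeL6vMbhL63', 'tvfZLFlt3PBz'])
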
diff --git a/docs/conf.py b/docs/conf.py old mode 100644 new mode 100755 diff --git a/docs/cwl.rst b/docs/cwl.rst old mode 100644 new mode 100755 index d61ea79eb..3b955f765 --- a/docs/cwl.rst +++ b/docs/cwl.rst @@ -2,6 +2,6 @@ Common Workflow Language (CWL) ============================== -Tibanna supports CWL version 1.0 (https://www.commonwl.org/) and draft-3 (https://www.commonwl.org/draft-3/Workflow.html) +Tibanna supports CWL version 1.0 (https://www.commonwl.org/). Starting with Tibanna version ``1.0.0``, CWL draft-3 is no longer supported. diff --git a/docs/execution_json.rst b/docs/execution_json.rst old mode 100644 new mode 100755 index caea58c65..6afb577a4 --- a/docs/execution_json.rst +++ b/docs/execution_json.rst @@ -40,9 +40,9 @@ Example job description for CWL } }, "config": { - "instance_type": "t2.micro", + "instance_type": "t3.micro", "ebs_size": 10, - "EBS_optimized": false, + "EBS_optimized": true, "log_bucket": "montys-log-bucket" } } @@ -82,7 +82,7 @@ CWL-specific - An array of all the other cwl files that are called by the main cwl file. If the main CWL file is of 'workflow' type, the other CWL files corresponding to steps or subworkflows should be listed here. :cwl_version: - - either ``v1`` or ``draft-3`` + - either ``v1`` or ``draft-3`` (starting with tibanna version ``1.0.0``, ``draft-3`` is no longer supported.) :singularity: - This option uses Singularity to run Docker images internally (slower). This option does NOT support native Singularity images, since CWL does not support native Singularity images. @@ -95,6 +95,7 @@ WDL-specific :language: - This field must be set to ``wdl`` to run a WDL pipeline. + - To run an old version (draft2) of WDL, set it to ``wdl_draft2``. This will direct Tibanna to specifically use an older version of Cromwell. Some draft2 WDLs may be supported by the later version of Cromwell. Use the ``wdl_draft2`` option only if the old WDL does not work with the later version of Cromwell. :wdl_directory_url: - @@ -145,7 +146,7 @@ Snakemake-specific :container_image: - This is a required field. - It is highly recommended to use the official Snakemake Docker image - (``quay.io/snakemake/snakemake``) + (``snakemake/snakemake``) :command: - This is a required field. @@ -200,7 +201,7 @@ Other pipeline-related fields - Version of the pipeline/app, for the user to keep in track. :language: - - 'cwl_v1', 'cwl_draft3' or 'wdl' + - 'cwl_v1', 'cwl_draft3' (tibanna < ``1.0.0`` only) or 'wdl' (='wdl_v1' for backward compatibility) or 'wdl_draft2' or 'wdl_v1' (tibanna >= ``1.0.0``) - For WDL, it is a required field. For CWL, the language field can be omitted. @@ -323,9 +324,6 @@ Output target specification - key can be a source file path (to be used inside container run environment) starting with ``file://`` instead of CWL/WDL argument name. - - It is highly recommended to stick to using only argument names for CWL/WDL for pipeline - reproducibility, since they are already clearly defined in CWL/WDL (especially for CWL). - - (e.g. :: @@ -334,6 +332,48 @@ Output target specification "file:///data1/out/some_random_output.txt": "output/some_random_output.txt" } + - It is highly recommended to stick to using only argument names for CWL/WDL for pipeline + reproducibility, since they are already clearly defined in CWL/WDL (especially for CWL). + + - Starting with version ``1.0.0``, a dictionary format is also accepted for individual target, with keys ``object_key`` ``bucket_name``, ``object_prefix`` and/or ``unzip``. 
For a regular file output, ``object_key`` and ``bucket_name`` can be used. The use of ``bucket_name`` here allows using a different output bucket for specific output files. For a directory, ``object_prefix`` can be used instead which will be used as if it is the directory name on S3. ``object_prefix`` may or may not have the trailing ``/``. ``unzip`` is boolean (either ``true`` or ``false``) and can be applied to a case when the output file is a ``zip`` file and you want the content to be extracted into a directory on an S3 bucket. + + - (e.g. + + :: + + { + "out_pairsam": { + "object_key": "output/renamed_pairsam_file" + } + } + + :: + + { + "out_pairsam": { + "object_key": "output/renamed_pairsam_file", + "bucket_name": "some_different_bucket" + } + } + + :: + + { + "some_output_as_dir": { + "object_prefix": "some_dir_output/", + "bucket_name": "some_different_bucket" + } + } + + :: + + { + "out_zip": { + "object_prefix": "zip_output/", + "unzip": true + } + } + + :secondary_output_target: - Similar to ``output_target`` but for secondary files. @@ -448,13 +488,13 @@ The ``config`` field describes execution configuration. :root_ebs_size: - - default 8 - - Tibanna uses two separate EBS volumes, one for docker image, another for data. Most of the times, the 8GB - root EBS that is used for docker images has enough space. However, if the docker image is larger than 5GB - or if multiple large docker images are used together, one may consider increasing root ebs size. Any directory - that is used inside a docker image (e.g. ``/tmp`` when running in the ``shell`` mode) that is not mounted - from the data EBS could also cause a ``no space left in device`` error on the root EBS volume. It is - recommended to use a directory under ``/data1`` as a temp directory when running in the ``shell`` mode, which - is mounted from data EBS. + - For versions < ``1.0.0``, Tibanna uses two separate EBS volumes, one for docker image, another for data. + Most of the times, the 8GB root EBS that is used for docker images has enough space. However, if the + docker image is larger than 5GB or if multiple large docker images are used together, one may consider + increasing root ebs size. Any directory that is used inside a docker image (e.g. ``/tmp`` when running + in the ``shell`` mode) that is not mounted from the data EBS could also cause a ``no space left in device`` + error on the root EBS volume. It is recommended to use a directory under ``/data1`` as a temp directory + when running in the ``shell`` mode, which is mounted from data EBS. - This field is supported in version ``0.9.0`` or higher. If an older version has been used, redeploy ``run_task_awsem`` to enable this feature, after installing ``0.9.0`` or higher, as below. @@ -462,6 +502,12 @@ The ``config`` field describes execution configuration. tibanna deploy_core -n run_task_awsem -g [-s ] + - For versions >= ``1.0.0``, this field is no longer needed (though still supported) since the docker image + also uses the data EBS and not the root EBS starting ``1.0.0``. This means for a large docker image, it is + recommended to increase ``ebs_size`` rather than ``root_ebs_size``. It takes effect only if ``run_task_awsem`` + is redeployed as above. For consistency, when you redeploy ``run_task_awsem`` from version < ``1.0.0`` to + version >= ``1.0.0``, it is also recommended to redeploy ``check_task_awsem`` with the same version. + :shutdown_min: - either number of minutes or string 'now' @@ -485,8 +531,8 @@ The ``config`` field describes execution configuration.
- optional (default: unset) :ebs_type: - - type of EBS (either ``gp2`` or ``io1``) - - optional (default: gp2) + - type of EBS (e.g. ``gp3``, ``gp2``, ``io1``) + - optional (default: gp3 (version >= ``1.0.0``) or gp2 (version < ``1.0.0``)) :cloudwatch_dashboard: - **This option is now depricated.** diff --git a/docs/how_it_works.rst b/docs/how_it_works.rst old mode 100644 new mode 100755 index 4d8e75341..0ea44b21a --- a/docs/how_it_works.rst +++ b/docs/how_it_works.rst @@ -4,15 +4,9 @@ How it works ============ -.. image:: images/tibanna_diagram_20180207.png - - - Tibanna launches and monitors pipeline runs using two-layer scheduling. The upstream regulator is based on a finite state machine called AWS Step Function and the downstream workflow engine is based on ``cwltool`` which runs Docker/CWL-based pipelines on an EC2 instance (or ``cromwell`` for Docker/WDL-based pipelines). Tibanna’s AWS Step Function launches several AWS Serverless Lambda functions that submits and monitors a pipeline execution on a pre-custom-configured autonomous virtual machine (EC2 instance) (AWSEM; Autonomous Workflow Step Executor Machine). The ``cwltool``/``cromwell` is auto-executed on an instance. -For 4DN, Tibanna's upstream step function consists of four steps instead of two, with one extra step before and one extra step after. These outer steps read and write metadata for 4DN portal for input files, output files, workflows and workflow runs. Some workflow runs (e.g. md5, fastqc) are auto-triggered upon data submission (file upload). - -Tibanna allows multi-layer, real-time monitoring. The logs of what's happening on an instance including the ``cwltool`` log is regularly sent to a designated S3 bucket (Tibanna log bucket). Logs generated by the AWS Lambda functions are sent to AWS CloudWatch, a service provided by AWS; AWS Step function sends logs either as an output json or as an exception. Users can ssh into the EC2 instance where a workflow is currently being executed, for more detailed investigation. +Tibanna allows multi-layer, real-time monitoring. The logs of what's happening on an instance including the ``cwltool`` log is regularly sent to a designated S3 bucket (Tibanna log bucket). Logs generated by the AWS Lambda functions are sent to AWS CloudWatch, a service provided by AWS; AWS Step function sends logs either as an output json or as an exception. Users can ssh into the EC2 instance where a workflow is currently being executed, for more detailed investigation. A metrics plot is generated and stored for every job for monitoring how CPU/Memory/disk usage changes over time during the run and for each process. The user can also check the top command outputs generated from the instance at 1 minute interval, without ssh-ing into the machine, since these reports are sent to S3 regularly. Tibanna provides API to access these reports easily. Tibanna uses AWS IAM roles to ensure secure access but also allows use of profile information for accessing public data that requires AWS access keys and secret keys by setting environmental variables for AWS Lambda functions. There is no need to store any security information inside docker image or anywhere in the code. 
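As a small illustration of the S3-based logging described above, the snippet below pulls the most recent top-command snapshot for a job directly from the Tibanna log bucket with boto3 (a minimal sketch; the bucket name and job id are placeholders, and the ``<jobid>.top_latest`` / ``<jobid>.log`` key naming follows the monitoring documentation later in this patch):

::

    import boto3

    log_bucket = 'my-tibanna-test-bucket'  # placeholder log bucket
    job_id = 'OiHYCN1QoEiP'                # placeholder job id

    s3 = boto3.client('s3')
    # the same pattern works for <jobid>.log and <jobid>.postrun.json
    obj = s3.get_object(Bucket=log_bucket, Key=job_id + '.top_latest')
    print(obj['Body'].read().decode('utf-8'))
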
diff --git a/docs/images/atacseq_4dn_run.png b/docs/images/atacseq_4dn_run.png old mode 100644 new mode 100755 diff --git a/docs/images/atacseq_aln_4dn_wf.png b/docs/images/atacseq_aln_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/atacseq_postaln_4dn_wf.png b/docs/images/atacseq_postaln_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/awsem_ec2_console.png b/docs/images/awsem_ec2_console.png old mode 100644 new mode 100755 diff --git a/docs/images/bwa_4dn_wf.png b/docs/images/bwa_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/chipseq_4dn_run.png b/docs/images/chipseq_4dn_run.png old mode 100644 new mode 100755 diff --git a/docs/images/chipseq_aln_chip_4dn_wf.png b/docs/images/chipseq_aln_chip_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/chipseq_aln_ctl_4dn_wf.png b/docs/images/chipseq_aln_ctl_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/chipseq_postaln_4dn_wf.png b/docs/images/chipseq_postaln_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/cloudwatch_dashboard_example.png b/docs/images/cloudwatch_dashboard_example.png old mode 100644 new mode 100755 diff --git a/docs/images/console_account.png b/docs/images/console_account.png old mode 100644 new mode 100755 diff --git a/docs/images/console_account_number.png b/docs/images/console_account_number.png old mode 100644 new mode 100755 diff --git a/docs/images/double-nested_array_example.png b/docs/images/double-nested_array_example.png old mode 100644 new mode 100755 diff --git a/docs/images/fastqc_4dn_run.png b/docs/images/fastqc_4dn_run.png old mode 100644 new mode 100755 diff --git a/docs/images/hic_4dn_run.png b/docs/images/hic_4dn_run.png old mode 100644 new mode 100755 diff --git a/docs/images/hicbam_4dn_wf.png b/docs/images/hicbam_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/hicpairs_4dn_wf.png b/docs/images/hicpairs_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/md5_4dn_run.png b/docs/images/md5_4dn_run.png old mode 100644 new mode 100755 diff --git a/docs/images/metrics_plot_01.png b/docs/images/metrics_plot_01.png new file mode 100644 index 000000000..247fe0f9f Binary files /dev/null and b/docs/images/metrics_plot_01.png differ diff --git a/docs/images/metrics_plot_02.png b/docs/images/metrics_plot_02.png new file mode 100644 index 000000000..31fcfa504 Binary files /dev/null and b/docs/images/metrics_plot_02.png differ diff --git a/docs/images/metrics_plot_03.png b/docs/images/metrics_plot_03.png new file mode 100644 index 000000000..652ee82bc Binary files /dev/null and b/docs/images/metrics_plot_03.png differ diff --git a/docs/images/metrics_plot_04.png b/docs/images/metrics_plot_04.png new file mode 100644 index 000000000..595db4afd Binary files /dev/null and b/docs/images/metrics_plot_04.png differ diff --git a/docs/images/metrics_plot_05.png b/docs/images/metrics_plot_05.png new file mode 100644 index 000000000..402e19f28 Binary files /dev/null and b/docs/images/metrics_plot_05.png differ diff --git a/docs/images/metrics_plot_06.png b/docs/images/metrics_plot_06.png new file mode 100644 index 000000000..227ebb151 Binary files /dev/null and b/docs/images/metrics_plot_06.png differ diff --git a/docs/images/metrics_plot_07.png b/docs/images/metrics_plot_07.png new file mode 100644 index 000000000..500d091ae Binary files /dev/null and b/docs/images/metrics_plot_07.png differ diff --git a/docs/images/nested_array_example.png b/docs/images/nested_array_example.png 
old mode 100644 new mode 100755 diff --git a/docs/images/pairsqc_4dn_wf.png b/docs/images/pairsqc_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/repliseq_a_4dn_wf.png b/docs/images/repliseq_a_4dn_wf.png old mode 100644 new mode 100755 diff --git a/docs/images/report.png b/docs/images/report.png old mode 100644 new mode 100755 diff --git a/docs/images/screenshot_tibanna_pony.png b/docs/images/screenshot_tibanna_pony.png old mode 100644 new mode 100755 diff --git a/docs/images/screenshot_tibanna_unicorn.png b/docs/images/screenshot_tibanna_unicorn.png old mode 100644 new mode 100755 diff --git a/docs/images/stepfunction_unicorn_screenshot.png b/docs/images/stepfunction_unicorn_screenshot.png old mode 100644 new mode 100755 diff --git a/docs/images/stepfunction_unicorn_screenshot_fail.png b/docs/images/stepfunction_unicorn_screenshot_fail.png old mode 100644 new mode 100755 diff --git a/docs/images/tibanna_diagram_20180207.png b/docs/images/tibanna_diagram_20180207.png deleted file mode 100644 index 3f48f5523..000000000 Binary files a/docs/images/tibanna_diagram_20180207.png and /dev/null differ diff --git a/docs/images/tibanna_diagram_20180817.png b/docs/images/tibanna_diagram_20180817.png old mode 100644 new mode 100755 diff --git a/docs/images/tibanna_v6.png b/docs/images/tibanna_v6.png old mode 100644 new mode 100755 diff --git a/docs/index.rst b/docs/index.rst old mode 100644 new mode 100755 diff --git a/docs/installation.rst b/docs/installation.rst old mode 100644 new mode 100755 index 5b0927c0a..1f61d3c08 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -11,7 +11,7 @@ Installing Tibanna package Tibanna works with the following Python and pip versions. - Python 3.6 -- Pip 9, 10, 18 or 19 +- Pip 9, 10, 18, 19, 20 Install Tibanna on your local machine or server from which you want to send commands to run workflows. @@ -42,6 +42,15 @@ Alternatively, use ``git clone`` followed by ``setup.py`` python setup.py install +Starting version ``1.0.0``, there is also a Docker image that contains the same version of tibanna as the image tag. This image is used on the EC2 AWSEM instances and not for a local use. The image contains many other things including Docker, Singularity, Cromwell, cwltool, etc. in addition to Tibanna and therefore not recommended, but in case the above two somehow didn't work in your environment, and if you have Docker, you could try: + +:: + + docker run -it 4dndcic/tibanna-awsf:1.0.0 bash + # You could use a different version tag instead of 1.0.0 + # you can also mount your local directories and files as needed. + + AWS configuration ----------------- diff --git a/docs/monitoring.rst b/docs/monitoring.rst old mode 100644 new mode 100755 index bdd0edd08..1a95ba78c --- a/docs/monitoring.rst +++ b/docs/monitoring.rst @@ -15,7 +15,7 @@ General stats :: - tibanna stat [--sfn=] [--status=RUNNING|SUCCEEDED|FAILED|TIMED_OUT|ABORTED] [-l] [-n ] + tibanna stat [--sfn=] [--status=RUNNING|SUCCEEDED|FAILED|TIMED_OUT|ABORTED] [-l] [-n ] [-j [] [...]] The output is a table (an example below) @@ -56,68 +56,61 @@ Using your job ID, you can also check your S3 bucket to see if you can find a fi tibanna log --job-id=lSbkdVIQ6VtX | tail -60 -:: +The output looks as below for version ``1.0.0`` or higher (much better organized / formatted than older version logs). 
- "atac.conservative_peak": "/data1/wdl/cromwell-executions/atac/14efe06b-a010-42c9-be0f-82f33f4d877c/call-reproducibility_overlap/execution/glob-c12e49ae1deb87ae04019b575ae1ffe9/conservative_peak.narrowPeak.bb" - } - [2019-02-04 17:09:59,15] [info] WorkflowManagerActor WorkflowActor-14efe06b-a010-42c9-be0f-82f33f4d877c is in a terminal state: WorkflowSucceededState - [2019-02-04 17:10:41,23] [info] SingleWorkflowRunnerActor workflow finished with status 'Succeeded'. - { - "outputs": { - "atac.optimal_peak": "/data1/wdl/cromwell-executions/atac/14efe06b-a010-42c9-be0f-82f33f4d877c/call-reproducibility_overlap/execution/glob-6150deffcc38df7a1bcd007f08a547cd/optimal_peak.narrowPeak.bb", - "atac.sig_fc": "/data1/wdl/cromwell-executions/atac/14efe06b-a010-42c9-be0f-82f33f4d877c/call-macs2_pooled/execution/glob-8876d8ced974dc46a0c7a4fac20a3a95/4DNFIZYWOA3Y.pooled.fc.signal.bigwig", - "atac.report": "/data1/wdl/cromwell-executions/atac/14efe06b-a010-42c9-be0f-82f33f4d877c/call-qc_report/execution/glob-eae855c82d0f7e2185388856e7b2cc7b/qc.html", - "atac.first_ta": null, - "atac.qc_json": "/data1/wdl/cromwell-executions/atac/14efe06b-a010-42c9-be0f-82f33f4d877c/call-qc_report/execution/glob-3440f922973abb7a616aaf203e0db08b/qc.json", - "atac.conservative_peak": "/data1/wdl/cromwell-executions/atac/14efe06b-a010-42c9-be0f-82f33f4d877c/call-reproducibility_overlap/execution/glob-c12e49ae1deb87ae04019b575ae1ffe9/conservative_peak.narrowPeak.bb" - }, - "id": "14efe06b-a010-42c9-be0f-82f33f4d877c" - } - [2019-02-04 17:10:43,02] [info] SingleWorkflowRunnerActor writing metadata to /data1/out/lSbkdVIQ6VtX.log.json - [2019-02-04 17:10:43,03] [info] Workflow polling stopped - [2019-02-04 17:10:43,04] [info] Shutting down WorkflowStoreActor - Timeout = 5 seconds - [2019-02-04 17:10:43,05] [info] Shutting down WorkflowLogCopyRouter - Timeout = 5 seconds - [2019-02-04 17:10:43,05] [info] Shutting down JobExecutionTokenDispenser - Timeout = 5 seconds - [2019-02-04 17:10:43,05] [info] JobExecutionTokenDispenser stopped - [2019-02-04 17:10:43,06] [info] Aborting all running workflows. 
- [2019-02-04 17:10:43,06] [info] WorkflowStoreActor stopped - [2019-02-04 17:10:43,06] [info] WorkflowLogCopyRouter stopped - [2019-02-04 17:10:43,06] [info] Shutting down WorkflowManagerActor - Timeout = 3600 seconds - [2019-02-04 17:10:43,06] [info] WorkflowManagerActor All workflows finished - [2019-02-04 17:10:43,06] [info] WorkflowManagerActor stopped - [2019-02-04 17:10:43,06] [info] Connection pools shut down - [2019-02-04 17:10:43,06] [info] Shutting down SubWorkflowStoreActor - Timeout = 1800 seconds - [2019-02-04 17:10:43,06] [info] Shutting down JobStoreActor - Timeout = 1800 seconds - [2019-02-04 17:10:43,06] [info] Shutting down CallCacheWriteActor - Timeout = 1800 seconds - [2019-02-04 17:10:43,06] [info] SubWorkflowStoreActor stopped - [2019-02-04 17:10:43,06] [info] Shutting down ServiceRegistryActor - Timeout = 1800 seconds - [2019-02-04 17:10:43,06] [info] Shutting down DockerHashActor - Timeout = 1800 seconds - [2019-02-04 17:10:43,06] [info] Shutting down IoProxy - Timeout = 1800 seconds - [2019-02-04 17:10:43,07] [info] KvWriteActor Shutting down: 0 queued messages to process - [2019-02-04 17:10:43,07] [info] WriteMetadataActor Shutting down: 0 queued messages to process - [2019-02-04 17:10:43,07] [info] CallCacheWriteActor Shutting down: 0 queued messages to process - [2019-02-04 17:10:43,07] [info] CallCacheWriteActor stopped - [2019-02-04 17:10:43,07] [info] DockerHashActor stopped - [2019-02-04 17:10:43,07] [info] IoProxy stopped - [2019-02-04 17:10:43,07] [info] ServiceRegistryActor stopped - [2019-02-04 17:10:43,07] [info] JobStoreActor stopped - [2019-02-04 17:10:43,08] [info] Database closed - [2019-02-04 17:10:43,08] [info] Stream materializer shut down - [2019-02-04 17:10:43,08] [info] WDL HTTP import resolver closed - Mon Feb 4 17:10:44 UTC 2019 - total 228K - -rw-r--r-- 1 root root 144K Feb 4 17:10 lSbkdVIQ6VtX.log.json - -rw-r--r-- 1 root root 0 Feb 4 17:10 lSbkdVIQ6VtX.md5sum.txt - -rwxr-xr-x 1 ubuntu root 78K Feb 4 17:10 lSbkdVIQ6VtX.log - Filesystem Size Used Avail Use% Mounted on - udev 16G 0 16G 0% /dev - tmpfs 3.1G 8.5M 3.1G 1% /run - /dev/nvme0n1p1 7.7G 5.9G 1.9G 76% / - tmpfs 16G 0 16G 0% /dev/shm - tmpfs 5.0M 0 5.0M 0% /run/lock - tmpfs 16G 0 16G 0% /sys/fs/cgroup - /dev/nvme1n1 90G 8.4G 77G 10% /data1 +:: + ## job id: tvfZLFlt3PBz + ## instance type: t3.micro + ## instance id: i-0be6e6be5723ecd24 + ## instance region: us-east-1 + ## tibanna lambda version: 1.0.0 + ## awsf image: duplexa/tibanna-awsf:1.0.0 + ## ami id: ami-0a7ddfc7e412ab6e0 + ## availability zone: us-east-1f + ## security groups: default + ## log bucket: my-tibanna-test-bucket + ## shutdown min: 30 + + ## Starting... + Tue Nov 3 20:47:19 UTC 2020 + + ... + + + ## Running CWL/WDL/Snakemake/Shell commands + + ## workflow language: wdl + ## Operating System: Ubuntu 20.04.1 LTS (containerized) + ## Docker Root Dir: /mnt/data1/docker + ## CPUs: 16 + ## Total Memory: 40.18GiB + + ... + + + INFO /usr/local/bin/cwltool 3.0.20201017180608 + INFO Resolved 'workflow_gatk-GenotypeGVCFs_plus_vcf-integrity-check.cwl' to 'file:///mnt/data1/cwl/workflow_gatk-GenotypeGVCFs_plus_vcf-integrity-check.cwl' + INFO [workflow ] start + INFO [workflow ] starting step gatk-GenotypeGVCFs + INFO [step gatk-GenotypeGVCFs] start + + ... 
+ + + 22:12:34.599 WARN InbreedingCoeff - Annotation will not be calculated, must provide at least 10 samples + 22:12:34.599 WARN InbreedingCoeff - Annotation will not be calculated, must provide at least 10 samples + 22:12:34.600 WARN InbreedingCoeff - Annotation will not be calculated, must provide at least 10 samples + 22:12:34.601 WARN InbreedingCoeff - Annotation will not be calculated, must provide at least 10 samples + 22:12:35.852 INFO ProgressMeter - chr14:106769920 50.4 79043000 1567469.6 + 22:12:36.890 INFO ProgressMeter - chr14:106882957 50.4 79071726 1567501.5 + 22:12:36.890 INFO ProgressMeter - Traversal complete. Processed 79071726 total variants in 50.4 minutes. + 22:12:36.999 INFO GenotypeGVCFs - Shutting down engine + [November 3, 2020 10:12:37 PM UTC] org.broadinstitute.hellbender.tools.walkers.GenotypeGVCFs done. Elapsed time: 50.48 minutes. + Runtime.totalMemory()=1915224064 + Using GATK jar /miniconda3/share/gatk4-4.1.2.0-1/gatk-package-4.1.2.0-local.jar + + To Download the log file manually, the following command also works. @@ -127,11 +120,72 @@ To Download the log file manually, the following command also works. aws s3 cp s3://<log_bucket>/<jobid>.log . +Top and Top_latest +################## + + +As of version ``1.0.0``, the top command output is sent to ``<jobid>.top`` and ``<jobid>.top_latest`` in the log bucket. The top command output used to be mixed into the log file (``<jobid>.log``) in previous versions. With the ``tibanna log`` command and the options ``-t`` (all top output) and ``-T`` (latest only), one can print out the top command output from the running instance. The data are collected at 1-minute intervals and only while the command is running (e.g. not while the input data are being downloaded to the EC2 instance or ssh is being configured). + +To use this feature, the tibanna unicorn must be deployed with tibanna >= ``1.0.0`` and the locally installed version must be >= ``1.0.0`` as well. + +Below is an example command and its output, executed twice with a 1-minute interval. In this example, the user can see that around 20:49:01, ``unpigz`` was running and around 20:50:01, many ``java`` processes were running (these depend on the command / workflow). + +:: + + tibanna log -j OiHYCN1QoEiP -T + +:: + + Timestamp: 2021-01-20-20:49:01 + top - 20:49:01 up 1 min, 0 users, load average: 2.11, 0.75, 0.27 + Tasks: 15 total, 2 running, 13 sleeping, 0 stopped, 0 zombie + %Cpu(s): 13.1 us, 6.4 sy, 0.0 ni, 80.5 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st + MiB Mem : 41139.5 total, 32216.5 free, 675.9 used, 8247.1 buff/cache + MiB Swap: 0.0 total, 0.0 free, 0.0 used. 39951.0 avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 54 root 20 0 2928856 102488 48260 S 186.7 0.2 0:44.95 dockerd + 858 root 20 0 28904 1228 1128 R 153.3 0.0 0:09.18 unpigz + 859 root 20 0 1673140 80084 44464 S 46.7 0.2 0:02.91 exe + 1 root 20 0 7104 3692 3348 S 0.0 0.0 0:00.02 run.sh + 94 root 20 0 1781488 45328 25740 S 0.0 0.1 0:00.12 contain+ + 319 root 20 0 1792992 14660 9056 S 0.0 0.0 0:00.10 goofys-+ + 325 root 20 0 1571284 14136 9080 S 0.0 0.0 0:00.08 goofys-+ + 382 root 20 0 6812 2076 1868 S 0.0 0.0 0:00.00 cron + + +If we run the command again in ~1 min, we may get a different snapshot. This way, we can monitor in near-real time which programs are running and how many resources they are using.
+ +:: + + tibanna log -j OiHYCN1QoEiP -T + +:: + + Timestamp: 2021-01-20-20:50:01 + top - 20:50:01 up 2 min, 0 users, load average: 18.06, 4.84, 1.67 + Tasks: 45 total, 1 running, 44 sleeping, 0 stopped, 0 zombie + %Cpu(s): 93.6 us, 6.4 sy, 0.0 ni, 0.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st + MiB Mem : 41139.5 total, 16099.9 free, 16978.6 used, 8061.1 buff/cache + MiB Swap: 0.0 total, 0.0 free, 0.0 used. 23657.1 avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 2085 root 20 0 7984200 1.1g 31356 S 253.3 2.8 0:28.85 java + 2114 root 20 0 7984200 1.2g 31512 S 206.7 2.9 0:25.40 java + 2095 root 20 0 7984200 1.2g 31328 S 186.7 3.0 0:24.46 java + 2208 root 20 0 7984200 1.1g 31356 S 133.3 2.8 0:27.61 java + 2121 root 20 0 7984200 1.2g 31480 S 120.0 2.9 0:26.81 java + 2189 root 20 0 7984200 1.2g 31372 S 120.0 3.0 0:30.18 java + 2122 root 20 0 7984200 1.1g 31232 S 100.0 2.8 0:28.88 java + 2148 root 20 0 7984200 1.0g 31284 S 100.0 2.5 0:29.71 java + + Postrun.json ############ Once the job is finished, you should be able to find the ``<jobid>.postrun.json`` file as well. This file can be viewed likewise using the ``tibanna log`` command, but with the ``-p`` option. The postrun json file contains the summary of the run, including the input / output / EC2 configuration and Cloudwatch metrics for memory/CPU/disk space usage. +Starting with version ``1.0.0``, you can get an incomplete postrun.json before the job is finished, in addition to the complete postrun.json that you get at the end of the run. The incomplete postrun.json will not have the metrics, job status, end time, etc., but will include the instance ID and file system. :: @@ -415,10 +469,15 @@ By default the command will retrieve the data from cloud watch, and creates seve - a metrics_report.tsv containing the average statistics and other information about the EC2 instance - a metrics.html report for visualization -All the files are eventually uploaded to a folder named <jobid>.metrics inside the log S3 bucket specified for tibanna output. +All the files are eventually uploaded to a folder named ``<jobid>.metrics`` inside the log S3 bucket specified for tibanna output. To visualize the html report the URL structure is: ``https://<log_bucket>.s3.amazonaws.com/<jobid>.metrics/metrics.html`` -**Basic Command** +Starting with ``1.0.0``, the metrics plot will include per-process CPU and memory profiles retrieved from the top command reports at a 1-minute interval. Additional files ``top_cpu.tsv`` and ``top_mem.tsv`` will also be created under the same folder ``<jobid>.metrics``. + + + +Basic Command +############# :: @@ -444,15 +503,20 @@ To visualize the html report the URL structure is: ``https://.s3.ama -B|--do-not-open-browser Do not open the browser to visualize the metrics html after it has been created/updated + -e|--endtime=<endtime> Endtime (default: job end time if the job has finished + or the current time) + -i|--instance-id=<instance_id> Manually provide the instance_id if somehow tibanna fails + to retrieve the info + When metrics are collected for a run that is complete, a lock file is automatically created inside the same folder. The command will not update the metrics files if a lock file is present. To override this behavior, the ``--force-upload`` flag allows uploading the metrics files, ignoring the lock. The ``--update-html-only`` flag allows updating only the metrics.html file without modifying the other tsv files. By default, the command will open the html report in the browser for visualization when execution is complete; ``--do-not-open-browser`` can be added to prevent this behavior.
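For reference, the same metrics files can also be listed or downloaded directly from the log bucket with the standard AWS CLI. The following is a minimal sketch only; ``<log_bucket>`` and ``<jobid>`` are placeholders to replace with your own log bucket name and job ID.

::

    # List the metrics files generated for a job
    # (metrics.html, metrics_report.tsv, top_cpu.tsv, top_mem.tsv, ...)
    aws s3 ls s3://<log_bucket>/<jobid>.metrics/

    # Download the whole metrics folder locally, e.g. to inspect the tsv files
    aws s3 sync s3://<log_bucket>/<jobid>.metrics/ <jobid>.metrics/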
-Metrics collected -################# +Summary metrics collected as a table +#################################### -The metrics that are collected are: +Some summary metrics are collected and shown in the table at the beginning of the metrics report. They are: - EC2 Instance type ---- @@ -469,7 +533,20 @@ The metrics that are collected are: html report example ################### -.. image:: images/report.png + +.. image:: images/metrics_plot_01.png + +.. image:: images/metrics_plot_02.png + +.. image:: images/metrics_plot_03.png + +.. image:: images/metrics_plot_04.png + +.. image:: images/metrics_plot_05.png + +.. image:: images/metrics_plot_06.png + +.. image:: images/metrics_plot_07.png cost diff --git a/docs/news.rst b/docs/news.rst old mode 100644 new mode 100755 diff --git a/docs/requirements.txt b/docs/requirements.txt old mode 100644 new mode 100755 diff --git a/docs/simple_example.rst b/docs/simple_example.rst old mode 100644 new mode 100755 diff --git a/docs/simple_example_cond_merge.rst b/docs/simple_example_cond_merge.rst old mode 100644 new mode 100755 index 7ffce5760..ec7cb8046 --- a/docs/simple_example_cond_merge.rst +++ b/docs/simple_example_cond_merge.rst @@ -140,15 +140,10 @@ Job description for WDL }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/docs/simple_example_hello.rst b/docs/simple_example_hello.rst old mode 100644 new mode 100755 index ff4d1fcce..2dd226c46 --- a/docs/simple_example_hello.rst +++ b/docs/simple_example_hello.rst @@ -35,8 +35,8 @@ This json can be found at https://github.com/4dn-dcic/tibanna/blob/master/exampl }, "config": { "ebs_size": 10, - "instance_type": "t2.micro", - "EBS_optimized": false, + "instance_type": "t3.micro", + "EBS_optimized": true, "password": "whateverpasswordworks", "log_bucket": "my-tibanna-test-bucket" } diff --git a/docs/simple_example_md5.rst b/docs/simple_example_md5.rst old mode 100644 new mode 100755 index 895c158af..a3f882dd4 --- a/docs/simple_example_md5.rst +++ b/docs/simple_example_md5.rst @@ -224,7 +224,7 @@ Job description for CWL "config": { "ebs_size": 10, "EBS_optimized": false, - "instance_type": "t2.micro", + "instance_type": "t3.micro", "password": "whateverpasswordworks", "log_bucket": "my-tibanna-test-bucket" } @@ -233,7 +233,7 @@ The json file specifies the input with ``gzfile``, matching the name in CWL. In this example it is ``somefastqfile.fastq.gz`` on bucket ``my-tibanna-test-input-bucket``. The output file will be renamed to ``some_sub_dirname/my_first_md5_report`` in a bucket named ``my-tibanna-test-bucket``. In the input json, we specify the CWL file with ``cwl_main_filename`` and its url with ``cwl_directory_url``. Note that the file name itself is not included in the url). - We also specified in ``config``, that we need 10GB space total (``ebs_size``) and we're going to run an EC2 instance (VM) of type ``t2.micro`` which comes with 1 CPU and 1GB memory. + We also specified in ``config``, that we need 10GB space total (``ebs_size``) and we're going to run an EC2 instance (VM) of type ``t3.micro`` which comes with 1 CPU and 1GB memory.
Job description for WDL @@ -270,7 +270,7 @@ Job description for WDL "config": { "ebs_size": 10, "EBS_optimized": false, - "instance_type": "t2.micro", + "instance_type": "t3.micro", "password": "whateverpasswordworks", "log_bucket": "my-tibanna-test-bucket" } @@ -279,7 +279,7 @@ Job description for WDL The json file specifies the input with ``md5.md5_step.gzfile``, matching the name in WDL. In this example it is ``somefastqfile.fastq.gz`` on bucket ``my-tibanna-test-input-bucket``. The output file will be renamed to ``some_sub_dirname/my_first_md5_report`` in a bucket named ``my-tibanna-test-bucket``. In the input json, we specify the WDL file with ``wdl_filename`` and its url with ``wdl_directory_url``. Note that the file name itself is not included in the url). - The config field is identical to the CWL input json. In ``config``, we specify that we need 10GB space total (``ebs_size``) and we're going to run an EC2 instance (VM) of type ``t2.micro`` which comes with 1 CPU and 1GB memory. + The config field is identical to the CWL input json. In ``config``, we specify that we need 10GB space total (``ebs_size``) and we're going to run an EC2 instance (VM) of type ``t3.micro`` which comes with 1 CPU and 1GB memory. @@ -316,7 +316,7 @@ Job description for shell }, "config": { "ebs_size": 10, - "instance_type": "t2.micro", + "instance_type": "t3.micro", "EBS_optimized": false, "password": "whateverpasswordworks", "log_bucket": "my-tibanna-test-bucket" diff --git a/docs/simple_example_merge.rst b/docs/simple_example_merge.rst old mode 100644 new mode 100755 index 1d69a9e07..95b06fab0 --- a/docs/simple_example_merge.rst +++ b/docs/simple_example_merge.rst @@ -56,7 +56,7 @@ CWL Since this is a multi-step pipeline, we use three CWL files, ``merge.cwl`` (master workflow CWL) and two other CWL files ``paste.cwl`` and ``cat.cwl`` that are called by ``merge.cwl``. These CWL files can be found at https://raw.githubusercontent.com/4dn-dcic/tibanna/master/examples/merge/merge.cwl, https://raw.githubusercontent.com/4dn-dcic/tibanna/master/examples/merge/paste.cwl and https://raw.githubusercontent.com/4dn-dcic/tibanna/master/examples/merge/cat.cwl. - To use your own CWL file, you'll need to make sure it is accessible via HTTP so Tibanna can download it with ``wget``: If you're using github, you could use raw.githubusercontent.com like the link above. + To use your own CWL file, you'll need to make sure it is accessible via HTTP so Tibanna can download it with ``wget``: If you're using github, you could use raw.githubusercontent.com like the link above. Alternatively, you can have it as a local file and provide ``cwl_directory_local`` instead of ``cwl_directory_url``. The following is ``merge.cwl``. It is of class 'workflow' and defines inputs, outputs and steps. For the other two CWL files (``paste.cwl`` and ``cat.cwl``), see the links above. @@ -195,21 +195,16 @@ Job description for CWL }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } The json file specifies the input nested file array ("smallfiles") (``[["smallfile1", "smallfile2"], ["smallfile3", "smallfile4"]]``), matching the name in CWL.
The output file will be renamed to ``some_sub_dirname/my_first_merged_file`` in a bucket named ``my-tibanna-test-bucket``. In the input json, we specify the CWL file with ``cwl_main_filename`` and its url with ``cwl_directory_url``. Note that the file name itself is not included in the url). Note that child CWL files are also specified in this case (``"cwl_child_filenames": ["paste.cwl", "cat.cwl"]``). - We also specified in ``config``, that we need 10GB space total (``ebs_size``) and we're going to run an EC2 instance (VM) of type ``t2.micro`` which comes with 1 CPU and 1GB memory. + We also specified in ``config``, that we need 10GB space total (``ebs_size``) and we're going to run an EC2 instance (VM) of type ``t3.micro`` which comes with 1 CPU and 1GB memory. Job description for WDL @@ -245,15 +240,10 @@ Job description for WDL }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/docs/simple_example_merge_and_cut.rst b/docs/simple_example_merge_and_cut.rst old mode 100644 new mode 100755 index a0532c7ca..7b26b32da --- a/docs/simple_example_merge_and_cut.rst +++ b/docs/simple_example_merge_and_cut.rst @@ -151,7 +151,7 @@ To run the pipeline on a specific input file using Tibanna, we need to create an Job description for CWL ####################### - The example job description for CWL is shown below and it can also be found at https://raw.githubusercontent.com/4dn-dcic/tibanna/master/examples/merge/merge_cwl_input.json. + The example job description for CWL is shown below and it can also be found at https://raw.githubusercontent.com/4dn-dcic/tibanna/master/examples/merge_and_cut/merge_and_cut_cwl_input.json. :: @@ -182,22 +182,17 @@ Job description for CWL }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": "now", - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } The json file specifies the input double-nested file array ("smallfiles"), matching the name in CWL. The output file will be renamed to ``some_sub_dirname/my_first_merged_and_cut_file`` in a bucket named ``my-tibanna-test-bucket``. In the input json, we specify the CWL file with ``cwl_main_filename`` and its url with ``cwl_directory_url``. Note that the file name itself is not included in the url). Note that child CWL files are also specified in this case (``"cwl_child_filenames": ["merge.cwl", "paste.cwl", "cat.cwl", "cut.cwl"]``). - We also specified in ``config``, that we need 10GB space total (``ebs_size``) and we're going to run an EC2 instance (VM) of type ``t2.micro`` which comes with 1 CPU and 1GB memory. + We also specified in ``config``, that we need 10GB space total (``ebs_size``) and we're going to run an EC2 instance (VM) of type ``t3.micro`` which comes with 1 CPU and 1GB memory. 
Job description for WDL @@ -236,15 +231,10 @@ }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/docs/snakemake.rst b/docs/snakemake.rst old mode 100644 new mode 100755 diff --git a/docs/startaws.rst b/docs/startaws.rst old mode 100644 new mode 100755 index e9639c0a6..07497839d --- a/docs/startaws.rst +++ b/docs/startaws.rst @@ -6,5 +6,5 @@ Check Before using Tibanna - Before using Tibanna, one must have an **AWS account**. - An **admin** user with access key and secret key **sets up and deploys Tibanna** for a specific user group and specific buckets. - A **regular user**, with their own access key and secret key, associated with the user group can upload data to the bucket and **run jobs using Tibanna**. -- In addition, your *workflows* must be written in either *CWL (Common Workflow Language)* or *WDL (Workflow Description Language)* which point to a docker image on *docker hub*. The CWL/WDL files must have a *public url*. +- In addition, your *workflows* must be written in either *CWL (Common Workflow Language)* or *WDL (Workflow Description Language)*, which point to a docker image on *docker hub* or on AWS ECR (Elastic Container Registry) in the same AWS account. Alternatively, you can use a *Snakemake* workflow, which is run as a whole on a single EC2 machine inside a Snakemake docker image. A CWL/WDL/Snakemake file can be a public url, a local file, or a file on a (public or private) S3 bucket. diff --git a/docs/wdl.rst b/docs/wdl.rst old mode 100644 new mode 100755 index 9b5db5798..4d03449e2 --- a/docs/wdl.rst +++ b/docs/wdl.rst @@ -2,4 +2,5 @@ Workflow Description Language (WDL) =================================== -Tibanna supports WDL draft-2, through Cromwell binary version 31. +Tibanna version < ``1.0.0`` supports WDL draft-2, through Cromwell binary version 31. Tibanna version >= ``1.0.0`` supports both WDL draft-2 and v1.0, through Cromwell binary versions 31 and 53, respectively. This is because some of our old WDL pipelines written in draft-2 no longer work with the new Cromwell version, and we wanted to ensure backward compatibility. If you want to use WDL draft-2, specify ``"language": "wdl_draft2"`` instead of ``"language": "wdl"``, which defaults to WDL v1.0.
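For example, a job description only needs the ``language`` field changed to force the draft-2 interpreter. The sketch below reuses the WDL keys from the md5 example above (``wdl_filename``, ``wdl_directory_url``) and omits the ``args`` input/output fields, which stay exactly as in that example; the file name and directory url are placeholders rather than values taken from the original docs.

::

    {
      "args": {
        "language": "wdl_draft2",
        "wdl_filename": "md5.wdl",
        "wdl_directory_url": "<public url, S3 url or local directory containing md5.wdl>"
      },
      "config": {
        "ebs_size": 10,
        "EBS_optimized": true,
        "instance_type": "t3.micro",
        "password": "whateverpasswordworks",
        "log_bucket": "my-tibanna-test-bucket"
      }
    }

With ``"language": "wdl"`` instead, the same workflow file would be interpreted as WDL v1.0 and run through Cromwell 53.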
+ diff --git a/examples/cond_merge/cond_merge.wdl b/examples/cond_merge/cond_merge.wdl old mode 100644 new mode 100755 diff --git a/examples/cond_merge/cond_merge_wdl_input.json b/examples/cond_merge/cond_merge_wdl_input.json old mode 100644 new mode 100755 index 96fb38a25..e7acca5bc --- a/examples/cond_merge/cond_merge_wdl_input.json +++ b/examples/cond_merge/cond_merge_wdl_input.json @@ -25,14 +25,9 @@ }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/examples/cond_merge/cond_merge_wdl_input2.json b/examples/cond_merge/cond_merge_wdl_input2.json old mode 100644 new mode 100755 index c618ca17a..b0086f532 --- a/examples/cond_merge/cond_merge_wdl_input2.json +++ b/examples/cond_merge/cond_merge_wdl_input2.json @@ -25,14 +25,9 @@ }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/examples/cond_merge/sf1 b/examples/cond_merge/sf1 old mode 100644 new mode 100755 diff --git a/examples/cond_merge/sf2 b/examples/cond_merge/sf2 old mode 100644 new mode 100755 diff --git a/examples/cond_merge/sf3 b/examples/cond_merge/sf3 old mode 100644 new mode 100755 diff --git a/examples/cond_merge/sf4 b/examples/cond_merge/sf4 old mode 100644 new mode 100755 diff --git a/examples/hello/hello_shell_input.json b/examples/hello/hello_shell_input.json old mode 100644 new mode 100755 index 6274dd3b9..1ec56a309 --- a/examples/hello/hello_shell_input.json +++ b/examples/hello/hello_shell_input.json @@ -15,8 +15,8 @@ }, "config": { "ebs_size": 10, - "instance_type": "t2.micro", - "EBS_optimized": false, + "instance_type": "t3.micro", + "EBS_optimized": true, "password": "whateverpasswordworks", "log_bucket": "my-tibanna-test-bucket" } diff --git a/examples/md5/md5.cwl b/examples/md5/md5.cwl old mode 100644 new mode 100755 diff --git a/examples/md5/md5.wdl b/examples/md5/md5.wdl old mode 100644 new mode 100755 diff --git a/examples/md5/md5_cwl_input.json b/examples/md5/md5_cwl_input.json old mode 100644 new mode 100755 diff --git a/examples/md5/md5_shell_input.json b/examples/md5/md5_shell_input.json old mode 100644 new mode 100755 diff --git a/examples/md5/md5_wdl_input.json b/examples/md5/md5_wdl_input.json old mode 100644 new mode 100755 diff --git a/examples/merge/cat.cwl b/examples/merge/cat.cwl old mode 100644 new mode 100755 diff --git a/examples/merge/concatenated b/examples/merge/concatenated old mode 100644 new mode 100755 diff --git a/examples/merge/merge.cwl b/examples/merge/merge.cwl old mode 100644 new mode 100755 diff --git a/examples/merge/merge.wdl b/examples/merge/merge.wdl old mode 100644 new mode 100755 diff --git a/examples/merge/merge_cwl.run.json b/examples/merge/merge_cwl.run.json old mode 100644 new mode 100755 diff --git a/examples/merge/merge_cwl_input.json b/examples/merge/merge_cwl_input.json old mode 100644 new mode 100755 index cfc9d2137..582d219dd --- a/examples/merge/merge_cwl_input.json +++ 
b/examples/merge/merge_cwl_input.json @@ -22,14 +22,9 @@ }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/examples/merge/merge_wdl_input.json b/examples/merge/merge_wdl_input.json old mode 100644 new mode 100755 index f3bb3a74b..b5be3adca --- a/examples/merge/merge_wdl_input.json +++ b/examples/merge/merge_wdl_input.json @@ -22,14 +22,9 @@ }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/examples/merge/paste.cwl b/examples/merge/paste.cwl old mode 100644 new mode 100755 diff --git a/examples/merge/paste_cwl.run.json b/examples/merge/paste_cwl.run.json old mode 100644 new mode 100755 diff --git a/examples/merge/pasted b/examples/merge/pasted old mode 100644 new mode 100755 diff --git a/examples/merge/sf1 b/examples/merge/sf1 old mode 100644 new mode 100755 diff --git a/examples/merge/sf2 b/examples/merge/sf2 old mode 100644 new mode 100755 diff --git a/examples/merge/sf3 b/examples/merge/sf3 old mode 100644 new mode 100755 diff --git a/examples/merge/sf4 b/examples/merge/sf4 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/cat.cwl b/examples/merge_and_cut/cat.cwl old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/cut.cwl b/examples/merge_and_cut/cut.cwl old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/cut1 b/examples/merge_and_cut/cut1 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/merge.cwl b/examples/merge_and_cut/merge.cwl old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/merge.wdl b/examples/merge_and_cut/merge.wdl old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/merge_and_cut.cwl b/examples/merge_and_cut/merge_and_cut.cwl old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/merge_and_cut.wdl b/examples/merge_and_cut/merge_and_cut.wdl old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/merge_and_cut_cwl.run.json b/examples/merge_and_cut/merge_and_cut_cwl.run.json old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/merge_and_cut_cwl_input.json b/examples/merge_and_cut/merge_and_cut_cwl_input.json old mode 100644 new mode 100755 index 12f2a3889..c50e44148 --- a/examples/merge_and_cut/merge_and_cut_cwl_input.json +++ b/examples/merge_and_cut/merge_and_cut_cwl_input.json @@ -25,14 +25,9 @@ }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": "now", - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/examples/merge_and_cut/merge_and_cut_wdl.run.json b/examples/merge_and_cut/merge_and_cut_wdl.run.json old mode 100644 new mode 100755 diff --git 
a/examples/merge_and_cut/merge_and_cut_wdl_input.json b/examples/merge_and_cut/merge_and_cut_wdl_input.json old mode 100644 new mode 100755 index c62af8fb8..5537f77b5 --- a/examples/merge_and_cut/merge_and_cut_wdl_input.json +++ b/examples/merge_and_cut/merge_and_cut_wdl_input.json @@ -25,14 +25,9 @@ }, "config": { "ebs_size": 10, - "json_bucket": "my-tibanna-test-bucket", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.micro", - "ebs_type": "io1", + "EBS_optimized": true, + "instance_type": "t3.micro", "password": "whateverpasswordworks", - "log_bucket": "my-tibanna-test-bucket", - "key_name": "" + "log_bucket": "my-tibanna-test-bucket" } } diff --git a/examples/merge_and_cut/paste.cwl b/examples/merge_and_cut/paste.cwl old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/sf1 b/examples/merge_and_cut/sf1 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/sf2 b/examples/merge_and_cut/sf2 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/sf3 b/examples/merge_and_cut/sf3 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/sf4 b/examples/merge_and_cut/sf4 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/sf5 b/examples/merge_and_cut/sf5 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/sf6 b/examples/merge_and_cut/sf6 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/sf7 b/examples/merge_and_cut/sf7 old mode 100644 new mode 100755 diff --git a/examples/merge_and_cut/sf8 b/examples/merge_and_cut/sf8 old mode 100644 new mode 100755 diff --git a/old/AMI/create_tibanna_ami.py b/old/AMI/create_tibanna_ami.py old mode 100644 new mode 100755 diff --git a/old/AMI/launch_for_tibanna_ami.sh b/old/AMI/launch_for_tibanna_ami.sh old mode 100644 new mode 100755 diff --git a/old/AMI/tibanna_ami.sh b/old/AMI/tibanna_ami.sh old mode 100644 new mode 100755 diff --git a/old/AMI/tibanna_ami_test.sh b/old/AMI/tibanna_ami_test.sh old mode 100644 new mode 100755 diff --git a/release.sh b/release.sh new file mode 100755 index 000000000..479478856 --- /dev/null +++ b/release.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# build and upload awsf docker image +export BUILD_LOG=/tmp/build-log +export VERSION=$(python -c 'from tibanna._version import __version__; print(__version__)') +export AWSF_IMAGE=$(python -c 'from tibanna.vars import DEFAULT_AWSF_IMAGE; print(DEFAULT_AWSF_IMAGE)') +docker build -t $AWSF_IMAGE --build-arg version=$VERSION awsf3-docker/ > $BUILD_LOG +docker push $AWSF_IMAGE diff --git a/requirements-test.txt b/requirements-test.txt old mode 100644 new mode 100755 index c7cf87b06..7251e0eb9 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -2,8 +2,9 @@ invoke==0.18.1 # syntax checker flake8==2.4.1 # for testing -pytest==3.0.5 +pytest==5.0 pytest-cov==2.3.1 pytest-runner pytest-parallel -mock +mock==4.0 +pytest-mock==3.3 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 diff --git a/setup.cfg b/setup.cfg old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index a19c06b28..d0caaacc0 --- a/setup.py +++ b/setup.py @@ -31,11 +31,12 @@ description='Tibanna runs portable pipelines (in CWL/WDL) on the AWS Cloud.', long_description=long_description, long_description_content_type='text/markdown', - packages=['tibanna'], + packages=['tibanna', 'awsf3'], zip_safe=False, author='4DN Team at Harvard Medical School', - author_email='duplexa@gmail.com, jeremy_johnson@hms.harvard.edu, 
carl_vitzthum@hms.harvard.edu', - url='http://data.4dnucleome.org', + author_email='duplexa@gmail.com, jeremy_johnson@hms.harvard.edu,' + + 'carl_vitzthum@hms.harvard.edu, Michele_Berselli@hms.harvard.edu', + url='http://github.com/4dn-dcic/tibanna', license='MIT', classifiers=[ 'License :: OSI Approved :: MIT License', @@ -50,6 +51,7 @@ entry_points={ 'console_scripts': [ 'tibanna = tibanna.__main__:main', + 'awsf3 = awsf3.__main__:main', ] } ) diff --git a/tasks.py b/tasks.py old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_bwa.json b/test_json/unicorn/4dn_bwa.json deleted file mode 100644 index cf3cbcaf0..000000000 --- a/test_json/unicorn/4dn_bwa.json +++ /dev/null @@ -1,62 +0,0 @@ -{ - "config": { - "ebs_size": 30, - "ebs_type": "io1", - "ebs_iops": 500, - "log_bucket": "tibanna-output", - "key_name": "", - "run_name": "bwa-mem_d07f0f7b-6b82-4396-aa58-596cf56f4c99", - "overwrite_input_extra": false, - "public_postrun_json": true, - "email": false - }, - "_tibanna": { - "run_id": "d07f0f7b-6b82-4396-aa58-596cf56f4c99", - "env": "fourfront-webdev", - "url": "https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_pony_dev:bwa-mem_d07f0f7b-6b82-4396-aa58-596cf56f4c99", - "run_type": "bwa-mem", - "run_name": "bwa-mem_d07f0f7b-6b82-4396-aa58-596cf56f4c99", - "exec_arn": "arn:aws:states:us-east-1:643366669028:execution:tibanna_pony_dev:bwa-mem_d07f0f7b-6b82-4396-aa58-596cf56f4c99" - }, - "args": { - "app_name": "bwa-mem", - "app_version": "0.2.0", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.2.0/cwl_awsem/", - "cwl_main_filename": "bwa-mem.cwl", - "cwl_child_filenames": [], - "wdl_directory_url": "", - "wdl_main_filename": "", - "wdl_child_filenames": "", - "cwl_version": "draft3", - "input_parameters": { - "nThreads": 4 - }, - "additional_benchmarking_parameters": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "output_target": { - "out_bam": "17a279ba-5dad-4813-a06d-3c93185c8603/4DNFIVXSGA6C.bam" - }, - "secondary_output_target": {}, - "input_files": { - "bwa_index": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "rename": "", - "unzip": "", - "object_key": "1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz" - }, - "fastq1": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "rename": "", - "unzip": "", - "object_key": "1150b428-272b-4a0c-b3e6-4b405c148f7c/4DNFIVOZN511.fastq.gz" - }, - "fastq2": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "rename": "", - "unzip": "", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - }, - "secondary_files": {} - } -} diff --git a/test_json/unicorn/4dn_bwa.mount.json b/test_json/unicorn/4dn_bwa.mount.json deleted file mode 100644 index 2efb3dbfd..000000000 --- a/test_json/unicorn/4dn_bwa.mount.json +++ /dev/null @@ -1,63 +0,0 @@ -{ - "config": { - "ebs_size": 30, - "ebs_type": "io1", - "ebs_iops": 500, - "log_bucket": "tibanna-output", - "key_name": "", - "run_name": "bwa-mem_d07f0f7b-6b82-4396-aa58-596cf56f4c99", - "overwrite_input_extra": false, - "public_postrun_json": true, - "email": false - }, - "_tibanna": { - "run_id": "d07f0f7b-6b82-4396-aa58-596cf56f4c99", - "env": "fourfront-webdev", - "url": "https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_pony_dev:bwa-mem_d07f0f7b-6b82-4396-aa58-596cf56f4c99", - 
"run_type": "bwa-mem", - "run_name": "bwa-mem_d07f0f7b-6b82-4396-aa58-596cf56f4c99", - "exec_arn": "arn:aws:states:us-east-1:643366669028:execution:tibanna_pony_dev:bwa-mem_d07f0f7b-6b82-4396-aa58-596cf56f4c99" - }, - "args": { - "app_name": "bwa-mem", - "app_version": "0.2.0", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.2.0/cwl_awsem/", - "cwl_main_filename": "bwa-mem.cwl", - "cwl_child_filenames": [], - "wdl_directory_url": "", - "wdl_main_filename": "", - "wdl_child_filenames": "", - "cwl_version": "draft3", - "input_parameters": { - "nThreads": 4 - }, - "additional_benchmarking_parameters": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "output_target": { - "out_bam": "17a279ba-5dad-4813-a06d-3c93185c8603/4DNFIVXSGA6C.bam" - }, - "secondary_output_target": {}, - "input_files": { - "bwa_index": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "rename": "", - "unzip": "", - "object_key": "1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz", - "mount": true - }, - "fastq1": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "rename": "", - "unzip": "", - "object_key": "1150b428-272b-4a0c-b3e6-4b405c148f7c/4DNFIVOZN511.fastq.gz" - }, - "fastq2": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "rename": "", - "unzip": "", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - }, - "secondary_files": {} - } -} diff --git a/test_json/unicorn/4dn_bwa.runonly-werror.json b/test_json/unicorn/4dn_bwa.runonly-werror.json deleted file mode 100644 index b45ebe39d..000000000 --- a/test_json/unicorn/4dn_bwa.runonly-werror.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "bwa-mem" - }, - "args": { - "app_name": "bwa-mem", - "input_parameters": {}, - "cwl_child_filenames": [], - "output_target": { - "report": "lalala/out.bam" - }, - "secondary_output_target": {}, - "cwl_main_filename": "bwa-mem.11.cwl", - "secondary_files": {}, - "output_S3_bucket": "tibanna-output", - "app_version": "5", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.3/cwl_awsem/", - "cwl_version": "draft3", - "input_files": { - "fastq1": { - "bucket_name": "4dn-tool-evaluation-files", - "object_key": "GM12878_SRR1658581_1pc_1_R1.fastq.h10000" - }, - "fastq2": { - "bucket_name": "4dn-tool-evaluation-files", - "object_key": "GM12878_SRR1658581_1pc_1_R2.fastq.h10000" - }, - "bwa_index": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz" - } - }, - "input_parameters": { - "nThreads": 2 - } - }, - "config": { - - "ebs_size": 30, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.large", - - - "password": "hahaha", - "log_bucket": "tibanna-output" - } -} diff --git a/test_json/unicorn/4dn_bwa.runonly.json b/test_json/unicorn/4dn_bwa.runonly.json deleted file mode 100644 index 1bc0ec968..000000000 --- a/test_json/unicorn/4dn_bwa.runonly.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "bwa-mem" - }, - "args": { - "app_name": "bwa-mem", - "input_parameters": {}, - "cwl_child_filenames": [], - "output_target": { - "out_bam": "lalala/out.bam" - }, - "secondary_output_target": {}, - "cwl_main_filename": "bwa-mem.11.cwl", - "secondary_files": {}, - "output_S3_bucket": 
"tibanna-output", - "app_version": "5", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.3/cwl_awsem/", - "cwl_version": "draft3", - "input_files": { - "fastq1": { - "bucket_name": "4dn-tool-evaluation-files", - "object_key": "GM12878_SRR1658581_1pc_1_R1.fastq.h10000" - }, - "fastq2": { - "bucket_name": "4dn-tool-evaluation-files", - "object_key": "GM12878_SRR1658581_1pc_1_R2.fastq.h10000" - }, - "bwa_index": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz" - } - }, - "input_parameters": { - "nThreads": 2 - } - }, - "config": { - "ebs_size": 30, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.large", - "password": "dragonfly", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/test_json/unicorn/4dn_bwa.runonly.s3.json b/test_json/unicorn/4dn_bwa.runonly.s3.json deleted file mode 100644 index 8ffbaba6e..000000000 --- a/test_json/unicorn/4dn_bwa.runonly.s3.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "bwa-mem" - }, - "args": { - "app_name": "bwa-mem", - "input_parameters": {}, - "cwl_child_filenames": [], - "output_target": { - "out_bam": "lalala/out.bam" - }, - "secondary_output_target": {}, - "cwl_main_filename": "bwa-mem.11.cwl", - "secondary_files": {}, - "output_S3_bucket": "tibanna-output", - "app_version": "5", - "cwl_directory_url": "s3://soos-4dn-bucket/testcwl2/", - "cwl_version": "draft3", - "input_files": { - "fastq1": { - "bucket_name": "4dn-tool-evaluation-files", - "object_key": "GM12878_SRR1658581_1pc_1_R1.fastq.h10000" - }, - "fastq2": { - "bucket_name": "4dn-tool-evaluation-files", - "object_key": "GM12878_SRR1658581_1pc_1_R2.fastq.h10000" - }, - "bwa_index": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz" - } - }, - "input_parameters": { - "nThreads": 2 - } - }, - "config": { - "ebs_size": 30, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "t2.large", - "password": "dragonfly", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/test_json/unicorn/4dn_bwa.runonly.v1.json b/test_json/unicorn/4dn_bwa.runonly.v1.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_bwa.runonly.v1.local.json b/test_json/unicorn/4dn_bwa.runonly.v1.local.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_bwa.runonly.v1.singularity.json b/test_json/unicorn/4dn_bwa.runonly.v1.singularity.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_chip_seq_alignment.v1.json b/test_json/unicorn/4dn_chip_seq_alignment.v1.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_atacseq_test1.json b/test_json/unicorn/4dn_encode_atacseq_test1.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq-fail.json b/test_json/unicorn/4dn_encode_chipseq-fail.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq.json b/test_json/unicorn/4dn_encode_chipseq.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast.json b/test_json/unicorn/4dn_encode_chipseq_fast.json old mode 100644 new mode 100755 diff 
--git a/test_json/unicorn/4dn_encode_chipseq_fast2.json b/test_json/unicorn/4dn_encode_chipseq_fast2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast2_se.json b/test_json/unicorn/4dn_encode_chipseq_fast2_se.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast3.json b/test_json/unicorn/4dn_encode_chipseq_fast3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast4.json b/test_json/unicorn/4dn_encode_chipseq_fast4.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm2.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm2_se.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm2_se.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm3.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm3_se.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm3_se.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm_chip_1rep.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm_chip_1rep.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm_chip_1rep_se.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm_chip_1rep_se.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm_ctl_1rep.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm_ctl_1rep.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_mm_se.json b/test_json/unicorn/4dn_encode_chipseq_fast_mm_se.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_se2.json b/test_json/unicorn/4dn_encode_chipseq_fast_se2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_se3.json b/test_json/unicorn/4dn_encode_chipseq_fast_se3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_fast_se4.json b/test_json/unicorn/4dn_encode_chipseq_fast_se4.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_histone-pe-1rep-alignonly-chiponly.json b/test_json/unicorn/4dn_encode_chipseq_histone-pe-1rep-alignonly-chiponly.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_histone-pe-1rep-alignonly-ctlonly.json b/test_json/unicorn/4dn_encode_chipseq_histone-pe-1rep-alignonly-ctlonly.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_histone-pe-1rep-alignonly.json b/test_json/unicorn/4dn_encode_chipseq_histone-pe-1rep-alignonly.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_histone-se-2rep-alignonly.json b/test_json/unicorn/4dn_encode_chipseq_histone-se-2rep-alignonly.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_encode_chipseq_pe_rep2_aln.json b/test_json/unicorn/4dn_encode_chipseq_pe_rep2_aln.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_gatk.runonly.json b/test_json/unicorn/4dn_gatk.runonly.json deleted file mode 100644 index b82d63037..000000000 --- a/test_json/unicorn/4dn_gatk.runonly.json +++ 
/dev/null @@ -1,33 +0,0 @@ -{ - "config": { - - "instance_type" : "t2.medium", - "ebs_size" : 100, - "ebs_type" : "gp2", - "ebs_iops" : 5000, - "password": "hahaha", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized" : false, - "shutdown_min" : "now", - - - "log_bucket": "tibanna-output" - }, - "args" : { - "cwl" : "gatk-gvcf.cwl", - "cwl_children" : "", - "app_name" : "gatk-gvcf", - "app_version" : "", - "cwl_directory" : "https://raw.githubusercontent.com/hms-dbmi/4dn-dcic-workflow-codes/master/cwl/", - "cwl_version": "draft3", - "input_reference_files_directory" : "maestro-resources", - "output_S3_bucket" : "tibanna-output", - "input_files" : {"BAM": "test.2_1.bam", "BAM_BAI": "test.2_1.bai"}, - "secondary_files": {}, - "output_target": {}, - "secondary_output_target": {}, - "input_reference_files" : {"FASTA": "human_g1k_v37_decoy.fasta", "FASTA_FAI": "human_g1k_v37_decoy.fasta.fai", "FASTA_DICT": "human_g1k_v37_decoy.dict", "dbSNP": "dbsnp_138.b37.vcf", "dbSNP_IDX": "dbsnp_138.b37.vcf.idx"}, - "input_parameters" : {"region": "21", "prefix": "test.2_1", "ncore": 2, "mem": "4G"}, - "input_files_directory" : "tibanna-testinput" - } -} diff --git a/test_json/unicorn/4dn_hic_processing_bam-2.json b/test_json/unicorn/4dn_hic_processing_bam-2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_hic_processing_bam-3.json b/test_json/unicorn/4dn_hic_processing_bam-3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_hic_processing_bam-diff-v42.2.json b/test_json/unicorn/4dn_hic_processing_bam-diff-v42.2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_hic_processing_bam-diff-v43.json b/test_json/unicorn/4dn_hic_processing_bam-diff-v43.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_hic_processing_bam.json b/test_json/unicorn/4dn_hic_processing_bam.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_hicpairs_easy.runonly.json b/test_json/unicorn/4dn_hicpairs_easy.runonly.json deleted file mode 100644 index 1f61ddc07..000000000 --- a/test_json/unicorn/4dn_hicpairs_easy.runonly.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "app_name": "hi-c-processing-pairs", - "parameters": { - "ncores": 1, - "custom_res": "100000,200000,500000", - "min_res": 100000, - "maxmem": "8g" - }, - "args": { - "app_name": "hi-c-processing-pairs", - "input_parameters": { - "custom_res": "100000,200000,500000", - "min_res": 100000, - "ncores": 1, - "maxmem": "8g" - }, - "cwl_child_filenames": [ - "merge-pairs.cwl", - "addfragtopairs.cwl", - "pairs2hic.cwl", - "cooler.cwl", - "cool2mcool.cwl", - "extract-mcool-normvector-for-juicebox.cwl", - "add-hic-normvector-to-mcool.cwl" - ], - "output_target": { - "mcool": "976823d0-7f03-4f45-8aea-a464188937fb/4DNFI2SQQGU7.cool", - "merged_pairs": "f874820f-9b27-4018-8f3a-dab35455edc0/4DNFISY6XCC2.pairs.gz", - "hic": "593bef3f-eb63-4ef9-a371-c9cff0a29662/4DNFISNLMQWI.hic", - "cooler_normvector": "normvector" - }, - "secondary_output_target": { - "output_pairs": "f874820f-9b27-4018-8f3a-dab35455edc0/4DNFISY6XCC2.pairs.gz.px2" - }, - "cwl_main_filename": "hi-c-processing-pairs.cwl", - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": "dev", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/docker-4dn-hic/v42/cwl/", - "cwl_version": "draft3", - "input_files": { - "chromsizes": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": 
"4a6d10ee-2edb-4402-a98f-0edb1d58f5e9/4DNFI823LSII.chrom.sizes" - }, - "input_pairs": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-wfoutput", - "object_key": [ - "e0b32fa9-a54e-4f62-86dc-039f60b34812/4DNFIIQN4FKO.pairs.gz" - ] - }, - "restriction_file": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "4a6d10ee-2edb-4402-a98f-0edb1d582084/4DNFI823L812.txt" - } - } - }, - "_tibanna": { - "env": "fourfront-webdev", - "settings": { - "run_type": "hicpairs", - "run_id": "b1c18dee-d172-4b53-bf81-042a0124eeac", - "env": "fourfront-webdev", - "run_name": "hicpairs" - } - }, - "output_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "config": { - - "ebs_size": 100, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": "120", - "instance_type": "t2.large", - - - "password": "hahaha", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/test_json/unicorn/4dn_md5_v1.json b/test_json/unicorn/4dn_md5_v1.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_md5_v1.mount.json b/test_json/unicorn/4dn_md5_v1.mount.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_md5_v1_singularity.json b/test_json/unicorn/4dn_md5_v1_singularity.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_mergebed.json b/test_json/unicorn/4dn_mergebed.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_align_se.json b/test_json/unicorn/4dn_repliseq_align_se.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_align_se2.json b/test_json/unicorn/4dn_repliseq_align_se2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_align_se3.json b/test_json/unicorn/4dn_repliseq_align_se3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_align_se4.json b/test_json/unicorn/4dn_repliseq_align_se4.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_clip.json b/test_json/unicorn/4dn_repliseq_clip.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_clip2.json b/test_json/unicorn/4dn_repliseq_clip2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_clip3.json b/test_json/unicorn/4dn_repliseq_clip3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_clip4.json b/test_json/unicorn/4dn_repliseq_clip4.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_count.json b/test_json/unicorn/4dn_repliseq_count.json deleted file mode 100644 index b39338f96..000000000 --- a/test_json/unicorn/4dn_repliseq_count.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "args": { - "app_name": "count", - "input_parameters": {}, - "cwl_child_filenames": [], - "output_target": { - "out_count_bg": "gm12878_rep1_early.clipped.q20_sorted.rmdup.w5000.bg" - }, - "secondary_output_target": {}, - "cwl_main_filename": "count.cwl", - "secondary_files": {}, - "output_S3_bucket": "tibanna-testinput", - "app_version": "0.2.2", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/dev/cwl_awsem/repliseq/", - "cwl_version" : "draft-3", - "input_files": { - "input_bam": { - "bucket_name": "tibanna-testinput", - "object_key": "gm12878_rep1_early.clipped.q20_sorted.rmdup.bam" - }, - "chrsizes": { - "object_key": "4a6d10ee-2edb-4402-a98f-0edb1d58f5e9/4DNFI823LSII.chrom.sizes", - "bucket_name": 
"elasticbeanstalk-fourfront-webdev-files" - } - } - }, - "config": { - "ebs_size": 10, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 120, - "instance_type": "t3.medium", - "password": "dragonfly", - "log_bucket": "tibanna-output", - "key_name": "" - } -} diff --git a/test_json/unicorn/4dn_repliseq_count2.json b/test_json/unicorn/4dn_repliseq_count2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_count3.json b/test_json/unicorn/4dn_repliseq_count3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_count4.json b/test_json/unicorn/4dn_repliseq_count4.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_dedup.json b/test_json/unicorn/4dn_repliseq_dedup.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_dedup2.json b/test_json/unicorn/4dn_repliseq_dedup2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_dedup3.json b/test_json/unicorn/4dn_repliseq_dedup3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_dedup4.json b/test_json/unicorn/4dn_repliseq_dedup4.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_filter.json b/test_json/unicorn/4dn_repliseq_filter.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_filter2.json b/test_json/unicorn/4dn_repliseq_filter2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_filter3.json b/test_json/unicorn/4dn_repliseq_filter3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_filter4.json b/test_json/unicorn/4dn_repliseq_filter4.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_filtersort.json b/test_json/unicorn/4dn_repliseq_filtersort.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_filtersort2.json b/test_json/unicorn/4dn_repliseq_filtersort2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_filtersort3.json b/test_json/unicorn/4dn_repliseq_filtersort3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_filtersort4.json b/test_json/unicorn/4dn_repliseq_filtersort4.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_log2ratio.json b/test_json/unicorn/4dn_repliseq_log2ratio.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_log2ratio2.json b/test_json/unicorn/4dn_repliseq_log2ratio2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_make_filteredbed.json b/test_json/unicorn/4dn_repliseq_make_filteredbed.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_make_referencebg.json b/test_json/unicorn/4dn_repliseq_make_referencebg.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_normalize.json b/test_json/unicorn/4dn_repliseq_normalize.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_normalize2.json b/test_json/unicorn/4dn_repliseq_normalize2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_parta.json b/test_json/unicorn/4dn_repliseq_parta.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_parta.v15.pe_size1_nthread1.json b/test_json/unicorn/4dn_repliseq_parta.v15.pe_size1_nthread1.json old mode 100644 new mode 100755 diff 
--git a/test_json/unicorn/4dn_repliseq_parta.v15.se_size1_nthread1.json b/test_json/unicorn/4dn_repliseq_parta.v15.se_size1_nthread1.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_parta.v16.se_size1_nthread1.json b/test_json/unicorn/4dn_repliseq_parta.v16.se_size1_nthread1.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_smooth.json b/test_json/unicorn/4dn_repliseq_smooth.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/4dn_repliseq_smooth2.json b/test_json/unicorn/4dn_repliseq_smooth2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/awsf.cwlout.json b/test_json/unicorn/awsf.cwlout.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/awsf.postrun.json b/test_json/unicorn/awsf.postrun.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/awsf.run.json b/test_json/unicorn/awsf.run.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/md5-leelab.json b/test_json/unicorn/md5-leelab.json deleted file mode 100644 index f982201dd..000000000 --- a/test_json/unicorn/md5-leelab.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "leelab-datafiles", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "leelab-datafiles", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - } - }, - "config": { - "ebs_size": 10, - "ebs_type": "io1", - "json_bucket": "leelab-tibanna-log", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": "now", - "instance_type": "t2.micro", - "cloudwatch_dashboard" : true, - "password": "dragonfly", - "log_bucket": "leelab-tibanna-log", - "key_name": "" - }, - "_tibanna": { - "run_type": "md5_test" - } -} diff --git a/test_json/unicorn/md5-profile-leelab.json b/test_json/unicorn/md5-profile-leelab.json deleted file mode 100644 index 995338aa1..000000000 --- a/test_json/unicorn/md5-profile-leelab.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "leelab-datafiles", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "sscwgs", - "object_key": "11002/BAM/Sample_SSC03070/analysis/SSC03070.final.bai", - "profile": "user1" - } - } - }, - "config": { - "ebs_size": 10, - "ebs_type": "io1", - "json_bucket": "leelab-tibanna-log", - "EBS_optimized": false, - "ebs_iops": 500, - "shutdown_min": 120, - "instance_type": "t2.micro", - - - "password": "dragonfly", - "log_bucket": "leelab-tibanna-log", - "key_name": "" - } -} diff --git a/test_json/unicorn/my_test_tibanna_bucket.json b/test_json/unicorn/my_test_tibanna_bucket.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/my_test_tibanna_bucket2.json b/test_json/unicorn/my_test_tibanna_bucket2.json old 
mode 100644 new mode 100755 diff --git a/test_json/unicorn/my_test_tibanna_bucket3.json b/test_json/unicorn/my_test_tibanna_bucket3.json deleted file mode 100644 index 38ce57055..000000000 --- a/test_json/unicorn/my_test_tibanna_bucket3.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "my_outdir/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "4dntest-tibanna-data", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "4dntest-tibanna-data", - "object_key": "somefastqfile.fastq.gz" - } - } - }, - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dntest-tibanna-log", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 120, - "instance_type": "", - - - "password": "dragonfly", - "log_bucket": "4dntest-tibanna-log", - "key_name": "" - } -} diff --git a/test_json/unicorn/shelltest-ecr.json b/test_json/unicorn/shelltest-ecr.json old mode 100644 new mode 100755 index 31770a7b2..aa1343a1b --- a/test_json/unicorn/shelltest-ecr.json +++ b/test_json/unicorn/shelltest-ecr.json @@ -13,7 +13,7 @@ "output_S3_bucket": "soos-4dn-bucket", "app_version": "5", "input_files": { - "file:///data1/shell/somefile": "s3://soos-4dn-bucket/4DNFIITTJYNR.mcool.bins.juicerformat.gz" + "file:///data1/shell/somefile": "s3://soos-4dn-bucket/hg38.blacklist.bed.gz" }, "input_parameters": { }, diff --git a/test_json/unicorn/shelltest-soo10k.json b/test_json/unicorn/shelltest-soo10k.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/shelltest-zonetest-soo10k.json b/test_json/unicorn/shelltest-zonetest-soo10k.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/shelltest.json b/test_json/unicorn/shelltest.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/shelltest2.json b/test_json/unicorn/shelltest2.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/shelltest3.json b/test_json/unicorn/shelltest3.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/shelltest4.json b/test_json/unicorn/shelltest4.json old mode 100644 new mode 100755 index 50c1d955c..fd8c041fe --- a/test_json/unicorn/shelltest4.json +++ b/test_json/unicorn/shelltest4.json @@ -13,7 +13,7 @@ "output_S3_bucket": "soos-4dn-bucket", "app_version": "5", "input_files": { - "file:///data1/shell/somefile": "s3://soos-4dn-bucket/4DNFIITTJYNR.mcool.bins.juicerformat.gz" + "file:///data1/shell/somefile": "s3://soos-4dn-bucket/hg38.blacklist.bed.gz" }, "input_parameters": { }, @@ -28,7 +28,6 @@ "EBS_optimized": false, "log_bucket": "tibanna-output", "key_name": "4dn-encode", - "cloudwatch_dashboard": false, "subnet": "subnet-efb1b3c4" } } diff --git a/test_json/unicorn/snakemaketest.json b/test_json/unicorn/snakemaketest.json old mode 100644 new mode 100755 index 755c0bfaa..d39f30d5d --- a/test_json/unicorn/snakemaketest.json +++ b/test_json/unicorn/snakemaketest.json @@ -6,7 +6,7 @@ "command": "snakemake", "snakemake_main_filename": "Snakefile", "snakemake_directory_local": "tests/files/snakemake/", - "container_image": "quay.io/snakemake/snakemake", + "container_image": "snakemake/snakemake", "output_target": { "file:///data1/out/lala": "snakemake-test-output-lala" }, diff --git a/test_json/unicorn/snakemaketest2.json 
b/test_json/unicorn/snakemaketest2.json old mode 100644 new mode 100755 index 40d7f9c36..897caf9d6 --- a/test_json/unicorn/snakemaketest2.json +++ b/test_json/unicorn/snakemaketest2.json @@ -6,7 +6,7 @@ "command": "snakemake step2", "snakemake_main_filename": "Snakefile", "snakemake_directory_local": "tests/files/snakemake2/", - "container_image": "quay.io/snakemake/snakemake", + "container_image": "snakemake/snakemake", "output_target": { "file:///data1/out/lalala": "snakemake-test-output-lalala" }, diff --git a/test_json/unicorn/snakemaketest2b.json b/test_json/unicorn/snakemaketest2b.json old mode 100644 new mode 100755 index ef39f4533..a90791943 --- a/test_json/unicorn/snakemaketest2b.json +++ b/test_json/unicorn/snakemaketest2b.json @@ -6,7 +6,7 @@ "command": "snakemake step2", "snakemake_main_filename": "Snakefile", "snakemake_directory_local": "tests/files/snakemake2b/", - "container_image": "quay.io/snakemake/snakemake", + "container_image": "snakemake/snakemake", "output_target": { "file:///data1/snakemake/lalala": "snakemake-test-output-lalala" }, diff --git a/test_json/unicorn/snakemaketest2c.json b/test_json/unicorn/snakemaketest2c.json old mode 100644 new mode 100755 index 55a79c07a..204083679 --- a/test_json/unicorn/snakemaketest2c.json +++ b/test_json/unicorn/snakemaketest2c.json @@ -6,7 +6,7 @@ "command": "snakemake step2 step3", "snakemake_main_filename": "Snakefile", "snakemake_directory_local": "tests/files/snakemake2b/", - "container_image": "quay.io/snakemake/snakemake", + "container_image": "snakemake/snakemake", "output_target": { "file:///data1/snakemake/lalala": "snakemake-test-output-lalala", "file:///data1/snakemake/lalalala": "snakemake-test-output-lalalala" diff --git a/test_json/unicorn/snakemaketest3.json b/test_json/unicorn/snakemaketest3.json old mode 100644 new mode 100755 index a678489ce..81a28a83c --- a/test_json/unicorn/snakemaketest3.json +++ b/test_json/unicorn/snakemaketest3.json @@ -6,7 +6,7 @@ "command": "snakemake", "snakemake_main_filename": "Snakefile", "snakemake_directory_local": "tests/files/snakemake3/", - "container_image": "quay.io/snakemake/snakemake", + "container_image": "snakemake/snakemake", "output_target": { "file:///data1/snakemake/lalalala": "snakemake-test3-output-lalalala" }, diff --git a/test_json/unicorn/snakemaketest4.json b/test_json/unicorn/snakemaketest4.json old mode 100644 new mode 100755 index b43cf558c..3f439b78c --- a/test_json/unicorn/snakemaketest4.json +++ b/test_json/unicorn/snakemaketest4.json @@ -3,10 +3,10 @@ "app_name": "snakemake-test", "input_parameters": {}, "language": "snakemake", - "command": "snakemake", + "command": "snakemake --cores all", "snakemake_main_filename": "Snakefile", "snakemake_directory_local": "tests/files/snakemake4/", - "container_image": "quay.io/snakemake/snakemake", + "container_image": "snakemake/snakemake", "output_target": { "file:///data1/snakemake/lalalala": "snakemake-test4-output-lalalala" }, @@ -21,7 +21,8 @@ "ebs_size": 10, "EBS_optimized": false, "shutdown_min": 30, - "instance_type": "t3.micro", + "cpu": 1, + "mem": 1, "log_bucket": "tibanna-output", "key_name": "4dn-encode", "cloudwatch_dashboard": false diff --git a/test_json/unicorn/snakemaketest4b.json b/test_json/unicorn/snakemaketest4b.json old mode 100644 new mode 100755 index 0e7498f23..191dffc57 --- a/test_json/unicorn/snakemaketest4b.json +++ b/test_json/unicorn/snakemaketest4b.json @@ -6,7 +6,7 @@ "command": "snakemake", "snakemake_main_filename": "Snakefile", "snakemake_directory_local": 
"tests/files/snakemake4/", - "container_image": "quay.io/snakemake/snakemake", + "container_image": "snakemake/snakemake", "output_target": {}, "secondary_output_target": {}, "secondary_files": {}, diff --git a/test_json/unicorn/suwang_filter.json b/test_json/unicorn/suwang_filter.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/suwang_markasdup.json b/test_json/unicorn/suwang_markasdup.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/suwang_pairsam_merge.json b/test_json/unicorn/suwang_pairsam_merge.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/suwang_pairsam_parse_sort.json b/test_json/unicorn/suwang_pairsam_parse_sort.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/tmojson b/test_json/unicorn/tmojson old mode 100644 new mode 100755 diff --git a/test_json/unicorn/xtea-leelab.json b/test_json/unicorn/xtea-leelab.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/xtea-profile-leelab-hg38.json b/test_json/unicorn/xtea-profile-leelab-hg38.json old mode 100644 new mode 100755 diff --git a/test_json/unicorn/xtea-profile-leelab.json b/test_json/unicorn/xtea-profile-leelab.json old mode 100644 new mode 100755 diff --git a/tests/__init__.py b/tests/__init__.py old mode 100644 new mode 100755 diff --git a/tests/additional_manual_tests b/tests/additional_manual_tests old mode 100644 new mode 100755 diff --git a/tests/awsf/Ffbc4BqvRYIq.run.json b/tests/awsf/Ffbc4BqvRYIq.run.json old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzq.log.json b/tests/awsf/bqLd8oa7Tdzq.log.json old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzq.md5sum.txt b/tests/awsf/bqLd8oa7Tdzq.md5sum.txt old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzq.postrun.json b/tests/awsf/bqLd8oa7Tdzq.postrun.json old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzq.postrun.json.out b/tests/awsf/bqLd8oa7Tdzq.postrun.json.out old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzr.log b/tests/awsf/bqLd8oa7Tdzr.log old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzr.md5sum.txt b/tests/awsf/bqLd8oa7Tdzr.md5sum.txt old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzr.postrun.json.out b/tests/awsf/bqLd8oa7Tdzr.postrun.json.out old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzs.log b/tests/awsf/bqLd8oa7Tdzs.log old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzs.md5sum.txt b/tests/awsf/bqLd8oa7Tdzs.md5sum.txt old mode 100644 new mode 100755 diff --git a/tests/awsf/bqLd8oa7Tdzs.postrun.json.out b/tests/awsf/bqLd8oa7Tdzs.postrun.json.out old mode 100644 new mode 100755 diff --git a/tests/awsf/haha b/tests/awsf/haha old mode 100644 new mode 100755 diff --git a/tests/awsf/mydir/dir2/haha3 b/tests/awsf/mydir/dir2/haha3 old mode 100644 new mode 100755 diff --git a/tests/awsf/mydir/haha b/tests/awsf/mydir/haha old mode 100644 new mode 100755 diff --git a/tests/awsf/mydir/haha2 b/tests/awsf/mydir/haha2 old mode 100644 new mode 100755 diff --git a/tests/awsf/out.rmdup.bam b/tests/awsf/out.rmdup.bam old mode 100644 new mode 100755 diff --git a/tests/awsf/out.rmdup.bam.bai b/tests/awsf/out.rmdup.bam.bai old mode 100644 new mode 100755 diff --git a/tests/awsf/out.rmdup.log.qc_report.zip b/tests/awsf/out.rmdup.log.qc_report.zip old mode 100644 new mode 100755 diff --git a/tests/awsf/out.w5000.bedGraph.gz b/tests/awsf/out.w5000.bedGraph.gz old mode 100644 new mode 100755 diff --git a/tests/awsf/out.w5000.bedGraph.gz.px2 
b/tests/awsf/out.w5000.bedGraph.gz.px2 old mode 100644 new mode 100755 diff --git a/tests/awsf/out.w5000.bw b/tests/awsf/out.w5000.bw old mode 100644 new mode 100755 diff --git a/tests/awsf/qD9zkkqAWjnE.run.json b/tests/awsf/qD9zkkqAWjnE.run.json old mode 100644 new mode 100755 diff --git a/tests/awsf/test.py b/tests/awsf/test.py old mode 100644 new mode 100755 diff --git a/tests/awsf/wdl/a3T0RlZ09WuR.log.json b/tests/awsf/wdl/a3T0RlZ09WuR.log.json old mode 100644 new mode 100755 diff --git a/tests/awsf/wdl/a3T0RlZ09WuR.md5sum.txt b/tests/awsf/wdl/a3T0RlZ09WuR.md5sum.txt old mode 100644 new mode 100755 diff --git a/tests/awsf/wdl/a3T0RlZ09WuR.postrun.json b/tests/awsf/wdl/a3T0RlZ09WuR.postrun.json old mode 100644 new mode 100755 diff --git a/tests/awsf/wdl/a3T0RlZ09WuS.log.json b/tests/awsf/wdl/a3T0RlZ09WuS.log.json old mode 100644 new mode 100755 diff --git a/tests/awsf/wdl/a3T0RlZ09WuS.md5sum.txt b/tests/awsf/wdl/a3T0RlZ09WuS.md5sum.txt old mode 100644 new mode 100755 diff --git a/tests/awsf/wdl/pzn3Us98y21I.run.json b/tests/awsf/wdl/pzn3Us98y21I.run.json old mode 100644 new mode 100755 diff --git a/tests/awsf/wdl/uGeIte1giKxt.run.json b/tests/awsf/wdl/uGeIte1giKxt.run.json old mode 100644 new mode 100755 diff --git a/tests/awsf3/__init__.py b/tests/awsf3/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/tests/awsf3/conftest.py b/tests/awsf3/conftest.py new file mode 100755 index 000000000..32230d19d --- /dev/null +++ b/tests/awsf3/conftest.py @@ -0,0 +1,22 @@ +import sys +import io + + +upload_test_bucket = 'tibanna-output' + + +class CaptureOut: + """Context manager for capturing stdout, since capsys somehow didn't work""" + def __init__(self): + pass + + def get_captured_out(self): + return self.captured_stdout.getvalue() + + def __enter__(self): + self.old_stdout = sys.stdout + self.captured_stdout = io.StringIO() + sys.stdout = self.captured_stdout + + def __exit__(self, type, value, tb): + sys.stdout = self.old_stdout diff --git a/tests/awsf3/run_task_events/fastqc_unzip.json b/tests/awsf3/run_task_events/fastqc_unzip.json new file mode 100644 index 000000000..de5fc6dfa --- /dev/null +++ b/tests/awsf3/run_task_events/fastqc_unzip.json @@ -0,0 +1,46 @@ +{ + "config": { + "ebs_size": 10, + "ebs_type": "gp2", + "ebs_iops": "", + "instance_type": "t3.small", + "shutdown_min": 0, + "log_bucket": "tibanna-output", + "key_name": "4dn-encode", + "public_postrun_json": true, + "email": false + }, + "jobid": "tyzA7bsM21IE", + "args": { + "app_name": "fastqc", + "app_version": "v2", + "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/docker-fastqc/v2/cwl/", + "cwl_main_filename": "fastqc.cwl", + "cwl_child_filenames": "", + "wdl_directory_url": "", + "wdl_main_filename": "", + "wdl_child_filenames": "", + "cwl_version": "v1", + "input_parameters": {}, + "additional_benchmarking_parameters": {}, + "output_S3_bucket": "soos-4dn-bucket", + "output_target": { + "report_zip": { + "object_prefix": "test_zip", + "unzip": true + } + }, + "secondary_output_target": {}, + "input_files": { + "input_fastq": { + "bucket_name": "elasticbeanstalk-fourfront-cgap-files", + "rename": "", + "unzip": "", + "mount": true, + "object_key": "9ec06a34-ec71-4e92-8a0c-8107bcb6912a/GAPFIV4YX618.fastq.gz" + } + }, + "secondary_files": {}, + "custom_errors": [] + } +} diff --git a/tests/awsf3/run_task_events/genotypeGVCFs.json b/tests/awsf3/run_task_events/genotypeGVCFs.json new file mode 100644 index 000000000..096ac220a --- /dev/null +++ 
b/tests/awsf3/run_task_events/genotypeGVCFs.json @@ -0,0 +1,98 @@ +{ + "config": { + "instance_type": "c5n.4xlarge", + "ebs_size": "1.5x", + "root_ebs_size": 15, + "EBS_optimized": true, + "spot_instance": false, + "log_bucket": "tibanna-output", + "key_name": "4dn-encode", + "public_postrun_json": true, + "behavior_on_capacity_limit": "wait_and_retry", + "run_name": "workflow_gatk-GenotypeGVCFs-check_GAPFIP42679E-638e86bc-7670-4bf1-85d7-1642450f", + "shutdown_min": "now", + "overwrite_input_extra": false, + "email": false + }, + "jobid": "OiHYCN1QoEiP", + "args": { + "app_name": "workflow_gatk-GenotypeGVCFs-check", + "app_version": "v17", + "cwl_directory_url": "https://raw.githubusercontent.com/dbmi-bgm/cgap-pipeline/v17/cwl", + "cwl_main_filename": "workflow_gatk-GenotypeGVCFs_plus_vcf-integrity-check.cwl", + "cwl_child_filenames": [ + "gatk-GenotypeGVCFs.cwl", + "vcf-integrity-check.cwl" + ], + "wdl_directory_url": "", + "wdl_main_filename": "", + "wdl_child_filenames": "", + "cwl_version": "v1", + "input_parameters": {}, + "additional_benchmarking_parameters": {}, + "output_S3_bucket": "elasticbeanstalk-fourfront-cgapwolf-wfoutput", + "output_target": { + "vcf": "test/test.vcf.gz", + "vcf-check": "test/vcf-check-test" + }, + "secondary_output_target": { + "vcf": [ + "test/test.vcf.gz.tbi" + ] + }, + "input_files": { + "chromosomes": { + "bucket_name": "elasticbeanstalk-fourfront-cgapwolf-files", + "rename": "", + "unzip": "", + "mount": false, + "object_key": "a1d504ee-a313-4064-b6ae-65fed9738980/GAPFIGJVJDUY.txt" + }, + "input_gvcf": { + "bucket_name": "elasticbeanstalk-fourfront-cgapwolf-wfoutput", + "rename": "", + "unzip": "", + "mount": false, + "object_key": "1b690f4b-e9ec-4a3a-a3ea-5499fd9b7623/GAPFIP42679E.gvcf.gz" + }, + "known-sites-snp": { + "bucket_name": "elasticbeanstalk-fourfront-cgapwolf-files", + "rename": "", + "unzip": "", + "mount": false, + "object_key": "8ed35691-0af4-467a-adbc-81eb088549f0/GAPFI4LJRN98.vcf.gz" + }, + "reference": { + "bucket_name": "elasticbeanstalk-fourfront-cgapwolf-files", + "rename": "", + "unzip": "", + "mount": false, + "object_key": "1936f246-22e1-45dc-bb5c-9cfd55537fe7/GAPFIXRDPDK5.fa" + } + }, + "secondary_files": { + "input_gvcf": { + "bucket_name": "elasticbeanstalk-fourfront-cgapwolf-wfoutput", + "rename": "", + "mount": false, + "object_key": "1b690f4b-e9ec-4a3a-a3ea-5499fd9b7623/GAPFIP42679E.gvcf.gz.tbi" + }, + "known-sites-snp": { + "bucket_name": "elasticbeanstalk-fourfront-cgapwolf-files", + "rename": "", + "mount": false, + "object_key": "8ed35691-0af4-467a-adbc-81eb088549f0/GAPFI4LJRN98.vcf.gz.tbi" + }, + "reference": { + "bucket_name": "elasticbeanstalk-fourfront-cgapwolf-files", + "rename": "", + "mount": false, + "object_key": [ + "1936f246-22e1-45dc-bb5c-9cfd55537fe7/GAPFIXRDPDK5.dict", + "1936f246-22e1-45dc-bb5c-9cfd55537fe7/GAPFIXRDPDK5.fa.fai" + ] + } + }, + "custom_errors": [] + } +} diff --git a/tests/awsf3/run_task_events/md5.json b/tests/awsf3/run_task_events/md5.json new file mode 100644 index 000000000..e846eee89 --- /dev/null +++ b/tests/awsf3/run_task_events/md5.json @@ -0,0 +1,39 @@ +{ + "jobid": "T1dScyWtr3aZ", + "args": { + "app_name": "md5", + "app_version": "0.2.6", + "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.2.6/cwl_awsem_v1/", + "cwl_main_filename": "md5.cwl", + "cwl_child_filenames": [], + "cwl_version": "v1", + "input_parameters": {}, + "output_S3_bucket": "soos-4dn-bucket", + "output_target": { + "report": "test_report" + }, + "input_files": { + "input_file": { + 
"bucket_name": "elasticbeanstalk-fourfront-cgap-wfoutput", + "rename": "", + "unzip": "", + "mount": false, + "object_key": "4bd46ae0-9e85-42d1-b61d-924b8d14a590/GAPFICJTMTXJ.vcf.gz.tbi", + "format_if_extra": "vcf_gz_tbi" + } + }, + "language": "cwl_v1" + }, + "config": { + "ebs_size": 10, + "ebs_type": "gp2", + "shutdown_min": "now", + "log_bucket": "tibanna-output", + "key_name": "4dn-encode", + "public_postrun_json": true, + "email": false, + "instance_type": "t3.micro", + "EBS_optimized": true, + "spot_instance": false + } +} diff --git a/tests/awsf3/run_task_events/md5_wdl.json b/tests/awsf3/run_task_events/md5_wdl.json new file mode 100644 index 000000000..c6c7f948c --- /dev/null +++ b/tests/awsf3/run_task_events/md5_wdl.json @@ -0,0 +1,30 @@ +{ + "args": { + "wdl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/tibanna/master/examples/md5", + "wdl_main_filename": "md5.wdl", + "wdl_child_filenames": [], + "language": "wdl", + "input_files": { + "md5.md5_step.gzfile": { + "bucket_name": "my-tibanna-test-input-bucket", + "object_key": "somefastqfile.fastq.gz" + } + }, + "secondary_files": {}, + "input_parameters": {}, + "output_S3_bucket": "my-tibanna-test-bucket", + "output_target": { + "md5.md5_step.report": "some_sub_dirname/my_first_md5_report" + }, + "secondary_output_target": {} + }, + "config": { + "shutdown_min": 30, + "ebs_size": 10, + "EBS_optimized": false, + "instance_type": "t3.small", + "password": "whateverpasswordworks", + "log_bucket": "my-tibanna-test-bucket" + }, + "jobid": "tvfZLFlt3PBz" +} diff --git a/tests/awsf3/run_task_events/shelltest.json b/tests/awsf3/run_task_events/shelltest.json new file mode 100644 index 000000000..1ffbe91fc --- /dev/null +++ b/tests/awsf3/run_task_events/shelltest.json @@ -0,0 +1,33 @@ +{ + "args": { + "app_name": "shell-test", + "input_parameters": {}, + "language": "shell", + "command": "\\$ECHO_COMMAND \"haha\" > /data1/out/shell-test-output; ls -l somefile >> /data1/out/shell-test-output", + "container_image": "ubuntu:16.04", + "output_target": { + "file:///data1/out/shell-test-output": "s3://soos-4dn-bucket/shell-test-output" + }, + "secondary_output_target": {}, + "secondary_files": {}, + "output_S3_bucket": "soos-4dn-bucket", + "app_version": "5", + "input_files": { + "file:///data1/shell/somefile": "s3://soos-4dn-bucket/hg38.blacklist.bed.gz" + }, + "input_parameters": { + }, + "input_env": { + "ECHO_COMMAND": "echo" + } + }, + "config": { + "mem": 2, + "cpu": 1, + "ebs_size": 10, + "EBS_optimized": false, + "log_bucket": "tibanna-output", + "key_name": "4dn-encode", + "subnet": "subnet-efb1b3c4" + } +} diff --git a/tests/awsf3/test_files/some_test_dir_to_upload/dir1/file1 b/tests/awsf3/test_files/some_test_dir_to_upload/dir1/file1 new file mode 100755 index 000000000..81c545efe --- /dev/null +++ b/tests/awsf3/test_files/some_test_dir_to_upload/dir1/file1 @@ -0,0 +1 @@ +1234 diff --git a/tests/awsf3/test_files/some_test_dir_to_upload/file1 b/tests/awsf3/test_files/some_test_dir_to_upload/file1 new file mode 100755 index 000000000..5ad28e227 --- /dev/null +++ b/tests/awsf3/test_files/some_test_dir_to_upload/file1 @@ -0,0 +1 @@ +haha diff --git a/tests/awsf3/test_files/some_test_dir_to_upload/file2 b/tests/awsf3/test_files/some_test_dir_to_upload/file2 new file mode 100755 index 000000000..224fd2305 --- /dev/null +++ b/tests/awsf3/test_files/some_test_dir_to_upload/file2 @@ -0,0 +1 @@ +lala diff --git a/tests/awsf3/test_files/some_test_file_to_upload b/tests/awsf3/test_files/some_test_file_to_upload new file mode 100755 
index 000000000..acbe86c7c --- /dev/null +++ b/tests/awsf3/test_files/some_test_file_to_upload @@ -0,0 +1 @@ +abcd diff --git a/tests/awsf3/test_files/some_test_file_to_upload2 b/tests/awsf3/test_files/some_test_file_to_upload2 new file mode 100644 index 000000000..81c545efe --- /dev/null +++ b/tests/awsf3/test_files/some_test_file_to_upload2 @@ -0,0 +1 @@ +1234 diff --git a/tests/awsf3/test_files/some_test_file_to_upload3.abc b/tests/awsf3/test_files/some_test_file_to_upload3.abc new file mode 100644 index 000000000..81c545efe --- /dev/null +++ b/tests/awsf3/test_files/some_test_file_to_upload3.abc @@ -0,0 +1 @@ +1234 diff --git a/tests/awsf3/test_files/some_test_file_to_upload3.def b/tests/awsf3/test_files/some_test_file_to_upload3.def new file mode 100644 index 000000000..81c545efe --- /dev/null +++ b/tests/awsf3/test_files/some_test_file_to_upload3.def @@ -0,0 +1 @@ +1234 diff --git a/tests/awsf3/test_files/some_test_file_to_upload3.ghi b/tests/awsf3/test_files/some_test_file_to_upload3.ghi new file mode 100644 index 000000000..81c545efe --- /dev/null +++ b/tests/awsf3/test_files/some_test_file_to_upload3.ghi @@ -0,0 +1 @@ +1234 diff --git a/tests/awsf3/test_files/some_zip_file_to_upload.zip b/tests/awsf3/test_files/some_zip_file_to_upload.zip new file mode 100755 index 000000000..707c647c1 Binary files /dev/null and b/tests/awsf3/test_files/some_zip_file_to_upload.zip differ diff --git a/tests/awsf3/test_target.py b/tests/awsf3/test_target.py new file mode 100755 index 000000000..4de0b030b --- /dev/null +++ b/tests/awsf3/test_target.py @@ -0,0 +1,435 @@ +import io +import boto3 +import pytest +from awsf3.target import ( + Target, + SecondaryTarget, + SecondaryTargetList +) +from tibanna.awsem import ( + AwsemPostRunJsonOutputFile +) +from tests.awsf3.conftest import upload_test_bucket, CaptureOut + + +def test_target_init(): + target = Target('some_bucket') + assert target.bucket == 'some_bucket' + assert target.unzip is False + assert target.is_valid is False # source/dest not set yet + + +def test_target_parse_target_value_str_object_key(): + target = Target('some_bucket') + target.parse_target_value('some_object_key') + assert target.dest == 'some_object_key' + assert target.bucket == 'some_bucket' + assert target.unzip is False + + +def test_target_parse_target_value_str_url(): + target = Target('some_bucket') + target.parse_target_value('s3://another_bucket/some_object_key') + assert target.dest == 'some_object_key' + assert target.bucket == 'another_bucket' + assert target.unzip is False + + +def test_target_parse_target_value_dict_object_key(): + target = Target('some_bucket') + target.parse_target_value({'object_key': 'some_object_key'}) + assert target.dest == 'some_object_key' + assert target.bucket == 'some_bucket' + assert target.unzip is False + + +def test_target_parse_target_value_dict_object_key_err(): + target = Target('some_bucket') + with pytest.raises(Exception) as ex: + target.parse_target_value({'object_key': 'some_object_key/'}) + assert 'object_prefix' in str(ex.value) + + +def test_target_parse_target_value_dict_object_key_bucket(): + target = Target('some_bucket') + target.parse_target_value({'object_key': 'some_object_key', 'bucket_name': 'another_bucket'}) + assert target.dest == 'some_object_key' + assert target.bucket == 'another_bucket' + assert target.unzip is False + + +def test_target_parse_target_value_dict_object_prefix(): + target = Target('some_bucket') + target.parse_target_value({'object_prefix': 'some_dir/'}) + assert target.dest == 'some_dir/' 
+ assert target.bucket == 'some_bucket' + assert target.unzip is False + + +def test_target_parse_target_value_dict_object_prefix_wo_slash(): + target = Target('some_bucket') + target.parse_target_value({'object_prefix': 'some_dir'}) + assert target.dest == 'some_dir/' + assert target.bucket == 'some_bucket' + assert target.unzip is False + + +def test_target_parse_target_value_unzip(): + target = Target('some_bucket') + target.parse_target_value({'unzip': True, 'object_prefix': 'some_dir/'}) + assert target.dest == 'some_dir/' + assert target.bucket == 'some_bucket' + assert target.unzip is True + + +def test_target_parse_target_value_unzip_wo_prefix(): + target = Target('some_bucket') + with pytest.raises(Exception) as ex: + target.parse_target_value({'unzip': True, 'object_key': 'some_object_key'}) + assert 'prefix' in str(ex.value) + + +def test_target_parse_target_value_object_key_prefix_conflict(): + target = Target('some_bucket') + with pytest.raises(Exception) as ex: + target.parse_target_value({'object_prefix': 'some_dir/', 'object_key': 'some_object_key'}) + assert 'not both' in str(ex.value) + + +def test_target_parse_custom_target_str_object_key(): + target = Target('some_bucket') + target.parse_custom_target(target_key='file:///data1/out/somefile', + target_value='some_object_key') + assert target.dest == 'some_object_key' + assert target.bucket == 'some_bucket' + assert target.unzip is False + assert target.source == '/data1/out/somefile' + assert target.source_name == 'somefile' + + +def test_target_parse_custom_target_str_url(): + target = Target('some_bucket') + target.parse_custom_target(target_key='file:///data1/out/somefile', + target_value='s3://another_bucket/some_object_key') + assert target.dest == 'some_object_key' + assert target.bucket == 'another_bucket' + assert target.unzip is False + assert target.source == '/data1/out/somefile' + + +def test_target_parse_custom_target_dict_object_key(): + target = Target('some_bucket') + target.parse_custom_target(target_key='file:///data1/out/somefile', + target_value={'object_key': 'some_object_key'}) + assert target.dest == 'some_object_key' + assert target.bucket == 'some_bucket' + assert target.unzip is False + assert target.source == '/data1/out/somefile' + + +def test_target_parse_custom_target_null_target_value(): + # target value must be given + target = Target('some_bucket') + with pytest.raises(Exception) as ex: + target.parse_custom_target(target_key='file:///data1/out/somefile', + target_value=None) + assert 'target' in str(ex.value) + + +def test_target_parse_cwl_target_str_object_key(): + target = Target('some_bucket') + outfile = AwsemPostRunJsonOutputFile(**{'path': '/data1/out/1/somefile'}) + target.parse_cwl_target(target_key='some_argname', + target_value='some_object_key', + prj_output_files={'some_argname': outfile}) + assert target.dest == 'some_object_key' + assert target.bucket == 'some_bucket' + assert target.unzip is False + assert target.source == '/data1/out/1/somefile' + + +def test_target_parse_cwl_target_dict_object_key(): + target = Target('some_bucket') + outfile = AwsemPostRunJsonOutputFile(**{'path': '/data1/out/1/somefile'}) + target.parse_cwl_target(target_key='some_argname', + target_value={'object_key': 'some_object_key'}, + prj_output_files={'some_argname': outfile}) + assert target.dest == 'some_object_key' + assert target.bucket == 'some_bucket' + assert target.unzip is False + assert target.source == '/data1/out/1/somefile' + + +def test_target_parse_cwl_target_null_target_value(): 
+    target = Target('some_bucket')
+    outfile = AwsemPostRunJsonOutputFile(**{'path': '/data1/out/1/somefile'})
+    target.parse_cwl_target(target_key='some_argname',
+                            target_value=None,
+                            prj_output_files={'some_argname': outfile})
+    # use sourcename as dest if target_value is not given
+    assert target.dest == '1/somefile'
+    assert target.bucket == 'some_bucket'
+    assert target.unzip is False
+    assert target.source == '/data1/out/1/somefile'
+
+
+def test_target_sourcename():
+    target = Target('some_bucket')
+    target.source = '/data1/out/somefile'
+    assert target.source_name == 'somefile'
+
+
+def test_target_sourcename2():
+    target = Target('some_bucket')
+    target.source = '/data1/out/1/somefile'
+    assert target.source_name == '1/somefile'
+
+
+def test_target_sourcename3():
+    target = Target('some_bucket')
+    target.source = '/data1/shell/1/somefile'
+    assert target.source_name == '1/somefile'
+
+
+def test_target_sourcename4():
+    target = Target('some_bucket')
+    target.source = '/data1/whatever/1/somefile'
+    assert target.source_name == 'whatever/1/somefile'
+
+
+def test_target_is_valid():
+    target = Target('some_bucket')
+    target.source = '/data1/whatever/1/somefile'
+    target.dest = 'some_dest'
+    assert target.is_valid is True
+
+
+def test_target_is_valid_no_dest():
+    target = Target('some_bucket')
+    target.source = '/data1/whatever/1/somefile'
+    assert target.is_valid is False  # no destination set
+
+
+def test_target_is_valid_no_source():
+    target = Target('some_bucket')
+    target.dest = 'some_dest'
+    assert target.is_valid is False  # no source set
+
+
+def test_target_as_dict():
+    target = Target('some_bucket')
+    target.dest = 'some_dest'
+    target.source = 'some_source'
+    assert target.as_dict() == {'source': 'some_source',
+                                'dest': 'some_dest',
+                                'bucket': 'some_bucket',
+                                'unzip': False}
+
+
+def test_secondary_target_is_matched():
+    st = SecondaryTarget('some_bucket')
+    st.dest = 'some_dest.abc'
+    assert st.is_matched('some_source.abc')
+
+
+def test_secondary_target_is_not_matched():
+    st = SecondaryTarget('some_bucket')
+    st.dest = 'some_dest.abc'
+    assert not st.is_matched('some_source.def')
+
+
+def test_secondary_target_list_init():
+    stlist = SecondaryTargetList('some_bucket')
+    assert stlist.n == 0
+    assert stlist.secondary_targets == []
+    assert stlist.bucket == 'some_bucket'
+
+
+def test_secondary_target_list_parse_target_values_1_str_object_key():
+    # an output target with one secondary target
+    stlist = SecondaryTargetList('some_bucket')
+    stlist.parse_target_values(['some_secondary_object_key'])
+    assert len(stlist.secondary_targets) == 1
+    assert stlist.n == 1
+    assert stlist.secondary_targets[0].dest == 'some_secondary_object_key'
+    assert stlist.secondary_targets[0].bucket == 'some_bucket'
+
+
+def test_secondary_target_list_parse_target_values_2_str_object_key():
+    # an output target with two secondary targets
+    stlist = SecondaryTargetList('some_bucket')
+    stlist.parse_target_values(['some_secondary_object_key', 'another_secondary_object_key'])
+    assert len(stlist.secondary_targets) == 2
+    assert stlist.n == 2
+    assert stlist.secondary_targets[0].dest == 'some_secondary_object_key'
+    assert stlist.secondary_targets[1].dest == 'another_secondary_object_key'
+    assert stlist.secondary_targets[0].bucket == 'some_bucket'
+    assert stlist.secondary_targets[1].bucket == 'some_bucket'
+
+
+def test_secondary_target_list_parse_target_values_2_str_url():
+    # an output target with two secondary targets
+    stlist = SecondaryTargetList('some_bucket')
+
stlist.parse_target_values(['some_secondary_object_key', 's3://another_bucket/another_secondary_object_key']) + assert len(stlist.secondary_targets) == 2 + assert stlist.n == 2 + assert stlist.secondary_targets[0].dest == 'some_secondary_object_key' + assert stlist.secondary_targets[1].dest == 'another_secondary_object_key' + assert stlist.secondary_targets[0].bucket == 'some_bucket' + assert stlist.secondary_targets[1].bucket == 'another_bucket' + + +def test_secondary_target_list_reorder_by_source_same_number(): + stlist = SecondaryTargetList('some_bucket') + stlist.parse_target_values(['somefile.abc', 's3://another_bucket/somefile.def', {'object_key': 'somefile.ghi'}]) + stlist.reorder_by_source(['somesource.def', 'somesource.abc', 'somesource.jkl', 'somesource.ghi']) + assert stlist.n == 4 + assert len(stlist.secondary_targets) == 4 + assert stlist.secondary_targets[0].dest == 'somefile.def' + assert stlist.secondary_targets[1].dest == 'somefile.abc' + assert stlist.secondary_targets[2].dest == 'somesource.jkl' # inserted + assert stlist.secondary_targets[3].dest == 'somefile.ghi' + assert stlist.secondary_targets[0].bucket == 'another_bucket' # bucket should be reordered, too. + assert stlist.secondary_targets[1].bucket == 'some_bucket' + assert stlist.secondary_targets[2].bucket == 'some_bucket' + assert stlist.secondary_targets[3].bucket == 'some_bucket' + assert stlist.secondary_targets[0].source == 'somesource.def' + assert stlist.secondary_targets[1].source == 'somesource.abc' + assert stlist.secondary_targets[2].source == 'somesource.jkl' + assert stlist.secondary_targets[3].source == 'somesource.ghi' + + +def test_secondary_target_list_as_dict(): + stlist = SecondaryTargetList('some_bucket') + stlist.parse_target_values(['somefile.abc', 'somefile.def']) + stlist.reorder_by_source(['somesource.def', 'somesource.abc']) + assert stlist.n == 2 + assert stlist.as_dict() == [stlist.secondary_targets[0].as_dict(), + stlist.secondary_targets[1].as_dict()] + + +def test_upload_file(): + target = Target(upload_test_bucket) + target.source = 'tests/awsf3/test_files/some_test_file_to_upload' + target.dest = 'some_test_object_key' + target.upload_to_s3() + s3 = boto3.client('s3') + res = s3.get_object(Bucket=upload_test_bucket, Key='some_test_object_key') + assert res['Body'].read().decode('utf-8') == 'abcd\n' + s3.delete_object(Bucket=upload_test_bucket, Key='some_test_object_key') + with pytest.raises(Exception) as ex: + res = s3.get_object(Bucket=upload_test_bucket, Key='some_test_object_key') + assert 'NoSuchKey' in str(ex.value) + + +def test_upload_file_prefix(): + target = Target(upload_test_bucket) + target.source = 'tests/awsf3/test_files/some_test_file_to_upload' + target.dest = 'some_test_object_prefix/' + target.upload_to_s3() + s3 = boto3.client('s3') + res = s3.get_object(Bucket=upload_test_bucket, Key='some_test_object_prefix/tests/awsf3/test_files/some_test_file_to_upload') + assert res['Body'].read().decode('utf-8') == 'abcd\n' + s3.delete_object(Bucket=upload_test_bucket, Key='some_test_object_prefix/tests/awsf3/test_files/some_test_file_to_upload') + with pytest.raises(Exception) as ex: + res = s3.get_object(Bucket=upload_test_bucket, Key='some_test_object_prefix/tests/awsf3/test_files/some_test_file_to_upload') + assert 'NoSuchKey' in str(ex.value) + + +def test_upload_dir(): + target = Target(upload_test_bucket) + target.source = 'tests/awsf3/test_files/some_test_dir_to_upload' # has two files and one subdir + target.dest = 'some_test_object_prefix/' + 
target.upload_to_s3() + s3 = boto3.client('s3') + + def test_and_delete_key(key): + res = s3.get_object(Bucket=upload_test_bucket, Key=key) + assert res['Body'].read() + s3.delete_object(Bucket=upload_test_bucket, Key=key) + with pytest.raises(Exception) as ex: + res = s3.get_object(Bucket=upload_test_bucket, Key=key) + assert 'NoSuchKey' in str(ex.value) + + test_and_delete_key('some_test_object_prefix/file1') + test_and_delete_key('some_test_object_prefix/file2') + test_and_delete_key('some_test_object_prefix/dir1/file1') + + +def test_upload_zip(): + target = Target(upload_test_bucket) + target.source = 'tests/awsf3/test_files/some_zip_file_to_upload.zip' # has two files and one subdir + target.dest = 'some_test_object_prefix/' + target.unzip = True + target.upload_to_s3() + s3 = boto3.client('s3') + + def test_and_delete_key(key): + res = s3.get_object(Bucket=upload_test_bucket, Key=key) + assert res['Body'].read() + s3.delete_object(Bucket=upload_test_bucket, Key=key) + with pytest.raises(Exception) as ex: + res = s3.get_object(Bucket=upload_test_bucket, Key=key) + assert 'NoSuchKey' in str(ex.value) + + test_and_delete_key('some_test_object_prefix/file1') + test_and_delete_key('some_test_object_prefix/file2') + test_and_delete_key('some_test_object_prefix/dir1/file1') + + +def test_upload_file_err(): + target = Target(upload_test_bucket) + target.source = 'some_test_file_that_does_not_exist' + target.dest = 'whatever' + with pytest.raises(Exception) as ex: + target.upload_to_s3() + assert 'failed to upload output file some_test_file_that_does_not_exist' + \ + ' to %s/whatever' % upload_test_bucket in str(ex.value) + + +def test_upload_zip_not_a_zip_file_err(): + target = Target(upload_test_bucket) + target.source = 'tests/awsf3/test_files/some_test_file_to_upload' # not a zip file + target.dest = 'some_test_object_prefix/' + target.unzip = True + with pytest.raises(Exception) as ex: + target.upload_to_s3() + assert 'not a zip file' in str(ex.value) + + +def test_upload_zip_not_a_zip_file_err2(): + target = Target(upload_test_bucket) + target.source = 'some_test_file_that_does_not_exist' + target.dest = 'some_test_object_prefix/' + target.unzip = True + with pytest.raises(FileNotFoundError) as ex: + target.upload_to_s3() + assert 'No such file' in str(ex.value) + + +def test_upload_zip_directory_conflict(capsys): + target = Target(upload_test_bucket) + target.source = 'tests/awsf3/test_files/some_test_dir_to_upload' # has two files and one subdir + target.dest = 'some_test_object_prefix/' + target.unzip = True # conflict, since the source is a directory + + # test stdout + cap = CaptureOut() + with cap: + target.upload_to_s3() + assert 'Warning' in cap.get_captured_out() + + # still the directory should be uploaded despite the unzip conflict + s3 = boto3.client('s3') + + def test_and_delete_key(key): + res = s3.get_object(Bucket=upload_test_bucket, Key=key) + assert res['Body'].read() + s3.delete_object(Bucket=upload_test_bucket, Key=key) + with pytest.raises(Exception) as ex: + res = s3.get_object(Bucket=upload_test_bucket, Key=key) + assert 'NoSuchKey' in str(ex.value) + + test_and_delete_key('some_test_object_prefix/file1') + test_and_delete_key('some_test_object_prefix/file2') + test_and_delete_key('some_test_object_prefix/dir1/file1') diff --git a/tests/awsf3/test_utils.py b/tests/awsf3/test_utils.py new file mode 100644 index 000000000..f1a454d7b --- /dev/null +++ b/tests/awsf3/test_utils.py @@ -0,0 +1,611 @@ +import os +import pytest +import json +import boto3 +from datetime 
import datetime +from awsf3.utils import ( + create_env_def_file, + create_mount_command_list, + create_download_command_list, + create_download_cmd, + add_download_cmd, + determine_key_type, + create_output_files_dict, + read_md5file, + update_postrun_json_init, + update_postrun_json_upload_output, + postrun_json_final, + upload_postrun_json, + upload_to_output_target, + upload_output +) +from awsf3.log import ( + parse_commands, + read_logfile_by_line +) +from tibanna.awsem import ( + AwsemRunJson, + AwsemRunJsonInput, + AwsemPostRunJsonOutput, + AwsemPostRunJson +) +from tests.awsf3.conftest import upload_test_bucket + + +def test_create_env_def_file_cwl(): + """testing create_env_def_file with cwl option and an input Env variable""" + envfilename = 'someenvfile' + + runjson_dict = {'Job': {'App': {'language': 'cwl_v1', + 'cwl_url': 'someurl', + 'main_cwl': 'somecwl', + 'other_cwl_files': 'othercwl1,othercwl2'}, + 'Input': {'Env': {'SOME_ENV': '1234'}}, + 'Output': {'output_bucket_directory': 'somebucket'}, + 'JOBID': 'somejobid'}, + 'config': {'log_bucket': 'somebucket'}} + runjson = AwsemRunJson(**runjson_dict) + create_env_def_file(envfilename, runjson, 'cwl') + + with open(envfilename, 'r') as f: + envfile_content = f.read() + + right_content = ('export LANGUAGE=cwl_v1\n' + 'export CWL_URL=someurl\n' + 'export MAIN_CWL=somecwl\n' + 'export CWL_FILES="othercwl1 othercwl2"\n' + 'export SOME_ENV=1234\n' + 'export PRESERVED_ENV_OPTION="--preserve-environment SOME_ENV "\n' + 'export DOCKER_ENV_OPTION="-e SOME_ENV "\n') + + assert envfile_content == right_content + os.remove(envfilename) + + +def test_create_env_def_file_wdl_v1(): + """testing create_env_def_file with wdl option and no input Env variable""" + envfilename = 'someenvfile' + + runjson_dict = {'Job': {'App': {'language': 'wdl_v1', + 'wdl_url': 'someurl', + 'main_wdl': 'somewdl', + 'other_wdl_files': 'otherwdl1,otherwdl2'}, + 'Input': {'Env': {}}, + 'Output': {'output_bucket_directory': 'somebucket'}, + 'JOBID': 'somejobid'}, + 'config': {'log_bucket': 'somebucket'}} + runjson = AwsemRunJson(**runjson_dict) + create_env_def_file(envfilename, runjson, 'wdl_v1') + + with open(envfilename, 'r') as f: + envfile_content = f.read() + + right_content = ('export LANGUAGE=wdl_v1\n' + 'export WDL_URL=someurl\n' + 'export MAIN_WDL=somewdl\n' + 'export WDL_FILES="otherwdl1 otherwdl2"\n' + 'export PRESERVED_ENV_OPTION=""\n' + 'export DOCKER_ENV_OPTION=""\n') + + assert envfile_content == right_content + os.remove(envfilename) + + +def test_create_env_def_file_wdl_draft2(): + """testing create_env_def_file with wdl option and no input Env variable""" + envfilename = 'someenvfile' + + runjson_dict = {'Job': {'App': {'language': 'wdl_draft2', + 'wdl_url': 'someurl', + 'main_wdl': 'somewdl', + 'other_wdl_files': 'otherwdl1,otherwdl2'}, + 'Input': {'Env': {}}, + 'Output': {'output_bucket_directory': 'somebucket'}, + 'JOBID': 'somejobid'}, + 'config': {'log_bucket': 'somebucket'}} + runjson = AwsemRunJson(**runjson_dict) + create_env_def_file(envfilename, runjson, 'wdl_draft2') + + with open(envfilename, 'r') as f: + envfile_content = f.read() + + right_content = ('export LANGUAGE=wdl_draft2\n' + 'export WDL_URL=someurl\n' + 'export MAIN_WDL=somewdl\n' + 'export WDL_FILES="otherwdl1 otherwdl2"\n' + 'export PRESERVED_ENV_OPTION=""\n' + 'export DOCKER_ENV_OPTION=""\n') + + assert envfile_content == right_content + os.remove(envfilename) + + +def test_create_env_def_file_wdl(): + """testing create_env_def_file with wdl option and no input Env 
variable""" + envfilename = 'someenvfile' + + runjson_dict = {'Job': {'App': {'language': 'wdl', + 'wdl_url': 'someurl', + 'main_wdl': 'somewdl', + 'other_wdl_files': 'otherwdl1,otherwdl2'}, + 'Input': {'Env': {}}, + 'Output': {'output_bucket_directory': 'somebucket'}, + 'JOBID': 'somejobid'}, + 'config': {'log_bucket': 'somebucket'}} + runjson = AwsemRunJson(**runjson_dict) + create_env_def_file(envfilename, runjson, 'wdl') + + with open(envfilename, 'r') as f: + envfile_content = f.read() + + right_content = ('export LANGUAGE=wdl\n' + 'export WDL_URL=someurl\n' + 'export MAIN_WDL=somewdl\n' + 'export WDL_FILES="otherwdl1 otherwdl2"\n' + 'export PRESERVED_ENV_OPTION=""\n' + 'export DOCKER_ENV_OPTION=""\n') + + assert envfile_content == right_content + os.remove(envfilename) + + +def test_create_env_def_file_shell(): + """testing create_env_def_file with shell option and two input Env variables""" + envfilename = 'someenvfile' + + runjson_dict = {'Job': {'App': {'language': 'shell', + 'command': 'com1;com2', + 'container_image': 'someimage'}, + 'Input': {'Env': {'ENV1': '1234', 'ENV2': '5678'}}, + 'Output': {'output_bucket_directory': 'somebucket'}, + 'JOBID': 'somejobid'}, + 'config': {'log_bucket': 'somebucket'}} + runjson = AwsemRunJson(**runjson_dict) + create_env_def_file(envfilename, runjson, 'shell') + + with open(envfilename, 'r') as f: + envfile_content = f.read() + + right_content = ('export LANGUAGE=shell\n' + 'export COMMAND="com1;com2"\n' + 'export CONTAINER_IMAGE=someimage\n' + 'export ENV1=1234\n' + 'export ENV2=5678\n' + 'export PRESERVED_ENV_OPTION="--preserve-environment ENV1 --preserve-environment ENV2 "\n' + 'export DOCKER_ENV_OPTION="-e ENV1 -e ENV2 "\n') + + assert envfile_content == right_content + os.remove(envfilename) + + +def test_create_env_def_file_shell2(): + """testing create_env_def_file with shell option with complex commands and an env variable""" + envfilename = 'someenvfile' + + complex_command = 'echo $SOME_ENV | xargs -i echo {} > somedir/somefile' + runjson_dict = {'Job': {'App': {'language': 'shell', + 'command': complex_command, + 'container_image': 'someimage'}, + 'Input': {'Env': {'SOME_ENV': '1234'}}, + 'Output': {'output_bucket_directory': 'somebucket'}, + 'JOBID': 'somejobid'}, + 'config': {'log_bucket': 'somebucket'}} + runjson = AwsemRunJson(**runjson_dict) + create_env_def_file(envfilename, runjson, 'shell') + + with open(envfilename, 'r') as f: + envfile_content = f.read() + + right_content = ('export LANGUAGE=shell\n' + 'export COMMAND="echo $SOME_ENV | xargs -i echo {} > somedir/somefile"\n' + 'export CONTAINER_IMAGE=someimage\n' + 'export SOME_ENV=1234\n' + 'export PRESERVED_ENV_OPTION="--preserve-environment SOME_ENV "\n' + 'export DOCKER_ENV_OPTION="-e SOME_ENV "\n') + + assert envfile_content == right_content + os.remove(envfilename) + + +def test_create_env_def_file_shell3(): + """testing create_env_def_file with shell option with complex commands and an env variable. 
+    double-quotes are escaped when written to the env file ('"' -> '\"')"""
+    envfilename = 'someenvfile'
+
+    complex_command = 'echo "haha" > somefile; ls -1 [st]*'
+    runjson_dict = {'Job': {'App': {'language': 'shell',
+                                    'command': complex_command,
+                                    'container_image': 'someimage'},
+                            'Input': {'Env': {}},
+                            'Output': {'output_bucket_directory': 'somebucket'},
+                            'JOBID': 'somejobid'},
+                    'config': {'log_bucket': 'somebucket'}}
+    runjson = AwsemRunJson(**runjson_dict)
+    create_env_def_file(envfilename, runjson, 'shell')
+
+    with open(envfilename, 'r') as f:
+        envfile_content = f.read()
+
+    right_content = ('export LANGUAGE=shell\n'
+                     'export COMMAND="echo \\"haha\\" > somefile; ls -1 [st]*"\n'
+                     'export CONTAINER_IMAGE=someimage\n'
+                     'export PRESERVED_ENV_OPTION=""\n'
+                     'export DOCKER_ENV_OPTION=""\n')
+
+    assert envfile_content == right_content
+    os.remove(envfilename)
+
+
+def test_create_env_def_file_snakemake():
+    """testing create_env_def_file with snakemake option and no input Env variable"""
+    envfilename = 'someenvfile'
+
+    runjson_dict = {'Job': {'App': {'language': 'snakemake',
+                                    'command': 'com1;com2',
+                                    'container_image': 'someimage',
+                                    'snakemake_url': 'someurl',
+                                    'main_snakemake': 'somesnakemake',
+                                    'other_snakemake_files': 'othersnakemake1,othersnakemake2'},
+                            'JOBID': 'somejobid',
+                            'Input': {},
+                            'Output': {'output_bucket_directory': 'somebucket'}},
+                    'config': {'log_bucket': 'somebucket'}}
+    runjson = AwsemRunJson(**runjson_dict)
+    create_env_def_file(envfilename, runjson, 'snakemake')
+
+    with open(envfilename, 'r') as f:
+        envfile_content = f.read()
+
+    right_content = ('export LANGUAGE=snakemake\n'
+                     'export SNAKEMAKE_URL=someurl\n'
+                     'export MAIN_SNAKEMAKE=somesnakemake\n'
+                     'export SNAKEMAKE_FILES="othersnakemake1 othersnakemake2"\n'
+                     'export COMMAND="com1;com2"\n'
+                     'export CONTAINER_IMAGE=someimage\n'
+                     'export PRESERVED_ENV_OPTION=""\n'
+                     'export DOCKER_ENV_OPTION=""\n')
+
+    assert envfile_content == right_content
+    os.remove(envfilename)
+
+
+def test_create_mount_command_list():
+    mountcommand_filename = 'some_mountcommand_filename'
+    rji_dict = {'arg1': {'path': 'somefile', 'dir': 'somebucket', 'mount': True},
+                'arg2': {'path': 'somefile2', 'dir': 'somebucket', 'mount': True},
+                'arg3': {'path': 'whatever', 'dir': 'do_not_mount_this_bucket', 'mount': False},
+                'arg4': {'path': 'somefile3', 'dir': 'somebucket2', 'mount': True}}
+    runjson_input = AwsemRunJsonInput(**{'Input_files_data': rji_dict})
+    create_mount_command_list(mountcommand_filename, runjson_input)
+
+    with open(mountcommand_filename, 'r') as f:
+        mcfile_content = f.read()
+
+    right_content = ('mkdir -p /data1/input-mounted-somebucket\n'
+                     'goofys-latest -f somebucket /data1/input-mounted-somebucket &\n'
+                     'mkdir -p /data1/input-mounted-somebucket2\n'
+                     'goofys-latest -f somebucket2 /data1/input-mounted-somebucket2 &\n')
+
+    assert mcfile_content == right_content
+    os.remove(mountcommand_filename)
+
+
+def test_create_download_command_list_args(mocker):
+    dl_command_filename = 'some_dlcommand_filename'
+    rji_dict = {'arg1': {'path': 'somefile', 'dir': 'somebucket', 'mount': False},
+                'arg2': {'path': 'somefile2.gz', 'dir': 'somebucket', 'mount': False, 'unzip': 'gz'},
+                'arg3': {'path': 'whatever', 'dir': 'mount_this_bucket', 'mount': True},
+                'arg4': {'path': 'somefile3', 'dir': 'somebucket2', 'mount': False}}
+    runjson_input = AwsemRunJsonInput(**{'Input_files_data': rji_dict})
+    mocker.patch('awsf3.utils.determine_key_type', return_value='File')
+
create_download_command_list(dl_command_filename, runjson_input) + + with open(dl_command_filename, 'r') as f: + dcfile_content = f.read() + + right_content = ('aws s3 cp s3://somebucket/somefile /data1/input/somefile; \n' + 'aws s3 cp s3://somebucket/somefile2.gz /data1/input/somefile2.gz; ' + 'gunzip /data1/input/somefile2.gz\n' + 'aws s3 cp s3://somebucket2/somefile3 /data1/input/somefile3; \n') + + assert dcfile_content == right_content + os.remove(dl_command_filename) + + +def test_create_download_command_list_args_rename(mocker): + dl_command_filename = 'some_dlcommand_filename' + rji_dict = {'arg1': {'path': 'somefile', 'dir': 'somebucket', 'mount': False, 'rename': 'renamed_file'}, + 'arg2': {'path': 'somefile2.gz', 'dir': 'somebucket', 'mount': False, 'unzip': 'gz'}, + 'arg3': {'path': 'whatever', 'dir': 'mount_this_bucket', 'mount': True}, + 'arg4': {'path': 'somefile3', 'dir': 'somebucket2', 'mount': False, 'rename': 'renamed_file2'}} + runjson_input = AwsemRunJsonInput(**{'Input_files_data': rji_dict}) + mocker.patch('awsf3.utils.determine_key_type', return_value='File') + create_download_command_list(dl_command_filename, runjson_input) + + with open(dl_command_filename, 'r') as f: + dcfile_content = f.read() + + right_content = ('aws s3 cp s3://somebucket/somefile /data1/input/renamed_file; \n' + 'aws s3 cp s3://somebucket/somefile2.gz /data1/input/somefile2.gz; ' + 'gunzip /data1/input/somefile2.gz\n' + 'aws s3 cp s3://somebucket2/somefile3 /data1/input/renamed_file2; \n') + + assert dcfile_content == right_content + os.remove(dl_command_filename) + + +def test_create_download_command_list_args_array(mocker): + dl_command_filename = 'some_dlcommand_filename' + rji_dict = {'arg1': {'path': [['somefilea', 'somefileb'], ['somefilec']], 'dir': 'somebucket', 'mount': False, + 'rename': [['renameda', 'renamedb'], ['renamedc']]}, + 'arg2': {'path': [['anotherfilea', 'anotherfileb'], ['anotherfilec']], 'dir': 'somebucket', 'mount': False, + 'rename': ''}} + runjson_input = AwsemRunJsonInput(**{'Input_files_data': rji_dict}) + mocker.patch('awsf3.utils.determine_key_type', return_value='File') + create_download_command_list(dl_command_filename, runjson_input) + + with open(dl_command_filename, 'r') as f: + dcfile_content = f.read() + + right_content = ('aws s3 cp s3://somebucket/somefilea /data1/input/renameda; \n' + 'aws s3 cp s3://somebucket/somefileb /data1/input/renamedb; \n' + 'aws s3 cp s3://somebucket/somefilec /data1/input/renamedc; \n' + 'aws s3 cp s3://somebucket/anotherfilea /data1/input/anotherfilea; \n' + 'aws s3 cp s3://somebucket/anotherfileb /data1/input/anotherfileb; \n' + 'aws s3 cp s3://somebucket/anotherfilec /data1/input/anotherfilec; \n') + + assert dcfile_content == right_content + os.remove(dl_command_filename) + + +def test_create_download_command_list_file_uri(mocker): + dl_command_filename = 'some_dlcommand_filename' + rji_dict = {'file:///data1/input/file1': {'path': 'somefile', 'dir': 'somebucket', 'mount': False}, + 'file:///data1/input/file2.gz': {'path': 'somefile2.gz', 'dir': 'somebucket', 'mount': False, 'unzip': 'gz'}, + 'file:///data1/input/haha': {'path': 'whatever', 'dir': 'mount_this_bucket', 'mount': True}, + 'file:///data1/input/file3': {'path': 'somefile3', 'dir': 'somebucket2', 'mount': False}} + runjson_input = AwsemRunJsonInput(**{'Input_files_data': rji_dict}) + mocker.patch('awsf3.utils.determine_key_type', return_value='File') + create_download_command_list(dl_command_filename, runjson_input) + + with open(dl_command_filename, 'r') as f: 
+        dcfile_content = f.read()
+
+    right_content = ('aws s3 cp s3://somebucket/somefile /data1/input/file1; \n'
+                     'aws s3 cp s3://somebucket/somefile2.gz /data1/input/file2.gz; '
+                     'gunzip /data1/input/file2.gz\n'
+                     'aws s3 cp s3://somebucket2/somefile3 /data1/input/file3; \n')
+
+    assert dcfile_content == right_content
+    os.remove(dl_command_filename)
+
+
+def test_create_download_cmd_unzip_bz2(mocker):
+    mocker.patch('awsf3.utils.determine_key_type', return_value='File')
+    dc_cmd = create_download_cmd('somebucket', 'somefile.bz2', 'sometarget.bz2', '', 'bz2')
+    assert dc_cmd == 'aws s3 cp s3://somebucket/somefile.bz2 sometarget.bz2; bzip2 -d sometarget.bz2; '
+
+
+def test_create_download_cmd_unzip_gz(mocker):
+    mocker.patch('awsf3.utils.determine_key_type', return_value='File')
+    dc_cmd = create_download_cmd('somebucket', 'somefile.gz', 'sometarget.gz', '', 'gz')
+    assert dc_cmd == 'aws s3 cp s3://somebucket/somefile.gz sometarget.gz; gunzip sometarget.gz'
+
+
+def test_create_download_cmd_nounzip(mocker):
+    mocker.patch('awsf3.utils.determine_key_type', return_value='File')
+    dc_cmd = create_download_cmd('somebucket', 'somefile.gz', 'sometarget.gz', '', '')
+    assert dc_cmd == 'aws s3 cp s3://somebucket/somefile.gz sometarget.gz; '
+
+
+def test_create_download_cmd_nounzip_profile(mocker):
+    mocker.patch('awsf3.utils.determine_key_type', return_value='File')
+    dc_cmd = create_download_cmd('somebucket', 'somefile.gz', 'sometarget.gz', 'user1', '')
+    assert dc_cmd == 'aws s3 cp s3://somebucket/somefile.gz sometarget.gz --profile user1; '
+
+
+def test_create_download_cmd_unzip_bz2_dir(mocker):
+    mocker.patch('awsf3.utils.determine_key_type', return_value='Folder')
+    dc_cmd = create_download_cmd('somebucket', 'somedir', 'sometarget', '', 'bz2')
+    right_cmd = ('aws s3 cp --recursive s3://somebucket/somedir sometarget; '
+                 'for f in `find sometarget -type f`; '
+                 'do if [[ $f =~ \\.bz2$ ]]; then bzip2 $f; fi; done;')
+    assert dc_cmd == right_cmd
+
+
+def test_create_download_cmd_unzip_gz_dir(mocker):
+    mocker.patch('awsf3.utils.determine_key_type', return_value='Folder')
+    dc_cmd = create_download_cmd('somebucket', 'somedir', 'sometarget', '', 'gz')
+    right_cmd = ('aws s3 cp --recursive s3://somebucket/somedir sometarget; '
+                 'for f in `find sometarget -type f`; '
+                 'do if [[ $f =~ \\.gz$ ]]; then gunzip $f; fi; done;')
+    assert dc_cmd == right_cmd
+
+
+def test_create_download_cmd_nounzip_dir(mocker):
+    mocker.patch('awsf3.utils.determine_key_type', return_value='Folder')
+    dc_cmd = create_download_cmd('somebucket', 'somedir', 'sometarget', '', '')
+    assert dc_cmd == 'aws s3 cp --recursive s3://somebucket/somedir sometarget; '
+
+
+def test_create_download_cmd_nounzip_profile_dir(mocker):
+    mocker.patch('awsf3.utils.determine_key_type', return_value='Folder')
+    dc_cmd = create_download_cmd('somebucket', 'somedir', 'sometarget', 'user1', '')
+    assert dc_cmd == 'aws s3 cp --recursive s3://somebucket/somedir sometarget --profile user1; '
+
+
+def test_read_md5file():
+    test_md5file_name = 'some_test_md5_file'
+    with open(test_md5file_name, 'w') as fo:
+        fo.write('62449071d08c9a9dfa0efbaaa82a62f3\tsomefile\n')  # could be tab-delimited
+        fo.write('d41d8cd98f00b204e9800998ecf8427e anotherfile\n')  # could be space-delimited
+    md5dict = read_md5file(test_md5file_name)
+    assert md5dict == {'somefile': '62449071d08c9a9dfa0efbaaa82a62f3',
+                       'anotherfile': 'd41d8cd98f00b204e9800998ecf8427e'}
+
os.remove(test_md5file_name) + + +def test_read_logfile_by_line(): + test_logfile_name = 'some_test_log_file' + with open(test_logfile_name, 'w') as fo: + fo.write('1\n2\n3\n') + log_content = read_logfile_by_line(test_logfile_name) + assert next(log_content) == '1\n' + assert next(log_content) == '2\n' + assert next(log_content) == '3\n' + assert next(log_content) is None + os.remove(test_logfile_name) + + +def test_parse_commands(): + def log_gen(): + log = ['Status: Downloaded newer image', + '[job clip] /data1/tmpQM7Ol5$ docker \\', + 'run \\', + '-i \\', + 'duplexa/4dn-repliseq:v13 \\', + 'clip \\', + 'VFL.fastq.gz', + 'Pulled Docker image node:slim', + 'f2b6b4884fc8: Pulling fs layer', + '[job align] /data1/tmp2EQtm2$ docker \\', + 'run \\', + '-i \\', + 'duplexa/4dn-repliseq:v14 \\', + 'run-align.sh'] + + for line in log: + yield line + yield None + + log_content = log_gen() + commands = parse_commands(log_content) + assert commands == [['docker', 'run', '-i', 'duplexa/4dn-repliseq:v13', 'clip', 'VFL.fastq.gz'], + ['docker', 'run', '-i', 'duplexa/4dn-repliseq:v14', 'run-align.sh']] + + +def test_create_output_files_dict_cwl(): + md5dict = {'path1': '683153f0051fef9e778ce0866cfd97e9', 'path2': 'c14105f8209836cd3b1cc1b63b906fed'} + outmeta = create_output_files_dict('cwl', {'arg1': {'path': 'path1'}, 'arg2': {'path': 'path2'}}, md5dict=md5dict) + assert outmeta == {'arg1': {'path': 'path1', 'md5sum': md5dict['path1']}, + 'arg2': {'path': 'path2', 'md5sum': md5dict['path2']}} + + +def test_create_output_files_dict_cwl_secondary_files(): + md5dict = {'path1': '683153f0051fef9e778ce0866cfd97e9', 'path2': 'c14105f8209836cd3b1cc1b63b906fed'} + outmeta = create_output_files_dict('cwl', {'arg1': {'path': 'path1', 'secondaryFiles': [{'path': 'path2'}]}}, md5dict=md5dict) + assert outmeta == {'arg1': {'path': 'path1', 'md5sum': md5dict['path1'], + 'secondaryFiles': [{'path': 'path2', 'md5sum': md5dict['path2']}]}} + + +def test_create_output_files_dict_cwl_no_md5(): + outmeta = create_output_files_dict('cwl', {'arg1': {'path': 'path1'}, 'arg2': {'path': 'path2'}}) + assert outmeta == {'arg1': {'path': 'path1'}, 'arg2': {'path': 'path2'}} + + +def test_create_output_files_dict_cwl_no_execution_metadata(): + with pytest.raises(Exception) as ex: + outmeta = create_output_files_dict('cwl') + assert 'execution_metadata' in str(ex.value) + + +def test_create_output_files_dict_wdl(): + md5dict = {'path1': '683153f0051fef9e778ce0866cfd97e9', 'path2': 'c14105f8209836cd3b1cc1b63b906fed'} + outmeta = create_output_files_dict('wdl', {'outputs': {'arg1': 'path1', 'arg2': 'path2'}}, md5dict=md5dict) + assert outmeta == {'arg1': {'path': 'path1', 'md5sum': md5dict['path1']}, + 'arg2': {'path': 'path2', 'md5sum': md5dict['path2']}} + + +def test_create_output_files_dict_wdl_no_md5(): + outmeta = create_output_files_dict('wdl', {'outputs': {'arg1': 'path1', 'arg2': 'path2'}}) + assert outmeta == {'arg1': {'path': 'path1'}, 'arg2': {'path': 'path2'}} + + +def test_create_output_files_dict_wdl_no_execution_metadata(): + with pytest.raises(Exception) as ex: + outmeta = create_output_files_dict('wdl') + assert 'execution_metadata' in str(ex.value) + + +def test_create_output_files_dict_snakemake(): + outmeta = create_output_files_dict('snakemake') + assert outmeta == {} + + +def test_create_output_files_dict_shell(): + outmeta = create_output_files_dict('shell') + assert outmeta == {} + + +def test_postrun_json_final(): + os.environ['JOB_STATUS'] = '0' + os.environ['INPUTSIZE'] = '34K' + os.environ['TEMPSIZE'] = 
'56M' + os.environ['OUTPUTSIZE'] = '78K' + + prj = AwsemPostRunJson(**{"Job": {"App": {"App_name": "repliseq-parta"}, "JOBID": "alw3r78v3"}}, strict=False) + postrun_json_final(prj) + d_job = prj.Job.as_dict() + + for k in ['end_time', 'status', 'instance_id', 'total_input_size', + 'total_tmp_size', 'total_output_size', 'App', 'JOBID']: + assert k in d_job + + today = datetime.now().strftime('%Y%m%d') + assert d_job['end_time'].startswith(today) + assert len(d_job['end_time'].split('-')) == 3 + assert d_job['status'] == '0' + assert d_job['total_input_size'] == '34K' + assert d_job['total_tmp_size'] == '56M' + assert d_job['total_output_size'] == '78K' + + +def test_upload_to_output_target(): + """testing comprehensively that includes custom target (file://), + cwl with two secondary file, wdl with conditional arg names""" + testfiledir = 'tests/awsf3/test_files/' + localfile1 = testfiledir + 'some_test_file_to_upload' + localfile2 = testfiledir + 'some_test_file_to_upload2' + localfile3 = testfiledir + 'some_test_file_to_upload3.abc' + localfile4 = testfiledir + 'some_test_file_to_upload3.def' + localfile5 = testfiledir + 'some_test_file_to_upload3.ghi' + + # prep prjo (postrun_json_output) + output_target = {'file://' + localfile1: 'somekey', + 'arg1': 'somekey2', + 'arg2': 'somekey3.abc'} + secondary_output_target = {'arg2': ['somekey3.def', 'somekey3.ghi']} + output_files = {'file://' + localfile1: {'path': localfile1}, + 'arg1b': {'path': localfile2}, + 'arg2': {'path': localfile3, + 'secondaryFiles': [{'path': localfile4}, + {'path': localfile5}]}} + alt_cond_output_argnames = {'arg1': ['arg1a', 'arg1b']} + prjo_dict = {'output_target': output_target, + 'Output files': output_files, + 'secondary_output_target': secondary_output_target, + 'alt_cond_output_argnames': alt_cond_output_argnames, + 'output_bucket_directory': upload_test_bucket} + prjo = AwsemPostRunJsonOutput(**prjo_dict) + + # run function upload_to_output_target + upload_to_output_target(prjo) + + # still the directory should be uploaded despite the unzip conflict + s3 = boto3.client('s3') + + def test_and_delete_key(key): + res = s3.get_object(Bucket=upload_test_bucket, Key=key) + assert res['Body'].read() + s3.delete_object(Bucket=upload_test_bucket, Key=key) + with pytest.raises(Exception) as ex: + res = s3.get_object(Bucket=upload_test_bucket, Key=key) + assert 'NoSuchKey' in str(ex.value) + + test_and_delete_key('somekey2') + test_and_delete_key('somekey3.abc') + test_and_delete_key('somekey3.def') + test_and_delete_key('somekey3.ghi') + test_and_delete_key('somekey') diff --git a/tests/files/SRR2671430_1.fastq.gz b/tests/files/SRR2671430_1.fastq.gz old mode 100644 new mode 100755 diff --git a/tests/files/cwl_upload/bwa-mem.cwl b/tests/files/cwl_upload/bwa-mem.cwl old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report.zip b/tests/files/fastqc_report.zip old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Icons/error.png b/tests/files/fastqc_report/Icons/error.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Icons/fastqc_icon.png b/tests/files/fastqc_report/Icons/fastqc_icon.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Icons/tick.png b/tests/files/fastqc_report/Icons/tick.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Icons/warning.png b/tests/files/fastqc_report/Icons/warning.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/adapter_content.png 
b/tests/files/fastqc_report/Images/adapter_content.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/duplication_levels.png b/tests/files/fastqc_report/Images/duplication_levels.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/kmer_profiles.png b/tests/files/fastqc_report/Images/kmer_profiles.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/per_base_n_content.png b/tests/files/fastqc_report/Images/per_base_n_content.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/per_base_quality.png b/tests/files/fastqc_report/Images/per_base_quality.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/per_base_sequence_content.png b/tests/files/fastqc_report/Images/per_base_sequence_content.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/per_sequence_gc_content.png b/tests/files/fastqc_report/Images/per_sequence_gc_content.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/per_sequence_quality.png b/tests/files/fastqc_report/Images/per_sequence_quality.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/per_tile_quality.png b/tests/files/fastqc_report/Images/per_tile_quality.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/Images/sequence_length_distribution.png b/tests/files/fastqc_report/Images/sequence_length_distribution.png old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/fastqc.fo b/tests/files/fastqc_report/fastqc.fo old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/fastqc_data.txt b/tests/files/fastqc_report/fastqc_data.txt old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/fastqc_report.html b/tests/files/fastqc_report/fastqc_report.html old mode 100644 new mode 100755 diff --git a/tests/files/fastqc_report/summary.txt b/tests/files/fastqc_report/summary.txt old mode 100644 new mode 100755 diff --git a/tests/files/pairsqc_report.zip b/tests/files/pairsqc_report.zip old mode 100644 new mode 100755 diff --git a/tests/files/pairsqc_report/sample1.summary.out b/tests/files/pairsqc_report/sample1.summary.out old mode 100644 new mode 100755 diff --git a/tests/files/snakemake/Snakefile b/tests/files/snakemake/Snakefile old mode 100644 new mode 100755 diff --git a/tests/files/snakemake2/Snakefile b/tests/files/snakemake2/Snakefile old mode 100644 new mode 100755 diff --git a/tests/files/snakemake2b/Snakefile b/tests/files/snakemake2b/Snakefile old mode 100644 new mode 100755 diff --git a/tests/files/snakemake3/Snakefile b/tests/files/snakemake3/Snakefile old mode 100644 new mode 100755 diff --git a/tests/files/snakemake4/Snakefile b/tests/files/snakemake4/Snakefile old mode 100644 new mode 100755 diff --git a/tests/tibanna/__init__.py b/tests/tibanna/__init__.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/post_deployment/__init__.py b/tests/tibanna/post_deployment/__init__.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/post_deployment/conftest.py b/tests/tibanna/post_deployment/conftest.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/post_deployment/test_json/4dn_bwa.runonly.v1.json b/tests/tibanna/post_deployment/test_json/4dn_bwa.runonly.v1.json old mode 100644 new mode 100755 diff --git a/tests/tibanna/post_deployment/test_json/shelltest-ecr.json b/tests/tibanna/post_deployment/test_json/shelltest-ecr.json old mode 
100644 new mode 100755 diff --git a/tests/tibanna/post_deployment/test_json/shelltest4.json b/tests/tibanna/post_deployment/test_json/shelltest4.json old mode 100644 new mode 100755 diff --git a/tests/tibanna/post_deployment/test_unicorn.py b/tests/tibanna/post_deployment/test_unicorn.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/__init__.py b/tests/tibanna/unicorn/__init__.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/check_task_awsem/__init__.py b/tests/tibanna/unicorn/check_task_awsem/__init__.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/check_task_awsem/event.json b/tests/tibanna/unicorn/check_task_awsem/event.json deleted file mode 100644 index 241fd0a55..000000000 --- a/tests/tibanna/unicorn/check_task_awsem/event.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "args": { - "app_version": "8", - "app_name": "fastqc-0-11-4-1", - "input_parameters": {}, - "input_files": { - "input_fastq": { - "object_key": "test.h2500.fastq", - "bucket_name": "tibanna-output" - } - }, - "output_target": {}, - "secondary_output_target": {}, - "secondary_files": {}, - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/master/cwl_awsem/", - "output_S3_bucket": "tibanna-output", - "cwl_child_filenames": ["fastqc-0-11-4.6.cwl"], - "cwl_main_filename": "fastqc-0-11-4-1.8.cwl" - }, - "config": { - "cwl_version": "draft3", - "ebs_size": 20, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "json_dir": "/tmp/json", - "EBS_optimized": false, - "ebs_iops": 500, - "log_bucket": "tibanna-output", - "shutdown_min": "30", - "instance_type": "t2.nano", - - - "job_tag": "fastqc-0-11-4-1", - "userdata_dir": "/tmp/userdata", - - "password": "hahaha" - }, - "jobid": "YoW5iAVOjoMV" -} diff --git a/tests/tibanna/unicorn/check_task_awsem/test_handler.py b/tests/tibanna/unicorn/check_task_awsem/test_handler.py old mode 100644 new mode 100755 index 4d93670af..7740febeb --- a/tests/tibanna/unicorn/check_task_awsem/test_handler.py +++ b/tests/tibanna/unicorn/check_task_awsem/test_handler.py @@ -85,7 +85,7 @@ def test_check_task_awsem(check_task_input, s3): s3.put_object(Body=jsoncontent.encode(), Key=postrunjson) with pytest.raises(MetricRetrievalException) as excinfo: retval = service.handler(check_task_input_modified, '') - assert 'error getting metrics' in str(excinfo) + assert 'error getting metrics' in str(excinfo.value) s3.delete_objects(Delete={'Objects': [{'Key': job_started}]}) s3.delete_objects(Delete={'Objects': [{'Key': job_success}]}) s3.delete_objects(Delete={'Objects': [{'Key': postrunjson}]}) @@ -115,7 +115,7 @@ def test_check_task_awsem_with_long_postrunjson(check_task_input, s3): s3.put_object(Body=jsoncontent.encode(), Key=postrunjson) with pytest.raises(MetricRetrievalException) as excinfo: retval = service.handler(check_task_input_modified, '') - assert 'error getting metrics' in str(excinfo) + assert 'error getting metrics' in str(excinfo.value) s3.delete_objects(Delete={'Objects': [{'Key': job_started}]}) s3.delete_objects(Delete={'Objects': [{'Key': job_success}]}) s3.delete_objects(Delete={'Objects': [{'Key': postrunjson}]}) diff --git a/tests/tibanna/unicorn/conftest.py b/tests/tibanna/unicorn/conftest.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/run_task_awsem/__init__.py b/tests/tibanna/unicorn/run_task_awsem/__init__.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/run_task_awsem/cwl_upload/child1.cwl 
b/tests/tibanna/unicorn/run_task_awsem/cwl_upload/child1.cwl old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/run_task_awsem/cwl_upload/child2.cwl b/tests/tibanna/unicorn/run_task_awsem/cwl_upload/child2.cwl old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/run_task_awsem/cwl_upload/main.cwl b/tests/tibanna/unicorn/run_task_awsem/cwl_upload/main.cwl old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/run_task_awsem/event.json b/tests/tibanna/unicorn/run_task_awsem/event.json deleted file mode 100644 index 6f7676448..000000000 --- a/tests/tibanna/unicorn/run_task_awsem/event.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "md5" - }, - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - }, - "input_env": {"TESTENV": 1234} - }, - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "", - "password": "", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/tests/tibanna/unicorn/run_task_awsem/event_chipseq.json b/tests/tibanna/unicorn/run_task_awsem/event_chipseq.json old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/run_task_awsem/event_cwl_upload.json b/tests/tibanna/unicorn/run_task_awsem/event_cwl_upload.json index 1bb663109..da9fb7ff2 100644 --- a/tests/tibanna/unicorn/run_task_awsem/event_cwl_upload.json +++ b/tests/tibanna/unicorn/run_task_awsem/event_cwl_upload.json @@ -1,37 +1,25 @@ { "args": { - "app_name": "md5", - "language": "cwl draft-3", "input_parameters": {}, - "cwl_version": "draft-3", "output_target": { "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" }, "secondary_output_target": {}, + "output_S3_bucket": "somebucket", "cwl_main_filename": "main.cwl", "cwl_child_filenames": ["child1.cwl", "child2.cwl"], - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": "0.0.4", "cwl_directory_local": "tests/tibanna/unicorn/run_task_awsem/cwl_upload/", "input_files": { "input_file": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" + "bucket_name": "somebucket", + "object_key": "someinput" } }, "input_env": {"TESTENV": 1234} }, "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "", - "password": "", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" + "instance_type": "t3.micro", + "log_bucket": "tibanna-output" } } + diff --git a/tests/tibanna/unicorn/run_task_awsem/event_dependency.json b/tests/tibanna/unicorn/run_task_awsem/event_dependency.json deleted file mode 100644 index 08002828c..000000000 --- 
a/tests/tibanna/unicorn/run_task_awsem/event_dependency.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "md5" - }, - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - }, - "dependency": { "exec_arn": ["arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default_7927:md5_test"]} - }, - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 120, - "instance_type": "", - - - "password": "", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/tests/tibanna/unicorn/run_task_awsem/event_dependency2.json b/tests/tibanna/unicorn/run_task_awsem/event_dependency2.json deleted file mode 100644 index 98c60b08d..000000000 --- a/tests/tibanna/unicorn/run_task_awsem/event_dependency2.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "md5" - }, - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - }, - "dependency": { "exec_arn": ["arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default_7412:md5_test", - "arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default_7412:md5_test2"]} - }, - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 120, - "instance_type": "", - - - "password": "", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/tests/tibanna/unicorn/run_task_awsem/event_dependency_fail.json b/tests/tibanna/unicorn/run_task_awsem/event_dependency_fail.json deleted file mode 100644 index 249f61920..000000000 --- a/tests/tibanna/unicorn/run_task_awsem/event_dependency_fail.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "md5" - }, - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": "0.0.4", - "cwl_directory_url": 
"https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - }, - "dependency": { "exec_arn": ["arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default_7412:md5_test", - "arn:aws:states:us-east-1:643366669028:execution:tibanna_unicorn_default_7412:md5_fail"]} - }, - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 120, - "instance_type": "", - - - "password": "", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/tests/tibanna/unicorn/run_task_awsem/event_fail.json b/tests/tibanna/unicorn/run_task_awsem/event_fail.json deleted file mode 100644 index dfb88b1f1..000000000 --- a/tests/tibanna/unicorn/run_task_awsem/event_fail.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "args": { - "secondary_output_target": {}, - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "output_target": { - "report": "some_random_output_file_name" - }, - "cwl_main_filename": "md5.cwl", - "cwl_version": "draft3", - "secondary_files": {}, - "output_S3_bucket": "random_nonexisting_outputbucket", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "random_nonexisting_bucket", - "object_key": "random_nonexisting_file" - } - } - }, - "app_name": "md5", - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "key_name": "", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "", - - "json_bucket": "some_random_json_bucket", - - "password": "thisisnotmypassword", - "log_bucket": "tibanna-output" - }, - "push_error_to_end": true -} diff --git a/tests/tibanna/unicorn/run_task_awsem/event_fixedname.json b/tests/tibanna/unicorn/run_task_awsem/event_fixedname.json deleted file mode 100644 index e40e526f0..000000000 --- a/tests/tibanna/unicorn/run_task_awsem/event_fixedname.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "md5", - "run_name": "md5_test" - }, - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - } - }, - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 120, - "instance_type": "", - - - "password": "", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/tests/tibanna/unicorn/run_task_awsem/event_fixedname2.json b/tests/tibanna/unicorn/run_task_awsem/event_fixedname2.json deleted file mode 100644 index 496b16412..000000000 --- a/tests/tibanna/unicorn/run_task_awsem/event_fixedname2.json +++ /dev/null @@ -1,42 +0,0 @@ 
-{ - "_tibanna": { - "env": "fourfront-webdev", - "run_type": "md5", - "run_name": "md5_test2" - }, - "args": { - "app_name": "md5", - "input_parameters": {}, - "cwl_child_filenames": [], - "cwl_version": "draft-3", - "output_target": { - "report": "ac18f2bb-c256-40bf-9562-cdc6179d6f9a/report" - }, - "secondary_output_target": {}, - "cwl_main_filename": "md5.cwl", - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": "0.0.4", - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/0.0.4/cwl_awsem/", - "input_files": { - "input_file": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "f4864029-a8ad-4bb8-93e7-5108f462ccaa/4DNFIRSRJH45.fastq.gz" - } - } - }, - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 120, - "instance_type": "", - - - "password": "", - "log_bucket": "tibanna-output", - "key_name": "4dn-encode" - } -} diff --git a/tests/tibanna/unicorn/run_task_awsem/event_omit_fields.json b/tests/tibanna/unicorn/run_task_awsem/event_omit_fields.json old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/run_task_awsem/event_omit_fields2.json b/tests/tibanna/unicorn/run_task_awsem/event_omit_fields2.json old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/run_task_awsem/event_repliseq.json b/tests/tibanna/unicorn/run_task_awsem/event_repliseq.json deleted file mode 100644 index 5db24a671..000000000 --- a/tests/tibanna/unicorn/run_task_awsem/event_repliseq.json +++ /dev/null @@ -1,220 +0,0 @@ -{ - "args": { - "secondary_output_target": {}, - "app_name": "repliseq-parta", - "input_parameters": { - "memperthread": "2G", - "nthreads": 4 - }, - "cwl_child_filenames": [ - "clip.cwl", - "align.cwl", - "filtersort.cwl", - "dedup.cwl", - "count.cwl" - ], - "output_target": { - "filtered_sorted_deduped_bam": "13eeef39-050e-4353-ae6f-ea6b8e0078bf/4DNFI85VSERH.bam", - "bam": "6088f403-c5f9-4f05-ae13-94c97ae4f5bc/4DNFIIU4BKGB.bam", - "filtered_sorted_bam": "57220af8-3ec4-44e3-91ca-33fb3995766a/4DNFIW9Z73VK.bam", - "count_bg": "675facd9-404c-4927-861e-f3e6d0e54f33/4DNFIB15HW7P.bg", - "dedup_qc_report": "4da92aa3-41c0-4689-b81d-43e3e829434f/dedup_qc_report" - }, - "cwl_main_filename": "repliseq-parta.cwl", - "secondary_files": {}, - "output_S3_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "app_version": null, - "cwl_directory_url": "https://raw.githubusercontent.com/4dn-dcic/pipelines-cwl/dev/cwl_awsem/", - "cwl_version": "draft3", - "input_files": { - "bwaIndex": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "1f53df95-4cf3-41cc-971d-81bb16c486dd/4DNFIZQZ39L9.bwaIndex.tgz" - }, - "chromsizes": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "4a6d10ee-2edb-4402-a98f-0edb1d58f5e9/4DNFI823LSII.chrom.sizes" - }, - "fastq": { - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "object_key": "a6ffb820-c600-4c32-9d99-249ed8029022/4DNFI2ZDNVFL.fastq.gz" - } - } - }, - "app_name": "repliseq-parta", - "_tibanna": { - "env": "fourfront-webdev", - "settings": { - "url": "https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_pony:repliseq-parta_9df4c60e-b2d3-43ae-bc80-fd4502fee6e7", - "run_type": "repliseq-parta", - "run_id": "9df4c60e-b2d3-43ae-bc80-fd4502fee6e7", - "env": "fourfront-webdev", - "run_name": 
"repliseq-parta_9df4c60e-b2d3-43ae-bc80-fd4502fee6e7" - } - }, - "ff_meta": { - "run_platform": "AWSEM", - "uuid": "4da92aa3-41c0-4689-b81d-43e3e829434f", - "parameters": [ - { - "workflow_argument_name": "memperthread", - "value": "2G" - }, - { - "workflow_argument_name": "nthreads", - "value": "4" - } - ], - "workflow": "146da22a-502d-4500-bf57-a7cf0b4b2364", - "title": "repliseq-parta run 2018-03-20 19:17:08.345369", - "award": "1U01CA200059-01", - "awsem_job_id": "", - "run_url": "https://console.aws.amazon.com/states/home?region=us-east-1#/executions/details/arn:aws:states:us-east-1:643366669028:execution:tibanna_pony:repliseq-parta_9df4c60e-b2d3-43ae-bc80-fd4502fee6e7", - "awsem_app_name": "repliseq-parta", - "lab": "4dn-dcic-lab", - "run_status": "started", - "output_files": [ - { - "extension": ".bam", - "format": "bam", - "workflow_argument_name": "bam", - "value": "6088f403-c5f9-4f05-ae13-94c97ae4f5bc", - "upload_key": "6088f403-c5f9-4f05-ae13-94c97ae4f5bc/4DNFIIU4BKGB.bam", - "type": "Output processed file", - "extra_files": null - }, - { - "extension": ".bam", - "format": "bam", - "workflow_argument_name": "filtered_sorted_bam", - "value": "57220af8-3ec4-44e3-91ca-33fb3995766a", - "upload_key": "57220af8-3ec4-44e3-91ca-33fb3995766a/4DNFIW9Z73VK.bam", - "type": "Output processed file", - "extra_files": null - }, - { - "extension": ".bam", - "format": "bam", - "workflow_argument_name": "filtered_sorted_deduped_bam", - "value": "13eeef39-050e-4353-ae6f-ea6b8e0078bf", - "upload_key": "13eeef39-050e-4353-ae6f-ea6b8e0078bf/4DNFI85VSERH.bam", - "type": "Output processed file", - "extra_files": null - }, - { - "type": "Output QC file", - "workflow_argument_name": "dedup_qc_report" - }, - { - "extension": ".bg", - "format": "bg", - "workflow_argument_name": "count_bg", - "value": "675facd9-404c-4927-861e-f3e6d0e54f33", - "upload_key": "675facd9-404c-4927-861e-f3e6d0e54f33/4DNFIB15HW7P.bg", - "type": "Output processed file", - "extra_files": null - } - ], - "input_files": [ - { - "ordinal": 1, - "workflow_argument_name": "bwaIndex", - "value": "1f53df95-4cf3-41cc-971d-81bb16c486dd" - }, - { - "ordinal": 1, - "workflow_argument_name": "fastq", - "value": "a6ffb820-c600-4c32-9d99-249ed8029022" - }, - { - "ordinal": 1, - "workflow_argument_name": "chromsizes", - "value": "4a6d10ee-2edb-4402-a98f-0edb1d58f5e9" - } - ] - }, - "output_bucket": "elasticbeanstalk-fourfront-webdev-wfoutput", - "pf_meta": [ - { - "status": "to be uploaded by workflow", - "source_experiments": [ - "8d131b21-e667-4888-9095-c42f7aa86a3f" - ], - "uuid": "6088f403-c5f9-4f05-ae13-94c97ae4f5bc", - "file_format": "bam", - "accession": "4DNFIIU4BKGB", - "award": "1U01CA200059-01", - "lab": "4dn-dcic-lab" - }, - { - "status": "to be uploaded by workflow", - "source_experiments": [ - "8d131b21-e667-4888-9095-c42f7aa86a3f" - ], - "uuid": "57220af8-3ec4-44e3-91ca-33fb3995766a", - "file_format": "bam", - "accession": "4DNFIW9Z73VK", - "award": "1U01CA200059-01", - "lab": "4dn-dcic-lab" - }, - { - "status": "to be uploaded by workflow", - "source_experiments": [ - "8d131b21-e667-4888-9095-c42f7aa86a3f" - ], - "uuid": "13eeef39-050e-4353-ae6f-ea6b8e0078bf", - "file_format": "bam", - "accession": "4DNFI85VSERH", - "award": "1U01CA200059-01", - "lab": "4dn-dcic-lab" - }, - { - "status": "to be uploaded by workflow", - "source_experiments": [ - "8d131b21-e667-4888-9095-c42f7aa86a3f" - ], - "uuid": "675facd9-404c-4927-861e-f3e6d0e54f33", - "file_format": "bg", - "accession": "4DNFIB15HW7P", - "award": "1U01CA200059-01", - "lab": 
"4dn-dcic-lab" - } - ], - "parameters": { - "memperthread": "2G", - "nthreads": 4 - }, - "config": { - "ebs_size": 0, - "ebs_type": "io1", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": 500, - "shutdown_min": 30, - "instance_type": "", - "key_name": "", - "password": "dragonfly", - "log_bucket": "tibanna-output" - }, - "workflow_uuid": "146da22a-502d-4500-bf57-a7cf0b4b2364", - "input_files": [ - { - "object_key": "4DNFIZQZ39L9.bwaIndex.tgz", - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "uuid": "1f53df95-4cf3-41cc-971d-81bb16c486dd", - "workflow_argument_name": "bwaIndex" - }, - { - "uuid": "a6ffb820-c600-4c32-9d99-249ed8029022", - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "workflow_argument_name": "fastq", - "object_key": "4DNFI2ZDNVFL.fastq.gz" - }, - { - "object_key": "4DNFI823LSII.chrom.sizes", - "bucket_name": "elasticbeanstalk-fourfront-webdev-files", - "uuid": "4a6d10ee-2edb-4402-a98f-0edb1d58f5e9", - "workflow_argument_name": "chromsizes" - } - ] -} diff --git a/tests/tibanna/unicorn/test_awsem.py b/tests/tibanna/unicorn/test_awsem.py old mode 100644 new mode 100755 index b21b88e64..f1042bef8 --- a/tests/tibanna/unicorn/test_awsem.py +++ b/tests/tibanna/unicorn/test_awsem.py @@ -1,6 +1,7 @@ import pytest import copy from tibanna import awsem +from tibanna.exceptions import MalFormattedRunJsonException @pytest.fixture @@ -10,7 +11,8 @@ def run_json_inputfile(): "dir": "somebucket", "path": "somefilepath", "profile": "", - "rename": "" + "unzip": "", + "mount": False } @@ -24,21 +26,21 @@ def run_json_input(): "dir": "dir1", "path": "path1", "profile": "", - "rename": "" + "unzip": "" }, "input_bam": { "class": "File", "dir": "dir2", "path": "path2", "profile": "", - "rename": "" + "unzip": "" }, "restrict_frags": { "class": "File", "dir": "dir3", "path": "path3", "profile": "", - "rename": "" + "unzip": "" }, }, "Secondary_files_data": {}, @@ -225,7 +227,7 @@ def test_RunJsonJob(run_json_job): assert r.App.App_name == 'someapp' # changing r_dict shoudn't affect r assert r.Input.Input_parameters['n'] == 2 assert r.Output.output_bucket_directory == 'somebucket' - assert r.Log['log_bucket_directory'] == 'tibanna-output' + assert r.Log.log_bucket_directory == 'tibanna-output' assert r.JOBID == "J55BCqwHx6N5" @@ -250,21 +252,48 @@ def test_RunJsonInputFile(run_json_inputfile): assert r.dir_ == 'somebucket' assert r.path == 'somefilepath' assert r.profile == '' - assert r.rename == '' + assert r.unzip == '' assert r.unzip == '' r_dict = r.as_dict() assert 'class' in r_dict assert 'dir' in r_dict assert 'path' in r_dict assert 'profile' in r_dict - assert 'rename' in r_dict + assert 'unzip' in r_dict assert 'unzip' in r_dict assert r_dict['class'] == 'File' assert r_dict['dir'] == 'somebucket' assert r_dict['path'] == 'somefilepath' assert r_dict['profile'] == '' - assert r_dict['rename'] == '' assert r_dict['unzip'] == '' + assert r_dict['unzip'] == '' + + +def test_AwsemRunJsonInputFile_rename_mount_error(run_json_inputfile): + # rename and mount cannot be used together + run_json_inputfile['rename'] = 'somerenamedpath' + run_json_inputfile['mount'] = True + with pytest.raises(MalFormattedRunJsonException) as ex: + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + assert 'rename and mount' in str(ex.value) + + +def test_AwsemRunJsonInputFile_unzip_mount_error(run_json_inputfile): + # unzip and mount cannot be used together + run_json_inputfile['unzip'] = 'gz' + run_json_inputfile['mount'] = True + with 
pytest.raises(MalFormattedRunJsonException) as ex: + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + assert 'unzip and mount' in str(ex.value) + + +def test_AwsemRunJsonInputFile_2d_array_empty_rename_mount_no_error(run_json_inputfile): + # rename is set but contains only empty lists, so mount should not raise an error + run_json_inputfile['path'] = [['path1', 'path2'], ['path3', 'path4']] + run_json_inputfile['rename'] = [[], []] + run_json_inputfile['mount'] = True + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + assert rj_infile.rename == [[], []] def test_RunJsonInput(run_json_input): @@ -282,3 +311,245 @@ def test_RunJsonApp(run_json_app): assert r_dict['App_name'] == 'someapp' r_dict['App_version'] = 'lalala' assert r.App_version == 'v1.0' # changing r_dict shouldn't affect r + + +def test_file2cwlfile(): + cwlfile = awsem.file2cwlfile('somedir/somefile', 'parentdir', '') + assert cwlfile == {'path': 'parentdir/somedir/somefile', 'class': 'File'} + + +def test_file2wdlfile(): + wdlfile = awsem.file2wdlfile('somedir/somefile', 'parentdir', '') + assert wdlfile == 'parentdir/somedir/somefile' + + +def test_file2cwlfile_unzip(): + cwlfile = awsem.file2cwlfile('somedir/somefile.gz', 'parentdir', 'gz') + assert cwlfile == {'path': 'parentdir/somedir/somefile', 'class': 'File'} + + +def test_file2cwlfile_unzip2(): + cwlfile = awsem.file2cwlfile('somedir/somefile.bz2', 'parentdir', 'bz2') + assert cwlfile == {'path': 'parentdir/somedir/somefile', 'class': 'File'} + + +def test_file2cwlfile_unzip3(): + cwlfile = awsem.file2cwlfile('somedir/somefile.gz', 'parentdir/', '') + assert cwlfile == {'path': 'parentdir/somedir/somefile.gz', 'class': 'File'} + + +def test_file2wdlfile_unzip(): + wdlfile = awsem.file2wdlfile('somedir/somefile.gz', 'parentdir', 'gz') + assert wdlfile == 'parentdir/somedir/somefile' + + +def test_file2wdlfile_unzip2(): + wdlfile = awsem.file2wdlfile('somedir/somefile.bz2', 'parentdir', 'bz2') + assert wdlfile == 'parentdir/somedir/somefile' + + +def test_file2wdlfile_unzip3(): + wdlfile = awsem.file2wdlfile('somedir/somefile.bz2', 'parentdir/', '') + assert wdlfile == 'parentdir/somedir/somefile.bz2' + + +def test_AwsemRunJsonInputFile_as_dict_as_cwl_input(run_json_inputfile): + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == {'path': '/data1/input/somefilepath', 'class': 'File'} + + +def test_AwsemRunJsonInputFile_as_dict_as_cwl_input_mount(run_json_inputfile): + run_json_inputfile['mount'] = True + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == {'path': '/data1/input-mounted-somebucket/somefilepath', 'class': 'File'} + + +def test_AwsemRunJsonInputFile_as_dict_as_cwl_rename(run_json_inputfile): + run_json_inputfile['rename'] = 'somerenamedpath' + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == {'path': '/data1/input/somerenamedpath', 'class': 'File'} + + +def test_AwsemRunJsonInputFile_as_dict_as_wdl_input(run_json_inputfile): + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == '/data1/input/somefilepath' + + +def test_AwsemRunJsonInputFile_as_dict_as_wdl_input_mount(run_json_inputfile): + 
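# with mount=True the input should resolve under the mount prefix /data1/input-mounted-somebucket/ rather than the download directory /data1/input/ +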
run_json_inputfile['mount'] = True + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == '/data1/input-mounted-somebucket/somefilepath' + + +def test_AwsemRunJsonInputFile_as_dict_as_wdl_rename(run_json_inputfile): + run_json_inputfile['rename'] = 'somerenamedpath' + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == '/data1/input/somerenamedpath' + + +def test_AwsemRunJsonInput_as_dict_as_cwl_input(run_json_input): + rj_in = awsem.AwsemRunJsonInput(**run_json_input) + cwlinput = rj_in.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == {'n': 2, + 'chromsize': {'path': '/data1/input/path1', 'class': 'File'}, + 'input_bam': {'path': '/data1/input/path2', 'class': 'File'}, + 'restrict_frags': {'path': '/data1/input/path3', 'class': 'File'}} + + +def test_AwsemRunJsonInput_as_dict_as_wdl_input(run_json_input): + rj_in = awsem.AwsemRunJsonInput(**run_json_input) + wdlinput = rj_in.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == {'n': 2, + 'chromsize': '/data1/input/path1', + 'input_bam': '/data1/input/path2', + 'restrict_frags': '/data1/input/path3'} + + +def test_AwsemRunJsonInputFile_as_dict_as_cwl_array(run_json_inputfile): + run_json_inputfile['path'] = ['path1', 'path2'] + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == [{'path': '/data1/input/path1', 'class': 'File'}, + {'path': '/data1/input/path2', 'class': 'File'}] + + +def test_AwsemRunJsonInputFile_as_dict_as_cwl_2d_array(run_json_inputfile): + run_json_inputfile['path'] = [['path1', 'path2'], ['path3', 'path4']] + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == [[{'path': '/data1/input/path1', 'class': 'File'}, + {'path': '/data1/input/path2', 'class': 'File'}], + [{'path': '/data1/input/path3', 'class': 'File'}, + {'path': '/data1/input/path4', 'class': 'File'}]] + + +def test_AwsemRunJsonInputFile_as_dict_as_cwl_2d_array_rename(run_json_inputfile): + run_json_inputfile['path'] = [['path1', 'path2'], ['path3', 'path4']] + run_json_inputfile['rename'] = [['renamed1', 'renamed2'], ['renamed3', 'renamed4']] + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == [[{'path': '/data1/input/renamed1', 'class': 'File'}, + {'path': '/data1/input/renamed2', 'class': 'File'}], + [{'path': '/data1/input/renamed3', 'class': 'File'}, + {'path': '/data1/input/renamed4', 'class': 'File'}]] + + +def test_AwsemRunJsonInputFile_as_dict_as_cwl_2d_array_unzip(run_json_inputfile): + run_json_inputfile['path'] = [['path1.gz', 'path2.gz'], ['path3.gz', 'path4.gz']] + run_json_inputfile['unzip'] = 'gz' + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == [[{'path': '/data1/input/path1', 'class': 'File'}, + {'path': '/data1/input/path2', 'class': 'File'}], + [{'path': '/data1/input/path3', 'class': 'File'}, + {'path': '/data1/input/path4', 'class': 'File'}]] + + +def 
test_AwsemRunJsonInputFile_as_dict_as_cwl_2d_array_unzip2(run_json_inputfile): + run_json_inputfile['path'] = [['path1.bz2', 'path2.bz2'], ['path3.bz2', 'path4.bz2']] + run_json_inputfile['unzip'] = 'bz2' + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == [[{'path': '/data1/input/path1', 'class': 'File'}, + {'path': '/data1/input/path2', 'class': 'File'}], + [{'path': '/data1/input/path3', 'class': 'File'}, + {'path': '/data1/input/path4', 'class': 'File'}]] + + +def test_AwsemRunJsonInputFile_as_dict_as_cwl_2d_array_mount(run_json_inputfile): + run_json_inputfile['path'] = [['path1', 'path2'], ['path3', 'path4']] + run_json_inputfile['mount'] = True + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + cwlinput = rj_infile.as_dict_as_cwl_input('/data1/input/', '/data1/input-mounted-') + assert cwlinput == [[{'path': '/data1/input-mounted-somebucket/path1', 'class': 'File'}, + {'path': '/data1/input-mounted-somebucket/path2', 'class': 'File'}], + [{'path': '/data1/input-mounted-somebucket/path3', 'class': 'File'}, + {'path': '/data1/input-mounted-somebucket/path4', 'class': 'File'}]] + + +def test_AwsemRunJsonInputFile_as_dict_as_wdl_array(run_json_inputfile): + run_json_inputfile['path'] = ['path1', 'path2'] + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == ['/data1/input/path1', + '/data1/input/path2'] + + +def test_AwsemRunJsonInputFile_as_dict_as_wdl_2d_array(run_json_inputfile): + run_json_inputfile['path'] = [['path1', 'path2'], ['path3', 'path4']] + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == [['/data1/input/path1', + '/data1/input/path2'], + ['/data1/input/path3', + '/data1/input/path4']] + + +def test_AwsemRunJsonInputFile_as_dict_as_wdl_2d_array_rename(run_json_inputfile): + run_json_inputfile['path'] = [['path1', 'path2'], ['path3', 'path4']] + run_json_inputfile['rename'] = [['renamed1', 'renamed2'], ['renamed3', 'renamed4']] + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == [['/data1/input/renamed1', + '/data1/input/renamed2'], + ['/data1/input/renamed3', + '/data1/input/renamed4']] + + +def test_AwsemRunJsonInputFile_as_dict_as_wdl_2d_array_unzip(run_json_inputfile): + run_json_inputfile['path'] = [['path1.gz', 'path2.gz'], ['path3.gz', 'path4.gz']] + run_json_inputfile['unzip'] = 'gz' + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == [['/data1/input/path1', + '/data1/input/path2'], + ['/data1/input/path3', + '/data1/input/path4']] + + +def test_AwsemRunJsonInputFile_as_dict_as_wdl_2d_array_unzip2(run_json_inputfile): + run_json_inputfile['path'] = [['path1.bz2', 'path2.bz2'], ['path3.bz2', 'path4.bz2']] + run_json_inputfile['unzip'] = 'bz2' + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == [['/data1/input/path1', + '/data1/input/path2'], + ['/data1/input/path3', + '/data1/input/path4']] + + +def 
test_AwsemRunJsonInputFile_as_dict_as_wdl_2d_array_mount(run_json_inputfile): + run_json_inputfile['path'] = [['path1', 'path2'], ['path3', 'path4']] + run_json_inputfile['mount'] = True + rj_infile = awsem.AwsemRunJsonInputFile(**run_json_inputfile) + wdlinput = rj_infile.as_dict_as_wdl_input('/data1/input/', '/data1/input-mounted-') + assert wdlinput == [['/data1/input-mounted-somebucket/path1', + '/data1/input-mounted-somebucket/path2'], + ['/data1/input-mounted-somebucket/path3', + '/data1/input-mounted-somebucket/path4']] + + +def test_AwsemPostRunJsonOutput_alt_output_target(postrun_json_output): + postrun_json_output['output_target'] = {'arg1': 'target1', 'arg2': 'target2'} + postrun_json_output['alt_cond_output_argnames'] = {'arg2': ['arg2a', 'arg2b']} + prjo = awsem.AwsemPostRunJsonOutput(**postrun_json_output) + assert prjo.alt_output_target(['arg1', 'arg2b']) == {'arg1': 'target1', 'arg2b': 'target2'} + + +def test_file_uri_cwl_wdl_error(): + rji_dict = {'file:///data1/input/file1': {'path': 'somefile', 'dir': 'somebucket', 'mount': False}} + runjson_input = awsem.AwsemRunJsonInput(**{'Input_files_data': rji_dict}) + with pytest.raises(MalFormattedRunJsonException) as ex: + runjson_input.check_input_files_key_compatibility('cwl') + assert 'argument name for CWL' in str(ex.value) + with pytest.raises(MalFormattedRunJsonException) as ex: + runjson_input.check_input_files_key_compatibility('wdl') + assert 'argument name for CWL' in str(ex.value) + runjson_input.check_input_files_key_compatibility('shell') + runjson_input.check_input_files_key_compatibility('snakemake') diff --git a/tests/tibanna/unicorn/test_ec2_utils.py b/tests/tibanna/unicorn/test_ec2_utils.py old mode 100644 new mode 100755 index 67d405273..dc98ca037 --- a/tests/tibanna/unicorn/test_ec2_utils.py +++ b/tests/tibanna/unicorn/test_ec2_utils.py @@ -23,7 +23,7 @@ def fun(): def test_args(): input_dict = {'args': {'input_files': {}, 'output_S3_bucket': 'somebucket', 'app_name': 'someapp'}} - args = Args(**input_dict['args']) + args = Args(**input_dict['args'], fill_default=False) args_dict = args.as_dict() assert 'input_files' in args_dict assert 'app_name' in args_dict @@ -108,7 +108,7 @@ def test_args_parse_input_files_format_error(): 'cwl_main_filename': 'main.cwl', 'cwl_directory_url': 'someurl', 'app_name': 'someapp'}} - args = Args(**input_dict['args']) + args = Args(**input_dict['args'], fill_default=False) with pytest.raises(MalFormattedInputJsonException) as ex: args.fill_default() assert ex @@ -122,7 +122,7 @@ def test_args_parse_input_files_format_error2(): 'cwl_main_filename': 'main.cwl', 'cwl_directory_url': 'someurl', 'app_name': 'someapp'}} - args = Args(**input_dict['args']) + args = Args(**input_dict['args'], fill_default=False) with pytest.raises(MalFormattedInputJsonException) as ex: args.fill_default() assert ex @@ -225,7 +225,7 @@ def test_execution_mem_cpu(): 'snakemake_main_filename': 'Snakefile', 'snakemake_directory_url': 'someurl', 'command': 'snakemake', - 'container_image': 'quay.io/snakemake/snakemake'}, + 'container_image': 'snakemake/snakemake'}, 'config': {'log_bucket': 'tibanna-output', 'mem': 1, 'cpu': 1}} execution = Execution(input_dict) unicorn_dict = execution.input_dict @@ -437,7 +437,7 @@ def test_execution_missing_field5(): 'output_S3_bucket': 'somebucket', 'snakemake_main_filename': 'Snakefile', 'snakemake_directory_url': 'someurl', - 'container_image': 'quay.io/snakemake/snakemake'}, + 'container_image': 'snakemake/snakemake'}, 'config': {'log_bucket': 'tibanna-output', 
'mem': 1, 'cpu': 1}} with pytest.raises(MissingFieldInInputJsonException) as ex: Execution(input_dict) diff --git a/tests/tibanna/unicorn/test_exceptions.py b/tests/tibanna/unicorn/test_exceptions.py old mode 100644 new mode 100755 index 62d4dd424..58fc22cd8 --- a/tests/tibanna/unicorn/test_exceptions.py +++ b/tests/tibanna/unicorn/test_exceptions.py @@ -27,7 +27,7 @@ def test_awsem_exception_not_enough_space_for_input(): assert res with pytest.raises(AWSEMJobErrorException) as exec_info: raise res - assert 'Not enough space for input files' in str(exec_info) + assert 'Not enough space for input files' in str(exec_info.value) def test_awsem_exception_no_space_for_docker(): @@ -39,8 +39,8 @@ def test_awsem_exception_no_space_for_docker(): assert res with pytest.raises(AWSEMJobErrorException) as exec_info: raise res - assert 'No space for docker' in str(exec_info) - assert 'tar.bz2: no space left' in str(exec_info) + assert 'No space for docker' in str(exec_info.value) + assert 'tar.bz2: no space left' in str(exec_info.value) def test_awsem_exception_no_space(): @@ -51,8 +51,8 @@ def test_awsem_exception_no_space(): assert res with pytest.raises(AWSEMJobErrorException) as exec_info: raise res - assert 'Not enough space' in str(exec_info) - assert '[fputs]' in str(exec_info) + assert 'Not enough space' in str(exec_info.value) + assert '[fputs]' in str(exec_info.value) def test_awsem_exception_cwl_missing_input(): @@ -66,8 +66,8 @@ def test_awsem_exception_cwl_missing_input(): assert res with pytest.raises(AWSEMJobErrorException) as exec_info: raise res - assert 'CWL missing input' in str(exec_info) - assert 'chromosomes' in str(exec_info) + assert 'CWL missing input' in str(exec_info.value) + assert 'chromosomes' in str(exec_info.value) def test_add_custom_errors(): @@ -83,8 +83,8 @@ def test_add_custom_errors(): assert res with pytest.raises(AWSEMJobErrorException) as exec_info: raise res - assert 'Unmatching pairs in fastq' in str(exec_info) - assert 'H3MVTCCXX:4:1101:1174861:0' in str(exec_info) + assert 'Unmatching pairs in fastq' in str(exec_info.value) + assert 'H3MVTCCXX:4:1101:1174861:0' in str(exec_info.value) def test_add_custom_errors2(): @@ -97,8 +97,8 @@ def test_add_custom_errors2(): assert res with pytest.raises(AWSEMJobErrorException) as exec_info: raise res - assert 'No peak called' in str(exec_info) - assert '1234567890abcdefg.regionPeak.gz' in str(exec_info) + assert 'No peak called' in str(exec_info.value) + assert '1234567890abcdefg.regionPeak.gz' in str(exec_info.value) def test_no_matching_error(): diff --git a/tests/tibanna/unicorn/test_nnested_array.py b/tests/tibanna/unicorn/test_nnested_array.py old mode 100644 new mode 100755 diff --git a/tests/tibanna/unicorn/test_top.py b/tests/tibanna/unicorn/test_top.py new file mode 100644 index 000000000..04f773610 --- /dev/null +++ b/tests/tibanna/unicorn/test_top.py @@ -0,0 +1,185 @@ +import os +from tibanna import top + + +top_contents = """ + +Timestamp: 2020-12-18-18:55:37 +top - 18:55:37 up 4 days, 3:18, 2 users, load average: 2.00, 2.00, 2.30 +Tasks: 344 total, 1 running, 343 sleeping, 0 stopped, 0 zombie +%Cpu(s): 6.6 us, 0.1 sy, 0.0 ni, 93.2 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st +KiB Mem : 12971188+total, 95469344 free, 28933200 used, 5309352 buff/cache +KiB Swap: 0 total, 0 free, 0 used. 
10002531+avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND +16962 root 20 0 36.456g 0.011t 19372 S 93.8 8.9 125:11.21 java -jar somejar.jar +17086 root 20 0 36.464g 0.016t 19572 S 70.0 13.0 178:59.28 bwa mem +17919 ubuntu 20 0 40676 3828 3144 R 6.2 0.0 0:00.01 top -b -n1 -c -i -w 10000 + + +Timestamp: 2020-12-18-18:56:37 +top - 18:56:37 up 4 days, 3:18, 2 users, load average: 2.00, 2.00, 2.30 +Tasks: 344 total, 1 running, 343 sleeping, 0 stopped, 0 zombie +%Cpu(s): 6.6 us, 0.1 sy, 0.0 ni, 93.2 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st +KiB Mem : 12971188+total, 95469344 free, 28933200 used, 5309352 buff/cache +KiB Swap: 0 total, 0 free, 0 used. 10002531+avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND +16962 root 20 0 36.456g 0.011t 19372 S 92.8 9.9 125:11.21 java -jar somejar.jar +17919 ubuntu 20 0 40676 3828 3144 R 5.2 0.0 0:00.01 top -b -n1 -c -i -w 10000 +""" + + +def test_empty_top(): + top1 = top.Top('') + top1.digest() + assert top1.processes == {} + assert top1.timestamps == [] + +def test_top(): + top1 = top.Top(top_contents) + print(top1.as_dict()) + assert hasattr(top1, 'processes') + + timestamp1 = '2020-12-18-18:55:37' + timestamp2 = '2020-12-18-18:56:37' + assert timestamp1 in top1.processes + print(top1.processes[timestamp1]) + assert len( top1.processes[timestamp1]) == 2 + top1dict = top1.processes[timestamp1][0].as_dict() + print(top1dict) + assert top1dict['pid'] == '16962' + assert top1dict['user'] == 'root' + assert top1dict['cpu'] == 93.8 + assert top1dict['mem'] == 8.9 + assert top1dict['command'] == 'java -jar somejar.jar' + top2dict = top1.processes[timestamp1][1].as_dict() + print(top2dict) + assert top2dict['pid'] == '17086' + assert top2dict['user'] == 'root' + assert top2dict['cpu'] == 70.0 + assert top2dict['mem'] == 13.0 + assert top2dict['command'] == 'bwa mem' + + assert timestamp2 in top1.processes + assert len( top1.processes[timestamp2]) == 1 + top3dict = top1.processes[timestamp2][0].as_dict() + print(top3dict) + assert top3dict['pid'] == '16962' + assert top3dict['user'] == 'root' + assert top3dict['cpu'] == 92.8 + assert top3dict['mem'] == 9.9 + assert top3dict['command'] == 'java -jar somejar.jar' + +def test_digest(): + top1 = top.Top(top_contents) + top1.digest() + timestamp1 = '2020-12-18-18:55:37' + timestamp2 = '2020-12-18-18:56:37' + assert top1.timestamps == [timestamp1, timestamp2] + assert top1.commands == ['bwa mem', 'java -jar somejar.jar'] # alphabetically ordered + assert top1.cpus == {'java -jar somejar.jar': [93.8, 92.8], 'bwa mem': [70.0, 0]} + assert top1.mems == {'java -jar somejar.jar': [8.9, 9.9], 'bwa mem': [13.0, 0]} + assert top1.total_cpu_per_command('java -jar somejar.jar') == 93.8 + 92.8 + assert top1.total_cpu_per_command('bwa mem') == 70.0 + 0 + assert top1.total_mem_per_command('java -jar somejar.jar') == 8.9 + 9.9 + assert top1.total_mem_per_command('bwa mem') == 13.0 + 0 + + # change process and redigest + timestamp3 = '2020-12-18-18:57:37' + top1.processes[timestamp3] = top1.processes[timestamp2].copy() + del top1.processes[timestamp2] + top1.digest() + + assert len(top1.processes) == 2 + assert top1.timestamps == [timestamp1, timestamp3] + +def test_write_to_csv(): + top1 = top.Top(top_contents) + top1.digest() + test_tsv_file = 'some_tsv_file' + top1.write_to_csv(test_tsv_file, delimiter='\t', base=1) + with open(test_tsv_file) as f: + content = f.read() + lines = content.splitlines() + assert len(lines) == 3 + assert lines[0] == 'timepoints\t\"bwa mem\"\t\"java -jar somejar.jar\"' + 
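# columns are the commands in alphabetical order; with base=1 the first timepoint is labeled 1 +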
assert lines[1] == '1\t70.0\t93.8' + assert lines[2] == '2\t0\t92.8' + + top1.write_to_csv(test_tsv_file, delimiter='\t', metric='mem', colname_for_timestamps='intervals', base=1) + with open(test_tsv_file) as f: + content = f.read() + lines = content.splitlines() + assert len(lines) == 3 + assert lines[0] == 'intervals\t\"bwa mem\"\t\"java -jar somejar.jar\"' + assert lines[1] == '1\t13.0\t8.9' + assert lines[2] == '2\t0\t9.9' + + # change time stamp to 2 minute interval and re-digest + top1.processes['2020-12-18-18:57:37'] = top1.processes['2020-12-18-18:56:37'].copy() + del top1.processes['2020-12-18-18:56:37'] + top1.digest() + print(top1.as_dict()) + + top1.write_to_csv(test_tsv_file, delimiter='\t', metric='mem', base=0, timestamp_start='2020-12-18-18:54:37') + with open(test_tsv_file) as f: + content = f.read() + lines = content.splitlines() + assert len(lines) == 5 + assert lines[0] == 'timepoints\t\"bwa mem\"\t\"java -jar somejar.jar\"' + assert lines[1] == '0\t0\t0' + assert lines[2] == '1\t13.0\t8.9' + assert lines[3] == '2\t0\t0' + assert lines[4] == '3\t0\t9.9' + + top1.write_to_csv(test_tsv_file, delimiter='\t', metric='mem', base=1, timestamp_start='2020-12-18-18:56:37', timestamp_end='2020-12-18-18:58:37') + with open(test_tsv_file) as f: + content = f.read() + lines = content.splitlines() + assert len(lines) == 4 + assert lines[0] == 'timepoints\t\"bwa mem\"\t\"java -jar somejar.jar\"' + assert lines[1] == '1\t0\t0' + assert lines[2] == '2\t0\t9.9' + assert lines[3] == '3\t0\t0' + + top1.write_to_csv(test_tsv_file, metric='mem', base=1, timestamp_start='2020-12-18-18:54:37', timestamp_end='2020-12-18-18:56:37') + with open(test_tsv_file) as f: + content = f.read() + lines = content.splitlines() + assert len(lines) == 4 + assert lines[0] == 'timepoints,\"bwa mem\",\"java -jar somejar.jar\"' + assert lines[1] == '1,0,0' + assert lines[2] == '2,13.0,8.9' + assert lines[3] == '3,0,0' + + top1.write_to_csv(test_tsv_file, metric='mem', base=1, timestamp_start='2020-12-18-18:53:02', timestamp_end='2020-12-18-18:56:22') + with open(test_tsv_file) as f: + content = f.read() + lines = content.splitlines() + assert len(lines) == 5 + assert lines[0] == 'timepoints,\"bwa mem\",\"java -jar somejar.jar\"' + assert lines[1] == '1,0,0' # 18:53:02 + assert lines[2] == '2,0,0' # 18:54:02 + assert lines[3] == '3,0,0' # 18:55:02 + assert lines[4] == '4,13.0,8.9' # 18:56:02 <- 18:55:37 (first entry), 18:56:22 (end time) are rounded to this one. 
+ + os.remove(test_tsv_file) + +def test_wrap_in_double_quotes(): + haha = top.Top.wrap_in_double_quotes('haha') + assert haha == '"haha"' + +def test_get_collapsed_commands(): + top1 = top.Top(top_contents) + + # no need to collapse (not too many commands) + collapsed_commands = top1.get_collapsed_commands(max_n_commands=16) + assert set(collapsed_commands) == set(['java -jar somejar.jar', 'bwa mem']) + + top1.processes['2020-12-18-18:56:37'][0].command = 'java -jar some_other_jar.jar' + collapsed_commands = top1.get_collapsed_commands(max_n_commands=16) + assert set(collapsed_commands) == set(['java -jar somejar.jar', 'bwa mem', 'java -jar some_other_jar.jar']) + collapsed_commands = top1.get_collapsed_commands(max_n_commands=2) + assert set(collapsed_commands) == set(['java -jar', 'bwa mem']) + diff --git a/tests/tibanna/unicorn/test_utils.py b/tests/tibanna/unicorn/test_utils.py old mode 100644 new mode 100755 diff --git a/tests/webdevtestlist b/tests/webdevtestlist old mode 100644 new mode 100755 diff --git a/tibanna/__init__.py b/tibanna/__init__.py old mode 100644 new mode 100755 index e69de29bb..316dad135 --- a/tibanna/__init__.py +++ b/tibanna/__init__.py @@ -0,0 +1,30 @@ +import logging + + +class TibannaLoggingFormatter(logging.Formatter): + + verbose_fmt = "[%(name)s] %(levelname)s: %(asctime)s - %(message)s" + info_fmt = "%(message)s" + + def format(self, record): + if record.levelno == logging.INFO: + tmpformatter = logging.Formatter(TibannaLoggingFormatter.info_fmt) + else: + tmpformatter = logging.Formatter(TibannaLoggingFormatter.verbose_fmt) + tmpformatter.datefmt = '%y-%m-%d %H:%M:%S' + return tmpformatter.format(record) + + +def create_logger(name='root'): + logger = logging.getLogger(name) + + # configuring severity level + logger.setLevel(logging.DEBUG) + + # configuring format requires creating a handler first + log_handler = logging.StreamHandler() + log_formatter = TibannaLoggingFormatter() + log_handler.setFormatter(log_formatter) + logger.addHandler(log_handler) + + return logger diff --git a/tibanna/__main__.py b/tibanna/__main__.py old mode 100644 new mode 100755 index 1bf726112..1149390df --- a/tibanna/__main__.py +++ b/tibanna/__main__.py @@ -5,6 +5,7 @@ # -*- coding: utf-8 -*- import argparse import inspect +import json from ._version import __version__ # from botocore.errorfactory import ExecutionAlreadyExists from .core import API @@ -29,9 +30,11 @@ def descriptions(self): 'kill_all': 'kill all the running jobs on a step function', 'list_sfns': 'list all step functions, optionally with a summary (-n)', 'log': 'print execution log or postrun json for a job', + 'info': 'print out information about a job', 'rerun': 'rerun a specific job', 'rerun_many': 'rerun all the jobs that failed after a given time point', 'run_workflow': 'run a workflow', + 'run_batch_workflows': 'run many workflows in a batch', 'setup_tibanna_env': 'set up usergroup environment on AWS.' + 'This function is called automatically by deploy_tibanna or deploy_unicorn.' 
+ 'Use it only when the IAM permissions need to be reset', @@ -39,7 +42,8 @@ def descriptions(self): 'users': 'list all users along with their associated tibanna user groups', 'plot_metrics': 'create a metrics report html and upload it to S3, or retrive one if one already exists', 'cost': 'print out the EC2/EBS cost of a job - it may not be ready for a day after a job finishes', - 'cleanup': 'remove all tibanna component for a usergroup (and suffix) including step function, lambdas IAM groups' + 'cleanup': 'remove all tibanna component for a usergroup (and suffix) including step function, lambdas IAM groups', + 'create_ami': 'create tibanna ami (Most users do not need this - tibanna AMIs are publicly available.)' } @property @@ -60,11 +64,22 @@ def args(self): 'help': "number of seconds between submission, to avoid drop-out (default 3)", 'type': int, 'default': 3}], - 'stat': - [{'flag': ["-s", "--sfn"], + 'run_batch_workflows': + [{'flag': ["-i", "--input-json-list"], + 'help': "list of tibanna input json files, e.g. -i input1.json [input2.json] [...]", + "nargs": "+"}, + {'flag': ["-s", "--sfn"], 'help': "tibanna step function name (e.g. 'tibanna_unicorn_monty'); " + "your current default is %s)" % TIBANNA_DEFAULT_STEP_FUNCTION_NAME, 'default': TIBANNA_DEFAULT_STEP_FUNCTION_NAME}, + {'flag': ["-S", "--sleep"], + 'help': "number of seconds between submission, to avoid drop-out (default 3)", + 'type': int, + 'default': 3}], + 'stat': + [{'flag': ["-s", "--sfn"], + 'help': "tibanna step function name (e.g. 'tibanna_unicorn_monty'); " + + "your current default is %s)" % TIBANNA_DEFAULT_STEP_FUNCTION_NAME}, {'flag': ["-t", "--status"], 'help': "filter by status; 'RUNNING'|'SUCCEEDED'|'FAILED'|'TIMED_OUT'|'ABORTED'"}, {'flag': ["-l", "--long"], @@ -72,7 +87,11 @@ def args(self): 'action': "store_true"}, {'flag': ["-n", "--nlines"], 'help': "number of lines to print", - 'type': int}], + 'type': int}, + {'flag': ["-j", "--job-ids"], + 'nargs': '+', + 'help': "job ids of the specific jobs to display, separated by space. " + + "This option cannot be combined with --nlines(-n), --status(-t) or --sfn(-s)"}], 'kill': [{'flag': ["-e", "--exec-arn"], 'help': "execution arn of the specific job to kill"}, @@ -99,8 +118,17 @@ def args(self): 'help': "tibanna step function name (e.g. 'tibanna_unicorn_monty'); " + "your current default is %s)" % TIBANNA_DEFAULT_STEP_FUNCTION_NAME, 'default': TIBANNA_DEFAULT_STEP_FUNCTION_NAME}, + {'flag': ["-r", "--runjson"], + 'help': "print out run json instead", 'action': "store_true"}, {'flag': ["-p", "--postrunjson"], - 'help': "print out postrun json instead", 'action': "store_true"}], + 'help': "print out postrun json instead", 'action': "store_true"}, + {'flag': ["-t", "--top"], + 'help': "print out top file (log file containing top command output) instead", 'action': "store_true"}, + {'flag': ["-T", "--top-latest"], + 'help': "print out the latest content of the top file", 'action': "store_true"}], + 'info': + [{'flag': ["-j", "--job-id"], + 'help': "job id of the specific job to log (alternative to --exec-arn/-e)"}], 'add_user': [{'flag': ["-u", "--user"], 'help': "user to add to a Tibanna usergroup"}, @@ -116,6 +144,8 @@ def args(self): 'rerun': [{'flag': ["-e", "--exec-arn"], 'help': "execution arn of the specific job to rerun"}, + {'flag': ["-j", "--job-id"], + 'help': "job id of the specific job to rerun (alternative to --exec-arn/-e)"}, {'flag': ["-s", "--sfn"], 'default': TIBANNA_DEFAULT_STEP_FUNCTION_NAME, 'help': "tibanna step function name (e.g. 
'tibanna_unicorn_monty'); " + @@ -275,7 +305,25 @@ def args(self): 'help': "quiet"}, {'flag': ["-E", "--do-not-ignore-errors"], 'action': 'store_true', - 'help': "do not ignore errors that occur due to a resource already deleted or non-existent"}] + 'help': "do not ignore errors that occur due to a resource already deleted or non-existent"}], + 'create_ami': + [{'flag': ["-p", "--make-public"], + 'help': "Make the Tibanna AMI public (most users do not need this)", + 'action': 'store_true'}, + {'flag': ["-B", "--build-from-scratch"], + 'help': "Build a new AMI starting from Ubuntu base image. " + + "This option will launch an instance for creating the new image " + + "as opposed to simply copying an existing Tibanna image.", + 'action': 'store_true'}, + {'flag': ["-I", "--source-image-to-copy-from"], + 'help': "The ID of the image to copy (e.g. 'ami-0a7ddfc7e412ab6e0' which is a default public Tibanna image " + + "for us-east-1). To use this option, turn off option -B."}, + {'flag': ["-R", "--source-image-region"], + 'help': "The region of the image to copy (e.g. 'us-east-1' if source image to copy from is 'ami-0a7ddfc7e412ab6e0'). " + + "To use this option, turn off option -B."}, + {'flag': ["-U", "--ubuntu-base-image"], + 'help': "The ID of the Ubuntu 20.04 image to build from (e.g. 'ami-0885b1f6bd170450c' for us-east-1). " + + "To use this option, turn on the option -B."}] } @@ -291,6 +339,11 @@ def run_workflow(input_json, sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, jobid='', d API().run_workflow(input_json, sfn=sfn, jobid=jobid, sleep=sleep, open_browser=not do_not_open_browser, verbose=True) +def run_batch_workflows(input_json_list, sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, sleep=3): + """run a workflow""" + API().run_batch_workflows(input_json_list, sfn=sfn, sleep=sleep, verbose=True) + + def setup_tibanna_env(buckets='', usergroup_tag='default', no_randomize=False, do_not_delete_public_access_block=False): """set up usergroup environment on AWS @@ -322,9 +375,11 @@ def list_sfns(numbers=False): API().list_sfns(numbers=numbers) -def log(exec_arn=None, job_id=None, exec_name=None, sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, postrunjson=False): - """print execution log or postrun json (-p) for a job""" - print(API().log(exec_arn, job_id, exec_name, sfn, postrunjson)) +def log(exec_arn=None, job_id=None, exec_name=None, sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, + runjson=False, postrunjson=False, top=False, top_latest=False): + """print execution log, run json (-r), postrun json (-p) or top (-t) for a job""" + print(API().log(exec_arn, job_id, exec_name, sfn, runjson=runjson, postrunjson=postrunjson, + top=top, top_latest=top_latest)) def kill_all(sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME): @@ -337,11 +392,15 @@ def kill(exec_arn=None, job_id=None, sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME): API().kill(exec_arn, job_id, sfn) -def rerun(exec_arn, sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, app_name_filter=None, +def info(job_id): + """prints out information about a job""" + print(json.dumps(API().info(job_id), indent=True)) + +def rerun(exec_arn=None, job_id=None, sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, app_name_filter=None, instance_type=None, shutdown_min=None, ebs_size=None, ebs_type=None, ebs_iops=None, overwrite_input_extra=None, key_name=None, name=None): """ rerun a specific job""" - API().rerun(exec_arn, sfn=sfn, + API().rerun(exec_arn=exec_arn, job_id=job_id, sfn=sfn, app_name_filter=app_name_filter, instance_type=instance_type, shutdown_min=shutdown_min, ebs_size=ebs_size, ebs_type=ebs_type, 
ebs_iops=ebs_iops, overwrite_input_extra=overwrite_input_extra, key_name=key_name, name=name) @@ -370,11 +429,11 @@ def rerun_many(sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, stopdate='13Feb2018', sto overwrite_input_extra=overwrite_input_extra, key_name=key_name, name=name) -def stat(sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, status=None, long=False, nlines=None): +def stat(sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, status=None, long=False, nlines=None, job_ids=None): """print out executions with details status can be one of 'RUNNING'|'SUCCEEDED'|'FAILED'|'TIMED_OUT'|'ABORTED' """ - API().stat(sfn=sfn, status=status, verbose=long, n=nlines) + API().stat(sfn=sfn, status=status, verbose=long, n=nlines, job_ids=job_ids) def plot_metrics(job_id, sfn=TIBANNA_DEFAULT_STEP_FUNCTION_NAME, force_upload=False, update_html_only=False, @@ -394,6 +453,13 @@ def cleanup(usergroup, suffix='', purge_history=False, do_not_remove_iam_group=F ignore_errors=not do_not_ignore_errors, purge_history=purge_history, verbose=not quiet) +def create_ami(make_public=False, build_from_scratch=False, source_image_to_copy_from=None, source_image_region=None, + ubuntu_base_image=None): + print(API().create_ami(make_public=make_public, build_from_scratch=build_from_scratch, + source_image_to_copy_from=source_image_to_copy_from, source_image_region=source_image_region, + ubuntu_base_image=ubuntu_base_image)) + + def main(Subcommands=Subcommands): """ Execute the program from the command line diff --git a/tibanna/_version.py b/tibanna/_version.py old mode 100644 new mode 100755 index ae48ecaf1..23b9f4bfd --- a/tibanna/_version.py +++ b/tibanna/_version.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.18.4" +__version__ = "1.0.0" diff --git a/tibanna/ami.py b/tibanna/ami.py new file mode 100755 index 000000000..e053ff189 --- /dev/null +++ b/tibanna/ami.py @@ -0,0 +1,141 @@ +import boto3 +import time +import os +import json +from datetime import datetime +from tibanna import create_logger + + +logger = create_logger(__name__) + + +class AMI(object): + + BASE_AMI = 'ami-0885b1f6bd170450c' # ubuntu 20.04 for us-east-1 + BASE_REGION = 'us-east-1' + USERDATA_DIR = os.path.dirname(os.path.abspath(__file__)) + USERDATA_FILE = os.path.join(USERDATA_DIR, 'create_ami_userdata') + AMI_NAME = 'tibanna-ami-' + datetime.strftime(datetime.today(), '%Y%m%d') # e.g tibanna-ami-20201113 + + def __init__(self, base_ami=None, base_region=None, userdata_file=None, ami_name=None): + if base_ami: + self.BASE_AMI = base_ami + if base_region: + self.BASE_REGION = base_region + if userdata_file is not None: + self.USERDATA_FILE = userdata_file + if ami_name: + self.AMI_NAME = ami_name + + @staticmethod + def launch_instance_for_tibanna_ami(keyname, userdata_file, base_ami): + + launch_args = {'ImageId': base_ami, + 'InstanceType': 't3.micro', + 'MaxCount': 1, + 'MinCount': 1, + 'TagSpecifications': [{'ResourceType': 'instance', + 'Tags': [{"Key": "Name", "Value": "tibanna_ami"}]}]} + if userdata_file: + with open(userdata_file, 'r') as f: + userdata_str = f.read() + launch_args.update({'UserData': userdata_str}) + + if keyname: + launch_args.update({'KeyName': keyname}) + + logger.debug("launch_args=" + str(launch_args)) + ec2 = boto3.client('ec2') + res = ec2.run_instances(**launch_args) + logger.debug("response from EC2 run_instances :" + str(res) + '\n\n') + instance_id = res['Instances'][0]['InstanceId'] + + return instance_id + + def create_ami_for_tibanna(self, 
keyname=None, make_public=False): + return self.create_ami(keyname=keyname, userdata_file=self.USERDATA_FILE, + base_ami=self.BASE_AMI, ami_name=self.AMI_NAME, + make_public=make_public, base_region=self.BASE_REGION) + + @staticmethod + def create_ami(keyname=None, userdata_file=USERDATA_FILE, + base_ami=BASE_AMI, + ami_name=AMI_NAME, + make_public=False, + base_region='us-east-1'): + + if not userdata_file: + logger.info("no userdata.. no need to launch an instance.. just copying image") + ec2 = boto3.client('ec2') + try: + res_copy = ec2.copy_image(Name=ami_name, SourceImageId=base_ami, SourceRegion=base_region) + except: + raise Exception("Failed to copy image") + + # I tried 5 min - it's not enough and it fails at the next step. + logger.info("waiting for 10min for the image to be created..") + time.sleep(10 * 60) + + new_image_id = res_copy['ImageId'] + + if make_public: + ec2.modify_image_attribute(ImageId=new_image_id, + LaunchPermission={'Add': [{'Group': 'all'}]}) + return new_image_id + + # launch an instance + try: + instance_id = AMI.launch_instance_for_tibanna_ami(keyname, userdata_file, base_ami) + logger.debug("instance_id=" + instance_id) + except: + raise Exception("Failed to launch an instance") + + logger.info("waiting for 10min for the instance to install everything and reboot..") + time.sleep(10 * 60) + + # create an image from the instance + try: + create_image_args = {'InstanceId': instance_id, 'Name': ami_name} + ec2 = boto3.client('ec2') + logger.info("creating an image...") + res_create = ec2.create_image(**create_image_args) + except: + raise Exception("Failed to create an image") + + logger.info("waiting for 10min for the image to be created..") + time.sleep(10 * 60) + + try: + ec2.terminate_instances(InstanceIds=[instance_id]) + except: + raise Exception("Failed to terminate the instance") + + new_image_id = res_create['ImageId'] + + if make_public: + ec2.modify_image_attribute(ImageId=new_image_id, + LaunchPermission={'Add': [{'Group': 'all'}]}) + return new_image_id + + + if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description="Arguments") + parser.add_argument("-k", "--key_name", help="key_name (default: no key)", default="") + parser.add_argument("-a", "--ami_name", help="ami_name (default: 'tibanna-ami-'", default="") + parser.add_argument("-b", "--base-ami", + help="base ami (default: ubuntu 20.04 for us-east-1 ('ami-0885b1f6bd170450c')", + default=BASE_AMI) + parser.add_argument("-e", "--no-user-data", help="do not use userdata", action="store_true") + parser.add_argument("-p", "--make-public", help="make the new AMI public", action="store_true") + args = parser.parse_args() + + if args.no_user_data: + new_image_id = create_ami_for_tibanna(args.key_name, ami_name=args.ami_name, userdata_file='', + base_ami=args.base_ami, make_public=args.make_public) + else: + new_image_id = create_ami_for_tibanna(args.key_name, ami_name=args.ami_name, base_ami=args.base_ami, + make_public=args.make_public) + logger.info("new_image_id=" + new_image_id) + diff --git a/tibanna/awsem.py b/tibanna/awsem.py old mode 100644 new mode 100755 index f22a060dc..b2a8cacfc --- a/tibanna/awsem.py +++ b/tibanna/awsem.py @@ -1,27 +1,51 @@ +import re +import copy from datetime import datetime from .base import SerializableObject from .ec2_utils import Config -from .exceptions import MalFormattedPostrunJsonException +from .exceptions import MalFormattedPostRunJsonException, MalFormattedRunJsonException from .vars import AWSEM_TIME_STAMP_FORMAT +from 
.nnested_array import flatten class AwsemRunJson(SerializableObject): - def __init__(self, Job, config): - self.create_Job(Job) - self.config = Config(**config) + def __init__(self, Job=None, config=None, strict=True): + if strict: + if not Job or not config: + raise MalFormattedPostRunJsonException("Job and config are required fields.") + if not Job: + Job = {} + self.create_Job(Job, strict=strict) + self.config = Config(**config) if config else None - def create_Job(self, Job): - self.Job = AwsemRunJsonJob(**Job) + def create_Job(self, Job, strict=True): + self.Job = AwsemRunJsonJob(**Job, strict=strict) class AwsemRunJsonJob(SerializableObject): - def __init__(self, App, Input, Output, JOBID, start_time, Log): + def __init__(self, App=None, Input=None, Output=None, JOBID='', + start_time=None, Log=None, strict=True): + if strict: + if App is None or Input is None or Output is None or not JOBID: + raise MalFormattedRunJsonException + if not App: + App = {} self.App = AwsemRunJsonApp(**App) + if not Input: + Input = {} self.Input = AwsemRunJsonInput(**Input) + if not Output: + Output = {} self.create_Output(Output) self.start_time = start_time self.JOBID = JOBID - self.Log = Log + if not Log: + Log = {} + self.Log = AwsemRunJsonLog(**Log) + + # format check + if self.App: + self.Input.check_input_files_key_compatibility(self.App.language) def create_Output(self, Output): self.Output = AwsemPostRunJsonOutput(**Output) @@ -32,7 +56,15 @@ def update(self, **kwargs): @property def start_time_as_str(self): - return datetime.strptime(self.start_time, AWSEM_TIME_STAMP_FORMAT) + if not self.start_time: + return '' + else: + return datetime.strptime(self.start_time, AWSEM_TIME_STAMP_FORMAT) + + +class AwsemRunJsonLog(SerializableObject): + def __init__(self, log_bucket_directory=None): + self.log_bucket_directory = log_bucket_directory class AwsemRunJsonApp(SerializableObject): @@ -50,13 +82,23 @@ def __init__(self, App_name=None, App_version=None, language=None, self.wdl_url = wdl_url self.main_wdl = main_wdl self.other_wdl_files = other_wdl_files - + self.container_image = container_image + self.command = command + self.snakemake_url = snakemake_url + self.main_snakemake = main_snakemake + self.other_snakemake_files = other_snakemake_files class AwsemRunJsonInput(SerializableObject): - def __init__(self, Input_files_data, Input_parameters, Secondary_files_data, + def __init__(self, Input_files_data=None, Input_parameters=None, Secondary_files_data=None, # Input_files_reference is for older postrunjson # Env is missing in older postrunjson Input_files_reference=None, Env=None): + if not Input_files_data: + Input_files_data = {} + if not Input_parameters: + Input_parameters = {} + if not Secondary_files_data: + Secondary_files_data = {} self.Input_files_data = {k: AwsemRunJsonInputFile(**v) for k, v in Input_files_data.items()} self.Secondary_files_data = {k: AwsemRunJsonInputFile(**v) for k, v in Secondary_files_data.items()} self.Input_parameters = Input_parameters @@ -64,18 +106,50 @@ def __init__(self, Input_files_data, Input_parameters, Secondary_files_data, if Input_files_reference: self.Input_files_reference = {k: AwsemRunJsonInputFile(**v) for k, v in Input_files_reference.items()} + def as_dict_as_cwl_input(self, input_dir='', input_mount_dir_prefix=''): + d = {k: v.as_dict_as_cwl_input(input_dir, input_mount_dir_prefix) for k, v in self.Input_files_data.items()} + d.update(self.Input_parameters) + return d + + def as_dict_as_wdl_input(self, input_dir='', input_mount_dir_prefix=''): + d = 
{k: v.as_dict_as_wdl_input(input_dir, input_mount_dir_prefix) for k, v in self.Input_files_data.items()} + d.update(self.Input_parameters) + return d + + def check_input_files_key_compatibility(self, language): + for category in ["Input_files_data", "Secondary_files_data"]: + for inkey in getattr(self, category): + if inkey.startswith('file://'): + if language not in ['shell', 'snakemake']: + raise MalFormattedRunJsonException('input file has to be defined with argument name for CWL and WDL') + target = inkey.replace('file://', '') + if not target.startswith('/data1/'): + raise Exception('input target directory must be in /data1/') + if not target.startswith('/data1/' + language) and \ + not target.startswith('/data1/input') and \ + not target.startswith('/data1/out'): + raise Exception('input target directory must be in /data1/input, /data1/out or /data1/%s' % language) + class AwsemRunJsonInputFile(SerializableObject): - def __init__(self, path, profile='', rename='', unzip='', **kwargs): # kwargs includes 'dir' and 'class' + def __init__(self, path, profile='', rename='', unzip='', mount=False, **kwargs): # kwargs includes 'dir' and 'class' # profile and rename are missing in the old postrunjson self.path = path self.profile = profile self.rename = rename self.unzip = unzip + self.mount = mount # handling reserved name key self.class_ = kwargs.get('class', None) self.dir_ = kwargs.get('dir', None) + # field compatibility check + errmsg = "Incompatible input for file %s: %s and mount cannot be used together." + if self.rename and len(flatten(self.rename))>0 and self.mount: # the second condition covers e.g. [] + raise MalFormattedRunJsonException(errmsg % (self.path, 'rename')) + if self.unzip and self.mount: + raise MalFormattedRunJsonException(errmsg % (self.path, 'unzip')) + def as_dict(self): d = super().as_dict() # handling reserved name key @@ -86,36 +160,142 @@ def as_dict(self): del(d[rk_alt]) return d + def as_dict_as_cwl_input(self, input_dir='', input_mount_dir_prefix=''): + if self.mount: + input_dir = input_mount_dir_prefix + self.dir_ + else: + input_dir = input_dir + if self.rename: + if isinstance(self.rename, list): + path = self.rename[:] + else: + path = self.rename + else: + path = self.path + if isinstance(path, list): + d = [] + for pi in path: + if isinstance(pi, list): + nested = [] + for ppi in pi: + if isinstance(ppi, list): + nested.append([file2cwlfile(pppi, input_dir, self.unzip) for pppi in ppi]) + else: + nested.append(file2cwlfile(ppi, input_dir, self.unzip)) + d.append(nested) + else: + d.append(file2cwlfile(pi, input_dir, self.unzip)) + return d + else: + return file2cwlfile(path, input_dir, self.unzip) + + def as_dict_as_wdl_input(self, input_dir='', input_mount_dir_prefix=''): + if self.mount: + input_dir = input_mount_dir_prefix + self.dir_ + else: + input_dir = input_dir + if self.rename: + if isinstance(self.rename, list): + path = list(self.rename) + else: + path = self.rename + else: + path = self.path + if isinstance(path, list): + d = [] + for pi in path: + if isinstance(pi, list): + nested = [] + for ppi in pi: + if isinstance(ppi, list): + nested.append([file2wdlfile(pppi, input_dir, self.unzip) for pppi in ppi]) + else: + nested.append(file2wdlfile(ppi, input_dir, self.unzip)) + d.append(nested) + else: + d.append(file2wdlfile(pi, input_dir, self.unzip)) + return d + else: + return file2wdlfile(path, input_dir, self.unzip) + class AwsemRunJsonOutput(SerializableObject): def __init__(self, output_bucket_directory=None, output_target=None, 
secondary_output_target=None, alt_cond_output_argnames=None): - self.output_bucket_directory = output_bucket_directory - self.output_target = output_target - self.secondary_output_target = secondary_output_target - self.alt_cond_output_argnames = alt_cond_output_argnames + self.output_bucket_directory = output_bucket_directory or {} + self.output_target = output_target or {} + self.secondary_output_target = secondary_output_target or {} + self.alt_cond_output_argnames = alt_cond_output_argnames or {} + + for u, v in self.secondary_output_target.items(): + if not isinstance(v, list): + self.secondary_output_target[u] = [v] + + def alt_output_target(self, argname_list): + """In case conditional alternative output targets exist, return alternative output target + where the output target keys are replaced with the alternative names. + If not, return output_target itself. + This function does not actually modify output_target. + It cannot be applied to custom output targets starting with 'file://' + We don't need to do the same for secondary files because + conditional alternative names only occur in WDL which does not support secondary files""" + + # first create a list of keys to be replaced + replace_list = [] + for k in self.output_target: + if k.startswith('file://'): + continue + if k not in argname_list: + if k in self.alt_cond_output_argnames: + key_exists = False # initialize + for k_alt in self.alt_cond_output_argnames[k]: + if k_alt in argname_list: + key_exists = True + replace_list.append((k, k_alt)) + break + if not key_exists: + raise Exception("output target key %s doesn't exist in argname list" % k) + else: + raise Exception("output target key %s doesn't exist in argname list" % k) + + # return the alternated output_target + alt_output_target = copy.deepcopy(self.output_target) + for k, k_alt in replace_list: + alt_output_target[k_alt] = alt_output_target[k] + del alt_output_target[k] + return alt_output_target class AwsemPostRunJson(AwsemRunJson): - def __init__(self, Job, config, commands=None,log=None): - super().__init__(Job, config) + def __init__(self, Job=None, config=None, commands=None,log=None, strict=True): + if strict: + if not Job or not config: + raise MalFormattedPostRunJsonException("Job and config are required fields.") + super().__init__(Job, config, strict=strict) if commands: self.commands = commands if log: self.log = log - def create_Job(self, Job): - self.Job = AwsemPostRunJsonJob(**Job) + def add_commands(self, command): + self.command = command + + def create_Job(self, Job, strict=True): + self.Job = AwsemPostRunJsonJob(**Job, strict=strict) class AwsemPostRunJsonJob(AwsemRunJsonJob): - def __init__(self, App, Input, Output, JOBID, - start_time, end_time=None, status=None, Log=None, + def __init__(self, App=None, Input=None, Output=None, JOBID='', + start_time=None, end_time=None, status=None, Log=None, total_input_size=None, total_output_size=None, total_tmp_size=None, # older postrunjsons don't have these fields filesystem='', instance_id='', - Metrics=None): - super().__init__(App, Input, Output, JOBID, start_time, Log) + Metrics=None, strict=True): + if strict: + if App is None or Input is None or Output is None or not JOBID or start_time is None: + errmsg = "App, Input, Output, JOBID and start_time are required fields" + raise MalFormattedPostRunJsonException(errmsg) + super().__init__(App, Input, Output, JOBID, start_time, Log, strict=strict) self.end_time = end_time self.status = status self.filesystem = filesystem @@ -135,6 +315,10 @@ def 
end_time_as_str(self): except: return None + def add_filesystem(self, filesystem): + self.filesystem = filesystem + + class AwsemPostRunJsonOutput(AwsemRunJsonOutput): def __init__(self, output_bucket_directory=None, output_target=None, secondary_output_target=None, alt_cond_output_argnames=None, @@ -144,12 +328,19 @@ def __init__(self, output_bucket_directory=None, output_target=None, super().__init__(output_bucket_directory, output_target, secondary_output_target, alt_cond_output_argnames) if 'Output files' in kwargs: - self.Output_files_ = {k: AwsemPostRunJsonOutputFile(**v) for k, v in kwargs['Output files'].items()} + self.add_output_files(kwargs['Output files']) + else: + self.Output_files_ = {} @property def output_files(self): return self.Output_files_ + def add_output_files(self, output_files): + """add or replace output files. output_files is a dictionary with argnames as keys + and a dict form of AwsemPostRunJsonOutputFile objects as values""" + self.Output_files_ = {k: AwsemPostRunJsonOutputFile(**v) for k, v in output_files.items()} + def as_dict(self): d = super().as_dict() # handling reserved name key @@ -160,7 +351,7 @@ def as_dict(self): class AwsemPostRunJsonOutputFile(SerializableObject): - def __init__(self, path, target, basename=None, checksum=None, + def __init__(self, path, target=None, basename=None, checksum=None, location=None, md5sum=None, size=None, secondaryFiles=None, **kwargs): # kwargs includes 'class' # both WDL and CWL @@ -176,12 +367,15 @@ def __init__(self, path, target, basename=None, checksum=None, if isinstance(secondaryFiles, list): self.secondaryFiles = [AwsemPostRunJsonOutputFile(**sf) for sf in secondaryFiles] else: - raise MalFormattedPostrunJsonException("secondaryFiles must be a list") + raise MalFormattedPostRunJsonException("secondaryFiles must be a list") else: self.secondaryFiles = None # handling reserved name key self.class_ = kwargs.get('class', None) + def add_target(self, target): + self.target = target + def as_dict(self): d = super().as_dict() # handling reserved name key @@ -189,3 +383,19 @@ def as_dict(self): d['class'] = d['class_'] del(d['class_']) return d + + +def file2cwlfile(filename, dirname, unzip): + if unzip: + filename = re.match('(.+)\.{0}$'.format(unzip), filename).group(1) + if dirname.endswith('/'): + dirname = dirname.rstrip('/') + return {"class": 'File', "path": dirname + '/' + filename} + + +def file2wdlfile(filename, dirname, unzip): + if unzip: + filename = re.match('(.+)\.{0}$'.format(unzip), filename).group(1) + if dirname.endswith('/'): + dirname = dirname.rstrip('/') + return dirname + '/' + filename diff --git a/tibanna/base.py b/tibanna/base.py old mode 100644 new mode 100755 diff --git a/tibanna/check_task.py b/tibanna/check_task.py old mode 100644 new mode 100755 index 26c0ffe0f..1cd339b88 --- a/tibanna/check_task.py +++ b/tibanna/check_task.py @@ -2,11 +2,11 @@ import boto3 import json import copy +from . import create_logger from .cw_utils import TibannaResource from datetime import datetime, timedelta from dateutil.tz import tzutc from .utils import ( - printlog, does_key_exist, read_s3 ) @@ -29,6 +29,9 @@ RESPONSE_JSON_CONTENT_INCLUSION_LIMIT = 30000 # strictly it is 32,768 but just to be safe. 
+logger = create_logger(__name__) + + def check_task(input_json): return CheckTask(input_json).run() @@ -71,11 +74,11 @@ def run(self): try: self.handle_postrun_json(bucket_name, jobid, self.input_json, public_read=public_postrun_json) except Exception as e: - printlog("error handling postrun json %s" % str(e)) + logger.warning("error occurred while handling postrun json but continuing. %s" % str(e)) eh = AWSEMErrorHandler() if 'custom_errors' in self.input_json['args']: eh.add_custom_errors(self.input_json['args']['custom_errors']) - log = API().log(job_id=jobid, logbucket=bucket_name) + log = self.API().log(job_id=jobid, logbucket=bucket_name) ex = eh.parse_log(log) if ex: msg_aug = str(ex) + ". For more info - " + eh.general_awsem_check_log_msg(jobid) @@ -104,7 +107,7 @@ def run(self): ec2_state = res['Reservations'][0]['Instances'][0]['State']['Name'] if ec2_state in ['stopped', 'shutting-down', 'terminated']: errmsg = "EC2 is terminated unintendedly for job %s - please rerun." % jobid - printlog(errmsg) + logger.error(errmsg) raise EC2UnintendedTerminationException(errmsg) # check CPU utilization for the past hour @@ -142,7 +145,7 @@ def terminate_idle_instance(self, jobid, instance_id, cpu, ebs_read): "Nothing has been running for the past hour for job %s," "but cannot terminate the instance - cpu utilization (%s) : %s" ) % (jobid, str(cpu), str(e)) - printlog(errmsg) + logger.error(errmsg) raise EC2IdleException(errmsg) def handle_postrun_json(self, bucket_name, jobid, input_json, public_read=False): @@ -154,8 +157,8 @@ def handle_postrun_json(self, bucket_name, jobid, input_json, public_read=False) prj = AwsemPostRunJson(**postrunjsoncontent) prj.Job.update(instance_id=input_json['config'].get('instance_id', '')) self.handle_metrics(prj) - printlog("inside funtion handle_postrun_json") - printlog("content=\n" + json.dumps(prj.as_dict(), indent=4)) + logger.debug("inside function handle_postrun_json") + logger.debug("content=\n" + json.dumps(prj.as_dict(), indent=4)) # upload postrun json file back to s3 acl = 'public-read' if public_read else 'private' try: @@ -191,5 +194,8 @@ def handle_metrics(self, prj): except Exception as e: raise MetricRetrievalException("error getting metrics: %s" % str(e)) prj.Job.update(Metrics=resources.as_dict()) - resources.plot_metrics(prj.config.instance_type, directory='/tmp/tibanna_metrics/') - resources.upload(bucket=prj.config.log_bucket, prefix=prj.Job.JOBID + '.metrics/') + self.API().plot_metrics(prj.Job.JOBID, directory='/tmp/tibanna_metrics/', + force_upload=True, open_browser=False, + endtime=prj.Job.end_time_as_str or datetime.now(), + filesystem=prj.Job.filesystem, + instance_id=prj.Job.instance_id) diff --git a/tibanna/core.py b/tibanna/core.py old mode 100644 new mode 100755 index a84878969..f3bc9b349 --- a/tibanna/core.py +++ b/tibanna/core.py @@ -4,7 +4,6 @@ import json import time import copy -import logging import importlib import shutil import subprocess @@ -13,6 +12,7 @@ from dateutil.tz import tzutc from uuid import uuid4, UUID from types import ModuleType +from . 
import create_logger from .vars import ( _tibanna, AWS_ACCOUNT_NUMBER, @@ -20,11 +20,7 @@ TIBANNA_DEFAULT_STEP_FUNCTION_NAME, STEP_FUNCTION_ARN, EXECUTION_ARN, - AMI_ID_CWL_V1, - AMI_ID_CWL_DRAFT3, - AMI_ID_WDL, - AMI_ID_SNAKEMAKE, - AMI_ID_SHELL, + AMI_ID, TIBANNA_REPO_NAME, TIBANNA_REPO_BRANCH, TIBANNA_PROFILE_ACCESS_KEY, @@ -39,7 +35,6 @@ ) from .utils import ( _tibanna_settings, - printlog, create_jobid, does_key_exist, read_s3, @@ -51,6 +46,7 @@ UnicornInput, upload_workflow_to_s3 ) +from .ami import AMI # from botocore.errorfactory import ExecutionAlreadyExists from .stepfunction import StepFunctionUnicorn from .awsem import AwsemRunJson, AwsemPostRunJson @@ -61,8 +57,7 @@ # logger -LOG = logging.getLogger(__name__) -LOG.setLevel(logging.INFO) +logger = create_logger(__name__) UNICORN_LAMBDAS = ['run_task_awsem', 'check_task_awsem'] @@ -183,14 +178,14 @@ def run_workflow(self, input_json, sfn=None, unicorn_input = UnicornInput(data) args = unicorn_input.args if args.language.startswith('cwl') and args.cwl_directory_local or \ - args.language == 'wdl' and args.wdl_directory_local or \ + args.language in ['wdl', 'wdl_v1', 'wdl_draft2'] and args.wdl_directory_local or \ args.language == 'snakemake' and args.snakemake_directory_local: upload_workflow_to_s3(unicorn_input) data['args'] = args.as_dict() # update args # submit job as an execution aws_input = json.dumps(data) if verbose: - print("about to start run %s" % run_name) + logger.info("about to start run %s" % run_name) # trigger the step function to run try: response = client.start_execution( @@ -206,19 +201,29 @@ def run_workflow(self, input_json, sfn=None, data[_tibanna]['response'] = response if verbose: # print some info - print("response from aws was: \n %s" % response) - print("url to view status:") - print(data[_tibanna]['url']) - print("JOBID %s submitted" % data['jobid']) - print("EXECUTION ARN = %s" % data[_tibanna]['exec_arn']) + logger.info("response from aws was: \n %s" % response) + logger.info("url to view status:") + logger.info(data[_tibanna]['url']) + logger.info("JOBID %s submitted" % data['jobid']) + logger.info("EXECUTION ARN = %s" % data[_tibanna]['exec_arn']) if 'cloudwatch_dashboard' in data['config'] and data['config']['cloudwatch_dashboard']: cw_db_url = 'https://console.aws.amazon.com/cloudwatch/' + \ 'home?region=%s#dashboards:name=awsem-%s' % (AWS_REGION, jobid) - print("Cloudwatch Dashboard = %s" % cw_db_url) + logger.info("Cloudwatch Dashboard = %s" % cw_db_url) if open_browser and shutil.which('open') is not None: subprocess.call(["open", data[_tibanna]['url']]) return data + def run_batch_workflows(self, input_json_list, sfn=None, + env=None, sleep=3, verbose=True, open_browser=True): + """given a list of input json, run multiple workflows""" + run_infos = [] + for input_json in input_json_list: + run_info = self.run_workflow(input_json, env=env, sfn=sfn, sleep=sleep, verbose=verbose, + open_browser=False) + run_infos.append(run_info) + return run_infos + def add_to_dydb(self, awsem_job_id, execution_name, sfn, logbucket, verbose=True): time_stamp = datetime.strftime(datetime.utcnow(), '%Y%m%d-%H:%M:%S-UTC') dydb = boto3.client('dynamodb', region_name=AWS_REGION) @@ -227,7 +232,7 @@ def add_to_dydb(self, awsem_job_id, execution_name, sfn, logbucket, verbose=True res = dydb.describe_table(TableName=DYNAMODB_TABLE) except Exception as e: if verbose: - printlog("Not adding to dynamo table: %s" % e) + logger.info("Not adding to dynamo table: %s" % e) return try: response = dydb.put_item( @@ -251,7 
+256,7 @@ def add_to_dydb(self, awsem_job_id, execution_name, sfn, logbucket, verbose=True } ) if verbose: - printlog(response) + logger.info("Successfully put item to dynamoDB: " + str(response)) except Exception as e: raise(e) @@ -262,7 +267,9 @@ def check_status(self, exec_arn=None, job_id=None): ddinfo = self.info(job_id) if not ddinfo: raise Exception("Can't find exec_arn from the job_id") - exec_arn = ddinfo.get('exec_arn', '') + exec_name = ddinfo.get('Execution Name', '') + sfn = ddinfo.get('Step Function', '') + exec_arn = EXECUTION_ARN(exec_name, sfn) if not exec_arn: raise Exception("Can't find exec_arn from the job_id") sts = boto3.client('stepfunctions', region_name=AWS_REGION) @@ -275,7 +282,9 @@ def check_output(self, exec_arn=None, job_id=None): ddinfo = self.info(job_id) if not ddinfo: raise Exception("Can't find exec_arn from the job_id") - exec_arn = ddinfo.get('exec_arn', '') + exec_name = ddinfo.get('Execution Name', '') + sfn = ddinfo.get('Step Function', '') + exec_arn = EXECUTION_ARN(exec_name, sfn) if not exec_arn: raise Exception("Can't find exec_arn from the job_id") sts = boto3.client('stepfunctions', region_name=AWS_REGION) @@ -296,7 +305,7 @@ def get_dd(self, job_id): 'ComparisonOperator': 'EQ'}}) return ddres except Exception as e: - printlog("Warning: dynamoDB entry not found: %s" % e) + logger.warning("DynamoDB entry not found: %s" % e) return None def info(self, job_id): @@ -311,27 +320,12 @@ def get_info_from_dd(self, ddres): if 'Items' in ddres: try: dditem = ddres['Items'][0] - exec_name = dditem['Execution Name']['S'] - sfn = dditem['Step Function']['S'] - exec_arn = EXECUTION_ARN(exec_name, sfn) - if 'instance_id' in dditem: - instance_id = dditem['instance_id']['S'] - else: - instance_id = '' - if 'Log Bucket' in dditem: - logbucket = dditem['Log Bucket']['S'] - else: - logbucket = '' - return {'exec_name': exec_name, - 'exec_arn': exec_arn, - 'step_function': sfn, - 'instance_id': instance_id, - 'log_bucket': logbucket} + return dd_utils.item2dict(dditem) except Exception as e: - printlog("Warning: dynamoDB fields not found: %s" % e) + logger.warning("DynamoDB fields not found: %s" % e) return None else: - printlog("Warning: dynamoDB Items field not found:") + logger.warning("DynamoDB Items field not found:") return None def kill(self, exec_arn=None, job_id=None, sfn=None): @@ -348,25 +342,25 @@ def kill(self, exec_arn=None, job_id=None, sfn=None): if tag['Key'] == 'Type' and tag['Value'] != 'awsem': continue if tag['Key'] == 'Name' and tag['Value'] == 'awsem-' + jobid: - printlog("terminating EC2 instance") + logger.info("terminating EC2 instance") response = i.terminate() - printlog(response) + logger.info("Successfully terminated instance: " + str(response)) terminated = True break if terminated: break - printlog("terminating step function execution") + logger.info("terminating step function execution") resp_sf = sf.stop_execution(executionArn=exec_arn, error="Aborted") - printlog(resp_sf) + logger.info("Successfully terminated step function execution: " + str(resp_sf)) elif job_id: ec2 = boto3.client('ec2') res = ec2.describe_instances(Filters=[{'Name': 'tag:Name', 'Values': ['awsem-' + job_id]}]) if not res['Reservations']: raise("instance not available - if you just submitted the job, try again later") instance_id = res['Reservations'][0]['Instances'][0]['InstanceId'] - printlog("terminating EC2 instance") + logger.info("terminating EC2 instance") resp_term = ec2.terminate_instances(InstanceIds=[instance_id]) - printlog(resp_term) + 
logger.info("Successfully terminated instance: " + str(resp_term)) # first try dynanmodb to get logbucket ddres = dict() try: @@ -382,7 +376,7 @@ def kill(self, exec_arn=None, job_id=None, sfn=None): exec_arn = EXECUTION_ARN(exec_name, sfn) else: if not sfn: - printlog("Can't stop step function because step function name is not given.") + logger.warning("Can't stop step function because step function name is not given.") return None stateMachineArn = STEP_FUNCTION_ARN(sfn) res = sf.list_executions(stateMachineArn=stateMachineArn, statusFilter='RUNNING') @@ -404,9 +398,9 @@ def kill(self, exec_arn=None, job_id=None, sfn=None): break if not exec_arn: raise Exception("can't find the execution") - printlog("terminating step function execution") + logger.info("terminating step function execution") resp_sf = sf.stop_execution(executionArn=exec_arn, error="Aborted") - printlog(resp_sf) + logger.info("Successfully terminated step function execution: " + str(resp_sf)) def kill_all(self, sfn=None): """killing all the running jobs""" @@ -426,11 +420,19 @@ def kill_all(self, sfn=None): else: break - def log(self, exec_arn=None, job_id=None, exec_name=None, sfn=None, postrunjson=False, runjson=False, logbucket=None, quiet=False): + def log(self, exec_arn=None, job_id=None, exec_name=None, sfn=None, + postrunjson=False, runjson=False, top=False, top_latest=False, + inputjson=False, logbucket=None, quiet=False): if postrunjson: suffix = '.postrun.json' elif runjson: suffix = '.run.json' + elif top: + suffix = '.top' + elif top_latest: + suffix = '.top_latest' + elif inputjson: + suffix = '.input.json' else: suffix = '.log' if not sfn: @@ -446,17 +448,11 @@ def log(self, exec_arn=None, job_id=None, exec_name=None, sfn=None, postrunjson= elif job_id: if not logbucket: # first try dynanmodb to get logbucket - ddres = dict() try: - dd = boto3.client('dynamodb') - ddres = dd.query(TableName=DYNAMODB_TABLE, - KeyConditions={'Job Id': {'AttributeValueList': [{'S': job_id}], - 'ComparisonOperator': 'EQ'}}) + logbucket = self.info(job_id)['Log Bucket'] except Exception as e: pass - if 'Items' in ddres: - logbucket = ddres['Items'][0]['Log Bucket']['S'] - else: + if not logbucket: # search through executions to get logbucket stateMachineArn = STEP_FUNCTION_ARN(sfn) try: @@ -487,8 +483,8 @@ def log(self, exec_arn=None, job_id=None, exec_name=None, sfn=None, postrunjson= except Exception as e: if 'NoSuchKey' in str(e): if not quiet: - printlog("log/postrunjson file is not ready yet. " + - "Wait a few seconds/minutes and try again.") + logger.info("log/postrunjson file is not ready yet. 
" + + "Wait a few seconds/minutes and try again.") return '' else: raise e @@ -496,20 +492,17 @@ def log(self, exec_arn=None, job_id=None, exec_name=None, sfn=None, postrunjson= return(res_s3['Body'].read().decode('utf-8', 'backslashreplace')) return None - def stat(self, sfn=None, status=None, verbose=False, n=None): + def stat(self, sfn=None, status=None, verbose=False, n=None, job_ids=None): """print out executions with details (-v) status can be one of 'RUNNING'|'SUCCEEDED'|'FAILED'|'TIMED_OUT'|'ABORTED' + or specify a list of job ids """ - if not sfn: - sfn = self.default_stepfunction_name - args = { - 'stateMachineArn': STEP_FUNCTION_ARN(sfn), - 'maxResults': 100 - } - if status: - args['statusFilter'] = status - res = dict() - client = boto3.client('stepfunctions') + if n and job_ids: + raise Exception("n and job_id filters do not work together.") + if sfn and job_ids: + raise Exception("Please do not specify sfn when job_ids are specified.") + if status and job_ids: + raise Exception("Status filter cannot be specified when job_ids are specified.") if verbose: print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format('jobid', 'status', 'name', 'start_time', 'stop_time', @@ -518,59 +511,92 @@ def stat(self, sfn=None, status=None, verbose=False, n=None): 'password')) else: print("{}\t{}\t{}\t{}\t{}".format('jobid', 'status', 'name', 'start_time', 'stop_time')) - res = client.list_executions(**args) + client = boto3.client('stepfunctions') ec2 = boto3.client('ec2') - k = 0 - while True: - if n and k == n: - break - if 'executions' not in res or not res['executions']: - break - for exc in res['executions']: + + def parse_exec_desc_and_ec2_desc(exec_arn, verbose): + # collecting execution stats + exec_desc = client.describe_execution(executionArn=exec_arn) + + # getting info from execution description + exec_desc = client.describe_execution(executionArn=exec_arn) + job_id = json.loads(exec_desc['input']).get('jobid', 'no jobid') + status = exec_desc['status'] + name = exec_desc['name'] + start_time = exec_desc['startDate'].strftime("%Y-%m-%d %H:%M") + if 'stopDate' in exec_desc: + stop_time = exec_desc['stopDate'].strftime("%Y-%m-%d %H:%M") + else: + stop_time = '' + + # collect instance stats + ec2_desc = ec2.describe_instances(Filters=[{'Name': 'tag:Name', 'Values': ['awsem-' + job_id]}]) + + # getting info from ec2 description + if ec2_desc['Reservations']: + ec2_desc_inst = ec2_desc['Reservations'][0]['Instances'][0] + instance_status = ec2_desc_inst['State']['Name'] + instance_id = ec2_desc_inst['InstanceId'] + instance_type = ec2_desc_inst['InstanceType'] + if instance_status not in ['terminated', 'shutting-down']: + instance_ip = ec2_desc_inst.get('PublicIpAddress', '-') + keyname = ec2_desc_inst.get('KeyName', '-') + password = json.loads(exec_desc['input'])['config'].get('password', '-') + else: + instance_ip = '-' + keyname = '-' + password = '-' + else: + instance_status = '-' + instance_id = '-' + instance_type = '-' + instance_ip = '-' + keyname = '-' + password = '-' + + parsed_stat = (job_id, status, name, start_time, stop_time, + instance_id, instance_type, instance_status, + instance_ip, keyname, password) + if verbose: + print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(*parsed_stat)) + else: + print("{}\t{}\t{}\t{}\t{}".format(*parsed_stat[0:5])) + + if job_ids: + for job_id in job_ids: + dd_info = self.info(job_id) + if 'Execution Name' not in dd_info: + raise Exception("Cannot find execution name for job ID %s" % job_id) + if 'Step Function' not in dd_info: + 
raise Exception("Cannot find step function for job ID %s" % job_id) + exec_arn = EXECUTION_ARN(dd_info['Execution Name'], dd_info['Step Function']) + parse_exec_desc_and_ec2_desc(exec_arn, verbose) + else: + if not sfn: + sfn = self.default_stepfunction_name + args = { + 'stateMachineArn': STEP_FUNCTION_ARN(sfn), + 'maxResults': 100 + } + if status: + args['statusFilter'] = status + res = dict() + res = client.list_executions(**args) + k = 0 + while True: if n and k == n: break - k = k + 1 - desc = client.describe_execution(executionArn=exc['executionArn']) - jobid = json.loads(desc['input']).get('jobid', 'no jobid') - status = exc['status'] - name = exc['name'] - start_time = exc['startDate'].strftime("%Y-%m-%d %H:%M") - if 'stopDate' in exc: - stop_time = exc['stopDate'].strftime("%Y-%m-%d %H:%M") - else: - stop_time = '' - if verbose: - # collect instance stats - res = ec2.describe_instances(Filters=[{'Name': 'tag:Name', 'Values': ['awsem-' + jobid]}]) - if res['Reservations']: - instance_status = res['Reservations'][0]['Instances'][0]['State']['Name'] - instance_id = res['Reservations'][0]['Instances'][0]['InstanceId'] - instance_type = res['Reservations'][0]['Instances'][0]['InstanceType'] - if instance_status not in ['terminated', 'shutting-down']: - instance_ip = res['Reservations'][0]['Instances'][0].get('PublicIpAddress', '-') - keyname = res['Reservations'][0]['Instances'][0].get('KeyName', '-') - password = json.loads(desc['input'])['config'].get('password', '-') - else: - instance_ip = '-' - keyname = '-' - password = '-' - else: - instance_status = '-' - instance_id = '-' - instance_type = '-' - instance_ip = '-' - keyname = '-' - password = '-' - print_template = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}" - print(print_template.format(jobid, status, name, start_time, stop_time, - instance_id, instance_type, instance_status, - instance_ip, keyname, password)) + if 'executions' not in res or not res['executions']: + break + for exc in res['executions']: + if n and k == n: + break + k = k + 1 + parse_exec_desc_and_ec2_desc(exc['executionArn'], verbose) + if 'nextToken' in res: + res = client.list_executions(nextToken=res['nextToken'], **args) else: - print("{}\t{}\t{}\t{}\t{}".format(jobid, status, name, start_time, stop_time)) - if 'nextToken' in res: - res = client.list_executions(nextToken=res['nextToken'], **args) - else: - break + break def list_sfns(self, numbers=False): """list all step functions, optionally with a summary (-n)""" @@ -618,7 +644,7 @@ def clear_input_json_template(self, input_json_template): input_json_template['_tibanna']['run_name'] = input_json_template['_tibanna']['run_name'][:-36] input_json_template['config']['run_name'] = input_json_template['_tibanna']['run_name'] - def rerun(self, exec_arn, sfn=None, + def rerun(self, exec_arn=None, job_id=None, sfn=None, override_config=None, app_name_filter=None, instance_type=None, shutdown_min=None, ebs_size=None, ebs_type=None, ebs_iops=None, overwrite_input_extra=None, key_name=None, name=None): @@ -629,10 +655,13 @@ def rerun(self, exec_arn, sfn=None, then rerun only if it matches app_name """ if not sfn: - sfn = self.default_stepfunction_name - client = boto3.client('stepfunctions') - res = client.describe_execution(executionArn=exec_arn) - input_json_template = json.loads(res['input']) + sfn = self.default_stepfunction_name # this is a target sfn + if not exec_arn and job_id: + input_json_template = json.loads(self.log(job_id=job_id, inputjson=True)) + else: + client = boto3.client('stepfunctions') + 
res = client.describe_execution(executionArn=exec_arn) + input_json_template = json.loads(res['input']) # filter by app_name if app_name_filter: if 'app_name' not in input_json_template: @@ -689,8 +718,6 @@ def rerun_many(self, sfn=None, stopdate='13Feb2018', stophour=13, sflist = client.list_executions(stateMachineArn=STEP_FUNCTION_ARN(sfn), statusFilter=status) k = 0 for exc in sflist['executions']: - print(exc['stopDate'].replace(tzinfo=None)) - print(stoptime_in_datetime) if exc['stopDate'].replace(tzinfo=None) > stoptime_in_datetime: k = k + 1 self.rerun(exc['executionArn'], sfn=sfn, @@ -703,13 +730,9 @@ def rerun_many(self, sfn=None, stopdate='13Feb2018', stophour=13, def env_list(self, name): # don't set this as a global, since not all tasks require it envlist = { - self.run_task_lambda: {'AMI_ID_CWL_V1': AMI_ID_CWL_V1, - 'AMI_ID_CWL_DRAFT3': AMI_ID_CWL_DRAFT3, - 'AMI_ID_WDL': AMI_ID_WDL, - 'AMI_ID_SNAKEMAKE': AMI_ID_SNAKEMAKE, - 'AMI_ID_SHELL': AMI_ID_SHELL, - 'TIBANNA_REPO_NAME': TIBANNA_REPO_NAME, - 'TIBANNA_REPO_BRANCH': TIBANNA_REPO_BRANCH}, + self.run_task_lambda: {'AMI_ID': AMI_ID, + 'TIBANNA_REPO_NAME': TIBANNA_REPO_NAME, + 'TIBANNA_REPO_BRANCH': TIBANNA_REPO_BRANCH}, self.check_task_lambda: {} } if TIBANNA_PROFILE_ACCESS_KEY and TIBANNA_PROFILE_SECRET_KEY: @@ -741,14 +764,14 @@ def deploy_lambda(self, name, suffix, usergroup=''): else: extra_config['Environment']['Variables']['AWS_S3_ROLE_NAME'] = 'S3_access' # 4dn-dcic default(temp) # add role - print('name=%s' % name) + logger.info('name=%s' % name) if name in [self.run_task_lambda, self.check_task_lambda]: role_arn_prefix = 'arn:aws:iam::' + AWS_ACCOUNT_NUMBER + ':role/' if usergroup: role_arn = role_arn_prefix + tibanna_iam.role_name(name) else: role_arn = role_arn_prefix + 'lambda_full_s3' # 4dn-dcic default(temp) - print("role_arn=" + role_arn) + logger.info("role_arn=" + role_arn) extra_config['Role'] = role_arn if usergroup and suffix: function_name_suffix = usergroup + '_' + suffix @@ -767,7 +790,7 @@ def deploy_lambda(self, name, suffix, usergroup=''): if name not in self.do_not_delete: try: boto3.client('lambda').get_function(FunctionName=full_function_name) - print("deleting existing lambda") + logger.info("deleting existing lambda") boto3.client('lambda').delete_function(FunctionName=full_function_name) except Exception as e: if 'Function not found' in str(e): @@ -780,7 +803,7 @@ def deploy_lambda(self, name, suffix, usergroup=''): def deploy_core(self, name, suffix=None, usergroup=''): """deploy/update lambdas only""" - print("preparing for deploy...") + logger.info("preparing for deploy...") if name == 'all': names = self.lambda_names elif name == 'unicorn': @@ -795,15 +818,15 @@ def setup_tibanna_env(self, buckets='', usergroup_tag='default', no_randomize=Fa """set up usergroup environment on AWS This function is called automatically by deploy_tibanna or deploy_unicorn Use it only when the IAM permissions need to be reset""" - print("setting up tibanna usergroup environment on AWS...") + logger.info("setting up tibanna usergroup environment on AWS...") if not AWS_ACCOUNT_NUMBER or not AWS_REGION: - print("Please set and export environment variable AWS_ACCOUNT_NUMBER and AWS_REGION!") + logger.info("Please set and export environment variable AWS_ACCOUNT_NUMBER and AWS_REGION!") exit(1) if not buckets: - print("WARNING: Without setting buckets (using --buckets)," + - "Tibanna would have access to only public buckets." 
+ - "To give permission to Tibanna for private buckets," + - "use --buckets=,,...") + logger.warning("Without setting buckets (using --buckets)," + + "Tibanna would have access to only public buckets." + + "To give permission to Tibanna for private buckets," + + "use --buckets=,,...") time.sleep(2) if buckets: bucket_names = buckets.split(',') @@ -812,11 +835,11 @@ def setup_tibanna_env(self, buckets='', usergroup_tag='default', no_randomize=Fa if bucket_names and not do_not_delete_public_access_block: client = boto3.client('s3') for b in bucket_names: - printlog("Deleting public access block for bucket %s" % b) + logger.info("Deleting public access block for bucket %s" % b) response = client.delete_public_access_block(Bucket=b) tibanna_iam = self.IAM(usergroup_tag, bucket_names, no_randomize=no_randomize) tibanna_iam.create_tibanna_iam(verbose=verbose) - print("Tibanna usergroup %s has been created on AWS." % tibanna_iam.user_group_name) + logger.info("Tibanna usergroup %s has been created on AWS." % tibanna_iam.user_group_name) return tibanna_iam.user_group_name def deploy_tibanna(self, suffix=None, usergroup='', setup=False, @@ -831,12 +854,12 @@ def deploy_tibanna(self, suffix=None, usergroup='', setup=False, do_not_delete_public_access_block=do_not_delete_public_access_block) # this function will remove existing step function on a conflict step_function_name = self.create_stepfunction(suffix, usergroup=usergroup) - print("creating a new step function... %s" % step_function_name) + logger.info("creating a new step function... %s" % step_function_name) if setenv: os.environ['TIBANNA_DEFAULT_STEP_FUNCTION_NAME'] = step_function_name with open(os.getenv('HOME') + "/.bashrc", "a") as outfile: # 'a' stands for "append" outfile.write("\nexport TIBANNA_DEFAULT_STEP_FUNCTION_NAME=%s\n" % step_function_name) - print("deploying lambdas...") + logger.info("deploying lambdas...") self.deploy_core('all', suffix=suffix, usergroup=usergroup) dd_utils.create_dynamo_table(DYNAMODB_TABLE, DYNAMODB_KEYNAME) return step_function_name @@ -885,7 +908,7 @@ def create_stepfunction(self, dev_suffix=None, aws_acc=AWS_ACCOUNT_NUMBER, usergroup=None): if not aws_acc or not region_name: - print("Please set and export environment variable AWS_ACCOUNT_NUMBER and AWS_REGION!") + logger.info("Please set and export environment variable AWS_ACCOUNT_NUMBER and AWS_REGION!") exit(1) # create a step function definition object sfndef = self.StepFunction(dev_suffix, region_name, aws_acc, usergroup) @@ -903,11 +926,11 @@ def create_stepfunction(self, dev_suffix=None, # get ARN from the error and format as necessary exc_str = str(e) if 'State Machine Already Exists:' not in exc_str: - print('Cannot delete state machine. Exiting...' % exc_str) + logger.error('Cannot delete state machine. Exiting...' % exc_str) raise(e) sfn_arn = exc_str.split('State Machine Already Exists:')[-1].strip().strip("''") - print('Step function with name %s already exists!' % sfndef.sfn_name) - print('Updating the state machine...') + logger.info('Step function with name %s already exists!' 
% sfndef.sfn_name) + logger.info('Updating the state machine...') try: sfn.update_state_machine( stateMachineArn=sfn_arn, @@ -915,7 +938,7 @@ def create_stepfunction(self, dev_suffix=None, roleArn=sfndef.sfn_role_arn ) except Exception as e: - print('Error updating state machine %s' % str(e)) + logger.error('Error updating state machine %s' % str(e)) raise(e) except Exception as e: raise(e) @@ -936,8 +959,11 @@ def plot_metrics(self, job_id, sfn=None, directory='.', open_browser=True, force postrunjsonstr = self.log(job_id=job_id, sfn=sfn, postrunjson=True, quiet=True) if postrunjsonstr: postrunjson = AwsemPostRunJson(**json.loads(postrunjsonstr)) - job_complete = True job = postrunjson.Job + if hasattr(job, 'end_time_as_str') and job.end_time_as_str: + job_complete = True + else: + job_complete = False log_bucket = postrunjson.config.log_bucket instance_type = postrunjson.config.instance_type or 'unknown' else: @@ -955,8 +981,8 @@ def plot_metrics(self, job_id, sfn=None, directory='.', open_browser=True, force if self.check_metrics_plot(job_id, log_bucket) and \ self.check_metrics_lock(job_id, log_bucket) and \ not force_upload: - printlog("Metrics plot is already on S3 bucket.") - printlog('metrics url= ' + METRICS_URL(log_bucket, job_id)) + logger.info("Metrics plot is already on S3 bucket.") + logger.info('metrics url= ' + METRICS_URL(log_bucket, job_id)) # open metrics html in browser if open_browser: webbrowser.open(METRICS_URL(log_bucket, job_id)) @@ -1010,7 +1036,8 @@ def plot_metrics(self, job_id, sfn=None, directory='.', open_browser=True, force else: try: M = self.TibannaResource(instance_id, filesystem, starttime, endtime) - M.plot_metrics(instance_type, directory) + top_content = self.log(job_id=job_id, top=True) + M.plot_metrics(instance_type, directory, top_content=top_content) except Exception as e: raise MetricRetrievalException(e) # upload files @@ -1018,7 +1045,7 @@ def plot_metrics(self, job_id, sfn=None, directory='.', open_browser=True, force # clean up uploaded files for f in M.list_files: os.remove(f) - printlog('metrics url= ' + METRICS_URL(log_bucket, job_id)) + logger.info('metrics url= ' + METRICS_URL(log_bucket, job_id)) # open metrics html in browser if open_browser: webbrowser.open(METRICS_URL(log_bucket, job_id)) @@ -1059,7 +1086,7 @@ def reformat_time(t, delta): upload('metrics_report.tsv', log_bucket, job_id + '.metrics/') os.remove('metrics_report.tsv') else: - printlog("cost already in the tsv file. not updating") + logger.info("cost already in the tsv file. not updating") return cost def does_dynamo_table_exist(self, tablename): @@ -1079,7 +1106,7 @@ def does_dynamo_table_exist(self, tablename): def create_dynamo_table(self, tablename, keyname): if self.does_dynamo_table_exist(tablename): - print("dynamodb table %s already exists. skip creating db" % tablename) + logger.info("dynamodb table %s already exists. 
skip creating db" % tablename) else: response = boto3.client('dynamodb').create_table( TableName=tablename, @@ -1120,8 +1147,8 @@ def cleanup(self, user_group_name, suffix='', ignore_errors=True, do_not_remove_ def handle_error(errmsg): if ignore_errors: if verbose: - printlog(errmsg) - printlog("continue to remove the other components") + logger.warning(errmsg) + logger.info("continue to remove the other components") else: raise Exception(errmsg) @@ -1134,7 +1161,7 @@ def handle_error(errmsg): # delete step function sfn = 'tibanna_' + self.sfn_type + lambda_suffix if verbose: - printlog("deleting step function %s" % sfn) + logger.info("deleting step function %s" % sfn) try: boto3.client('stepfunctions').delete_state_machine(stateMachineArn=STEP_FUNCTION_ARN(sfn)) except Exception as e: @@ -1143,7 +1170,7 @@ def handle_error(errmsg): lambda_client = boto3.client('lambda') for lmb in self.lambda_names: if verbose: - printlog("deleting lambda functions %s" % lmb + lambda_suffix) + logger.info("deleting lambda functions %s" % lmb + lambda_suffix) try: lambda_client.delete_function(FunctionName=lmb + lambda_suffix) except Exception as e: @@ -1151,12 +1178,12 @@ def handle_error(errmsg): # delete IAM policies, roles and groups if not do_not_remove_iam_group: if verbose: - printlog("deleting IAM permissions %s" % sfn) + logger.info("deleting IAM permissions %s" % sfn) iam = self.IAM(user_group_name) iam.delete_tibanna_iam(verbose=verbose, ignore_errors=ignore_errors) if purge_history: if verbose: - printlog("deleting all job files and history") + logger.info("deleting all job files and history") item_list = dd_utils.get_items(DYNAMODB_TABLE, DYNAMODB_KEYNAME, 'Step Function', sfn, ['Log Bucket']) for item in item_list: jobid = item[DYNAMODB_KEYNAME] @@ -1166,14 +1193,32 @@ def handle_error(errmsg): except Exception as e: if 'NoSuchBucket' in str(e): if verbose: - printlog("log bucket %s missing... skip job %s" % (item['Log Bucket'], jobid)) + logger.info("log bucket %s missing... skip job %s" % (item['Log Bucket'], jobid)) continue if verbose: - printlog("deleting %d job files for job %s" % (len(keylist), jobid)) + logger.info("deleting %d job files for job %s" % (len(keylist), jobid)) delete_keys(keylist, item['Log Bucket']) else: if verbose: - printlog("log bucket info missing.. skip job %s" % jobid) + logger.info("log bucket info missing.. 
skip job %s" % jobid) dd_utils.delete_items(DYNAMODB_TABLE, DYNAMODB_KEYNAME, item_list, verbose=verbose) if verbose: - printlog("Finished cleaning") + logger.info("Finished cleaning") + + def create_ami(self, build_from_scratch=True, source_image_to_copy_from=None, source_image_region=None, + ubuntu_base_image=None, make_public=False): + args = dict() + if build_from_scratch: + # build from ubuntu 20.04 image and user data + if ubuntu_base_image: + args.update({'base_ami': ubuntu_base_image}) + else: + # copy an existing image + args.update({'userdata_file': ''}) + if source_image_to_copy_from: + args.update({'base_ami': source_image_to_copy_from}) + if source_image_region: + args.update({'base_region': source_image_region}) + + return AMI(**args).create_ami_for_tibanna(make_public=make_public) + diff --git a/tibanna/create_ami_userdata b/tibanna/create_ami_userdata new file mode 100644 index 000000000..443551e86 --- /dev/null +++ b/tibanna/create_ami_userdata @@ -0,0 +1,22 @@ +#!/bin/bash + +# basic updates and installation +apt update +apt install -y awscli +apt install -y apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common # docker + +# install docker +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - +apt-key fingerprint 0EBFCD88 +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable" +apt update # update again with an updated repository +apt install -y docker-ce # installing docker-ce +usermod -aG docker ubuntu # making it available for non-root user ubuntu + +reboot diff --git a/tibanna/cw_utils.py b/tibanna/cw_utils.py old mode 100644 new mode 100755 index b5db30341..10fdfa68d --- a/tibanna/cw_utils.py +++ b/tibanna/cw_utils.py @@ -1,23 +1,37 @@ import boto3, os -from tibanna.utils import ( - printlog, +from . import create_logger +from .utils import ( upload, read_s3 ) +from .top import Top from .vars import ( - AWS_REGION + AWS_REGION, + EBS_MOUNT_POINT ) -# from datetime import timezone from datetime import datetime from datetime import timedelta -# instance_id = 'i-0167a6c2d25ce5822' -# filesystem = "/dev/xvdb" -# filesystem = "/dev/nvme1n1" + +logger = create_logger(__name__) class TibannaResource(object): + """class handling cloudwatch metrics for cpu / memory /disk space + and top command metrics for cpu and memory per process. + """ + + timestamp_format = '%Y-%m-%d %H:%M:%S' + + @classmethod + def convert_timestamp_to_datetime(cls, timestamp): + return datetime.strptime(timestamp, cls.timestamp_format) + def __init__(self, instance_id, filesystem, starttime, endtime=datetime.utcnow()): + """All the Cloudwatch metrics are retrieved and stored at the initialization. + :param instance_id: e.g. 'i-0167a6c2d25ce5822' + :param filesystem: e.g. 
"/dev/xvdb", "/dev/nvme1n1" + """ self.instance_id = instance_id self.filesystem = filesystem self.client = boto3.client('cloudwatch', region_name=AWS_REGION) @@ -28,7 +42,7 @@ def __init__(self, instance_id, filesystem, starttime, endtime=datetime.utcnow() nTimeChunks = round(nTimeChunks) + 1 else: nTimeChunks = round(nTimeChunks) - print("Spliting run time into %s chunks" % str(nTimeChunks)) + logger.info("Spliting run time into %s chunks" % str(nTimeChunks)) self.starttimes = [starttime + timedelta(days=k) for k in range(0, nTimeChunks)] self.endtimes = [starttime + timedelta(days=k+1) for k in range(0, nTimeChunks)] self.start = starttime.replace(microsecond=0) # initial starttime for the window requested @@ -72,11 +86,12 @@ def get_metrics(self, nTimeChunks=1): # this following one is used to detect file copying while CPU utilization is near zero self.max_ebs_read_bytes = self.choose_max(max_ebs_read_chunks) - def plot_metrics(self, instance_type, directory='.'): + def plot_metrics(self, instance_type, directory='.', top_content=''): """plot full metrics across all time chunks. AWS allows only 1440 data points at a time which corresponds to 24 hours at 1min interval, so we have to split them into chunks. + :param top_content: content of the .top in the str format, used for plotting top metrics. """ max_mem_utilization_percent_chunks_all_pts = [] max_mem_used_MB_chunks_all_pts = [] @@ -103,45 +118,48 @@ def plot_metrics(self, instance_type, directory='.'): 'max_disk_space_utilization_percent': (max_disk_space_utilization_percent_chunks_all_pts, 1), 'max_cpu_utilization_percent': (max_cpu_utilization_percent_chunks_all_pts, 5) } + self.list_files.extend(self.write_top_tsvs(directory, top_content)) self.list_files.append(self.write_tsv(directory, **input_dict)) self.list_files.append(self.write_metrics(instance_type, directory)) # writing html self.list_files.append(self.write_html(instance_type, directory)) def upload(self, bucket, prefix='', lock=True): - printlog(str(self.list_files)) + logger.debug("list_files: " + str(self.list_files)) for f in self.list_files: upload(f, bucket, prefix) if lock: upload(None, bucket, os.path.join(prefix, 'lock')) - def choose_max(self, x): - M = -1 - for v in x: - if v: - M = max([v, M]) - if M == -1: - M = "" - return(M) - - def choose_min(self, x): - M = 10000000000 - for v in x: - if v: - M = min([v, M]) - if M == 10000000000: - M = "" - return(M) - - def get_max(self, x): + @staticmethod + def choose_max(x): + """given a list of values that may include None, 0 or an empty string, + chooses a positive nonzero maximum. (e.g. [0,1,2,None,3] => 3) + if no positive nonzero value exists in the list, returns an empty string.""" + return TibannaResource.get_max(list(filter(lambda x:x, x))) + + @staticmethod + def choose_min(x): + """given a list of values that may include None, 0 or an empty string, + chooses a nonzero minimum. (e.g. 
[0,1,2,None,3] => 1) + if no nonzero value exists in the list, returns an empty string.""" + return TibannaResource.get_min(list(filter(lambda x:x, x))) + + @staticmethod + def get_max(x): + """given a list of values, returns maximum value, + but if the list is empty, returns an empty string""" return(max(x) if x else '') - def get_min(self, x): + @staticmethod + def get_min(x): + """given a list of values, returns miminim value, + but if the list is empty, returns an empty string""" return(min(x) if x else '') def as_dict(self): d = self.__dict__.copy() - printlog(d) + logger.debug("original dict: " + str(d)) del(d['client']) del(d['starttimes']) del(d['endtimes']) @@ -253,7 +271,7 @@ def max_disk_space_utilization_all_pts(self): MetricName='DiskSpaceUtilization', Dimensions=[ {'Name': 'InstanceId', 'Value': self.instance_id}, - {'Name': 'MountPath', 'Value': '/data1'}, + {'Name': 'MountPath', 'Value': EBS_MOUNT_POINT}, {'Name': 'Filesystem', 'Value': self.filesystem} ], Period=60, @@ -271,7 +289,7 @@ def max_disk_space_used_all_pts(self): MetricName='DiskSpaceUsed', Dimensions=[ {'Name': 'InstanceId', 'Value': self.instance_id}, - {'Name': 'MountPath', 'Value': '/data1'}, + {'Name': 'MountPath', 'Value': EBS_MOUNT_POINT}, {'Name': 'Filesystem', 'Value': self.filesystem} ], Period=60, @@ -326,14 +344,14 @@ def update_html(cls, bucket, prefix, directory='.', upload_new=True): k, v = line.split('\t') d.setdefault(k, v) # everything is string now # times into datetime objects - starttime = datetime.strptime(d['Start_Time'], '%Y-%m-%d %H:%M:%S') + starttime = cls.convert_timestamp_to_datetime(d['Start_Time']) try: - endtime = datetime.strptime(d['End_Time'], '%Y-%m-%d %H:%M:%S') + endtime = cls.convert_timestamp_to_datetime(d['End_Time']) except: # temporary fix for retrocompatibility if 'End_time' in d: - endtime = datetime.strptime(d['End_time'], '%Y-%m-%d %H:%M:%S') + endtime = cls.convert_timestamp_to_datetime(d['End_time']) else: - endtime = datetime.strptime(d['Time_of_Request'], '%Y-%m-%d %H:%M:%S') + endtime = cls.convert_timestamp_to_datetime(d['Time_of_Request']) cost = d['Cost'] if 'Cost' in d else '---' instance = d['Instance_Type'] if 'Instance_Type' in d else '---' # writing @@ -349,6 +367,17 @@ def update_html(cls, bucket, prefix, directory='.', upload_new=True): upload(filename, bucket, prefix) os.remove(filename) + @staticmethod + def write_top_tsvs(directory, top_content): + TibannaResource.check_mkdir(directory) + top_obj = Top(top_content) + top_obj.digest() + cpu_filename = directory + '/' + 'top_cpu.tsv' + mem_filename = directory + '/' + 'top_mem.tsv' + top_obj.write_to_csv(cpu_filename, delimiter='\t', metric='cpu', colname_for_timestamps='interval', base=1) + top_obj.write_to_csv(mem_filename, delimiter='\t', metric='mem', colname_for_timestamps='interval', base=1) + return [cpu_filename, mem_filename] + def write_tsv(self, directory, **kwargs): # kwargs, key: (chunks_all_pts, interval), interval is 1 or 5 min self.check_mkdir(directory) filename = directory + '/' + 'metrics.tsv' @@ -397,7 +426,8 @@ def write_metrics(self, instance_type, directory): fo.write('Instance_Type' + '\t' + instance_type + '\n') return(filename) - def check_mkdir(self, directory): + @staticmethod + def check_mkdir(directory): if not os.path.exists(directory): os.makedirs(directory) @@ -480,6 +510,12 @@ def create_html(cls): width: 85%%; background-color: #2C6088; } + .barplot { + height: 300px; + } + .barplot_legend { + height: 350px; + } /* Style the lines by removing the fill and applying a 
stroke */ .line { fill: none; @@ -615,6 +651,16 @@ def create_html(cls):

[create_html HTML template hunk; markup stripped during extraction. It adds chart and legend <div> sections titled "CPU Usage Per Process (from Top command)" and "Memory Usage Per Process (from Top command)" after the existing "Disk Usage (/data1)" section.]
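The per-process charts added in the hunk above are fed by two TSV files (top_cpu.tsv and top_mem.tsv) that write_top_tsvs() builds from the periodic top snapshots. A minimal sketch of producing the same TSVs by hand, assuming the API class referenced in the Top docstring further below and a placeholder job id (only calls that appear in this diff are used)::

    from tibanna.top import Top
    from tibanna.core import API   # assumption: API is the entry point the Top docstring refers to

    # raw `top -b -n1` snapshots collected on the AWSEM instance (placeholder job id)
    top_content = API().log(job_id='<job_id>', top=True)

    top = Top(top_content)
    top.digest()   # aggregate per-process %CPU / %MEM by command across timepoints

    # same arguments write_top_tsvs() passes when preparing the plot inputs
    top.write_to_csv('top_cpu.tsv', metric='cpu', delimiter='\t',
                     colname_for_timestamps='interval', base=1)
    top.write_to_csv('top_mem.tsv', metric='mem', delimiter='\t',
                     colname_for_timestamps='interval', base=1)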
@@ -841,6 +887,141 @@ def create_html(cls): .style("text-anchor", "middle") .text(axis_label); } + var barplot_colors = ['black', 'red', 'green', 'blue', 'magenta', 'yellow', 'cyan', + 'pink', 'mediumslateblue', 'maroon', 'orange', + 'gray', 'palegreen', 'mediumvioletred', 'deepskyblue', + 'rosybrown', 'lightgrey', 'indigo', 'cornflowerblue'] + function bar_plot(data_array, div, axis_label) { + // Get div dimensions + var div_width = document.getElementById(div).offsetWidth + , div_height = document.getElementById(div).offsetHeight; + // Use the margin convention practice + var margin = {top: 20, right: 150, bottom: 100, left: 150} + , width = div_width - margin.left - margin.right // Use the window's width + , height = div_height - margin.top - margin.bottom; // Use the window's height + // number of different colors (also number of columns to visualize together) + var n_cols = data_array.length + // The number of datapoints + var n_data = data_array[0].length; + var n = 0 + if (n_data < 5) { + n = 5 + } else { + n = n_data + } + // sum for each timepoint, to calculate y scale + sum_array = d3.range(n_data).map(function(d) { + var sum = 0 + for( col=0; col value pair, the key being "y" and the value is a random number + // Add the SVG to the page + var svg = d3.select("#" + div).append("svg") + .attr("width", width + margin.left + margin.right) + .attr("height", height + margin.top + margin.bottom) + .append("g") + .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); + // Add the X gridlines + svg.append("g") + .attr("class", "grid") + .attr("transform", "translate(0," + height + ")") + .call(make_x_gridlines(xScale, n) + .tickSize(-height) + .tickFormat("") + ) + // Add the Y gridlines + svg.append("g") + .attr("class", "grid") + .call(make_y_gridlines(yScale, d3.max(sum_array)) + .tickSize(-width) + .tickFormat("") + ) + // Call the x axis in a group tag + svg.append("g") + .attr("class", "x axis") + .attr("transform", "translate(0," + height + ")") + .call(d3.axisBottom(xScale)); // Create an axis component with d3.axisBottom + // Call the y axis in a group tag + svg.append("g") + .attr("class", "y axis") + .call(d3.axisLeft(yScale)); // Create an axis component with d3.axisLeft + // Add rectangles, bind the data + var data_array_cum = data_array // dimension and index 0 should be the same + for( var col=0; col 0) { + data_array_cum[col] = d3.range(n_data).map(function(d) { return data_array_cum[col-1][d] + data_array[col][d] }) + var dataset = d3.range(n_data).map(function(d) { return {"prev_y": data_array_cum[col-1][d], "y": data_array_cum[col][d]} }) + } + //var dataset = d3.range(n_data).map(function(d) { return {"dy": data_array[col][d], "y": data_array_cum[col][d]} }) + svg.selectAll(".bar") + .data(dataset) + .enter() + .append('rect') + .attr("class", "bar" + col) + .attr("fill", barplot_colors[col]) + .attr('x', function(d, i) { return xScale(i) + xScale(1) - xScale(0.5); }) + .attr('y', function(d) { return yScale(d.y); }) + .attr('height', function(d) { return yScale(d.prev_y) - yScale(d.y); }) + .attr('width', xScale(1)); + } + svg.append("text") + .attr("transform", "translate(" + (width / 2) + " ," + (height + margin.bottom - margin.bottom / 2) + ")") + .style("text-anchor", "middle") + .text("Time [min]"); + svg.append("text") + .attr("transform", "rotate(-90)") + .attr("y", 0 - margin.left + margin.left / 2) + .attr("x",0 - (height / 2)) + .attr("dy", "1em") + .style("text-anchor", "middle") + .text(axis_label); + } + function 
bar_plot_legend(legend_text, div) { + // Get div dimensions + var div_width = document.getElementById(div).offsetWidth + , div_height = document.getElementById(div).offsetHeight; + // Use the margin convention practice + var margin = {top: 20, right: 150, bottom: 100, left: 150} + , width = div_width - margin.left - margin.right // Use the window's width + , height = div_height - margin.top - margin.bottom; // Use the window's height + // number of different colors (also number of columns to visualize together) + var n_cols = legend_text.length + // Add the SVG to the page + var svg = d3.select("#" + div).append("svg") + .attr("width", width + margin.left + margin.right) + .attr("height", height + margin.top + margin.bottom) + .append("g") + .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); + for( var col=0; col\ """ return(html) diff --git a/tibanna/dd_utils.py b/tibanna/dd_utils.py old mode 100644 new mode 100755 index fcbda1b61..88fa68a64 --- a/tibanna/dd_utils.py +++ b/tibanna/dd_utils.py @@ -1,5 +1,13 @@ import boto3 -from .utils import printlog +from . import create_logger + + +logger = create_logger(__name__) + + +def item2dict(item): + '''convert a dynamoDB-style item to a regular dictionary''' + return {k: list(v.values())[0] for k, v in item.items()} def does_dynamo_table_exist(tablename): @@ -20,7 +28,7 @@ def does_dynamo_table_exist(tablename): def create_dynamo_table(tablename, keyname): if does_dynamo_table_exist(tablename): - print("dynamodb table %s already exists. skip creating db" % tablename) + logger.info("dynamodb table %s already exists. skip creating db" % tablename) else: response = boto3.client('dynamodb').create_table( TableName=tablename, @@ -87,4 +95,4 @@ def delete_items(table_name, primary_key, item_list, verbose=True): Key={primary_key: {'S': item[primary_key]}} ) if verbose: - printlog("%d entries deleted from dynamodb." % len(item_list)) + logger.info("%d entries deleted from dynamodb." % len(item_list)) diff --git a/tibanna/ec2_utils.py b/tibanna/ec2_utils.py old mode 100644 new mode 100755 index ec4ae131e..7ded99f2d --- a/tibanna/ec2_utils.py +++ b/tibanna/ec2_utils.py @@ -6,8 +6,8 @@ import boto3 import copy import re +from . 
import create_logger from .utils import ( - printlog, does_key_exist, create_jobid ) @@ -16,12 +16,11 @@ S3_ACCESS_ARN, TIBANNA_REPO_NAME, TIBANNA_REPO_BRANCH, - AMI_ID_WDL, - AMI_ID_SHELL, - AMI_ID_SNAKEMAKE, - AMI_ID_CWL_V1, - AMI_ID_CWL_DRAFT3, - DYNAMODB_TABLE + AMI_ID, + DYNAMODB_TABLE, + DEFAULT_ROOT_EBS_SIZE, + TIBANNA_AWSF_DIR, + DEFAULT_AWSF_IMAGE ) from .exceptions import ( MissingFieldInInputJsonException, @@ -30,7 +29,8 @@ EC2InstanceLimitException, EC2InstanceLimitWaitException, DependencyStillRunningException, - DependencyFailedException + DependencyFailedException, + UnsupportedCWLVersionException ) from .base import SerializableObject from .nnested_array import flatten, run_on_nested_arrays1 @@ -38,26 +38,28 @@ from Benchmark import run as B from Benchmark.classes import get_instance_types, instance_list from Benchmark.byteformat import B2GB -logger = logging.getLogger() -logger.setLevel(logging.INFO) NONSPOT_EC2_PARAM_LIST = ['TagSpecifications', 'InstanceInitiatedShutdownBehavior', 'MaxCount', 'MinCount', 'DisableApiTermination'] +logger = create_logger(__name__) + + class UnicornInput(SerializableObject): - def __init__(self, input_dict): + def __init__(self, input_dict, fill_default=True): if 'jobid' in input_dict and input_dict.get('jobid'): self.jobid = input_dict.get('jobid') else: self.jobid = create_jobid() - self.args = Args(**input_dict['args']) # args is a required field - self.cfg = Config(**input_dict['config']) # config is a required field + self.args = Args(**input_dict['args'], fill_default=fill_default) # args is a required field + self.cfg = Config(**input_dict['config'], fill_default=fill_default) # config is a required field # add other fields too for field, v in input_dict.items(): if field not in ['jobid', 'args', 'config']: setattr(self, field, v) - # fill the default values and internally used fields - self.auto_fill() + if fill_default: + # fill the default values and internally used fields + self.auto_fill() def as_dict(self): d = super().as_dict() @@ -71,9 +73,6 @@ def auto_fill(self): """ args = self.args cfg = self.cfg - args.fill_default() - cfg.fill_default() - cfg.fill_internal() cfg.fill_language_options(args.language, getattr(args, 'singularity', False)) cfg.fill_other_fields(args.app_name) # sanity check @@ -107,12 +106,14 @@ def auto_fill(self): class Args(SerializableObject): - def __init__(self, **kwargs): + def __init__(self, fill_default=True, **kwargs): for k, v in kwargs.items(): setattr(self, k, v) for field in ['output_S3_bucket']: if not hasattr(self, field): raise MissingFieldInInputJsonException("field %s is required in args" % field) + if fill_default: + self.fill_default() def update(self, d): for k, v in d.items(): @@ -132,15 +133,15 @@ def fill_default(self): if not hasattr(self, field): setattr(self, field, '') # if language and cwl_version is not specified, - # by default it is cwl_draft3 + # by default it is cwl_v1 if not hasattr(self, 'language'): if not hasattr(self, 'cwl_version'): - self.cwl_version = 'draft3' - self.language = 'cwl_draft3' + self.cwl_version = 'v1' + self.language = 'cwl_v1' elif self.cwl_version == 'v1': self.language = 'cwl_v1' elif self.cwl_version == 'draft3': - self.language = 'cwl_draft3' + raise UnsupportedCWLVersionException if not hasattr(self, 'singularity'): self.singularity = False if not hasattr(self, 'app_name'): @@ -149,7 +150,7 @@ def fill_default(self): self.parse_input_files() # check workflow info is there and fill in default errmsg_template = "field %s is required in args for 
language %s" - if self.language == 'wdl': + if self.language in ['wdl', 'wdl_v1', 'wdl_draft2']: if not hasattr(self, 'wdl_main_filename'): raise MissingFieldInInputJsonException(errmsg_template % ('wdl_main_filename', self.language)) if not hasattr(self, 'wdl_child_filenames'): @@ -243,12 +244,15 @@ def parse_s3_url(self, url, bucket_only=False, key_only=False): class Config(SerializableObject): - def __init__(self, **kwargs): + def __init__(self, fill_default=True, **kwargs): for k, v in kwargs.items(): setattr(self, k, v) for field in ['log_bucket']: if not hasattr(self, field): raise MissingFieldInInputJsonException("field %s is required in config" % field) + if fill_default: + self.fill_default() + self.fill_internal() def update(self, d): for k, v in d.items(): @@ -265,7 +269,7 @@ def fill_default(self): if not hasattr(self, "ebs_size"): self.ebs_size = 0 # unspecified by default if not hasattr(self, "ebs_type"): - self.ebs_type = 'gp2' + self.ebs_type = 'gp3' if not hasattr(self, "shutdown_min"): self.shutdown_min = 'now' if not hasattr(self, "spot_instance"): @@ -278,30 +282,22 @@ def fill_default(self): self.public_postrun_json = False # 4dn will use 'true' --> this will automatically be added by start_run_awsem if not hasattr(self, 'root_ebs_size'): - self.root_ebs_size = 8 + self.root_ebs_size = DEFAULT_ROOT_EBS_SIZE + if not hasattr(self, 'awsf_image'): + self.awsf_image = DEFAULT_AWSF_IMAGE def fill_internal(self): # fill internally-used fields (users cannot specify these fields) # script url self.script_url = 'https://raw.githubusercontent.com/' + \ - TIBANNA_REPO_NAME + '/' + TIBANNA_REPO_BRANCH + '/awsf/' + TIBANNA_REPO_NAME + '/' + TIBANNA_REPO_BRANCH + '/' + TIBANNA_AWSF_DIR + '/' self.json_bucket = self.log_bucket - def fill_language_options(self, language='cwl_draft3', singularity=False): + def fill_language_options(self, language='cwl_v1', singularity=False): """fill in ami_id and language fields (these are also internal)""" - if language == 'wdl': - self.ami_id = AMI_ID_WDL - elif language == 'shell': - self.ami_id = AMI_ID_SHELL - elif language == 'snakemake': - self.ami_id = AMI_ID_SNAKEMAKE - else: # cwl - if language in ['cwl', 'cwl_v1']: # 'cwl' means 'cwl_v1' - self.ami_id = AMI_ID_CWL_V1 - else: - self.ami_id = AMI_ID_CWL_DRAFT3 - if singularity: # applied to only cwl though it is pretty useless - self.singularity = True + self.ami_id = AMI_ID + if singularity: + self.singularity = True self.language = language def fill_other_fields(self, app_name=''): @@ -447,7 +443,7 @@ def get_input_size_in_bytes(self): else: size = get_file_size(f['object_key'], bucket) input_size_in_bytes.update({str(argname): size}) - print({"input_size_in_bytes": input_size_in_bytes}) + logger.debug(str({"input_size_in_bytes": input_size_in_bytes})) return input_size_in_bytes def get_benchmarking(self, input_size_in_bytes): @@ -523,7 +519,7 @@ def inner(*args, **kwargs): # change behavior as well, # to avoid 'retry_without_spot works only with spot' error in the next round self.cfg.behavior_on_capacity_limit = 'fail' - printlog("trying without spot...") + logger.info("trying without spot...") return 'continue' else: raise Exception("failed to launch instance for job %s: %s" % (self.jobid, str(e))) @@ -545,7 +541,7 @@ def create_run_json_dict(self): 'App_version': args.app_version, 'language': args.language } - if args.language == 'wdl': + if args.language in ['wdl', 'wdl_v1', 'wdl_draft2']: app.update({ 'main_wdl': args.wdl_main_filename, 'other_wdl_files': 
','.join(args.wdl_child_filenames), @@ -644,23 +640,22 @@ def create_userdata(self, profile=None): str += "JOBID={}\n".format(self.jobid) str += "RUN_SCRIPT=aws_run_workflow_generic.sh\n" str += "SHUTDOWN_MIN={}\n".format(cfg.shutdown_min) - str += "JSON_BUCKET_NAME={}\n".format(cfg.json_bucket) str += "LOGBUCKET={}\n".format(cfg.log_bucket) str += "SCRIPT_URL={}\n".format(cfg.script_url) - str += "LANGUAGE={}\n".format(cfg.language) str += "wget $SCRIPT_URL/$RUN_SCRIPT\n" str += "chmod +x $RUN_SCRIPT\n" str += "source $RUN_SCRIPT -i $JOBID -m $SHUTDOWN_MIN" - str += " -j $JSON_BUCKET_NAME -l $LOGBUCKET -u $SCRIPT_URL -L $LANGUAGE" + str += " -l $LOGBUCKET" + str += " -V {version}".format(version=__version__) + str += " -A {awsf_image}".format(awsf_image=cfg.awsf_image) if cfg.password: str += " -p {}".format(cfg.password) if profile: str += " -a {access_key} -s {secret_key} -r {region}".format(region=AWS_REGION, **profile) if hasattr(cfg, 'singularity') and cfg.singularity: str += " -g" - str += " -V {version}".format(version=__version__) str += "\n" - print(str) + logger.debug("userdata: \n" + str) return(str) @property @@ -690,7 +685,7 @@ def launch_args(self): {'DeviceName': '/dev/sda1', 'Ebs': {'DeleteOnTermination': True, 'VolumeSize': self.cfg.root_ebs_size, - 'VolumeType': 'gp2'}}]}) + 'VolumeType': 'gp3'}}]}) if self.cfg.ebs_iops: # io1 type, specify iops largs["BlockDeviceMappings"][0]["Ebs"]['Iops'] = self.cfg.ebs_iops if self.cfg.ebs_size >= 16000: @@ -894,7 +889,7 @@ def upload_workflow_to_s3(unicorn_input): jobid = unicorn_input.jobid bucket = cfg.log_bucket key_prefix = jobid + '.workflow/' - if args.language == 'wdl': + if args.language in ['wdl', 'wdl_v1', 'wdl_draft2']: main_wf = args.wdl_main_filename wf_files = args.wdl_child_filenames.copy() localdir = args.wdl_directory_local @@ -946,12 +941,12 @@ def get_file_size(key, bucket, size_in_gb=False): default returns file size in bytes, unless size_in_gb = True ''' - printlog("getting file or subfoler size") + logger.info("getting file or subfoler size") meta = does_key_exist(bucket, key) if not meta: try: size = 0 - printlog("trying to get total size of the prefix") + logger.info("trying to get total size of the prefix") for item in get_all_objects_in_prefix(bucket, key): size += item['Size'] except: diff --git a/tibanna/exceptions.py b/tibanna/exceptions.py old mode 100644 new mode 100755 index debbfbff0..9cc8e86fc --- a/tibanna/exceptions.py +++ b/tibanna/exceptions.py @@ -117,9 +117,21 @@ class MalFormattedInputJsonException(Exception): pass -class MalFormattedPostrunJsonException(Exception): +class MalFormattedRunJsonException(Exception): + pass + + +class MalFormattedPostRunJsonException(Exception): pass class MetricRetrievalException(Exception): pass + + +class UnsupportedCWLVersionException(Exception): + def __init__(self, message=None): + if not message: + message = "CWL draft3 is no longer supported. Please switched to v1" + super().__init__(message) + diff --git a/tibanna/iam_utils.py b/tibanna/iam_utils.py old mode 100644 new mode 100755 index 36f41d8d5..ed1c799d8 --- a/tibanna/iam_utils.py +++ b/tibanna/iam_utils.py @@ -1,6 +1,7 @@ import boto3 import json import random +from . 
import create_logger from .vars import ( DYNAMODB_TABLE, AWS_ACCOUNT_NUMBER, @@ -9,7 +10,9 @@ RUN_TASK_LAMBDA_NAME, CHECK_TASK_LAMBDA_NAME ) -from .utils import printlog + + +logger = create_logger(__name__) class IAM(object): @@ -127,7 +130,7 @@ def policy_arn_list_for_role(self, role_type): run_task_custom_policy_types = ['list', 'cloudwatch', 'passrole', 'bucket', 'dynamodb', 'desc_stepfunction', 'cw_dashboard'] check_task_custom_policy_types = ['cloudwatch_metric', 'cloudwatch', 'bucket', 'ec2_desc', - 'termination'] + 'termination', 'dynamodb'] arnlist = {'ec2': [self.policy_arn(_) for _ in ['bucket', 'cloudwatch_metric']] + ['arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly'], # 'stepfunction': [self.policy_arn(_) for _ in ['lambdainvoke']], @@ -324,7 +327,8 @@ def policy_dynamodb(self): "Effect": "Allow", "Action": [ "dynamodb:DescribeTable", - "dynamodb:PutItem" + "dynamodb:PutItem", + "dynamodb:Query" ], "Resource": "arn:aws:dynamodb:" + self.region + ":" + self.account_id + ":table/" + DYNAMODB_TABLE } @@ -384,7 +388,7 @@ def create_role_robust(self, rolename, roledoc, verbose=False): except Exception as e2: raise Exception("Can't create role %s: %s" % (rolename, str(e2))) if verbose: - print(response) + logger.debug("response from create_role_robust: " + str(response)) def create_empty_role_for_lambda(self, verbose=False): role_policy_doc_lambda = self.role_policy_document('lambda') @@ -392,7 +396,7 @@ def create_empty_role_for_lambda(self, verbose=False): try: self.client.get_role(RoleName=empty_role_name) except Exception: - print("creating %s", empty_role_name) + logger.info("creating %s", empty_role_name) self.create_role_robust(empty_role_name, json.dumps(role_policy_doc_lambda), verbose) def create_role_for_role_type(self, role_type, verbose=False): @@ -402,7 +406,7 @@ def create_role_for_role_type(self, role_type, verbose=False): for p_arn in self.policy_arn_list_for_role(role_type): response = role.attach_policy(PolicyArn=p_arn) if verbose: - print(response) + logger.debug("response from IAM attach_policy :" + str(response)) def create_user_group(self, verbose=False): try: @@ -410,7 +414,7 @@ def create_user_group(self, verbose=False): GroupName=self.iam_group_name ) if verbose: - print(response) + logger.debug("response from IAM create_group :" + str(response)) except Exception as e: if 'EntityAlreadyExists' in str(e): # do not actually delete the group, just detach existing policies. 
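As a quick illustration of the ec2_utils changes above (defaults are now filled at construction time, gp3 replaces gp2, and a versioned awsf image is set by default), a minimal sketch, assuming AWS credentials are configured (importing tibanna checks for them) and using a placeholder bucket name::

    from tibanna.ec2_utils import Config

    cfg = Config(log_bucket='my-log-bucket')    # placeholder bucket; defaults filled on init
    assert cfg.ebs_type == 'gp3'                # was 'gp2'
    assert cfg.root_ebs_size == 8               # DEFAULT_ROOT_EBS_SIZE
    assert cfg.awsf_image.startswith('4dndcic/tibanna-awsf:')   # DEFAULT_AWSF_IMAGE, tagged with the tibanna version

    # fill_default=False keeps the input config as-is (no defaults, no internal fields)
    cfg_raw = Config(log_bucket='my-log-bucket', fill_default=False)
    assert not hasattr(cfg_raw, 'awsf_image')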
@@ -421,29 +425,29 @@ def create_user_group(self, verbose=False): PolicyArn='arn:aws:iam::aws:policy/AWSStepFunctionsFullAccess' ) if verbose: - print(response) + logger.debug("response from IAM attach_policy :" + str(response)) response = group.attach_policy( PolicyArn='arn:aws:iam::aws:policy/AWSStepFunctionsConsoleFullAccess' ) if verbose: - print(response) + logger.debug("response from IAM attach_policy :" + str(response)) response = group.attach_policy( PolicyArn='arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole' ) if verbose: - print(response) + logger.debug("response from IAM attach_policy :" + str(response)) response = group.attach_policy( PolicyArn='arn:aws:iam::aws:policy/CloudWatchReadOnlyAccess' ) if verbose: - print(response) + logger.debug("response from IAM attach_policy :" + str(response)) custom_policy_types = ['bucket', 'ec2_desc', 'cloudwatch_metric', 'dynamodb', 'termination'] for pn in [self.policy_name(pt) for pt in custom_policy_types]: response = group.attach_policy( PolicyArn='arn:aws:iam::' + self.account_id + ':policy/' + pn ) if verbose: - print(response) + logger.debug("response from IAM attach_policy :" + str(response)) def create_policy_robust(self, policy_name, policy_doc, verbose=False): try: @@ -452,7 +456,7 @@ def create_policy_robust(self, policy_name, policy_doc, verbose=False): PolicyDocument=policy_doc, ) if verbose: - print(response) + logger.debug("response from IAM create_policy :" + str(response)) except Exception as e: if 'EntityAlreadyExists' in str(e): try: @@ -464,7 +468,7 @@ def create_policy_robust(self, policy_name, policy_doc, verbose=False): PolicyDocument=policy_doc, ) if verbose: - print(response) + logger.debug("response from IAM create_policy :" + str(response)) except Exception as e2: raise Exception("Can't create policy %s : %s" % (policy_name, str(e2))) @@ -505,7 +509,7 @@ def create_tibanna_iam(self, verbose=False): A user group shares permission for buckets, tibanna execution and logs """ # create prefix that represent a single user group - printlog("creating iam permissions with tibanna policy prefix %s" % self.tibanna_policy_prefix) + logger.info("creating iam permissions with tibanna policy prefix %s" % self.tibanna_policy_prefix) # policies for pt in self.policy_types: @@ -526,7 +530,7 @@ def create_tibanna_iam(self, verbose=False): def remove_role(self, rolename, verbose=False, ignore_errors=True): if verbose: - printlog("removing role %s" % rolename) + logger.info("removing role %s" % rolename) try: role = self.iam.Role(rolename) role.description @@ -534,7 +538,7 @@ def remove_role(self, rolename, verbose=False, ignore_errors=True): if 'ResourceNotFound' in str(e) or 'NoSuchEntity' in str(e): if ignore_errors: if verbose: - printlog("role %s doesn't exist. skipping." % rolename) + logger.info("role %s doesn't exist. skipping." % rolename) return else: raise Exception(e) @@ -561,14 +565,14 @@ def remove_roles(self, verbose=False, ignore_errors=True): def remove_instance_profile(self, verbose=False, ignore_errors=True): if verbose: - printlog("removing instance profile %s" % self.instance_profile_name) + logger.info("removing instance profile %s" % self.instance_profile_name) try: res = self.client.delete_instance_profile(InstanceProfileName=self.instance_profile_name) except Exception as e: if 'ResourceNotFound' in str(e) or 'NoSuchEntity' in str(e): if ignore_errors: if verbose: - printlog("instance profile %s doesn't exist. skipping." 
% self.instance_profile_name) + logger.info("instance profile %s doesn't exist. skipping." % self.instance_profile_name) return else: raise Exception(e) @@ -576,7 +580,7 @@ def remove_instance_profile(self, verbose=False, ignore_errors=True): def remove_policy(self, policy_name, verbose=False, ignore_errors=True): if verbose: - printlog("removing policy %s" % policy_name) + logger.info("removing policy %s" % policy_name) policy_arn = 'arn:aws:iam::' + self.account_id + ':policy/' + policy_name # first detach roles and groups and delete versions (requirements for deleting policy) try: @@ -586,7 +590,7 @@ def remove_policy(self, policy_name, verbose=False, ignore_errors=True): if 'ResourceNotFound' in str(e) or 'NoSuchEntity' in str(e): if ignore_errors: if verbose: - printlog("policy %s doesn't exist. skipping." % policy_arn) + logger.info("policy %s doesn't exist. skipping." % policy_arn) return else: raise Exception(e) @@ -622,7 +626,7 @@ def remove_users_from_group(self, verbose=False): def delete_group(self, verbose=False, ignore_errors=True): if verbose: - printlog("removing group %s" % self.iam_group_name) + logger.info("removing group %s" % self.iam_group_name) try: gr = self.iam.Group(self.iam_group_name) gr.group_id @@ -630,7 +634,7 @@ def delete_group(self, verbose=False, ignore_errors=True): if 'ResourceNotFound' in str(e) or 'NoSuchEntity' in str(e): if ignore_errors: if verbose: - printlog("group %s doesn't exist. skipping." % self.iam_group_name) + logger.info("group %s doesn't exist. skipping." % self.iam_group_name) return else: raise Exception(e) diff --git a/tibanna/lambdas/__init__.py b/tibanna/lambdas/__init__.py old mode 100644 new mode 100755 diff --git a/tibanna/lambdas/check_task_awsem.py b/tibanna/lambdas/check_task_awsem.py old mode 100644 new mode 100755 diff --git a/tibanna/lambdas/requirements.txt b/tibanna/lambdas/requirements.txt old mode 100644 new mode 100755 diff --git a/tibanna/lambdas/run_task_awsem.py b/tibanna/lambdas/run_task_awsem.py old mode 100644 new mode 100755 diff --git a/tibanna/nnested_array.py b/tibanna/nnested_array.py old mode 100644 new mode 100755 index 40196a56d..d07d040d6 --- a/tibanna/nnested_array.py +++ b/tibanna/nnested_array.py @@ -44,7 +44,7 @@ def run_on_nested_arrays2(a, b, func, **param): return(func(a, b, **param)) -def create_dim(a, dim=''): +def create_dim(a, dim='', empty=False): """create dimension array for n-nested array example: >>> create_dim([[1,2],[3,4],[5,6,[7,8],]]) @@ -53,13 +53,18 @@ def create_dim(a, dim=''): '' >>> create_dim([5,5]) ['0', '1'] + >>> create_dim([5,5], empty=True) + ['', ''] """ if isinstance(a, list): if dim: prefix = dim + '-' else: prefix = '' - return([create_dim(a_, prefix + str(i)) for i, a_ in enumerate(a)]) + if empty: + return([create_dim(a_, empty=empty) for a_ in a]) + else: + return([create_dim(a_, prefix + str(i)) for i, a_ in enumerate(a)]) else: return(dim) diff --git a/tibanna/run_task.py b/tibanna/run_task.py old mode 100644 new mode 100755 index 21b247fe4..7d158ccbf --- a/tibanna/run_task.py +++ b/tibanna/run_task.py @@ -35,11 +35,11 @@ def run_task(input_json): # required for cwl cwl_main_filename: main cwl file name cwl_directory_url: the url (http:// or s3://) in which the cwl files resides - cwl_version: the version of cwl (either 'draft3' or 'v1') + cwl_version: the version of cwl (now only 'v1' is supported) cwl_child_filenames (optional): names of the other cwl files used by main cwl file, delimited by comma - language (optional for cwl): 'cwl_v1' or 'cwl_draft3' + 
language (optional for cwl): now only 'cwl_v1' is supported # required for wdl - language: 'wdl' + language: 'wdl' (='wdl_draft2'), 'wdl_v1', or 'wdl_draft2' wdl_main_filename: main wdl file name wdl_directory_url: the url (http:// or s3://) in which the wdl files resides wdl_child_filenames (optional): names of the other wdl files used by main wdl file, delimited by comma diff --git a/tibanna/stepfunction.py b/tibanna/stepfunction.py old mode 100644 new mode 100755 diff --git a/tibanna/top.py b/tibanna/top.py new file mode 100644 index 000000000..6add115ef --- /dev/null +++ b/tibanna/top.py @@ -0,0 +1,358 @@ +import datetime + + +class Top(object): + """class TopSeries stores the information of a series of top commands + + :: + echo -n 'Timestamp: '; date +%F-%H:%M:%S + top -b -n1 [-i] [-c] + + over short intervals to monitor the same set of processes over time. + + An example input content looks like below, or a series of these. + The initialization works at any time interval and can be used as a generic + class, but the class is designed for the output of a regular top commands above + run at about 1-minute intervals, which is performed by awsf3 on an AWSEM instance + through cron jobs. (some can be skipped but there should be no more than 1 per minute). + This top output can be obtained through ``tibanna log -j -t`` or through + API ``API().log(job_id=, top=True)``. + + :: + + Timestamp: 2020-12-18-18:55:37 + top - 18:55:37 up 4 days, 2:37, 0 users, load average: 5.59, 5.28, 5.76 + Tasks: 7 total, 1 running, 6 sleeping, 0 stopped, 0 zombie + %Cpu(s): 6.6 us, 0.1 sy, 0.0 ni, 93.2 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st + KiB Mem : 12971188+total, 10379019+free, 20613644 used, 5308056 buff/cache + KiB Swap: 0 total, 0 free, 0 used. 10834606+avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 712 root 20 0 36.464g 8.223g 19572 S 100.0 6.6 125:55.12 java -Xmx32g -Xms32g -jar juicer_tools.jar addNorm -w 1000 -d -F out.hic + 17919 ubuntu 20 0 40676 3828 3144 R 6.2 0.0 0:00.01 top -b -n1 -c -i -w 10000 + + The default timestamp from top output does not contain dates, which can screw up multi-day processes + which is common for bioinformatics pipelines. So, an extra timestamp is added before each top command. + + To parse top output content, simply create an object. This will create processes attribute, + which is a raw parsed result organized by time stamps. + + :: + + top = Top(top_output_content) + + To reorganize the contents by commands, run digest. By default, the max number of commands is 16, + and if there are more than 16 unique commands, they will be collapsed into prefixes. + + :: + + top.digest() + + To write a csv / tsv file organized by both timestamps (rows) and commands (columns), + use :func: write_to_csv. + + :: + + top.write_to_csv(...) + + """ + + # assume this format for timestamp + timestamp_format = '%Y-%m-%d-%H:%M:%S' + + # These commands are excluded when parsing the top output + # Currently only 1-, 2- or 3-word prefixes work. 
+ exclude_list = ['top', 'docker', 'dockerd', '/usr/bin/dockerd', 'cron', + 'docker-untar', 'containerd', 'goofys-latest', 'cwltool', + '/usr/bin/python3 /usr/local/bin/cwltool', 'containerd-shim', + '/usr/bin/python3 /bin/unattended-upgrade', + '/usr/bin/python3 /usr/local/bin/awsf3', + '/usr/bin/python3 /usr/local/bin/aws s3', + 'java -jar /usr/local/bin/cromwell.jar', + 'java -jar /usr/local/bin/cromwell-31.jar'] + + def __init__(self, contents): + """initialization parsed top output content and + creates processes which is a dictionary with timestamps as keys + and a list of Process class objects as a value. + It also creates empty attributes timestamps, commands, cpus and mems + which can be filled through method :func: digest. + """ + self.processes = dict() + self.timestamps = [] + self.commands = [] + self.cpus = dict() + self.mems = dict() + self.parse_contents(contents) + + def parse_contents(self, contents): + is_in_table = False + for line in contents.splitlines(): + if line.startswith('Timestamp:'): + timestamp = line.split()[1] + continue + if line.lstrip().startswith('PID'): + is_in_table = True + continue + if not line or line.isspace(): + is_in_table = False + if is_in_table: + if timestamp not in self.processes: + self.processes[timestamp] = [] + process = Process(line) + if not self.should_skip_process(process): + self.processes[timestamp].append(Process(line)) + + def digest(self, max_n_commands=16, sort_by='alphabetical'): + """Fills in timestamps, commands, cpus and mems attributes + from processes attribute. + :param max_n_commands: When the number of unique commands exceeds + this value, they are collapsed into unique prefixes. + :sort_by: alphabetical|cpu|mem The commands are by default sorted + alphabetically, but optionally can be sorted by total cpus or total + mem (in reverser order) (e.g. the first command consumed the most cpu) + """ + # Reinitializat these so that you get the same results if you run it twice + self.timestamps = [] + self.commands = [] + self.cpus = dict() + self.mems = dict() + # First fill in commands from commands in processes (and collapse if needed.) + self.commands = self.get_collapsed_commands(max_n_commands) + # Fill in timestamps, cpus and mems from processes, matching collapsed commands. + self.nTimepoints = len(self.processes) + timestamp_ind = 0 + for timestamp in sorted(self.processes): + # sorted timestamps (columns) + self.timestamps.append(timestamp) + # commands (rows) + for process in self.processes[timestamp]: + # find a matching collapsed command (i.e. command prefix) and use that as command. + command = Top.convert_command_to_collapsed_command(process.command, self.commands) + if command not in self.cpus: + self.cpus[command] = [0] * self.nTimepoints + self.mems[command] = [0] * self.nTimepoints + self.cpus[command][timestamp_ind] += process.cpu + self.mems[command][timestamp_ind] += process.mem + timestamp_ind += 1 + # sort commands according to total cpu + self.sort_commands(by=sort_by) + + def get_collapsed_commands(self, max_n_commands): + """If the number of commands exceeds max_n_commands, + return a collapsed set of commands + that consists of prefixes of commands so that + the total number is within max_n_commands. + First decide the number of words from the beginning of the commands + to collapse commands that start with the same words, i.e. + find the maximum number of words that makes the number of unique commands to be + bounded by max_n_commands. 
+ If using only the first word is not sufficient, go down to the characters of + the first word. If that's still not sufficient, collapse all of them into a single + command ('all_commands') + After the collapse, commands that are unique to a collapsed prefix are + extended back to the original command. + """ + + all_commands = set() + for timestamp in self.processes: + all_commands.update(set([pr.command for pr in self.processes[timestamp]])) + + if len(all_commands) <= max_n_commands: + # no need to collapse + return list(all_commands) + + # decide the number of words from the beginning of the commands + # to collapse commands starting with the same words + all_cmd_lengths = [len(cmd.split()) for cmd in all_commands] # number of words per command + max_cmd_length = max(all_cmd_lengths) + min_cmd_length = min(all_cmd_lengths) + collapsed_len = max_cmd_length - 1 + n_commands = len(all_commands) + while(n_commands > max_n_commands and collapsed_len > 1): + reduced_commands = set() + for cmd in all_commands: + reduced_commands.add(Top.first_words(cmd, collapsed_len)) + n_commands = len(reduced_commands) + collapsed_len -= 1 + + # went down to the first words but still too many commands - start splitting characters then + if n_commands > max_n_commands: + all_cmd_lengths = [len(cmd.split()[0]) for cmd in all_commands] # number of characters of the first word + max_cmd_length = max(all_cmd_lengths) + min_cmd_length = min(all_cmd_lengths) + collapsed_len = max_cmd_length - 1 + while(n_commands > max_n_commands and collapsed_len > 1): + reduced_commands = set() + for cmd in all_commands: + reduced_commands.add(Top.first_characters(cmd.split()[0], collapsed_len)) + n_commands = len(reduced_commands) + collapsed_len -= 1 + + if n_commands > max_n_commands: + return ['all_commands'] + else: + # extend reduced commands that don't need to be reduced + for r_cmd in list(reduced_commands): # wrap in list so that we can remove elements in the loop + uniq_cmds = [cmd for cmd in all_commands if cmd.startswith(r_cmd)] + if len(uniq_cmds) == 1: + reduced_commands.remove(r_cmd) + reduced_commands.add(uniq_cmds[0]) + return reduced_commands + + def write_to_csv(self, csv_file, metric='cpu', delimiter=',', colname_for_timestamps='timepoints', + timestamp_start=None, timestamp_end=None, base=0): + """write metrics as csv file with commands as columns + :param metric: 'cpu' or 'mem' + :param delimiter: default ',' + :param colname_for_timestamps: colunm name for the timepoint column (1st column). default 'timepoints' + :param timestamp_start: start time in the same timestamp format (e.g. 01:23:45), + time stamps will be converted to minutes since start time. + The report starts with minute 0. + Time points with no top records will be filled with 0. + If not specified, the first timestamp in the top commands will be used. + :param timestamp_end: end time in the same timestamp format (e.g. 01:23:45), + The reports will be generated only up to the end time. + Time points with no top records will be filled with 0. + If not specified, the last timestamp in the top commands will be used. + :param base: default 0. If 0, minutes start with 0, if 1, minutes are 1-based (shifted by 1). 
+ """ + metric_array = getattr(self, metric + 's') + if self.timestamps: + if not timestamp_start: + timestamp_start = self.timestamps[0] + if not timestamp_end: + timestamp_end = self.timestamps[-1] + timestamps_as_minutes = self.timestamps_as_minutes(timestamp_start) + last_minute = self.as_minutes(timestamp_end, timestamp_start) + else: # default when timestamps is not available (empty object) + timestamps_as_minutes = range(0, 5) + last_minute = 5 + with open(csv_file, 'w') as fo: + # header + fo.write(delimiter.join([colname_for_timestamps] + [Top.wrap_in_double_quotes(cmd) for cmd in self.commands])) + fo.write('\n') + # contents + # skip timepoints earlier than timestamp_start + for i in range(0, len(timestamps_as_minutes)): + if timestamps_as_minutes[i] >= 0: + break + for clock in range(0, last_minute + 1): + clock_shifted = clock + base + if i < len(timestamps_as_minutes) and timestamps_as_minutes[i] == clock: + fo.write(delimiter.join([str(clock_shifted)] + [str(metric_array[cmd][i]) for cmd in self.commands])) + i += 1 + else: + fo.write(delimiter.join([str(clock_shifted)] + ['0' for cmd in self.commands])) # add 0 for timepoints not reported + fo.write('\n') + + def should_skip_process(self, process): + """A predicate function to check if the process should be skipped (excluded). + It returns True if the input process should be skipped. + e.g. the top command itself is excluded, as well as docker, awsf3, cwltool, etc. + the list to be excluded is in self.exclude_list. + It compares either first word or first two or three words only. + Kernel threads (single-word commands wrapped in bracket (e.g. [perl]) are also excluded. + """ + first_word = Top.first_words(process.command, 1) + first_two_words = Top.first_words(process.command, 2) + first_three_words = Top.first_words(process.command, 3) + if first_word in self.exclude_list: + return True + elif first_two_words in self.exclude_list: + return True + elif first_three_words in self.exclude_list: + return True + if first_word.startswith('[') and first_word.endswith(']'): + return True + return False + + @staticmethod + def convert_command_to_collapsed_command(cmd, collapsed_commands): + if collapsed_commands == 'all_commands': # collapsed to one command + return 'all_commands' + elif cmd in collapsed_commands: # not collapsed + return cmd + else: # collapsed to prefix + all_prefixes = [_ for _ in collapsed_commands if cmd.startswith(_)] + longest_prefix = sorted(all_prefixes, key=lambda x: len(x), reverse=True)[0] + return longest_prefix + + def total_cpu_per_command(self, command): + return sum([v for v in self.cpus[command]]) + + def total_mem_per_command(self, command): + return sum([v for v in self.mems[command]]) + + def sort_commands(self, by='cpu'): + """sort self.commands by total cpu (default) or mem in reverse order, + or alphabetically (by='alphabetical')""" + if by == 'cpu': + self.commands = sorted(self.commands, key=lambda x: self.total_cpu_per_command(x), reverse=True) + elif by == 'mem': + self.commands = sorted(self.commands, key=lambda x: self.total_mem_per_command(x), reverse=True) + elif by == 'alphabetical': + self.commands = sorted(self.commands) + + @classmethod + def as_minutes(cls, timestamp, timestamp_start): + """timestamp as minutes since timestamp_start. + :param timestamp: given timestamp in the same format (e.g. 01:23:45) + :param timestamp_start: start timestamp in the same format (e.g. 01:20:45) + In the above example, 3 will be the return value. 
+ """ + dt = cls.as_datetime(timestamp) + dt_start = cls.as_datetime(timestamp_start) + # negative numbers are not supported by timedelta, so do each case separately + if dt > dt_start: + return round((dt - dt_start).seconds / 60) + else: + return -round((dt_start - dt).seconds / 60) + + def timestamps_as_minutes(self, timestamp_start): + """convert self.timestamps to a list of minutes since timestamp_start + :param timestamp_start: timestamp in the same format (e.g. 01:23:45) + """ + return [self.as_minutes(t, timestamp_start) for t in self.timestamps] + + @classmethod + def as_datetime(cls, timestamp): + return datetime.datetime.strptime(timestamp, cls.timestamp_format) + + @staticmethod + def wrap_in_double_quotes(string): + """wrap a given string with double quotes (e.g. haha -> "haha") + """ + return '\"' + string + '\"' + + @staticmethod + def first_words(string, n_words): + """returns first n words of a string + e.g. first_words('abc def ghi', 2) ==> 'abc def' + """ + words = string.split() + return ' '.join(words[0:min(n_words, len(words))]) + + @staticmethod + def first_characters(string, n_letters): + """returns first n letters of a string + e.g. first_characters('abc def ghi', 2) ==> 'ab' + """ + letters = list(string) + return ''.join(letters[0:min(n_letters, len(letters))]) + + def as_dict(self): + return self.__dict__ + + +class Process(object): + def __init__(self, top_line): + prinfo_as_list = top_line.lstrip().split() + self.pid = prinfo_as_list[0] + self.user = prinfo_as_list[1] + self.cpu = float(prinfo_as_list[8]) + self.mem = float(prinfo_as_list[9]) + self.command = ' '.join(prinfo_as_list[11:]) + + def as_dict(self): + return self.__dict__ diff --git a/tibanna/utils.py b/tibanna/utils.py old mode 100644 new mode 100755 index bf0592e46..31775a999 --- a/tibanna/utils.py +++ b/tibanna/utils.py @@ -1,10 +1,10 @@ import random import string -import logging import boto3 import os import mimetypes from uuid import uuid4, UUID +from . import create_logger from .vars import ( _tibanna, EXECUTION_ARN, @@ -12,12 +12,7 @@ ) -LOG = logging.getLogger(__name__) - - -def printlog(message): - print(message) - LOG.info(message) +logger = create_logger(__name__) def _tibanna_settings(settings_patch=None, force_inplace=False, env=''): @@ -83,7 +78,7 @@ def create_jobid(): def read_s3(bucket, object_name): response = boto3.client('s3').get_object(Bucket=bucket, Key=object_name) - printlog(str(response)) + logger.debug("response_from_read_s3:" + str(response)) return response['Body'].read().decode('utf-8', 'backslashreplace') @@ -112,7 +107,7 @@ def upload(filepath, bucket, prefix='', public=True): if filepath: dirname, filename = os.path.split(filepath) key = os.path.join(prefix, filename) - printlog("filepath=%s, filename=%s, key=%s" % (filepath, filename, key)) + logger.debug("filepath=%s, filename=%s, key=%s" % (filepath, filename, key)) content_type = mimetypes.guess_type(filename)[0] if content_type is None: content_type = 'binary/octet-stream' diff --git a/tibanna/vars.py b/tibanna/vars.py old mode 100644 new mode 100755 index 2d30aa726..98507e2c5 --- a/tibanna/vars.py +++ b/tibanna/vars.py @@ -3,10 +3,15 @@ import sys from datetime import datetime from dateutil.tz import tzutc +from ._version import __version__ +from . 
import create_logger + + +logger = create_logger(__name__) if boto3.session.Session().get_credentials() is None: - print('Please provide AWS credentials.') + logger.info('Please provide AWS credentials.') sys.exit(-1) @@ -32,35 +37,37 @@ # Tibanna AMI info AMI_PER_REGION = { - 'us-east-1': 'ami-0f06a8358d41c4b9c', - 'ap-south-1' : 'ami-09d95d9217d0cf385', + # new AMI based on ubuntu 20.04 works with awsf3 and it's available only for us-east-1. + 'us-east-1': 'ami-0a7ddfc7e412ab6e0', + 'us-east-2': 'ami-0b44d62b891fb789b', + 'us-west-1': 'ami-0e1e2593b3a0d1893', + 'us-west-2': 'ami-07c59ed4484710392', + 'ap-south-1' : 'ami-05d8bf32dfd849840', 'ap-northeast-2' : 'ami-0c41548ca349c7a24', - 'ap-southeast-1' : 'ami-05ed988e6e239f8ab', - 'ap-southeast-2' : 'ami-08015a75aa06d5169', - 'ap-northeast-1' : 'ami-0ca2f82fea1712d9c', - 'ca-central-1': 'ami-0db70f7b86ac96a83', - 'eu-central-1': 'ami-04e369eb9ff2f4f2d', - 'eu-west-1': 'ami-02de1cc972d19b5f0', - 'eu-west-2': 'ami-092454e8dfc2d7fa6', - 'eu-west-3': 'ami-02f01bb8e27345b00', - 'eu-north-1': 'ami-06cff15ceaadf54ca', - 'sa-east-1': 'ami-06f63076e5a4fa510', - 'us-east-2': 'ami-0691eb4caeced8412', - 'us-west-1': 'ami-009aab2c590a01210', - 'us-west-2': 'ami-05bcbe2628605a628' + 'ap-southeast-1' : 'ami-0000f4a22faea40cd', + 'ap-southeast-2' : 'ami-07068475ae944838b', + 'ap-northeast-1' : 'ami-0ee094aa0951d13af', + 'ca-central-1': 'ami-08ffe0a93f1fe9819', + 'eu-central-1': 'ami-07af39d28b148d1dd', + 'eu-west-1': 'ami-0646a764fc87b0ed0', + 'eu-west-2': 'ami-0ce9320a6d39d00ae', + 'eu-west-3': 'ami-04f0b79f6cf2e3639', + 'eu-north-1': 'ami-0eed0fe896c259550', + 'sa-east-1': 'ami-05e255e0c31f92d16', + 'me-south-1': 'ami-0d641bcc53597f070', + 'af-south-1': 'ami-08a0595fe4fbe4734', + 'ap-east-1': 'ami-06b692ca269732ef6', + 'eu-south-1': 'ami-0ad3c42914e596f1d' } +if AWS_REGION not in AMI_PER_REGION: + logger.warning("Public Tibanna AMI for region %s is not available." % AWS_REGION) AMI_ID = AMI_PER_REGION.get(AWS_REGION, '') -if not AMI_ID: - raise Exception("AMI for region %s is not supported" % AWS_REGION) -AMI_ID_CWL_V1 = AMI_ID -AMI_ID_CWL_DRAFT3 = AMI_ID -AMI_ID_WDL = AMI_ID -AMI_ID_SHELL = AMI_ID -AMI_ID_SNAKEMAKE = AMI_ID + # Tibanna repo from which awsf scripts are pulled TIBANNA_REPO_NAME = os.environ.get('TIBANNA_REPO_NAME', '4dn-dcic/tibanna') TIBANNA_REPO_BRANCH = os.environ.get('TIBANNA_REPO_BRANCH', 'master') +TIBANNA_AWSF_DIR = 'awsf3' # Tibanna roles AWS_S3_ROLE_NAME = os.environ.get('AWS_S3_ROLE_NAME', 'S3_access') @@ -89,6 +96,16 @@ def PARSE_AWSEM_TIME(t_str): return t.replace(tzinfo=tzutc()) +# EBS mount path for cloudwatch metric collection +EBS_MOUNT_POINT = '/mnt/data1' + + +# Default root EBS size +DEFAULT_ROOT_EBS_SIZE = 8 + +# Default awsf image +DEFAULT_AWSF_IMAGE = '4dndcic/tibanna-awsf:' + __version__ + SFN_TYPE = 'unicorn' LAMBDA_TYPE = '' RUN_TASK_LAMBDA_NAME = 'run_task_awsem'