diff --git a/.github/workflows/dockerhub_push_release.yml b/.github/workflows/dockerhub_push_release.yml new file mode 100644 index 0000000..e8b6638 --- /dev/null +++ b/.github/workflows/dockerhub_push_release.yml @@ -0,0 +1,25 @@ +name: deploy release +# This builds the docker image and pushes it to DockerHub +on: + release: + types: [published] +jobs: + push_dockerhub: + name: Push new Docker image to Docker Hub (release) + runs-on: ubuntu-latest + # Only run for the official repo, for releases and merged PRs + if: ${{ github.repository == 'BU-ISCIII/taranis' }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }} + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Build new docker image + run: docker build --no-cache . -t buisciii/taranis:${{ github.event.release.tag_name }} + + - name: Push Docker image to DockerHub (develop) + run: | + echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin + docker push buisciii/taranis:${{ github.event.release.tag_name }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..ed66541 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,38 @@ +name: tests ci +# This workflow runs the pipeline with the minimal test dataset to check that it completes any errors +on: + push: + branches: [develop] + pull_request_target: + branches: [develop] + release: + types: [published] + +jobs: + push_dockerhub: + name: Push new Docker image to Docker Hub (dev) + runs-on: ubuntu-latest + # Only run for the official repo, for releases and merged PRs + if: ${{ github.repository == 'BU-ISCIII/taranis' }} + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }} + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Build new docker image + run: docker build --no-cache . 
-t buisciii/taranis:dev + + - name: Push Docker image to DockerHub (develop) + run: | + echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin + docker push buisciii/taranis:dev + run-tests: + name: Run tests + needs: push_dockerhub + runs-on: ubuntu-latest + steps: + - name: Run pipeline with test data + run: | + docker run buisciii/taranis:dev bash -c /opt/taranis/test/test.sh diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..28f3c5c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +FROM continuumio/miniconda3:latest + +RUN mkdir /opt/taranis/ +ADD utils /opt/taranis/utils +ADD test /opt/taranis/test +ADD *.py /opt/taranis/ +ADD environment.yml /opt/taranis/ +ADD logging_config.ini /opt/taranis/ +ADD README.md /opt/taranis/ +ADD LICENSE /opt/taranis/ + +SHELL ["/bin/bash", "-c"] +RUN cd /opt/taranis +RUN /opt/conda/bin/conda env create -f /opt/taranis/environment.yml && /opt/conda/bin/conda clean -a +RUN /opt/conda/bin/conda env export --name taranis > taranis.yml +RUN echo "conda activate taranis" > ~/.bashrc +ENV PATH /opt/conda/envs/taranis:/opt/conda/envs/taranis/utils:$PATH diff --git a/README.md b/README.md index 0c975d3..bfe9399 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ This option is recomended. Install Anaconda3. -`conda install -c bioconda taranis` +`conda install -c conda-forge -c bioconda -c defaults taranis` Wait for the environment to solve.
Ignore warnings/errors. diff --git a/allele_calling.py b/allele_calling.py old mode 100644 new mode 100755 index 4e079c6..72d3294 --- a/allele_calling.py +++ b/allele_calling.py @@ -1196,19 +1196,26 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_ break if sample_name not in samples_profiles_dict: - if len(analysis_profiles_dict[sample_name]) == len(profile_header): - new_st_id = str(len(ST_profiles_dict) + 1) - ST_profiles_dict[new_st_id + "_INF"] = analysis_profile_dict[sample_name] - inf_ST[new_st_id] = analysis_profile_dict[sample_name] + if sample_name in analysis_profiles_dict: + if len(analysis_profiles_dict[sample_name]) == len(profile_header): + new_st_id = str(len(ST_profiles_dict) + 1) + ST_profiles_dict[new_st_id + "_INF"] = analysis_profile_dict[sample_name] + inf_ST[new_st_id] = analysis_profile_dict[sample_name] - samples_profiles_dict[sample_name]=new_st_id + "_INF" + samples_profiles_dict[sample_name]=new_st_id + "_INF" - if "New" not in count_st: - count_st["New"] = {} - if new_st_id not in count_st["New"]: - count_st["New"][new_st_id] = 0 - count_st["New"][new_st_id] += 1 + if "New" not in count_st: + count_st["New"] = {} + if new_st_id not in count_st["New"]: + count_st["New"][new_st_id] = 0 + count_st["New"][new_st_id] += 1 + + else: + samples_profiles_dict[sample_name] = '-' + if "Unknown" not in count_st: + count_st["Unknown"] = 0 + count_st["Unknown"] += 1 else: samples_profiles_dict[sample_name] = '-' diff --git a/analyze_schema.py b/analyze_schema.py index d861f30..5e35083 100755 --- a/analyze_schema.py +++ b/analyze_schema.py @@ -6,6 +6,7 @@ import glob from datetime import datetime import statistics +from collections import Counter #import matplotlib.pyplot as plt import plotly.graph_objs as go import plotly.io as pio @@ -170,7 +171,9 @@ def extract_info_schema (schema_files, outputdir, genus, species, usegenus, logg stdev = 0 else: stdev = statistics.stdev(alleles_len) - 
schema_statistics[gene_name]=[statistics.mode(alleles_len), statistics.mean(alleles_len), stdev, min(alleles_len), max(alleles_len)] + + #schema_statistics[gene_name]=[statistics.mode(alleles_len), statistics.mean(alleles_len), stdev, min(alleles_len), max(alleles_len)] + schema_statistics[gene_name]=[list(Counter(alleles_len).most_common(1)[0])[0], statistics.mean(alleles_len), stdev, min(alleles_len), max(alleles_len)] for length in list(set(alleles_len)): schema_variability_count[gene_name][str(length)] = str(alleles_len.count(length)) @@ -572,14 +575,16 @@ def analyze_schema (inputdir, outputdir, genus, species, usegenus, logger) : total_alleles += int(schema_variability_count[core][length]) stat_fh.write(core + '\t' + '\t'.join (map(str,schema_statistics[core])) + '\t' + ', '.join(length_number) + '\t' + str(total_alleles) + '\n') + #stat_fh.write(core + '\t' + ', '.join(map(str,schema_statistics[core][0])) + '\t' + '\t'.join (map(str,schema_statistics[core][1::])) + '\t' + ', '.join(length_number) + '\t' + str(total_alleles) + '\n') # Saving schema annotation to file - logger.info('Saving core gene schema annotation to file..') - annotation_file = os.path.join(outputdir, 'raw_info' , 'annotation.tsv') - with open (annotation_file , 'w') as annot_fh : - annot_fh.write('\t'.join(header_annotation) + '\n') - for core in sorted(annotation_core_dict) : - annot_fh.write(core + '\t' + '\t'.join(annotation_core_dict[core]) + '\n') + #logger.info('Saving core gene schema annotation to file..') + #annotation_file = os.path.join(outputdir, 'raw_info' , 'annotation.tsv') + #with open (annotation_file , 'w') as annot_fh : + # annot_fh.write('\t'.join(header_annotation) + '\n') + # for core in sorted(annotation_core_dict) : + # annot_fh.write(core + '\t' + '\t'.join(annotation_core_dict[core]) + '\n') + logger.info('Completed dumped raw information to files') diff --git a/distance_matrix.py b/distance_matrix.py old mode 100644 new mode 100755 diff --git a/environment.yml 
b/environment.yml index 65cd2c4..653ff2c 100644 --- a/environment.yml +++ b/environment.yml @@ -4,17 +4,14 @@ channels: - bioconda - defaults dependencies: - - pip - - python>=3.6 + - conda-forge::python>=3.6 + - conda-forge::biopython==1.72 + - conda-forge::pandas==1.2.4 + - conda-forge::progressbar==2.5 + - conda-forge::openpyxl==3.0.7 + - conda-forge::plotly==5.0.0 + - conda-forge::numpy==1.20.3 - bioconda::prokka>=1.14 - bioconda::blast>=2.9 - bioconda::mash>=2 - bioconda::prodigal=2.6.3 - - - pip: - - biopython==1.72 - - pandas==1.2.4 - - progressbar==2.5 - - openpyxl==3.0.7 - - plotly==5.0.0 - - numpy==1.20.3 diff --git a/logging_config.ini b/logging_config.ini index 85db65d..8d725f2 100644 --- a/logging_config.ini +++ b/logging_config.ini @@ -19,5 +19,5 @@ class=handlers.RotatingFileHandler level=NOTSET ## args(log_file_name, 'a', maxBytes , backupCount) #args=('Programas/taranis_b/logs/taranis.log','a',500000,5) -args=("./taranis.log",'a',500000,5) +args=("taranis.log",'a',500000,5) formatter=logfileformatter diff --git a/reference_alleles.py b/reference_alleles.py old mode 100644 new mode 100755 diff --git a/taranis.py b/taranis.py index 20a2eaf..706aa9b 100755 --- a/taranis.py +++ b/taranis.py @@ -208,7 +208,7 @@ def check_arg (args=None) : default = 100) distance_matrix_parser.add_argument('-sample_missing_threshold', required = False, help = 'Missing values percentage threshold above which samples are excluded for distance matrix creation. Default is 100.', - default = 100) + default = 20) distance_matrix_parser.add_argument('-paralog_filter', required = False, help = 'Consider paralog tags (NIPH, NIPHEM) as missing values. 
Default is True', default = True) diff --git a/test/test.sh b/test/test.sh new file mode 100755 index 0000000..f9ba660 --- /dev/null +++ b/test/test.sh @@ -0,0 +1,140 @@ +#!/bin/bash --login + +# Exit immediately if a pipeline, which may consist of a single simple command, a list, +#or a compound command returns a non-zero status: If errors are not handled by user +set -e +# Treat unset variables and parameters other than the special parameters ‘@’ or ‘*’ as an error when performing parameter expansion. + +#Print everything as if it were executed, after substitution and expansion is applied: Debug|log option +#set -x + +#============================================================= +# HEADER +#============================================================= + +#INSTITUTION:ISCIII +#CENTRE:BU-ISCIII +# +#ACKNOWLEDGE: longops2getops.sh: https://gist.github.com/adamhotep/895cebf290e95e613c006afbffef09d7 +# +#DESCRIPTION: test.sh uses test data for testing the taranis installation. +# +# +#================================================================ +# END_OF_HEADER +#================================================================ + +#SHORT USAGE RULES +#LONG USAGE FUNCTION +usage() { + cat << EOF + +test.sh runs the taranis pipeline on bundled test data to verify that the installation works correctly + +usage : $0 + + -v | --version version + -h | --help display usage message + +example: ./test.sh + +EOF +} + +#================================================================ +# OPTION_PROCESSING +#================================================================ +# Error handling +error(){ + local parent_lineno="$1" + local script="$2" + local message="$3" + local code="${4:-1}" + + RED='\033[0;31m' + NC='\033[0m' + + if [[ -n "$message" ]] ; then + echo -e "\n---------------------------------------\n" + echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" + echo -e "MESSAGE:\n" + echo -e "$message" + echo 
-e "\n---------------------------------------\n" + else + echo -e "\n---------------------------------------\n" + echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}" + echo -e "\n---------------------------------------\n" + fi + + exit "${code}" +} + +# translate long options to short +reset=true +for arg in "$@" +do + if [ -n "$reset" ]; then + unset reset + set -- # this resets the "$@" array so we can rebuild it + fi + case "$arg" in + --help) set -- "$@" -h ;; + --version) set -- "$@" -v ;; + # pass through anything else + *) set -- "$@" "$arg" ;; + esac +done + +#DECLARE FLAGS AND VARIABLES +script_dir=$(dirname $(readlink -f $0)) +assemblies="./samples_listeria/" +schema="./MLST_listeria/" +profile="./profile_MLST_listeria/profiles_csv.csv" +refgenome="./reference_listeria/GCF_002213505.1_ASM221350v1_genomic.fna" + +#PARSE VARIABLE ARGUMENTS WITH getopts +#common example with letters, for long options check longopts2getopts.sh +options=":1:2:d:s:g:c:a:i:o:C:S:f:l:L:T:M:X:y:Y:RVtvh" +while getopts $options opt; do + case $opt in + h ) + usage + exit 1 + ;; + v ) + echo $VERSION + exit 1 + ;; + \?) + echo "Invalid Option: -$OPTARG" 1>&2 + usage + exit 1 + ;; + : ) + echo "Option -$OPTARG requires an argument." >&2 + exit 1 + ;; + * ) + echo "Unimplemented option: -$OPTARG" >&2; + exit 1 + ;; + + esac +done +shift $((OPTIND-1)) + +## Execute taranis with test data. 
+echo "Executing:../taranis.py allele_calling -coregenedir $schema -inputdir $assemblies -refgenome $refgenome -outputdir allele_calling_test -percentlength 20 -refalleles $refallele -profile $profile" +echo "Assemblies: $assemblies" +echo "Schema: $schema" +echo "$PWD" +cd +$script_dir/../taranis.py analyze_schema -inputdir $script_dir/MLST_listeria -outputdir analyze_schema_test + +$script_dir/../taranis.py reference_alleles -coregenedir $script_dir/MLST_listeria -outputdir reference_alleles_test + +$script_dir/../taranis.py allele_calling -coregenedir $script_dir/$schema -inputdir $script_dir/$assemblies -refgenome $script_dir/$refgenome -outputdir allele_calling_test -percentlength 20 -refalleles reference_alleles_test -profile $script_dir/$profile + +$script_dir/../taranis.py distance_matrix -alleles_matrix allele_calling_test/result.tsv -outputdir distance_matrix_test + +echo "ALL DONE. TEST COMPLETED SUCCESSFULLY YOUR INSTALLATION SHOULD BE CORRECT." diff --git a/utils/taranis_utils.py b/utils/taranis_utils.py index cc481bd..3ef4bb9 100644 --- a/utils/taranis_utils.py +++ b/utils/taranis_utils.py @@ -186,7 +186,7 @@ def junk (): ## N return True -def check_prerequisites (pre_requisite_list, logger): +def check_prerequisites (pre_requisite_list, logger): # check if blast is installed and has the minimum version for program, version in pre_requisite_list : if not check_program_is_exec_version (program , version, logger): @@ -213,9 +213,9 @@ def check_program_is_exec_version (program, version, logger): def create_blastdb (file_name, db_name,db_type, logger ): - f_name = os.path.basename(file_name).split('.') - db_dir = os.path.join(db_name,f_name[0]) - output_blast_dir = os.path.join(db_dir, f_name[0]) + f_name = '.'.join(os.path.basename(file_name).split('.')[:-1]) + db_dir = os.path.join(db_name,f_name) + output_blast_dir = os.path.join(db_dir, f_name) if not os.path.exists(db_dir): try: @@ -247,7 +247,7 @@ def get_fasta_file_list (check_directory, logger): 
if not os.path.isdir(check_directory): logger.info('directory %s does not exists', check_directory) return False - + fasta_format = ['*.fasta', '*.fa', '*.fna', '*.ffn', '*.frn'] list_filtered_files = [] for extension in fasta_format: @@ -255,7 +255,7 @@ def get_fasta_file_list (check_directory, logger): sublist_filtered_files = glob.glob(filter_files) for fasta_file in sublist_filtered_files: list_filtered_files.append(fasta_file) - + list_filtered_files.sort() if len (list_filtered_files) == 0 : logger.info('directory %s does not have any fasta file ', check_directory) @@ -315,15 +315,15 @@ def check_core_gene_quality(fasta_file_path, logger): return locus_quality -def check_sequence_order(allele_sequence, logger): +def check_sequence_order(allele_sequence, logger): start_codon_forward= ['ATG','ATA','ATT','GTG', 'TTG'] start_codon_reverse= ['CAT', 'TAT','AAT','CAC','CAA'] stop_codons_forward = ['TAA', 'TAG','TGA'] stop_codons_reverse = ['TTA', 'CTA','TCA'] - + # check direction - if allele_sequence[0:3] in start_codon_forward or allele_sequence[-3:] in stop_codons_forward: + if allele_sequence[0:3] in start_codon_forward or allele_sequence[-3:] in stop_codons_forward: return 'forward' if allele_sequence[-3:] in start_codon_reverse or allele_sequence[0:3] in stop_codons_reverse: return 'reverse' @@ -346,20 +346,20 @@ def get_stop_codon_index(seq) : ### (tsv para algunos locus? Utils para analyze schema?) 
def get_gene_annotation (annotation_file, annotation_dir, genus, species, usegenus, logger) : - + name_file = os.path.basename(annotation_file).split('.') annotation_dir = os.path.join (annotation_dir, 'annotation', name_file[0]) - + if usegenus == 'true': annotation_result = subprocess.run (['prokka', annotation_file, '--outdir', annotation_dir, - '--genus', genus, '--species', species, '--usegenus', + '--genus', genus, '--species', species, '--usegenus', '--gcode', '11', '--prefix', name_file[0], '--quiet']) elif usegenus == 'false': annotation_result = subprocess.run (['prokka', annotation_file, '--outdir', annotation_dir, - '--genus', genus, '--species', species, + '--genus', genus, '--species', species, '--gcode', '11', '--prefix', name_file[0], '--quiet']) - + annot_tsv = [] tsv_path = os.path.join (annotation_dir, name_file[0] + '.tsv') @@ -377,8 +377,8 @@ def get_gene_annotation (annotation_file, annotation_dir, genus, species, usegen gene_annot = annot_tsv[1][2] except: gene_annot = 'Not found by Prokka' - - try: + + try: product_annot = annot_tsv[1][4] except: product_annot = 'Not found by Prokka'