Skip to content

Commit

Permalink
Merge branch 'release/2.0.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
saramonzon committed Jul 14, 2021
2 parents e92f8f8 + bf74310 commit 278d07c
Show file tree
Hide file tree
Showing 13 changed files with 275 additions and 46 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/dockerhub_push_release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
name: deploy release
# This builds the docker image and pushes it to DockerHub
# Triggered only when a GitHub release is published.
on:
  release:
    types: [published]
jobs:
  push_dockerhub:
    name: Push new Docker image to Docker Hub (release)
    runs-on: ubuntu-latest
    # Only run for the official repo, for releases and merged PRs
    if: ${{ github.repository == 'BU-ISCIII/taranis' }}
    env:
      # Credentials come from repository secrets; never hardcode them here.
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
      DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }}
    steps:
      - name: Check out pipeline code
        uses: actions/checkout@v2

      # Image is tagged with the release tag so every release is traceable.
      - name: Build new docker image
        run: docker build --no-cache . -t buisciii/taranis:${{ github.event.release.tag_name }}

      # NOTE(review): the step name says "(develop)" but this pushes the
      # release tag — presumably a copy-paste leftover from tests.yml; confirm.
      - name: Push Docker image to DockerHub (develop)
        run: |
          echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
          docker push buisciii/taranis:${{ github.event.release.tag_name }}
38 changes: 38 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
name: tests ci
# This workflow builds the dev docker image and then runs the pipeline with
# the minimal test dataset to check that it completes without errors.
on:
  push:
    branches: [develop]
  # NOTE(review): pull_request_target runs with the base repository's
  # permissions and exposes secrets to workflows triggered by fork PRs —
  # confirm this is intended instead of the safer pull_request event.
  pull_request_target:
    branches: [develop]
  release:
    types: [published]

jobs:
  push_dockerhub:
    name: Push new Docker image to Docker Hub (dev)
    runs-on: ubuntu-latest
    # Only run for the official repo, for releases and merged PRs
    if: ${{ github.repository == 'BU-ISCIII/taranis' }}
    env:
      # Credentials come from repository secrets; never hardcode them here.
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
      DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASSWORD }}
    steps:
      - name: Check out pipeline code
        uses: actions/checkout@v2

      - name: Build new docker image
        run: docker build --no-cache . -t buisciii/taranis:dev

      - name: Push Docker image to DockerHub (develop)
        run: |
          echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
          docker push buisciii/taranis:dev

  # Smoke test: run the bundled test script inside the freshly pushed image.
  # Note this job is skipped whenever push_dockerhub is skipped (needs:).
  run-tests:
    name: Run tests
    needs: push_dockerhub
    runs-on: ubuntu-latest
    steps:
      - name: Run pipeline with test data
        run: |
          docker run buisciii/taranis:dev bash -c /opt/taranis/test/test.sh
17 changes: 17 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Taranis runtime image: miniconda3 base with the taranis conda env baked in.
FROM continuumio/miniconda3:latest

RUN mkdir /opt/taranis/
# COPY is preferred over ADD for plain local files (no archive/URL handling).
COPY utils /opt/taranis/utils
COPY test /opt/taranis/test
COPY *.py /opt/taranis/
COPY environment.yml /opt/taranis/
COPY logging_config.ini /opt/taranis/
COPY README.md /opt/taranis/
COPY LICENSE /opt/taranis/

SHELL ["/bin/bash", "-c"]
# WORKDIR persists across layers; the original `RUN cd /opt/taranis` was a
# no-op because every RUN starts a fresh shell. This also makes the
# `conda env export` below write taranis.yml into /opt/taranis instead of /.
WORKDIR /opt/taranis
RUN /opt/conda/bin/conda env create -f /opt/taranis/environment.yml && /opt/conda/bin/conda clean -a
RUN /opt/conda/bin/conda env export --name taranis > taranis.yml
# Append (>>) instead of truncate (>): the base image's ~/.bashrc contains the
# conda initialisation that `conda activate` itself depends on.
RUN echo "conda activate taranis" >> ~/.bashrc
# NOTE(review): conda env executables normally live in
# /opt/conda/envs/taranis/bin and the taranis scripts in /opt/taranis — these
# PATH entries look off by a /bin and a directory; confirm intended values.
ENV PATH /opt/conda/envs/taranis:/opt/conda/envs/taranis/utils:$PATH
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ This option is recomended.

Install Anaconda3.

`conda install -c bioconda taranis`
`conda install -c conda-forge -c bioconda -c defaults taranis`

Wait for the environment to solve. <br>
Ignore warnings/errors.
Expand Down
27 changes: 17 additions & 10 deletions allele_calling.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -1196,19 +1196,26 @@ def get_ST_profile(outputdir, profile_csv_path, exact_dict, inf_dict, core_gene_
break

if sample_name not in samples_profiles_dict:
if len(analysis_profiles_dict[sample_name]) == len(profile_header):
new_st_id = str(len(ST_profiles_dict) + 1)
ST_profiles_dict[new_st_id + "_INF"] = analysis_profile_dict[sample_name]
inf_ST[new_st_id] = analysis_profile_dict[sample_name]
if sample_name in analysis_profiles_dict:
if len(analysis_profiles_dict[sample_name]) == len(profile_header):
new_st_id = str(len(ST_profiles_dict) + 1)
ST_profiles_dict[new_st_id + "_INF"] = analysis_profile_dict[sample_name]
inf_ST[new_st_id] = analysis_profile_dict[sample_name]

samples_profiles_dict[sample_name]=new_st_id + "_INF"
samples_profiles_dict[sample_name]=new_st_id + "_INF"

if "New" not in count_st:
count_st["New"] = {}
if new_st_id not in count_st["New"]:
count_st["New"][new_st_id] = 0
count_st["New"][new_st_id] += 1
if "New" not in count_st:
count_st["New"] = {}
if new_st_id not in count_st["New"]:
count_st["New"][new_st_id] = 0
count_st["New"][new_st_id] += 1

else:
samples_profiles_dict[sample_name] = '-'

if "Unknown" not in count_st:
count_st["Unknown"] = 0
count_st["Unknown"] += 1
else:
samples_profiles_dict[sample_name] = '-'

Expand Down
19 changes: 12 additions & 7 deletions analyze_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import glob
from datetime import datetime
import statistics
from collections import Counter
#import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.io as pio
Expand Down Expand Up @@ -170,7 +171,9 @@ def extract_info_schema (schema_files, outputdir, genus, species, usegenus, logg
stdev = 0
else:
stdev = statistics.stdev(alleles_len)
schema_statistics[gene_name]=[statistics.mode(alleles_len), statistics.mean(alleles_len), stdev, min(alleles_len), max(alleles_len)]

#schema_statistics[gene_name]=[statistics.mode(alleles_len), statistics.mean(alleles_len), stdev, min(alleles_len), max(alleles_len)]
schema_statistics[gene_name]=[list(Counter(alleles_len).most_common(1)[0])[0], statistics.mean(alleles_len), stdev, min(alleles_len), max(alleles_len)]

for length in list(set(alleles_len)):
schema_variability_count[gene_name][str(length)] = str(alleles_len.count(length))
Expand Down Expand Up @@ -572,14 +575,16 @@ def analyze_schema (inputdir, outputdir, genus, species, usegenus, logger) :
total_alleles += int(schema_variability_count[core][length])

stat_fh.write(core + '\t' + '\t'.join (map(str,schema_statistics[core])) + '\t' + ', '.join(length_number) + '\t' + str(total_alleles) + '\n')
#stat_fh.write(core + '\t' + ', '.join(map(str,schema_statistics[core][0])) + '\t' + '\t'.join (map(str,schema_statistics[core][1::])) + '\t' + ', '.join(length_number) + '\t' + str(total_alleles) + '\n')

# Saving schema annotation to file
logger.info('Saving core gene schema annotation to file..')
annotation_file = os.path.join(outputdir, 'raw_info' , 'annotation.tsv')
with open (annotation_file , 'w') as annot_fh :
annot_fh.write('\t'.join(header_annotation) + '\n')
for core in sorted(annotation_core_dict) :
annot_fh.write(core + '\t' + '\t'.join(annotation_core_dict[core]) + '\n')
#logger.info('Saving core gene schema annotation to file..')
#annotation_file = os.path.join(outputdir, 'raw_info' , 'annotation.tsv')
#with open (annotation_file , 'w') as annot_fh :
# annot_fh.write('\t'.join(header_annotation) + '\n')
# for core in sorted(annotation_core_dict) :
# annot_fh.write(core + '\t' + '\t'.join(annotation_core_dict[core]) + '\n')


logger.info('Completed dumped raw information to files')

Expand Down
Empty file modified distance_matrix.py
100644 → 100755
Empty file.
17 changes: 7 additions & 10 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,14 @@ channels:
- bioconda
- defaults
dependencies:
- pip
- python>=3.6
- conda-forge::python>=3.6
- conda-forge::biopython==1.72
- conda-forge::pandas==1.2.4
- conda-forge::progressbar==2.5
- conda-forge::openpyxl==3.0.7
- conda-forge::plotly==5.0.0
- conda-forge::numpy==1.20.3
- bioconda::prokka>=1.14
- bioconda::blast>=2.9
- bioconda::mash>=2
- bioconda::prodigal=2.6.3

- pip:
- biopython==1.72
- pandas==1.2.4
- progressbar==2.5
- openpyxl==3.0.7
- plotly==5.0.0
- numpy==1.20.3
2 changes: 1 addition & 1 deletion logging_config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ class=handlers.RotatingFileHandler
level=NOTSET
## args(log_file_name, 'a', maxBytes , backupCount)
#args=('Programas/taranis_b/logs/taranis.log','a',500000,5)
args=("./taranis.log",'a',500000,5)
args=("taranis.log",'a',500000,5)
formatter=logfileformatter
Empty file modified reference_alleles.py
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion taranis.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def check_arg (args=None) :
default = 100)
distance_matrix_parser.add_argument('-sample_missing_threshold', required = False,
help = 'Missing values percentage threshold above which samples are excluded for distance matrix creation. Default is 100.',
default = 100)
default = 20)
distance_matrix_parser.add_argument('-paralog_filter', required = False,
help = 'Consider paralog tags (NIPH, NIPHEM) as missing values. Default is True',
default = True)
Expand Down
140 changes: 140 additions & 0 deletions test/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/bin/bash --login

# Exit immediately if a pipeline, which may consist of a single simple command,
# a list, or a compound command returns a non-zero status: errors past this
# point must otherwise be handled explicitly by the user.
set -e
# NOTE(review): a comment here originally announced `set -u` (treat unset
# variables/parameters as an error during expansion) but the option is never
# enabled — confirm whether it should be, since e.g. $VERSION is expanded
# later without ever being defined.

# Print everything as if it were executed, after substitution and expansion is
# applied: debug/log option.
#set -x

#=============================================================
# HEADER
#=============================================================

#INSTITUTION:ISCIII
#CENTRE:BU-ISCIII
#
#ACKNOWLEDGE: longops2getops.sh: https://gist.github.com/adamhotep/895cebf290e95e613c006afbffef09d7
#
#DESCRIPTION: test.sh uses test data for testing taranis installation.
#
#
#================================================================
# END_OF_HEADER
#================================================================
#SHORT USAGE RULES
#LONG USAGE FUNCTION
#######################################
# Print a short usage message for this test script.
# Outputs: usage text on stdout.
# Fix: the original text described plasmidID — copy-paste from another
# project's test script — instead of taranis.
#######################################
usage() {
  cat << EOF
test.sh runs taranis with the bundled test data to verify the installation
usage : $0
	-v | --version		version
	-h | --help		display usage message
example: ./test.sh
EOF
}

#================================================================
# OPTION_PROCESSING
#================================================================
#######################################
# Error handler: print a framed error report and terminate the script.
# Arguments:
#   $1 - line number on or near which the error occurred
#   $2 - script name
#   $3 - optional extra message (may be empty)
#   $4 - exit code (defaults to 1)
# Outputs: diagnostic block on stderr (fix: the original wrote it to stdout,
#          mixing diagnostics into the script's normal output stream).
# Returns: never; exits the script with the given code.
#######################################
error(){
  local parent_lineno="$1"
  local script="$2"
  local message="$3"
  local code="${4:-1}"

  local RED='\033[0;31m'
  local NC='\033[0m'

  {
    echo -e "\n---------------------------------------\n"
    echo -e "${RED}ERROR${NC} in Script $script on or near line ${parent_lineno}; exiting with status ${code}"
    if [[ -n "$message" ]] ; then
      echo -e "MESSAGE:\n"
      echo -e "$message"
    fi
    echo -e "\n---------------------------------------\n"
  } >&2

  exit "${code}"
}

# Translate long options into their short getopts-compatible equivalents.
# On the first iteration "$@" is emptied, then it is rebuilt one argument at
# a time with long forms replaced and everything else passed through.
reset=true
for arg in "$@"; do
  if [ -n "$reset" ]; then
    unset reset
    set --    # clear the positional parameters so the loop can rebuild them
  fi
  case "$arg" in
    --help)    set -- "$@" -h ;;
    --version) set -- "$@" -v ;;
    *)         set -- "$@" "$arg" ;;   # pass through anything else
  esac
done

#DECLARE FLAGS AND VARIABLES
# Fix: quote the command substitutions so a path containing spaces does not
# word-split or glob (ShellCheck SC2046/SC2086).
script_dir=$(dirname "$(readlink -f "$0")")
# Test-data locations, relative to where the test data set was unpacked.
assemblies="./samples_listeria/"
schema="./MLST_listeria/"
profile="./profile_MLST_listeria/profiles_csv.csv"
refgenome="./reference_listeria/GCF_002213505.1_ASM221350v1_genomic.fna"

#PARSE VARIABLE ARGUMENTS WITH getops
# Parse short options (long forms were mapped to short ones above).
# NOTE(review): most letters in this options string are accepted by getopts
# but never implemented below, so they hit the catch-all arm — confirm whether
# they are placeholders copied from another script (see longopts2getopts.sh).
options=":1:2:d:s:g:c:a:i:o:C:S:f:l:L:T:M:X:y:Y:RVtvh"
while getopts $options opt; do
  case $opt in
    h)
      usage
      exit 1
      ;;
    v)
      # NOTE(review): $VERSION is never defined in this script, so this
      # prints an empty line — confirm where the version should come from.
      echo $VERSION
      exit 1
      ;;
    \?)
      echo "Invalid Option: -$OPTARG" 1>&2
      usage
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
    *)
      echo "Unimplemented option: -$OPTARG" >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND-1))

## Execute taranis with test data.
# Fixes: the comment above and the first echo referred to plasmidID and to an
# undefined $refallele variable; the echo now reports the arguments actually
# passed to allele_calling below, and all expansions are quoted.
echo "Executing: ../taranis.py allele_calling -coregenedir $schema -inputdir $assemblies -refgenome $refgenome -outputdir allele_calling_test -percentlength 20 -refalleles reference_alleles_test -profile $profile"
echo "Assemblies: $assemblies"
echo "Schema: $schema"
echo "$PWD"
# Run from $HOME so every *_test output directory is created there; input
# paths are made absolute by prefixing $script_dir.
cd
"$script_dir"/../taranis.py analyze_schema -inputdir "$script_dir"/MLST_listeria -outputdir analyze_schema_test

"$script_dir"/../taranis.py reference_alleles -coregenedir "$script_dir"/MLST_listeria -outputdir reference_alleles_test

"$script_dir"/../taranis.py allele_calling -coregenedir "$script_dir/$schema" -inputdir "$script_dir/$assemblies" -refgenome "$script_dir/$refgenome" -outputdir allele_calling_test -percentlength 20 -refalleles reference_alleles_test -profile "$script_dir/$profile"

"$script_dir"/../taranis.py distance_matrix -alleles_matrix allele_calling_test/result.tsv -outputdir distance_matrix_test

echo "ALL DONE. TEST COMPLETED SUCCESSFULLY YOUR INSTALLATION SHOULD BE CORRECT."
Loading

0 comments on commit 278d07c

Please sign in to comment.