Merge branch 'dev'

geraldinepascal · Apr 1, 2021 · 6e7d406 · 6e7d406
2 parents 272569b + fb62d38
commit 6e7d406
Show file tree

Hide file tree

Showing 57 changed files with 247 additions and 178 deletions.
diff --git a/INSTALL_from_source.md b/INSTALL_from_source.md
@@ -7,7 +7,7 @@ It has been tested on a Xubuntu 16.04 virtual machine.
 Here we suppose to install dependencies in the same directory as FROGS.
 
 ```bash
-version=3.2.1
+version=3.2.2
 DIR=`pwd`
 BIN_DIR=$DIR/bin
 mkdir -p $BIN_DIR
@@ -51,7 +51,7 @@ sudo apt-get install python3-scipy
 
 
 
-## 1) vsearch 2.15.1 , for FROGS Preprocess and FROGS Remove_chimera
+## 1) vsearch 2.17.0 , for FROGS Preprocess and FROGS Remove_chimera
 
 **require** :  autoconf, zlib and bzip2 libraries
 
@@ -63,16 +63,17 @@ sudo apt-get install autoconf libz-dev libbz2-dev
 
 ```bash
 cd $BIN_DIR
-wget https://github.com/torognes/vsearch/archive/v2.15.1.tar.gz
-tar xzf v2.15.1.tar.gz
-cd vsearch-2.15.1
+wget https://github.com/torognes/vsearch/archive/v2.17.0.tar.gz
+tar xzf v2.17.0.tar.gz
+cd vsearch-2.17.0
 ./autogen.sh
 ./configure
 make
+make install
 # test installation
 ./bin/vsearch -version
 # add to FROGS
-ln -s $BIN_DIR/vsearch-2.15.1/bin/vsearch $FROGS_libexec/.
+ln -s $BIN_DIR/vsearch-2.17.0/bin/vsearch $FROGS_libexec/.
 ```
 
 ## 2) FLASH 1.2.11 (optional), for FROGS Preprocess
@@ -453,6 +454,6 @@ Step phyloseq_structure lundi 4 janvier 2021, 14:19:47 (UTC+0100)
 Step phyloseq_clustering lundi 4 janvier 2021, 14:20:38 (UTC+0100)
 Step phyloseq_manova lundi 4 janvier 2021, 14:20:54 (UTC+0100)
 Step deseq2_preprocess lundi 4 janvier 2021, 14:21:10 (UTC+0100)
-Step deseq2_visualization lundi 4 janvier 2021, 14:21:43 (UTC+0100)
+Step deseq2_visualisation lundi 4 janvier 2021, 14:21:43 (UTC+0100)
 Completed with success
 ```
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 
 Visit our web site : http://frogs.toulouse.inrae.fr/
 
-[![Release](https://img.shields.io/badge/release-3.2.1-blue.svg)![Date](https://img.shields.io/badge/date-February%202021-red.svg)](https://github.com/geraldinepascal/FROGS-wrappers/releases) [<img src="https://www.podcastscience.fm/wp-content/uploads/2017/12/deezer.png" width="5%" style="display: block; margin: auto;"/>](https://www.deezer.com/fr/playlist/5233843102?utm_source=deezer&utm_content=playlist-5233843102&utm_term=18632989_1545296531&utm_medium=web)
+[![Release](https://img.shields.io/badge/release-3.2.2-blue.svg)![Date](https://img.shields.io/badge/date-April%202021-red.svg)](https://github.com/geraldinepascal/FROGS-wrappers/releases) [<img src="https://www.podcastscience.fm/wp-content/uploads/2017/12/deezer.png" width="5%" style="display: block; margin: auto;"/>](https://www.deezer.com/fr/playlist/5233843102?utm_source=deezer&utm_content=playlist-5233843102&utm_term=18632989_1545296531&utm_medium=web)
 
 
 
@@ -101,10 +101,10 @@ FROGS relies on different specific tools for each of the analysis steps.
 
 | FROGS Tools |Dependancy  | version tested |
 | ----------- | :--------: | -------------: |
-| Preprocess and Remove_chimera |        [vsearch](https://github.com/torognes/vsearch)        | 2.15.1 |
+| Preprocess and Remove_chimera |        [vsearch](https://github.com/torognes/vsearch)        | 2.17.0 |
 | Preprocess                    | [flash](https://sourceforge.net/projects/flashpage/files/) (optional) |               1.2.11 |
-| Preprocess                    |       [cutadapt](https://github.com/marcelm/cutadapt)        |            3.1 |
-| Clustering                    |          [swarm](https://github.com/torognes/swarm)          |            3.0.0 |
+| Preprocess                    |       [cutadapt](https://github.com/marcelm/cutadapt) (need to be >=2.8)       |            3.1 |
+| Clustering                    |          [swarm](https://github.com/torognes/swarm) (need to be >=2.1)          |            3.0.0 |
 | ITSx                          |        [ITSx](http://microbiology.se/software/itsx/)         |  1.1.2 |
 | Affiliation_OTU               | [NCBI BLAST+](http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download) |          2.10.1 |
 | Affiliation_OTU               |    [RDP Classifier](https://github.com/rdpstaff/RDPTools)    |                2.0.3 |
@@ -132,9 +132,9 @@ FROGS is now available on bioconda (https://anaconda.org/bioconda/frogs).
   * to create a specific environment for a specific FROGS version
 
 ```
-conda env create --name frogs@3.2.1 --file frogs-conda-requirements.yaml
+conda env create --name frogs@3.2.2 --file frogs-conda-requirements.yaml
 # to use FROGS, first you need to activate your environment
-conda activate frogs@3.2.1
+conda activate frogs@3.2.2
 ```
 
 ### From source
@@ -146,7 +146,7 @@ see [INSTALL_from_source.md](INSTALL_from_source.md)
 To check your installation you can type:
 ```
 cd <FROGS_PATH>/test
-# when using conda FROGS_PATH=<conda_env_dir>/__frogs@3.2.1/share/FROGS_3.2.1
+# when using conda FROGS_PATH=<conda_env_dir>/frogs@3.2.2/share/FROGS_3.2.2
 
 sh test.sh <FROGS_PATH> <NB_CPU> <JAVA_MEM> <OUT_FOLDER>
 ```
@@ -183,7 +183,7 @@ Step phyloseq_structure mardi 10 novembre 2020, 11:20:45 (UTC+0100)
 Step phyloseq_clustering mardi 10 novembre 2020, 11:21:59 (UTC+0100)
 Step phyloseq_manova mardi 10 novembre 2020, 11:22:20 (UTC+0100)
 Step deseq2_preprocess mardi 10 novembre 2020, 11:22:42 (UTC+0100)
-Step deseq2_visualization mardi 10 novembre 2020, 11:23:29 (UTC+0100)
+Step deseq2_visualisation mardi 10 novembre 2020, 11:23:29 (UTC+0100)
 Completed with success
 ```
 

diff --git a/RELEASES_NOTES.md b/RELEASES_NOTES.md
@@ -1,3 +1,32 @@
+# v3.2.2 [2021-04]
+
+### Modifications
+
+* Preprocess: use maxdiffpct instead of maxdiffs in the vsearch fastq_mergepairs command line, and recommend vsearch version 2.17.0.
+* DESeq2 : 
+  * rename tool in DESeq2 visualisation (with s instead of z)
+  * improve filter in datatable
+  * change color
+  * add padj threshold in MAplot
+* ITSx : add organism model option (it was previously restricted to Fungi; beware of increased computing time)
+* OTU affiliation : sort blast affiliations in biom by taxonomy
+* Clusters stat : add precision in HTML
+* Remove chimera : add precision in HTML, and rename table columns names
+* Affiliation Filter : add precision in HTML
+* Various tools:
+  * add taxonomic rank consistency between user declaration and input files (reference database, biom)
+  * correct typo
+
+### Bug fixed
+
+* DESeq2 visualisation : 
+  * add intermediates_dir argument in Rscript command
+  * debug pie charts color attribution
+* Normalisation : correct bug when calculating number of OTU by sample
+* ITSx : correct stderr scanning
+* Affiliation filter : correct bug in OTU filter by sample and by filter
+
+
 # v3.2.1 [2021-02-22]
 
 ### Bug fixed

diff --git a/app/deseq2_visualisation.Rmd b/app/deseq2_visualisation.Rmd
@@ -0,0 +1 @@
+../tools/deseq2_visualisation/deseq2_visualisation.Rmd
diff --git a/app/deseq2_visualisation.py b/app/deseq2_visualisation.py
@@ -0,0 +1 @@
+../tools/deseq2_visualisation/deseq2_visualisation.py
diff --git a/app/deseq2_visualization.Rmd b/app/deseq2_visualization.Rmd
diff --git a/app/deseq2_visualization.py b/app/deseq2_visualization.py
diff --git a/frogs-conda-requirements.yaml b/frogs-conda-requirements.yaml
@@ -3,15 +3,15 @@ channels:
   - bioconda
 dependencies:
 # bioconda
-  - frogs =3.2.1
+  - frogs =3.2.2
   - emboss =6.6
   - flash =1.2
   # need to be >=2.8
   - cutadapt =2.10
   # need to be >=2.1
   - swarm =3.0.0
-  # need to be 2.13.4
-  - vsearch =2.15.1
+  # need to be >= 2.17
+  - vsearch =2.17.0
   - itsx =1.1.2
   - blast =2.10
   # - rdptool=2.0.3 # is already included in the frogs dependency

diff --git a/libexec/addAffiliation2biom.py b/libexec/addAffiliation2biom.py
@@ -170,12 +170,16 @@ def get_bests_blast_affi( blast_files, taxonomy_by_subject ):
             if query_id not in blast_annot or blast_annot[query_id]['score'] < score:
                 blast_annot[query_id] = {
                     'score': score,
-                    'alignments': list(),
+                    'alignments': dict(),
                 }
             if blast_annot[query_id]['score'] == score: # select best HSP
                 # ~ subject_id = parts[1].split("#")[0]  # Subject field : <ID>#<PARTIAL_DESC> # why do not take into account the partial description ?
                 subject_id = parts[1]
-                blast_annot[query_id]['alignments'].append({
+                #store alignment by taxonomy to ordered affiliations by taxonomy and allow ordered output in bio_to_tsv tool
+                taxonomy = ';'.join(taxonomy_by_subject[subject_id])
+                if not taxonomy in blast_annot[query_id]['alignments']:
+                    blast_annot[query_id]['alignments'][taxonomy] = list()
+                blast_annot[query_id]['alignments'][taxonomy].append({
                     'subject': subject_id,
                     'taxonomy': taxonomy_by_subject[subject_id],
                     'evalue': parts[10],
@@ -219,8 +223,9 @@ def aff_to_metadata(reference_file, biom_in, biom_out, blast_files=None, rdp_fil
             blast_taxonomy = list()
             blast_affiliations = list()
             if cluster_id in cluster_blast_annot: # Current observation has a match
-                blast_taxonomy = get_tax_consensus( [alignment['taxonomy'] for alignment in cluster_blast_annot[cluster_id]['alignments']] )
-                blast_affiliations = cluster_blast_annot[cluster_id]['alignments']
+                blast_taxonomy = get_tax_consensus( [taxonomy.split(';') for taxonomy in cluster_blast_annot[cluster_id]['alignments']] )
+                for taxonomy in  cluster_blast_annot[cluster_id]['alignments']:
+                    blast_affiliations.extend(cluster_blast_annot[cluster_id]['alignments'][taxonomy])
             biom.add_metadata( cluster_id, "blast_affiliations", blast_affiliations, "observation" )
             biom.add_metadata( cluster_id, "blast_taxonomy", blast_taxonomy, "observation" )
         # RDP

diff --git a/libexec/parallelChimera.py b/libexec/parallelChimera.py
@@ -94,7 +94,8 @@ def write_summary( samples_names, sample_logs, log_remove_global, log_remove_spl
     FH_out.write( "\n" )
 
     FH_out.write( '##Metrics by sample\n' )
-    FH_out.write( "\t".join(['#Sample name', 'Kept nb', 'Kept abundance', 'Removed nb', 'Removed abundance', 'Abundance of the most abundant removed', 'Detected nb', 'Detected abundance', 'Abundance of the most abundant detected']) + "\n" )
+    # FH_out.write( "\t".join(['#Sample name', 'Kept nb', 'Kept abundance', 'Removed nb', 'Removed abundance', 'Abundance of the most abundant removed', 'Detected nb', 'Detected abundance', 'Abundance of the most abundant detected']) + "\n" )
+    FH_out.write( "\t".join(['#Sample name', "Clusters kept", "Cluster abundance kept", "Chimeric clusters removed", "Chimeric abundance removed", "Abundance of the most abundant chimera removed", "Individual chimera detected", "Individual chimera abundance detected", "Abundance of the most abundant individual chimera detected"]) + "\n" )
     for sample in sorted(samples_names):
         sample_remove_results = "\t".join(map(str, [sample,
                                                     log_remove_spl[sample]['nb_kept'],

diff --git a/libexec/parallelITSx.py b/libexec/parallelITSx.py
@@ -129,14 +129,13 @@ def submit_cmd( cmd, cwd=None):
     stdout, stderr = p.communicate()
 
     # check error status
-    if p.returncode != 0:
-        # stdeh = open(stderr)
-        error_msg = "".join( map(str, stderr.decode('utf-8').readlines()) )
-        # stdeh.close()
-        raise_exception( Exception( "\n\n#ERROR : " + error_msg + "\n\n" ))
-
-def parallel_submission( function, inputs, its, cwds, outputs, logs, cpu_used):
-    processes = [{'process':None, 'inputs':None, 'its':its, 'cwd' : None, 'outputs':None, 'log_files':None} for idx in range(cpu_used)]
+    # if p.returncode != 0:		# ==> ITSx do not return exit code 1!!
+    error_msg = stderr.decode('utf-8')
+    if "ERROR" in error_msg:      
+        raise_exception( Exception( "\n\n#ERROR : \n" + error_msg + "\n\n" ))
+
+def parallel_submission( function, inputs, its, organism_groups, cwds, outputs, logs, cpu_used):
+    processes = [{'process':None, 'inputs':None, 'its':its, 'organism_groups':organism_groups, 'cwd' : None, 'outputs':None, 'log_files':None} for idx in range(cpu_used)]
     # Launch processes
     for idx in range(len(inputs)):
         process_idx = idx % cpu_used
@@ -148,10 +147,10 @@ def parallel_submission( function, inputs, its, cwds, outputs, logs, cpu_used):
     for current_process in processes:
         if idx == 0:  # First process is threaded with parent job
             current_process['process'] = threading.Thread(target=function,
-                                                          args=(current_process['inputs'],current_process['its'], current_process['cwd'], current_process['outputs'], current_process['log_files']))
+                                                          args=(current_process['inputs'],current_process['its'], current_process['organism_groups'], current_process['cwd'], current_process['outputs'], current_process['log_files']))
         else:  # Others processes are processed on diffrerent CPU
             current_process['process'] = multiprocessing.Process(target=function,
-                                                                 args=(current_process['inputs'], current_process['its'], current_process['cwd'], current_process['outputs'], current_process['log_files']))
+                                                                 args=(current_process['inputs'], current_process['its'], current_process['organism_groups'], current_process['cwd'], current_process['outputs'], current_process['log_files']))
         current_process['process'].start()
     # Wait processes end
     for current_process in processes:
@@ -211,7 +210,7 @@ def parseITSxResult(input_dir, prefix, its, out, log):
                 FH_log.write("\tnb "+detection_type+ " (removed): " + str(count_ITSx[detection_type]) + "\n")
     FH_log.close()
 
-def process_ITSx(in_fasta, its, cwd, out, log_file):
+def process_ITSx(in_fasta, its, organism_groups, cwd, out, log_file):
 
     os.mkdir(cwd)
     prefix = os.path.splitext(os.path.split(in_fasta)[1])[0]
@@ -220,7 +219,7 @@ def process_ITSx(in_fasta, its, cwd, out, log_file):
     FH_log = Logger( log_file )
     FH_log.write("## Input file : " + os.path.split(in_fasta)[1] + "\n" ) 
     FH_log.write("## in working directory: " + cwd + "\n")
-    cmd = ["ITSx", "-i", in_fasta, "-o", prefix , "--preserve", "T","-t","F","--save_regions","all"]
+    cmd = ["ITSx", "-i", in_fasta, "-o", prefix , "--preserve", "T","-t",",".join(organism_groups),"--save_regions","all"]
     FH_log.write("## ITSx command: " + " ".join(cmd) + "\n")
     submit_cmd( cmd , cwd )
     FH_log.close()
@@ -418,9 +417,9 @@ def main_process(args):
             in_fasta = os.path.abspath(args.input_fasta)
             tmp_dir = tmpFiles.add_dir(os.path.split(args.output_fasta)[1])
             if not args.check_its_only:
-                process_ITSx(in_fasta, args.its, tmp_dir, args.output_fasta, args.log_file)
+                process_ITSx(in_fasta, args.its, args.organism_groups, tmp_dir, args.output_fasta, args.log_file)
             else:
-                process_ITSx(in_fasta, 'no_detections', tmp_dir, args.output_fasta, args.log_file)
+                process_ITSx(in_fasta, 'no_detections', args.organism_groups, tmp_dir, args.output_fasta, args.log_file)
         else:
             fasta_ITSx_list = list()
             ITSx_outputs = list()
@@ -432,9 +431,9 @@ def main_process(args):
             logs_ITSx = [tmpFiles.add(os.path.basename(current_fasta) + "_itsx.log") for current_fasta in fasta_ITSx_list]
             tmp_dirs = [ tmpFiles.add_dir(os.path.split(current_fasta)[1]) for current_fasta in fasta_ITSx_list ]
             if not args.check_its_only:
-                parallel_submission( process_ITSx, fasta_ITSx_list, args.its, tmp_dirs, ITSx_outputs, logs_ITSx, len(fasta_ITSx_list) )
+                parallel_submission( process_ITSx, fasta_ITSx_list, args.its, args.organism_groups, tmp_dirs, ITSx_outputs, logs_ITSx, len(fasta_ITSx_list) )
             else:
-                parallel_submission( process_ITSx, fasta_ITSx_list, 'no_detections', tmp_dirs, ITSx_outputs, logs_ITSx, len(fasta_ITSx_list) )
+                parallel_submission( process_ITSx, fasta_ITSx_list, 'no_detections', args.organism_groups, tmp_dirs, ITSx_outputs, logs_ITSx, len(fasta_ITSx_list) )
 
             # Logs
             append_results(ITSx_outputs, logs_ITSx, args.output_fasta, args.log_file)
@@ -470,6 +469,7 @@ def main_process(args):
     parser.add_argument( '--debug', default=False, action='store_true', help="Keep temporary files to debug program." )
     parser.add_argument( '-v', '--version', action='version', version=__version__ + " [ITSx " + get_ITSx_version() + "]" )
     parser.add_argument( '-i', '--its', type=str, required=True, choices=['ITS1','ITS2'], help='Which ITS region are targeted. either ITS1 or ITS2 ')
+    parser.add_argument( '--organism-groups', type=str, nargs="*", default=['F'], help='Reduce ITSx scan to specified organim groups. [Default: %(default)s , which means Fungi only]')
     parser.add_argument( '--check-its-only', action='store_true', default=False, help='Check only if sequences seem to be an ITS. No sequence trimming will happen' )
     group_input = parser.add_argument_group( 'Inputs' ) # Inputs
     group_input.add_argument( '-f', '--input-fasta', required=True, help='The fasta input sequences to treat' )

diff --git a/libexec/reduce_ref_for_needleall.py b/libexec/reduce_ref_for_needleall.py
@@ -94,10 +94,10 @@ def extract_ref(input_blast_R1, input_blast_R2, input_ref, output_ref):
     parse_blast(input_blast_R2, best_ref)
 
     # extract ref
+    c = 0
     if len(best_ref) > 0 :
         FH_in = FastaIO(input_ref)
         FH_out = FastaIO(output_ref,"wt")
-        c = 0
         for record in FH_in:
             c += 1
             if record.id in best_ref:

diff --git a/test/test.sh b/test/test.sh
@@ -474,19 +474,19 @@ then
 fi
 
 
-echo "Step deseq2_visualization `date`"
+echo "Step deseq2_visualisation `date`"
 
-deseq2_visualization.py \
+deseq2_visualisation.py \
  --phyloseqData $out_dir/16-phylo_import.Rdata \
  --dds $out_dir/23-deseq2_preprocess.Rdata \
- --log-file $out_dir/24-deseq2_visualization.log \
- --html $out_dir/24-deseq2_visualization.nb.html \
+ --log-file $out_dir/24-deseq2_visualisation.log \
+ --html $out_dir/24-deseq2_visualisation.nb.html \
  --var EnvType --mod1 BoeufHache --mod2 SaumonFume
 
 
 if [ $? -ne 0 ]
 then
-	echo "Error in deseq2_visualization " >&2
+	echo "Error in deseq2_visualisation " >&2
 	exit 1;
 fi
 

diff --git a/test/test_dependancies.sh b/test/test_dependancies.sh
@@ -883,31 +883,31 @@ then
 	echo "Difference in deseq2_preprocess : 23-deseq2_preprocess.Rdata " >&2
 fi
 
-echo "Step deseq2_visualization `date`"
+echo "Step deseq2_visualisation `date`"
 
 if $run_programs
 then
-	deseq2_visualization.py \
+	deseq2_visualisation.py \
 	 --phyloseqData $expected_dir/16-phylo_import.Rdata \
 	 --dds $expected_dir/23-deseq2_preprocess.Rdata \
-	 --log-file $out_dir/24-deseq2_visualization.log \
-	 --html $out_dir/24-deseq2_visualization.nb.html \
+	 --log-file $out_dir/24-deseq2_visualisation.log \
+	 --html $out_dir/24-deseq2_visualisation.nb.html \
 	 --var EnvType --mod1 BoeufHache --mod2 SaumonFume
 
 
 	if [ $? -ne 0 ]
 	then
-		echo "Error in deseq2_visualization " >&2
+		echo "Error in deseq2_visualisation " >&2
 		exit 1;
 	fi
 fi
 
-grep otu_01582 $out_dir/24-deseq2_visualization.nb.html | sed 's/],/],\n/g' > tmp
-grep otu_01582 $expected_dir/24-deseq2_visualization.nb.html | sed 's/],/],\n/g'  > tmp1
+grep otu_01582 $out_dir/24-deseq2_visualisation.nb.html | sed 's/],/],\n/g' > tmp
+grep otu_01582 $expected_dir/24-deseq2_visualisation.nb.html | sed 's/],/],\n/g'  > tmp1
 
 if diff_line tmp tmp1 1
 then
-	echo "Difference in deseq2_visualization : 24-deseq2_visualization.nb.html  " >&2
+	echo "Difference in deseq2_visualisation : 24-deseq2_visualisation.nb.html  " >&2
 fi
 
 rm tmp tmp1