From 421c4f24789e6c9022f80c2bc6e330f72d96d41e Mon Sep 17 00:00:00 2001 From: kiesers Date: Thu, 31 May 2018 11:02:35 +0200 Subject: [PATCH 1/4] added spades 3.12 merged reads are used sperately by spades. added possibility to drop normalization --- atlas/default_values.py | 8 +++- atlas/envs/required_packages.yaml | 2 +- atlas/rules/assemble.snakefile | 66 ++++++++++++++++++++++--------- atlas/template_config.yaml | 8 +++- 4 files changed, 62 insertions(+), 22 deletions(-) diff --git a/atlas/default_values.py b/atlas/default_values.py index f77c2faf..19268d40 100644 --- a/atlas/default_values.py +++ b/atlas/default_values.py @@ -106,7 +106,7 @@ def make_default_config(): conf["deduplicate"] = True conf["error_correction_overlapping_pairs"] = True - conf["merge_pairs_before_assembly"] = True + conf["contaminant_max_indel"] = CONTAMINANT_MAX_INDEL conf["contaminant_min_ratio"] = CONTAMINANT_MIN_RATIO @@ -117,10 +117,15 @@ def make_default_config(): conf["duplicates_only_optical"] = DUPLICATES_ONLY_OPTICAL conf["duplicates_allow_substitutions"] = DUPLICATES_ALLOW_SUBSTITUTIONS + + conf["normalize_reads_before_assembly"] = True conf["normalization_kmer_length"] = NORMALIZATION_KMER_LENGTH conf["normalization_target_depth"] = NORMALIZATION_TARGET_DEPTH conf["normalization_minimum_kmers"] = NORMALIZATION_MINIMUM_KMERS + conf["error_correction_before_assembly"] = True + + conf["merge_pairs_before_assembly"] = True conf["merging_k"] = MERGING_K conf["merging_extend2"] = MERGING_EXTEND2 conf["merging_flags"] = MERGING_FLAGS @@ -140,6 +145,7 @@ def make_default_config(): conf["prefilter_minimum_contig_length"] = PREFILTER_MINIMUM_CONTIG_LENGTH conf["spades_k"] = SPADES_K conf["spades_preset"] = 'meta' + conf["spades_skip_BayesHammer"] = False conf["minimum_average_coverage"] = MINIMUM_AVERAGE_COVERAGE conf["minimum_percent_covered_bases"] = MINIMUM_PERCENT_COVERED_BASES conf["minimum_mapped_reads"] = MINIMUM_MAPPED_READS diff --git a/atlas/envs/required_packages.yaml b/atlas/envs/required_packages.yaml index c24e97c4..cf976591 100644 --- a/atlas/envs/required_packages.yaml +++ b/atlas/envs/required_packages.yaml @@ -15,6 +15,6 @@ dependencies: - prokka=1.12 - pyyaml=3.12 - samtools=1.3.1 - - spades=3.11.0 + - spades=3.12.0 - sqlite=3.13.0 - subread=1.5.3 diff --git a/atlas/rules/assemble.snakefile b/atlas/rules/assemble.snakefile index e2917d84..495da362 100644 --- a/atlas/rules/assemble.snakefile +++ b/atlas/rules/assemble.snakefile @@ -9,13 +9,21 @@ import warnings localrules: rename_megahit_output, rename_spades_output, initialize_checkm, \ finalize_contigs, build_bin_report, build_assembly_report +ASSEMBLY_FRACTIONS = MULTIFILE_FRACTIONS +if config.get("merge_pairs_before_assembly", True) and PAIRED_END: + ASSEMBLY_FRACTIONS += ['me'] def get_preprocessing_steps(config): - preprocessing_steps = ["normalized"] - if config.get("error_correction_overlapping_pairs", True): + preprocessing_steps = [] + if config.get("normalize_reads_before_assembly", True): + preprocessing_steps.append("normalized") + + if config.get("error_correction_before_assembly", True): preprocessing_steps.append("errorcorr") + if config.get("merge_pairs_before_assembly", True) and PAIRED_END: preprocessing_steps.append("merged") + return ".".join(preprocessing_steps) @@ -142,7 +150,7 @@ rule merge_pairs: fraction=MULTIFILE_FRACTIONS) output: temp(expand("{{sample}}/assembly/reads/{{previous_steps}}.merged_{fraction}.fastq.gz", - fraction=MULTIFILE_FRACTIONS)) + fraction=ASSEMBLY_FRACTIONS)) threads: config.get("threads", 1) resources: @@ -159,25 +167,39 @@ rule merge_pairs: params: kmer = config.get("merging_k", MERGING_K), extend2 = config.get("merging_extend2", MERGING_EXTEND2), - flags = config.get("merging_flags", MERGING_FLAGS), - outmerged = lambda wc, output: os.path.join(os.path.dirname(output[0]), "%s_merged_pairs.fastq.gz" % wc.sample) + flags = config.get("merging_flags", MERGING_FLAGS) shell: """ bbmerge.sh -Xmx{resources.java_mem}G threads={threads} \ in1={input[0]} in2={input[1]} \ - outmerged={params.outmerged} \ + outmerged={output[3]} \ outu={output[0]} outu2={output[1]} \ {params.flags} k={params.kmer} \ extend2={params.extend2} 2> {log} - cat {params.outmerged} {input[2]} \ - > {output[2]} 2>> {log} + cp {input[2]} {output[2]} 2>> {log} """ assembly_params={} if config.get("assembler", "megahit") == "megahit": assembly_params['megahit']={'default':'','meta-sensitive':'--presets meta-sensitive','meta-large':' --presets meta-large'} + if PAIRED_END and config.get("merge_pairs_before_assembly", True): + + localrules: merge_se_me_for_megahit + rule merge_se_me_for_megahit: + input: + expand("{{sample}}/assembly/reads/{assembly_preprocessing_steps}_{fraction}.fastq.gz", + fraction=['se','me'], assembly_preprocessing_steps=assembly_preprocessing_steps) + output: + temp(expand("{{sample}}/assembly/reads/{assembly_preprocessing_steps}_{fraction}.fastq.gz", + fraction=['co'], assembly_preprocessing_steps=assembly_preprocessing_steps)) + shell: + "zcat {input} > {output}" + + ASSEMBLY_FRACTIONS = ['R1','R2','co'] + + rule run_megahit: input: expand("{{sample}}/assembly/reads/{assembly_preprocessing_steps}_{fraction}.fastq.gz", @@ -200,9 +222,8 @@ if config.get("assembler", "megahit") == "megahit": low_local_ratio = config.get("megahit_low_local_ratio", MEGAHIT_LOW_LOCAL_RATIO), min_contig_len = config.get("prefilter_minimum_contig_length", PREFILTER_MINIMUM_CONTIG_LENGTH), outdir = lambda wc, output: os.path.dirname(output[0]), - inputs = lambda wc, input: "-1 {0} -2 {1} --read {2}".format(*input) if PAIRED_END else "--read {0}".format(*input), - preset = assembly_params['megahit'][config['megahit_preset']] - + inputs = lambda wc, input: "-1 {0} -2 {1} ".format(*input) if PAIRED_END else "--read {0}".format(*input), + preset = assembly_params['megahit'][config['megahit_preset']], conda: "%s/required_packages.yaml" % CONDAENV threads: @@ -212,6 +233,7 @@ if config.get("assembler", "megahit") == "megahit": shell: """ rm -r {params.outdir} 2> {log} + megahit \ {params.inputs} \ --tmp-dir {TMPDIR} \ @@ -246,18 +268,19 @@ else: rule run_spades: input: expand("{{sample}}/assembly/reads/{assembly_preprocessing_steps}_{fraction}.fastq.gz", - fraction=MULTIFILE_FRACTIONS, + fraction=ASSEMBLY_FRACTIONS, assembly_preprocessing_steps=assembly_preprocessing_steps) output: temp("{sample}/assembly/contigs.fasta") benchmark: "logs/benchmarks/assembly/spades/{sample}.txt" params: - inputs = lambda wc, input: "-1 {0} -2 {1} -s {2}".format(*input) if PAIRED_END else "-s {0}".format(*input), + inputs = lambda wc, input: "-pe1-1 {0} -pe1-2 {1} -pe1-s {2}".format(*input) if PAIRED_END else "-s {0}".format(*input), + input_merged = lambda wc, input: "pe1-m {3}".format(*input) if len(input) == 4 else "", k = config.get("spades_k", SPADES_K), outdir = lambda wc: "{sample}/assembly".format(sample=wc.sample), - preset = assembly_params['spades'][config['spades_preset']] - # min_length=config.get("prefilter_minimum_contig_length", PREFILTER_MINIMUM_CONTIG_LENGTH) + preset = assembly_params['spades'][config['spades_preset']], + skip_error_correction = "--only-assembler" if config['spades_skip_BayesHammer'] else "f" log: "{sample}/logs/assembly/spades.log" # shadow: @@ -269,10 +292,15 @@ else: resources: mem=config.get("assembly_memory", ASSEMBLY_MEMORY) #in GB shell: - """ - spades.py --threads {threads} --memory {resources.mem} \ - -o {params.outdir} {params.preset} {params.inputs} > {log} 2>&1 - """ + "spades.py" + "--threads {threads}" + "--memory {resources.mem}" + "-o {params.outdir}" + "{params.preset}" + "{params.inputs} {params.input_merged}" + "{params.skip_error_correction}" + "> {log} 2>&1" + rule rename_spades_output: diff --git a/atlas/template_config.yaml b/atlas/template_config.yaml index 31b6a994..4b904900 100644 --- a/atlas/template_config.yaml +++ b/atlas/template_config.yaml @@ -60,12 +60,17 @@ contaminant_ambiguous: best ######################## # Pre-assembly-processing ######################## + +normalize_reads_before_assembly : true # target kmer depth normalization_target_depth: 100 normalization_kmer_length: 21 normalization_minimum_kmers: 15 + +error_correction_before_assembly : true + # join R1 and R2 at overlap; unjoined reads are still utilized -merge_pairs_before_assembly: true +merge_pairs_before_assembly : true # extend reads while merging to this many nucleotides merging_extend2: 40 # Iterations are performed until extend2 x iterations @@ -94,6 +99,7 @@ megahit_preset: default # Spades #------------ +spades_skip_BayesHammer: False spades_k: auto # Filtering From 9247476fef2cced76067b801b915b30063fa426c Mon Sep 17 00:00:00 2001 From: kiesers Date: Thu, 31 May 2018 11:32:18 +0200 Subject: [PATCH 2/4] corrected bugs- tested on spades and megahit --- atlas/rules/assemble.snakefile | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/atlas/rules/assemble.snakefile b/atlas/rules/assemble.snakefile index 495da362..153ed915 100644 --- a/atlas/rules/assemble.snakefile +++ b/atlas/rules/assemble.snakefile @@ -4,12 +4,12 @@ import sys from glob import glob from snakemake.utils import report import warnings - +from copy import copy localrules: rename_megahit_output, rename_spades_output, initialize_checkm, \ finalize_contigs, build_bin_report, build_assembly_report -ASSEMBLY_FRACTIONS = MULTIFILE_FRACTIONS +ASSEMBLY_FRACTIONS = copy(MULTIFILE_FRACTIONS) if config.get("merge_pairs_before_assembly", True) and PAIRED_END: ASSEMBLY_FRACTIONS += ['me'] @@ -233,7 +233,7 @@ if config.get("assembler", "megahit") == "megahit": shell: """ rm -r {params.outdir} 2> {log} - + megahit \ {params.inputs} \ --tmp-dir {TMPDIR} \ @@ -275,12 +275,12 @@ else: benchmark: "logs/benchmarks/assembly/spades/{sample}.txt" params: - inputs = lambda wc, input: "-pe1-1 {0} -pe1-2 {1} -pe1-s {2}".format(*input) if PAIRED_END else "-s {0}".format(*input), - input_merged = lambda wc, input: "pe1-m {3}".format(*input) if len(input) == 4 else "", + inputs = lambda wc, input: "--pe1-1 {0} --pe1-2 {1} --pe1-s {2}".format(*input) if PAIRED_END else "-s {0}".format(*input), + input_merged = lambda wc, input: "--pe1-m {3}".format(*input) if len(input) == 4 else "", k = config.get("spades_k", SPADES_K), outdir = lambda wc: "{sample}/assembly".format(sample=wc.sample), preset = assembly_params['spades'][config['spades_preset']], - skip_error_correction = "--only-assembler" if config['spades_skip_BayesHammer'] else "f" + skip_error_correction = "--only-assembler" if config['spades_skip_BayesHammer'] else "" log: "{sample}/logs/assembly/spades.log" # shadow: @@ -292,14 +292,14 @@ else: resources: mem=config.get("assembly_memory", ASSEMBLY_MEMORY) #in GB shell: - "spades.py" - "--threads {threads}" - "--memory {resources.mem}" - "-o {params.outdir}" - "{params.preset}" - "{params.inputs} {params.input_merged}" - "{params.skip_error_correction}" - "> {log} 2>&1" + "spades.py " + " --threads {threads} " + " --memory {resources.mem} " + " -o {params.outdir} " + " {params.preset} " + " {params.inputs} {params.input_merged} " + " {params.skip_error_correction} " + " > {log} 2>&1 " From 3f63fa45aa4081d9bfb4932c1899259d5448d90d Mon Sep 17 00:00:00 2001 From: kiesers Date: Fri, 1 Jun 2018 08:26:34 +0200 Subject: [PATCH 3/4] added loose parameters made filtering optional not tested --- atlas/default_values.py | 25 ++-- atlas/rules/assemble.snakefile | 205 +++++++++++++++++---------------- atlas/template_config.yaml | 16 +-- 3 files changed, 133 insertions(+), 113 deletions(-) diff --git a/atlas/default_values.py b/atlas/default_values.py index 19268d40..6fd4a871 100644 --- a/atlas/default_values.py +++ b/atlas/default_values.py @@ -27,8 +27,11 @@ DUPLICATES_ALLOW_SUBSTITUTIONS = 2 NORMALIZATION_KMER_LENGTH = 21 -NORMALIZATION_TARGET_DEPTH = 100 -NORMALIZATION_MINIMUM_KMERS = 15 + +# almost no filtering unless grossly over-represented +NORMALIZATION_TARGET_DEPTH = 10000 #500 +# allow very low represented kmers to remain +NORMALIZATION_MINIMUM_KMERS = 3 #15 ASSEMBLY_MEMORY = 50 ASSEMBLY_THREADS = 8 @@ -40,13 +43,16 @@ MEGAHIT_PRUNE_LEVEL = 2 MEGAHIT_LOW_LOCAL_RATIO = 0.2 SPADES_K = "auto" -PREFILTER_MINIMUM_CONTIG_LENGTH = 500 -MINIMUM_CONTIG_LENGTH = 2200 - -MINIMUM_AVERAGE_COVERAGE = 5 -MINIMUM_PERCENT_COVERED_BASES = 40 +# basically no filtering after assembly +PREFILTER_MINIMUM_CONTIG_LENGTH = 200 #500 +# this is bumped up slightly to filter non-merged R1 and R2 sequences +MINIMUM_CONTIG_LENGTH = 300 #2200 + +# leave all contigs +MINIMUM_AVERAGE_COVERAGE = 1 #5 +MINIMUM_PERCENT_COVERED_BASES = 20 #40 MINIMUM_MAPPED_READS = 0 -CONTIG_TRIM_BP = 100 +CONTIG_TRIM_BP = 0 #100 # bases MINIMUM_REGION_OVERLAP = 1 @@ -124,7 +130,7 @@ def make_default_config(): conf["normalization_minimum_kmers"] = NORMALIZATION_MINIMUM_KMERS conf["error_correction_before_assembly"] = True - + conf["merge_pairs_before_assembly"] = True conf["merging_k"] = MERGING_K conf["merging_extend2"] = MERGING_EXTEND2 @@ -146,6 +152,7 @@ def make_default_config(): conf["spades_k"] = SPADES_K conf["spades_preset"] = 'meta' conf["spades_skip_BayesHammer"] = False + conf["filter_contigs"] = True conf["minimum_average_coverage"] = MINIMUM_AVERAGE_COVERAGE conf["minimum_percent_covered_bases"] = MINIMUM_PERCENT_COVERED_BASES conf["minimum_mapped_reads"] = MINIMUM_MAPPED_READS diff --git a/atlas/rules/assemble.snakefile b/atlas/rules/assemble.snakefile index 153ed915..6b8bfab1 100644 --- a/atlas/rules/assemble.snakefile +++ b/atlas/rules/assemble.snakefile @@ -356,107 +356,118 @@ rule combine_sample_contig_stats: c.to_csv(output[0], sep='\t') +if config['filter_contigs']: -rule calculate_prefiltered_contig_coverage_stats: - input: - unpack(get_quality_controlled_reads), - fasta = "{sample}/assembly/{sample}_prefilter_contigs.fasta" - output: # bbwrap gives output statistics only for single ended - covstats = "{sample}/assembly/contig_stats/prefilter_coverage_stats.txt", - sam = temp("{sample}/sequence_alignment/alignment_to_prefilter_contigs.sam") - benchmark: - "logs/benchmarks/assembly/post_process/align_reads_to_prefiltered_contigs/{sample}.txt" - params: - input = lambda wc, input : input_params_for_bbwrap(wc, input), - maxsites = config.get("maximum_counted_map_sites", MAXIMUM_COUNTED_MAP_SITES), - max_distance_between_pairs = config.get('contig_max_distance_between_pairs', CONTIG_MAX_DISTANCE_BETWEEN_PAIRS), - paired_only = 't' if config.get("contig_map_paired_only", CONTIG_MAP_PAIRED_ONLY) else 'f', - min_id = config.get('contig_min_id', CONTIG_MIN_ID), - maxindel = 100, - #ambiguous = 'all' if CONTIG_COUNT_MULTI_MAPPED_READS else 'best' - log: - "{sample}/logs/assembly/post_process/align_reads_to_prefiltered_contigs.log" - conda: - "%s/required_packages.yaml" % CONDAENV - threads: - config.get("threads", 1) - resources: - mem = config.get("java_mem", JAVA_MEM), - java_mem = int(config.get("java_mem", JAVA_MEM) * JAVA_MEM_FRACTION) - shell: - """bbwrap.sh \ - nodisk=t \ - ref={input.fasta} \ - {params.input} \ - fast=t \ - threads={threads} \ - ambiguous=all \ - pairlen={params.max_distance_between_pairs} \ - pairedonly={params.paired_only} \ - mdtag=t \ - xstag=fs \ - nmtag=t \ - local=t \ - secondary=t \ - maxsites={params.maxsites} \ - -Xmx{resources.java_mem}G \ - out={output.sam} 2> {log} - - pileup.sh \ - ref={input.fasta} \ - in={output.sam} \ - threads={threads} \ - secondary=t \ - -Xmx{resources.java_mem}G \ - covstats={output.covstats} 2>> {log} - """ + rule calculate_prefiltered_contig_coverage_stats: + input: + unpack(get_quality_controlled_reads), + fasta = "{sample}/assembly/{sample}_prefilter_contigs.fasta" + output: # bbwrap gives output statistics only for single ended + covstats = "{sample}/assembly/contig_stats/prefilter_coverage_stats.txt", + sam = temp("{sample}/sequence_alignment/alignment_to_prefilter_contigs.sam") + benchmark: + "logs/benchmarks/assembly/post_process/align_reads_to_prefiltered_contigs/{sample}.txt" + params: + input = lambda wc, input : input_params_for_bbwrap(wc, input), + maxsites = config.get("maximum_counted_map_sites", MAXIMUM_COUNTED_MAP_SITES), + max_distance_between_pairs = config.get('contig_max_distance_between_pairs', CONTIG_MAX_DISTANCE_BETWEEN_PAIRS), + paired_only = 't' if config.get("contig_map_paired_only", CONTIG_MAP_PAIRED_ONLY) else 'f', + min_id = config.get('contig_min_id', CONTIG_MIN_ID), + maxindel = 100, + #ambiguous = 'all' if CONTIG_COUNT_MULTI_MAPPED_READS else 'best' + log: + "{sample}/logs/assembly/post_process/align_reads_to_prefiltered_contigs.log" + conda: + "%s/required_packages.yaml" % CONDAENV + threads: + config.get("threads", 1) + resources: + mem = config.get("java_mem", JAVA_MEM), + java_mem = int(config.get("java_mem", JAVA_MEM) * JAVA_MEM_FRACTION) + shell: + """bbwrap.sh \ + nodisk=t \ + ref={input.fasta} \ + {params.input} \ + fast=t \ + threads={threads} \ + ambiguous=all \ + pairlen={params.max_distance_between_pairs} \ + pairedonly={params.paired_only} \ + mdtag=t \ + xstag=fs \ + nmtag=t \ + local=t \ + secondary=t \ + maxsites={params.maxsites} \ + -Xmx{resources.java_mem}G \ + out={output.sam} 2> {log} + + pileup.sh \ + ref={input.fasta} \ + in={output.sam} \ + threads={threads} \ + secondary=t \ + -Xmx{resources.java_mem}G \ + covstats={output.covstats} 2>> {log} + """ -rule filter_by_coverage: - input: - fasta = "{sample}/assembly/{sample}_prefilter_contigs.fasta", - covstats = "{sample}/assembly/contig_stats/prefilter_coverage_stats.txt" - output: - fasta = temp("{sample}/assembly/{sample}_final_contigs.fasta"), - removed_names = "{sample}/assembly/{sample}_discarded_contigs.fasta" - params: - minc = config.get("minimum_average_coverage", MINIMUM_AVERAGE_COVERAGE), - minp = config.get("minimum_percent_covered_bases", MINIMUM_PERCENT_COVERED_BASES), - minr = config.get("minimum_mapped_reads", MINIMUM_MAPPED_READS), - minl = config.get("minimum_contig_length", MINIMUM_CONTIG_LENGTH), - trim = config.get("contig_trim_bp", CONTIG_TRIM_BP) - log: - "{sample}/logs/assembly/post_process/filter_by_coverage.log" - conda: - "%s/required_packages.yaml" % CONDAENV - threads: - 1 - resources: - mem = config.get("java_mem", JAVA_MEM), - java_mem = int(config.get("java_mem", JAVA_MEM) * JAVA_MEM_FRACTION) - shell: - """filterbycoverage.sh in={input.fasta} \ - cov={input.covstats} \ - out={output.fasta} \ - outd={output.removed_names} \ - minc={params.minc} \ - minp={params.minp} \ - minr={params.minr} \ - minl={params.minl} \ - trim={params.trim} \ - -Xmx{resources.java_mem}G 2> {log}""" - - -rule finalize_contigs: - input: - "{sample}/assembly/{sample}_final_contigs.fasta" - output: - "{sample}/{sample}_contigs.fasta" - threads: - 1 - shell: - "cp {input} {output}" + rule filter_by_coverage: + input: + fasta = "{sample}/assembly/{sample}_prefilter_contigs.fasta", + covstats = "{sample}/assembly/contig_stats/prefilter_coverage_stats.txt" + output: + fasta = temp("{sample}/assembly/{sample}_final_contigs.fasta"), + removed_names = "{sample}/assembly/{sample}_discarded_contigs.fasta" + params: + minc = config.get("minimum_average_coverage", MINIMUM_AVERAGE_COVERAGE), + minp = config.get("minimum_percent_covered_bases", MINIMUM_PERCENT_COVERED_BASES), + minr = config.get("minimum_mapped_reads", MINIMUM_MAPPED_READS), + minl = config.get("minimum_contig_length", MINIMUM_CONTIG_LENGTH), + trim = config.get("contig_trim_bp", CONTIG_TRIM_BP) + log: + "{sample}/logs/assembly/post_process/filter_by_coverage.log" + conda: + "%s/required_packages.yaml" % CONDAENV + threads: + 1 + resources: + mem = config.get("java_mem", JAVA_MEM), + java_mem = int(config.get("java_mem", JAVA_MEM) * JAVA_MEM_FRACTION) + shell: + """filterbycoverage.sh in={input.fasta} \ + cov={input.covstats} \ + out={output.fasta} \ + outd={output.removed_names} \ + minc={params.minc} \ + minp={params.minp} \ + minr={params.minr} \ + minl={params.minl} \ + trim={params.trim} \ + -Xmx{resources.java_mem}G 2> {log}""" + + + rule finalize_contigs: + input: + "{sample}/assembly/{sample}_final_contigs.fasta" + output: + "{sample}/{sample}_contigs.fasta" + threads: + 1 + shell: + "cp {input} {output}" +else: # no filter + rule finalize_contigs: + input: + "{sample}/assembly/{sample}_prefilter_contigs.fasta" + output: + "{sample}/{sample}_contigs.fasta" + threads: + 1 + shell: + "cp {input} {output}" rule align_reads_to_final_contigs: input: diff --git a/atlas/template_config.yaml b/atlas/template_config.yaml index 4b904900..2b65110d 100644 --- a/atlas/template_config.yaml +++ b/atlas/template_config.yaml @@ -63,9 +63,9 @@ contaminant_ambiguous: best normalize_reads_before_assembly : true # target kmer depth -normalization_target_depth: 100 +normalization_target_depth: 10000 normalization_kmer_length: 21 -normalization_minimum_kmers: 15 +normalization_minimum_kmers: 3 error_correction_before_assembly : true @@ -104,16 +104,18 @@ spades_k: auto # Filtering #------------ -# trim contig tips -contig_trim_bp: 20 -# filter out assembled noise prefilter_minimum_contig_length: 200 +# filter out assembled noise +# this is more important for assemblys from megahit +filter_contigs: true +# trim contig tips +contig_trim_bp: 0 # require contigs to have read support minimum_average_coverage: 1 -minimum_percent_covered_bases: 40 +minimum_percent_covered_bases: 20 minimum_mapped_reads: 0 # after filtering -minimum_contig_length: 200 +minimum_contig_length: 300 ######################## From de8c3e6d68cca4da5e5f9a6326e5b2e823454d2d Mon Sep 17 00:00:00 2001 From: kiesers Date: Fri, 1 Jun 2018 09:32:08 +0200 Subject: [PATCH 4/4] corrected bug with rel paths --- atlas/rules/assemble.snakefile | 44 ++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/atlas/rules/assemble.snakefile b/atlas/rules/assemble.snakefile index 6b8bfab1..26227a1f 100644 --- a/atlas/rules/assemble.snakefile +++ b/atlas/rules/assemble.snakefile @@ -14,7 +14,7 @@ if config.get("merge_pairs_before_assembly", True) and PAIRED_END: ASSEMBLY_FRACTIONS += ['me'] def get_preprocessing_steps(config): - preprocessing_steps = [] + preprocessing_steps = ['QC'] if config.get("normalize_reads_before_assembly", True): preprocessing_steps.append("normalized") @@ -53,12 +53,23 @@ def bb_cov_stats_to_maxbin(tsv_in, tsv_out): assembly_preprocessing_steps = get_preprocessing_steps(config) +localrules: init_pre_assembly_processing +rule init_pre_assembly_processing: + input: + unpack(get_quality_controlled_reads) #expect SE or R1,R2 or R1,R2,SE + output: + temp(expand("{{sample}}/assembly/reads/QC_{fraction}.fastq.gz", + fraction=MULTIFILE_FRACTIONS)) + run: + # make symlink + for i in range(len(input)): + os.symlink(os.path.relpath(input[i],os.path.dirname(output[i])),output[i]) rule normalize_coverage_across_kmers: input: unpack(get_quality_controlled_reads) #expect SE or R1,R2 or R1,R2,SE output: - temp(expand("{{sample}}/assembly/reads/normalized_{fraction}.fastq.gz", + temp(expand("{{sample}}/assembly/reads/QC.normalized_{fraction}.fastq.gz", fraction=MULTIFILE_FRACTIONS)) params: k = config.get("normalization_kmer_length", NORMALIZATION_KMER_LENGTH), @@ -447,28 +458,31 @@ if config['filter_contigs']: trim={params.trim} \ -Xmx{resources.java_mem}G 2> {log}""" - - rule finalize_contigs: - input: - "{sample}/assembly/{sample}_final_contigs.fasta" - output: - "{sample}/{sample}_contigs.fasta" - threads: - 1 - shell: - "cp {input} {output}" - +# HACK: this makes two copies of the same file else: # no filter - rule finalize_contigs: + localrules: do_not_filter_contigs + rule do_not_filter_contigs: input: "{sample}/assembly/{sample}_prefilter_contigs.fasta" output: - "{sample}/{sample}_contigs.fasta" + "{sample}/assembly/{sample}_final_contigs.fasta" threads: 1 shell: "cp {input} {output}" +rule finalize_contigs: + input: + "{sample}/assembly/{sample}_final_contigs.fasta" + output: + "{sample}/{sample}_contigs.fasta" + threads: + 1 + run: + os.symlink(os.path.relpath(input[0],os.path.dirname(output[0])),output[0]) + + + rule align_reads_to_final_contigs: input: unpack(get_quality_controlled_reads),