diff --git a/Snakefile b/Snakefile index e15be4e..832afa5 100644 --- a/Snakefile +++ b/Snakefile @@ -11,6 +11,7 @@ samples = list(list_of_samples.Samples.unique()) include: "rules/download_samples_or_copy.smk" include: "rules/kraken2_mapping.smk" include: "rules/cellranger.smk" +include: "rules/kraken_processing.smk" include: "rules/extract_bam.smk" include: "rules/extract_tags.smk" include: "rules/report.smk" @@ -21,10 +22,12 @@ rule all: expand("data/{sample}/{sample}_S1_L001_R2_001.fastq.gz", sample=samples), expand("results/kraken2/{sample}.kraken",sample=samples), expand("results/kraken2/{sample}.report.txt",sample=samples), - directory(expand("results/cellranger/{sample}/{sample}/",sample=samples)), + expand("results/cellranger/{sample}/{sample}/",sample=samples), + expand("results/kraken_reads/{sample}_kraken_reads.sam", sample=samples), expand("results/cellranger/{sample}/possorted_genome_bam.bam", sample=samples), expand("results/cellranger/{sample}/unmapped_reads.sam", sample=samples), expand("results/count_matrix/{sample}/count_matrix.tsv", sample=samples), + expand("results/count_matrix/{sample}/kraken_reads_count_matrix.tsv",sample=samples), "results/kraken_plots/Familywise_tax_readcounts.tsv", "results/kraken_plots/Specieswise_tax_readcounts.tsv", "results/kraken_plots/Clustermap_Familywise_log10.png", @@ -34,4 +37,4 @@ onsuccess: print("sc-VirusScan Pipeline finished successfully!") onerror: - print("sc-VirusScan Pipeline has failed!") \ No newline at end of file + print("sc-VirusScan Pipeline has failed!") diff --git a/env.yaml b/env.yaml index 1c13e3d..76d6d62 100644 --- a/env.yaml +++ b/env.yaml @@ -5,11 +5,11 @@ channels: - defaults dependencies: - kraken2 - - samtools + - samtools==1.12 - entrez-direct - parallel-fastq-dump - pandas - snakemake - seaborn - pip: - - synapseclient \ No newline at end of file + - synapseclient diff --git a/rules/download_samples_or_copy.smk b/rules/download_samples_or_copy.smk index 2e1cf04..d6c6292 100644 --- a/rules/download_samples_or_copy.smk +++ b/rules/download_samples_or_copy.smk @@ -30,10 +30,9 @@ rule download_samples_or_copy: mv data/{wildcards.sample}/*_R2_* data/{wildcards.sample}/{wildcards.sample}_S1_L001_R2_001.fastq.gz else - mkdir tmp/ + mkdir -p tmp/ parallel-fastq-dump --sra-id {wildcards.sample} --split-files --threads {threads} --outdir {params.outdir} --gzip --tmpdir tmp/ mv data/{wildcards.sample}/{wildcards.sample}_2.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R1_001.fastq.gz mv data/{wildcards.sample}/{wildcards.sample}_3.fastq.gz data/{wildcards.sample}/{wildcards.sample}_S1_L001_R2_001.fastq.gz - rm -rf tmp/ fi """ diff --git a/rules/extract_tags.smk b/rules/extract_tags.smk index a9321bc..f438a9c 100644 --- a/rules/extract_tags.smk +++ b/rules/extract_tags.smk @@ -1,8 +1,10 @@ rule extract_tags: input: - "results/cellranger/{sample}/unmapped_reads.sam" + i1 = "results/cellranger/{sample}/unmapped_reads.sam", + i2 = "results/kraken_reads/{sample}_kraken_reads.sam" output: - "results/count_matrix/{sample}/count_matrix.tsv" + o1 = "results/count_matrix/{sample}/count_matrix.tsv", + o2 = "results/count_matrix/{sample}/kraken_reads_count_matrix.tsv", threads: 16 resources: mem_mb = 40000 @@ -13,4 +15,7 @@ rule extract_tags: log: "results/logs/count_matrix/{sample}_bam_extract.log" shell: - "python3 {config[scripts_dir]}/bam_extract.py -i {input} -k {params.p3} -b {params.p2} -o {params.p1}" \ No newline at end of file + """ + python3 {config[scripts_dir]}/bam_extract.py -i {input.i1} -k {params.p3} -b {params.p2} -o {output.o1} + python3 {config[scripts_dir]}/bam_extract.py -i {input.i2} -k {params.p3} -b {params.p2} -o {output.o2} + """ \ No newline at end of file diff --git a/rules/kraken_processing.smk b/rules/kraken_processing.smk new file mode 100644 index 0000000..802dde6 --- /dev/null +++ b/rules/kraken_processing.smk @@ -0,0 +1,23 @@ +rule kraken2_processing: + input: + i1 = "results/cellranger/{sample}/possorted_genome_bam.bam", + i2 = "results/kraken2/{sample}.kraken" + output: + o1 = "results/kraken_reads/{sample}_kraken_reads.sam", + priority: 90 + resources: + mem_mb = 20000 + threads: 16 + params: + p1 = "results/kraken_reads/{sample}_ReadIds.txt", + p2 = "results/kraken_reads/{sample}_krakenReads.bam" + log: + "results/logs/kraken2/{sample}.kraken.log" + shell: + """ + mkdir -p results/kraken_reads + awk -F'\t' '$3 != "unclassified (taxid 0)" && $3 != "Homo sapiens (taxid 9606)" {{print $2}}' {input.i2} > {params.p1} + samtools view -N {params.p1} -o {params.p2} {input.i1} + samtools index {params.p2} + samtools view -h {params.p2} > {output.o1} + """ \ No newline at end of file diff --git a/rules/report.smk b/rules/report.smk index 5ce3644..4b0f3ee 100644 --- a/rules/report.smk +++ b/rules/report.smk @@ -8,7 +8,7 @@ rule kraken_reports: o4 = "results/kraken_plots/Clustermap_Specieswise_log10.png" priority: 10 resources: - mem_mb = 1000 + mem_mb = 40000 params: p1 = "results/kraken_plots/", p2 = "results/kraken2/" @@ -16,5 +16,5 @@ rule kraken_reports: "results/logs/plots/kraken_plots.log" shell: """ - python3 scripts/kraken_plot.py -i {params.p2} -o {params.p1} + python3 {config[scripts_dir]}/kraken_plot.py -i {params.p2} -o {params.p1} """ diff --git a/scripts/bam_extract.py b/scripts/bam_extract.py index f3b8dde..2193d44 100644 --- a/scripts/bam_extract.py +++ b/scripts/bam_extract.py @@ -13,6 +13,7 @@ help="Filtered Barcodes TSV file from CellRanger") parser.add_argument("-o", "--output-file", metavar="PATH", + default="count_matrix.tsv", help="Path to your output file") args = parser.parse_args() @@ -69,4 +70,4 @@ result = merged_barcodes.pivot_table(index='Tax_ID', columns='CR', values='Read Name', aggfunc='count').fillna(0.).astype(int) -result.to_csv(args.output_file + "count_matrix.tsv", sep='\t') +result.to_csv(args.output_file, sep='\t')