Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #12

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 11 additions & 25 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,34 +14,11 @@ jobs:
strategy:
matrix:
test:
- sanity-snakemake
- sanity-snakemake-lint
- sanity-singularity
- sanity-no-reference
- sanity-reference-does-not-exist
- sanity-baits-only
- sanity-targets-only
- sanity-samples-overlapping-name
- sanity-multisample
- sanity

- dry-run-vanilla
- dry-run-target-baits
- dry-run-bed-coverage
- dry-run-multisample
- dry-run

- integration-vanilla
- integration-small-scatter
- integration-refflat
- integration-all-on-target
- integration-gene-bedfile
- integration-two-known-sites
- integration-two-readgroups
- integration-two-samples
- integration-target-baits
- integration-bed-coverage
- integration-restrict-BQSR
- integration-targets-only
- integration-multisample

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -101,3 +78,12 @@ jobs:
echo $file; cat $file
done
'

- name: Check size of created files
if: ${{ failure() }}
run: >-
bash -c '
for file in $(find /tmp/pytest_workflow_*/${{ matrix.test}}/ -type f); do
du -sh $file
done
'
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,16 @@ This document is user facing. Please word the changes in such a way
that users understand how the changes affect the new version.
-->

v2.2.2-dev
---------------------------

v2.0.1
---------------------------
+ `multisample_vcf` now acts on the scatters, instead of on the merged g.vcf
files.
+ The multisample output is located in `multisample/multisample.vcf.gz`.
+ Intermediate .bam, .bai and fastq files are automatically removed when no
longer needed.
+ Switch to using chunked-scatter

v2.0.0
Expand Down
75 changes: 48 additions & 27 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ rule all:
gvcf_tbi = expand("{s}/vcf/{s}.g.vcf.gz.tbi", s=config["samples"]),
coverage_stats = coverage_stats,
coverage_files = coverage_files,
multisample_vcf = "multisample.vcf.gz" if config["multisample_vcf"] else []
multisample_vcf = "multisample/multisample.vcf.gz" if config["multisample_vcf"] else []

rule create_tmp:
"""
Expand Down Expand Up @@ -80,8 +80,8 @@ rule cutadapt:
r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2'])
output:
r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz"
r1 = temp("{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz"),
r2 = temp("{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz")
log:
"{sample}/pre_process/{sample}-{read_group}.txt"
container:
Expand All @@ -104,11 +104,11 @@ rule align:
ref = config["reference"],
tmp = rules.create_tmp.output
output:
"{sample}/bams/{sample}-{read_group}.sorted.bam"
bam = temp("{sample}/bams/{sample}-{read_group}.sorted.bam"),
bai = temp("{sample}/bams/{sample}-{read_group}.sorted.bam.bai")
params:
compression_level = 1,
rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"

log:
bwa = "log/{sample}/align.{read_group}.bwa.log",
samtools = "log/{sample}/align.{read_group}.samtools.log"
Expand All @@ -121,14 +121,16 @@ rule align:
"bwa mem -t {threads} -R '{params.rg}' {input.ref} "
"{input.r1} {input.r2} 2> {log.bwa} | "
"samtools sort "
"-T {input.tmp} "
"-l {params.compression_level} "
"- -o {output} 2> {log.samtools};"
"samtools index {output}"
"- -o {output.bam} 2> {log.samtools};"
"samtools index {output.bam}"

rule markdup:
"""Mark duplicates in BAM file"""
input:
bam = sample_bamfiles,
bai = sample_baifiles,
tmp = rules.create_tmp.output
output:
bam = "{sample}/bams/{sample}.bam",
Expand All @@ -152,6 +154,7 @@ rule baserecal:
"""Base recalibrated BAM files"""
input:
bam = sample_bamfiles,
bai = sample_baifiles,
ref = config["reference"],
vcfs = config["known_sites"]
output:
Expand Down Expand Up @@ -283,6 +286,44 @@ rule genotype_gather:
"--output {output.vcf} --output-type z 2> {log} && "
"bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"

rule multisample_scatter:
""" Generate a true multisample VCF file with all samples """
input:
gvcfs = expand("{sample}/vcf/{sample}.{{chunk}}.g.vcf.gz", sample=config["samples"]),
tbis = expand("{sample}/vcf/{sample}.{{chunk}}.g.vcf.gz.tbi", sample=config["samples"]),
ref = config["reference"]
params:
gvcf_files = lambda wc: expand("-V {sample}/vcf/{sample}.{chunk}.g.vcf.gz", sample=config["samples"], chunk=wc.chunk),
output:
multisample_vcf = temp("multisample/{chunk}.vcf.gz"),
multisample_tbi = temp("multisample/{chunk}.vcf.gz.tbi")
log:
"log/multisample.{chunk}.log"
container:
containers["gatk"]
threads:
8
shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
"GenotypeGVCFs -R {input.ref} "
"{params.gvcf_files} -o {output.multisample_vcf} 2> {log}"

rule multisample_gather:
""" Gather all multisample VCFs scatters, and join them together """
input:
vcfs = gather_multisample_vcf,
vcfs_tbi = gather_multisample_vcf_tbi
output:
vcf = "multisample/multisample.vcf.gz",
vcf_tbi = "multisample/multisample.vcf.gz.tbi"
log:
"log/multisample_gather.log"
container:
containers["bcftools"]
shell:
"bcftools concat {input.vcfs} --allow-overlaps "
"--output {output.vcf} --output-type z 2> {log} && "
"bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"

rule fastqc:
"""Run fastqc on fastq files post pre-processing"""
input:
Expand Down Expand Up @@ -523,23 +564,3 @@ rule gvcf2coverage:
containers["gvcf2coverage"]
shell:
"gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} | cut -f 1,2,3 > {output}"

rule multisample_vcf:
""" Generate a true multisample VCF file with all samples """
input:
gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=config["samples"]),
tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=config["samples"]),
ref = config["reference"]
params:
gvcf_files = lambda wc: expand("-V {sample}/vcf/{sample}.g.vcf.gz", sample=config["samples"]),
output:
"multisample.vcf.gz"
log:
"log/multisample.log"
container:
containers["gatk"]
threads:
8
shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
"GenotypeGVCFs -R {input.ref} "
"{params.gvcf_files} -o '{output}'"
1 change: 0 additions & 1 deletion cluster/slurm-cluster-status
Submodule slurm-cluster-status deleted from 4dd691
30 changes: 3 additions & 27 deletions cluster/slurm_cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,36 +7,21 @@ __default__:

align:
threads: 8
vmem: 4G
vmem: 8G
time: 0-2

baserecal:
threads: 8
vmem: 6G
vmem: 48G
time: 0-2

covstats:
vmem: 6G
vmem: 12G

cutadapt:
threads: 8
time: 0-2

fastqc_raw:
threads: 4
time: 0-1

fastqc_merged:
threads: 4
time: 0-1

fastqc_postqc:
threads: 4
time: 0-1

fqcount_postqc:
time: 0-1

gvcf_scatter:
vmem: 20G
time: 0-1
Expand All @@ -58,12 +43,3 @@ markdup:
multiqc:
vmem: 30G
time: 0-1

sickle:
time: 0-1

split_vcf:
vmem: 20G

vcfstats:
time: 0-1
25 changes: 24 additions & 1 deletion common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ containers = {
'debian': 'docker://debian:buster-slim',
'fastqc': 'docker://quay.io/biocontainers/fastqc:0.11.7--4',
'gatk': 'docker://broadinstitute/gatk3:3.7-0',
'gvcf2coverage': 'docker://lumc/gvcf2coverage:0.1-dirty-2',
'gvcf2coverage': 'docker://redmar_van_den_berg/gvcf2coverage:0.1-dirty-2',
'multiqc': 'docker://quay.io/biocontainers/multiqc:1.8--py_2',
'picard': 'docker://quay.io/biocontainers/picard:2.22.8--0',
'python3': 'docker://python:3.6-slim',
Expand Down Expand Up @@ -99,6 +99,11 @@ def sample_bamfiles(wildcards):
files.append(f'{sample_name}/bams/{sample_name}-{read_group}.sorted.bam')
return files

def sample_baifiles(wildcards):
""" Determine the bai files for a sample (one for each readgroup)
"""
return [f"{bam}.bai" for bam in sample_bamfiles(wildcards)]

def gather_gvcf(wildcards):
""" Gather the gvcf files based on the scatterregions checkpoint

Expand Down Expand Up @@ -136,6 +141,24 @@ def gather_vcf_tbi(wildcards):
return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def gather_multisample_vcf(wildcards):
""" Gather the multisample vcf files based on the scatterregions checkpoint
This is depends on the 'scatter_size' parameter and the reference genome
used
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
return expand("multisample/{i}.vcf.gz",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def gather_multisample_vcf_tbi(wildcards):
""" Gather the multisample vcf index files based on the scatterregions checkpoint
This is depends on the 'scatter_size' parameter and the reference genome
used
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
return expand("multisample/{i}.vcf.gz.tbi",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def sample_cutadapt_files(wildcards):
""" Determine the cutadapt log files files for a sample (one for each
readgroup).
Expand Down
40 changes: 12 additions & 28 deletions config/example.json
Original file line number Diff line number Diff line change
@@ -1,31 +1,15 @@
{
"samples": {
"sample_01": {
"read_groups": {
"lib_l1": {
"R1": "1.fq.gz",
"R2": "2.fq.gz"
},
"lib_l2": {
"R1": "1.1.fq.gz",
"R2": "1.2.fq.gz"
}
}
},
"sample_02": {
"read_groups": {
"lib_l1": {
"R1": "3.1.fq.gz",
"R2": "3.2.fq.gz"
}
}
"samples": {
"micro": {
"read_groups": {
"lib_01": {
"R1": "tests/data/fastq/micro_R1.fq.gz",
"R2": "tests/data/fastq/micro_R2.fq.gz"
}
},
"reference": "/path/to/ref",
"dbsnp": "/path/to/vcf1",
"known_sites": ["/path/to/vcf1", "/path/to/vcf2"],
"scatter_size": 1000000000,
"female_threshold": 0.6,
"bedfile": "/path/to/bed",
"refflat": "/path/to/refflat"
}
}
},
"reference":"tests/data/reference/ref.fa",
"dbsnp": "tests/data/reference/database.vcf.gz",
"known_sites": ["tests/data/reference/database.vcf.gz"]
}
4 changes: 1 addition & 3 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,4 @@ channels:
- conda-forge
dependencies:
- pytest-workflow>=1.4.0
- snakemake-minimal
- boto3
- smart_open
- snakemake-minimal=5.31.1
1 change: 1 addition & 0 deletions tests/data/config/sample_config_multisample.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@
"known_sites": ["tests/data/reference/database.vcf.gz"],
"targetsfile": "tests/data/reference/full_chrM.bed",
"baitsfile": "tests/data/reference/target_baits.bed",
"scatter_size": 8000,
"multisample_vcf": true
}
2 changes: 1 addition & 1 deletion tests/data/config/sample_config_scatter.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@
"reference":"tests/data/reference/ref.fa",
"dbsnp": "tests/data/reference/database.vcf.gz",
"known_sites": ["tests/data/reference/database.vcf.gz"],
"scatter_size": 1000
"scatter_size": 8000
}
2 changes: 1 addition & 1 deletion tests/test_dry_run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,4 @@
stdout:
contains:
- Job counts
- rule multisample_vcf
- rule multisample_gather
Loading