From 0206c4f4182422bc055c61aad89040114d8a73b6 Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Wed, 17 Mar 2021 14:01:21 +0100
Subject: [PATCH 1/7] Update example config file and add to tests

---
 .github/workflows/ci.yml |  1 +
 config/example.json      | 40 ++++++++++++----------------------------
 tests/test_sanity.yml    |  6 ++++++
 3 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7116bf2..2350038 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,6 +23,7 @@ jobs:
           - sanity-targets-only
           - sanity-samples-overlapping-name
           - sanity-multisample
+          - sanity-example-config
 
           - dry-run-vanilla
           - dry-run-target-baits
diff --git a/config/example.json b/config/example.json
index 139a68a..0c9e189 100644
--- a/config/example.json
+++ b/config/example.json
@@ -1,31 +1,15 @@
 {
-    "samples": {
-        "sample_01": {
-            "read_groups": {
-                "lib_l1": {
-                    "R1": "1.fq.gz",
-                    "R2": "2.fq.gz"
-                },
-                "lib_l2": {
-                    "R1": "1.1.fq.gz",
-                    "R2": "1.2.fq.gz"
-                }
-            }
-        },
-        "sample_02": {
-            "read_groups": {
-                "lib_l1": {
-                    "R1": "3.1.fq.gz",
-                    "R2": "3.2.fq.gz"
-                }
-            }
+  "samples": {
+    "micro": {
+      "read_groups": {
+        "lib_01": {
+          "R1": "tests/data/fastq/micro_R1.fq.gz",
+          "R2": "tests/data/fastq/micro_R2.fq.gz"
         }
-    },
-    "reference": "/path/to/ref",
-    "dbsnp": "/path/to/vcf1",
-    "known_sites": ["/path/to/vcf1", "/path/to/vcf2"],
-    "scatter_size": 1000000000,
-    "female_threshold": 0.6,
-    "bedfile": "/path/to/bed",
-    "refflat": "/path/to/refflat"
+      }
+    }
+  },
+  "reference":"tests/data/reference/ref.fa",
+  "dbsnp": "tests/data/reference/database.vcf.gz",
+  "known_sites": ["tests/data/reference/database.vcf.gz"]
 }
diff --git a/tests/test_sanity.yml b/tests/test_sanity.yml
index 37e74f7..d67b322 100644
--- a/tests/test_sanity.yml
+++ b/tests/test_sanity.yml
@@ -80,3 +80,9 @@
     - sanity
   command: >
     snakemake --lint --configfile tests/data/config/sample_config.json
+
+- name: sanity-example-config
+  tags:
+    - sanity
+  command: >
+    jsonschema -i config/example.json config/schema.json

From bc318dc8473bd8e615ba427d433569793e1a4347 Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Wed, 17 Mar 2021 14:20:24 +0100
Subject: [PATCH 2/7] Automatically remove intermediate bam and fastq files

---
 CHANGELOG.md                   |  2 ++
 Snakefile                      | 14 ++++++++------
 common.smk                     |  5 +++++
 tests/test_integration_run.yml |  8 ++++++++
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index acd3f87..ab9cdc1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,8 @@ that users understand how the changes affect the new version.
 
 v2.0.1
 ---------------------------
++ Intermediate .bam, .bai and fastq files are automatically removed when no
+longer needed.
 + Switch to using chunked-scatter
 
 v2.0.0
diff --git a/Snakefile b/Snakefile
index 3d1a5c8..16e7ff1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -80,8 +80,8 @@ rule cutadapt:
         r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
         r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2'])
     output:
-        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
-        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz"
+        r1 = temp("{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz"),
+        r2 = temp("{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz")
     log:
         "{sample}/pre_process/{sample}-{read_group}.txt"
     container:
@@ -104,11 +104,11 @@ rule align:
         ref = config["reference"],
         tmp = rules.create_tmp.output
     output:
-        "{sample}/bams/{sample}-{read_group}.sorted.bam"
+        bam = temp("{sample}/bams/{sample}-{read_group}.sorted.bam"),
+        bai = temp("{sample}/bams/{sample}-{read_group}.sorted.bam.bai")
     params:
         compression_level = 1,
         rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
-
     log:
         bwa = "log/{sample}/align.{read_group}.bwa.log",
         samtools = "log/{sample}/align.{read_group}.samtools.log"
@@ -122,13 +122,14 @@ rule align:
         "{input.r1} {input.r2} 2> {log.bwa} | "
         "samtools sort "
         "-l {params.compression_level} "
-        "- -o {output} 2> {log.samtools};"
-        "samtools index {output}"
+        "- -o {output.bam} 2> {log.samtools};"
+        "samtools index {output.bam}"
 
 rule markdup:
     """Mark duplicates in BAM file"""
     input:
         bam = sample_bamfiles,
+        bai = sample_baifiles,
         tmp = rules.create_tmp.output
     output:
         bam = "{sample}/bams/{sample}.bam",
@@ -152,6 +153,7 @@ rule baserecal:
     """Base recalibrated BAM files"""
     input:
         bam = sample_bamfiles,
+        bai = sample_baifiles,
         ref = config["reference"],
         vcfs = config["known_sites"]
     output:
diff --git a/common.smk b/common.smk
index 1dabb22..7b1e9a3 100644
--- a/common.smk
+++ b/common.smk
@@ -99,6 +99,11 @@ def sample_bamfiles(wildcards):
         files.append(f'{sample_name}/bams/{sample_name}-{read_group}.sorted.bam')
     return files
 
+def sample_baifiles(wildcards):
+    """ Determine the bai files for a sample (one for each readgroup)
+    """
+    return [f"{bam}.bai" for bam in sample_bamfiles(wildcards)]
+
 def gather_gvcf(wildcards):
     """ Gather the gvcf files based on the scatterregions checkpoint
 
diff --git a/tests/test_integration_run.yml b/tests/test_integration_run.yml
index b7db54e..2018005 100644
--- a/tests/test_integration_run.yml
+++ b/tests/test_integration_run.yml
@@ -46,6 +46,14 @@
         - WIDTH_OF_99_PERCENT
         - picard_AlignmentSummaryMetrics
         - picard_DuplicationMetrics
+    - path: micro/pre_process/micro-lib_01_R1.fastq.gz
+      should_exist: false
+    - path: micro/pre_process/micro-lib_01_R2.fastq.gz
+      should_exist: false
+    - path: micro/bams/micro-lib_01.sorted.bam
+      should_exist: false
+    - path: micro/bams/micro-lib_01.sorted.bam.bai
+      should_exist: false
 
 - name: integration-small-scatter
   tags:

From 3e7d785226e95acbc1087ad3254a69f6febd9c7c Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Wed, 17 Mar 2021 14:22:36 +0100
Subject: [PATCH 3/7] Update slurm cluster configuration

---
 cluster/slurm_cluster.yml | 30 +++---------------------------
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/cluster/slurm_cluster.yml b/cluster/slurm_cluster.yml
index e808a86..b61559d 100644
--- a/cluster/slurm_cluster.yml
+++ b/cluster/slurm_cluster.yml
@@ -7,36 +7,21 @@ __default__:
 
 align:
   threads: 8
-  vmem: 4G
+  vmem: 8G
   time: 0-2
 
 baserecal:
   threads: 8
-  vmem: 6G
+  vmem: 48G
   time: 0-2
 
 covstats:
-  vmem: 6G
+  vmem: 12G
 
 cutadapt:
   threads: 8
   time: 0-2
 
-fastqc_raw:
-  threads: 4
-  time: 0-1
-
-fastqc_merged:
-  threads: 4
-  time: 0-1
-
-fastqc_postqc:
-  threads: 4
-  time: 0-1
-
-fqcount_postqc:
-  time: 0-1
-
 gvcf_scatter:
   vmem: 20G
   time: 0-1
@@ -58,12 +43,3 @@ markdup:
 multiqc:
   vmem: 30G
   time: 0-1
-
-sickle:
-  time: 0-1
-
-split_vcf:
-  vmem: 20G
-
-vcfstats:
-  time: 0-1

From 84dd18999b38b197762c3efe66099c12b6286b03 Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Tue, 23 Mar 2021 08:34:11 +0100
Subject: [PATCH 4/7] Rewrite multi sample calling to use scatters

Previously, multi sample calling was done on the merged per sample g.vcf
files. However, in rare cases, VCF files that are merged by bcftools can
trigger a crash in GATK.

To resolve this, the multi sample calling now uses the scattered per
sample g.vcf files, so that GATK does not have to act on VCF files that
were modified by bcftools.

As an added advantage, using the scatters directly for the multi sample
calling allows for better parallelisation.
---
 CHANGELOG.md                                  |  3 +
 Snakefile                                     | 60 ++++++++++++-------
 common.smk                                    | 18 ++++++
 .../config/sample_config_multisample.json     |  1 +
 tests/test_dry_run.yml                        |  2 +-
 tests/test_integration_run.yml                |  8 ++-
 6 files changed, 68 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab9cdc1..11d5dc7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,9 @@ that users understand how the changes affect the new version.
 
 v2.0.1
 ---------------------------
++ `multisample_vcf` now acts on the scatters, instead of on the merged g.vcf
+files.
++ The multisample output is located in `multisample/multisample.vcf.gz`.
 + Intermediate .bam, .bai and fastq files are automatically removed when no
 longer needed.
 + Switch to using chunked-scatter
diff --git a/Snakefile b/Snakefile
index 16e7ff1..5679d83 100644
--- a/Snakefile
+++ b/Snakefile
@@ -44,7 +44,7 @@ rule all:
         gvcf_tbi = expand("{s}/vcf/{s}.g.vcf.gz.tbi", s=config["samples"]),
         coverage_stats = coverage_stats,
         coverage_files = coverage_files,
-        multisample_vcf = "multisample.vcf.gz" if config["multisample_vcf"] else []
+        multisample_vcf = "multisample/multisample.vcf.gz" if config["multisample_vcf"] else []
 
 rule create_tmp:
     """
@@ -285,6 +285,44 @@ rule genotype_gather:
         "--output {output.vcf} --output-type z 2> {log} && "
         "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
 
+rule multisample_scatter:
+    """ Generate a true multisample VCF file with all samples """
+    input:
+        gvcfs = expand("{sample}/vcf/{sample}.{{chunk}}.g.vcf.gz", sample=config["samples"]),
+        tbis = expand("{sample}/vcf/{sample}.{{chunk}}.g.vcf.gz.tbi", sample=config["samples"]),
+        ref = config["reference"]
+    params:
+        gvcf_files = lambda wc: expand("-V {sample}/vcf/{sample}.{chunk}.g.vcf.gz", sample=config["samples"], chunk=wc.chunk),
+    output:
+        multisample_vcf = temp("multisample/{chunk}.vcf.gz"),
+        multisample_tbi = temp("multisample/{chunk}.vcf.gz.tbi")
+    log:
+        "log/multisample.{chunk}.log"
+    container:
+        containers["gatk"]
+    threads:
+        8
+    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
+           "GenotypeGVCFs -R {input.ref} "
+           "{params.gvcf_files} -o {output.multisample_vcf} 2> {log}"
+
+rule multisample_gather:
+    """ Gather all multisample VCFs scatters, and join them together """
+    input:
+        vcfs = gather_multisample_vcf,
+        vcfs_tbi = gather_multisample_vcf_tbi
+    output:
+        vcf = "multisample/multisample.vcf.gz",
+        vcf_tbi = "multisample/multisample.vcf.gz.tbi"
+    log:
+        "log/multisample_gather.log"
+    container:
+        containers["bcftools"]
+    shell:
+        "bcftools concat {input.vcfs} --allow-overlaps "
+        "--output {output.vcf} --output-type z 2> {log} && "
+        "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
+
 rule fastqc:
     """Run fastqc on fastq files post pre-processing"""
     input:
@@ -525,23 +563,3 @@ rule gvcf2coverage:
         containers["gvcf2coverage"]
     shell:
         "gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} | cut -f 1,2,3 > {output}"
-
-rule multisample_vcf:
-    """ Generate a true multisample VCF file with all samples """
-    input:
-        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=config["samples"]),
-        tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=config["samples"]),
-        ref = config["reference"]
-    params:
-        gvcf_files = lambda wc: expand("-V {sample}/vcf/{sample}.g.vcf.gz", sample=config["samples"]),
-    output:
-        "multisample.vcf.gz"
-    log:
-        "log/multisample.log"
-    container:
-        containers["gatk"]
-    threads:
-        8
-    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
-           "GenotypeGVCFs -R {input.ref} "
-           "{params.gvcf_files} -o '{output}'"
diff --git a/common.smk b/common.smk
index 7b1e9a3..5474851 100644
--- a/common.smk
+++ b/common.smk
@@ -141,6 +141,24 @@ def gather_vcf_tbi(wildcards):
     return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
        i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
 
+def gather_multisample_vcf(wildcards):
+    """ Gather the multisample vcf files based on the scatterregions checkpoint
+    This depends on the 'scatter_size' parameter and the reference genome
+    used
+    """
+    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
+    return expand("multisample/{i}.vcf.gz",
+       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
+
+def gather_multisample_vcf_tbi(wildcards):
+    """ Gather the multisample vcf index files based on the scatterregions checkpoint
+    This depends on the 'scatter_size' parameter and the reference genome
+    used
+    """
+    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
+    return expand("multisample/{i}.vcf.gz.tbi",
+       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
+
 def sample_cutadapt_files(wildcards):
     """ Determine the cutadapt log files files for a sample (one for each
     readgroup).
diff --git a/tests/data/config/sample_config_multisample.json b/tests/data/config/sample_config_multisample.json
index ccbe7cb..2b82b0e 100644
--- a/tests/data/config/sample_config_multisample.json
+++ b/tests/data/config/sample_config_multisample.json
@@ -22,5 +22,6 @@
   "known_sites": ["tests/data/reference/database.vcf.gz"],
   "targetsfile": "tests/data/reference/full_chrM.bed",
   "baitsfile":  "tests/data/reference/target_baits.bed",
+  "scatter_size": 1000,
   "multisample_vcf": true
 }
diff --git a/tests/test_dry_run.yml b/tests/test_dry_run.yml
index 9b92222..959b363 100644
--- a/tests/test_dry_run.yml
+++ b/tests/test_dry_run.yml
@@ -64,4 +64,4 @@
   stdout:
     contains:
       - Job counts
-      - rule multisample_vcf
+      - rule multisample_gather
diff --git a/tests/test_integration_run.yml b/tests/test_integration_run.yml
index 2018005..d3feb81 100644
--- a/tests/test_integration_run.yml
+++ b/tests/test_integration_run.yml
@@ -323,5 +323,9 @@
     --jobs 1 -w 120 -r -p
     --configfile tests/data/config/sample_config_multisample.json
   files:
-    - path: 'multisample.vcf.gz'
-    - path: 'multisample.vcf.gz.tbi'
+    - path: 'multisample/multisample.vcf.gz'
+    - path: 'multisample/multisample.vcf.gz.tbi'
+    - path: 'multisample/0.vcf.gz'
+      should_exist: false
+    - path: 'multisample/0.vcf.gz.tbi'
+      should_exist: false

From ea939fd4a37ed2781496af8011158cbb3748ce6c Mon Sep 17 00:00:00 2001
From: Redmar <redmar@ubuntu.com>
Date: Mon, 7 Jun 2021 14:07:23 +0200
Subject: [PATCH 5/7] Move gvcf2coverage image to quay.io

Docker hub has started to remove unused images from free accounts, which
means that it might remove images used by this pipeline without notice.
Therefore the pipeline now exclusively uses images from quay.io or
official repositories from docker hub, which do not have this
limitation.
---
 common.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common.smk b/common.smk
index 5474851..afbad5b 100644
--- a/common.smk
+++ b/common.smk
@@ -12,7 +12,7 @@ containers = {
    'debian': 'docker://debian:buster-slim',
    'fastqc': 'docker://quay.io/biocontainers/fastqc:0.11.7--4',
    'gatk': 'docker://broadinstitute/gatk3:3.7-0',
-   'gvcf2coverage': 'docker://lumc/gvcf2coverage:0.1-dirty-2',
+   'gvcf2coverage': 'docker://quay.io/redmar_van_den_berg/gvcf2coverage:0.1-dirty-2',
    'multiqc': 'docker://quay.io/biocontainers/multiqc:1.8--py_2',
    'picard': 'docker://quay.io/biocontainers/picard:2.22.8--0',
    'python3': 'docker://python:3.6-slim',

From 5063dbb2eda951575130a542f7eb7f1673123b6c Mon Sep 17 00:00:00 2001
From: Redmar <redmar@ubuntu.com>
Date: Mon, 7 Jun 2021 14:11:10 +0200
Subject: [PATCH 6/7] Remove slurm status submodule

---
 .gitmodules                  | 3 ---
 cluster/slurm-cluster-status | 1 -
 2 files changed, 4 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 160000 cluster/slurm-cluster-status

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index fd20905..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "cluster/slurm-cluster-status"]
-	path = cluster/slurm-cluster-status
-	url = https://github.com/LUMC/slurm-cluster-status.git
diff --git a/cluster/slurm-cluster-status b/cluster/slurm-cluster-status
deleted file mode 160000
index 4dd6917..0000000
--- a/cluster/slurm-cluster-status
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4dd69175adc5360cadea79feac6386ccef41d923

From 5649dff1d3dc2c7cba7065b060140ec167ff6114 Mon Sep 17 00:00:00 2001
From: Redmar van den Berg <RedmarvandenBerg@lumc.nl>
Date: Wed, 12 Jan 2022 13:09:25 +0100
Subject: [PATCH 7/7] Add github CI testing

* Use tmp on shared filesystem for align rule

* Group up tests to reduce load on test runners

* Only test small-scatter on github

* Check size of created files on failure

* Reduce the number of test scatters

* Add new entry to changelog

* Pin Snakemake version
---
 .github/workflows/ci.yml                      | 37 ++++++-------------
 CHANGELOG.md                                  |  3 ++
 Snakefile                                     |  1 +
 environment.yml                               |  4 +-
 .../config/sample_config_multisample.json     |  2 +-
 tests/data/config/sample_config_scatter.json  |  2 +-
 tests/test_integration_run.yml                |  6 +--
 7 files changed, 21 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2350038..ff458f7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,35 +14,11 @@ jobs:
     strategy:
       matrix:
         test:
-          - sanity-snakemake
-          - sanity-snakemake-lint
-          - sanity-singularity
-          - sanity-no-reference
-          - sanity-reference-does-not-exist
-          - sanity-baits-only
-          - sanity-targets-only
-          - sanity-samples-overlapping-name
-          - sanity-multisample
-          - sanity-example-config
+          - sanity
 
-          - dry-run-vanilla
-          - dry-run-target-baits
-          - dry-run-bed-coverage
-          - dry-run-multisample
+          - dry-run
 
-          - integration-vanilla
           - integration-small-scatter
-          - integration-refflat
-          - integration-all-on-target
-          - integration-gene-bedfile
-          - integration-two-known-sites
-          - integration-two-readgroups
-          - integration-two-samples
-          - integration-target-baits
-          - integration-bed-coverage
-          - integration-restrict-BQSR
-          - integration-targets-only
-          - integration-multisample
 
     steps:
     - uses: actions/checkout@v2
@@ -102,3 +78,12 @@ jobs:
           echo $file; cat $file
         done
         '
+
+    - name: Check size of created files
+      if: ${{ failure() }}
+      run: >-
+        bash -c '
+        for file in $(find /tmp/pytest_workflow_*/${{ matrix.test}}/ -type f); do
+          du -sh $file
+        done
+        '
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 11d5dc7..0dd9a48 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@ This document is user facing. Please word the changes in such a way
 that users understand how the changes affect the new version.
 -->
 
+v2.2.2-dev
+---------------------------
+
 v2.0.1
 ---------------------------
 + `multisample_vcf` now acts on the scatters, instead of on the merged g.vcf
diff --git a/Snakefile b/Snakefile
index 5679d83..8d6f21e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -121,6 +121,7 @@ rule align:
         "bwa mem -t {threads} -R '{params.rg}' {input.ref} "
         "{input.r1} {input.r2} 2> {log.bwa} | "
         "samtools sort "
+        "-T {input.tmp} "
         "-l {params.compression_level} "
         "- -o {output.bam} 2> {log.samtools};"
         "samtools index {output.bam}"
diff --git a/environment.yml b/environment.yml
index b9804d8..25c4881 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,6 +7,4 @@ channels:
   - conda-forge
 dependencies:
   - pytest-workflow>=1.4.0
-  - snakemake-minimal
-  - boto3
-  - smart_open
+  - snakemake-minimal=5.31.1
diff --git a/tests/data/config/sample_config_multisample.json b/tests/data/config/sample_config_multisample.json
index 2b82b0e..23a8df1 100644
--- a/tests/data/config/sample_config_multisample.json
+++ b/tests/data/config/sample_config_multisample.json
@@ -22,6 +22,6 @@
   "known_sites": ["tests/data/reference/database.vcf.gz"],
   "targetsfile": "tests/data/reference/full_chrM.bed",
   "baitsfile":  "tests/data/reference/target_baits.bed",
-  "scatter_size": 1000,
+  "scatter_size": 8000,
   "multisample_vcf": true
 }
diff --git a/tests/data/config/sample_config_scatter.json b/tests/data/config/sample_config_scatter.json
index 57a9cd9..f914de5 100644
--- a/tests/data/config/sample_config_scatter.json
+++ b/tests/data/config/sample_config_scatter.json
@@ -12,5 +12,5 @@
   "reference":"tests/data/reference/ref.fa",
   "dbsnp": "tests/data/reference/database.vcf.gz",
   "known_sites": ["tests/data/reference/database.vcf.gz"],
-  "scatter_size": 1000
+  "scatter_size": 8000
 }
diff --git a/tests/test_integration_run.yml b/tests/test_integration_run.yml
index d3feb81..d2a8db9 100644
--- a/tests/test_integration_run.yml
+++ b/tests/test_integration_run.yml
@@ -71,7 +71,7 @@
       - rror
   files:
     - path: scatter/scatter-0.bed
-    - path: scatter/scatter-15.bed
+    - path: scatter/scatter-1.bed
     - path: micro/vcf/micro.vcf.gz.tbi
     - path: micro/vcf/micro.vcf.gz
       contains_regex:
@@ -79,14 +79,14 @@
         - 'chrM\t263\t.\tA\tG\t323.*GT:AD:DP:GQ:PL\t1/1:0,108:108:99:3267,323,0'
         - 'chrM\t4745\t.\tA\tG\t56.*GT:AD:DP:GQ:PGT:PID:PL\t1/1:1,134:135:99:1|1:4745_A_G:5718,407,0'
         - 'chrM\t4769\t.\tA\tG\t5182.*GT:AD:DP:GQ:PGT:PID:PL\t1/1:1,120:121:99:1|1:4745_A_G:5211,363,0'
-        - 'chrM\t16023\t.\tG\tA\t.*GT:AD:DP:GQ:PL\t0/1:75,74:'
+        - 'chrM\t16023\t.\tG\tA\t.*GT:AD:DP:GQ:PL\t0/1:74,74:'
     - path: micro/vcf/micro.g.vcf.gz
       contains:
         - "chrM\t1\t.\tG\t<NON_REF>\t.\t.\tEND=151\tGT:DP:GQ:MIN_DP:PL\t0/0:164:99:137:0,120,1800"
         - "chrM\t16560\t.\tC\t<NON_REF>\t.\t.\tEND=16569\tGT:DP:GQ:MIN_DP:PL\t0/0:195:0:187:0,0,0"
       contains_regex:
         - 'chrM\t152\t.\tT\tC,<NON_REF>\t3960.*GT:AD:DP:GQ:PL:SB\t1/1:0,130,0:130:99:3989,388,0,3989,388,3989:0,0,47,83'
-        - 'chrM\t16023\t.\tG\tA,<NON_REF>\t.*GT:AD:DP:GQ:PL:SB\t0/1:75,74,0:'
+        - 'chrM\t16023\t.\tG\tA,<NON_REF>\t.*GT:AD:DP:GQ:PL:SB\t0/1:74,74,0:'
     - path: micro/vcf/micro.g.vcf.gz.tbi
     - path: micro/vcf/micro.0.vcf.gz
       should_exist: false