Normalization as optional and publishing improvement (nf-core#131)

* New dev * Rename function * Fix function * Update workflow name * Update tests * Update function usage * Update tests snapshot * Update changelog * Change phased to phase * Update documentation * Update snapshot * Update publishing panel * Update test * Update docs/usage.md Co-authored-by: Anabella Trigila <18577080+atrigila@users.noreply.github.com> --------- Co-authored-by: Anabella Trigila <18577080+atrigila@users.noreply.github.com>
LouisLeNezet · Oct 15, 2024 · 0a25c1c · 0a25c1c
1 parent 87f537c
commit 0a25c1c
Show file tree

Hide file tree

Showing 26 changed files with 240 additions and 223 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -29,7 +29,7 @@ concurrency:
 
 jobs:
   test:
-    name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})"
+    name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.TEST_PROFILE }} | ${{ matrix.profile }})"
     # Only run on push if this is the nf-core dev branch (merged PRs)
     if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/phaseimpute') }}"
     runs-on: ubuntu-latest
@@ -103,6 +103,6 @@ jobs:
           wget -qO- https://code.askimed.com/install/nf-test | bash -s $NFTEST_VER
           sudo mv nf-test /usr/local/bin/
 
-      - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}"
+      - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.TEST_PROFILE }} | ${{ matrix.profile }}"
         run: |
           nf-test test --tag "${{ matrix.TEST_PROFILE }}" --profile ${{ matrix.profile }} --verbose
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co
 - [#119](https://github.com/nf-core/phaseimpute/pull/119) - Add dog test with panelprep and imputation.
 - [#118](https://github.com/nf-core/phaseimpute/pull/118) - Explain how to customize arguments in the pipeline.
 - [#111](https://github.com/nf-core/phaseimpute/pull/111) - Add nf-test for all sbwf, wf, modules and functions.
+- [#131](https://github.com/nf-core/phaseimpute/pull/131) - Set normalisation as optional. Fix extension detection function. Add support for validation with vcf files. Concatenate vcf only if more than one file. Change `--phased` to `--phase` for consistency.
 
 ### `Changed`
 

diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -9,7 +9,7 @@
         "properties": {
             "sample": {
                 "type": "string",
-                "pattern": "^[a-zA-Z0-9_]+$",
+                "pattern": "^[a-zA-Z0-9_-]+$",
                 "errorMessage": "Sample name must be provided and cannot contain spaces nor special character '.' .",
                 "meta": ["id"]
             },

diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config
@@ -52,40 +52,38 @@ process {
             params.remove_samples ? "-s^${params.remove_samples}" : '',
             "--output-type z", "--write-index=tbi"
         ].join(' ')
-        ext.prefix = { "${meta.id}_${meta.chr}_biallelic_snps" }
+        ext.prefix = { "${meta.id}_${meta.chr}_normalized" }
         publishDir = [
-            path: { "${params.outdir}/prep_panel/normalized" },
-            saveAs: { filename -> params.compute_freq ? null : filename.equals('versions.yml') ? null : filename },
-            enabled: true
+            path: { "${params.outdir}/prep_panel/panel" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: { !params.compute_freq && !params.phase }
         ]
     }
 
     withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:VCFLIB_VCFFIXUP' {
-        ext.prefix   = { "${meta.id}_${meta.chr}" }
+        ext.prefix   = { "${meta.id}_${meta.chr}_fixup" }
         publishDir = [
-            path: { "${params.outdir}/prep_panel/normalized" },
+            path: { "${params.outdir}/prep_panel/panel" },
+            mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: true
+            enabled: { !params.phase }
         ]
     }
 
     withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX' {
         ext.args     = "--tbi"
         publishDir = [
-            path: { "${params.outdir}/prep_panel/normalized" },
+            path: { "${params.outdir}/prep_panel/panel" },
+            mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: true
+            enabled: { !params.phase }
         ]
     }
 
     // Subworkflow: VCF_PHASE_SHAPEIT5
     withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:.*' {
-        publishDir = [
-            path: { "${params.outdir}/prep_panel/phasing" },
-            mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
-            enabled: false
-        ]
+        publishDir = [ enabled: false ]
         tag = {"${meta.id} ${meta.chr}"}
     }
 
@@ -97,6 +95,27 @@ process {
         ext.prefix = { "${meta.id}_${meta.chunk.replace(':',"_")}_chunks" }
     }
 
+    withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:SHAPEIT5_LIGATE' {
+        ext.prefix = { "${meta.id}_${meta.chr}_phased" }
+        publishDir = [
+            path: { "${params.outdir}/prep_panel/panel" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: true
+        ]
+    }
+
+    withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:VCF_BCFTOOLS_INDEX_2' {
+        ext.prefix = { "${meta.id}_${meta.chr}_phased" }
+        ext.args   = "--csi"
+        publishDir = [
+            path: { "${params.outdir}/prep_panel/panel" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: true
+        ]
+    }
+
     // Subworkflow: VCF_SITES_EXTRACT_BCFTOOLS
     withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:.*' {
         publishDir = [ enabled: false ]
@@ -125,13 +144,24 @@ process {
         ].join(' ')
         ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites" }
         publishDir = [
-            path: { "${params.outdir}/prep_panel/sites/vcf/" },
+            path: { "${params.outdir}/prep_panel/sites/" },
             mode: params.publish_dir_mode,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
             enabled: true
         ]
     }
 
+    withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_INDEX' {
+        ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites" }
+        publishDir = [
+            path: { "${params.outdir}/prep_panel/sites/" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: true
+        ]
+    }
+
+
     // Subworkflow: Concat phased panel
     withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:.*' {
         publishDir = [ enabled: false ]

diff --git a/conf/test.config b/conf/test.config
@@ -30,7 +30,6 @@ params {
     fasta     = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
     fasta_fai = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz.fai"
     panel     = "${projectDir}/tests/csv/panel.csv"
-    phased    = true
 
     // Pipeline steps
     steps  = "impute"

diff --git a/conf/test_all.config b/conf/test_all.config
@@ -30,7 +30,8 @@ params {
     // Genome references
     fasta        = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
     panel        = "${projectDir}/tests/csv/panel.csv"
-    phased       = false
+    phase        = true
+    normalize    = true
     compute_freq = false
 
     // Pipeline steps

diff --git a/conf/test_all_fullchr.config b/conf/test_all_fullchr.config
@@ -5,7 +5,7 @@
     Defines input files and everything required to run a fast and simple pipeline test.
 
     Use as follows:
-        nextflow run nf-core/phaseimpute -profile test_panelprep,<docker/singularity> --outdir <OUTDIR>
+        nextflow run nf-core/phaseimpute -profile test_all_fullchr,<docker/singularity> --outdir <OUTDIR>
 
 ----------------------------------------------------------------------------------------
 */
@@ -25,11 +25,16 @@ params {
     // Genome references
     fasta  = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
     panel  = "${projectDir}/tests/csv/panel_fullchr.csv"
-    phased = false
     input  = "${projectDir}/tests/csv/sample_sim_full.csv"
+
     // Pipeline steps
     steps   = "all"
     tools   = "glimpse1,glimpse2,quilt,stitch"
     depth   = 1
+
+    // Panelprep optional args
     remove_samples = "NA12878,NA19401,NA20359,NA12891,NA12892,NA20362"
+    normalize      = true
+    compute_freq   = false
+    phase          = false
 }
diff --git a/conf/test_dog.config b/conf/test_dog.config
@@ -26,14 +26,19 @@ params {
     fasta        = params.pipelines_testdata_base_path + "dog_data/reference_genome/canFam3.s.fa.gz"
     fasta_fai    = params.pipelines_testdata_base_path + "dog_data/reference_genome/canFam3.s.fa.gz.fai"
     panel        = "${projectDir}/tests/csv/panel_dog.csv"
-    phased       = false
+
+    // Panelprep optional args
+    phase        = true
+    normalize    = false
+    compute_freq = false
     rename_chr   = true
+
+    // Input data
     input        = params.pipelines_testdata_base_path + "dog_data/csv/sample_dog.csv"
 
     // Pipeline steps
     steps   = "panelprep,impute"
     tools   = "glimpse1,glimpse2,quilt"
-
 }
 
 process {

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -36,6 +36,9 @@ params {
 
     // Panelprep optional args
     remove_samples = "NA12878,NA12891,NA12892"
+    normalize      = true
+    compute_freq   = true
+    phase          = true
 
     // Impute tools
     tools = "glimpse1"

diff --git a/conf/test_glimpse2.config b/conf/test_glimpse2.config
@@ -28,7 +28,6 @@ params {
     // Genome references
     fasta  = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
     panel  = "${projectDir}/tests/csv/panel.csv"
-    phased = true
 
     // Pipeline steps
     steps  = "impute"

diff --git a/conf/test_panelprep.config b/conf/test_panelprep.config
@@ -26,7 +26,10 @@ params {
     fasta          = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
     input_region   = "${projectDir}/tests/csv/region.csv"
     panel          = "${projectDir}/tests/csv/panel.csv"
-    phased         = true
+
+    // Panelprep optional args
+    phase          = true
+    normalize      = true
     compute_freq   = true
     remove_samples = "HG00096,HG00097,HG00099,HG00100"
 

diff --git a/conf/test_quilt.config b/conf/test_quilt.config
@@ -28,7 +28,6 @@ params {
 
     // Genome references
     fasta   = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
-    phased  = true
 
     // Pipeline steps
     steps   = "impute"

diff --git a/docs/output.md b/docs/output.md
@@ -24,15 +24,13 @@ This steps of the pipeline performs a QC of the reference panel data and produce
 The directory structure from `--steps panelprep` is:
 
 ```
+├── panel
+├── haplegend
+├── sites
 ├── chunks
 │   ├── glimpse1
 │   └── glimpse2
 ├── csv
-├── panel
-├── haplegend
-└── sites
-    ├── tsv
-    └── vcf
 ```
 
 ### Panel directory
@@ -41,12 +39,12 @@ The directory structure from `--steps panelprep` is:
 <summary>Output files</summary>
 
 - `prep_panel/panel/`
-  - `*.vcf.gz`: A vcf for the prepared reference panel.
+  - `*.vcf.gz`: The reference panel vcf after all the preprocessing is done.
   - `*.tbi*`: A tbi for the prepared reference panel.
 
 </details>
 
-A directory containing the final phased and prepared panel per chromosome.
+A directory containing the reference panel per chromosome after preprocessing. The files will be normalized if the flag `--normalize` is used (with `_normalized` suffix). The files will have their allele frequency computed if the flag `--compute_freq` is used (with `_fixup` suffix). The files will be phased if the flag `--phase` is used (with `_phased` suffix).
 
 ### Haplegend directory
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -32,11 +32,11 @@ SAMPLE5,AEG588A5.bam,AEG588A5.bai
 SAMPLE6,AEG588A6.bam,AEG588A6.bai
 ```
 
-| Column   | Description                                                                                                               |
-| -------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. Spaces in sample names are automatically converted to underscores (`_`).                              |
-| `file`   | Full path to a BAM or CRAM file. File has to be have the extension ".bam" or ".cram" and all files need to have the same. |
-| `index`  | Full path to a BAI or CRAI file. File has to be have the extension ".bai" or ".crai" and all files need to have the same. |
+| Column   | Description                                                                                                                                                                                    |
+| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample` | Custom sample name. Spaces in sample names are automatically converted to underscores (`_`).                                                                                                   |
+| `file`   | Full path to an alignment or variant file. File has to have the extension ".bam", ".cram" or ".vcf", ".bcf" optionally compressed with bgzip ".gz". All files need to have the same extension. |
+| `index`  | Full path to index file. File has to have the extension ".bai", ".crai", ".csi", or ".tbi". All files need to have the same extension.                                                         |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
@@ -228,9 +228,10 @@ The required flags for this mode are:
 
 - `--steps panelprep`: The steps to run.
 - `--panel reference.csv`: The samplesheet containing the reference panel files in `vcf.gz` format.
-- `--phased`: (optional) Whether the reference panel is phased (true|false).
+- `--phase`: (optional) Whether the reference panel should be phased (true|false).
+- `--normalize`: (optional) Whether the reference panel needs to be normalized or not (true|false). Default is true.
+- `--remove_samples`: (optional) A comma-separated list of samples to remove from the reference during the normalization process.
 - `--compute_freq`: (optional) Whether the frequency (AC/AN field) for each variants needs to be computed or not (true/false). This can be the case if the frequency is absent from the reference panel or if individuals have been removed.
-- `--remove_samples`: (optional) A comma-separated list of samples to remove from the reference.
 
 You can find an overview of the results produced by this steps in the [Output](output.md).
 
@@ -359,7 +360,7 @@ bcftools convert --haplegendsample ${vcf}
 
 #### GLIMPSE1
 
-[GLIMPSE1](https://github.com/odelaneau/GLIMPSE/tree/glimpse1) is a set of tools for phasing and imputation for low-coverage sequencing datasets. Recommended for many samples at >0.5x coverage and small reference panels. This is an example command to run this tool from the `--steps impute`:
+[GLIMPSE1](https://github.com/odelaneau/GLIMPSE/tree/glimpse1) is a set of tools for phasing and imputation for low-coverage sequencing datasets. Recommended for many samples at >0.5x coverage and small reference panels. GLIMPSE1 works with alignment (i.e. BAM or CRAM) as well as variant (i.e. VCF or BCF) files as input. This is an example command to run this tool from the `--steps impute`:
 
 ```bash
 nextflow run nf-core/phaseimpute \

diff --git a/nextflow.config b/nextflow.config
@@ -20,7 +20,8 @@ params {
 
     // Panel preparation
     panel                       = null
-    phased                      = null
+    phase                       = false
+    normalize                   = true
     compute_freq                = false
     rename_chr                  = false
     remove_samples              = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -108,8 +108,13 @@
                     "pattern": "^\\S+\\.(csv|tsv|txt)$",
                     "mimetype": "text/csv"
                 },
-                "phased": {
-                    "description": "Is the reference panel phased",
+                "phase": {
+                    "description": "Should the reference panel be phased",
+                    "type": "boolean",
+                    "pattern": "true|false"
+                },
+                "normalize": {
+                    "description": "Should the reference panel be normalized",
                     "type": "boolean",
                     "pattern": "true|false"
                 },

diff --git a/subworkflows/local/bam_gl_bcftools/main.nf b/subworkflows/local/bam_gl_bcftools/main.nf
@@ -50,7 +50,7 @@ workflow BAM_GL_BCFTOOLS {
     ch_multiqc_files = ch_multiqc_files.mix(BCFTOOLS_MPILEUP.out.stats.map{ it[1] })
 
     emit:
-    vcf           = ch_output        // channel: [ [id, panel, chr], vcf, tbi ]
+    vcf_tbi       = ch_output        // channel: [ [id, panel, chr], vcf, tbi ]
     versions      = ch_versions      // channel: [ versions.yml ]
     multiqc_files = ch_multiqc_files
 }
diff --git a/subworkflows/local/bam_impute_glimpse1/main.nf b/subworkflows/local/bam_impute_glimpse1/main.nf
@@ -19,15 +19,33 @@ workflow BAM_IMPUTE_GLIMPSE1 {
     ch_versions = Channel.empty()
     ch_multiqc_files = Channel.empty()
 
+        // Channels for branching
+    ch_input = ch_input
+        .branch {
+            bam: it[1] =~ 'bam|cram'
+            vcf: it[1] =~ '(vcf|bcf)(.gz)*'
+            other: true
+        }
+    ch_input.other
+        .map{ error "Input files must be either BAM/CRAM or VCF/BCF" }
+
     // Glimpse1 subworkflow
     BAM_GL_BCFTOOLS( // Compute GL for input data once per panel by chromosome
-        ch_input,
+        ch_input.bam,
         ch_posfile,
         ch_fasta
     )
     ch_multiqc_files = ch_multiqc_files.mix(BAM_GL_BCFTOOLS.out.multiqc_files)
     ch_versions = ch_versions.mix(BAM_GL_BCFTOOLS.out.versions)
 
+    // Combine input and chunks reference
+    ch_impute = ch_input.vcf
+        .combine(ch_posfile)
+        .map{ metaI, vcf, index, metaPC, legend ->
+            [metaI + ["panel": metaPC.id, "chr": metaPC.chr], vcf, index]
+        }
+        .mix(BAM_GL_BCFTOOLS.out.vcf_tbi)
+
     samples_file = Channel.of([[]]).collect()
     gmap_file    = Channel.of([[]]).collect()
 
@@ -39,7 +57,7 @@ workflow BAM_IMPUTE_GLIMPSE1 {
         }
 
     // Join input and chunks reference
-    ch_phase_input = BAM_GL_BCFTOOLS.out.vcf
+    ch_phase_input = ch_impute
         .map{ metaIPC, vcf, index -> [metaIPC.subMap("panel", "chr"), metaIPC, vcf, index] }
         .combine(samples_file)
         .combine(ch_chunks_panel, by: 0)