Skip to content

Commit

Permalink
Normalization as optional and publishing improvement (nf-core#131)
Browse files Browse the repository at this point in the history
* New dev

* Rename function

* Fix function

* Update workflow name

* Update tests

* Update function usage

* Update tests snapshot

* Update changelog

* Change phased to phase

* Update documentation

* Update snapshot

* Update publishing panel

* Update test

* Update docs/usage.md

Co-authored-by: Anabella Trigila <18577080+atrigila@users.noreply.github.com>

---------

Co-authored-by: Anabella Trigila <18577080+atrigila@users.noreply.github.com>
  • Loading branch information
LouisLeNezet and atrigila authored Oct 15, 2024
1 parent 87f537c commit 0a25c1c
Show file tree
Hide file tree
Showing 26 changed files with 240 additions and 223 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ concurrency:

jobs:
test:
name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})"
name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.TEST_PROFILE }} | ${{ matrix.profile }})"
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/phaseimpute') }}"
runs-on: ubuntu-latest
Expand Down Expand Up @@ -103,6 +103,6 @@ jobs:
wget -qO- https://code.askimed.com/install/nf-test | bash -s $NFTEST_VER
sudo mv nf-test /usr/local/bin/
- name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}"
- name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.TEST_PROFILE }} | ${{ matrix.profile }}"
run: |
nf-test test --tag "${{ matrix.TEST_PROFILE }}" --profile ${{ matrix.profile }} --verbose
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co
- [#119](https://github.com/nf-core/phaseimpute/pull/119) - Add dog test with panelprep and imputation.
- [#118](https://github.com/nf-core/phaseimpute/pull/118) - Explain how to customize arguments in the pipeline.
- [#111](https://github.com/nf-core/phaseimpute/pull/111) - Add nf-test for all sbwf, wf, modules and functions.
- [#131](https://github.com/nf-core/phaseimpute/pull/131) - Set normalisation as optional. Fix extension detection function. Add support for validation with vcf files. Concatenate vcf only if more than one file. Change `--phased` to `--phase` for consistency.

### `Changed`

Expand Down
2 changes: 1 addition & 1 deletion assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"properties": {
"sample": {
"type": "string",
"pattern": "^[a-zA-Z0-9_]+$",
"pattern": "^[a-zA-Z0-9_-]+$",
"errorMessage": "Sample name must be provided and cannot contain spaces nor special character '.' .",
"meta": ["id"]
},
Expand Down
62 changes: 46 additions & 16 deletions conf/steps/panel_prep.config
Original file line number Diff line number Diff line change
Expand Up @@ -52,40 +52,38 @@ process {
params.remove_samples ? "-s^${params.remove_samples}" : '',
"--output-type z", "--write-index=tbi"
].join(' ')
ext.prefix = { "${meta.id}_${meta.chr}_biallelic_snps" }
ext.prefix = { "${meta.id}_${meta.chr}_normalized" }
publishDir = [
path: { "${params.outdir}/prep_panel/normalized" },
saveAs: { filename -> params.compute_freq ? null : filename.equals('versions.yml') ? null : filename },
enabled: true
path: { "${params.outdir}/prep_panel/panel" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: { !params.compute_freq && !params.phase }
]
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:VCFLIB_VCFFIXUP' {
ext.prefix = { "${meta.id}_${meta.chr}" }
ext.prefix = { "${meta.id}_${meta.chr}_fixup" }
publishDir = [
path: { "${params.outdir}/prep_panel/normalized" },
path: { "${params.outdir}/prep_panel/panel" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: true
enabled: { !params.phase }
]
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX' {
ext.args = "--tbi"
publishDir = [
path: { "${params.outdir}/prep_panel/normalized" },
path: { "${params.outdir}/prep_panel/panel" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: true
enabled: { !params.phase }
]
}

// Subworkflow: VCF_PHASE_SHAPEIT5
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:.*' {
publishDir = [
path: { "${params.outdir}/prep_panel/phasing" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: false
]
publishDir = [ enabled: false ]
tag = {"${meta.id} ${meta.chr}"}
}

Expand All @@ -97,6 +95,27 @@ process {
ext.prefix = { "${meta.id}_${meta.chunk.replace(':',"_")}_chunks" }
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:SHAPEIT5_LIGATE' {
ext.prefix = { "${meta.id}_${meta.chr}_phased" }
publishDir = [
path: { "${params.outdir}/prep_panel/panel" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: true
]
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_SHAPEIT5:VCF_BCFTOOLS_INDEX_2' {
ext.prefix = { "${meta.id}_${meta.chr}_phased" }
ext.args = "--csi"
publishDir = [
path: { "${params.outdir}/prep_panel/panel" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: true
]
}

// Subworkflow: VCF_SITES_EXTRACT_BCFTOOLS
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:.*' {
publishDir = [ enabled: false ]
Expand Down Expand Up @@ -125,13 +144,24 @@ process {
].join(' ')
ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites" }
publishDir = [
path: { "${params.outdir}/prep_panel/sites/vcf/" },
path: { "${params.outdir}/prep_panel/sites/" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: true
]
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_INDEX' {
ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites" }
publishDir = [
path: { "${params.outdir}/prep_panel/sites/" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
enabled: true
]
}


// Subworkflow: Concat phased panel
withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:.*' {
publishDir = [ enabled: false ]
Expand Down
1 change: 0 additions & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ params {
fasta = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
fasta_fai = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz.fai"
panel = "${projectDir}/tests/csv/panel.csv"
phased = true

// Pipeline steps
steps = "impute"
Expand Down
3 changes: 2 additions & 1 deletion conf/test_all.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ params {
// Genome references
fasta = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
panel = "${projectDir}/tests/csv/panel.csv"
phased = false
phase = true
normalize = true
compute_freq = false

// Pipeline steps
Expand Down
9 changes: 7 additions & 2 deletions conf/test_all_fullchr.config
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/phaseimpute -profile test_panelprep,<docker/singularity> --outdir <OUTDIR>
nextflow run nf-core/phaseimpute -profile test_all_fullchr,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/
Expand All @@ -25,11 +25,16 @@ params {
// Genome references
fasta = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
panel = "${projectDir}/tests/csv/panel_fullchr.csv"
phased = false
input = "${projectDir}/tests/csv/sample_sim_full.csv"

// Pipeline steps
steps = "all"
tools = "glimpse1,glimpse2,quilt,stitch"
depth = 1

// Panelprep optional args
remove_samples = "NA12878,NA19401,NA20359,NA12891,NA12892,NA20362"
normalize = true
compute_freq = false
phase = false
}
9 changes: 7 additions & 2 deletions conf/test_dog.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,19 @@ params {
fasta = params.pipelines_testdata_base_path + "dog_data/reference_genome/canFam3.s.fa.gz"
fasta_fai = params.pipelines_testdata_base_path + "dog_data/reference_genome/canFam3.s.fa.gz.fai"
panel = "${projectDir}/tests/csv/panel_dog.csv"
phased = false

// Panelprep optional args
phase = true
normalize = false
compute_freq = false
rename_chr = true

// Input data
input = params.pipelines_testdata_base_path + "dog_data/csv/sample_dog.csv"

// Pipeline steps
steps = "panelprep,impute"
tools = "glimpse1,glimpse2,quilt"

}

process {
Expand Down
3 changes: 3 additions & 0 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ params {

// Panelprep optional args
remove_samples = "NA12878,NA12891,NA12892"
normalize = true
compute_freq = true
phase = true

// Impute tools
tools = "glimpse1"
Expand Down
1 change: 0 additions & 1 deletion conf/test_glimpse2.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ params {
// Genome references
fasta = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
panel = "${projectDir}/tests/csv/panel.csv"
phased = true

// Pipeline steps
steps = "impute"
Expand Down
5 changes: 4 additions & 1 deletion conf/test_panelprep.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ params {
fasta = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
input_region = "${projectDir}/tests/csv/region.csv"
panel = "${projectDir}/tests/csv/panel.csv"
phased = true

// Panelprep optional args
phase = true
normalize = true
compute_freq = true
remove_samples = "HG00096,HG00097,HG00099,HG00100"

Expand Down
1 change: 0 additions & 1 deletion conf/test_quilt.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ params {

// Genome references
fasta = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz"
phased = true

// Pipeline steps
steps = "impute"
Expand Down
12 changes: 5 additions & 7 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,13 @@ This steps of the pipeline performs a QC of the reference panel data and produce
The directory structure from `--steps panelprep` is:

```
├── panel
├── haplegend
├── sites
├── chunks
│ ├── glimpse1
│ └── glimpse2
├── csv
├── panel
├── haplegend
└── sites
├── tsv
└── vcf
```

### Panel directory
Expand All @@ -41,12 +39,12 @@ The directory structure from `--steps panelprep` is:
<summary>Output files</summary>

- `prep_panel/panel/`
- `*.vcf.gz`: A vcf for the prepared reference panel.
- `*.vcf.gz`: The reference panel vcf after all the preprocessing is done.
- `*.tbi*`: A tbi for the prepared reference panel.

</details>

A directory containing the final phased and prepared panel per chromosome.
A directory containing the reference panel per chromosome after preprocessing. The files will be normalized if the flag `--normalize` is used (with `_normalized` suffix). The files will have their allele frequency computed if the flag `--compute_freq` is used (with `_fixup` suffix). The files will be phased if the flag `--phase` is used (with `_phased` suffix).

### Haplegend directory

Expand Down
17 changes: 9 additions & 8 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ SAMPLE5,AEG588A5.bam,AEG588A5.bai
SAMPLE6,AEG588A6.bam,AEG588A6.bai
```

| Column | Description |
| -------- | ------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. Spaces in sample names are automatically converted to underscores (`_`). |
| `file` | Full path to a BAM or CRAM file. File has to be have the extension ".bam" or ".cram" and all files need to have the same. |
| `index` | Full path to a BAI or CRAI file. File has to be have the extension ".bai" or ".crai" and all files need to have the same. |
| Column | Description |
| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `sample` | Custom sample name. Spaces in sample names are automatically converted to underscores (`_`). |
| `file` | Full path to an alignment or variant file. File has to have the extension ".bam", ".cram" or ".vcf", ".bcf" optionally compressed with bgzip ".gz". All files need to have the same extension. |
| `index`  | Full path to index file. File has to have the extension ".bai", ".crai", ".csi", or ".tbi". All files need to have the same extension.                                                         |

An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

Expand Down Expand Up @@ -228,9 +228,10 @@ The required flags for this mode are:

- `--steps panelprep`: The steps to run.
- `--panel reference.csv`: The samplesheet containing the reference panel files in `vcf.gz` format.
- `--phased`: (optional) Whether the reference panel is phased (true|false).
- `--phase`: (optional) Whether the reference panel should be phased (true|false).
- `--normalize`: (optional) Whether the reference panel needs to be normalized or not (true|false). Default is true.
- `--remove_samples`: (optional) A comma-separated list of samples to remove from the reference during the normalization process.
- `--compute_freq`: (optional) Whether the frequency (AC/AN field) for each variant needs to be computed or not (true|false). This can be the case if the frequency is absent from the reference panel or if individuals have been removed.
- `--remove_samples`: (optional) A comma-separated list of samples to remove from the reference.

You can find an overview of the results produced by this step in the [Output](output.md).

Expand Down Expand Up @@ -359,7 +360,7 @@ bcftools convert --haplegendsample ${vcf}

#### GLIMPSE1

[GLIMPSE1](https://github.com/odelaneau/GLIMPSE/tree/glimpse1) is a set of tools for phasing and imputation for low-coverage sequencing datasets. Recommended for many samples at >0.5x coverage and small reference panels. This is an example command to run this tool from the `--steps impute`:
[GLIMPSE1](https://github.com/odelaneau/GLIMPSE/tree/glimpse1) is a set of tools for phasing and imputation for low-coverage sequencing datasets. Recommended for many samples at >0.5x coverage and small reference panels. Glimpse1 works with alignment (i.e. BAM or CRAM) as well as variant (i.e. VCF or BCF) files as input. This is an example command to run this tool from the `--steps impute`:

```bash
nextflow run nf-core/phaseimpute \
Expand Down
3 changes: 2 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ params {

// Panel preparation
panel = null
phased = null
phase = false
normalize = true
compute_freq = false
rename_chr = false
remove_samples = null
Expand Down
9 changes: 7 additions & 2 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,13 @@
"pattern": "^\\S+\\.(csv|tsv|txt)$",
"mimetype": "text/csv"
},
"phased": {
"description": "Is the reference panel phased",
"phase": {
"description": "Should the reference panel be phased",
"type": "boolean",
"pattern": "true|false"
},
"normalize": {
"description": "Should the reference panel be normalized",
"type": "boolean",
"pattern": "true|false"
},
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/bam_gl_bcftools/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ workflow BAM_GL_BCFTOOLS {
ch_multiqc_files = ch_multiqc_files.mix(BCFTOOLS_MPILEUP.out.stats.map{ it[1] })

emit:
vcf = ch_output // channel: [ [id, panel, chr], vcf, tbi ]
vcf_tbi = ch_output // channel: [ [id, panel, chr], vcf, tbi ]
versions = ch_versions // channel: [ versions.yml ]
multiqc_files = ch_multiqc_files
}
22 changes: 20 additions & 2 deletions subworkflows/local/bam_impute_glimpse1/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,33 @@ workflow BAM_IMPUTE_GLIMPSE1 {
ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()

// Channels for branching
ch_input = ch_input
.branch {
bam: it[1] =~ 'bam|cram'
vcf: it[1] =~ '(vcf|bcf)(.gz)*'
other: true
}
ch_input.other
.map{ error "Input files must be either BAM/CRAM or VCF/BCF" }

// Glimpse1 subworkflow
BAM_GL_BCFTOOLS( // Compute GL for input data once per panel by chromosome
ch_input,
ch_input.bam,
ch_posfile,
ch_fasta
)
ch_multiqc_files = ch_multiqc_files.mix(BAM_GL_BCFTOOLS.out.multiqc_files)
ch_versions = ch_versions.mix(BAM_GL_BCFTOOLS.out.versions)

// Combine input and chunks reference
ch_impute = ch_input.vcf
.combine(ch_posfile)
.map{ metaI, vcf, index, metaPC, legend ->
[metaI + ["panel": metaPC.id, "chr": metaPC.chr], vcf, index]
}
.mix(BAM_GL_BCFTOOLS.out.vcf_tbi)

samples_file = Channel.of([[]]).collect()
gmap_file = Channel.of([[]]).collect()

Expand All @@ -39,7 +57,7 @@ workflow BAM_IMPUTE_GLIMPSE1 {
}

// Join input and chunks reference
ch_phase_input = BAM_GL_BCFTOOLS.out.vcf
ch_phase_input = ch_impute
.map{ metaIPC, vcf, index -> [metaIPC.subMap("panel", "chr"), metaIPC, vcf, index] }
.combine(samples_file)
.combine(ch_chunks_panel, by: 0)
Expand Down
Loading

0 comments on commit 0a25c1c

Please sign in to comment.