From 84823bb37f73eb13f3c04173e16a03f5c3bece89 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 6 May 2024 13:23:40 +0000 Subject: [PATCH 01/17] add previous changelog info --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5b7e078..9fc04b00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co - [#20](https://github.com/nf-core/phaseimpute/pull/20) - Added automatic detection of vcf contigs for the reference panel and automatic renaming available - [#22](https://github.com/nf-core/phaseimpute/pull/20) - Add validation step for concordance analysis. Input channels changed to match inputs steps. Outdir folder organised by steps. Modules config by subworkflows. - [#26](https://github.com/nf-core/phaseimpute/pull/26) - Added QUILT method +- [#40](https://github.com/nf-core/phaseimpute/pull/26) - Add STITCH method. Reorganize panelprep subworkflows. ### `Fixed` From 74d09d62d3a31924eb2c337a8109ae7c7f755717 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 6 May 2024 13:23:57 +0000 Subject: [PATCH 02/17] add remove samples from panel option --- conf/steps/panel_prep.config | 11 +++++++++++ conf/test_full.config | 3 +++ nextflow.config | 1 + nextflow_schema.json | 7 +++++-- .../vcf_normalize_bcftools/vcf_normalize_bcftools.nf | 9 +++++++++ 5 files changed, 29 insertions(+), 2 deletions(-) diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index 7497e3f0..c5a4bec1 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -148,5 +148,16 @@ process { ] } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_REMOVE' { + ext.args = { "-Oz -s^${params.remove_samples}" } + ext.prefix = { "${meta.id}_${meta.chr}_biallelic_removed_samples" } + maxRetries = 2 + publishDir = [ enabled: false ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_4' { + ext.args = "--tbi" + publishDir = [enabled: false] + } } diff --git a/conf/test_full.config b/conf/test_full.config index c2e654eb..0112e868 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -29,4 +29,7 @@ params { panel = "${projectDir}/tests/csv/panel_full.csv" input_region_string = "all" step = "simulate" + + // Settings + remvove_samples = "NA12878,NA12891,NA12892" } diff --git a/nextflow.config b/nextflow.config index 83c45c28..3dd6890d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -22,6 +22,7 @@ params { panel = null phased = null rename_chr = false + remove_samples = null // References genome = null diff --git a/nextflow_schema.json b/nextflow_schema.json index f10ee0c4..528ff12f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -41,6 +41,10 @@ "description": "Should the panel vcf files be renamed to match the reference genome (e.g. 'chr1' -> '1')", "pattern": "true|false" }, + "remove_samples": { + "type": "string", + "description": "Comma-separated list of samples to remove from the reference panel. Useful for benchmarking purposes." + }, "email": { "type": "string", "description": "Email address for completion summary.", @@ -427,8 +431,7 @@ "format": "file-path", "schema": "assets/schema_posfile.json", "pattern": "^\\S+\\.(csv|tsv|txt)$", - "mimetype": "text/csv", - "help_text": "" + "mimetype": "text/csv" }, "k_val": { "type": "integer", diff --git a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf index e96e4a02..2f004352 100644 --- a/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf +++ b/subworkflows/local/vcf_normalize_bcftools/vcf_normalize_bcftools.nf @@ -4,6 +4,8 @@ include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcf include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2} from '../../../modules/nf-core/bcftools/index/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_3} from '../../../modules/nf-core/bcftools/index/main' include { BCFTOOLS_CONVERT } from '../../../modules/nf-core/bcftools/convert/main' +include { BCFTOOLS_VIEW as BCFTOOLS_REMOVE } from '../../../modules/nf-core/bcftools/view/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_4} from '../../../modules/nf-core/bcftools/index/main' workflow VCF_NORMALIZE_BCFTOOLS { @@ -34,6 +36,13 @@ workflow VCF_NORMALIZE_BCFTOOLS { // Join biallelic VCF and TBI ch_biallelic_vcf_tbi = BCFTOOLS_VIEW.out.vcf.join(BCFTOOLS_INDEX_2.out.tbi) + // (Optional) Remove benchmarking samples (e.g. NA12878) from the reference panel + if (!(params.remove_samples == null)){ + BCFTOOLS_REMOVE(ch_biallelic_vcf_tbi, [], [], []) + BCFTOOLS_INDEX_4(BCFTOOLS_REMOVE.out.vcf) + ch_biallelic_vcf_tbi = BCFTOOLS_REMOVE.out.vcf.join(BCFTOOLS_INDEX_4.out.tbi) + } + // Convert VCF to Hap and Legend files BCFTOOLS_CONVERT(ch_biallelic_vcf_tbi, ch_fasta, []) From 2550d8894cd3c980dcb3bf3da67fd9e710fe034d Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 6 May 2024 18:19:23 +0000 Subject: [PATCH 03/17] reorganize panelprep, remove redundant modules --- conf/steps/imputation.config | 33 ----- conf/steps/panel_prep.config | 122 +++++++++--------- .../local/vcf_normalize_bcftools/meta.yml | 44 +++++++ .../local/vcf_sites_extract_bcftools/main.nf | 32 +---- .../local/vcf_sites_extract_bcftools/meta.yml | 38 ++++++ workflows/phaseimpute/main.nf | 14 +- 6 files changed, 153 insertions(+), 130 deletions(-) delete mode 100644 conf/steps/imputation.config create mode 100644 subworkflows/local/vcf_normalize_bcftools/meta.yml create mode 100644 subworkflows/local/vcf_sites_extract_bcftools/meta.yml diff --git a/conf/steps/imputation.config b/conf/steps/imputation.config deleted file mode 100644 index 7e859eca..00000000 --- a/conf/steps/imputation.config +++ /dev/null @@ -1,33 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Config file for defining DSL2 per module options and publishing paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Available keys to override module options: - ext.args = Additional arguments appended to command in module. - ext.args2 = Second set of arguments appended to command in module (multi-tool modules). - ext.args3 = Third set of arguments appended to command in module (multi-tool modules). - ext.prefix = File name prefix for output files. ----------------------------------------------------------------------------------------- -*/ - -process { - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_IMPUT:.*' { - publishDir = [ - path: { "${params.outdir}/imputation/concat" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - ext.prefix = { "${meta.id}_impute_concat" } - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_IMPUT:BCFTOOLS_CONCAT' { - ext.args = {[ - "--ligate", - "--output-type z", - ].join(" ").trim()} - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_IMPUT:BCFTOOLS_INDEX' { - ext.args = "--tbi" - } -} diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index c5a4bec1..1f13369f 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -28,40 +28,62 @@ process { publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_NORM' { - ext.args = '-m +any --no-version --output-type z' + // Subworkflow: VCF_NORMALIZE_BCFTOOLS + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:.*' { + publishDir = [ enabled: false ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_NORM' { + ext.args = '-m -any --no-version --output-type z' ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } maxRetries = 2 publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX' { ext.args = "--tbi" publishDir = [enabled: false] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_VIEW' { - ext.args = '-v snps -Oz' - ext.prefix = { "${meta.id}_${meta.chr}_biallelic" } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_VIEW' { + ext.args = '-v snps -m 2 -M 2 -Oz' + ext.prefix = { "${meta.id}_${meta.chr}_biallelic_snps" } maxRetries = 2 publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_2' { - ext.args = '--tbi' + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_2' { + ext.args = "--tbi" + publishDir = [enabled: false] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_CONVERT' { + ext.args = {"--haplegendsample ${meta.id}_${meta.chr}"} + maxRetries = 2 + publishDir = [ + path: { "${params.outdir}/prep_panel/haplegend/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + // (Optional) Subworkflow: Remove samples from panel + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_REMOVE' { + ext.args = { "-Oz -s^${params.remove_samples}" } + ext.prefix = { "${meta.id}_${meta.chr}_biallelic_removed_samples" } maxRetries = 2 publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:VIEW_VCF_SNPS' { - ext.args = [ - "-m 2", - "-M 2", - "-v snps", - "--output-type z", - "--no-version" - ].join(' ') - ext.prefix = { "${meta.id}_SNPS" } + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_4' { + ext.args = "--tbi" + publishDir = [enabled: false] + } + + // Subworkflow: VCF_SITES_EXTRACT_BCFTOOLS + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:.*' { publishDir = [ enabled: false ] } @@ -74,7 +96,17 @@ process { "--output-type z", "--no-version" ].join(' ') - ext.prefix = { "${meta.id}_C${meta.chr}_SITES" } + ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites" } + publishDir = [ + path: { "${params.outdir}/prep_panel/sites/vcf/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_INDEX_2' { + maxRetries = 2 + ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites" } publishDir = [ path: { "${params.outdir}/prep_panel/sites/vcf/" }, mode: params.publish_dir_mode, @@ -86,7 +118,12 @@ process { ext.args = [ "-f'%CHROM\t%POS\t%REF,%ALT\\n'", ].join(' ') - ext.prefix = { "${meta.id}_glimpse_SITES_TSV" } + ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites_tsv" } + publishDir = [ enabled: false ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:TABIX_BGZIP' { + ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites_tsv" } publishDir = [ path: { "${params.outdir}/prep_panel/sites/tsv/" }, mode: params.publish_dir_mode, @@ -100,7 +137,7 @@ process { "-b2", "-e2" ].join(' ') - ext.prefix = { "${meta.id}_glimpse_SITES_TSV" } + ext.prefix = { "${meta.id}_${meta.chr}_glimpse1_sites_tsv" } publishDir = [ path: { "${params.outdir}/prep_panel/sites/tsv/" }, mode: params.publish_dir_mode, @@ -108,18 +145,8 @@ process { ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_CONVERT' { - ext.args = {"--haplegendsample ${meta.id}_${meta.chr}"} - maxRetries = 2 - publishDir = [ - path: { "${params.outdir}/prep_panel/haplegend/" }, - mode: params.publish_dir_mode, - enabled: true - ] - } - - // Phasing - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_PANEL:VCF_PHASE_SHAPEIT5:BEDTOOLS_MAKEWINDOWS' { + // (Optional) Subworkflow: Phasing + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_PHASE_PANEL:VCF_PHASE_SHAPEIT5:BEDTOOLS_MAKEWINDOWS' { ext.args = [ '-w 60000', '-s 40000' @@ -128,36 +155,9 @@ process { publishDir = [ enabled: false ] } - // TSV - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:BCFTOOLS_QUERY_STITCH' { - ext.args = [ - "-f'%CHROM\t%POS\t%REF\t%ALT\\n'", - ].join(' ') - ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } - publishDir = [ enabled: false ] - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_SITES_EXTRACT_BCFTOOLS:GAWK_STITCH' { - ext.args = "'{ key = \$1 FS \$2 } !seen[key]++'" - ext.prefix = { "${meta.id}_${meta.chr}_posfile_stitch" } - ext.suffix = "txt" - publishDir = [ - path: { "${params.outdir}/prep_panel/sites/tsv/" }, - mode: params.publish_dir_mode, - enabled: true - ] - } - - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_REMOVE' { - ext.args = { "-Oz -s^${params.remove_samples}" } - ext.prefix = { "${meta.id}_${meta.chr}_biallelic_removed_samples" } - maxRetries = 2 + // Subworkflow: Concat phased panel + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_PANEL:.*' { publishDir = [ enabled: false ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_INDEX_4' { - ext.args = "--tbi" - publishDir = [enabled: false] - } - } diff --git a/subworkflows/local/vcf_normalize_bcftools/meta.yml b/subworkflows/local/vcf_normalize_bcftools/meta.yml new file mode 100644 index 00000000..8cf0a2f4 --- /dev/null +++ b/subworkflows/local/vcf_normalize_bcftools/meta.yml @@ -0,0 +1,44 @@ +name: "vcf_normalize_bcftools" +description: Normalize VCF files (bcftools norm -m -any). + Keep only SNPs and remove multiallelic records (bcftools view -m 2 -M 2 -v snps). + Convert to hap/legend format (bcftools convert --haplegendsample). + Optionally, remove samples from the reference panel (bcftools view -s ^SAMPLENAME). +keywords: + - bcftools + - norm + - view +components: + - bcftools/norm + - bcftools/view + - bcftools/index + - bcftools/convert +input: + - ch_vcf: + type: file + description: | + Reference panel of haplotypes in VCF/BCF format. + Index file of the Reference panel file. + Target region, usually a full chromosome (e.g. chr20:1000000-2000000 or chr20). + The file could possibly be without GT field (for efficiency reasons a file containing only the positions is recommended). + Structure: [ meta, vcf, csi, region ] + - ch_fasta: + type: file + description: | + Reference genome in fasta format. + Reference genome index in fai format + Structure: [ meta, fasta, fai ] +output: + - vcf_tbi: + type: file + description: | + Output VCF/BCF file for the normalized, only biallelic SNPs. + Structure: [meta, vcf, tbi] + - hap_legend: + type: file + description: | + Output Hap/Legend files for the normalized, only biallelic SNPs. + Structure: [meta, hap, legend] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" diff --git a/subworkflows/local/vcf_sites_extract_bcftools/main.nf b/subworkflows/local/vcf_sites_extract_bcftools/main.nf index d3d30674..9755da9e 100644 --- a/subworkflows/local/vcf_sites_extract_bcftools/main.nf +++ b/subworkflows/local/vcf_sites_extract_bcftools/main.nf @@ -1,14 +1,8 @@ -include { BCFTOOLS_VIEW as VIEW_VCF_SNPS } from '../../../modules/nf-core/bcftools/view/main.nf' include { BCFTOOLS_VIEW as VIEW_VCF_SITES } from '../../../modules/nf-core/bcftools/view/main.nf' -include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main.nf' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index/main.nf' include { TABIX_BGZIP } from '../../../modules/nf-core/tabix/bgzip/main' include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' include { BCFTOOLS_QUERY } from '../../../modules/nf-core/bcftools/query/main.nf' -include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_STITCH} from '../../../modules/nf-core/bcftools/query/main.nf' -include { GAWK as GAWK_STITCH } from '../../../modules/nf-core/gawk' - - workflow VCF_SITES_EXTRACT_BCFTOOLS { take: @@ -18,19 +12,8 @@ workflow VCF_SITES_EXTRACT_BCFTOOLS { ch_versions = Channel.empty() - // Extract only SNPs from VCF - VIEW_VCF_SNPS(ch_vcf, [], [], []) - ch_versions = ch_versions.mix(VIEW_VCF_SNPS.out.versions.first()) - - // Index SNPs - BCFTOOLS_INDEX(VIEW_VCF_SNPS.out.vcf) - ch_versions = ch_versions.mix(BCFTOOLS_INDEX.out.versions.first()) - - // Join VCF and Index - ch_panel_norm = VIEW_VCF_SNPS.out.vcf.combine(BCFTOOLS_INDEX.out.csi, by:0) - // Extract sites positions - VIEW_VCF_SITES( ch_panel_norm,[], [], []) + VIEW_VCF_SITES( ch_vcf,[], [], []) ch_versions = ch_versions.mix(VIEW_VCF_SITES.out.versions.first()) // Index extracted sites @@ -40,8 +23,6 @@ workflow VCF_SITES_EXTRACT_BCFTOOLS { // Join extracted sites and index ch_panel_sites = VIEW_VCF_SITES.out.vcf.combine(BCFTOOLS_INDEX_2.out.csi, by:0) - // Create TSVs for different tools - // Convert to TSV with structure for Glimpse BCFTOOLS_QUERY(ch_panel_sites, [], [], []) ch_versions = ch_versions.mix(BCFTOOLS_QUERY.out.versions.first()) @@ -57,18 +38,9 @@ workflow VCF_SITES_EXTRACT_BCFTOOLS { // Join compressed TSV and index ch_panel_tsv = TABIX_BGZIP.out.output.combine(TABIX_TABIX.out.tbi, by: 0) - // TSV for STITCH - // Convert position file to tab-separated file - BCFTOOLS_QUERY_STITCH(ch_panel_sites, [], [], []) - ch_posfile = BCFTOOLS_QUERY_STITCH.out.output - - // Remove multiallelic positions from tsv - GAWK_STITCH(ch_posfile, []) - emit: panel_tsv = ch_panel_tsv - vcf_tbi = ch_panel_norm + vcf_tbi = ch_vcf panel_sites = ch_panel_sites - posfile = GAWK_STITCH.out.output versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/vcf_sites_extract_bcftools/meta.yml b/subworkflows/local/vcf_sites_extract_bcftools/meta.yml new file mode 100644 index 00000000..5912b909 --- /dev/null +++ b/subworkflows/local/vcf_sites_extract_bcftools/meta.yml @@ -0,0 +1,38 @@ +name: "vcf_sites_extract_bcftools" +description: Extract only sites (no genotype data) (bcftools view -G -m 2 -M 2 -v). + Convert to glimpse1 TSV format (bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n'). + Convert to stitch/quilt TSV format (bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\n'). +keywords: + - bcftools + - query + - view +components: + - bcftools/view + - bcftools/query +input: + - ch_vcf: + type: file + description: | + Reference panel of haplotypes in VCF/BCF format. + Index file of the Reference panel file. + Target region, usually a full chromosome (e.g. chr20:1000000-2000000 or chr20). + The file could possibly be without GT field (for efficiency reasons a file containing only the positions is recommended). + Structure: [ meta, vcf, csi, region ] +output: + - vcf_tbi: + type: file + description: | + Output VCF/BCF file for the normalized, only biallelic SNPs. + Structure: [meta, vcf, tbi] + - panel_tsv: + type: file + description: | + Compressed panel TSV and index for Glimpse1. + - panel_sites: + type: file + description: | + Panel VCF and index for Glimpse1. + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index ab9cc73d..8d3a6cbe 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -132,7 +132,12 @@ workflow PHASEIMPUTE { VCF_SITES_EXTRACT_BCFTOOLS(VCF_NORMALIZE_BCFTOOLS.out.vcf_tbi) ch_versions = ch_versions.mix(VCF_SITES_EXTRACT_BCFTOOLS.out.versions) - // Phase panel + // Prepare posfile stitch + PREPARE_POSFILE_TSV(VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, ch_fasta) + ch_versions = ch_versions.mix(PREPARE_POSFILE_TSV.out.versions) + + // If required, phase panel (currently not working, a test should be added) + // Phase panel with tool of choice (e.g. SHAPEIT5) VCF_PHASE_PANEL(VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi, VCF_SITES_EXTRACT_BCFTOOLS.out.vcf_tbi, VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, @@ -156,12 +161,9 @@ workflow PHASEIMPUTE { .map{ metaPC, norm, n_index, sites, s_index, tsv, t_index, phased, p_index -> [metaPC, phased, p_index] } - // Prepare posfile stitch - PREPARE_POSFILE_TSV(VCF_SITES_EXTRACT_BCFTOOLS.out.panel_sites, ch_fasta) - ch_versions = ch_versions.mix(PREPARE_POSFILE_TSV.out.versions) // Create chunks from reference VCF - MAKE_CHUNKS(ch_panel) + MAKE_CHUNKS(ch_panel_phased) ch_versions = ch_versions.mix(MAKE_CHUNKS.out.versions) } @@ -221,7 +223,7 @@ workflow PHASEIMPUTE { print("Impute with STITCH") // Obtain the user's posfile if provided or calculate it from ref panel file - if (params.posfile) { // User supplied posfile + if (params.posfile ) { // User supplied posfile ch_posfile = ch_posfile } else if (params.panel && params.step.split(',').contains("panelprep")) { // Panelprep posfile ch_posfile = PREPARE_POSFILE_TSV.out.posfile From 17f98feb1bcb0788d329f8e7ee73519cc783f304 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Tue, 7 May 2024 20:13:55 +0000 Subject: [PATCH 04/17] add glimpse2 chunks to make_chunks panelprep --- conf/steps/imputation_quilt.config | 20 +++--- conf/steps/panel_prep.config | 32 +++++++++ docs/output.md | 66 ++++++++++++++----- docs/usage.md | 24 ++++++- nextflow.config | 1 - subworkflows/local/make_chunks/make_chunks.nf | 46 ++++++++++++- workflows/phaseimpute/main.nf | 2 +- 7 files changed, 157 insertions(+), 34 deletions(-) diff --git a/conf/steps/imputation_quilt.config b/conf/steps/imputation_quilt.config index fbb8dcc4..4e1fd47e 100644 --- a/conf/steps/imputation_quilt.config +++ b/conf/steps/imputation_quilt.config @@ -21,22 +21,20 @@ process { } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:.*' { - - ext.prefix = { "${meta.id}_${meta.chr}" } - publishDir = [ - [ - path: { "${params.outdir}/imputation/quilt/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}_chunk" }, - mode: params.publish_dir_mode, - enabled: false - ], - - + path: { "${params.outdir}/prep_panel/chunks/" }, + mode: params.publish_dir_mode, + enabled: true ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:GLIMPSE_CHUNK' { - ext.prefix = { "${meta.id}_${meta.chr}" } + ext.prefix = { "${meta.panel}_${meta.chr}_chunks_glimpse1" } + publishDir = [ + path: { "${params.outdir}/prep_panel/chunks/glimpse1/" }, + mode: params.publish_dir_mode, + enabled: true + ] } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:.*' { diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index 1f13369f..dfd6ed78 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -160,4 +160,36 @@ process { publishDir = [ enabled: false ] } + // Subworkflow: Make chunks + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:.*' { + publishDir = [ + path: { "${params.outdir}/prep_panel/chunks/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:GLIMPSE2_CHUNK' { + ext.args = [ + "--window-mb 2.0" + ].join(' ') + ext.prefix = { "${meta.panel}_${meta.chr}_chunks_glimpse2" } + publishDir = [ + path: { "${params.outdir}/prep_panel/chunks/glimpse2/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:GLIMPSE2_SPLITREFERENCE' { + ext.prefix = { "${meta.panel}_${meta.chr}_chunks_glimpse2" } + publishDir = [ + path: { "${params.outdir}/prep_panel/chunks/glimpse2/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + } diff --git a/docs/output.md b/docs/output.md index 97d7d4d7..9f85428e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,30 +10,57 @@ The directories listed below will be created in the results directory after the ## Pipeline overview -## QUILT imputation mode - -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +## Panel preparation outputs `--step panelprep` +This step of the pipeline performs a QC of the reference panel data and produces the necessary files for imputation (`--step impute`). It has two optional modes: reference panel phasing with SHAPEIT5 and removal of specified samples from reference panel. -- [Glimpse Chunk](#glimpse) - Create chunks of the reference panel - [Remove Multiallelics](#multiallelics) - Remove multiallelic sites from the reference panel - [Convert](#convert) - Convert reference panel to .hap and .legend files -- [QUILT](#quilt) - Perform imputation -- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF. +- [Posfile](#posfile) - Produce a TSV with the list of positions to genotype (for STITCH/QUILT) +- [Sites](#sites) - Produce a TSV with the list of positions to genotype (for GLIMPSE1) +- [Glimpse Chunk](#glimpse) - Create chunks of the reference panel + +### Convert + +- `prep_panel/haplegend/` + - `*.hap`: a .hap file for the reference panel. + - `*.legend*`: a .legend file for the reference panel. + +[bcftools](https://samtools.github.io/bcftools/bcftools.html) aids in the conversion of vcf files to .hap and .legend files. A .samples file is also generated. Once that you have generated the hap and legend files for your reference panel, you can skip the reference preparation step and directly submit these files for imputation (to be developed). The hap and legend files are input files used with `--tools quilt`. + +### Posfile + +- `prep_panel/posfile/` + - `*.hap`: a .txt file with the list of position to genotype. + +[bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces tab-delimited files per chromosome that can be gathered into a samplesheet and directly submitted for imputation with `--tools stitch` using the parameter `--posfile`. + +### Sites + +- `prep_panel/sites/` + - `vcf/` + - `*.vcf.gz`: VCF with biallelic SNPs only. + - `*.csi`: Index file for VCF. + - `tsv/` + - `*.txt.gz`: TXT file for biallelic SNPs. + - `*.tbi`: Index file for TSV. + +[bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces VCF (`*.vcf.gz`) files per chromosome. These QCed VCFs can be gathered into a csv and used with all the tools in `--step impute` using the flag `--panel`. + +In addition, [bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces tab-delimited files (`*_tsv.txt`) and, together with the VCFs, they can be gathered into a samplesheet and directly submitted for imputation with `--tools glimpse1` and `--posfile` (not yet implemented). ### Glimpse Chunk -- `imputation/glimpse_chunk/` +- `prep_panel/chunks/` - `*.txt`: TXT file containing the chunks obtained from running Glimpse chunks. -[Glimpse chunk](https://odelaneau.github.io/GLIMPSE/) defines chunks where to run imputation. For further reading and documentation see the [Glimpse documentation](https://odelaneau.github.io/GLIMPSE/glimpse1/commands.html). Once that you have generated the chunks for your reference panel, you can skip the reference preparation step and directly submit this file for imputation. +[Glimpse1 chunk](https://odelaneau.github.io/GLIMPSE/) defines chunks where to run imputation. For further reading and documentation see the [Glimpse1 documentation](https://odelaneau.github.io/GLIMPSE/glimpse1/commands.html). Once that you have generated the chunks for your reference panel, you can skip the reference preparation step and directly submit this file for imputation. -### Convert +## QUILT imputation mode -- `imputation/bcftools/convert/` - - `*.hap`: a .hap file for the reference panel. - - `*.legend*`: a .legend file for the reference panel. +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -[bcftools](https://samtools.github.io/bcftools/bcftools.html) aids in the conversion of vcf files to .hap and .legend files. A .samples file is also generated. Once that you have generated the hap and legend files for your reference panel, you can skip the reference preparation step and directly submit these files for imputation (to be developed). +- [QUILT](#quilt) - Perform imputation +- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF. ### QUILT @@ -45,7 +72,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d ### Concat -- `imputation/bcftools/concat` +- `imputation/quilt/bcftools/concat` - `.*.vcf.gz`: Imputed and ligated VCF for all the input samples. [bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs in chunks. @@ -54,13 +81,20 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [Remove Multiallelics](#multiallelics) - Remove multiallelic sites - [STITCH](#quilt) - Perform imputation - [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF +### STITCH + +- `imputation/stitch/` +- `stitch.*.vcf.gz`: Imputed VCF for a specific chunk. +- `stitch.*.vcf.gz.tbi`: TBI for the Imputed VCF for a specific chunk. + +[STITCH](https://github.com/rwdavies/STITCH) performs the imputation. This step will contain the VCF for each of the chunks. + ### Concat -- `imputation/bcftools/concat` +- `imputation/stitch/bcftools/concat` - `.*.vcf.gz`: Imputed and concatenated VCF for all the input samples. [bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs. diff --git a/docs/usage.md b/docs/usage.md index f5edb39f..b6b4029c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -125,6 +125,26 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +### Starting the pipeline +Phaseimpute can be started at different points in the analysis by setting the flag `--step` and the available options `[simulate, panelprep, impute, validate, all]`. + +### Start with panel preparation `--step panelprep` + +There are a few quality control steps that are applied to reference panels. These include actions such as removing multiallelic SNPs and indels, removing certain samples from the reference panel (such as related samples). In addition, chunks are produced which are then used in the imputation steps. The flag `--step panelprep` produces the filtered output files that are used as input in the subsequent imputation steps, such as `--step impute`. + +For starting from panel preparation, the required flags are `--step panelprep` and`--panel samplesheet_reference.csv`. + +```bash +nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_reference.csv --step panelprep --outdir results --genome GRCh37 -profile docker +``` +### Start from imputation tools `--step impute` + +For starting from the imputation step, the required flags are: +- `--step impute` +- `--input input.csv`: The samplesheet containing the input sample files in `bam` format. +- `--panel samplesheet_reference.csv`: The files in `samplesheet_reference.csv` are the filtered, quality controlled, bi-allelic VCFs obtained from `--step panelprep`. +- `--tools [glimpse1, quilt, stitch]`: A selection of one or more of the available imputation tools. Each imputation tool has their own set of specific flags and input files. These are produced by `--step panelprep`. + ### Imputation tools `--step impute --tools [glimpse1, quilt, stitch]` You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included. @@ -139,13 +159,13 @@ nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_ref [STITCH](https://github.com/rwdavies/STITCH) is an R program for low coverage sequencing genotype imputation without using a reference panel. The required inputs for this program are bam samples provided in the input samplesheet (`--input`) and a tsv file with the list of positions to genotype (`--posfile`). -If you do not have a list of position to genotype, you can provide a reference panel to run the `--mode panelprep` which produces a tsv with this list. +If you do not have a list of position to genotype, you can provide a reference panel to run the `--step panelprep` which produces a tsv with this list. ```bash nextflow run nf-core/phaseimpute --input samplesheet.csv --step panelprep --panel samplesheet_reference.csv --outdir results --genome GRCh37 -profile docker ``` -Otherwise, you can provide your own position file in the `--mode impute` with STITCH using the the `--posfile` parameter. +Otherwise, you can provide your own position file in the `--step impute` with STITCH using the the `--posfile` parameter. ```bash nextflow run nf-core/phaseimpute --input samplesheet.csv --step impute --posfile samplesheet_posfile.csv --tool stitch --outdir results --genome GRCh37 -profile docker diff --git a/nextflow.config b/nextflow.config index 3dd6890d..00b2e66c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -293,7 +293,6 @@ includeConfig 'conf/steps/simulation.config' includeConfig 'conf/steps/panel_prep.config' // imputation step -includeConfig 'conf/steps/imputation.config' includeConfig 'conf/steps/imputation_glimpse1.config' includeConfig 'conf/steps/imputation_quilt.config' includeConfig 'conf/steps/imputation_stitch.config' diff --git a/subworkflows/local/make_chunks/make_chunks.nf b/subworkflows/local/make_chunks/make_chunks.nf index 5c4319dc..67741ff6 100644 --- a/subworkflows/local/make_chunks/make_chunks.nf +++ b/subworkflows/local/make_chunks/make_chunks.nf @@ -1,29 +1,69 @@ include { GLIMPSE_CHUNK } from '../../../modules/nf-core/glimpse/chunk/main' +include { GLIMPSE2_CHUNK } from '../../../modules/nf-core/glimpse2/chunk/main' +include { GLIMPSE2_SPLITREFERENCE } from '../../../modules/nf-core/glimpse2/splitreference/main' workflow MAKE_CHUNKS { take: ch_reference // channel: [ val(meta),vcf ] + ch_map // channel (optional): [ meta, map ] main: ch_versions = Channel.empty() - // Make chunks + // Add chromosome to channel ch_vcf_csi_chr = ch_reference.map{meta, vcf, csi -> [meta, vcf, csi, meta.chr]} + ch_vcf_csi_chr.dump(tag:"ch_vcf_csi_chr") + + // Make chunks with Glimpse1 GLIMPSE_CHUNK(ch_vcf_csi_chr) ch_versions = ch_versions.mix(GLIMPSE_CHUNK.out.versions) // Rearrange chunks into channel for QUILT - ch_chunks = GLIMPSE_CHUNK.out.chunk_chr + ch_chunks_glimpse1 = GLIMPSE_CHUNK.out.chunk_chr .splitText() .map { metamap, line -> def fields = line.split("\t") def startEnd = fields[2].split(':')[1].split('-') [metamap, metamap.chr, startEnd[0], startEnd[1]] } + ch_chunks_glimpse1.dump(tag:"ch_chunks_glimpse1") + + // Make chunks with Glimpse2 (does not work with "sequential" mode) + chunk_model = "recursive" + + GLIMPSE2_CHUNK ( ch_vcf_csi_chr, ch_map, chunk_model ) + ch_versions = ch_versions.mix( GLIMPSE2_CHUNK.out.versions.first() ) + + // Rearrange channels + ch_chunks_glimpse2 = GLIMPSE2_CHUNK.out.chunk_chr + .splitCsv(header: ['ID', 'Chr', 'RegionBuf', 'RegionCnk', 'WindowCm', + 'WindowMb', 'NbTotVariants', 'NbComVariants'], + sep: "\t", skip: 0) + .map { meta, it -> [meta, it["RegionBuf"], it["RegionCnk"]]} + ch_chunks_glimpse2.dump(tag:"ch_chunks_glimpse2") + + // Split reference panel in bin files + // Segmentation fault occurs in small-sized panels + // Should be run only in full-sized panels + + ch_bins = [[]] + + if (params.binaryref == true) { + // Create channel to split reference + split_input = ch_reference.combine(ch_chunks_glimpse2, by: 0) + + // Create a binary reference panel for quick reading time + GLIMPSE2_SPLITREFERENCE( split_input, ch_map ) + ch_versions = ch_versions.mix( GLIMPSE2_SPLITREFERENCE.out.versions.first() ) + + ch_bins = GLIMPSE2_SPLITREFERENCE.out.bin_ref + } emit: - chunks = ch_chunks // channel: [ chr, val(meta), start, end, number ] + chunks = ch_chunks_glimpse1 // channel: [ chr, val(meta), start, end, number ] + chunks_glimpse2 = ch_chunks_glimpse2 + binary = ch_bins // channel: [ [meta], bin] versions = ch_versions // channel: [ versions.yml ] } diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 8d3a6cbe..55531a40 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -163,7 +163,7 @@ workflow PHASEIMPUTE { } // Create chunks from reference VCF - MAKE_CHUNKS(ch_panel_phased) + MAKE_CHUNKS(ch_panel_phased, ch_map) ch_versions = ch_versions.mix(MAKE_CHUNKS.out.versions) } From b291997dec0c5ad6c1e8d8615d303d7a085d295d Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 8 May 2024 19:02:23 +0000 Subject: [PATCH 05/17] change subworkflow name to nf-core standards --- conf/steps/imputation_quilt.config | 4 ++-- conf/steps/panel_prep.config | 6 +++--- subworkflows/local/make_chunks/make_chunks.nf | 2 +- workflows/phaseimpute/main.nf | 8 ++++---- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conf/steps/imputation_quilt.config b/conf/steps/imputation_quilt.config index 4e1fd47e..36239fa6 100644 --- a/conf/steps/imputation_quilt.config +++ b/conf/steps/imputation_quilt.config @@ -20,7 +20,7 @@ process { ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:.*' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:.*' { publishDir = [ path: { "${params.outdir}/prep_panel/chunks/" }, mode: params.publish_dir_mode, @@ -28,7 +28,7 @@ process { ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:GLIMPSE_CHUNK' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE_CHUNK' { ext.prefix = { "${meta.panel}_${meta.chr}_chunks_glimpse1" } publishDir = [ path: { "${params.outdir}/prep_panel/chunks/glimpse1/" }, diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index dfd6ed78..1f3bf2be 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -162,7 +162,7 @@ process { // Subworkflow: Make chunks - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:.*' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:.*' { publishDir = [ path: { "${params.outdir}/prep_panel/chunks/" }, mode: params.publish_dir_mode, @@ -171,7 +171,7 @@ process { } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:GLIMPSE2_CHUNK' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE2_CHUNK' { ext.args = [ "--window-mb 2.0" ].join(' ') @@ -183,7 +183,7 @@ process { ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:MAKE_CHUNKS:GLIMPSE2_SPLITREFERENCE' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_CHUNK_GLIMPSE:GLIMPSE2_SPLITREFERENCE' { ext.prefix = { "${meta.panel}_${meta.chr}_chunks_glimpse2" } publishDir = [ path: { "${params.outdir}/prep_panel/chunks/glimpse2/" }, diff --git a/subworkflows/local/make_chunks/make_chunks.nf b/subworkflows/local/make_chunks/make_chunks.nf index 67741ff6..021b1d45 100644 --- a/subworkflows/local/make_chunks/make_chunks.nf +++ b/subworkflows/local/make_chunks/make_chunks.nf @@ -2,7 +2,7 @@ include { GLIMPSE_CHUNK } from '../../../modules/nf-core/gli include { GLIMPSE2_CHUNK } from '../../../modules/nf-core/glimpse2/chunk/main' include { GLIMPSE2_SPLITREFERENCE } from '../../../modules/nf-core/glimpse2/splitreference/main' -workflow MAKE_CHUNKS { +workflow VCF_CHUNK_GLIMPSE { take: ch_reference // channel: [ val(meta),vcf ] diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 55531a40..8197db9f 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -36,7 +36,7 @@ include { COMPUTE_GL as GL_INPUT } from '../../subworkflows/ include { VCF_CONCATENATE_BCFTOOLS as CONCAT_GLIMPSE1} from '../../subworkflows/local/vcf_concatenate_bcftools' // QUILT subworkflows -include { MAKE_CHUNKS } from '../../subworkflows/local/make_chunks/make_chunks' +include { VCF_CHUNK_GLIMPSE } from '../../subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse' include { IMPUTE_QUILT } from '../../subworkflows/local/impute_quilt/impute_quilt' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_QUILT } from '../../subworkflows/local/vcf_concatenate_bcftools' @@ -163,8 +163,8 @@ workflow PHASEIMPUTE { } // Create chunks from reference VCF - MAKE_CHUNKS(ch_panel_phased, ch_map) - ch_versions = ch_versions.mix(MAKE_CHUNKS.out.versions) + VCF_CHUNK_GLIMPSE(ch_panel_phased, ch_map) + ch_versions = ch_versions.mix(VCF_CHUNK_GLIMPSE.out.versions) } if (params.step.split(',').contains("impute") || params.step.split(',').contains("all")) { @@ -256,7 +256,7 @@ workflow PHASEIMPUTE { print("Impute with QUILT") // Impute BAMs with QUILT - IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, MAKE_CHUNKS.out.chunks) + IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, VCF_CHUNK_GLIMPSE.out.chunks) ch_versions = ch_versions.mix(IMPUTE_QUILT.out.versions) // Add to output channel From 5d8a31c0c86fe1793d1bcb7d641898248951d75c Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 8 May 2024 19:05:34 +0000 Subject: [PATCH 06/17] change subworkflow name to nf-core standard --- conf/steps/imputation_quilt.config | 10 +++++----- subworkflows/local/impute_quilt/impute_quilt.nf | 2 +- workflows/phaseimpute/main.nf | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/conf/steps/imputation_quilt.config b/conf/steps/imputation_quilt.config index 36239fa6..f43f5119 100644 --- a/conf/steps/imputation_quilt.config +++ b/conf/steps/imputation_quilt.config @@ -37,7 +37,7 @@ process { ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:.*' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_QUILT:.*' { publishDir = [ [ path: { "${params.outdir}/imputation/quilt/" }, @@ -46,22 +46,22 @@ process { ] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:QUILT_QUILT' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_QUILT:QUILT_QUILT' { ext.prefix = { "${meta.id}_R${meta.region.replace(':','_')}.impute" } publishDir = [enabled: false] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:BCFTOOLS_INDEX_1' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_QUILT:BCFTOOLS_INDEX_1' { ext.args = "--tbi" publishDir = [enabled: false] } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:BCFTOOLS_ANNOTATE' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_QUILT:BCFTOOLS_ANNOTATE' { ext.args = "--set-id '%CHROM:%POS:%REF:%ALT' -Oz" ext.prefix = { "${meta.id}_R${meta.region.replace(':','_')}.impute.annotate" } } - withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:IMPUTE_QUILT:BCFTOOLS_INDEX_2' { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_QUILT:BCFTOOLS_INDEX_2' { ext.args = "--tbi" } diff --git a/subworkflows/local/impute_quilt/impute_quilt.nf b/subworkflows/local/impute_quilt/impute_quilt.nf index 5bba1604..9935cf35 100644 --- a/subworkflows/local/impute_quilt/impute_quilt.nf +++ b/subworkflows/local/impute_quilt/impute_quilt.nf @@ -4,7 +4,7 @@ include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_1 } from '../../../modules/nf-core/bc include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_2 } from '../../../modules/nf-core/bcftools/index' -workflow IMPUTE_QUILT { +workflow BAM_IMPUTE_QUILT { take: ch_hap_legend // channel: [ [panel, chr], hap, legend ] diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 8197db9f..269473e9 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -37,7 +37,7 @@ include { VCF_CONCATENATE_BCFTOOLS as CONCAT_GLIMPSE1} from '../../subworkflows/ // QUILT subworkflows include { VCF_CHUNK_GLIMPSE } from '../../subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse' -include { IMPUTE_QUILT } from '../../subworkflows/local/impute_quilt/impute_quilt' +include { BAM_IMPUTE_QUILT } from '../../subworkflows/local/bam_impute_quilt/bam_impute_quilt' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_QUILT } from '../../subworkflows/local/vcf_concatenate_bcftools' // STITCH subworkflows @@ -256,14 +256,14 @@ workflow PHASEIMPUTE { print("Impute with QUILT") // Impute BAMs with QUILT - IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, VCF_CHUNK_GLIMPSE.out.chunks) - ch_versions = ch_versions.mix(IMPUTE_QUILT.out.versions) + BAM_IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, VCF_CHUNK_GLIMPSE.out.chunks) + ch_versions = ch_versions.mix(BAM_IMPUTE_QUILT.out.versions) // Add to output channel - ch_impute_output = ch_impute_output.mix(IMPUTE_QUILT.out.vcf_tbi) + ch_impute_output = ch_impute_output.mix(BAM_IMPUTE_QUILT.out.vcf_tbi) // Concatenate by chromosomes - CONCAT_QUILT(IMPUTE_QUILT.out.vcf_tbi) + CONCAT_QUILT(BAM_IMPUTE_QUILT.out.vcf_tbi) ch_versions = ch_versions.mix(CONCAT_QUILT.out.versions) // Add results to input validate From 783ff1bb1af14870c45e165f2d141c58e72ee18c Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 8 May 2024 19:31:57 +0000 Subject: [PATCH 07/17] rename files to match subworkflow nf-core standard --- .../impute_quilt.nf => bam_impute_quilt/bam_impute_quilt.nf} | 0 .../make_chunks.nf => vcf_chunk_glimpse/vcf_chunk_glimpse.nf} | 3 --- 2 files changed, 3 deletions(-) rename subworkflows/local/{impute_quilt/impute_quilt.nf => bam_impute_quilt/bam_impute_quilt.nf} (100%) rename subworkflows/local/{make_chunks/make_chunks.nf => vcf_chunk_glimpse/vcf_chunk_glimpse.nf} (94%) diff --git a/subworkflows/local/impute_quilt/impute_quilt.nf b/subworkflows/local/bam_impute_quilt/bam_impute_quilt.nf similarity index 100% rename from subworkflows/local/impute_quilt/impute_quilt.nf rename to subworkflows/local/bam_impute_quilt/bam_impute_quilt.nf diff --git a/subworkflows/local/make_chunks/make_chunks.nf b/subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse.nf similarity index 94% rename from subworkflows/local/make_chunks/make_chunks.nf rename to subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse.nf index 021b1d45..1703d96b 100644 --- a/subworkflows/local/make_chunks/make_chunks.nf +++ b/subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse.nf @@ -14,7 +14,6 @@ workflow VCF_CHUNK_GLIMPSE { // Add chromosome to channel ch_vcf_csi_chr = ch_reference.map{meta, vcf, csi -> [meta, vcf, csi, meta.chr]} - ch_vcf_csi_chr.dump(tag:"ch_vcf_csi_chr") // Make chunks with Glimpse1 GLIMPSE_CHUNK(ch_vcf_csi_chr) @@ -28,7 +27,6 @@ workflow VCF_CHUNK_GLIMPSE { def startEnd = fields[2].split(':')[1].split('-') [metamap, metamap.chr, startEnd[0], startEnd[1]] } - ch_chunks_glimpse1.dump(tag:"ch_chunks_glimpse1") // Make chunks with Glimpse2 (does not work with "sequential" mode) chunk_model = "recursive" @@ -42,7 +40,6 @@ workflow VCF_CHUNK_GLIMPSE { 'WindowMb', 'NbTotVariants', 'NbComVariants'], sep: "\t", skip: 0) .map { meta, it -> [meta, it["RegionBuf"], it["RegionCnk"]]} - ch_chunks_glimpse2.dump(tag:"ch_chunks_glimpse2") // Split reference panel in bin files // Segmentation fault occurs in small-sized panels From 0bd47041cdfd2fa3b7957885c2462d104214881a Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Wed, 8 May 2024 19:35:12 +0000 Subject: [PATCH 08/17] add full size data and params --- conf/test_full.config | 8 ++++---- tests/csv/sample_sim_full.csv | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conf/test_full.config b/conf/test_full.config index 0112e868..408d1dce 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -16,7 +16,7 @@ params { // Genome references - map = "https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip" + //map = "https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/plink.GRCh38.map.zip" genome = "GRCh38" // Resources increase incompatible with Github Action @@ -27,9 +27,9 @@ params { // Input data input = "${projectDir}/tests/csv/sample_sim_full.csv" panel = "${projectDir}/tests/csv/panel_full.csv" - input_region_string = "all" - step = "simulate" + step = "all" // Settings - remvove_samples = "NA12878,NA12891,NA12892" + tools = "glimpse1" + remove_samples = "NA12878,NA12891,NA12892" } diff --git a/tests/csv/sample_sim_full.csv b/tests/csv/sample_sim_full.csv index 592282cc..8fe97448 100644 --- a/tests/csv/sample_sim_full.csv +++ b/tests/csv/sample_sim_full.csv @@ -1,2 +1,2 @@ sample,file,index -#TODO find bam not in 1000G panel +NA12878,ftp://ftp-trace.ncbi.nih.gov/ReferenceSamples/giab/data/NA12878/Nebraska_NA12878_HG001_TruSeq_Exome/NIST-hg001-7001-ready.bam,ftp://ftp-trace.ncbi.nih.gov/ReferenceSamples/giab/data/NA12878/Nebraska_NA12878_HG001_TruSeq_Exome/NIST-hg001-7001-ready.bam.bai From a21b80dd41df806f9bd26a84d31283336152a7d4 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 9 May 2024 15:47:23 +0000 Subject: [PATCH 09/17] add docs info --- README.md | 52 ++++++++++++++++++++++++++++++++++++++++----------- docs/usage.md | 33 +++++++++++++++++++++++++++----- 2 files changed, 69 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 1ac75567..148130d8 100644 --- a/README.md +++ b/README.md @@ -44,17 +44,26 @@ sample,bam,bai 1_BAM_1X,/path/to/.bam,/path/to/.bai ``` -Each row represents a bam file with its index file. +Each row represents a bam file with its index file. For some tools and steps, you will also need to submit a samplesheet with the reference panel. + +A final samplesheet file for the reference panel may look something like the one below. This is for 3 chromosomes. + +```console +chr,vcf +1,ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz +2,ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz +3,ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz +``` Now, you can run the pipeline using: ```bash nextflow run nf-core/phaseimpute \ -profile \ - --input samplesheet.csv \ + --input \ --genome "GRCh38" \ --panel \ - --steps "impute" \ + --step "panelprep,impute" \ --tools "glimpse1" \ --outdir ``` @@ -65,18 +74,18 @@ nextflow run nf-core/phaseimpute \ For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/phaseimpute/usage) and the [parameter documentation](https://nf-co.re/phaseimpute/parameters). -## Description of the different mode of the pipeline +## Description of the different steps of the pipeline -Here is a short description of the different mode of the pipeline. +Here is a short description of the different steps of the pipeline. For more information please refer to the [documentation](https://nf-core.github.io/phaseimpute/usage/). -| Mode | Flow chart | Description | +| Step | Flow chart | Description | | ------------------ | ---------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Preprocessing** | phase_metro | The preprocessing mode is responsible to the preparation of the multiple input file that will be used by the phasing process.
The main processes are :
- **Haplotypes phasing** of the reference panel using [**Shapeit5**](https://odelaneau.github.io/shapeit5/).
- **Filter** the reference panel to select only the necessary variants.
- **Chunking the reference panel** in a subset of region for all the chromosomes.
- **Extract** the positions where to perform the imputation. | -| **Phasing** | phase_metro | The phasing mode is the core mode of this pipeline.
It is constituted of 3 main steps:
- **Phasing**: Phasing of the target dataset on the reference panel using either:
  - [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html)
  It's come with the necessety to compute the genotype likelihoods of the target dataset.
  This step is done using [BCFTOOLS_mpileup](https://samtools.github.io/bcftools/bcftools.html#mpileup)
  - [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/glimpse2/index.html) For this step the reference panel is transformed to binary chunks.
  - [**Stitch**](https://github.com/rwdavies/stitch)
  - [**Quilt**](https://github.com/rwdavies/QUILT)
- **Ligation**: all the different chunks are merged together.
- **Sampling** (optional) | +| **Panel prep** | phase_metro | The preprocessing mode is responsible to the preparation of the multiple input file that will be used by the phasing process.
The main processes are :
- **Haplotypes phasing** of the reference panel using [**Shapeit5**](https://odelaneau.github.io/shapeit5/).
- **Filter** the reference panel to select only the necessary variants.
- **Chunking the reference panel** in a subset of region for all the chromosomes.
- **Extract** the positions where to perform the imputation. | +| **Impute** | phase_metro | The imputation mode is the core mode of this pipeline.
It is constituted of 3 main steps:
- **Phasing**: Phasing of the target dataset on the reference panel using either:
  - [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html)
  It's come with the necessety to compute the genotype likelihoods of the target dataset.
  This step is done using [BCFTOOLS_mpileup](https://samtools.github.io/bcftools/bcftools.html#mpileup)
  - [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/glimpse2/index.html) For this step the reference panel is transformed to binary chunks.
  - [**Stitch**](https://github.com/rwdavies/stitch)
  - [**Quilt**](https://github.com/rwdavies/QUILT)
- **Ligation**: all the different chunks are merged together.
- **Sampling** (optional) | | **Simulate** | simulate_metro | The simulation mode is used to create artificial low informative genetic information from high density data. This allow to compare the imputed result to a _truth_ and therefore evaluate the quality of the imputation.
For the moment it is possible to simulate:
- Low-pass data by **downsample** BAM or CRAM using [SAMTOOLS_view -s]() at different depth
- Genotype data by **SNP selecting** the position used by a designated SNP chip.
The simulation mode will also compute the **Genotype likelihoods** of the high density data. | -| **Concordance** | concordance_metro | This mode compare two vcf together to compute a summary of the differences between them.
To do so it use either:
- [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html) concordance process.
- [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/glimpse2/index.html) concordance process
- Or convert the two vcf fill to `.zarr` using [**Scikit allele**](https://scikit-allel.readthedocs.io/en/stable/) and [**anndata**](https://anndata.readthedocs.io/en/latest/) before comparing the SNPs. | -| **Postprocessing** | postprocessing_metro | This final process unable to loop the whole pipeline for increasing the performance of the imputation. To do so it filter out the best imputed position and rerun the analysis using this positions. | +| **Validate** | concordance_metro | This mode compare two vcf together to compute a summary of the differences between them.
To do so it use either:
- [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html) concordance process.
- [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/glimpse2/index.html) concordance process
- Or convert the two vcf fill to `.zarr` using [**Scikit allele**](https://scikit-allel.readthedocs.io/en/stable/) and [**anndata**](https://anndata.readthedocs.io/en/latest/) before comparing the SNPs. | +| **Postprocessing** | postprocessing_metro | This final process unable to loop the whole pipeline for increasing the performance of the imputation. To do so it filter out the best imputed position and rerun the analysis using this positions [to be developed]. | ## Pipeline output @@ -99,7 +108,6 @@ We thank the following people for their extensive assistance in the development If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). -For further information or help, don't hesitate to get in touch on the [Slack `#phaseimpute` channel](https://nfcore.slack.com/channels/phaseimpute) (you can join with [this invite](https://nf-co.re/join/slack)). For further information or help, don't hesitate to get in touch on the [Slack `#phaseimpute` channel](https://nfcore.slack.com/channels/phaseimpute) (you can join with [this invite](https://nf-co.re/join/slack)). ## Citations @@ -117,6 +125,28 @@ You can cite one of the main imputation methods ([`QUILT`](https://github.com/rw > > _Nature genetics_ 2021 June 03. doi: [10.1038/s41588-021-00877-0](https://doi.org/10.1038/s41588-021-00877-0) +You can cite one of the main imputation methods ([`GLIMPSE`](https://github.com/odelaneau/GLIMPSE)) as follows: + +> **Efficient phasing and imputation of low-coverage sequencing data using large reference panels.** +> +> Rubinacci, S., Ribeiro, D. M., Hofmeister, R. J., & Delaneau, O. +> +> _Nature genetics_ 2021. doi:[]() + +> **Imputation of low-coverage sequencing data from 150,119 UK Biobank genomes** +> +> Rubinacci, S., Hofmeister, R. J., Sousa da Mota, B., & Delaneau, O. +> +> _Nature genetics_ 2023. doi:[]() + +You can cite one of the main imputation methods ([`STITCH`](https://github.com/rwdavies/STITCH)) as follows: + +> **Rapid genotype imputation from sequence without reference panels.** +> +> Davies, R. W., Flint, J., Myers, S., & Mott, R. +> +> _Nature genetics_ 2016 . doi: [](). + An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: diff --git a/docs/usage.md b/docs/usage.md index b6b4029c..d899cd36 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -125,19 +125,37 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -### Starting the pipeline -Phaseimpute can be started at different points in the analysis by setting the flag `--step` and the available options `[simulate, panelprep, impute, validate, all]`. +### Running the pipeline + +Phaseimpute can be started at different points in the analysis by setting the flag `--step` and the available options `[simulate, panelprep, impute, validate, all]`. You can also run several steps simultaneously by listing the required processes as `--step panelprep,impute` or you can choose to run all steps sequentially by using `--step all`. + +### Start with simulation `--step simulate` + +This step of the pipeline allows to create synthetic low-coverage input files by downsizing high density input data. A typical use case is to obtain low-coverage input data from a sequenced sample. This method is useful for comparing the imputation results to the truth and evaluate the quality of the imputation. You can skip this step if you already have low-pass genome sequencing data. A sample command for this step is: + +```bash +nextflow run nf-core/phaseimpute --input samplesheet.csv --step simulate --depth 1 --outdir results --genome GRCh37 -profile docker +``` +The required flags for this mode are: +- `--step simulate`: The step to run. +- `--input samplesheet.csv`: The samplesheet containing the input sample files in `bam` format. +- `--depth`: The final depth of the file [default: 1]. + +You can find an overview of the results produced by this steps in the [Output](output.md). ### Start with panel preparation `--step panelprep` -There are a few quality control steps that are applied to reference panels. These include actions such as removing multiallelic SNPs and indels, removing certain samples from the reference panel (such as related samples). In addition, chunks are produced which are then used in the imputation steps. The flag `--step panelprep` produces the filtered output files that are used as input in the subsequent imputation steps, such as `--step impute`. +This step pre-processes the reference panel in order to be ready for imputation. There are a few quality control steps that are applied to reference panels. These include actions such as removing multiallelic SNPs and indels and removing certain samples from the reference panel (such as related samples). In addition, chunks are produced which are then used in the imputation steps. It is recommended that this step is run once and the produced files are saved, to minimize the cost of reading the reference panel each time. Then, the output files from `--step panelprep` can be used as input in the subsequent imputation steps, such as `--step impute`. -For starting from panel preparation, the required flags are `--step panelprep` and`--panel samplesheet_reference.csv`. +For starting from panel preparation, the required flags are `--step panelprep` and `--panel samplesheet_reference.csv`. ```bash nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_reference.csv --step panelprep --outdir results --genome GRCh37 -profile docker ``` -### Start from imputation tools `--step impute` + +You can find an overview of the results produced by this steps in the [Output](output.md). + +### Start with imputation `--step impute` For starting from the imputation step, the required flags are: - `--step impute` @@ -145,6 +163,8 @@ For starting from the imputation step, the required flags are: - `--panel samplesheet_reference.csv`: The files in `samplesheet_reference.csv` are the filtered, quality controlled, bi-allelic VCFs obtained from `--step panelprep`. - `--tools [glimpse1, quilt, stitch]`: A selection of one or more of the available imputation tools. Each imputation tool has their own set of specific flags and input files. These are produced by `--step panelprep`. +You can find an overview of the results produced by this steps in the [Output](output.md). + ### Imputation tools `--step impute --tools [glimpse1, quilt, stitch]` You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included. @@ -197,6 +217,9 @@ chr22 16570211 T C nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_reference.csv --step impute --tool glimpse1 --outdir results --genome GRCh37 -profile docker ``` +### Start with validation `--step validate` +This step compares a _truth_ VCF to an _imputed_ VCF in order to compute imputation accuracy. + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: From d56016add8d46ca7a9fd0c783616459e5a4656a0 Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 9 May 2024 17:55:21 +0000 Subject: [PATCH 10/17] add glimpse2 chunk and arguments --- conf/test_glimpse2.config | 33 +++++++++++++ nextflow.config | 19 +++++--- nextflow_schema.json | 19 ++++++++ .../vcf_chunk_glimpse/vcf_chunk_glimpse.nf | 12 +++-- .../local/vcf_impute_glimpse2/main.nf | 46 +++++++++++++++++++ workflows/phaseimpute/main.nf | 22 +++++++-- 6 files changed, 138 insertions(+), 13 deletions(-) create mode 100644 conf/test_glimpse2.config create mode 100644 subworkflows/local/vcf_impute_glimpse2/main.nf diff --git a/conf/test_glimpse2.config b/conf/test_glimpse2.config new file mode 100644 index 00000000..d6823a6d --- /dev/null +++ b/conf/test_glimpse2.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/phaseimpute -profile test_glimpse2, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function with GLIMPSE2' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '2.GB' + max_time = '1.h' + + // Input data + input = "${projectDir}/tests/csv/sample_bam.csv" + + // Genome references + fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/reference_genome/21_22/hs38DH.chr21_22.fa" + panel = "${projectDir}/tests/csv/panel.csv" + phased = true + + // Impute parameters + step = "panelprep,impute" + tools = "glimpse2" +} diff --git a/nextflow.config b/nextflow.config index 00b2e66c..d7e39671 100644 --- a/nextflow.config +++ b/nextflow.config @@ -57,6 +57,10 @@ params { seed = 1 posfile = null + // GLIMPSE2 + binaryref = null + chunks = null + // Boilerplate options outdir = null publish_dir_mode = 'copy' @@ -205,13 +209,14 @@ profiles { executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } - test_sim { includeConfig 'conf/test_sim.config' } - test_validate { includeConfig 'conf/test_validate.config' } - test_all { includeConfig 'conf/test_all.config' } - test_quilt { includeConfig 'conf/test_quilt.config' } - test_stitch { includeConfig 'conf/test_stitch.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_sim { includeConfig 'conf/test_sim.config' } + test_validate { includeConfig 'conf/test_validate.config' } + test_all { includeConfig 'conf/test_all.config' } + test_quilt { includeConfig 'conf/test_quilt.config' } + test_stitch { includeConfig 'conf/test_stitch.config' } + test_glimpse2 { includeConfig 'conf/test_glimpse2.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index 528ff12f..f335be02 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -112,6 +112,22 @@ "description": "Is the reference panel phased", "type": "boolean", "pattern": "true|false" + }, + "binaryref": { + "type": "string", + "description": "Whether to generate a binary reference file to be used with GLIMPSE2" + } + } + }, + "imputation_options": { + "title": "Imputation options", + "type": "object", + "description": "Arguments for the imputation step", + "default": "", + "properties": { + "chunks": { + "type": "string", + "description": "Path to comma-separated file containing tab-separated files with the genomic chunks to be used for imputation." } } }, @@ -451,6 +467,9 @@ { "$ref": "#/definitions/panelprep" }, + { + "$ref": "#/definitions/imputation_options" + }, { "$ref": "#/definitions/validation" }, diff --git a/subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse.nf b/subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse.nf index 1703d96b..cc16b9b0 100644 --- a/subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse.nf +++ b/subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse.nf @@ -20,7 +20,7 @@ workflow VCF_CHUNK_GLIMPSE { ch_versions = ch_versions.mix(GLIMPSE_CHUNK.out.versions) // Rearrange chunks into channel for QUILT - ch_chunks_glimpse1 = GLIMPSE_CHUNK.out.chunk_chr + ch_chunks_quilt = GLIMPSE_CHUNK.out.chunk_chr .splitText() .map { metamap, line -> def fields = line.split("\t") @@ -28,6 +28,11 @@ workflow VCF_CHUNK_GLIMPSE { [metamap, metamap.chr, startEnd[0], startEnd[1]] } + // Rearrange chunks into channel for GLIMPSE1 + ch_chunks_glimpse1 = GLIMPSE_CHUNK.out.chunk_chr + .splitCsv(header: ['ID', 'Chr', 'RegionIn', 'RegionOut', 'Size1', 'Size2'], sep: "\t", skip: 0) + .map { meta, it -> [meta, it["RegionIn"], it["RegionOut"]]} + // Make chunks with Glimpse2 (does not work with "sequential" mode) chunk_model = "recursive" @@ -59,8 +64,9 @@ workflow VCF_CHUNK_GLIMPSE { } emit: - chunks = ch_chunks_glimpse1 // channel: [ chr, val(meta), start, end, number ] - chunks_glimpse2 = ch_chunks_glimpse2 + chunks_quilt = ch_chunks_quilt // channel: [ val(meta), chr, start, end ] + chunks_glimpse1 = ch_chunks_glimpse1 // channel: [ val(meta), chr, region1, region2 ] + chunks_glimpse2 = ch_chunks_glimpse2 // channel: [ val(meta), chr, region1, region2 ] binary = ch_bins // channel: [ [meta], bin] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/vcf_impute_glimpse2/main.nf b/subworkflows/local/vcf_impute_glimpse2/main.nf new file mode 100644 index 00000000..32a20972 --- /dev/null +++ b/subworkflows/local/vcf_impute_glimpse2/main.nf @@ -0,0 +1,46 @@ +include { GLIMPSE2_PHASE } from '../../../modules/nf-core/glimpse2/phase/main' +include { GLIMPSE2_LIGATE } from '../../../modules/nf-core/glimpse2/ligate/main' +include { BCFTOOLS_INDEX as INDEX_PHASE } from '../../../modules/nf-core/bcftools/index/main.nf' +include { BCFTOOLS_INDEX as INDEX_LIGATE } from '../../../modules/nf-core/bcftools/index/main.nf' + +workflow VCF_IMPUTE_GLIMPSE2 { + + take: + ch_input // channel (mandatory): [ meta, vcf, csi, infos ] + ch_panel // channel (mandatory): [ meta, vcf, csi, region ] + ch_chunks // channel (optional): [ meta, region1, region2 ] + ch_fasta + + main: + + ch_versions = Channel.empty() + + // Impute with Glimpse2 without using binary files + def samples_file = [[]] + def gmap = [[]] + + // Create input channel to impute with Glimpse2 + // Add chr as key to input + ch_input = ch_input.map{meta, bam, bai -> return[['chr': meta.chr], meta, bam, bai]} + + // Join chunks and panel + ch_chunks_panel = ch_chunks.join(ch_panel) + + // Change key:value names + ch_chunks_panel = ch_chunks_panel.map{meta, vcf, csi, region1, region2 -> return[['id': meta.panel, 'chr': meta.chr], vcf, csi, region1, region2]} + + // Add chr as key + ch_chunks_panel = ch_chunks_panel.map{meta, vcf, csi, region1, region2 -> return[['chr': meta.chr], vcf, csi, region1, region2]} + + // Join input and chunks reference + ch_input_glimpse2 = ch_input.map { it + samples_file }.join(ch_chunks_panel).map { it + gmap } + + // Remove chr key + ch_input_glimpse2 = ch_input_glimpse2.map{ it[1..-1] } + + //Impute with Glimpse2 + GLIMPSE2_PHASE(ch_input_glimpse2, ch_fasta) // Error: AC/AN INFO fields in VCF are inconsistent with GT field, update the values in the VCF + + emit: + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 269473e9..a8674351 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -34,6 +34,7 @@ include { VCF_IMPUTE_GLIMPSE as VCF_IMPUTE_GLIMPSE1 } from '../../subworkflows/ include { COMPUTE_GL as GL_TRUTH } from '../../subworkflows/local/compute_gl' include { COMPUTE_GL as GL_INPUT } from '../../subworkflows/local/compute_gl' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_GLIMPSE1} from '../../subworkflows/local/vcf_concatenate_bcftools' +include { VCF_IMPUTE_GLIMPSE2 } from '../../subworkflows/local/vcf_impute_glimpse2' // QUILT subworkflows include { VCF_CHUNK_GLIMPSE } from '../../subworkflows/local/vcf_chunk_glimpse/vcf_chunk_glimpse' @@ -215,8 +216,23 @@ workflow PHASEIMPUTE { } if (params.tools.split(',').contains("glimpse2")) { - error "Glimpse2 not yet implemented" - // Glimpse2 subworkflow + //error "Glimpse2 not yet implemented" + + // Use previous chunks if --step panelprep + if (params.panel && params.step.split(',').contains("panelprep") && !params.chunks) { + ch_chunks = VCF_CHUNK_GLIMPSE.out.chunks_glimpse1 + + VCF_IMPUTE_GLIMPSE2(ch_input_impute, + ch_panel_phased, + ch_chunks, + ch_fasta) + } else if (params.chunks) { + // use provided chunks + } else { + error "Either no reference panel was included or you did not set step --panelprep or you did not provide --chunks" + } + + } if (params.tools.split(',').contains("stitch")) { @@ -256,7 +272,7 @@ workflow PHASEIMPUTE { print("Impute with QUILT") // Impute BAMs with QUILT - BAM_IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, VCF_CHUNK_GLIMPSE.out.chunks) + BAM_IMPUTE_QUILT(VCF_NORMALIZE_BCFTOOLS.out.hap_legend, ch_input_impute, VCF_CHUNK_GLIMPSE.out.chunks_quilt) ch_versions = ch_versions.mix(BAM_IMPUTE_QUILT.out.versions) // Add to output channel From d72148809abe92ae2c87534ffc6eb7435d3f930e Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Thu, 9 May 2024 18:21:04 +0000 Subject: [PATCH 11/17] add changelog info --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fc04b00..12d9de85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co - [#22](https://github.com/nf-core/phaseimpute/pull/20) - Add validation step for concordance analysis. Input channels changed to match inputs steps. Outdir folder organised by steps. Modules config by subworkflows. - [#26](https://github.com/nf-core/phaseimpute/pull/26) - Added QUILT method - [#40](https://github.com/nf-core/phaseimpute/pull/26) - Add STITCH method. Reorganize panelprep subworkflows. +- [](). Add possibility to remove samples from reference panel. Add glimpse2 chunking method. Add full-size test parameters. ### `Fixed` From 6f47934bf3e6641b6c9e90bd4d7a3441dea1f37b Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 10 May 2024 14:00:58 +0000 Subject: [PATCH 12/17] replace nf-validation@1.3 by nf-schema@2.0.0 --- assets/schema_input.json | 2 +- assets/schema_input_panel.json | 2 +- assets/schema_input_region.json | 2 +- assets/schema_map.json | 2 +- assets/schema_posfile.json | 2 +- nextflow.config | 16 +++++---- nextflow_schema.json | 26 +++++++------- .../utils_nfcore_phaseimpute_pipeline/main.nf | 34 ++++++++----------- .../nf-core/utils_nfvalidation_plugin/main.nf | 4 +-- .../tests/nextflow_schema.json | 2 +- workflows/phaseimpute/main.nf | 2 +- 11 files changed, 45 insertions(+), 49 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 971c3fb3..f025049b 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_input.json", "title": "nf-core/phaseimpute pipeline - params.input", "description": "Schema for the file provided with params.input", diff --git a/assets/schema_input_panel.json b/assets/schema_input_panel.json index 242a4136..7fb1e534 100644 --- a/assets/schema_input_panel.json +++ b/assets/schema_input_panel.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_input_panel.json", "title": "nf-core/phaseimpute pipeline - params.panel schema", "description": "Schema for the file provided with params.panel", diff --git a/assets/schema_input_region.json b/assets/schema_input_region.json index 5592aea9..81d7bda3 100644 --- a/assets/schema_input_region.json +++ b/assets/schema_input_region.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_input_region.json", "title": "nf-core/phaseimpute pipeline - params.input_region schema", "description": "Schema for the file provided with params.input_region", diff --git a/assets/schema_map.json b/assets/schema_map.json index 4b981006..0210e012 100644 --- a/assets/schema_map.json +++ b/assets/schema_map.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_map.json", "title": "nf-core/phaseimpute pipeline - params.map schema", "description": "Schema for the file provided with params.map", diff --git a/assets/schema_posfile.json b/assets/schema_posfile.json index 5a3c9d5a..bfa6114f 100644 --- a/assets/schema_posfile.json +++ b/assets/schema_posfile.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_posfile.json", "title": "nf-core/phaseimpute pipeline - params.posfile schema", "description": "Schema for the file provided with params.posfile", diff --git a/nextflow.config b/nextflow.config index d7e39671..5a589ed5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -87,11 +87,7 @@ params { max_time = '240.h' // Schema validation default options - validationFailUnrecognisedParams = false - validationLenientMode = false - validationSchemaIgnoreParams = 'genomes,igenomes_base' - validationShowHiddenParams = false - validate_params = true + validate_params = true } @@ -228,9 +224,15 @@ docker.registry = 'quay.io' podman.registry = 'quay.io' singularity.registry = 'quay.io' -// Nextflow plugins plugins { - id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-schema@2.0.0' +} + +validation { + failUnrecognisedParams = false + lenientMode = true + defaultIgnoreParams = [ 'genomes', 'igenomes_base' ] + showHiddenParams = false } // Load igenomes.config if required diff --git a/nextflow_schema.json b/nextflow_schema.json index f335be02..86f012c7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,10 +1,10 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/nextflow_schema.json", "title": "nf-core/phaseimpute pipeline parameters", "description": "Phasing and imputation pipeline", "type": "object", - "definitions": { + "defs": { "input_output_options": { "title": "Input/output options", "type": "object", @@ -459,37 +459,37 @@ }, "allOf": [ { - "$ref": "#/definitions/input_output_options" + "$ref": "#/defs/input_output_options" }, { - "$ref": "#/definitions/simulate" + "$ref": "#/defs/simulate" }, { - "$ref": "#/definitions/panelprep" + "$ref": "#/defs/panelprep" }, { - "$ref": "#/definitions/imputation_options" + "$ref": "#/defs/imputation_options" }, { - "$ref": "#/definitions/validation" + "$ref": "#/defs/validation" }, { - "$ref": "#/definitions/reference_genome_options" + "$ref": "#/defs/reference_genome_options" }, { - "$ref": "#/definitions/institutional_config_options" + "$ref": "#/defs/institutional_config_options" }, { - "$ref": "#/definitions/max_job_request_options" + "$ref": "#/defs/max_job_request_options" }, { - "$ref": "#/definitions/generic_options" + "$ref": "#/defs/generic_options" }, { - "$ref": "#/definitions/quilt_parameters" + "$ref": "#/defs/quilt_parameters" }, { - "$ref": "#/definitions/stitch_parameters" + "$ref": "#/defs/stitch_parameters" } ] } diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf index 28fada14..74412ec7 100644 --- a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf @@ -9,8 +9,8 @@ */ include { UTILS_NFVALIDATION_PLUGIN } from '../../nf-core/utils_nfvalidation_plugin' -include { paramsSummaryMap } from 'plugin/nf-validation' -include { fromSamplesheet } from 'plugin/nf-validation' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { samplesheetToList } from 'plugin/nf-schema' include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' @@ -108,12 +108,9 @@ workflow PIPELINE_INITIALISATION { // // Create channel from input file provided through params.input // - ch_input = Channel - .fromSamplesheet("input") - .map { - meta, file, index -> - [ meta, file, index ] - } + + ch_input = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) + .map { meta, file, index ->[ meta, file, index ] } // Check if all extension are identical getAllFilesExtension(ch_input) @@ -122,8 +119,7 @@ workflow PIPELINE_INITIALISATION { // if (params.input_truth) { if (params.input_truth.endsWith("csv")) { - ch_input_truth = Channel - .fromSamplesheet("input_truth") + ch_input_truth = Channel.fromList(samplesheetToList(params.input_truth, "assets/schema_input.json")) .map { meta, file, index -> [ meta, file, index ] @@ -144,7 +140,7 @@ workflow PIPELINE_INITIALISATION { if (params.panel) { if (params.panel.endsWith("csv")) { print("Panel file provided as input is a samplesheet") - ch_panel = Channel.fromSamplesheet("panel") + ch_panel = Channel.fromList(samplesheetToList(params.panel, "assets/schema_input_panel.json")) } else { // #TODO Wait for `oneOf()` to be supported in the nextflow_schema.json error "Panel file provided is of another format than CSV (not yet supported). Please separate your panel by chromosome and use the samplesheet format." @@ -167,9 +163,10 @@ workflow PIPELINE_INITIALISATION { ch_regions = GET_REGION.out.regions } else if (params.input_region.endsWith(".csv")) { println "Region file provided as input is a csv file" - ch_regions = Channel.fromSamplesheet("input_region") + ch_regions =Channel.fromList(samplesheetToList(params.input_region, "assets/schema_input_region.json")) .map{ chr, start, end -> [["chr": chr], chr + ":" + start + "-" + end]} .map{ metaC, region -> [metaC + ["region": region], region]} + } else { error "Region file provided is of another format than CSV (not yet supported). Please separate your reference genome by chromosome and use the samplesheet format." } @@ -180,7 +177,7 @@ workflow PIPELINE_INITIALISATION { if (params.map) { if (params.map.endsWith(".csv")) { print("Map file provided as input is a samplesheet") - ch_map = Channel.fromSamplesheet("map") + ch_map = Channel.fromList(samplesheetToList(params.map, "assets/schema_map.json")) } else { error "Map file provided is of another format than CSV (not yet supported). Please separate your reference genome by chromosome and use the samplesheet format." } @@ -212,13 +209,12 @@ workflow PIPELINE_INITIALISATION { // if (params.posfile) { - ch_posfile = Channel - .fromSamplesheet("posfile") - .map { - meta, file -> - [ meta, file ] + ch_posfile =Channel.fromList(samplesheetToList(params.posfile, "assets/schema_posfile.json")) + .map { + meta, file -> + [ meta, file ] }} else { - ch_posfile = [[]] + ch_posfile = [[]] } emit: diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf b/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf index 2585b65d..74f94a86 100644 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf @@ -8,9 +8,7 @@ ======================================================================================== */ -include { paramsHelp } from 'plugin/nf-validation' -include { paramsSummaryLog } from 'plugin/nf-validation' -include { validateParameters } from 'plugin/nf-validation' +include { validateParameters; paramsHelp; paramsSummaryLog } from 'plugin/nf-schema' /* ======================================================================================== diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json b/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json index 7626c1c9..e464a111 100644 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json +++ b/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json @@ -1,5 +1,5 @@ { - "$schema": "http://json-schema.org/draft-07/schema", + "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", "title": ". pipeline parameters", "description": "", diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index a8674351..8c6fe2e2 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -8,7 +8,7 @@ // MODULE: Installed directly from nf-core/modules // include { MULTIQC } from '../../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../../subworkflows/local/utils_nfcore_phaseimpute_pipeline' From 3071eb6e05140884a594f5bbcbbd79458146111b Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 10 May 2024 15:19:01 +0000 Subject: [PATCH 13/17] small modification to make quilt run again --- conf/steps/panel_prep.config | 2 +- subworkflows/local/bam_impute_quilt/bam_impute_quilt.nf | 3 +++ subworkflows/local/vcf_normalize_bcftools/meta.yml | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/conf/steps/panel_prep.config b/conf/steps/panel_prep.config index 1f3bf2be..c5e0d944 100644 --- a/conf/steps/panel_prep.config +++ b/conf/steps/panel_prep.config @@ -35,7 +35,7 @@ process { } withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_NORMALIZE_BCFTOOLS:BCFTOOLS_NORM' { - ext.args = '-m -any --no-version --output-type z' + ext.args = '-m +any --no-version --output-type z' ext.prefix = { "${meta.id}_${meta.chr}_multiallelic" } maxRetries = 2 publishDir = [ enabled: false ] diff --git a/subworkflows/local/bam_impute_quilt/bam_impute_quilt.nf b/subworkflows/local/bam_impute_quilt/bam_impute_quilt.nf index 9935cf35..3e7cf8f9 100644 --- a/subworkflows/local/bam_impute_quilt/bam_impute_quilt.nf +++ b/subworkflows/local/bam_impute_quilt/bam_impute_quilt.nf @@ -14,6 +14,7 @@ workflow BAM_IMPUTE_QUILT { main: + ch_versions = Channel.empty() posfile = [] @@ -25,6 +26,8 @@ workflow BAM_IMPUTE_QUILT { ngen = params.ngen buffer = params.buffer + // Rename panel to id + ch_chunks = ch_chunks.map{meta, chr, start, end -> return[['id': meta.panel, 'chr': meta.chr], chr, start, end]} if (genetic_map_file.isEmpty()) { ch_hap_chunks = ch_hap_legend.combine(ch_chunks, by:0).map { it + ngen + buffer + [[]] } diff --git a/subworkflows/local/vcf_normalize_bcftools/meta.yml b/subworkflows/local/vcf_normalize_bcftools/meta.yml index 8cf0a2f4..30d793f9 100644 --- a/subworkflows/local/vcf_normalize_bcftools/meta.yml +++ b/subworkflows/local/vcf_normalize_bcftools/meta.yml @@ -1,6 +1,6 @@ name: "vcf_normalize_bcftools" -description: Normalize VCF files (bcftools norm -m -any). - Keep only SNPs and remove multiallelic records (bcftools view -m 2 -M 2 -v snps). +description: Normalize VCF files (bcftools norm -m +any). Combine records spanning different lines into a single line. + Keep only biallelic SNPs and remove multiallelic records (bcftools view -m 2 -M 2 -v snps). Convert to hap/legend format (bcftools convert --haplegendsample). Optionally, remove samples from the reference panel (bcftools view -s ^SAMPLENAME). keywords: From 3ea9975424781757c9062f24f52b9462d068a2cb Mon Sep 17 00:00:00 2001 From: Anabella Trigila <18577080+atrigila@users.noreply.github.com> Date: Fri, 10 May 2024 16:15:27 +0000 Subject: [PATCH 14/17] run pre-commit --- README.md | 16 ++++++++-------- docs/output.md | 7 ++++--- docs/usage.md | 4 ++++ nextflow.config | 2 +- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 148130d8..faa0fea3 100644 --- a/README.md +++ b/README.md @@ -79,13 +79,13 @@ For more details and further functionality, please refer to the [usage documenta Here is a short description of the different steps of the pipeline. For more information please refer to the [documentation](https://nf-core.github.io/phaseimpute/usage/). -| Step | Flow chart | Description | -| ------------------ | ---------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **Panel prep** | phase_metro | The preprocessing mode is responsible to the preparation of the multiple input file that will be used by the phasing process.
The main processes are :
- **Haplotypes phasing** of the reference panel using [**Shapeit5**](https://odelaneau.github.io/shapeit5/).
- **Filter** the reference panel to select only the necessary variants.
- **Chunking the reference panel** in a subset of region for all the chromosomes.
- **Extract** the positions where to perform the imputation. | -| **Impute** | phase_metro | The imputation mode is the core mode of this pipeline.
It is constituted of 3 main steps:
- **Phasing**: Phasing of the target dataset on the reference panel using either:
  - [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html)
  It's come with the necessety to compute the genotype likelihoods of the target dataset.
  This step is done using [BCFTOOLS_mpileup](https://samtools.github.io/bcftools/bcftools.html#mpileup)
  - [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/glimpse2/index.html) For this step the reference panel is transformed to binary chunks.
  - [**Stitch**](https://github.com/rwdavies/stitch)
  - [**Quilt**](https://github.com/rwdavies/QUILT)
- **Ligation**: all the different chunks are merged together.
- **Sampling** (optional) | -| **Simulate** | simulate_metro | The simulation mode is used to create artificial low informative genetic information from high density data. This allow to compare the imputed result to a _truth_ and therefore evaluate the quality of the imputation.
For the moment it is possible to simulate:
- Low-pass data by **downsample** BAM or CRAM using [SAMTOOLS_view -s]() at different depth
- Genotype data by **SNP selecting** the position used by a designated SNP chip.
The simulation mode will also compute the **Genotype likelihoods** of the high density data. | -| **Validate** | concordance_metro | This mode compare two vcf together to compute a summary of the differences between them.
To do so it use either:
- [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html) concordance process.
- [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/glimpse2/index.html) concordance process
- Or convert the two vcf fill to `.zarr` using [**Scikit allele**](https://scikit-allel.readthedocs.io/en/stable/) and [**anndata**](https://anndata.readthedocs.io/en/latest/) before comparing the SNPs. | -| **Postprocessing** | postprocessing_metro | This final process unable to loop the whole pipeline for increasing the performance of the imputation. To do so it filter out the best imputed position and rerun the analysis using this positions [to be developed]. | +| Step | Flow chart | Description | +| ------------------ | ---------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Panel prep** | phase_metro | The preprocessing mode is responsible to the preparation of the multiple input file that will be used by the phasing process.
The main processes are :
- **Haplotypes phasing** of the reference panel using [**Shapeit5**](https://odelaneau.github.io/shapeit5/).
- **Filter** the reference panel to select only the necessary variants.
- **Chunking the reference panel** in a subset of region for all the chromosomes.
- **Extract** the positions where to perform the imputation. | +| **Impute** | phase_metro | The imputation mode is the core mode of this pipeline.
It is constituted of 3 main steps:
- **Phasing**: Phasing of the target dataset on the reference panel using either:
  - [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html)
  It's come with the necessety to compute the genotype likelihoods of the target dataset.
  This step is done using [BCFTOOLS_mpileup](https://samtools.github.io/bcftools/bcftools.html#mpileup)
  - [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/glimpse2/index.html) For this step the reference panel is transformed to binary chunks.
  - [**Stitch**](https://github.com/rwdavies/stitch)
  - [**Quilt**](https://github.com/rwdavies/QUILT)
- **Ligation**: all the different chunks are merged together.
- **Sampling** (optional) | +| **Simulate** | simulate_metro | The simulation mode is used to create artificial low informative genetic information from high density data. This allow to compare the imputed result to a _truth_ and therefore evaluate the quality of the imputation.
For the moment it is possible to simulate:
- Low-pass data by **downsample** BAM or CRAM using [SAMTOOLS_view -s]() at different depth
- Genotype data by **SNP selecting** the position used by a designated SNP chip.
The simulation mode will also compute the **Genotype likelihoods** of the high density data. | +| **Validate** | concordance_metro | This mode compare two vcf together to compute a summary of the differences between them.
To do so it use either:
- [**Glimpse1**](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html) concordance process.
- [**Glimpse2**](https://odelaneau.github.io/GLIMPSE/glimpse2/index.html) concordance process
- Or convert the two vcf fill to `.zarr` using [**Scikit allele**](https://scikit-allel.readthedocs.io/en/stable/) and [**anndata**](https://anndata.readthedocs.io/en/latest/) before comparing the SNPs. | +| **Postprocessing** | postprocessing_metro | This final process unable to loop the whole pipeline for increasing the performance of the imputation. To do so it filter out the best imputed position and rerun the analysis using this positions [to be developed]. | ## Pipeline output @@ -131,7 +131,7 @@ You can cite one of the main imputation methods ([`GLIMPSE`](https://github.com/ > > Rubinacci, S., Ribeiro, D. M., Hofmeister, R. J., & Delaneau, O. > -> _Nature genetics_ 2021. doi:[]() +> _Nature genetics_ 2021. doi:[]() > **Imputation of low-coverage sequencing data from 150,119 UK Biobank genomes** > diff --git a/docs/output.md b/docs/output.md index 9f85428e..20b46c2d 100644 --- a/docs/output.md +++ b/docs/output.md @@ -11,6 +11,7 @@ The directories listed below will be created in the results directory after the ## Pipeline overview ## Panel preparation outputs `--step panelprep` + This step of the pipeline performs a QC of the reference panel data and produces the necessary files for imputation (`--step impute`). It has two optional modes: reference panel phasing with SHAPEIT5 and removal of specified samples from reference panel. - [Remove Multiallelics](#multiallelics) - Remove multiallelic sites from the reference panel @@ -32,14 +33,14 @@ This step of the pipeline performs a QC of the reference panel data and produces - `prep_panel/posfile/` - `*.hap`: a .txt file with the list of position to genotype. -[bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces tab-delimited files per chromosome that can be gathered into a samplesheet and directly submitted for imputation with `--tools stitch` using the parameter `--posfile`. +[bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces tab-delimited files per chromosome that can be gathered into a samplesheet and directly submitted for imputation with `--tools stitch` using the parameter `--posfile`. ### Sites - `prep_panel/sites/` - `vcf/` - - `*.vcf.gz`: VCF with biallelic SNPs only. - - `*.csi`: Index file for VCF. + - `*.vcf.gz`: VCF with biallelic SNPs only. + - `*.csi`: Index file for VCF. - `tsv/` - `*.txt.gz`: TXT file for biallelic SNPs. - `*.tbi`: Index file for TSV. diff --git a/docs/usage.md b/docs/usage.md index d899cd36..5a0c61e3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -136,7 +136,9 @@ This step of the pipeline allows to create synthetic low-coverage input files by ```bash nextflow run nf-core/phaseimpute --input samplesheet.csv --step simulate --depth 1 --outdir results --genome GRCh37 -profile docker ``` + The required flags for this mode are: + - `--step simulate`: The step to run. - `--input samplesheet.csv`: The samplesheet containing the input sample files in `bam` format. - `--depth`: The final depth of the file [default: 1]. @@ -158,6 +160,7 @@ You can find an overview of the results produced by this steps in the [Output](o ### Start with imputation `--step impute` For starting from the imputation step, the required flags are: + - `--step impute` - `--input input.csv`: The samplesheet containing the input sample files in `bam` format. - `--panel samplesheet_reference.csv`: The files in `samplesheet_reference.csv` are the filtered, quality controlled, bi-allelic VCFs obtained from `--step panelprep`. @@ -218,6 +221,7 @@ nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_ref ``` ### Start with validation `--step validate` + This step compares a _truth_ VCF to an _imputed_ VCF in order to compute imputation accuracy. ### Updating the pipeline diff --git a/nextflow.config b/nextflow.config index 5a589ed5..b2085b5e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -225,7 +225,7 @@ podman.registry = 'quay.io' singularity.registry = 'quay.io' plugins { - id 'nf-schema@2.0.0' + id 'nf-schema@2.0.0' } validation { From 8e0bfa56636930c4b12dfe43ade56e743d902d97 Mon Sep 17 00:00:00 2001 From: atrigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 13 May 2024 13:38:22 +0000 Subject: [PATCH 15/17] Revert "replace nf-validation@1.3 by nf-schema@2.0.0" This reverts commit 6f47934bf3e6641b6c9e90bd4d7a3441dea1f37b. --- assets/schema_input.json | 2 +- assets/schema_input_panel.json | 2 +- assets/schema_input_region.json | 2 +- assets/schema_map.json | 2 +- assets/schema_posfile.json | 2 +- nextflow.config | 16 ++++----- nextflow_schema.json | 26 +++++++------- .../utils_nfcore_phaseimpute_pipeline/main.nf | 34 +++++++++++-------- .../nf-core/utils_nfvalidation_plugin/main.nf | 4 ++- .../tests/nextflow_schema.json | 2 +- workflows/phaseimpute/main.nf | 2 +- 11 files changed, 49 insertions(+), 45 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index f025049b..971c3fb3 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,5 +1,5 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_input.json", "title": "nf-core/phaseimpute pipeline - params.input", "description": "Schema for the file provided with params.input", diff --git a/assets/schema_input_panel.json b/assets/schema_input_panel.json index 7fb1e534..242a4136 100644 --- a/assets/schema_input_panel.json +++ b/assets/schema_input_panel.json @@ -1,5 +1,5 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_input_panel.json", "title": "nf-core/phaseimpute pipeline - params.panel schema", "description": "Schema for the file provided with params.panel", diff --git a/assets/schema_input_region.json b/assets/schema_input_region.json index 81d7bda3..5592aea9 100644 --- a/assets/schema_input_region.json +++ b/assets/schema_input_region.json @@ -1,5 +1,5 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_input_region.json", "title": "nf-core/phaseimpute pipeline - params.input_region schema", "description": "Schema for the file provided with params.input_region", diff --git a/assets/schema_map.json b/assets/schema_map.json index 0210e012..4b981006 100644 --- a/assets/schema_map.json +++ b/assets/schema_map.json @@ -1,5 +1,5 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_map.json", "title": "nf-core/phaseimpute pipeline - params.map schema", "description": "Schema for the file provided with params.map", diff --git a/assets/schema_posfile.json b/assets/schema_posfile.json index bfa6114f..5a3c9d5a 100644 --- a/assets/schema_posfile.json +++ b/assets/schema_posfile.json @@ -1,5 +1,5 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/assets/schema_posfile.json", "title": "nf-core/phaseimpute pipeline - params.posfile schema", "description": "Schema for the file provided with params.posfile", diff --git a/nextflow.config b/nextflow.config index b2085b5e..d7e39671 100644 --- a/nextflow.config +++ b/nextflow.config @@ -87,7 +87,11 @@ params { max_time = '240.h' // Schema validation default options - validate_params = true + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = true } @@ -224,15 +228,9 @@ docker.registry = 'quay.io' podman.registry = 'quay.io' singularity.registry = 'quay.io' +// Nextflow plugins plugins { - id 'nf-schema@2.0.0' -} - -validation { - failUnrecognisedParams = false - lenientMode = true - defaultIgnoreParams = [ 'genomes', 'igenomes_base' ] - showHiddenParams = false + id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet } // Load igenomes.config if required diff --git a/nextflow_schema.json b/nextflow_schema.json index 86f012c7..f335be02 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,10 +1,10 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/phaseimpute/master/nextflow_schema.json", "title": "nf-core/phaseimpute pipeline parameters", "description": "Phasing and imputation pipeline", "type": "object", - "defs": { + "definitions": { "input_output_options": { "title": "Input/output options", "type": "object", @@ -459,37 +459,37 @@ }, "allOf": [ { - "$ref": "#/defs/input_output_options" + "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/defs/simulate" + "$ref": "#/definitions/simulate" }, { - "$ref": "#/defs/panelprep" + "$ref": "#/definitions/panelprep" }, { - "$ref": "#/defs/imputation_options" + "$ref": "#/definitions/imputation_options" }, { - "$ref": "#/defs/validation" + "$ref": "#/definitions/validation" }, { - "$ref": "#/defs/reference_genome_options" + "$ref": "#/definitions/reference_genome_options" }, { - "$ref": "#/defs/institutional_config_options" + "$ref": "#/definitions/institutional_config_options" }, { - "$ref": "#/defs/max_job_request_options" + "$ref": "#/definitions/max_job_request_options" }, { - "$ref": "#/defs/generic_options" + "$ref": "#/definitions/generic_options" }, { - "$ref": "#/defs/quilt_parameters" + "$ref": "#/definitions/quilt_parameters" }, { - "$ref": "#/defs/stitch_parameters" + "$ref": "#/definitions/stitch_parameters" } ] } diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf index 74412ec7..28fada14 100644 --- a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf @@ -9,8 +9,8 @@ */ include { UTILS_NFVALIDATION_PLUGIN } from '../../nf-core/utils_nfvalidation_plugin' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { samplesheetToList } from 'plugin/nf-schema' +include { paramsSummaryMap } from 'plugin/nf-validation' +include { fromSamplesheet } from 'plugin/nf-validation' include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' @@ -108,9 +108,12 @@ workflow PIPELINE_INITIALISATION { // // Create channel from input file provided through params.input // - - ch_input = Channel.fromList(samplesheetToList(params.input, "assets/schema_input.json")) - .map { meta, file, index ->[ meta, file, index ] } + ch_input = Channel + .fromSamplesheet("input") + .map { + meta, file, index -> + [ meta, file, index ] + } // Check if all extension are identical getAllFilesExtension(ch_input) @@ -119,7 +122,8 @@ workflow PIPELINE_INITIALISATION { // if (params.input_truth) { if (params.input_truth.endsWith("csv")) { - ch_input_truth = Channel.fromList(samplesheetToList(params.input_truth, "assets/schema_input.json")) + ch_input_truth = Channel + .fromSamplesheet("input_truth") .map { meta, file, index -> [ meta, file, index ] @@ -140,7 +144,7 @@ workflow PIPELINE_INITIALISATION { if (params.panel) { if (params.panel.endsWith("csv")) { print("Panel file provided as input is a samplesheet") - ch_panel = Channel.fromList(samplesheetToList(params.panel, "assets/schema_input_panel.json")) + ch_panel = Channel.fromSamplesheet("panel") } else { // #TODO Wait for `oneOf()` to be supported in the nextflow_schema.json error "Panel file provided is of another format than CSV (not yet supported). Please separate your panel by chromosome and use the samplesheet format." @@ -163,10 +167,9 @@ workflow PIPELINE_INITIALISATION { ch_regions = GET_REGION.out.regions } else if (params.input_region.endsWith(".csv")) { println "Region file provided as input is a csv file" - ch_regions =Channel.fromList(samplesheetToList(params.input_region, "assets/schema_input_region.json")) + ch_regions = Channel.fromSamplesheet("input_region") .map{ chr, start, end -> [["chr": chr], chr + ":" + start + "-" + end]} .map{ metaC, region -> [metaC + ["region": region], region]} - } else { error "Region file provided is of another format than CSV (not yet supported). Please separate your reference genome by chromosome and use the samplesheet format." } @@ -177,7 +180,7 @@ workflow PIPELINE_INITIALISATION { if (params.map) { if (params.map.endsWith(".csv")) { print("Map file provided as input is a samplesheet") - ch_map = Channel.fromList(samplesheetToList(params.map, "assets/schema_map.json")) + ch_map = Channel.fromSamplesheet("map") } else { error "Map file provided is of another format than CSV (not yet supported). Please separate your reference genome by chromosome and use the samplesheet format." } @@ -209,12 +212,13 @@ workflow PIPELINE_INITIALISATION { // if (params.posfile) { - ch_posfile =Channel.fromList(samplesheetToList(params.posfile, "assets/schema_posfile.json")) - .map { - meta, file -> - [ meta, file ] + ch_posfile = Channel + .fromSamplesheet("posfile") + .map { + meta, file -> + [ meta, file ] }} else { - ch_posfile = [[]] + ch_posfile = [[]] } emit: diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf b/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf index 74f94a86..2585b65d 100644 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfvalidation_plugin/main.nf @@ -8,7 +8,9 @@ ======================================================================================== */ -include { validateParameters; paramsHelp; paramsSummaryLog } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-validation' +include { paramsSummaryLog } from 'plugin/nf-validation' +include { validateParameters } from 'plugin/nf-validation' /* ======================================================================================== diff --git a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json b/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json index e464a111..7626c1c9 100644 --- a/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json +++ b/subworkflows/nf-core/utils_nfvalidation_plugin/tests/nextflow_schema.json @@ -1,5 +1,5 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", + "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", "title": ". pipeline parameters", "description": "", diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 8c6fe2e2..a8674351 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -8,7 +8,7 @@ // MODULE: Installed directly from nf-core/modules // include { MULTIQC } from '../../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' +include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../../subworkflows/local/utils_nfcore_phaseimpute_pipeline' From 4dc012089ed02ae9b0b904c3101b5d361e9f6076 Mon Sep 17 00:00:00 2001 From: atrigila <18577080+atrigila@users.noreply.github.com> Date: Mon, 13 May 2024 13:40:39 +0000 Subject: [PATCH 16/17] add error glimpse2 not yet implemented --- workflows/phaseimpute/main.nf | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index a8674351..2ac06d02 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -216,21 +216,21 @@ workflow PHASEIMPUTE { } if (params.tools.split(',').contains("glimpse2")) { - //error "Glimpse2 not yet implemented" + error "Glimpse2 not yet implemented" // Use previous chunks if --step panelprep - if (params.panel && params.step.split(',').contains("panelprep") && !params.chunks) { - ch_chunks = VCF_CHUNK_GLIMPSE.out.chunks_glimpse1 - - VCF_IMPUTE_GLIMPSE2(ch_input_impute, - ch_panel_phased, - ch_chunks, - ch_fasta) - } else if (params.chunks) { - // use provided chunks - } else { - error "Either no reference panel was included or you did not set step --panelprep or you did not provide --chunks" - } + // if (params.panel && params.step.split(',').contains("panelprep") && !params.chunks) { + // ch_chunks = VCF_CHUNK_GLIMPSE.out.chunks_glimpse1 + + // VCF_IMPUTE_GLIMPSE2(ch_input_impute, + // ch_panel_phased, + // ch_chunks, + // ch_fasta) + // } else if (params.chunks) { + // // use provided chunks + // } else { + // error "Either no reference panel was included or you did not set step --panelprep or you did not provide --chunks" + // } } From f24878852ad3401934b6d3dece004dcafbe1485f Mon Sep 17 00:00:00 2001 From: Louis LE NEZET <58640615+LouisLeNezet@users.noreply.github.com> Date: Mon, 13 May 2024 16:47:31 +0200 Subject: [PATCH 17/17] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ee86ca2..52cf7bb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,6 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co - [#40](https://github.com/nf-core/phaseimpute/pull/40) - Add STITCH method. Reorganize panelprep subworkflows. - [#51](https://github.com/nf-core/phaseimpute/pull/51) - Update all process and fix linting errors. Remove fastqc added by the template. - ### `Fixed` - [#15](https://github.com/nf-core/phaseimpute/pull/15) - Changed test csv files to point to nf-core repository