
Commit

Merge pull request nf-core#74 from atrigila/handle_external_posfile
Add external posfile
LouisLeNezet authored Jun 6, 2024
2 parents e58a7dc + d88bde1 commit f7fbeb8
Showing 10 changed files with 189 additions and 115 deletions.
24 changes: 20 additions & 4 deletions assets/schema_posfile.json
@@ -7,18 +7,34 @@
"items": {
"type": "object",
"properties": {
"panel": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Panel name must be provided as a string and cannot contain spaces",
"meta": ["panel"]
},
"chr": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Chromosome name must be provided as a string and cannot contain spaces",
"meta": ["chr"]
},
"file": {
"vcf": {
"type": "string",
"pattern": "^\\S+\\.((vcf)(\\.gz))?$",
"errorMessage": "VCF with sites per chromosome must be provided. Must have .vcf.gz extension"
},
"index": {
"type": "string",
"pattern": "^\\S+\\.(vcf|bcf)(\\.gz)?\\.(tbi|csi)$",
"errorMessage": "VCF index with sites per chromosome file must be provided, cannot contain spaces and must have extension '.vcf' or '.bcf' with optional '.gz' extension and with '.csi' or '.tbi' extension"
},
"txt": {
"type": "string",
"pattern": "^\\S+\\.txt$",
"errorMessage": "Posfile per chromosome must be provided. Must have .txt extension"
"pattern": "^\\S+\\.(txt|tsv)(\\.gz)?$",
"errorMessage": "TXT with sites (position file) per chromosome must be provided. Must have .txt or .tsv extension with optional .gz"
}
},
"required": ["chr", "file"]
"required": ["panel", "chr", "vcf", "index", "txt"]
}
}
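The filename patterns in the schema above can be exercised outside the pipeline. A minimal sketch with `grep -E` (the `\S` class rewritten in POSIX form, and the file names purely hypothetical), here against the `txt` pattern `^\S+\.(txt|tsv)(\.gz)?$`:

```shell
# Check a candidate posfile name against the schema's "txt" pattern.
pat='^[^[:space:]]+\.(txt|tsv)(\.gz)?$'
echo "posfile_chr1.tsv.gz" | grep -Eq "$pat" && echo match
```

A name containing a space, such as `pos file.txt`, fails the check, which is exactly what the "cannot contain spaces" error messages guard against.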
13 changes: 13 additions & 0 deletions conf/steps/imputation_stitch.config
@@ -15,6 +15,19 @@ process {
publishDir = [enabled: false]
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:POSFILE_PREPARE_GAWK:GUNZIP' {
ext.prefix = { "${meta.panel}_${meta.chr}_original_posfile" }
publishDir = [enabled: false]
tag = {"${meta.panel}_${meta.chr}"}
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:POSFILE_PREPARE_GAWK:GAWK' {
ext.args = "'{ gsub(\",\", \"\\t\") ; key = \$1 FS \$2 } !seen[key]++'" // Remove duplicates
ext.prefix = { "${meta.panel}_${meta.chr}_posfile_stitch" }
ext.suffix = "txt"
publishDir = [enabled: false]
}

withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:BAM_IMPUTE_STITCH:STITCH' {
ext.prefix = { "${meta.id}_stitch" }
}
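The `ext.args` passed to GAWK in the config above can be tried on its own. A sketch with hypothetical input rows, showing the comma-to-tab conversion and the duplicate-position removal:

```shell
# Same awk program as in the GAWK config: turn "REF,ALT" into two
# tab-separated fields, then keep only the first row per (chr, pos) key.
printf 'chr22\t16570065\tA,G\nchr22\t16570065\tA,G\nchr22\t16570067\tA,C\n' |
  awk '{ gsub(",", "\t") ; key = $1 FS $2 } !seen[key]++'
```

The duplicated chr22:16570065 row is emitted once, so STITCH receives each site a single time.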
103 changes: 39 additions & 64 deletions docs/output.md
@@ -21,22 +21,37 @@ This step of the pipeline performs a QC of the reference panel data and produces
- [Glimpse Chunk](#glimpse) - Create chunks of the reference panel
- [CSV](#csv) - Obtain a CSV from this step

### Convert
The directory structure from `--steps panelprep` is:

- `prep_panel/haplegend/`
- `*.hap`: a .hap file for the reference panel.
- `*.legend*`: a .legend file for the reference panel.
```
├── chunks
│   ├── glimpse1
│   └── glimpse2
├── csv
├── panel
├── haplegend
└── sites
    ├── tsv
    └── vcf
```

### Panel directory

[bcftools](https://samtools.github.io/bcftools/bcftools.html) aids in the conversion of vcf files to .hap and .legend files. A .samples file is also generated. Once you have generated the hap and legend files for your reference panel, you can skip the reference preparation steps and directly submit these files for imputation (to be developed). The hap and legend files are input files used with `--tools quilt`.
- `prep_panel/panel/`
- `*.vcf.gz`: A vcf for the prepared reference panel.
- `*.tbi*`: A tbi for the prepared reference panel.

### Posfile
A directory containing the final phased and prepared panel per chromosome.

- `prep_panel/posfile/`
- `*.hap`: a .txt file with the list of position to genotype.
### Haplegend directory

- `prep_panel/haplegend/`
- `*.hap`: a .hap file for the reference panel.
- `*.legend*`: a .legend file for the reference panel.

[bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces tab-delimited files per chromosome that can be gathered into a samplesheet and directly submitted for imputation with `--tools stitch` using the parameter `--posfile`.
[bcftools](https://samtools.github.io/bcftools/bcftools.html) aids in the conversion of vcf files to .hap and .legend files. A .samples file is also generated. Once you have generated the hap and legend files for your reference panel, you can skip the reference preparation steps and directly submit these files for imputation. The hap and legend files are input files used with `--tools quilt`.

### Sites
### Sites directory

- `prep_panel/sites/`
- `vcf/`
@@ -48,74 +63,34 @@ This step of the pipeline performs a QC of the reference panel data and produces

[bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces VCF (`*.vcf.gz`) files per chromosome. These QCed VCFs can be gathered into a csv and used with all the tools in `--steps impute` using the flag `--panel`.

In addition, [bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces tab-delimited files (`*_tsv.txt`) and, together with the VCFs, they can be gathered into a samplesheet and directly submitted for imputation with `--tools glimpse1` and `--posfile` (not yet implemented).
In addition, [bcftools query](https://samtools.github.io/bcftools/bcftools.html) produces tab-delimited files (`*_tsv.txt`) and, together with the VCFs, they can be gathered into a samplesheet and directly submitted for imputation with `--tools glimpse1,stitch` and `--posfile`.

### Glimpse Chunk
### Chunks directory

- `prep_panel/chunks/`
- `*.txt`: TXT file containing the chunks obtained from running Glimpse chunks.

[Glimpse1 chunk](https://odelaneau.github.io/GLIMPSE/) defines the chunks in which imputation is run. For further reading and documentation see the [Glimpse1 documentation](https://odelaneau.github.io/GLIMPSE/glimpse1/commands.html). Once you have generated the chunks for your reference panel, you can skip the reference preparation steps and directly submit this file for imputation.

## QUILT imputation mode

The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

- [QUILT](#quilt) - Perform imputation
- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF.
- [CSV](#csv) - Obtain a CSV from this step
### CSV directory

### QUILT
- `prep_panel/csv/`
- `chunks.csv`: A csv containing the list of chunks obtained for each chromosome and panel.
- `panel.csv`: A csv containing the final phased and prepared panel for each chromosome and input panel.
- `posfile.csv`: A csv containing the final list of panel positions, in vcf and tsv, for each chromosome and input panel.

- `imputation/quilt/`
- `quilt.*.vcf.gz`: Imputed VCF for a specific chunk.
- `quilt.*.vcf.gz.tbi`: TBI for the Imputed VCF for a specific chunk.
## Imputation outputs `--steps impute`

[quilt](https://github.com/rwdavies/QUILT) performs the imputation. This directory contains the VCF for each of the chunks.
The results from `--steps impute` will have the following directory structure:

### Concat

- `imputation/quilt/bcftools/concat`
- `.*.vcf.gz`: Imputed and ligated VCF for all the input samples.
- `imputation/csv/`
- `impute.csv`: A single csv containing the path to the vcf and its index for each imputed sample, together with the corresponding tool.
- `imputation/[glimpse1,glimpse2,quilt,stitch]/`
- `concat/*.vcf.gz`: A vcf of each imputed sample.
- `concat/*.vcf.gz.tbi`: A tbi for the imputed vcf.

[bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs in chunks.

## STITCH imputation mode

The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

- [STITCH](#stitch) - Perform imputation
- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF
- [CSV](#csv) - Obtain a CSV from this step

### STITCH

- `imputation/stitch/`
- `stitch.*.vcf.gz`: Imputed VCF for a specific chunk.
- `stitch.*.vcf.gz.tbi`: TBI for the Imputed VCF for a specific chunk.

[STITCH](https://github.com/rwdavies/STITCH) performs the imputation. This directory contains the VCF for each of the chunks.

### Concat

- `imputation/stitch/bcftools/concat`
- `.*.vcf.gz`: Imputed and concatenated VCF for all the input samples.

[bcftools concat](https://samtools.github.io/bcftools/bcftools.html) will produce a single VCF from a list of imputed VCFs.

## GLIMPSE2 imputation mode

The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

- [GLIMPSE2](#glimpse2) - Perform imputation
- [Concatenate](#concatenate) - Concatenate all imputed chunks into a single VCF
- [CSV](#csv) - Obtain a CSV from this step

### GLIMPSE2 output files

- `imputation/glimpse2/concat`
- `.*.vcf.gz`: Imputed and concatenated VCF for all the input samples.

## Reports

Reports contain useful metrics and pipeline information for the different modes.
57 changes: 45 additions & 12 deletions docs/usage.md
@@ -247,24 +247,42 @@ Otherwise, you can provide your own position file in the `--steps impute` with S
nextflow run nf-core/phaseimpute --input samplesheet.csv --steps impute --posfile samplesheet_posfile.csv --tool stitch --outdir results --genome GRCh37 -profile docker
```

The csv provided in `--posfile` must contain two columns [chr, file]. The first column is the chromosome and the file column contains tsvs with the list of positions, unique to each chromosome.
The csv provided in `--posfile` must contain the columns [panel, chr, vcf, index, txt].

- The [panel] column is a name identifying the sites, typically a panel name. If a panel is used, it should match the panel names in the `--panel` samplesheet.
- The [chr] column is the chromosome corresponding to each vcf and txt file.
- The [vcf] column, required for GLIMPSE1 imputation, is used to compute genotype likelihoods in the GLIMPSE1 preprocessing. This file is also used in `--steps validate`. The column can be left empty when running `--steps impute --tools stitch` only.
- The [index] column is the index of the sites vcf; it can be left empty whenever [vcf] is.
- The [txt] column is a compressed tsv file containing the list of positions, unique to each chromosome.

```console
chr,file
chr1,posfile_chr1.txt
chr2,posfile_chr2.txt
chr3,posfile_chr3.txt
panel,chr,vcf,index,txt
1000G,chr1,,,posfile_chr1.tsv.gz
1000G,chr2,,,posfile_chr2.tsv.gz
1000G,chr3,,,posfile_chr3.tsv.gz
```

The file column should contain a TSV with the following structure, from STITCH documentation: "File is tab separated with no header, one row per SNP, with col 1 = chromosome, col 2 = physical position (sorted from smallest to largest), col 3 = reference base, col 4 = alternate base. Bases are capitalized. STITCH only handles bi-allelic SNPs" [STITCH](https://github.com/rwdavies/STITCH/blob/master/Options.md).
The [txt] column points to a compressed TSV similar to the one described in the [STITCH documentation](https://github.com/rwdavies/STITCH/blob/master/Options.md): the file is tab separated with no header, one row per SNP, with

- Column 1: chromosome
- Column 2: physical position (sorted from smallest to largest)
- Column 3: reference base and alternate base. Bases are capitalized. STITCH only handles bi-allelic SNPs.

Unlike the files used by the original STITCH program, in `phaseimpute` the last column, containing the reference and the alternate base, is comma-separated.

As an example, chr22 tsv file:
As an example, a typical `posfile_chr22.tsv.gz` would look like this once decompressed:

```console
chr22 16570065 A G
chr22 16570067 A C
chr22 16570176 C A
chr22 16570211 T C
chr22 16570065 A,G
chr22 16570067 A,C
chr22 16570176 C,A
chr22 16570211 T,C
```
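A quick sanity check of such a file is possible before launching the pipeline. A sketch with awk only, on hypothetical rows, verifying that the positions are sorted from smallest to largest as STITCH requires:

```shell
# Verify that column 2 (physical position) never decreases.
# ($2 + 0 forces a numeric comparison.)
printf 'chr22\t16570065\tA,G\nchr22\t16570067\tA,C\n' |
  awk -F'\t' '$2 + 0 < prev { bad = 1 } { prev = $2 + 0 } END { print (bad ? "unsorted" : "sorted") }'
```

For a gzipped posfile, pipe it through `gzip -cd` first.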

If you do not have a reference panel and you would like to obtain the posfile you can use the following command:

```bash
bcftools view -G -m 2 -M 2 -v ${vcf}
bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' ${vcf}
```
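If bcftools is not at hand, the effect of that query format string can be sketched on the body of an uncompressed, hypothetical VCF with awk alone (bcftools remains the supported route):

```shell
# Emulate bcftools query -f'%CHROM\t%POS\t%REF,%ALT\n' on one VCF data line:
# CHROM, POS, REF and ALT are fields 1, 2, 4 and 5 of the body.
printf '22\t16570065\trs1\tA\tG\t.\tPASS\t.\n' |
  awk -F'\t' '!/^#/ { printf "%s\t%s\t%s,%s\n", $1, $2, $4, $5 }'
```

Note that, unlike bcftools, this sketch does not restrict the output to bi-allelic SNPs, which is why the `bcftools view -m 2 -M 2 -v snps` style of filtering is still needed on real data.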

#### GLIMPSE1
@@ -275,7 +293,21 @@ chr22 16570211 T C
nextflow run nf-core/phaseimpute --input samplesheet.csv --panel samplesheet_reference.csv --steps impute --tool glimpse1 --outdir results --genome GRCh37 -profile docker --posfile posfile.csv --chunks chunks.csv
```

Make sure the csv with the input panel is the output from `--step panelprep` or has been previously prepared.
The csv provided in `--posfile` must contain the columns [panel, chr, vcf, index, txt].

- The [panel] column is a name identifying the sites, typically a panel name. It should match the panel names in the `--panel` samplesheet.
- The [chr] column is the chromosome corresponding to each file.
- The [vcf] column, required for GLIMPSE1 imputation, is used to compute genotype likelihoods in the GLIMPSE1 preprocessing. This file is also used in `--steps validate`.
- The [index] column is the index of the sites vcf.
- The [txt] column is a compressed tsv file containing the list of positions, unique to each chromosome.

```console
panel,chr,vcf,index,txt
1000G,chr1,posfile_chr1.vcf.gz,posfile_chr1.vcf.gz.csi,posfile_chr1.tsv.gz
1000G,chr2,posfile_chr2.vcf.gz,posfile_chr2.vcf.gz.csi,posfile_chr2.tsv.gz
1000G,chr3,posfile_chr3.vcf.gz,posfile_chr3.vcf.gz.csi,posfile_chr3.tsv.gz
```

The csv provided in `--panel` must be prepared with `--steps panelprep` and must contain four columns [panel, chr, vcf, index].

#### GLIMPSE2

@@ -300,6 +332,7 @@ The required flags for this mode are:
- `--steps validate`: The steps to run.
- `--input samplesheet.csv`: The samplesheet containing the input sample files in `vcf` format.
- `--input_truth samplesheet.csv`: The samplesheet containing the truth VCF files in `vcf` format.
- `--posfile samplesheet.csv`: A samplesheet containing the sites vcf, its index and the txt positions file used for validation.

### Run all steps sequentially `--steps all`

26 changes: 26 additions & 0 deletions subworkflows/local/posfile_prepare_gawk/main.nf
@@ -0,0 +1,26 @@
include { GAWK } from '../../../modules/nf-core/gawk'
include { GUNZIP } from '../../../modules/nf-core/gunzip'

workflow POSFILE_PREPARE_GAWK {

take:
ch_posfile // channel: [ [id, chr], vcf, csi, txt ]

main:
ch_versions = Channel.empty()

// Only keep the txt from the channel
ch_posfile = ch_posfile.map { meta, vcf, csi, txt -> tuple(meta, txt) }

// Decompress
GUNZIP(ch_posfile)
ch_posfile = GUNZIP.out.gunzip

// Convert TSV in "Glimpse format" to "Stitch format": replace "," with "\t"
GAWK(ch_posfile, [])
ch_versions = ch_versions.mix(GAWK.out.versions)

emit:
posfile = GAWK.out.output // channel: [ [meta], txt ]

}
4 changes: 2 additions & 2 deletions subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf
@@ -216,8 +216,8 @@ workflow PIPELINE_INITIALISATION {
//
if (params.posfile) {
ch_posfile = Channel
.fromSamplesheet("posfile")
.map {meta, file -> [ meta, file ]}
.fromSamplesheet("posfile")
.map {meta, vcf, csi, txt -> [ meta, vcf, csi, txt ]}
} else {
ch_posfile = [[],[]]
}
19 changes: 10 additions & 9 deletions subworkflows/local/vcf_sites_extract_bcftools/main.nf
@@ -36,16 +36,17 @@ workflow VCF_SITES_EXTRACT_BCFTOOLS {
TABIX_BGZIP(BCFTOOLS_QUERY.out.output)
ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions.first())

// Index compressed TSV
TABIX_TABIX(TABIX_BGZIP.out.output)
ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first())
// Generate default posfile (sites vcf, sites index and sites txt)
ch_posfile = ch_panel_sites
.join(TABIX_BGZIP.out.output)

// Join compressed TSV and index
ch_panel_tsv = TABIX_BGZIP.out.output.combine(TABIX_TABIX.out.tbi, by: 0)
// Generate glimpse posfile
ch_glimpse_posfile = ch_posfile.map{ metaPC, sites, s_index, tsv -> [metaPC, sites, tsv]}

emit:
panel_tsv_glimpse = ch_panel_tsv // channel: [ [id, chr], tsv, tbi ]
panel_tsv_stitch = GAWK.out.output // channel: [ [id, chr], txt ]
panel_sites = ch_panel_sites // channel: [ [id, chr], vcf, csi ]
versions = ch_versions // channel: [ versions.yml ]
panel_tsv_stitch = GAWK.out.output // channel: [ [id, chr], txt ]
panel_sites = ch_panel_sites // channel: [ [id, chr], vcf, csi ]
posfile = ch_posfile // channel: [ [id, chr], vcf, csi, tsv.gz ]
glimpse_posfile = ch_glimpse_posfile // channel: [ [id, chr], vcf, tsv.gz ]
versions = ch_versions // channel: [ versions.yml ]
}
5 changes: 3 additions & 2 deletions tests/csv/posfile.csv
@@ -1,2 +1,3 @@
chr,file
chr22,"https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/chr22_posfile_stitch.txt"
panel,chr,vcf,index,txt
1000GP.s.norel,chr21,"https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/21/1000GP.chr21.s.norel.sites.vcf.gz","https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/21/1000GP.chr21.s.norel.sites.vcf.gz.csi","https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/21/1000GP.chr21.s.norel.tsv.gz"
1000GP.s.norel,chr22,"https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/1000GP.chr22.s.norel.sites.vcf.gz","https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/1000GP.chr22.s.norel.sites.vcf.gz.csi","https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/data/panel/22/1000GP.chr22.s.norel.tsv.gz"