Merge pull request nf-core#143 from LouisLeNezet/fix_checkchr

Fix checkchr
LouisLeNezet · Oct 29, 2024 · 262290e · 262290e
2 parents e351bb3 + abd697f
commit 262290e
Show file tree

Hide file tree

Showing 16 changed files with 120 additions and 172 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,7 @@ Initial release of nf-core/phaseimpute, created with the [nf-core](https://nf-co
 - [#118](https://github.com/nf-core/phaseimpute/pull/118) - Explain how to customize arguments in the pipeline.
 - [#111](https://github.com/nf-core/phaseimpute/pull/111) - Add nf-test for all sbwf, wf, modules and functions.
 - [#131](https://github.com/nf-core/phaseimpute/pull/131) - Set normalisation as optional. Fix extension detection function. Add support for validation with vcf files. Concatenate vcf only if more than one file. Change `--phased` to `--phase` for consistency.
+- [#143](https://github.com/nf-core/phaseimpute/pull/143) - Improve contigs warning and error logging. The list of contig names is truncated (ending with `...`) when it contains more than `max_chr_names` entries.
 
 ### `Changed`
 

diff --git a/modules.json b/modules.json
@@ -207,17 +207,17 @@
                 "nf-core": {
                     "utils_nextflow_pipeline": {
                         "branch": "master",
-                        "git_sha": "9d05360da397692321d377b6102d2fb22507c6ef",
+                        "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082",
                         "installed_by": ["subworkflows"]
                     },
                     "utils_nfcore_pipeline": {
                         "branch": "master",
-                        "git_sha": "772684d9d66f37b650c8ba5146ac1ee3ecba2acb",
+                        "git_sha": "1b6b9a3338d011367137808b49b923515080e3ba",
                         "installed_by": ["subworkflows"]
                     },
                     "utils_nfschema_plugin": {
                         "branch": "master",
-                        "git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c",
+                        "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e",
                         "installed_by": ["subworkflows"]
                     }
                 }

diff --git a/modules/local/add_columns/environment.yml b/modules/local/add_columns/environment.yml
@@ -1,6 +1,5 @@
-name: gawk
 channels:
   - conda-forge
   - bioconda
 dependencies:
-  - anaconda::gawk=5.3.0
+  - conda-forge::gawk=5.3.0
diff --git a/modules/local/bam_chr_extract/environment.yml b/modules/local/bam_chr_extract/environment.yml
@@ -1,8 +1,6 @@
-name: bam_chr_extract
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
-  - bioconda::samtools=1.20
-  - bioconda::htslib=1.20
+  - bioconda::htslib=1.21
+  - bioconda::samtools=1.21
diff --git a/modules/local/list_to_file/environment.yml b/modules/local/list_to_file/environment.yml
@@ -1,6 +1,5 @@
-name: gawk
 channels:
   - conda-forge
   - bioconda
 dependencies:
-  - anaconda::gawk=5.3.0
+  - conda-forge::gawk=5.3.0
diff --git a/modules/local/vcf_chr_extract/environment.yml b/modules/local/vcf_chr_extract/environment.yml
@@ -1,7 +1,5 @@
-name: vcf_chr_extract
 channels:
   - conda-forge
   - bioconda
-  - defaults
 dependencies:
   - bioconda::bcftools=1.20
diff --git a/nextflow.config b/nextflow.config
@@ -23,9 +23,12 @@ params {
     phase                       = false
     normalize                   = true
     compute_freq                = false
-    rename_chr                  = false
     remove_samples              = null
 
+    // ChrCheck parameters
+    rename_chr                  = false
+    max_chr_names               = 4
+
     // References
     genome                      = null
     igenomes_base               = 's3://ngi-igenomes/igenomes/'

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -41,6 +41,12 @@
                     "description": "Should the panel vcf files be renamed to match the reference genome (e.g. 'chr1' -> '1')",
                     "pattern": "true|false"
                 },
+                "max_chr_names": {
+                    "type": "integer",
+                    "description": "Maximum number of contig names to print before truncating the list (i.e. show only a subset and add '...' at the end).",
+                    "hidden": true,
+                    "default": 4
+                },
                 "remove_samples": {
                     "type": "string",
                     "description": "Comma-separated list of samples to remove from the reference panel. Useful for benchmarking purposes."

diff --git a/subworkflows/local/utils_nfcore_chrcheck_pipeline/main.nf b/subworkflows/local/utils_nfcore_chrcheck_pipeline/main.nf
@@ -42,7 +42,8 @@ def diffChr(chr_target, chr_ref, file) {
         }
         new_diff = diff - new_chr
         if (new_diff.size() != 0) {
-            error "Contig names: ${new_diff} absent from file: ${file} and cannot be solved by adding or removing the `chr` prefix."
+            chr_names = new_diff.size() > params.max_chr_names ? new_diff[0..params.max_chr_names - 1] + ['...'] : new_diff
+            error "Contig names: ${chr_names} absent from file: ${file} and cannot be solved by adding or removing the `chr` prefix."
         }
         diff = to_rename
     }

diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf
@@ -258,14 +258,23 @@ workflow PIPELINE_INITIALISATION {
     chr_all_mis = chr_ref_mis.concat(chr_chunks_mis, chr_map_mis, chr_panel_mis, chr_posfile_mis)
         .unique()
         .toList()
-        .subscribe{ chr ->  if (chr.size() > 0) { log.warn "The following contigs are absent from at least one file : ${chr} and therefore won't be used" } }
+        .subscribe{ chr ->
+            if (chr.size() > 0) {
+                chr_names = chr.size() > params.max_chr_names ? chr[0..params.max_chr_names - 1] + ['...'] : chr
+                log.warn "The following contigs are absent from at least one file : ${chr_names} and therefore won't be used" } }
 
     ch_regions = ch_regions
         .combine(chr_all_mis.toList())
         .filter { meta, regions, chr_mis ->
             !(meta.chr in chr_mis)
         }
         .map { meta, regions, chr_mis -> [meta, regions] }
+        .ifEmpty { error "No regions left to process" }
+
+    ch_regions
+        .map { it[1] }
+        .collect()
+        .subscribe { log.info "The following contigs will be processed: ${it}" }
 
     // Check that all input files have the correct index
     checkFileIndex(ch_input.mix(ch_input_truth, ch_ref_gen, ch_panel))
@@ -469,7 +478,9 @@ def checkMetaChr(chr_a, chr_b, name){
         .map{
             a, b ->
             if (b != [[]] && !(a - b).isEmpty()) {
-                log.warn "Chr : ${a - b} is missing from ${name}"
+                chr_names = (a - b).size() > params.max_chr_names ? (a - b)[0..params.max_chr_names - 1] + ['...'] : (a - b)
+                verb = (a - b).size() == 1 ? "is" : "are"
+                log.warn "Chr : ${chr_names} ${verb} missing from ${name}"
                 return (a-b)
             }
             return []

diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test
diff --git a/workflows/chrcheck/main.nf b/workflows/chrcheck/main.nf
@@ -19,9 +19,16 @@ workflow CHRCHECK {
         ch_versions = Channel.empty()
         // Split the input between VCF and BAM files
         ch_input = ch_input.branch{
-            bam: it[1] =~ 'bam|cram|sam'
+            bam: it[1] =~ 'bam|cram'
             vcf: it[1] =~ 'vcf|bcf'
+            other: it[1].size() > 0
+            empty: true
         }
+
+        ch_input.other.map {
+            error "File: ${it[1]} is not a VCF, BCF, BAM or CRAM file."
+        }
+
         // Check if channel is empty
         chr_vcf_disjoint = Channel.empty()
         // Extract the contig names from the VCF files
@@ -51,10 +58,12 @@ workflow CHRCHECK {
             ch_vcf_renamed = VCF_CHR_RENAME_BCFTOOLS.out.vcf_renamed
         } else {
             chr_vcf_disjoint.to_rename.map {
-                error "Contig names: ${it[3]} in VCF: ${it[1]} are not present in reference genome with same writing. Please set `rename_chr` to `true` to rename the contigs."
+                chr_names = it[3].size() > params.max_chr_names ? it[3][0..params.max_chr_names - 1] + ['...'] : it[3]
+                error "Contig names: ${chr_names} in VCF: ${it[1]} are not present in reference genome with same writing. Please set `rename_chr` to `true` to rename the contigs."
             }
             chr_bam_disjoint.to_rename.map {
-                error "Contig names: ${it[3]} in BAM: ${it[1]} are not present in reference genome with same writing. Please set `rename_chr` to `true` to rename the contigs."
+                chr_names = it[3].size() > params.max_chr_names ? it[3][0..params.max_chr_names - 1] + ['...'] : it[3]
+                error "Contig names: ${chr_names} in BAM: ${it[1]} are not present in reference genome with same writing. Please set `rename_chr` to `true` to rename the contigs."
             }
             ch_vcf_renamed = Channel.empty()
             ch_bam_renamed = Channel.empty()

diff --git a/workflows/chrcheck/tests/main.nf.test b/workflows/chrcheck/tests/main.nf.test
@@ -26,14 +26,14 @@ nextflow_workflow {
                 input[0] = Channel.fromList([
                     [
                         [id: "VCF_chr22"],
-                        file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz'],checkIfExist:true),
-                        file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz_tbi'],checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz',checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi',checkIfExist:true),
                         ["22"]
                     ],
                     [
                         [id: "BAM_chr22"],
-                        file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExist:true),
-                        file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExist:true),
                         ["22"]
                     ]
                 ])
@@ -44,21 +44,28 @@ nextflow_workflow {
         then {
             assertAll(
                 { assert workflow.success },
-                { assert snapshot(workflow.out).match() },
-                { assert snapshot(workflow.out.output.collect{
-                    if (it[1].endsWith("vcf.gz")) {
-                        path(it[1]).vcf.summary
-                    } else {
-                        bam(it[1]).getHeader().findAll { it.startsWith ("@SQ") }
-                    }
-                    }).match("headernochr")
+                { assert snapshot(workflow.out.output
+                    .collect{
+                        if (it[1].endsWith("vcf.gz")) {
+                            path(it[1]).vcf.summary
+                        } else {
+                            bam(it[1]).getHeader().findAll { it.startsWith ("@SQ") }
+                        }
+                    },
+                    workflow.out.output.collect{
+                        file(it[1]).getName()
+                        file(it[2]).getName()
+                    },
+                    workflow.out.versions
+                ).match()
                 }
             )
         }
     }
 
     test("Rename: VCF no chr + fasta chr") {
         config "./nextflow_rename.config"
+        tag "test"
         setup {
             run("BAM_CHR_RENAME_SAMTOOLS", alias: "PREPROCESS") {
                 script "../../../subworkflows/local/bam_chr_rename_samtools/main.nf"
@@ -67,8 +74,8 @@ nextflow_workflow {
                     input[0] = Channel.fromList([
                         [
                             [id: "BAM_22"],
-                            file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExist:true),
-                            file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExist:true),
+                            file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExist:true),
+                            file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExist:true),
                             "nochr"
                         ]
                     ])
@@ -97,14 +104,20 @@ nextflow_workflow {
         then {
             assertAll(
                 { assert workflow.success },
-                { assert snapshot(workflow.out).match() },
-                { assert snapshot(workflow.out.output.collect{
-                    if (it[1].endsWith("vcf.gz")) {
-                        path(it[1]).vcf.summary
-                    } else {
-                        bam(it[1]).getHeader().findAll { it.startsWith ("@SQ") }
-                    }
-                    }).match("headerwithchr")
+                { assert snapshot(
+                    workflow.out.output.collect{
+                        if (it[1].endsWith("vcf.gz")) {
+                            path(it[1]).vcf.summary
+                        } else {
+                            bam(it[1]).getHeader().findAll { it.startsWith ("@SQ") }
+                        }
+                    },
+                    workflow.out.output.collect{
+                        file(it[1]).getName()
+                        file(it[2]).getName()
+                    },
+                    workflow.out.versions
+                    ).match()
                 }
             )
         }
@@ -118,8 +131,8 @@ nextflow_workflow {
                 input[0] = Channel.fromList([
                     [
                         [id: "VCF_chr22"],
-                        file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz'],checkIfExist:true),
-                        file(params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz_tbi'],checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz',checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi',checkIfExist:true),
                         ["22"]
                     ]
                 ])
@@ -145,8 +158,8 @@ nextflow_workflow {
                 input[0] = Channel.fromList([
                     [
                         [id: "BAM_chr22"],
-                        file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExist:true),
-                        file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExist:true),
                         ["22"]
                     ]
                 ])
@@ -169,12 +182,12 @@ nextflow_workflow {
         when {
             workflow {
                 """
-                lst_chr = ["chr22", "chr34", "GL000207.1"]
+                lst_chr = ["chr22", "chr34", "GL000207.1", "chr45", "chr46", "chr47", "chr48", "chr49"]
                 input[0] = Channel.fromList([
                     [
                         [id: "VCF_AllNoChr"],
-                        file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz",checkIfExist:true),
-                        file("https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz.tbi",checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz',checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz.tbi',checkIfExist:true),
                         lst_chr
                     ],
                 ])
@@ -185,7 +198,7 @@ nextflow_workflow {
         then {
             assertAll(
                 { assert workflow.failed },
-                { assert workflow.errorReport.contains("Contig names: [chr34, GL000207.1] absent from file: /nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz and cannot be solved by adding or removing the `chr` prefix.")}
+                { assert workflow.errorReport.contains("Contig names: [chr34, GL000207.1, chr45, chr46, ...] absent from file: /nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz and cannot be solved by adding or removing the `chr` prefix.")}
             )
         }
     }
@@ -198,8 +211,8 @@ nextflow_workflow {
                 input[0] = Channel.fromList([
                     [
                         [id: "BAM_chr22"],
-                        file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam'], checkIfExist:true),
-                        file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam', checkIfExist:true),
+                        file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExist:true),
                         ["chr1"]
                     ],
                 ])