diff --git a/LICENSE b/LICENSE index e48cb19..0aefc2e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Kübra Narcı +Copyright (c) kuebra.narci@dkfz.de Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index d9d4aaa..a8a7b25 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ nf-core/variantbenchmarking + [![GitHub Actions CI Status](https://github.com/nf-core/variantbenchmarking/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/variantbenchmarking/actions?query=workflow%3A%22nf-core+CI%22) [![GitHub Actions Linting Status](https://github.com/nf-core/variantbenchmarking/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/variantbenchmarking/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/variantbenchmarking/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) @@ -13,7 +14,7 @@ [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/variantbenchmarking) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23variantbenchmarking-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/variantbenchmarking)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23benchmark-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/variantbenchmarking)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction @@ -29,14 +30,22 @@ workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. --> -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1. Standardization of SVs in test VCF files +2. Normalization of SVs in test VCF files +3. Normalization of SVs in truth VCF files +4. SV stats and histograms +5. Germline benchmarking of SVs +6. Somatic benchmarking of SVs +7. Final report and comparisons ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.

+Supported SV callers: Manta, SVaba, Dragen, Delly, Lumpy, and more.
+Available truth samples: HG002 and SEQC2.
+
 Now, you can run the pipeline using:

@@ -78,7 +86,7 @@ For more details about the output files and reports, please refer to the

 ## Credits

-nf-core/variantbenchmarking was originally written by Kübra Narcı.
+nf-core/variantbenchmarking was originally written by kuebra.narci@dkfz.de.

 We thank the following people for their extensive assistance in the development of this pipeline:

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index d26a71b..9653868 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,5 +1,5 @@
 report_comment: >
-  This report has been generated by the nf-core/variantbenchmarking
+  This report has been generated by the nf-core/variantbenchmarking
   analysis pipeline. For information about how to interpret these results, please see the
   documentation.
 report_section_order:
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 5f653ab..d7db7d3 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sample,fastq_1,fastq_2
-SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
-SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+sample,test_vcf,truth_vcf,caller,type
+HG002,"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Broad_svaba_05052017/HG002.svaba.germline.sv.convBNDtoDEL.vcf","/Users/w620-admin/Desktop/nf-core/dataset/hg37/GIAB_Assemblytics_structural_variants_only_160618/hg002.Assemblytics_structural_variants.sorted.vcf.gz",svaba,sv
+HG003,"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Broad_svaba_05052017/HG003.svaba.germline.sv.convBNDtoDEL.vcf","/Users/w620-admin/Desktop/nf-core/dataset/hg37/GIAB_Assemblytics_structural_variants_only_160618/hg003.Assemblytics_structural_variants.sorted.vcf.gz",svaba,sv
diff --git a/assets/samplesheet_HG002.csv b/assets/samplesheet_HG002.csv
new file mode 100644
index 0000000..db4ccca
--- /dev/null
+++ b/assets/samplesheet_HG002.csv
@@ -0,0 +1,4 @@
+test_vcf,caller
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz",delly
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.vcf.gz",lumpy
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype.vcf",manta
\ No newline at end of file
diff --git a/assets/samplesheet_HG002_hg19.csv b/assets/samplesheet_HG002_hg19.csv
new file mode 100644
index 0000000..d1d8dba
--- /dev/null
+++ b/assets/samplesheet_HG002_hg19.csv
@@ -0,0 +1,5 @@
+test_vcf,caller
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_DRAGEN_SV_hg19.vcf.gz",dragen
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz",delly
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.vcf.gz",lumpy
+"/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype.vcf",manta
\ No newline at end of file
diff --git a/assets/samplesheet_HG002_hg38.csv b/assets/samplesheet_HG002_hg38.csv
new file mode 100644
index 0000000..95cbeef
--- /dev/null
+++ b/assets/samplesheet_HG002_hg38.csv
@@ -0,0 +1,5 @@
+test_vcf,caller
+"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.vcf.gz",lumpy +"/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/manta.HG002.vcf.gz",manta +"/Users/w620-admin/Desktop/nf-core/dataset/hg37/Ashkenazim_unnanotated/Ashkenazim_HG002.filtered.sv.vcf.gz",merged + diff --git a/assets/samplesheet_SEQC2.csv b/assets/samplesheet_SEQC2.csv new file mode 100644 index 0000000..5b64aed --- /dev/null +++ b/assets/samplesheet_SEQC2.csv @@ -0,0 +1,3 @@ +test_vcf,caller +"/Users/w620-admin/Desktop/nf-core/dataset/hg38/SEQC_somatic_mutation_truth/test/WGS.bwa.dedup-IL_T_1_vs_IL_N_1-Strelka.indel.vcf.gz",strelka +"/Users/w620-admin/Desktop/nf-core/dataset/hg38/SEQC_somatic_mutation_truth/test/WGS.bwa.dedup-IL_T_1_vs_IL_N_1-MuTect2.vcf.gz",mutect2 diff --git a/assets/schema_input.json b/assets/schema_input.json index c4f8dd9..2ef1310 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,36 +1,23 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/variantbenchmarking/master/assets/schema_input.json", - "title": "nf-core/variantbenchmarking pipeline - params.input schema", - "description": "Schema for the file provided with params.input", - "type": "array", - "items": { - "type": "object", - "properties": { - "sample": { - "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" - }, - "fastq_1": { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" - }, - "fastq_2": { - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", - "anyOf": [ - { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$" - }, - { - "type": "string", - "maxLength": 0 - } - ] - } - }, - "required": ["sample", "fastq_1"] - } + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/variantbenchmarking/master/assets/schema_input.json", + "title": "nf-core/variantbenchmarking pipeline - params.input schema", + "description": "Schema for the file provided with params.input", + "type": "array", + "items": { + "type": "object", + "properties": { + "test_vcf": { + "type": "string", + "pattern": "", + "errorMessage": "Test VCF must be provided, cannot contain spaces and must have extension '.vcf.gz'" + }, + "caller": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Name of the variant caller used to generate test file" + } + }, + "required": ["test_vcf","caller"] + } } diff --git a/assets/svync/default.yaml b/assets/svync/default.yaml new file mode 100644 index 0000000..e69de29 diff --git a/assets/svync/delly.yaml b/assets/svync/delly.yaml new file mode 100644 index 0000000..f820a1d --- /dev/null +++ b/assets/svync/delly.yaml @@ -0,0 +1,69 @@ +id: delly_$INFO/SVTYPE +alt: + BND: TRA +info: + CALLER: + value: delly + number: 1 + type: string + description: The caller used to determine this variant + SVLEN: + value: ~sub:$INFO/END,$POS + number: 1 + type: integer + description: The length of the structural variant + alts: + DEL: -~sub:$INFO/END,$POS + INS: $INFO/SVLEN + TRA: 1 + CIEND: + value: $INFO/CIEND + number: 2 + type: integer + description: PE confidence interval around END + CIPOS: + value: $INFO/CIPOS + number: 2 + type: integer + description: PE 
+  SVTYPE:
+    value: $INFO/SVTYPE
+    number: 1
+    type: string
+    description: Type of structural variant
+  CHR2:
+    value:
+    number: 1
+    type: string
+    description: Chromosome for second position
+    alts:
+      TRA: $INFO/CHR2
+  END:
+    value: $INFO/END
+    number: 1
+    type: integer
+    description: End position of the structural variant
+    alts:
+      TRA: $INFO/POS2
+  IMPRECISE:
+    value: $INFO/IMPRECISE
+    number: 0
+    type: flag
+    description: Imprecise structural variation
+format:
+  GT:
+    value: $FORMAT/GT
+    number: 1
+    type: string
+    description: Genotype
+  PE:
+    value: $FORMAT/DR,$FORMAT/DV
+    number: 2
+    type: integer
+    description: Paired-read support for the ref and alt alleles in the order listed
+  SR:
+    value: $FORMAT/RR,$FORMAT/RV
+    number: 2
+    type: integer
+    description: Split-read support for the ref and alt alleles in the order listed
diff --git a/assets/svync/gridss.yaml b/assets/svync/gridss.yaml
new file mode 100644
index 0000000..62c8c8a
--- /dev/null
+++ b/assets/svync/gridss.yaml
@@ -0,0 +1,65 @@
+id: gridss_$INFO/SVTYPE
+info:
+  CALLER:
+    value: gridss
+    number: 1
+    type: string
+    description: The caller used to determine this variant
+  SVLEN:
+    value: ~sub:$INFO/END,$POS
+    number: 1
+    type: integer
+    description: The length of the structural variant
+    alts:
+      BND:
+      TRA: 0
+      DEL: -~sub:$INFO/END,$POS
+  CIEND:
+    value: $INFO/CIRPOS
+    number: 2
+    type: integer
+    description: PE confidence interval around END
+  CIPOS:
+    value: $INFO/CIPOS
+    number: 2
+    type: integer
+    description: PE confidence interval around POS
+  SVTYPE:
+    value: $INFO/SVTYPE
+    number: 1
+    type: string
+    description: Type of structural variant
+  CHR2:
+    value:
+    number: 1
+    type: string
+    description: Chromosome for second position
+    alts:
+      TRA: $INFO/CHR2
+  END:
+    value: $INFO/END
+    number: 1
+    type: integer
+    description: End position of the structural variant
+  IMPRECISE:
+    value: $INFO/IMPRECISE
+    number: 0
+    type: flag
+    description: Imprecise structural variation
+format:
+  GT:
+    value: $FORMAT/GT
+    number: 1
+    type: string
+    description: Genotype
+  PE:
+    value: $FORMAT/REFPAIR,$FORMAT/RP
+    number: 2
+    type: integer
+    description: Paired-read support for the ref and alt alleles in the order listed
+  SR:
+    value: .,$FORMAT/SR
+    number: 2
+    type: integer
+    description: Split-read support for the ref and alt alleles in the order listed
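Aside: the `~sub:$INFO/END,$POS` expressions in these svync templates are simple field arithmetic. A rough sketch of the SVLEN rules the delly template encodes, written out in plain Python with pysam (illustrative only — this mirrors the template above, it is not svync's actual implementation):

```python
# Illustrative sketch of the SVLEN arithmetic encoded by the delly template.
# Not svync itself: just the same rules restated with pysam.
import pysam

vcf = pysam.VariantFile("delly_calls.vcf.gz")  # hypothetical input path
for rec in vcf:
    svtype = rec.info["SVTYPE"]
    if svtype == "DEL":
        svlen = -(rec.stop - rec.pos)      # DEL: -~sub:$INFO/END,$POS
    elif svtype == "INS":
        svlen = rec.info["SVLEN"]          # INS: $INFO/SVLEN kept as-is
    elif svtype == "BND":                  # alt: BND -> TRA, SVLEN fixed to 1
        svtype, svlen = "TRA", 1
    else:
        svlen = rec.stop - rec.pos         # default: ~sub:$INFO/END,$POS
    print(rec.chrom, rec.pos, svtype, svlen)
```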
diff --git a/assets/svync/manta.yaml b/assets/svync/manta.yaml
new file mode 100644
index 0000000..eaa5869
--- /dev/null
+++ b/assets/svync/manta.yaml
@@ -0,0 +1,66 @@
+id: manta_$INFO/SVTYPE
+info:
+  CALLER:
+    value: manta
+    number: 1
+    type: string
+    description: The caller used to determine this variant
+  SVLEN:
+    value: $INFO/SVLEN
+    number: 1
+    type: integer
+    description: The length of the structural variant
+    alts:
+      INS: ~sum:~len:LEFT_SVINSSEQ,~len:RIGHT_SVINSSEQ
+      TRA: 1
+  CIEND:
+    value: $INFO/CIEND
+    number: 2
+    type: integer
+    description: PE confidence interval around END
+  CIPOS:
+    value: $INFO/CIPOS
+    number: 2
+    type: integer
+    description: PE confidence interval around POS
+  SVTYPE:
+    value: $INFO/SVTYPE
+    number: 1
+    type: string
+    description: Type of structural variant
+  CHR2:
+    value:
+    number: 1
+    type: string
+    description: Chromosome for second position
+    alts:
+      TRA: $INFO/CHR2
+  END:
+    value: $INFO/END
+    number: 1
+    type: integer
+    description: End position of the structural variant
+    alts:
+      TRA: $INFO/POS2
+  IMPRECISE:
+    value: $INFO/IMPRECISE
+    number: 0
+    type: flag
+    description: Imprecise structural variation
+format:
+  GT:
+    value: $FORMAT/GT
+    number: 1
+    type: string
+    description: Genotype
+  PE:
+    value: $FORMAT/PR
+    number: 2
+    type: integer
+    description: Paired-read support for the ref and alt alleles in the order listed
+  SR:
+    value: $FORMAT/SR
+    number: 2
+    type: integer
+    description: Split-read support for the ref and alt alleles in the order listed
diff --git a/assets/svync/smoove.yaml b/assets/svync/smoove.yaml
new file mode 100644
index 0000000..ee277a7
--- /dev/null
+++ b/assets/svync/smoove.yaml
@@ -0,0 +1,61 @@
+id: smoove_$INFO/SVTYPE
+info:
+  CALLER:
+    value: smoove
+    number: 1
+    type: string
+    description: The caller used to determine this variant
+  SVLEN:
+    value: $INFO/SVLEN
+    number: 1
+    type: integer
+    description: The length of the structural variant
+  CIEND:
+    value: $INFO/CIEND
+    number: 2
+    type: integer
+    description: PE confidence interval around END
+  CIPOS:
+    value: $INFO/CIPOS
+    number: 2
+    type: integer
+    description: PE confidence interval around POS
+  SVTYPE:
+    value: $INFO/SVTYPE
+    number: 1
+    type: string
+    description: Type of structural variant
+  CHR2:
+    value:
+    number: 1
+    type: string
+    description: Chromosome for second position
+    alts:
+      TRA: $INFO/CHR2
+  END:
+    value: $INFO/END
+    number: 1
+    type: integer
+    description: End position of the structural variant
+  IMPRECISE:
+    value: $INFO/IMPRECISE
+    number: 0
+    type: flag
+    description: Imprecise structural variation
+format:
+  GT:
+    value: $FORMAT/GT
+    number: 1
+    type: string
+    description: Genotype
+  PE:
+    value: .,$FORMAT/PE
+    number: 2
+    type: integer
+    description: Paired-read support for the ref and alt alleles in the order listed
+  SR:
+    value: .,$FORMAT/SR
+    number: 2
+    type: integer
+    description: Split-read support for the ref and alt alleles in the order listed
diff --git a/bin/add_svtype.py b/bin/add_svtype.py
new file mode 100644
index 0000000..d93981e
--- /dev/null
+++ b/bin/add_svtype.py
@@ -0,0 +1,61 @@
+import argparse
+import pysam
+import logging
+import os
+import subprocess as sp
+
+logging.basicConfig(
+    format='%(levelname)-7s | %(asctime)s | %(message)s',
+    datefmt='%H:%M:%S')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--graph', required=True, help='Input VCF')
+parser.add_argument('--svlength', type=int, required=False, help='SV length threshold', default=50)
+args = parser.parse_args()
+
+
+in_vcf = pysam.VariantFile(args.graph)
+out_name = os.path.basename(args.graph)
+if out_name.endswith('.gz'):
+    out_name = out_name[:-3]
+if out_name.endswith('.vcf'):
+    out_name = out_name[:-4]
+
+anno_header = in_vcf.header
+logger.info('Writing Header')
+anno_header.info.add("SVTYPE", "1", "String", "Type of structural variant")
+anno_header.info.add("SVLEN", ".", "Integer", "Difference in length between the REF and ALT alleles")
+
+anno_vcf = pysam.VariantFile('{}.annotated.vcf.gz'.format(out_name), 'w', header=anno_header)
+
+logger.info('Adding SVTYPE/SVLEN to VCF')
+counter = 0
+for v in in_vcf:
+    svlen = (len(v.alts[0]) - len(v.ref))
+    limit = args.svlength
+
+    if (svlen < -1 * limit):
+        v.info.update({'SVTYPE': "DEL"})
+        v.info.update({'SVLEN': svlen})
+        anno_vcf.write(v)
+        counter = counter + 1
+    elif (svlen > limit):
+        v.info.update({'SVTYPE': "INS"})
+        v.info.update({'SVLEN': svlen})
+        anno_vcf.write(v)
+        counter = counter + 1
+    else:
+        anno_vcf.write(v)
+
+in_vcf.close()
+anno_vcf.close()
+logger.info('Finished adding SVTYPE/SVLEN to VCF: {}'.format(counter))
+
+sp.run(['tabix', anno_vcf.filename])
+logger.info('VCF indexed')
+
+logger.info('DONE')
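The core rule in `bin/add_svtype.py` is a length comparison against the `--svlength` cutoff; a condensed restatement of the loop above (illustrative only):

```python
# Condensed restatement of the classification rule in bin/add_svtype.py:
# a record is tagged DEL or INS only when the REF/ALT length difference
# exceeds the cutoff (default 50 bp); everything else passes through.
def classify(ref: str, alt: str, limit: int = 50):
    svlen = len(alt) - len(ref)
    if svlen < -limit:
        return "DEL", svlen
    if svlen > limit:
        return "INS", svlen
    return None, svlen  # left unannotated

assert classify("A" * 120, "A") == ("DEL", -119)
assert classify("A", "A" * 120) == ("INS", 119)
assert classify("AC", "A")[0] is None  # small indel, not an SV
```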
diff --git a/bin/annotate_repeat_regions.py b/bin/annotate_repeat_regions.py
new file mode 100644
index 0000000..c92bcaa
--- /dev/null
+++ b/bin/annotate_repeat_regions.py
@@ -0,0 +1,71 @@
+import pysam
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import argparse
+import os
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--vcf', required=True, help='Input VCF')
+parser.add_argument('--repeat', required=True, help='Simple Repeat regions')
+parser.add_argument('--repmask', required=True, help='Repeat Mask regions')
+parser.add_argument('--segdup', required=True, help='Segmental Duplication regions')
+
+args = parser.parse_args()
+
+def print_AF(in_vcf, samplerep, repeatmask, segdup):
+    # Annotate each SV with overlapping simple repeats, RepeatMasker hits
+    # and segmental duplications, then print a per-variant table.
+    in_file = pysam.VariantFile(in_vcf)
+    out_name = os.path.basename(in_vcf)
+    if out_name.endswith('.gz'):
+        out_name = out_name[:-3]
+    if out_name.endswith('.vcf'):
+        out_name = out_name[:-4]
+
+    with open(out_name + ".txt", "w") as out_file:
+
+        print('SV_Type', 'SV', 'Length', 'AF', 'Simple_Repeat', 'Repeat_Name', 'Repeat_Class', 'Segmental_Dup', file=out_file)
+        for v in in_file:
+            temp1 = samplerep[(samplerep['chrom'] == v.chrom) & (((v.start > samplerep['chromStart'] - 1) & (v.start < samplerep['chromEnd'] + 1)) | ((v.stop > samplerep['chromStart'] - 1) & (v.stop < samplerep['chromEnd'] + 1)))]
+
+            if len(temp1) > 0:
+                simple_repeat = temp1.iloc[0].period
+            else:
+                simple_repeat = 'None'
+
+            temp2 = repeatmask[(repeatmask['genoName'] == v.chrom) & (((v.start > repeatmask['genoStart'] - 1) & (v.start < repeatmask['genoEnd'] + 1)) | ((v.stop > repeatmask['genoStart'] - 1) & (v.stop < repeatmask['genoEnd'] + 1)))]
+
+            if len(temp2) > 0:
+                repname = temp2.iloc[0].repName
+                repclass = temp2.iloc[0].repClass
+            else:
+                repname = 'None'
+                repclass = 'None'
+
+            temp3 = segdup[(segdup['chrom'] == v.chrom) & (((v.start > segdup['chromStart'] - 1) & (v.start < segdup['chromEnd'] + 1)) | ((v.stop > segdup['chromStart'] - 1) & (v.stop < segdup['chromEnd'] + 1)))]
+            if len(temp3) > 0:
+                segdups = temp3.iloc[0].fracMatch
+            else:
+                segdups = 'None'
+
+            if len(v.ref) - len(v.alts[0]) > 49:
+                if len(v.ref) - len(v.alts[0]) < 1001:
+                    print('DEL', v.ref, len(v.alts[0]) - len(v.ref), v.info['AF'][0], simple_repeat, repname, repclass, segdups, file=out_file)
+                else:
+                    print('DEL', '...', len(v.alts[0]) - len(v.ref), v.info['AF'][0], simple_repeat, repname, repclass, segdups, file=out_file)
+
+            elif len(v.alts[0]) - len(v.ref) > 49:
+                if len(v.alts[0]) - len(v.ref) < 1001:
+                    print('INS', v.alts[0], len(v.alts[0]) - len(v.ref), v.info['AF'][0], simple_repeat, repname, repclass, segdups, file=out_file)
+                else:
+                    print('INS', '...', len(v.alts[0]) - len(v.ref), v.info['AF'][0], simple_repeat, repname, repclass, segdups, file=out_file)
+
+            else:
+                continue
+
+
+repeat_file = pd.read_csv(args.repeat, header=0, sep='\t')
+repmask_file = pd.read_csv(args.repmask, header=0, sep='\t')
+segdup_file = pd.read_csv(args.segdup, header=0, sep='\t')
+
+print_AF(args.vcf, repeat_file, repmask_file, segdup_file)
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 4a758fe..5bbe208 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -1,258 +1,125 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
-
-"""Provide a command line tool to validate and transform tabular samplesheets."""
-
-
-import argparse
-import csv
-import logging
+import os
 import sys
-from collections import Counter
-from pathlib import Path
-
-logger = logging.getLogger()
-
-
-class RowChecker:
-    """
-    Define a service
that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. 
- - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. +import errno +import argparse - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" +def parse_args(args=None): + Description = "Reformat samplesheet file and check its contents." + Epilog = "Example usage: python check_samplesheet.py " -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input samplesheet file.") + parser.add_argument("FILE_OUT", help="Output file.") + return parser.parse_args(args) -def sniff_format(handle): - """ - Detect the tabular format. +def make_dir(path): + if len(path) > 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise exception - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect +def print_error(error, context="Line", context_str=""): + error_str = f"ERROR: Please check samplesheet -> {error}" + if context != "" and context_str != "": + error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'" + print(error_str) + sys.exit(1) def check_samplesheet(file_in, file_out): """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, + This function checks that the samplesheet follows the following structure: + test_vcf,caller + test1.vcf,manta + test2.vcf,svaba + For an example see: + https://github.com/ghga-de/nf-benchmark/assets/samplesheet.csv + """ - .. 
_viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + sample_mapping_dict = {} + with open(file_in, "r", encoding='utf-8-sig') as fin: - """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + ## Check header + MIN_COLS = 2 + HEADER = ["test_vcf","caller"] + header = [x.strip('"') for x in fin.readline().strip().split(",")] + if header[: len(HEADER)] != HEADER: + print( + f"ERROR: Please check samplesheet header -> {','.join(header)} != {','.join(HEADER)}" + ) sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) + ## Check caller entries + for line in fin: + if line.strip(): + lspl = [x.strip().strip('"') for x in line.strip().split(",")] + + ## Check valid number of columns per row + if len(lspl) < len(HEADER): + print_error( + f"Invalid number of columns (minimum = {len(HEADER)})!", + "Line", + line, + ) + + num_cols = len([x for x in lspl if x]) + if num_cols < MIN_COLS: + print_error( + f"Invalid number of populated columns (minimum = {MIN_COLS})!", + "Line", + line, + ) + + ## Check caller name entries + test_vcf, caller = lspl[: len(HEADER)] + if caller.find(" ") != -1: + print( + f"WARNING: Spaces have been replaced by underscores for caller: {caller}" + ) + caller = caller.replace(" ", "_") + if not caller: + print_error("Caller entry has not been specified!", "Line", line) + + sample_info = [] ## [test_vcf, caller ] + + sample_info = [test_vcf, caller] + + ## Create caller mapping dictionary = {caller: [[test_vcf, caller ]]} + if caller not in sample_mapping_dict: + sample_mapping_dict[caller] = [sample_info] + else: + if sample_info in sample_mapping_dict[caller]: + print_error("Samplesheet contains duplicate rows!", "Line", line) + else: + sample_mapping_dict[caller].append(sample_info) + + ## Write validated samplesheet with appropriate columns + if len(sample_mapping_dict) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write( + ",".join(["test_vcf","caller"]) + + "\n" + ) + for caller in sorted(sample_mapping_dict.keys()): -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - 
-    )
-    parser.add_argument(
-        "file_out",
-        metavar="FILE_OUT",
-        type=Path,
-        help="Transformed output samplesheet in CSV format.",
-    )
-    parser.add_argument(
-        "-l",
-        "--log-level",
-        help="The desired log level (default WARNING).",
-        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
-        default="WARNING",
-    )
-    return parser.parse_args(argv)
+            for idx, val in enumerate(sample_mapping_dict[caller]):
+                fout.write(",".join(val) + "\n")
+    else:
+        print_error("No entries to process!", "Samplesheet", file_in)
-
-def main(argv=None):
-    """Coordinate argument parsing and program execution."""
-    args = parse_args(argv)
-    logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
-    if not args.file_in.is_file():
-        logger.error(f"The given input file {args.file_in} was not found!")
-        sys.exit(2)
-    args.file_out.parent.mkdir(parents=True, exist_ok=True)
-    check_samplesheet(args.file_in, args.file_out)
+
+def main(args=None):
+    args = parse_args(args)
+    check_samplesheet(args.FILE_IN, args.FILE_OUT)

 if __name__ == "__main__":
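For reference, a minimal invocation of the rewritten samplesheet checker (paths are illustrative; the pipeline runs this inside its samplesheet-check process):

```python
# Sketch: validating a caller samplesheet and writing the cleaned copy.
# Input and output paths are illustrative.
import subprocess

subprocess.run(
    ["python3", "bin/check_samplesheet.py",
     "assets/samplesheet_HG002.csv", "samplesheet.valid.csv"],
    check=True,
)
```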
diff --git a/bin/convertInversion.py b/bin/convertInversion.py
new file mode 100644
index 0000000..8f70cdf
--- /dev/null
+++ b/bin/convertInversion.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python2
+#
+
+
+import sys
+import gzip
+from io import BufferedReader
+from subprocess import check_output
+from os import path
+from os.path import exists
+
+
+class VcfRecord:
+
+    def __init__(self, inline):
+        tokens = inline.strip().split('\t')
+
+        self.chr = tokens[0]
+        self.pos = int(tokens[1])
+        self.vid = tokens[2]
+        self.ref = tokens[3]
+        self.alt = tokens[4]
+        self.qual = tokens[5]
+        self.filter = tokens[6]
+        self.info = tokens[7].split(';')
+        self.others = "\t".join(tokens[8:])
+
+        # Create a dictionary for INFO
+        self.infoDict = {}
+        for infoItem in self.info:
+            items = infoItem.split('=')
+            if len(items) == 1:
+                self.infoDict[items[0]] = True
+            elif len(items) > 1:
+                self.infoDict[items[0]] = items[1]
+
+        self.isINV3 = False
+        self.isINV5 = False
+        self.mateChr = ""
+        self.matePos = -1
+
+
+    def checkInversion(self):
+
+        def getMateInfo(splitChar):
+            items = self.alt.split(splitChar)
+            mateInfo = items[1].split(':')
+            # Assuming that the last item contains position information
+            matePos = mateInfo[-1]
+            # Other items belong to the chromosome
+            self.mateChr = ":".join(mateInfo[:-1])
+            self.matePos = int(matePos)
+
+        if self.alt.startswith('['):
+            getMateInfo('[')
+            if self.mateChr == self.chr:
+                self.isINV5 = True
+        elif self.alt.endswith(']'):
+            getMateInfo(']')
+            if self.mateChr == self.chr:
+                self.isINV3 = True
+
+
+    def makeLine(self):
+        infoStr = ";".join(self.info)
+
+        self.line = "\t".join((self.chr,
+                               str(self.pos),
+                               self.vid,
+                               self.ref,
+                               self.alt,
+                               self.qual,
+                               self.filter,
+                               infoStr,
+                               self.others
+                               )) + "\n"
+
+
+def scanVcf(vcfFile):
+
+    invMateDict = {}
+
+    if vcfFile.endswith('gz'):
+        gzfp = gzip.open(vcfFile, 'rb')
+        fpVcf = BufferedReader(gzfp)
+    else:
+        fpVcf = open(vcfFile, 'rb')
+
+    for line in fpVcf:
+        if line[0] == '#':
+            continue
+
+        vcfRec = VcfRecord(line)
+        vcfRec.checkInversion()
+        if vcfRec.isINV3 or vcfRec.isINV5:
+            if vcfRec.vid in invMateDict:
+                # update mate INFO
+                invMateDict[vcfRec.vid] = vcfRec.infoDict
+            else:
+                mateId = vcfRec.infoDict["MATEID"]
+                invMateDict[mateId] = ""
+
+    return invMateDict
+
+
+def getReference(samtools, refFasta, chrom, start, end):
+    region = "%s:%d-%d" % (chrom, start, end)
+    samtoolsOut = check_output([samtools, "faidx", refFasta, region])
+    refSeq = ""
+    for seq in samtoolsOut.split('\n'):
+        if not seq.startswith(">"):
+            refSeq += seq
+
+    return refSeq.upper()
+
+
+def writeLines(lines):
+    for line in lines:
+        sys.stdout.write(line)
+
+
+def convertInversions(samtools, refFasta, vcfFile, invMateDict):
+    isHeaderInfoAdded = False
+    isHeaderAltAdded = False
+    lineBuffer = []
+    bufferedChr = ""
+    bufferedPos = -1
+
+    if vcfFile.endswith('gz'):
+        gzfp = gzip.open(vcfFile, 'rb')
+        fpVcf = BufferedReader(gzfp)
+    else:
+        fpVcf = open(vcfFile, 'rb')
+
+    for line in fpVcf:
+        if line.startswith('#'):
+            if (not isHeaderInfoAdded) and line.startswith("##FORMAT="):
+                # NOTE: the header strings below were lost in the rendered diff;
+                # restored from the upstream Manta convertInversion.py script.
+                sys.stdout.write("##INFO=<ID=INV3,Number=0,Type=Flag,Description=\"Inversion breakends open 3' of reported location\">\n")
+                sys.stdout.write("##INFO=<ID=INV5,Number=0,Type=Flag,Description=\"Inversion breakends open 5' of reported location\">\n")
+                isHeaderInfoAdded = True
+
+            if (not isHeaderAltAdded) and line.startswith("##ALT="):
+                sys.stdout.write("##ALT=<ID=INV,Description=\"Inversion\">\n")
+                isHeaderAltAdded = True
+
+            sys.stdout.write(line)
+            continue
+
+        vcfRec = VcfRecord(line)
+
+        # skip mate record
+        if vcfRec.vid in invMateDict:
+            continue
+
+        vcfRec.checkInversion()
+        if vcfRec.isINV3 or vcfRec.isINV5:
+            if vcfRec.isINV5:
+                # adjust POS for INV5
+                vcfRec.pos -= 1
+                vcfRec.matePos -= 1
+                vcfRec.ref = getReference(samtools, refFasta,
+                                          vcfRec.chr, vcfRec.pos, vcfRec.pos)
+
+            # update manta ID
+            vidSuffix = vcfRec.vid.split("MantaBND")[1]
+            idx = vidSuffix.rfind(':')
+            vcfRec.vid = "MantaINV%s" % vidSuffix[:idx]
+
+            # symbolic ALT (the "<INV>" literal was lost in the rendered diff)
+            vcfRec.alt = "<INV>"
+
+            # add END
+            infoEndStr = "END=%d" % vcfRec.matePos
+
+            newInfo = [infoEndStr]
+            for infoItem in vcfRec.info:
+                if infoItem.startswith("SVTYPE"):
+                    # change SVTYPE
+                    newInfo.append("SVTYPE=INV")
+                    # add SVLEN
+                    infoSvLenStr = "SVLEN=%d" % (vcfRec.matePos - vcfRec.pos)
+                    newInfo.append(infoSvLenStr)
+
+                elif infoItem.startswith("CIPOS"):
+                    newInfo.append(infoItem)
+
+                    # set CIEND
+                    isImprecise = "IMPRECISE" in vcfRec.infoDict
+                    # for imprecise calls, set CIEND to the mate breakpoint's CIPOS
+                    if isImprecise:
+                        mateId = vcfRec.infoDict["MATEID"]
+                        mateInfoDict = invMateDict[mateId]
+                        infoCiEndStr = "CIEND=%s" % (mateInfoDict["CIPOS"])
+                        newInfo.append(infoCiEndStr)
+                    # for precise calls, set CIEND w.r.t HOMLEN
+                    else:
+                        if "HOMLEN" in vcfRec.infoDict:
+                            infoCiEndStr = "CIEND=-%s,0" % vcfRec.infoDict["HOMLEN"]
+                            newInfo.append(infoCiEndStr)
+
+                elif infoItem.startswith("HOMSEQ"):
+                    # update HOMSEQ for INV5
+                    if vcfRec.isINV5:
+                        cipos = vcfRec.infoDict["CIPOS"].split(',')
+                        homSeqStart = vcfRec.pos + int(cipos[0]) + 1
+                        homSeqEnd = vcfRec.pos + int(cipos[1])
+                        refSeq = getReference(samtools, refFasta, vcfRec.chr,
+                                              homSeqStart, homSeqEnd)
+                        infoHomSeqStr = "HOMSEQ=%s" % refSeq
+                        newInfo.append(infoHomSeqStr)
+                    else:
+                        newInfo.append(infoItem)
+
+                # skip BND-specific tags
+                elif (infoItem.startswith("MATEID") or
+                      infoItem.startswith("BND_DEPTH") or
+                      infoItem.startswith("MATE_BND_DEPTH")):
+                    continue
+
+                # update event ID
+                elif infoItem.startswith("EVENT"):
+                    eidSuffix = vcfRec.infoDict["EVENT"].split("MantaBND")[1]
+                    idx = eidSuffix.rfind(':')  # was vidSuffix; fixed to use eidSuffix
+                    infoEventStr = "EVENT=MantaINV%s" % eidSuffix[:idx]
+                    newInfo.append(infoEventStr)
+
+                # apply all other tags
+                else:
+                    newInfo.append(infoItem)
+
+            # add INV3/INV5 tag
+            if vcfRec.isINV3:
+                newInfo.append("INV3")
+            elif vcfRec.isINV5:
+                newInfo.append("INV5")
+
+            vcfRec.info = newInfo
+
+        vcfRec.makeLine()
+
+        # make sure the vcf is sorted in genomic order
+        if (not vcfRec.chr == bufferedChr) or (vcfRec.pos > bufferedPos):
+            if lineBuffer:
+                writeLines(lineBuffer)
+
+            lineBuffer = [vcfRec.line]
+            bufferedChr = vcfRec.chr
+            bufferedPos = vcfRec.pos
+        elif vcfRec.pos < bufferedPos:
+            lineBuffer.insert(0, vcfRec.line)
+        else:
+            lineBuffer.append(vcfRec.line)
+
+    if lineBuffer:
+        writeLines(lineBuffer)
+
+
+
+if __name__ == '__main__':
+
+    usage = "convertInversion.py <samtools path> <reference fasta> <input vcf>\n"
+    if len(sys.argv) <= 3:
+        sys.stderr.write(usage)
+        sys.exit(1)
+
+    samtools = sys.argv[1]
+    refFasta = sys.argv[2]
+    vcfFile = sys.argv[3]
+
+    for inputFile in [samtools, refFasta, vcfFile]:
+        if not(exists(inputFile)):
+            errMsg = ('File %s does not exist.'
+                      % inputFile)
+            sys.stderr.write(errMsg + '\nProgram exits.')
+            sys.exit(1)
+
+    invMateDict = scanVcf(vcfFile)
+    convertInversions(samtools, refFasta, vcfFile, invMateDict)
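`convertInversion.py` streams the converted VCF to stdout, so the output must be redirected; a minimal sketch of an invocation (paths are illustrative, and note the script targets Python 2):

```python
# Sketch: converting paired Manta BND inversion records to explicit <INV>
# records. The script writes the converted VCF to stdout.
import subprocess

with open("manta.invConverted.vcf", "w") as out:
    subprocess.run(
        ["python2", "bin/convertInversion.py",
         "/usr/bin/samtools", "genome.fa", "manta_diploidSV.vcf.gz"],
        stdout=out,
        check=True,
    )
```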
diff --git a/bin/id_annotation.py b/bin/id_annotation.py
new file mode 100644
index 0000000..1289459
--- /dev/null
+++ b/bin/id_annotation.py
@@ -0,0 +1,52 @@
+import argparse
+import pysam
+import logging
+import os
+import subprocess as sp
+
+logging.basicConfig(
+    format='%(levelname)-7s | %(asctime)s | %(message)s',
+    datefmt='%H:%M:%S')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--vcf', required=True, help='Input VCF')
+args = parser.parse_args()
+
+
+in_vcf = pysam.VariantFile(args.vcf)
+out_name = os.path.basename(args.vcf)
+if out_name.endswith('.gz'):
+    out_name = out_name[:-3]
+if out_name.endswith('.vcf'):
+    out_name = out_name[:-4]
+
+anno_header = in_vcf.header
+
+anno_vcf = pysam.VariantFile('{}.annotated.vcf.gz'.format(out_name), 'w', header=anno_header)
+
+logger.info('Adding CHROM_POS_TYPE to ID')
+for v in in_vcf:
+    svlen = (len(v.alts[0]) - len(v.ref))
+    if (svlen < 0):
+        v.id = str(v.chrom) + '_' + str(v.pos) + '_DEL'
+    elif (svlen > 0):
+        v.id = str(v.chrom) + '_' + str(v.pos) + '_INS'
+    else:
+        v.id = str(v.chrom) + '_' + str(v.pos) + '_SNP'
+    # write the re-ID'd record to the output VCF
+    anno_vcf.write(v)
+
+
+in_vcf.close()
+anno_vcf.close()
+logger.info('Finished')
+
+sp.run(['tabix', anno_vcf.filename])
+logger.info('VCF indexed')
+
+logger.info('DONE')
diff --git a/bin/parse_vcf.py b/bin/parse_vcf.py
new file mode 100644
index 0000000..6444b1d
--- /dev/null
+++ b/bin/parse_vcf.py
@@ -0,0 +1,50 @@
+import argparse
+import pysam
+import logging
+import os
+import subprocess as sp
+
+logging.basicConfig(
+    format='%(levelname)-7s | %(asctime)s | %(message)s',
+    datefmt='%H:%M:%S')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--vcf', required=True, help='Input VCF')
+parser.add_argument('--fasta', required=True, help='Reference FASTA used to sequence-resolve symbolic <DEL> alleles')
+args = parser.parse_args()
+
+in_vcf = pysam.VariantFile(args.vcf)
+# fasta_f was referenced below but never opened; --fasta supplies it.
+# The symbolic-allele literals in this script were lost in the rendered
+# diff and are restored here based on the surrounding logic.
+fasta_f = pysam.FastaFile(args.fasta)
+out_name = os.path.basename(args.vcf)
+if out_name.endswith('.gz'):
+    out_name = out_name[:-3]
+if out_name.endswith('.vcf'):
+    out_name = out_name[:-4]
+
+out_header = in_vcf.header
+out_vcf = pysam.VariantFile('{}.resolved.vcf.gz'.format(out_name), 'w', header=out_header)
+
+logger.info('Only <DEL> records will be sequence resolved; BND, <INS> and <DUP> records are filtered out!')
+for v in in_vcf:
+    if v.info['SVTYPE'] == 'BND':
+        continue
+    else:
+        if v.alts[0] == '<INS>':
+            continue
+
+        elif v.alts[0] == '<DUP>':
+            continue
+
+        elif v.alts[0] == '<DEL>':
+            # REF becomes the full deleted sequence fetched from the
+            # reference; ALT becomes the original padding base
+            v.alleles = (fasta_f.fetch(v.chrom, v.pos - 1, v.stop), v.ref)
+            out_vcf.write(v)
+        else:
+            out_vcf.write(v)
+fasta_f.close()
+in_vcf.close()
+out_vcf.close()
+
+sp.run(['tabix', out_vcf.filename])
+logger.info('VCF indexed')
+
+logger.info('DONE')
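The `v.alleles = (fasta_f.fetch(...), v.ref)` line in `parse_vcf.py` is the crux of sequence-resolving a symbolic deletion; a worked illustration (coordinates and sequence are made up):

```python
# Worked example of the <DEL> resolution in bin/parse_vcf.py.
# Given POS=100, REF=T, ALT=<DEL>, END=105, fetch(chrom, 99, 105) returns
# the padding base plus the deleted bases, e.g. "TACGTA", so the record
# becomes REF=TACGTA, ALT=T: an explicit, sequence-resolved 5 bp deletion.
import pysam

def resolve_del(rec: "pysam.VariantRecord", fasta: pysam.FastaFile) -> None:
    assert rec.alts[0] == "<DEL>"
    # rec.pos is 1-based; fetch() takes 0-based half-open coordinates
    rec.alleles = (fasta.fetch(rec.chrom, rec.pos - 1, rec.stop), rec.ref)
```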
diff --git a/bin/simple_event_annotator.R b/bin/simple_event_annotator.R
new file mode 100644
index 0000000..ec99571
--- /dev/null
+++ b/bin/simple_event_annotator.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+#
+# Copyright GRIDSS
+## original version : https://github.com/PapenfussLab/gridss/blob/7b1fedfed32af9e03ed5c6863d368a821a4c699f/example/simple-event-annotation.R
+
+#source("https://bioconductor.org/biocLite.R")
+#biocLite("VariantAnnotation")
+#install_github("PapenfussLab/StructuralVariantAnnotation")
+#install.packages("stringr")
+library(VariantAnnotation)
+library(StructuralVariantAnnotation)
+library(stringr)
+library(getopt)
+#' Simple SV type classifier
+## USAGE
+## simple_event_annotator.R infile outfile genome
+
+
+simpleEventType <- function(gr) {
+  return(ifelse(seqnames(gr) != seqnames(partner(gr)), "ITX", # inter-chromosomal
+         ifelse(gr$insLen >= abs(gr$svLen) * 0.7, "INS", # TODO: improve classification of complex events
+         ifelse(strand(gr) == strand(partner(gr)), "INV",
+         ifelse(xor(start(gr) < start(partner(gr)), strand(gr) == "-"), "DEL",
+         "DUP")))))
+}
+cmdArgs = commandArgs(TRUE)
+print(cmdArgs)
+if (length(cmdArgs) < 3) stop(paste("Incorrect number of arguments (3 expected): ", length(cmdArgs)))
+infile = cmdArgs[1]
+outfile = cmdArgs[2]
+genome = cmdArgs[3]
+
+
+# using the example in the GRIDSS /example directory
+vcf <- readVcf(infile, genome)
+gr <- breakpointRanges(vcf)
+svtype <- simpleEventType(gr)
+info(vcf)$SIMPLE_TYPE <- NA_character_
+info(vcf[gr$vcfId])$SIMPLE_TYPE <- svtype
+info(vcf[gr$vcfId])$SVLEN <- gr$svLen
+writeVcf(vcf, outfile)
+
+## TODO: perform event filtering here
+## By default, GRIDSS is very sensitive but this comes at the cost of a high false discovery rate
+#gr <- gr[gr$FILTER == "PASS" & partner(gr)$FILTER == "PASS"] # Remove low confidence calls
+
+#simplegr <- gr[simpleEventType(gr) %in% c("INS", "INV", "DEL", "DUP")]
+#simplebed <- data.frame(
+#   chrom=seqnames(simplegr),
+#   # call the centre of the homology/inexact interval
+#   start=as.integer((start(simplegr) + end(simplegr)) / 2),
+#   end=as.integer((start(partner(simplegr)) + end(partner(simplegr))) / 2),
+#   name=simpleEventType(simplegr),
+#   score=simplegr$QUAL,
+#   strand="."
+# ) +## Just the lower of the two breakends so we don't output everything twice +#simplebed <- simplebed[simplebed$start < simplebed$end,] +#write.table(simplebed, "chr12.1527326.DEL1024.simple.bed", quote=FALSE, sep='\t', row.names=FALSE, col.names=FALSE) \ No newline at end of file diff --git a/conf/base.config b/conf/base.config index 194b09b..0e24cb6 100644 --- a/conf/base.config +++ b/conf/base.config @@ -28,17 +28,17 @@ process { // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } withLabel:process_medium { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 36.GB * task.attempt, 'memory' ) } + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } time = { check_max( 8.h * task.attempt, 'time' ) } } withLabel:process_high { diff --git a/conf/igenomes.config b/conf/igenomes.config index 3f11437..cd55be5 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -13,6 +13,7 @@ params { genomes { 'GRCh37' { fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" + fai = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" @@ -26,6 +27,7 @@ params { } 'GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + fai = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" @@ -38,249 +40,16 @@ params { } 'CHM13' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" + fai = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/" bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/" gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/CHM13/Annotation/Genes/genes.gtf" gff = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" mito_name = "chrM" } - 'GRCm38' { - fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" 
- gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" - } - 'TAIR10' { - fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" - mito_name = "Mt" - } - 'EB2' { - fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" - } - 'UMD3.1' { - fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" - mito_name = "MT" - } - 'WBcel235' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" - mito_name = "MtDNA" - macs_gsize = "9e7" - } - 'CanFam3.1' { - fasta = 
"${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" - mito_name = "MT" - } - 'GRCz10' { - fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'BDGP6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" - mito_name = "M" - macs_gsize = "1.2e8" - } - 'EquCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" - mito_name = "MT" - } - 'EB1' { - fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" - } - 'Galgal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Gm01' { - fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" - } - 'Mmul_1' { - fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" - mito_name = "MT" - } - 'IRGSP-1.0' { - fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'CHIMP2.1.4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" - 
star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" - mito_name = "MT" - } - 'Rnor_5.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Rnor_6.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'R64-1-1' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" - mito_name = "MT" - macs_gsize = "1.2e7" - } - 'EF2' { - fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" - mito_name = 
"MT" - macs_gsize = "1.21e7" - } - 'Sbi1' { - fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" - } - 'Sscrofa10.2' { - fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" - mito_name = "MT" - } - 'AGPv3' { - fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" - mito_name = "Mt" - } 'hg38' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" + fai = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" @@ -293,6 +62,7 @@ params { } 'hg19' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" + fai = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" @@ -304,137 +74,5 @@ params { macs_gsize = "2.7e9" blacklist = "${projectDir}/assets/blacklists/hg19-blacklist.bed" } - 'mm10' { - fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" - } - 'bosTau8' { - fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'ce10' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "9e7" - } - 'canFam3' { - fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" - mito_name = "chrM" - } - 'danRer10' { - fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.37e9" - } - 'dm6' { - fasta = 
"${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.2e8" - } - 'equCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" - mito_name = "chrM" - } - 'galGal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" - mito_name = "chrM" - } - 'panTro4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" - mito_name = "chrM" - } - 'rn6' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" - bed12 = 
"${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'sacCer3' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" - readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.2e7" - } - 'susScr3' { - fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" - mito_name = "chrM" - } } } diff --git a/conf/modules.config b/conf/modules.config index d91c6ab..6e27229 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -26,10 +26,6 @@ process { ] } - withName: FASTQC { - ext.args = '--quiet' - } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -46,5 +42,181 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }
        ]
    }
-
+    withName: "BCFTOOLS_NORM_1" {
+        ext.args = {"--output-type z -N -m-any -c w" }
+        ext.prefix = {"${meta.id}.${meta2.caller}_norm"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/preprocess"},
+            pattern: "*{.vcf.gz,vcf.gz.tbi}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: "BCFTOOLS_NORM_2" {
+        ext.args = {"--output-type z --rm-dup exact -c w" }
+        ext.prefix = {"${meta.id}.${meta2.caller}_dedup"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/preprocess"},
+            pattern: "*{.vcf.gz,vcf.gz.tbi}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: AWK_SORT {
+        ext.prefix = {"${meta.id}.${meta2.caller}_sort"}
+        publishDir = [
+            path: { "${params.outdir}/test" },
+            enabled: false
+        ]
+    }
+    withName: BCFTOOLS_RENAME_CHR {
+        ext.args = {"--output-type z" }
+        ext.prefix = {"${meta.id}.${meta2.caller}_rename_chr"}
+        publishDir = [
+            path: { "${params.outdir}/test" },
+            enabled: false
+        ]
+    }
+    withName: "BCFTOOLS_REHEADER_TRUTH" {
+        ext.args2 = {"--output-type v" }
+        ext.prefix = {"${meta.id}_truth"}
+        publishDir = [
+            path: { "${params.outdir}/test" },
+            enabled: false
+        ]
+    }
+    withName: "BCFTOOLS_REHEADER_TEST" {
+        ext.args2 = {"--output-type v" }
+        ext.prefix = {"${meta.id}_query"}
+        publishDir = [
+            path: { "${params.outdir}/test" },
+            enabled: false
+        ]
+    }
+    withName: BCFTOOLS_VIEW {
+        ext.args2 = {"--output-type z" }
+        ext.prefix = {"${meta.id}.${meta2.caller}_filter"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/"},
+            pattern: "*{.vcf.gz,vcf.gz.tbi}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: BCFTOOLS_ISEC {
+        ext.args = {"--output-type v" }
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/svanalyzer_bench/${meta2.caller}/isec"},
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: SURVIVOR_STATS {
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/stats/survivor/"},
+            pattern: "*{.stats}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: SURVIVOR_FILTER {
+        ext.prefix = {"${meta.id}.${meta2.caller}.filter"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/preprocess"},
+            pattern: "*{.vcf}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: BCFTOOLS_STATS {
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/stats/bcftools/"},
+            pattern: "*{stats.txt}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: "TRUVARI_PHAB" {
+        ext.prefix = {"${meta.id}.${meta2.caller}.harm"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/truvari_phab/${meta2.caller}"},
+            pattern: "*{.vcf.gz,vcf.gz.tbi}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: "TRUVARI_BENCH" {
+        ext.args = {"--pctsize 0.5 --pctovl 0.5 --refdist 1000"}
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/truvari_bench/${meta2.caller}"},
+            pattern: "*{.vcf.gz,vcf.gz.tbi,json}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: SVANALYZER_SVBENCHMARK {
+        ext.args = {"-normshift 0.3 -normdist 0.3 -normsizediff 0.3"}
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/svanalyzer_bench/${meta2.caller}"},
+            pattern: "*{.vcf,distances,log,report}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: WITTYER {
+        ext.args = {"--evaluationMode=CrossTypeAndSimpleCounting --percentDistance=0.3 --bpDistance=1000"}
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/wittyer_bench/${meta2.caller}"},
+            pattern: "*{json,.vcf.gz.tbi,vcf.gz}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: VCFDIST {
+        ext.args = {"-v 0"}
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/vcfdist_bench/${meta2.caller}"},
+            pattern: "*{.vcf,tsv}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: BAMSURGEON_EVALUATOR {
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/bamsurgeon_evaluator/${meta2.caller}"},
+            pattern: "*{.vcf}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: HAPPY_SOMPY {
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/sompy_bench/${meta2.caller}"},
+            pattern: "*{.vcf.gz,vcf.gz.tbi,json,csv}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: MANTA_CONVERTINVERSION {
+        ext.prefix = {"${meta.id}.${meta2.caller}"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/preprocess/${meta2.caller}"},
+            pattern: "*{.vcf.gz,vcf.gz.tbi}",
+            mode: params.publish_dir_mode
+        ]
+    }
+    withName: SVYNC {
+        ext.prefix = {"${meta.id}.${meta2.caller}.stnd"}
+        publishDir = [
+            path: {"${params.outdir}/${meta.id}/preprocess"},
+            pattern: "*{.vcf.gz,vcf.gz.tbi}",
+            mode: params.publish_dir_mode
+        ]
+    }
+}
+//
+// Don't publish results for these processes
+//
+process {
+    withName: 'BGZIP_TABIX|TABIX_TABIX|TABIX_BGZIPTABIX' {
+        publishDir = [
+            path: { "${params.outdir}/test" },
+            enabled: false
+        ]
+    }
 }
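// A note on the `ext.prefix` closures above: each is evaluated per task, so published
// file names are derived from the sample (`meta`) and caller (`meta2`) metadata maps.
// A minimal Groovy sketch of how one such closure resolves; the map values here are
// illustrative, not taken from a real run:
//
//     def meta   = [id: 'HG002']
//     def meta2  = [caller: 'manta']
//     def prefix = { "${meta.id}.${meta2.caller}_norm" }
//     assert prefix() == 'HG002.manta_norm'   // published as HG002.manta_norm.vcf.gz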
{"${params.outdir}/${meta.id}/wittyer_bench/${meta2.caller}"}, + pattern: "*{json,.vcf.gz.tbi,vcf.gz}", + mode: params.publish_dir_mode + ] + } + withName: VCFDIST { + ext.args = {"-v 0"} + ext.prefix = {"${meta.id}.${meta2.caller}"} + publishDir = [ + path: {"${params.outdir}/${meta.id}/vcfdist_bench/${meta2.caller}"}, + pattern: "*{.vcf,tsv}", + mode: params.publish_dir_mode + ] + } + withName: BAMSURGEON_EVALUATOR { + ext.prefix = {"${meta.id}.${meta2.caller}"} + publishDir = [ + path: {"${params.outdir}/${meta.id}/bamsurgeon_evalator/${meta2.caller}"}, + pattern: "*{.vcf}", + mode: params.publish_dir_mode + ] + } + withName: HAPPY_SOMPY { + ext.prefix = {"${meta.id}.${meta2.caller}"} + publishDir = [ + path: {"${params.outdir}/${meta.id}/sompy_bench/${meta2.caller}"}, + pattern: "*{.vcf.gz,vcf.gz.tbi,json,csv}", + mode: params.publish_dir_mode + ] + } + withName: MANTA_CONVERTINVERSION { + ext.prefix = {"${meta.id}.${meta2.caller}"} + publishDir = [ + path: {"${params.outdir}/${meta.id}/preprocess/${meta2.caller}"}, + pattern: "*{.vcf.gz,vcf.gz.tbi}", + mode: params.publish_dir_mode + ] + } + withName: SVYNC { + ext.prefix = {"${meta.id}.${meta2.caller}.stnd"} + publishDir = [ + path: {"${params.outdir}/${meta.id}/preprocess"}, + pattern: "*{.vcf.gz,vcf.gz.tbi}", + mode: params.publish_dir_mode + ] + } +} +// +// Don't publish results for these processes +// +process { + withName: 'BGZIP_TABIX|TABIX_TABIX|TABIX_BGZIPTABIX' { + publishDir = [ + path: { "${params.outdir}/test" }, + enabled: false + ] + } } diff --git a/conf/test_data.config b/conf/test_data.config new file mode 100644 index 0000000..6a98dd0 --- /dev/null +++ b/conf/test_data.config @@ -0,0 +1,347 @@ +// README: +// https://github.com/nf-core/test-datasets/blob/modules/README.md + +params { + // Base directory for test data + test_data_base = "https://raw.githubusercontent.com/nf-core/test-datasets/modules" + + test_data { + 'homo_sapiens' { + 'genome' { + genome_elfasta = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.elfasta" + genome_fasta = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.fasta" + genome_fasta_fai = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.fasta.fai" + genome_fasta_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.fasta.gz" + genome_fasta_gz_fai = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.fasta.gz.fai" + genome_fasta_gz_gzi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.fasta.gz.gzi" + genome_strtablefile = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome_strtablefile.zip" + genome_dict = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.dict" + genome_gff3 = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.gff3" + genome_gtf = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.gtf" + genome_interval_list = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.interval_list" + genome_multi_interval_bed = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.multi_intervals.bed" + genome_blacklist_interval_bed = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.blacklist_intervals.bed" + genome_sizes = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.sizes" + genome_bed = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.bed" + genome_header = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.header" + 
genome_bed_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.bed.gz" + genome_bed_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.bed.gz.tbi" + genome_elsites = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.elsites" + transcriptome_fasta = "${params.test_data_base}/data/genomics/homo_sapiens/genome/transcriptome.fasta" + genome2_fasta = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome2.fasta" + genome_chain_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.chain.gz" + genome_annotated_interval_tsv = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.annotated_intervals.tsv" + genome_mt_gb = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.NC_012920_1.gb" + genome_preprocessed_count_tsv = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.preprocessed_intervals.counts.tsv" + genome_preprocessed_interval_list = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.preprocessed_intervals.interval_list" + genome_ploidy_model = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.ploidy_model.tar.gz" + genome_ploidy_calls = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.ploidy_calls.tar.gz" + genome_germline_cnv_model = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.germline_cnv_model.tar.gz" + genome_germline_cnv_calls = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome.germline_cnv_calls.tar.gz" + genome_motifs = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome_motifs.txt" + genome_config = "${params.test_data_base}/data/genomics/homo_sapiens/genome/genome_config.json" + + genome_1_fasta = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr1/genome.fasta.gz" + genome_1_gtf = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr1/genome.gtf" + + genome_21_sdf = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome_sdf.tar.gz" + genome_21_fasta = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta" + genome_21_fasta_fai = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.fasta.fai" + genome_21_gencode_gtf = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/chr21_gencode.gtf" + genome_21_dict = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.dict" + genome_21_sizes = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.sizes" + genome_21_interval_list = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/genome.interval_list" + genome_21_annotated_bed = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/annotated.bed" + genome_21_multi_interval_bed = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed" + genome_21_multi_interval_antitarget_bed = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.antitarget.bed" + genome_21_multi_interval_bed_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed.gz" + genome_21_multi_interval_bed_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/multi_intervals.bed.gz.tbi" + genome_21_chromosomes_dir = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/chromosomes.tar.gz" + 
genome_21_reference_cnn = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/reference_chr21.cnn" + genome_21_eigenstrat_snp = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/chr_21.snp" + genome_21_stitch_posfile = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/sequence/dbsnp_138.hg38.first_10_biallelic_sites.tsv" + + dbsnp_146_hg38_elsites = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.elsites" + dbsnp_146_hg38_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" + dbsnp_146_hg38_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi" + gnomad_r2_1_1_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" + gnomad_r2_1_1_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz.tbi" + mills_and_1000g_indels_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" + mills_and_1000g_indels_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz.tbi" + syntheticvcf_short_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/syntheticvcf_short.vcf.gz" + syntheticvcf_short_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/syntheticvcf_short.vcf.gz.tbi" + syntheticvcf_short_score = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/syntheticvcf_short.score" + gnomad_r2_1_1_sv_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1-sv.vcf.gz" + gnomad2_r2_1_1_sv_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/gnomAD2.r2.1.1-sv.vcf.gz" + + hapmap_3_3_hg38_21_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/hapmap_3.3.hg38.vcf.gz" + hapmap_3_3_hg38_21_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/hapmap_3.3.hg38.vcf.gz.tbi" + res_1000g_omni2_5_hg38_21_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/1000G_omni2.5.hg38.vcf.gz" + res_1000g_omni2_5_hg38_21_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/1000G_omni2.5.hg38.vcf.gz.tbi" + res_1000g_phase1_snps_hg38_21_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/1000G_phase1.snps.hg38.vcf.gz" + res_1000g_phase1_snps_hg38_21_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/1000G_phase1.snps.hg38.vcf.gz.tbi" + dbsnp_138_hg38_21_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/dbsnp_138.hg38.vcf.gz" + dbsnp_138_hg38_21_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/dbsnp_138.hg38.vcf.gz.tbi" + gnomad_r2_1_1_21_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz" + gnomad_r2_1_1_21_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi" + mills_and_1000g_indels_21_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/mills_and_1000G.indels.hg38.vcf.gz" + mills_and_1000g_indels_21_vcf_gz_tbi = 
"${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/mills_and_1000G.indels.hg38.vcf.gz.tbi" + haplotype_map = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/haplotype_map.txt" + dbNSFP_4_1a_21_hg38_txt_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/dbNSFP4.1a.21.txt.gz" + dbNSFP_4_1a_21_hg38_txt_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/dbNSFP4.1a.21.txt.gz.tbi" + ngscheckmate_bed = "${params.test_data_base}/data/genomics/homo_sapiens/genome/chr21/germlineresources/SNP_GRCh38_hg38_wChr.bed" + + index_salmon = "${params.test_data_base}/data/genomics/homo_sapiens/genome/index/salmon" + repeat_expansions = "${params.test_data_base}/data/genomics/homo_sapiens/genome/loci/repeat_expansions.json" + justhusky_ped = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/ped/justhusky.ped" + justhusky_minimal_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/ped/justhusky_minimal.vcf.gz" + justhusky_minimal_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/ped/justhusky_minimal.vcf.gz.tbi" + + vcfanno_tar_gz = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/vcfanno/vcfanno_grch38_module_test.tar.gz" + vcfanno_toml = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vcf/vcfanno/vcfanno.toml" + updsites_bed = "${params.test_data_base}/data/genomics/homo_sapiens/genome/updsites.bed" + + prg_input = "${params.test_data_base}/data/genomics/homo_sapiens/genome/PRG_test.zip" + crispr_functional_counts = "${params.test_data_base}/data/genomics/homo_sapiens/genome/tsv/functional_genomics_counts.tsv" + crispr_functional_library = "${params.test_data_base}/data/genomics/homo_sapiens/genome/tsv/library_functional_genomics.tsv" + + vep_cache = "${params.test_data_base}/data/genomics/homo_sapiens/genome/vep.tar.gz" + affy_array_samplesheet = "${params.test_data_base}/data/genomics/homo_sapiens/array_expression/GSE38751.csv" + affy_array_celfiles_tar = "${params.test_data_base}/data/genomics/homo_sapiens/array_expression/GSE38751_RAW.tar" + + } + 'pangenome' { + pangenome_fa = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.fa" + pangenome_fa_bgzip = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.fa.gz" + pangenome_fa_bgzip_fai = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.fa.gz.fai" + pangenome_fa_bgzip_gzi = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.fa.gz.gzi" + pangenome_paf = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.paf" + pangenome_paf_gz = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.paf.gz" + pangenome_seqwish_gfa = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.seqwish.gfa" + pangenome_smoothxg_gfa = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.smoothxg.gfa" + pangenome_gfaffix_gfa = "${params.test_data_base}/data/pangenomics/homo_sapiens/pangenome.gfaffix.gfa" + 'odgi' { + pangenome_og = "${params.test_data_base}/data/pangenomics/homo_sapiens/odgi/pangenome.og" + pangenome_lay = "${params.test_data_base}/data/pangenomics/homo_sapiens/odgi/pangenome.lay" + } + } + 'illumina' { + test_paired_end_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam" + test_paired_end_sorted_bam_bai = 
"${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai" + test_paired_end_name_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.paired_end.name.sorted.bam" + test_paired_end_markduplicates_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam" + test_paired_end_markduplicates_sorted_bam_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.bam.bai" + test_paired_end_markduplicates_sorted_referencesn_txt = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.paired_end.markduplicates.sorted.referencesn.txt" + test_paired_end_recalibrated_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam" + test_paired_end_recalibrated_sorted_bam_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai" + test_paired_end_umi_consensus_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_consensus.bam" + test_paired_end_umi_converted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_converted.bam" + test_paired_end_umi_grouped_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_grouped.bam" + test_paired_end_umi_histogram_txt = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_histogram.txt" + test_paired_end_umi_unsorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.umi_unsorted.bam" + test_paired_end_umi_unsorted_tagged_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.unsorted_tagged.bam" + test_paired_end_hla = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/example_hla_pe.bam" + test_paired_end_hla_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/example_hla_pe.sorted.bam" + test_paired_end_hla_sorted_bam_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/example_hla_pe.sorted.bam.bai" + test_rna_paired_end_sorted_chr6_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.rna.paired_end.sorted.chr6.bam" + test_rna_paired_end_sorted_chr6_bam_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test.rna.paired_end.sorted.chr6.bam.bai" + + test2_paired_end_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam" + test2_paired_end_sorted_bam_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam.bai" + test2_paired_end_name_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.name.sorted.bam" + test2_paired_end_markduplicates_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.markduplicates.sorted.bam" + test2_paired_end_markduplicates_sorted_bam_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.markduplicates.sorted.bam.bai" + test2_paired_end_recalibrated_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.recalibrated.sorted.bam" + test2_paired_end_recalibrated_sorted_bam_bai = 
"${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.recalibrated.sorted.bam.bai" + test2_paired_end_umi_consensus_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test2.paired_end.umi_consensus.bam" + test2_paired_end_umi_converted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test2.paired_end.umi_converted.bam" + test2_paired_end_umi_grouped_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test2.paired_end.umi_grouped.bam" + test2_paired_end_umi_histogram_txt = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test2.paired_end.umi_histogram.txt" + test2_paired_end_umi_unsorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test2.paired_end.umi_unsorted.bam" + test2_paired_end_umi_unsorted_tagged_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test2.paired_end.unsorted_tagged.bam" + test_paired_end_duplex_umi_unmapped_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_unmapped.bam" + test_paired_end_duplex_umi_mapped_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_mapped.bam" + test_paired_end_duplex_umi_mapped_tagged_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_mapped_tagged.bam" + test_paired_end_duplex_umi_grouped_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam" + test_paired_end_duplex_umi_duplex_consensus_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_duplex_consensus.bam" + + mitochon_standin_recalibrated_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/mitochon_standin.recalibrated.sorted.bam" + mitochon_standin_recalibrated_sorted_bam_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/mitochon_standin.recalibrated.sorted.bam.bai" + test_illumina_mt_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test_illumina_mt.bam" + test_illumina_mt_bam_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test_illumina_mt.bam.bai" + + test3_single_end_markduplicates_sorted_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/test3.single_end.markduplicates.sorted.bam" + + read_group_settings_txt = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bam/read_group_settings.txt" + + test_paired_end_sorted_cram = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram" + test_paired_end_sorted_cram_crai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram.crai" + test_paired_end_markduplicates_sorted_cram = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test.paired_end.markduplicates.sorted.cram" + test_paired_end_markduplicates_sorted_cram_crai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test.paired_end.markduplicates.sorted.cram.crai" + test_paired_end_recalibrated_sorted_cram = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram" + test_paired_end_recalibrated_sorted_cram_crai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai" + + test2_paired_end_sorted_cram 
= "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.sorted.cram" + test2_paired_end_sorted_cram_crai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.sorted.cram.crai" + test2_paired_end_markduplicates_sorted_cram = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.markduplicates.sorted.cram" + test2_paired_end_markduplicates_sorted_cram_crai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.markduplicates.sorted.cram.crai" + test2_paired_end_recalibrated_sorted_cram = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram" + test2_paired_end_recalibrated_sorted_cram_crai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram.crai" + test3_paired_end_recalibrated_sorted_cram = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test3.paired_end.recalibrated.sorted.cram" + test3_paired_end_recalibrated_sorted_cram_crai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/cram/test3.paired_end.recalibrated.sorted.cram.crai" + + test_1_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz" + test_2_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz" + test_umi_1_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz" + test_umi_2_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz" + test2_1_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test2_1.fastq.gz" + test2_2_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test2_2.fastq.gz" + test2_umi_1_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test2.umi_1.fastq.gz" + test2_umi_2_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test2.umi_2.fastq.gz" + test_rnaseq_1_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz" + test_rnaseq_2_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz" + test_paired_end_duplex_umi_1_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test_duplex_umi_1.fastq.gz" + test_paired_end_duplex_umi_2_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/fastq/test_duplex_umi_2.fastq.gz" + + test_baserecalibrator_table = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test.baserecalibrator.table" + test2_baserecalibrator_table = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test2.baserecalibrator.table" + test_pileups_table = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test.pileups.table" + test2_pileups_table = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test2.pileups.table" + + test_paired_end_sorted_dragstrmodel = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test_paired_end_sorted_dragstrmodel.txt" + + test_genomicsdb_tar_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test_genomicsdb.tar.gz" + test_pon_genomicsdb_tar_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test_pon_genomicsdb.tar.gz" + + test2_haplotc_ann_vcf_gz = 
"${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz" + test2_haplotc_ann_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz.tbi" + test_haplotc_cnn_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test_haplotcaller.cnn.vcf.gz" + test_haplotc_cnn_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test_haplotcaller.cnn.vcf.gz.tbi" + + test2_haplotc_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz" + test2_haplotc_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.vcf.gz.tbi" + + test2_recal = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.recal" + test2_recal_idx = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.recal.idx" + test2_tranches = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.tranches" + test2_allele_specific_recal = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2_allele_specific.recal" + test2_allele_specific_recal_idx = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2_allele_specific.recal.idx" + test2_allele_specific_tranches = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2_allele_specific.tranches" + + test_test2_paired_mutect2_calls_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/paired_mutect2_calls/test_test2_paired_mutect2_calls.vcf.gz" + test_test2_paired_mutect2_calls_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/paired_mutect2_calls/test_test2_paired_mutect2_calls.vcf.gz.tbi" + test_test2_paired_mutect2_calls_vcf_gz_stats = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/paired_mutect2_calls/test_test2_paired_mutect2_calls.vcf.gz.stats" + test_test2_paired_mutect2_calls_f1r2_tar_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/paired_mutect2_calls/test_test2_paired_mutect2_calls.f1r2.tar.gz" + test_test2_paired_mutect2_calls_artifact_prior_tar_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test_test2_paired_mutect2_calls.artifact-prior.tar.gz" + test_test2_paired_segmentation_table = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test_test2_paired.segmentation.table" + test_test2_paired_contamination_table = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/test_test2_paired.contamination.table" + + test_genome_vcf = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gvcf/test.genome.vcf" + test_genome_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gvcf/test.genome.vcf.gz" + test_genome_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gvcf/test.genome.vcf.gz.tbi" + test_genome_vcf_idx = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gvcf/test.genome.vcf.idx" + + test_genome_vcf_ud = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/svd/test.genome.vcf.UD" + test_genome_vcf_mu = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/svd/test.genome.vcf.mu" + test_genome_vcf_bed = 
"${params.test_data_base}/data/genomics/homo_sapiens/illumina/svd/test.genome.vcf.bed" + + test2_genome_vcf = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gvcf/test2.genome.vcf" + test2_genome_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gvcf/test2.genome.vcf.gz" + test2_genome_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gvcf/test2.genome.vcf.gz.tbi" + test2_genome_vcf_idx = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gvcf/test2.genome.vcf.idx" + + test_genome21_indels_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/test.genome_21.somatic_sv.vcf.gz" + test_genome21_indels_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/test.genome_21.somatic_sv.vcf.gz.tbi" + + test_mpileup = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/mpileup/test.mpileup.gz" + test2_mpileup = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/mpileup/test2.mpileup.gz" + + test_broadpeak = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/broadpeak/test.broadPeak" + test2_broadpeak = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/broadpeak/test2.broadPeak" + + test_narrowpeak = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/narrowpeak/test.narrowPeak" + test2_narrowpeak = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/narrowpeak/test2.narrowPeak" + + test_yak = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/yak/test.yak" + test2_yak = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/yak/test2.yak" + + cutandrun_bedgraph_test_1 = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bedgraph/cutandtag_h3k27me3_test_1.bedGraph" + cutandrun_bedgraph_test_2 = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bedgraph/cutandtag_igg_test_1.bedGraph" + na24385_chr22_coverage = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bedgraph/NA24385_coverage.bed" + + empty_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/empty.vcf.gz" + empty_vcf_gz_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/empty.vcf.gz.tbi" + + simulated_sv = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz" + simulated_sv_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv.vcf.gz.tbi" + simulated_sv2 = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv2.vcf.gz" + simulated_sv2_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/chr21/simulated_sv2.vcf.gz.tbi" + + test_rnaseq_vcf = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/test.rnaseq.vcf" + test_sv_vcf = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/sv_query.vcf.gz" + test_sv_vcf_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/sv_query.vcf.gz.tbi" + na24385_chr22_sv_vcf = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz" + na24385_chr22_sv_vcf_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz.tbi" + genmod_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/genmod.vcf.gz" + genmod_annotate_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/test_annotate.vcf.gz" + genmod_models_vcf_gz = 
"${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/test_models.vcf.gz" + genmod_score_vcf_gz = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/test_score.vcf.gz" + + test_mito_vcf = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/vcf/NA12878_chrM.vcf.gz" + + test_pytor = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/pytor/test.pytor" + rank_model = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/genmod/svrank_model_-v1.8-.ini" + + test_flowcell = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bcl/flowcell.tar.gz" + test_flowcell_samplesheet = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/bcl/flowcell_samplesheet.csv" + + varlociraptor_scenario = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/varlociraptor/scenario.yml" + + contig_ploidy_priors_table = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/gatk/contig_ploidy_priors_table.tsv" + + purecn_ex1_bam = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/purecn/purecn_ex1.bam" + purecn_ex1_bai = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/purecn/purecn_ex1.bam.bai" + purecn_ex1_interval = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/purecn/purecn_ex1_intervals.txt" + purecn_ex1_normal = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/purecn/purecn_ex1_normal.txt.gz" + purecn_ex2_normal = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/purecn/purecn_ex2_normal.txt.gz" + purecn_normalpanel_vcf = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/purecn/purecn_normalpanel.vcf.gz" + purecn_normalpanel_tbi = "${params.test_data_base}/data/genomics/homo_sapiens/illumina/purecn/purecn_normalpanel.vcf.gz.tbi" + } + 'gene_set_analysis' { + gct = "${params.test_data_base}/data/genomics/homo_sapiens/gene_set_analysis/P53_6samples_collapsed_symbols.gct" + cls = "${params.test_data_base}/data/genomics/homo_sapiens/gene_set_analysis/P53_6samples.cls" + gmx = "${params.test_data_base}/data/genomics/homo_sapiens/gene_set_analysis/c1.symbols.reduced.gmx" + } + 'cnvkit' { + amplicon_cnr = "https://raw.githubusercontent.com/etal/cnvkit/v0.9.9/test/formats/amplicon.cnr" + amplicon_cns = "https://raw.githubusercontent.com/etal/cnvkit/v0.9.9/test/formats/amplicon.cns" + } + } + 'generic' { + 'csv' { + test_csv = "${params.test_data_base}/data/generic/csv/test.csv" + } + 'notebooks' { + rmarkdown = "${params.test_data_base}/data/generic/notebooks/rmarkdown/rmarkdown_notebook.Rmd" + ipython_md = "${params.test_data_base}/data/generic/notebooks/jupyter/ipython_notebook.md" + ipython_ipynb = "${params.test_data_base}/data/generic/notebooks/jupyter/ipython_notebook.ipynb" + } + 'tar' { + tar_gz = "${params.test_data_base}/data/generic/tar/hello.tar.gz" + } + 'tsv' { + test_tsv = "${params.test_data_base}/data/generic/tsv/test.tsv" + } + 'txt' { + hello = "${params.test_data_base}/data/generic/txt/hello.txt" + } + 'unsorted_data' { + 'unsorted_text' { + genome_file = "${params.test_data_base}/data/generic/unsorted_data/unsorted_text/test.genome" + intervals = "${params.test_data_base}/data/generic/unsorted_data/unsorted_text/test.bed" + numbers_csv = "${params.test_data_base}/data/generic/unsorted_data/unsorted_text/test.csv" + } + } + } + } +} diff --git a/conf/test_hg19.config b/conf/test_hg19.config new file mode 100644 index 0000000..3623854 --- /dev/null +++ b/conf/test_hg19.config @@ -0,0 +1,42 @@ +/* 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/variantbenchmarking -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 16
+    max_memory = 100.GB
+    max_time   = '8.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input  = 'assets/samplesheet_HG002.csv'
+    outdir = 'results'
+
+    // Genome references
+    genome   = 'GRCh37'
+    analysis = 'germline' // or 'somatic'
+    method   = 'truvari,svanalyzer' // not working for now: wittyer, vcfdist
+
+    similarity      = 0 // determines the sequence similarity level in benchmarking.
+    standardization = true
+    preprocess      = "normalization, deduplication"
+    //bnd_to_inv    = true
+
+    sample    = "HG002" // available samples: SEQC2, HG002
+    truth     = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.vcf.gz"
+    high_conf = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.bed"
+
+}
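// Assuming conf/test_hg19.config is registered as a `test_hg19` profile in
// nextflow.config (the profile wiring is not shown in this diff), a local smoke test
// could look like:
//
//     nextflow run nf-core/variantbenchmarking -profile test_hg19,docker --outdir results
//
// Note that `truth` and `high_conf` above default to absolute local paths, so runs on
// other machines would need to override them (e.g. via --truth and --high_conf).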
+    standardization = true
+    preprocess = "normalization, deduplication"
+    //bnd_to_inv = true
+
+    sample = "HG002" // available samples: SEQC2, HG002
+    truth = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.vcf.gz"
+    high_conf = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.bed"
+
+}
+
diff --git a/conf/test_hg38.config b/conf/test_hg38.config
new file mode 100644
index 0000000..3336ae6
--- /dev/null
+++ b/conf/test_hg38.config
@@ -0,0 +1,53 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/variantbenchmarking -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 16
+    max_memory = '100.GB'
+    max_time   = '8.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input  = 'assets/samplesheet_HG002_hg38.csv'
+    outdir = 'results'
+
+    // Genome references
+    genome = 'hg38'
+
+    // Processes
+    analysis = 'germline' // options: 'germline', 'somatic'
+    method   = 'truvari,svanalyzer,wittyer,vcfdist' // not yet working reliably: wittyer, vcfdist
+    //harmonize = true // ATTENTION: takes a long time because of the multi-alignment process.
+    similarity = 0 // determines the sequence similarity level in benchmarking.
+    preprocess = "normalization, deduplication"
+
+    // Truvari params
+    //dup_to_ins = true // truvari cannot benchmark the DUP type; convert DUP to INS. Has to be used with similarity = 0
+
+    // Manta params
+    //bnd_to_inv = true // manta reports INV as BND
+
+    // Gridss params
+    //gridss_annotate = true
+
+    //standardization = true
+
+    sample = "HG002" // available samples: SEQC2, HG002
+    truth = "/Users/w620-admin/Desktop/nf-core/dataset/hg38/HG002_CMRG_V1_Truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.vcf.gz"
+    high_conf = "/Users/w620-admin/Desktop/nf-core/dataset/hg38/HG002_CMRG_V1_Truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.bed"
+
+}
diff --git a/conf/test_hg38_somatic.config b/conf/test_hg38_somatic.config
new file mode 100644
index 0000000..e8abecc
--- /dev/null
+++ b/conf/test_hg38_somatic.config
@@ -0,0 +1,39 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
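+    This profile exercises the somatic benchmarking branch, using the SEQC2
+    truth set defined in the params below.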
+
+    Use as follows:
+        nextflow run nf-core/variantbenchmarking -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 16
+    max_memory = '100.GB'
+    max_time   = '8.h'
+
+    // Input data
+    // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
+    // TODO nf-core: Give any required params for the test so that command line flags are not needed
+    input  = 'assets/samplesheet_SEQC2.csv'
+    outdir = 'results'
+
+    // Genome references
+    genome   = 'hg38'
+    analysis = 'somatic' // options: 'germline', 'somatic'
+    method   = 'truvari,svanalyzer' // not working for now: wittyer, vcfdist
+
+
+    sample = "SEQC2" // available samples: SEQC2, HG002
+    truth = "/Users/w620-admin/Desktop/nf-core/dataset/hg38/SEQC_somatic_mutation_truth/sSNV_truth_set_v1.0.vcf.gz"
+    high_conf = "/Users/w620-admin/Desktop/nf-core/dataset/hg38/SEQC_somatic_mutation_truth/High-Confidence_Regions_v1.2.bed"
+    rename_chromosomes = "assets/rename_chroms_hg38.txt"
+
+}
diff --git a/conf/truth.config b/conf/truth.config
new file mode 100644
index 0000000..0a89ef4
--- /dev/null
+++ b/conf/truth.config
@@ -0,0 +1,24 @@
+
+params {
+    // Truth set regions for each genome build, analysis type and sample
+    truth {
+        'GRCh38' {
+            'germline' {
+                'HG002'{
+                    sv_bed  = "/Users/w620-admin/Desktop/nf-core/dataset/hg38/NIST_GIAB/HG002_SVs_Tier1_v0.6.bed"
+                    snv_bed = "/Users/w620-admin/Desktop/nf-core/dataset/hg38/NIST_GIAB/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed"
+                }
+            }
+            'somatic' {
+                'HG002'{
+                    sv_bed  = '/Users/w620-admin/Desktop/nf-core/dataset/hg38/HG002_CMRG_V1_Truth/HG002_GRCh38_CMRG_SV_v1.00.bed'
+                    snv_bed = '/Users/w620-admin/Desktop/nf-core/dataset/hg38/HG002_CMRG_V1_Truth/HG002_GRCh38_CMRG_smallvar_v1.00.bed'
+                }
+                'SEQC2'{
+                    snv_bed = '/Users/w620-admin/Desktop/nf-core/dataset/hg38/SEQC_somatic_mutation_truth/High-Confidence_Regions_v1.2.bed'
+                    sv_bed  = '/Users/w620-admin/Desktop/nf-core/dataset/hg38/SEQC_somatic_mutation_truth/High-Confidence_Regions_v1.2.bed'
+                }
+            }
+        }
+    }
+}
diff --git a/lib/WorkflowVariantbenchmarking.groovy b/lib/WorkflowVariantbenchmarking.groovy
old mode 100755
new mode 100644
diff --git a/main.nf b/main.nf
index 175c581..e7cf458 100644
--- a/main.nf
+++ b/main.nf
@@ -21,6 +21,7 @@ nextflow.enable.dsl = 2
 // This is an example of how to use getGenomeAttribute() to fetch parameters
 // from igenomes.config using `--genome`
 params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta')
+params.fai = WorkflowMain.getGenomeAttribute(params, 'fai')
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/modules.json b/modules.json
index 789c505..e608e79 100644
--- a/modules.json
+++ b/modules.json
@@ -1,27 +1,62 @@
 {
-    "name": "nf-core/variantbenchmarking",
-    "homePage": "https://github.com/nf-core/variantbenchmarking",
-    "repos": {
-        "https://github.com/nf-core/modules.git": {
-            "modules": {
-                "nf-core": {
-                    "custom/dumpsoftwareversions": {
-                        "branch": "master",
-                        "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93",
-                        "installed_by": ["modules"]
-                    },
-                    "fastqc": {
-                        "branch": "master",
-                        "git_sha": "c9488585ce7bd35ccd2a30faa2371454c8112fb9",
-                        "installed_by": ["modules"]
-                    },
-                    "multiqc": {
-                        "branch": "master",
-                        "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93",
-                        "installed_by": ["modules"]
-                    }
-                }
-            }
+    "name":
"nf-core/variantbenchmarking", + "homePage": "https://github.com/nf-core/variantbenchmarking", + "repos": { + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "bcftools/annotate": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bcftools/norm": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "fastqc": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "happy/happy": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "happy/sompy": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "multiqc": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "tabix/tabix": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "truvari/bench": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "survivor/stats": { + "branch": "master", + "git_sha": "00d03755abd2cac54a039525716aeabea9efb9e7", + "installed_by": ["modules"] + } } + } } + } } diff --git a/modules/local/addhead.nf b/modules/local/addhead.nf new file mode 100644 index 0000000..89a2a50 --- /dev/null +++ b/modules/local/addhead.nf @@ -0,0 +1,38 @@ +process ADDHEAD { + tag "$meta.id $meta2.caller" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'quay.io/biocontainers/bcftools:1.18--h8b25389_0' }" + + input: + tuple val(meta), val(meta2), path(vcf), path(header) + + output: + tuple val(meta), val(meta2), path("*.vcf.gz") , emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + cat $header $vcf > ${prefix}_rehead.vcf + + bcftools \\ + sort \\ + --output ${prefix}.vcf.gz \\ + --output-type z \\ + --temp-dir . \\ + ${prefix}_rehead.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/awk_sort.nf b/modules/local/awk_sort.nf new file mode 100644 index 0000000..cb2dc29 --- /dev/null +++ b/modules/local/awk_sort.nf @@ -0,0 +1,50 @@ +process AWK_SORT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0':
+        'quay.io/biocontainers/bcftools:1.17--haef29d1_0' }"
+
+    input:
+    tuple val(meta), val(meta2), path(vcf)
+
+    output:
+    tuple val(meta), val(meta2), path("*.tmp.vcf.gz"), path("*.tmp.vcf.gz.tbi"), emit: vcf
+    path "versions.yml"                                                        , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args    = task.ext.args ?: ''
+    def prefix  = task.ext.prefix ?: "${meta.id}"
+    def zipname = vcf.getBaseName()
+
+    if (vcf.getExtension() == "gz"){
+        """
+        # decompress to a working copy instead of unpacking the staged input in place
+        bgzip -dc $vcf > $zipname
+        # keep the header on top and sort the body by chromosome and position
+        cat $zipname | awk '\$1 ~ /^#/ {print \$0;next} {print \$0 | "sort -k1,1 -k2,2n"}' > ${zipname}.tmp.vcf
+        bgzip -c ${zipname}.tmp.vcf > ${zipname}.tmp.vcf.gz
+        tabix -p vcf ${zipname}.tmp.vcf.gz
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+        END_VERSIONS
+        """
+    }
+    else{
+        """
+        # keep the header on top and sort the body by chromosome and position
+        cat $vcf | awk '\$1 ~ /^#/ {print \$0;next} {print \$0 | "sort -k1,1 -k2,2n"}' > ${zipname}.tmp.vcf
+        bgzip -c ${zipname}.tmp.vcf > ${zipname}.tmp.vcf.gz
+        tabix -p vcf ${zipname}.tmp.vcf.gz
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+        END_VERSIONS
+        """
+    }
+}
diff --git a/modules/local/bamsurgeon_evaluator.nf b/modules/local/bamsurgeon_evaluator.nf
new file mode 100644
index 0000000..384a515
--- /dev/null
+++ b/modules/local/bamsurgeon_evaluator.nf
@@ -0,0 +1,42 @@
+process BAMSURGEON_EVALUATOR {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda ""
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'docker://lethalfang/bamsurgeon:1.2':
+        'lethalfang/bamsurgeon:1.2' }"
+
+    input:
+    tuple val(meta), val(meta2), path(vcf), path(tbi), path(truth_vcf), path(truth_tbi)
+    tuple path(fasta), path(fai)
+    val(muttype)
+
+    output:
+    tuple val(meta), val(meta2), path("*.vcf"), emit: bench
+    path "versions.yml"                       , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    python3 /usr/local/bamsurgeon/scripts/evaluator.py \\
+        -v $vcf \\
+        -t $truth_vcf \\
+        -f $fasta \\
+        -m $muttype \\
+        $args \\
+        --fp ${prefix}.falsepositives.vcf \\
+        --tp ${prefix}.truepositives.vcf \\
+        --fn ${prefix}.falsenegatives.vcf
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bamsurgeon: v1.2
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/bcftools_view.nf b/modules/local/bcftools_view.nf
new file mode 100644
index 0000000..eab11f7
--- /dev/null
+++ b/modules/local/bcftools_view.nf
@@ -0,0 +1,48 @@
+process BCFTOOLS_VIEW {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0':
+        'quay.io/biocontainers/bcftools:1.18--h8b25389_0' }"
+
+    input:
+    tuple val(meta), val(meta2), path(vcf), path(index)
+
+
+    output:
+    tuple val(meta), val(meta2), path("*.vcf") , emit: vcf
+    path "versions.yml"                        , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
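+    // Keep only calls on the main chromosomes; whether contig names carry the
+    // "chr" prefix is decided from the genome build below.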
"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY" : "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y" + + """ + bcftools \\ + view \\ + $vcf \\ + $regions > ${prefix}.vcf + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/bgzip_tabix.nf b/modules/local/bgzip_tabix.nf new file mode 100644 index 0000000..887da5f --- /dev/null +++ b/modules/local/bgzip_tabix.nf @@ -0,0 +1,47 @@ +process BGZIP_TABIX { + tag "$meta.id" + label 'process_single' + + conda "" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta),val(meta2), path(vcf) + + output: + tuple val(meta),val(meta2), path("*.vcf.gz"), path("*.tbi"), emit: gz_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def zipname = vcf.getBaseName() + ".temp.vcf.gz" + + if (vcf.getExtension() == "gz"){ + """ + cp $vcf $zipname + tabix -p vcf $zipname + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } + else{ + """ + bgzip ${args2} --threads ${task.cpus} -c $vcf > $zipname + tabix -p vcf $zipname + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/local/extract_main.nf b/modules/local/extract_main.nf new file mode 100644 index 0000000..819b077 --- /dev/null +++ b/modules/local/extract_main.nf @@ -0,0 +1,33 @@ +process EXTRACT_MAIN { + tag "$meta.id" + label 'process_single' + + conda "" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta), path(bed) + + output: + tuple val(meta), path("*.txt"), emit: chr_list + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def name = bed.getBaseName() + + """ + cat $bed | awk '{print \$1}' | uniq > ${name}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + +} diff --git a/modules/local/gridss_annotation.nf b/modules/local/gridss_annotation.nf new file mode 100644 index 0000000..317f5a6 --- /dev/null +++ b/modules/local/gridss_annotation.nf @@ -0,0 +1,60 @@ +// UNTESTED +process GRIDSS_ANNOTATION { + tag "$meta.id $meta2.caller" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/gridss:2.13.2--h270b39a_0':
+        'quay.io/biocontainers/gridss:2.13.2--h270b39a_0' }"
+
+    input:
+    tuple val(meta), val(meta2), path(vcf), path(index)
+    tuple path(fasta), path(fasta_fai)
+
+    output:
+    tuple val(meta), val(meta2), path("*.vcf.gz"), path("*.vcf.gz.tbi") , emit: vcf
+    path "versions.yml"                                                , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args    = task.ext.args ?: ''
+    def prefix  = task.ext.prefix ?: "${meta.id}"
+    def genome  = params.genome.contains("38") ? "hg38": "hg19"
+    def VERSION = '2.13.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+
+    if (meta2.caller == "gridss"){
+        """
+        bgzip -dc $vcf > unzipped.vcf
+        simple_event-annotator.R \\
+            unzipped.vcf \\
+            ${prefix}.vcf \\
+            ${genome}
+
+        bgzip --threads ${task.cpus} -c ${prefix}.vcf > ${prefix}.anno.vcf.gz
+        tabix -p vcf ${prefix}.anno.vcf.gz
+
+        rm unzipped.vcf
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            gridss: ${VERSION}
+        END_VERSIONS
+        """
+    }
+    else{
+        """
+        cp $vcf ${prefix}.vcf.gz
+        cp $index ${prefix}.vcf.gz.tbi
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            gridss: ${VERSION}
+        END_VERSIONS
+        """
+
+    }
+
+}
diff --git a/modules/local/main_chroms.nf b/modules/local/main_chroms.nf
new file mode 100644
index 0000000..3078ef1
--- /dev/null
+++ b/modules/local/main_chroms.nf
@@ -0,0 +1,34 @@
+process MAIN_CHROMS {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda ""
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' :
+        'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(fai)
+
+    output:
+    tuple val(meta), path("*.sizes"), emit: sizes
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def name = fai.getBaseName()
+
+    """
+    cut -f 1,2 $fai > size
+    # keep the first 24 contigs (assumes the .fai lists chr1-22, X and Y first)
+    head -n24 size > ${name}.sizes
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+    END_VERSIONS
+    """
+
+}
diff --git a/modules/local/truvari_phab.nf b/modules/local/truvari_phab.nf
new file mode 100644
index 0000000..ea65b3a
--- /dev/null
+++ b/modules/local/truvari_phab.nf
@@ -0,0 +1,42 @@
+process TRUVARI_PHAB {
+    tag "$meta.id $meta2.caller"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'docker://kubran/truvari:v4.3.0':
+        'kubran/truvari:v4.3.0' }"
+
+    input:
+    tuple val(meta), val(meta2), path(vcf), path(tbi), path(truth_vcf), path(truth_tbi), path(bed)
+    tuple path(fasta), path(fai)
+
+    output:
+    tuple val(meta), val(meta2), path("*.vcf.gz") , emit: harmon
+    path "versions.yml"                           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
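+    // truvari phab harmonizes variant representations through multiple sequence
+    // alignment, which is expensive; when a high-confidence BED is provided it
+    // is passed through --region below to keep the runtime manageable.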
"--region $bed" : "" + + """ + truvari phab \\ + --base ${truth_vcf} \\ + --comp ${vcf} \\ + --bSample $meta.id \\ + --cSample $meta.id \\ + --reference ${fasta} \\ + --output ${prefix}.vcf.gz \\ + ${regions} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + truvari: \$(echo \$(truvari version 2>&1) | sed 's/^Truvari v//' )) + END_VERSIONS + """ +} diff --git a/modules/local/truvari_refine.nf b/modules/local/truvari_refine.nf new file mode 100644 index 0000000..e10d618 --- /dev/null +++ b/modules/local/truvari_refine.nf @@ -0,0 +1,38 @@ +process TRUVARI_REFINE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://kubran/truvari:v4.3.0': + 'kubran/truvari:v4.3.0' }" + + input: + tuple val(meta), val(meta2), path(bench) + each path(bed) + tuple path(fasta), path(fai) + + output: + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + truvari refine \\ + --use-original-vcfs \\ + --reference $fasta \\ + --regions $bed \\ + $bench + . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + truvari: \$(echo \$(truvari version 2>&1) | sed 's/^Truvari v//' )) + END_VERSIONS + """ +} diff --git a/modules/local/vcf_genotype_annotator.nf b/modules/local/vcf_genotype_annotator.nf new file mode 100644 index 0000000..9fa366a --- /dev/null +++ b/modules/local/vcf_genotype_annotator.nf @@ -0,0 +1,36 @@ +process VCF_GENOTYPE_ANNOTATOR { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'ddocker://griffithlab/vatools:5.1.10' + 'griffithlab/vatools:5.1.10' }" + + input: + tuple val(meta),val(meta2), path(vcf) + + output: + tuple val(meta),val(meta2), path("*.{vcf}"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + vcf-genotype-annotator \\ + $vcf \\ + ${meta.id} \\ + $args + -o ${prefix}.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' ) + END_VERSIONS + """ +} diff --git a/modules/local/vcfdist.nf b/modules/local/vcfdist.nf new file mode 100644 index 0000000..0ae9939 --- /dev/null +++ b/modules/local/vcfdist.nf @@ -0,0 +1,41 @@ +process VCFDIST { + tag "$meta.id $meta2.caller" + label 'process_single' + + conda "" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://timd1/vcfdist:v2.3.2' : + 'timd1/vcfdist:v2.3.2' }" + + input: + tuple val(meta),val(meta2), path(vcf), path(tbi), path(truth_vcf), path(truth_tbi), path(bed) + tuple path(fasta), path(fai) + + output: + tuple val(meta), path("*.tsv,vcf"), emit: bench + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions = bed ? 
"-b $bed" : "" + + """ + vcfdist \\ + ${vcf} \\ + ${truth_vcf} \\ + $fasta \\ + -p ${prefix} \\ + ${regions} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vcfdist: \$(echo \$(vcfdist --version 2>&1) | sed 's/^.*vcfdist v//') + END_VERSIONS + """ + +} diff --git a/modules/local/wittyer.nf b/modules/local/wittyer.nf new file mode 100644 index 0000000..20db34a --- /dev/null +++ b/modules/local/wittyer.nf @@ -0,0 +1,53 @@ +process WITTYER { + tag "$meta.id $meta2.caller" + label 'process_single' + + conda "" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://kubran/wittyer:0.3.3.0' : + 'kubran/wittyer:0.3.3.0' }" + + input: + tuple val(meta),val(meta2), path(vcf), path(tbi), path(truth_vcf), path(truth_tbi), path(bed) + path(config) + + output: + tuple val(meta), path("*ConfigFileUsed.json") , emit: config + tuple val(meta), path("*.Stats.json") , emit: report + tuple val(meta), path("*eval.vcf.gz") , emit: bench_vcf + tuple val(meta), path("*eval.vcf.gz.tbi") , emit: bench_vcf_gzi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions = bed ? "--includeBed=$bed" : "" + def config = config ? "--configFile=$config" : "" + + """ + mkdir bench + dotnet /opt/Wittyer/Wittyer.dll \\ + --truthVcf=${truth_vcf} \\ + --inputVcf=${vcf} \\ + --outputDirectory=bench \\ + ${regions} \\ + ${config} \\ + ${args} + + mv bench/Wittyer.ConfigFileUsed.json ${prefix}.ConfigFileUsed.json + mv bench/Wittyer.Stats.json ${prefix}.Stats.json + mv bench/*.vcf.gz ${prefix}.eval.vcf.gz + mv bench/*.vcf.gz.tbi ${prefix}.eval.vcf.gz.tbi + + rm -rf bench + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wittyer: 0.3.3.0 + END_VERSIONS + """ + +} diff --git a/modules/nf-core/bcftools/annotate/environment.yml b/modules/nf-core/bcftools/annotate/environment.yml new file mode 100644 index 0000000..273ffff --- /dev/null +++ b/modules/nf-core/bcftools/annotate/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_annotate +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/annotate/main.nf b/modules/nf-core/bcftools/annotate/main.nf new file mode 100644 index 0000000..05b9c63 --- /dev/null +++ b/modules/nf-core/bcftools/annotate/main.nf @@ -0,0 +1,63 @@ +process BCFTOOLS_ANNOTATE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'quay.io/biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(input), path(index), path(annotations), path(annotations_index), path(header_lines) + + output: + tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def header_file = header_lines ? "--header-lines ${header_lines}" : '' + def annotations_file = annotations ? "--annotations ${annotations}" : '' + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? 
"bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf" + if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + bcftools \\ + annotate \\ + $args \\ + $annotations_file \\ + $header_file \\ + --output ${prefix}.${extension} \\ + --threads $task.cpus \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/annotate/meta.yml b/modules/nf-core/bcftools/annotate/meta.yml new file mode 100644 index 0000000..f3aa463 --- /dev/null +++ b/modules/nf-core/bcftools/annotate/meta.yml @@ -0,0 +1,56 @@ +name: bcftools_annotate +description: Add or remove annotations. +keywords: + - bcftools + - annotate + - vcf + - remove + - add +tools: + - annotate: + description: Add or remove annotations. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: https://samtools.github.io/bcftools/bcftools.html#annotate + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Query VCF or BCF file, can be either uncompressed or compressed + - index: + type: file + description: Index of the query VCF or BCF file + - annotations: + type: file + description: Bgzip-compressed file with annotations + - annotations_index: + type: file + description: Index of the annotations file + - header_lines: + type: file + description: Contains lines to append to the output VCF header +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
+        [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - vcf:
+      type: file
+      description: Compressed annotated VCF file
+      pattern: "*{vcf,vcf.gz,bcf,bcf.gz}"
+authors:
+  - "@projectoriented"
+  - "@ramprasadn"
maintainers:
+  - "@projectoriented"
+  - "@ramprasadn"
diff --git a/modules/nf-core/bcftools/isec/environment.yml b/modules/nf-core/bcftools/isec/environment.yml
new file mode 100644
index 0000000..e798a40
--- /dev/null
+++ b/modules/nf-core/bcftools/isec/environment.yml
@@ -0,0 +1,7 @@
+name: bcftools_isec
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::bcftools=1.18
diff --git a/modules/nf-core/bcftools/isec/main.nf b/modules/nf-core/bcftools/isec/main.nf
new file mode 100644
index 0000000..64aae32
--- /dev/null
+++ b/modules/nf-core/bcftools/isec/main.nf
@@ -0,0 +1,33 @@
+process BCFTOOLS_ISEC {
+    tag "$meta.id $meta2.caller"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0':
+        'biocontainers/bcftools:1.18--h8b25389_0' }"
+
+    input:
+    tuple val(meta), val(meta2), path(vcfs), path(tbis)
+
+    output:
+    tuple val(meta), val(meta2), path("${prefix}"), emit: results
+    path "versions.yml"                           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    bcftools isec \\
+        $args \\
+        -p $prefix \\
+        *.vcf.gz
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/bcftools/isec/meta.yml b/modules/nf-core/bcftools/isec/meta.yml
new file mode 100644
index 0000000..94f4256
--- /dev/null
+++ b/modules/nf-core/bcftools/isec/meta.yml
@@ -0,0 +1,52 @@
+name: bcftools_isec
+description: Apply set operations to VCF files
+keywords:
+  - variant calling
+  - intersect
+  - union
+  - complement
+  - VCF
+tools:
+  - isec:
+      description: |
+        Computes intersections, unions and complements of VCF files.
+      homepage: http://samtools.github.io/bcftools/bcftools.html
+      documentation: http://www.htslib.org/doc/bcftools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcfs:
+      type: list
+      description: |
+        List containing 2 or more vcf files
+        e.g. [ 'file1.vcf', 'file2.vcf' ]
+  - tbis:
+      type: list
+      description: |
+        List containing the tbi index files corresponding to the vcfs input files
+        e.g. [ 'file1.vcf.tbi', 'file2.vcf.tbi' ]
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
[ id:'test', single_end:false ] + - results: + type: directory + description: Folder containing the set operations results perform on the vcf files + pattern: "${prefix}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bcftools/isec/tests/main.nf.test b/modules/nf-core/bcftools/isec/tests/main.nf.test new file mode 100644 index 0000000..89d5335 --- /dev/null +++ b/modules/nf-core/bcftools/isec/tests/main.nf.test @@ -0,0 +1,44 @@ +nextflow_process { + + name "Test Process BCFTOOLS_ISEC" + script "../main.nf" + process "BCFTOOLS_ISEC" + + tag "modules" + tag "modules_nfcore" + tag "bcftools" + tag "bcftools/isec" + + config "./nextflow.config" + + test("sarscov2 - [[vcf1, vcf2], [tbi1, tbi2]]") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_vcf_gz'], checkIfExists: true)], + [ + file(params.test_data['sarscov2']['illumina']['test_vcf_gz_tbi'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_vcf_gz_tbi'], checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.results, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bcftools/isec/tests/main.nf.test.snap b/modules/nf-core/bcftools/isec/tests/main.nf.test.snap new file mode 100644 index 0000000..919809b --- /dev/null +++ b/modules/nf-core/bcftools/isec/tests/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "sarscov2 - [[vcf1, vcf2], [tbi1, tbi2]]": { + "content": [ + [ + [ + { + "id": "test" + }, + [ + "0000.vcf.gz:md5,a1e45fe6d2b386fc2611766e5d2937ee", + "0000.vcf.gz.tbi:md5,7f005943c935f2b55ba3f9d4802aa09f", + "0001.vcf.gz:md5,5937e33da388e9f6992ea0b44c5c2629", + "0001.vcf.gz.tbi:md5,2cdcee9edc71f6d84325d34d78d445cc", + "README.txt:md5,10fc33b66522645600d44afbd41fb792", + "sites.txt:md5,1cea3fbde7f6d3c97f3d39036f9690df" + ] + ] + ], + [ + "versions.yml:md5,c9e98dd502f38110979feabbee9937df" + ] + ], + "timestamp": "2023-11-29T14:00:22.623656731" + } +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/isec/tests/nextflow.config b/modules/nf-core/bcftools/isec/tests/nextflow.config new file mode 100644 index 0000000..3212fe9 --- /dev/null +++ b/modules/nf-core/bcftools/isec/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = '--nfiles +2 --output-type z --no-version' +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/isec/tests/tags.yml b/modules/nf-core/bcftools/isec/tests/tags.yml new file mode 100644 index 0000000..c0fb5a1 --- /dev/null +++ b/modules/nf-core/bcftools/isec/tests/tags.yml @@ -0,0 +1,2 @@ +bcftools/isec: + - "modules/nf-core/bcftools/isec/**" diff --git a/modules/nf-core/bcftools/norm/environment.yml b/modules/nf-core/bcftools/norm/environment.yml new file mode 100644 index 0000000..74e94f2 --- /dev/null +++ b/modules/nf-core/bcftools/norm/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_norm +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/norm/main.nf b/modules/nf-core/bcftools/norm/main.nf new file mode 100644 index 0000000..379acdb --- /dev/null +++ b/modules/nf-core/bcftools/norm/main.nf @@ -0,0 +1,63 @@ +process 
BCFTOOLS_NORM { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'quay.io/biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta),val(meta2), path(vcf), path(index) + tuple path(fasta), path(fai) + tuple val(meta3), path(regions) + + output: + tuple val(meta),val(meta2), path("*.{vcf,vcf.gz,bcf,bcf.gz}") , emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '--output-type z' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf.gz" + def region = regions ? "-R ${regions}" : "" + + """ + bcftools norm \\ + --fasta-ref ${fasta} \\ + --output ${prefix}.${extension}\\ + $args \\ + --threads $task.cpus \\ + $region \\ + ${vcf} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '--output-type z' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf.gz" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/norm/meta.yml b/modules/nf-core/bcftools/norm/meta.yml new file mode 100644 index 0000000..1f3e1b6 --- /dev/null +++ b/modules/nf-core/bcftools/norm/meta.yml @@ -0,0 +1,61 @@ +name: bcftools_norm +description: Normalize VCF file +keywords: + - normalize + - norm + - variant calling + - VCF +tools: + - norm: + description: | + Normalize VCF files. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: | + The vcf file to be normalized + e.g. 'file1.vcf' + pattern: "*.{vcf,vcf.gz}" + - tbi: + type: file + description: | + An optional index of the VCF file (for when the VCF is compressed) + pattern: "*.vcf.gz.tbi" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: FASTA reference file + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: One of uncompressed VCF (.vcf), compressed VCF (.vcf.gz), compressed BCF (.bcf.gz) or uncompressed BCF (.bcf) normalized output file + pattern: "*.{vcf,vcf.gz,bcf,bcf.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" + - "@ramprasadn" +maintainers: + - "@abhi18av" + - "@ramprasadn" diff --git a/modules/nf-core/bcftools/reheader/environment.yml b/modules/nf-core/bcftools/reheader/environment.yml new file mode 100644 index 0000000..aab0dc9 --- /dev/null +++ b/modules/nf-core/bcftools/reheader/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_reheader +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/nf-core/bcftools/reheader/main.nf b/modules/nf-core/bcftools/reheader/main.nf new file mode 100644 index 0000000..a60248c --- /dev/null +++ b/modules/nf-core/bcftools/reheader/main.nf @@ -0,0 +1,68 @@ +process BCFTOOLS_REHEADER { + tag "$meta.id $meta2.caller" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'quay.io/biocontainers/bcftools:1.18--h8b25389_0' }" + + input: + tuple val(meta),val(meta2), path(vcf), path(index) + tuple path(fasta), path(fai) + + output: + tuple val(meta),val(meta2), path("*.{vcf,vcf.gz,bcf,bcf.gz}"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def fai_argument = fai ? "--fai $fai" : "" + def args2 = task.ext.args2 ?: '--output-type z' + def extension = args2.contains("--output-type b") || args2.contains("-Ob") ? "bcf.gz" : + args2.contains("--output-type u") || args2.contains("-Ou") ? "bcf" : + args2.contains("--output-type z") || args2.contains("-Oz") ? "vcf.gz" : + args2.contains("--output-type v") || args2.contains("-Ov") ? "vcf" : + "vcf" + """ + echo ${prefix} > sample.txt + + bcftools \\ + reheader \\ + $fai_argument \\ + --samples sample.txt \\ + $args \\ + --threads $task.cpus \\ + $vcf \\ + | bcftools view \\ + $args2 \\ + --output ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args2 = task.ext.args2 ?: '--output-type z' + def prefix = task.ext.prefix ?: "${meta.id}" + + def extension = args2.contains("--output-type b") || args2.contains("-Ob") ? "bcf.gz" : + args2.contains("--output-type u") || args2.contains("-Ou") ? "bcf" : + args2.contains("--output-type z") || args2.contains("-Oz") ? "vcf.gz" : + args2.contains("--output-type v") || args2.contains("-Ov") ? 
"vcf" : + "vcf" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/reheader/meta.yml b/modules/nf-core/bcftools/reheader/meta.yml new file mode 100644 index 0000000..690d4ea --- /dev/null +++ b/modules/nf-core/bcftools/reheader/meta.yml @@ -0,0 +1,63 @@ +name: bcftools_reheader +description: Reheader a VCF file +keywords: + - reheader + - vcf + - update header +tools: + - reheader: + description: | + Modify header of VCF/BCF files, change sample names. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://samtools.github.io/bcftools/bcftools.html#reheader + doi: 10.1093/gigascience/giab008 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF/BCF file + pattern: "*.{vcf.gz,vcf,bcf}" + - header: + type: file + description: New header to add to the VCF + pattern: "*.{header.txt}" + - samples: + type: file + description: File containing sample names to update (one sample per line) + pattern: "*.{samples.txt}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Fasta index to update header sequences with + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: VCF with updated header, bgzipped per default + pattern: "*.{vcf,vcf.gz,bcf,bcf.gz}" +authors: + - "@bjohnnyd" + - "@jemten" + - "@ramprasadn" +maintainers: + - "@bjohnnyd" + - "@jemten" + - "@ramprasadn" diff --git a/modules/nf-core/bcftools/reheader/tests/bcf.config b/modules/nf-core/bcftools/reheader/tests/bcf.config new file mode 100644 index 0000000..2b7dff5 --- /dev/null +++ b/modules/nf-core/bcftools/reheader/tests/bcf.config @@ -0,0 +1,4 @@ +process { + ext.args2 = { "--no-version --output-type b" } + ext.prefix = "tested" +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/main.nf.test b/modules/nf-core/bcftools/reheader/tests/main.nf.test new file mode 100644 index 0000000..f3200cb --- /dev/null +++ b/modules/nf-core/bcftools/reheader/tests/main.nf.test @@ -0,0 +1,197 @@ +nextflow_process { + + name "Test Process BCFTOOLS_REHEADER" + script "../main.nf" + process "BCFTOOLS_REHEADER" + tag "modules" + tag "modules_nfcore" + tag "bcftools" + tag "bcftools/reheader" + + test("sarscov2 - [vcf, [], []], fai - vcf output") { + + config "./vcf.config" + when { + + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + [], + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [vcf, [], []], fai - vcf.gz output") { + + config "./vcf.gz.config" + when { + + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: 
true), + [], + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [vcf, [], []], fai - bcf output") { + + config "./bcf.config" + when { + + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + [], + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [vcf, header, []], []") { + + config "./vcf.config" + when { + + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + [] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [vcf, [], samples], fai") { + + config "./vcf.config" + when { + + process { + """ + ch_no_samples = Channel.of([ + [ id:'test', single_end:false ], + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + [] + ]) + ch_samples = Channel.of(["samples.txt", "new_name"]) + .collectFile(newLine:true) + input[0] = ch_no_samples.combine(ch_samples) + input[1] = [ + [ id:'genome' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [vcf, [], []], fai - stub") { + + options "-stub" + config "./vcf.config" + when { + + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + [], + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.vcf[0][1]).name, + process.out.versions, + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap b/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap new file mode 100644 index 0000000..112736a --- /dev/null +++ b/modules/nf-core/bcftools/reheader/tests/main.nf.test.snap @@ -0,0 +1,166 @@ +{ + "sarscov2 - [vcf, [], []], fai - vcf output": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.vcf:md5,8e722884ffb75155212a3fc053918766" + ] + ], + "1": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ], + "vcf": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.vcf:md5,8e722884ffb75155212a3fc053918766" + ] + ], + "versions": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ] + } + ], + "timestamp": "2023-11-29T13:05:44.058376693" + }, + "sarscov2 - [vcf, [], []], fai - bcf output": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.bcf.gz:md5,c31d9afd8614832c2a46d9a55682c97a" + ] 
+ ], + "1": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ], + "vcf": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.bcf.gz:md5,c31d9afd8614832c2a46d9a55682c97a" + ] + ], + "versions": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ] + } + ], + "timestamp": "2023-11-29T13:06:03.793372514" + }, + "sarscov2 - [vcf, [], []], fai - vcf.gz output": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.vcf.gz:md5,a1e45fe6d2b386fc2611766e5d2937ee" + ] + ], + "1": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ], + "vcf": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.vcf.gz:md5,a1e45fe6d2b386fc2611766e5d2937ee" + ] + ], + "versions": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ] + } + ], + "timestamp": "2023-11-29T13:05:53.954090441" + }, + "sarscov2 - [vcf, [], []], fai - stub": { + "content": [ + "tested.vcf", + [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ] + ], + "timestamp": "2023-11-29T13:06:33.549685303" + }, + "sarscov2 - [vcf, [], samples], fai": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.vcf:md5,c64c373c10b0be24b29d6f18708ec1e8" + ] + ], + "1": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ], + "vcf": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.vcf:md5,c64c373c10b0be24b29d6f18708ec1e8" + ] + ], + "versions": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ] + } + ], + "timestamp": "2023-11-29T13:06:23.474745156" + }, + "sarscov2 - [vcf, header, []], []": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.vcf:md5,3189bc9a720d5d5d3006bf72d91300cb" + ] + ], + "1": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ], + "vcf": [ + [ + { + "id": "test", + "single_end": false + }, + "tested.vcf:md5,3189bc9a720d5d5d3006bf72d91300cb" + ] + ], + "versions": [ + "versions.yml:md5,fbf8ac8da771b6295a47392003f983ce" + ] + } + ], + "timestamp": "2023-11-29T13:06:13.841648691" + } +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/tags.yml b/modules/nf-core/bcftools/reheader/tests/tags.yml new file mode 100644 index 0000000..c252941 --- /dev/null +++ b/modules/nf-core/bcftools/reheader/tests/tags.yml @@ -0,0 +1,2 @@ +bcftools/reheader: + - modules/nf-core/bcftools/reheader/** diff --git a/modules/nf-core/bcftools/reheader/tests/vcf.config b/modules/nf-core/bcftools/reheader/tests/vcf.config new file mode 100644 index 0000000..820f2ae --- /dev/null +++ b/modules/nf-core/bcftools/reheader/tests/vcf.config @@ -0,0 +1,4 @@ +process { + ext.args2 = { "--no-version" } + ext.prefix = "tested" +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/reheader/tests/vcf.gz.config b/modules/nf-core/bcftools/reheader/tests/vcf.gz.config new file mode 100644 index 0000000..c3031c3 --- /dev/null +++ b/modules/nf-core/bcftools/reheader/tests/vcf.gz.config @@ -0,0 +1,4 @@ +process { + ext.args2 = { "--no-version --output-type z" } + ext.prefix = "tested" +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/sort/environment.yml b/modules/nf-core/bcftools/sort/environment.yml new file mode 100644 index 0000000..89cf911 --- /dev/null +++ b/modules/nf-core/bcftools/sort/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/nf-core/bcftools/sort/main.nf 
b/modules/nf-core/bcftools/sort/main.nf
new file mode 100644
index 0000000..a739b09
--- /dev/null
+++ b/modules/nf-core/bcftools/sort/main.nf
@@ -0,0 +1,63 @@
+process BCFTOOLS_SORT {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0':
+        'quay.io/biocontainers/bcftools:1.18--h8b25389_0' }"
+
+    input:
+    tuple val(meta), val(meta2), path(vcf)
+
+    output:
+    tuple val(meta), val(meta2), path("*.{vcf,vcf.gz,bcf,bcf.gz}") , emit: vcf
+    // --write-index produces a CSI index by default, so accept both CSI and TBI
+    tuple val(meta), val(meta2), path("*.{csi,tbi}")               , emit: index
+    path "versions.yml"                                            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: '--output-type z'
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" :
+                    args.contains("--output-type u") || args.contains("-Ou") ? "bcf" :
+                    args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" :
+                    args.contains("--output-type v") || args.contains("-Ov") ? "vcf" :
+                    "vcf"
+
+    """
+    bcftools \\
+        sort \\
+        --output ${prefix}.${extension} \\
+        --temp-dir . \\
+        --write-index \\
+        $args \\
+        $vcf
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: '--output-type z'
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" :
+                    args.contains("--output-type u") || args.contains("-Ou") ? "bcf" :
+                    args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" :
+                    args.contains("--output-type v") || args.contains("-Ov") ? "vcf" :
+                    "vcf"
+
+    """
+    touch ${prefix}.${extension}
+    # stub index so the index emit above is satisfied
+    touch ${prefix}.${extension}.csi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/bcftools/sort/meta.yml b/modules/nf-core/bcftools/sort/meta.yml
new file mode 100644
index 0000000..84747c6
--- /dev/null
+++ b/modules/nf-core/bcftools/sort/meta.yml
@@ -0,0 +1,42 @@
+name: bcftools_sort
+description: Sorts VCF files
+keywords:
+  - sorting
+  - VCF
+  - variant calling
+tools:
+  - sort:
+      description: Sort VCF files by coordinates.
+      homepage: http://samtools.github.io/bcftools/bcftools.html
+      documentation: http://www.htslib.org/doc/bcftools.html
+      tool_dev_url: https://github.com/samtools/bcftools
+      doi: "10.1093/bioinformatics/btp352"
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcf:
+      type: file
+      description: The VCF/BCF file to be sorted
+      pattern: "*.{vcf.gz,vcf,bcf}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: Sorted VCF file + pattern: "*.{vcf.gz}" +authors: + - "@Gwennid" +maintainers: + - "@Gwennid" diff --git a/modules/nf-core/bcftools/stats/environment.yml b/modules/nf-core/bcftools/stats/environment.yml new file mode 100644 index 0000000..1a96952 --- /dev/null +++ b/modules/nf-core/bcftools/stats/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/nf-core/bcftools/stats/main.nf b/modules/nf-core/bcftools/stats/main.nf new file mode 100644 index 0000000..8a38397 --- /dev/null +++ b/modules/nf-core/bcftools/stats/main.nf @@ -0,0 +1,60 @@ +process BCFTOOLS_STATS { + tag "$meta.id $meta2.caller" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'quay.io/biocontainers/bcftools:1.18--h8b25389_0' }" + + input: + tuple val(meta), val(meta2), path(vcf), path(index) + tuple val(meta3), path(regions) + tuple val(meta4), path(targets) + tuple val(meta5), path(samples) + tuple val(meta6), path(exons) + tuple val(meta7), path(fasta) + + output: + tuple val(meta),val(meta2), path("*stats.txt"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions_file = regions ? "--regions-file ${regions}" : "" + def targets_file = targets ? "--targets-file ${targets}" : "" + def samples_file = samples ? "--samples-file ${samples}" : "" + def reference_fasta = fasta ? "--fasta-ref ${fasta}" : "" + def exons_file = exons ? "--exons ${exons}" : "" + """ + bcftools stats \\ + $args \\ + $regions_file \\ + $targets_file \\ + $samples_file \\ + $reference_fasta \\ + $exons_file \\ + $vcf > ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/stats/meta.yml b/modules/nf-core/bcftools/stats/meta.yml new file mode 100644 index 0000000..7ea2103 --- /dev/null +++ b/modules/nf-core/bcftools/stats/meta.yml @@ -0,0 +1,77 @@ +name: bcftools_stats +description: Generates stats from VCF files +keywords: + - variant calling + - stats + - VCF +tools: + - stats: + description: | + Parses VCF or BCF and produces text file stats which is suitable for + machine processing and can be plotted using plot-vcfstats. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: VCF input file + pattern: "*.{vcf}" + - tbi: + type: file + description: | + The tab index for the VCF file to be inspected. Optional: only required when parameter regions is chosen. + pattern: "*.tbi" + - regions: + type: file + description: | + Optionally, restrict the operation to regions listed in this file. (VCF, BED or tab-delimited) + - targets: + type: file + description: | + Optionally, restrict the operation to regions listed in this file (doesn't rely upon tbi index files) + - samples: + type: file + description: | + Optional, file of sample names to be included or excluded. + e.g. 'file.tsv' + - exons: + type: file + description: | + Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, optionally bgzip compressed). + e.g. 'exons.tsv.gz' + - fasta: + type: file + description: | + Faidx indexed reference sequence file to determine INDEL context. + e.g. 'reference.fa' +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: Text output file containing stats + pattern: "*_{stats.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@SusiJo" + - "@TCLamnidis" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@SusiJo" + - "@TCLamnidis" diff --git a/modules/nf-core/bcftools/view/environment.yml b/modules/nf-core/bcftools/view/environment.yml new file mode 100644 index 0000000..8937c6d --- /dev/null +++ b/modules/nf-core/bcftools/view/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/nf-core/bcftools/view/main.nf b/modules/nf-core/bcftools/view/main.nf new file mode 100644 index 0000000..63b44c8 --- /dev/null +++ b/modules/nf-core/bcftools/view/main.nf @@ -0,0 +1,55 @@ +process BCFTOOLS_VIEW { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'quay.io/biocontainers/bcftools:1.18--h8b25389_0' }" + + input: + tuple val(meta),val(meta2), path(vcf), path(index) + tuple val(meta3), path(regions) + path(targets) + path(samples) + + output: + tuple val(meta),val(meta2), path("*.gz") , emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions_file = regions ? "--regions-file ${regions}" : "" + def targets_file = targets ? "--targets-file ${targets}" : "" + def samples_file = samples ? 
"--samples-file ${samples}" : "" + """ + bcftools view \\ + --output ${prefix}.vcf.gz \\ + ${regions_file} \\ + ${targets_file} \\ + ${samples_file} \\ + $args \\ + --threads $task.cpus \\ + ${vcf} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/view/meta.yml b/modules/nf-core/bcftools/view/meta.yml new file mode 100644 index 0000000..eaa12b5 --- /dev/null +++ b/modules/nf-core/bcftools/view/meta.yml @@ -0,0 +1,64 @@ +name: bcftools_view +description: View, subset and filter VCF or BCF files by position and filtering expression. Convert between VCF and BCF +keywords: + - variant calling + - view + - bcftools + - VCF +tools: + - view: + description: | + View, subset and filter VCF or BCF files by position and filtering expression. Convert between VCF and BCF + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: | + The vcf file to be inspected. + e.g. 'file.vcf' + - index: + type: file + description: | + The tab index for the VCF file to be inspected. + e.g. 'file.tbi' + - regions: + type: file + description: | + Optionally, restrict the operation to regions listed in this file. + e.g. 'file.vcf' + - targets: + type: file + description: | + Optionally, restrict the operation to regions listed in this file (doesn't rely upon index files) + e.g. 'file.vcf' + - samples: + type: file + description: | + Optional, file of sample names to be included or excluded. + e.g. 'file.tsv' +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: VCF normalized output file + pattern: "*.{vcf.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" +maintainers: + - "@abhi18av" diff --git a/modules/nf-core/bcftools/view/tests/main.nf.test b/modules/nf-core/bcftools/view/tests/main.nf.test new file mode 100644 index 0000000..c285674 --- /dev/null +++ b/modules/nf-core/bcftools/view/tests/main.nf.test @@ -0,0 +1,103 @@ +nextflow_process { + + name "Test Process BCFTOOLS_VIEW" + script "../main.nf" + process "BCFTOOLS_VIEW" + + tag "modules" + tag "modules_nfcore" + tag "bcftools" + tag "bcftools/view" + + config "./nextflow.config" + + test("sarscov2 - [vcf, tbi], [], [], []") { + + when { + process { + """ + input[0] = [ + [ id:'out', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf_gz_tbi'], checkIfExists: true) + ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [vcf, tbi], vcf, tsv, []") { + + when { + process { + """ + input[0] = [ + [ id:'out', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf_gz_tbi'], checkIfExists: true) + ] + input[1] = file(params.test_data['sarscov2']['illumina']['test3_vcf_gz'], checkIfExists: true) + input[2] = file(params.test_data['sarscov2']['illumina']['test2_vcf_targets_tsv_gz'], checkIfExists: true) + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [vcf, tbi], [], [], [] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'out', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_vcf_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_vcf_gz_tbi'], checkIfExists: true) + ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.vcf[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bcftools/view/tests/main.nf.test.snap b/modules/nf-core/bcftools/view/tests/main.nf.test.snap new file mode 100644 index 0000000..049ac25 --- /dev/null +++ b/modules/nf-core/bcftools/view/tests/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "sarscov2 - [vcf, tbi], vcf, tsv, []": { + "content": [ + [ + [ + { + "id": "out", + "single_end": false + }, + "out.vcf.gz:md5,3c47ba1a6aa4ef9b3ad800175814d739" + ] + ], + [ + "versions.yml:md5,106d119dde844ec7fee1cdd30828bcdc" + ] + ], + "timestamp": "2023-11-29T14:27:10.724842996" + }, + "sarscov2 - [vcf, tbi], [], [], [] - stub": { + "content": [ + "out.vcf.gz", + [ + "versions.yml:md5,106d119dde844ec7fee1cdd30828bcdc" + ] + ], + "timestamp": "2023-11-29T14:27:17.445846794" + }, + "sarscov2 - [vcf, tbi], [], [], []": { + "content": [ + [ + [ + { + "id": "out", + "single_end": false + }, + "out.vcf.gz:md5,a1e45fe6d2b386fc2611766e5d2937ee" + ] + ], + [ + "versions.yml:md5,106d119dde844ec7fee1cdd30828bcdc" + ] + ], + "timestamp": 
"2023-11-29T14:27:03.328392594" + } +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/view/tests/nextflow.config b/modules/nf-core/bcftools/view/tests/nextflow.config new file mode 100644 index 0000000..b05aa50 --- /dev/null +++ b/modules/nf-core/bcftools/view/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = '--no-version' +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/view/tests/tags.yml b/modules/nf-core/bcftools/view/tests/tags.yml new file mode 100644 index 0000000..43b1f0a --- /dev/null +++ b/modules/nf-core/bcftools/view/tests/tags.yml @@ -0,0 +1,2 @@ +bcftools/view: + - "modules/nf-core/bcftools/view/**" diff --git a/modules/nf-core/bedgovcf/environment.yml b/modules/nf-core/bedgovcf/environment.yml new file mode 100644 index 0000000..61cfbcc --- /dev/null +++ b/modules/nf-core/bedgovcf/environment.yml @@ -0,0 +1,7 @@ +name: bedgovcf +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedgovcf=0.1.0 diff --git a/modules/nf-core/bedgovcf/main.nf b/modules/nf-core/bedgovcf/main.nf new file mode 100644 index 0000000..f39f404 --- /dev/null +++ b/modules/nf-core/bedgovcf/main.nf @@ -0,0 +1,49 @@ +process BEDGOVCF { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedgovcf:0.1.0--h9ee0642_0': + 'biocontainers/bedgovcf:0.1.0--h9ee0642_0' }" + + input: + tuple val(meta), path(bed), path(config) + tuple val(meta2), path(fai) + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + bedgovcf \\ + $args \\ + --bed $bed \\ + --fai $fai \\ + --config $config \\ + | bgzip --stdout --threads $task.cpus $args2 > ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedgovcf: \$(echo \$(bedgovcf --version 2>&1) | sed 's/^bedgovcf version //' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedgovcf: \$(echo \$(bedgovcf --version 2>&1) | sed 's/^bedgovcf version //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedgovcf/meta.yml b/modules/nf-core/bedgovcf/meta.yml new file mode 100644 index 0000000..406935e --- /dev/null +++ b/modules/nf-core/bedgovcf/meta.yml @@ -0,0 +1,57 @@ +--- +name: "bedgovcf" +description: Convert a BED file to a VCF file according to a YAML config +keywords: + - bed + - vcf + - conversion + - variants +tools: + - "bedgovcf": + description: "A simple tool to convert BED files to VCF files" + homepage: "https://github.com/nvnieuwk/bedgovcf" + documentation: "https://github.com/nvnieuwk/bedgovcf" + tool_dev_url: "https://github.com/nvnieuwk/bedgovcf" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - bed: + type: file + description: The BED file to convert to VCF + pattern: "*.bed" + - config: + type: file + description: The config file to use for the conversion + pattern: "*.{yml,yaml}" + - meta2: + type: map + description: | + Groovy Map containing fasta index information + e.g. 
`[ id:'test', single_end:false ]` + - fai: + type: file + description: The fasta index file + pattern: "*.fai" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: The converted VCF file + pattern: "*.vcf.gz" + +authors: + - "@nvnieuwk" diff --git a/modules/nf-core/bedgovcf/tests/main.nf.test b/modules/nf-core/bedgovcf/tests/main.nf.test new file mode 100644 index 0000000..f55fbb6 --- /dev/null +++ b/modules/nf-core/bedgovcf/tests/main.nf.test @@ -0,0 +1,77 @@ +nextflow_process { + + name "Test Process BEDGOVCF" + script "../main.nf" + process "BEDGOVCF" + tag "modules" + tag "modules_nfcore" + tag "bedgovcf" + + test("homo_sapiens - [bed, config] - fai") { + + // WARNING: md5sum between local tests and GHA tests differ + // Please change them manually after every snapshot update + when { + params { + outdir = $outputDir + } + process { + """ + raw_ch = Channel.of([ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['genome']['genome_bed'], checkIfExists: true) + ]) + config = Channel.of(["config.yaml", "header:", " - name: test", " content: test"]) + .collectFile(newLine:true, sort:'index') + input[0] = raw_ch.combine(config) + input[1] = [ + [id:"ref"], + file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + + ) + } + } + + + test("homo_sapiens - [bed, config] - fai STUB") { + options "-stub" + when { + params { + outdir = $outputDir + } + process { + """ + raw_ch = Channel.of([ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['genome']['genome_bed'], checkIfExists: true) + ]) + config = Channel.of(["config.yaml", "header:", " - name: test", " content: test"]) + .collectFile(newLine:true, sort:'index') + input[0] = raw_ch.combine(config) + input[1] = [ + [id:"ref"], + file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + + ) + } + } + +} diff --git a/modules/nf-core/bedgovcf/tests/main.nf.test.snap b/modules/nf-core/bedgovcf/tests/main.nf.test.snap new file mode 100644 index 0000000..2e34fe4 --- /dev/null +++ b/modules/nf-core/bedgovcf/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "homo_sapiens - [bed, config] - fai STUB": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.vcf.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,27e259763ddc44796207f766afb94887" + ], + "vcf": [ + [ + { + "id": "test" + }, + "test.vcf.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,27e259763ddc44796207f766afb94887" + ] + } + ], + "timestamp": "2023-11-14T12:45:41.931292108" + }, + "homo_sapiens - [bed, config] - fai": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.vcf.gz:md5,a9cb689f02723f7e0e4d674ef144b48c" + ] + ], + "1": [ + "versions.yml:md5,27e259763ddc44796207f766afb94887" + ], + "vcf": [ + [ + { + "id": "test" + }, + "test.vcf.gz:md5,a9cb689f02723f7e0e4d674ef144b48c" + ] + ], + "versions": [ + "versions.yml:md5,27e259763ddc44796207f766afb94887" + ] + } + ], + "timestamp": "2023-11-14T12:45:36.218283171" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/bedgovcf/tests/tags.yml b/modules/nf-core/bedgovcf/tests/tags.yml
new file mode 100644
index 0000000..d66b105
--- /dev/null
+++ b/modules/nf-core/bedgovcf/tests/tags.yml
@@ -0,0 +1,2 @@
+bedgovcf:
+  - modules/nf-core/bedgovcf/**
diff --git a/modules/nf-core/bedtools/coverage/environment.yml b/modules/nf-core/bedtools/coverage/environment.yml
new file mode 100644
index 0000000..53a3b4c
--- /dev/null
+++ b/modules/nf-core/bedtools/coverage/environment.yml
@@ -0,0 +1,7 @@
+name: bedtools_coverage
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::bedtools=2.30.0
diff --git a/modules/nf-core/bedtools/coverage/main.nf b/modules/nf-core/bedtools/coverage/main.nf
new file mode 100644
index 0000000..03b0545
--- /dev/null
+++ b/modules/nf-core/bedtools/coverage/main.nf
@@ -0,0 +1,39 @@
+process BEDTOOLS_COVERAGE {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--h468198e_3':
+        'quay.io/biocontainers/bedtools:2.30.0--h468198e_3' }"
+
+    input:
+    tuple val(meta), path(input_A), path(input_B)
+    path genome_file
+
+    output:
+    tuple val(meta), path("*.bed"), emit: bed
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def reference = genome_file ? "-g ${genome_file} -sorted" : ""
+    """
+    bedtools \\
+        coverage \\
+        $args \\
+        $reference \\
+        -a $input_A \\
+        -b $input_B \\
+        > ${prefix}.bed
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bedtools: \$(echo \$(bedtools --version 2>&1) | sed 's/^.*bedtools v//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/bedtools/coverage/meta.yml b/modules/nf-core/bedtools/coverage/meta.yml
new file mode 100644
index 0000000..d825ca7
--- /dev/null
+++ b/modules/nf-core/bedtools/coverage/meta.yml
@@ -0,0 +1,57 @@
+name: "bedtools_coverage"
+description: Computes both the depth and breadth of coverage of features in file B on the features in file A
+keywords:
+  - bedtools
+  - coverage
+  - bam
+  - bed
+  - gff
+  - vcf
+  - histogram
+tools:
+  - "bedtools":
+      description: "A powerful toolset for genome arithmetic"
+      homepage: "https://bedtools.readthedocs.io/en/latest/index.html"
+      documentation: "https://bedtools.readthedocs.io/en/latest/content/bedtools-suite.html"
+      tool_dev_url: "https://github.com/arq5x/bedtools2"
+      doi: "10.1093/bioinformatics/btq033"
+      licence: "['GPL v2', 'MIT']"
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input_A:
+      type: file
+      description: BAM/BED/GFF/VCF file
+      pattern: "*.{bam,bed,gff,vcf}"
+  - input_B:
+      type: file
+      description: One or more BAM/BED/GFF/VCF files
+      pattern: "*.{bam,bed,gff,vcf}"
+  - genome_file:
+      type: file
+      description: |
+        Optional reference genome 2 column file that defines the expected chromosome order
        in the input files for use with the -sorted option.
        When `genome_file` is provided, `-sorted` option is added to the command.
+      pattern: "*.{fai,txt,chromsizes}"
output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
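
For context, a minimal invocation sketch; the A/B pairing (targets vs. calls) and the FAI path are hypothetical, and passing a genome file switches on the '-sorted' code path shown above:

include { BEDTOOLS_COVERAGE } from '../modules/nf-core/bedtools/coverage/main'

workflow {
    ch_pair = Channel.of([ [id:'test'], file('targets.bed'), file('calls.bed') ])

    BEDTOOLS_COVERAGE(ch_pair, file('genome.fa.fai'))   // genome file adds '-g ... -sorted'
}
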
[ id:'test', single_end:false ] + - bed: + type: file + description: File containing coverage of sequence alignments + pattern: "*.bed" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@priyanka-surana" +maintainers: + - "@priyanka-surana" diff --git a/modules/nf-core/bedtools/genomecov/environment.yml b/modules/nf-core/bedtools/genomecov/environment.yml new file mode 100644 index 0000000..574c267 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_genomecov +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.0 diff --git a/modules/nf-core/bedtools/genomecov/main.nf b/modules/nf-core/bedtools/genomecov/main.nf new file mode 100644 index 0000000..5020882 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/main.nf @@ -0,0 +1,70 @@ +process BEDTOOLS_GENOMECOV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.0--hf5e1c6e_2' : + 'quay.io/biocontainers/bedtools:2.31.0--hf5e1c6e_2' }" + + input: + tuple val(meta), path(intervals), val(scale) + path sizes + val extension + + output: + tuple val(meta), path("*.${extension}"), emit: genomecov + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + args += (scale > 0 && scale != 1) ? " -scale $scale" : "" + if (!args_list.contains('-bg') && (scale > 0 && scale != 1)) { + args += " -bg" + } + + def prefix = task.ext.prefix ?: "${meta.id}" + if (intervals.name =~ /\.bam/) { + """ + bedtools \\ + genomecov \\ + -ibam $intervals \\ + $args \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + } else { + """ + bedtools \\ + genomecov \\ + -i $intervals \\ + -g $sizes \\ + $args \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/genomecov/meta.yml b/modules/nf-core/bedtools/genomecov/meta.yml new file mode 100644 index 0000000..cc32da9 --- /dev/null +++ b/modules/nf-core/bedtools/genomecov/meta.yml @@ -0,0 +1,59 @@ +name: bedtools_genomecov +description: Computes histograms (default), per-base reports (-d) and BEDGRAPH (-bg) summaries of feature coverage (e.g., aligned sequences) for a given genome. +keywords: + - bed + - bam + - genomecov + - bedtools + - histogram +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
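
A hedged usage sketch: a scale value that is positive and not 1 triggers both '-scale' and '-bg' per the argument logic above, the extension input names the output file, and BAM input ignores the sizes file (all names below are illustrative):

include { BEDTOOLS_GENOMECOV } from '../modules/nf-core/bedtools/genomecov/main'

workflow {
    ch_bam = Channel.of([ [id:'test'], file('sample.bam'), 0.5 ])

    BEDTOOLS_GENOMECOV(ch_bam, [], 'bedgraph')   // emits test.bedgraph
}
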
[ id:'test', single_end:false ] + - intervals: + type: file + description: BAM/BED/GFF/VCF + pattern: "*.{bam|bed|gff|vcf}" + - scale: + type: integer + description: Number containing the scale factor for the output. Set to 1 to disable. Setting to a value other than 1 will also get the -bg bedgraph output format as this is required for this command switch + - sizes: + type: file + description: Tab-delimited table of chromosome names in the first column and chromosome sizes in the second column + - extension: + type: string + description: Extension of the output file (e. g., ".bg", ".bedgraph", ".txt", ".tab", etc.) It is set arbitrarily by the user and corresponds to the file format which depends on arguments. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - genomecov: + type: file + description: Computed genome coverage file + pattern: "*.${extension}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Emiller88" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" + - "@chris-cheshire" +maintainers: + - "@Emiller88" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" + - "@chris-cheshire" diff --git a/modules/nf-core/bedtools/intersect/environment.yml b/modules/nf-core/bedtools/intersect/environment.yml new file mode 100644 index 0000000..f8bb5fb --- /dev/null +++ b/modules/nf-core/bedtools/intersect/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_intersect +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.30.0 diff --git a/modules/nf-core/bedtools/intersect/main.nf b/modules/nf-core/bedtools/intersect/main.nf new file mode 100644 index 0000000..8d6f280 --- /dev/null +++ b/modules/nf-core/bedtools/intersect/main.nf @@ -0,0 +1,59 @@ +process BEDTOOLS_INTERSECT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0' : + 'quay.io/biocontainers/bedtools:2.30.0--hc088bd4_0' }" + + input: + tuple val(meta), path(intervals1), path(intervals2) + tuple val(meta2), path(chrom_sizes) + + output: + tuple val(meta), path("*.${extension}"), emit: intersect + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + //Extension of the output file. It is set by the user via "ext.suffix" in the config. Corresponds to the file format which depends on arguments (e. g., ".bed", ".bam", ".txt", etc.). + extension = task.ext.suffix ?: "${intervals1.extension}" + def sizes = chrom_sizes ? "-g ${chrom_sizes}" : '' + if ("$intervals1" == "${prefix}.${extension}" || + "$intervals2" == "${prefix}.${extension}") + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + bedtools \\ + intersect \\ + -a $intervals1 \\ + -b $intervals2 \\ + $args \\ + $sizes \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + extension = task.ext.suffix ?: "bed" + if ("$intervals1" == "${prefix}.${extension}" || + "$intervals2" == "${prefix}.${extension}") + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/intersect/meta.yml b/modules/nf-core/bedtools/intersect/meta.yml new file mode 100644 index 0000000..8d9a08a --- /dev/null +++ b/modules/nf-core/bedtools/intersect/meta.yml @@ -0,0 +1,59 @@ +name: bedtools_intersect +description: Allows one to screen for overlaps between two sets of genomic features. +keywords: + - bed + - intersect + - overlap +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intervals1: + type: file + description: BAM/BED/GFF/VCF + pattern: "*.{bam|bed|gff|vcf}" + - intervals2: + type: file + description: BAM/BED/GFF/VCF + pattern: "*.{bam|bed|gff|vcf}" + - meta2: + type: map + description: | + Groovy Map containing reference chromosome sizes + e.g. [ id:'test' ] + - chrom_sizes: + type: file + description: Chromosome sizes file + pattern: "*{.sizes,.txt}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intersect: + type: file + description: File containing the description of overlaps found between the two features + pattern: "*.${extension}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Emiller88" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" +maintainers: + - "@Emiller88" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" diff --git a/modules/nf-core/bedtools/merge/environment.yml b/modules/nf-core/bedtools/merge/environment.yml new file mode 100644 index 0000000..961e602 --- /dev/null +++ b/modules/nf-core/bedtools/merge/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_merge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.0 diff --git a/modules/nf-core/bedtools/merge/main.nf b/modules/nf-core/bedtools/merge/main.nf new file mode 100644 index 0000000..153145b --- /dev/null +++ b/modules/nf-core/bedtools/merge/main.nf @@ -0,0 +1,47 @@ +process BEDTOOLS_MERGE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bedtools:2.31.0--hf5e1c6e_2' : + 'quay.io/biocontainers/bedtools:2.31.0--hf5e1c6e_2' }" + + input: + tuple val(meta), path(bed) + + output: + tuple val(meta), path('*.bed'), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bed" == "${prefix}.bed") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + bedtools \\ + merge \\ + -i $bed \\ + $args \\ + > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/merge/meta.yml b/modules/nf-core/bedtools/merge/meta.yml new file mode 100644 index 0000000..5565ce4 --- /dev/null +++ b/modules/nf-core/bedtools/merge/meta.yml @@ -0,0 +1,45 @@ +name: bedtools_merge +description: combines overlapping or “book-ended” features in an interval file into a single feature which spans all of the combined features. +keywords: + - bed + - merge + - bedtools + - overlapped bed +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/merge.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: Input BED file + pattern: "*.{bed}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: Overlapped bed file with combined features + pattern: "*.{bed}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Emiller88" + - "@sruthipsuresh" + - "@drpatelh" +maintainers: + - "@Emiller88" + - "@sruthipsuresh" + - "@drpatelh" diff --git a/modules/nf-core/bedtools/subtract/environment.yml b/modules/nf-core/bedtools/subtract/environment.yml new file mode 100644 index 0000000..b869a4f --- /dev/null +++ b/modules/nf-core/bedtools/subtract/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_subtract +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.30.0 diff --git a/modules/nf-core/bedtools/subtract/main.nf b/modules/nf-core/bedtools/subtract/main.nf new file mode 100644 index 0000000..028660d --- /dev/null +++ b/modules/nf-core/bedtools/subtract/main.nf @@ -0,0 +1,50 @@ +process BEDTOOLS_SUBTRACT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
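
A minimal sketch for BEDTOOLS_MERGE; the '-d 100' distance and the prefix are illustrative, and setting ext.prefix also sidesteps the input/output name-clash guard in the script above:

include { BEDTOOLS_MERGE } from '../modules/nf-core/bedtools/merge/main'

workflow {
    BEDTOOLS_MERGE(Channel.of([ [id:'test'], file('sorted.bed') ]))
}

// nextflow.config (sketch): merge features within 100 bp of each other
process {
    withName: 'BEDTOOLS_MERGE' { ext.args = '-d 100'; ext.prefix = { "${meta.id}.merged" } }
}
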
+ 'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0' : + 'biocontainers/bedtools:2.30.0--hc088bd4_0' }" + + input: + tuple val(meta), path(intervals1), path(intervals2) + + output: + tuple val(meta), path("*.bed"), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$intervals1" == "${prefix}.bed" || + "$intervals2" == "${prefix}.bed") + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + bedtools \\ + subtract \\ + -a $intervals1 \\ + -b $intervals2 \\ + $args \\ + > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/subtract/meta.yml b/modules/nf-core/bedtools/subtract/meta.yml new file mode 100644 index 0000000..0226ff1 --- /dev/null +++ b/modules/nf-core/bedtools/subtract/meta.yml @@ -0,0 +1,45 @@ +name: bedtools_subtract +description: Finds overlaps between two sets of regions (A and B), removes the overlaps from A and reports the remaining portion of A. +keywords: + - bed + - gff + - vcf + - subtract +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/subtract.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intervals1: + type: file + description: BED/GFF/VCF + pattern: "*.{bed|gff|vcf}" + - intervals2: + type: file + description: BED/GFF/VCF + pattern: "*.{bed|gff|vcf}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: File containing the difference between the two sets of features + patters: "*.bed" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sidorov-si" +maintainers: + - "@sidorov-si" diff --git a/modules/nf-core/bedtools/unionbedg/environment.yml b/modules/nf-core/bedtools/unionbedg/environment.yml new file mode 100644 index 0000000..895a72f --- /dev/null +++ b/modules/nf-core/bedtools/unionbedg/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_unionbedg +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.0 diff --git a/modules/nf-core/bedtools/unionbedg/main.nf b/modules/nf-core/bedtools/unionbedg/main.nf new file mode 100644 index 0000000..68b9054 --- /dev/null +++ b/modules/nf-core/bedtools/unionbedg/main.nf @@ -0,0 +1,50 @@ +process BEDTOOLS_UNIONBEDG { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
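
For orientation, a sketch of a typical subtract call; the blacklist use case and file names are hypothetical:

include { BEDTOOLS_SUBTRACT } from '../modules/nf-core/bedtools/subtract/main'

workflow {
    // drop call regions that overlap a blacklist, keeping the remainder of A
    ch_pair = Channel.of([ [id:'test'], file('calls.bed'), file('blacklist.bed') ])

    BEDTOOLS_SUBTRACT(ch_pair)
}
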
+ 'https://depot.galaxyproject.org/singularity/bedtools:2.31.0--hf5e1c6e_2' : + 'biocontainers/bedtools:2.31.0--hf5e1c6e_2' }" + + input: + tuple val(meta), path(bedgraph) + tuple val(meta2), path(chrom_sizes) + + output: + tuple val(meta), path("*.bed"), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sizes = chrom_sizes ? "-g ${chrom_sizes}" : '' + bedgraph.collect { if ("$it" == "${prefix}.bed") error "$it has the same name as the output, use \"task.ext.prefix\" to disambiguate!" } + """ + bedtools \\ + unionbedg \\ + -i $bedgraph \\ + $sizes \\ + $args \\ + > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/unionbedg/meta.yml b/modules/nf-core/bedtools/unionbedg/meta.yml new file mode 100644 index 0000000..ee73e8b --- /dev/null +++ b/modules/nf-core/bedtools/unionbedg/meta.yml @@ -0,0 +1,52 @@ +name: bedtools_unionbedg +description: Combines multiple BedGraph files into a single file +keywords: + - bed + - unionBedGraphs + - bedGraph + - comparisons + - combine +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/slop.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bedgraph: + type: file + description: | + Input BedGraph file: four column BED format, with 4th column with numerical values: integer or real, positive or negative + pattern: "*.{bedGraph,bedgraph}" + - meta2: + type: map + description: | + Groovy Map containing meta information for the reference chromosome sizes + e.g. [ id:'test' ] + - chrom_sizes: + type: file + description: Chromosome sizes file + pattern: "*{.sizes,.txt}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
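
A hedged sketch for BEDTOOLS_UNIONBEDG: several BedGraphs are grouped under one meta map before the call; the file names and the optional sizes file are illustrative:

include { BEDTOOLS_UNIONBEDG } from '../modules/nf-core/bedtools/unionbedg/main'

workflow {
    ch_bedgraphs = Channel.of([ [id:'test'], [ file('a.bedGraph'), file('b.bedGraph') ] ])

    BEDTOOLS_UNIONBEDG(ch_bedgraphs, [ [id:'ref'], file('genome.sizes') ])
}
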
[ id:'test', single_end:false ] + - bed: + type: file + description: Combined BED file with values from all bedGraph files + pattern: "*.{bed}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ekushele" +maintainers: + - "@ekushele" diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml index 9b3272b..f0c63f6 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/environment.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::multiqc=1.19 + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index f218761..3c726cf 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -4,8 +4,8 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : - 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 5f15a5f..9414c32 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test index b1e1630..eec1db1 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -31,12 +31,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot( - process.out.versions, - file(process.out.mqc_yml[0]).readLines()[0..10], - file(process.out.yml[0]).readLines()[0..7] - ).match() - } + { assert snapshot(process.out).match() } ) } } diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap index 5f59a93..4274ed5 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -1,33 +1,27 @@ { "Should run without failures": { "content": [ - [ - "versions.yml:md5,76d454d92244589d32455833f7c1ba6d" - ], - [ - "data: \"\\n\\n \\n \\n \\n \\n \\n \\n \\n\\", - " \\n\\n\\n \\n \\n\\", - " \\ \\n\\n\\n\\n \\n \\", - " \\ \\n \\n\\n\\n\\n\\", - " \\n\\n \\n \\n\\", - " \\ \\n\\n\\n\\n\\n\\n \\n\\", - " \\ \\n \\n\\n\\n\\n\\", - " \\n\\n \\n \\n\\" - ], - [ - "CUSTOM_DUMPSOFTWAREVERSIONS:", - " 
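
For context, the usual nf-core pattern for feeding this module: every process emits a versions.yml, the workflow mixes them into one channel, and the collated file is passed in. The two .out.versions sources named here are just examples taken from modules in this changeset:

include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'

workflow {
    ch_versions = Channel.empty()
    ch_versions = ch_versions.mix(BCFTOOLS_STATS.out.versions)
    ch_versions = ch_versions.mix(SURVIVOR_STATS.out.versions)

    CUSTOM_DUMPSOFTWAREVERSIONS(
        ch_versions.unique().collectFile(name: 'collated_versions.yml')
    )
}
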
python: 3.11.7", - " yaml: 5.4.1", - "TOOL1:", - " tool1: 0.11.9", - "TOOL2:", - " tool2: '1.9'", - "Workflow:" - ] + { + "0": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ], + "1": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "2": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "mqc_yml": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "versions": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "yml": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ] + } ], - "timestamp": "2024-01-09T23:01:18.710682" + "timestamp": "2023-11-03T14:43:22.157011" } -} \ No newline at end of file +} diff --git a/modules/nf-core/manta/convertinversion/main.nf b/modules/nf-core/manta/convertinversion/main.nf new file mode 100644 index 0000000..9010c3c --- /dev/null +++ b/modules/nf-core/manta/convertinversion/main.nf @@ -0,0 +1,36 @@ +process MANTA_CONVERTINVERSION { + tag "$meta.id" + label 'process_single' + label 'error_retry' + + conda "bioconda::manta=1.6.0 bioconda::samtools=1.16.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-40295ae41112676b05b649e513fe7000675e9b84:a0332aa38645fbb8969567731ce68cfb7f830ec4-0': + 'quay.io/biocontainers/mulled-v2-40295ae41112676b05b649e513fe7000675e9b84:a0332aa38645fbb8969567731ce68cfb7f830ec4-0' }" + + input: + tuple val(meta),val(meta2), path(vcf), path(index) + tuple path(fasta), path(fai) + + output: + tuple val(meta),val(meta2), path("*.vcf.gz"),path("*.vcf.gz.tbi") , emit: vcf_tabi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + convertInversion.py \$(which samtools) $fasta $vcf | bgzip --threads $task.cpus > ${prefix}.converted.vcf.gz + tabix ${prefix}.converted.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + manta: \$( configManta.py --version ) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' )) + END_VERSIONS + """ + +} diff --git a/modules/nf-core/manta/convertinversion/meta.yml b/modules/nf-core/manta/convertinversion/meta.yml new file mode 100644 index 0000000..cf9e169 --- /dev/null +++ b/modules/nf-core/manta/convertinversion/meta.yml @@ -0,0 +1,46 @@ +name: "manta_convertinversion" +description: Manta calls structural variants (SVs) and indels from mapped paired-end sequencing reads. This script reformats inversions into single inverted sequence junctions which was the format used in Manta versions <= 1.4.0. +keywords: + - structural variants + - conversion + - indels +tools: + - manta: + description: Structural variant and indel caller for mapped sequencing data + homepage: https://github.com/Illumina/manta + documentation: https://github.com/Illumina/manta/blob/v1.6.0/docs/userGuide/README.md + tool_dev_url: https://github.com/Illumina/manta + doi: "10.1093/bioinformatics/btv710" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF file produces by Manta + pattern: "*.vcf.gz" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
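
A minimal invocation sketch for MANTA_CONVERTINVERSION; the VCF/TBI pair and the reference paths are placeholders:

include { MANTA_CONVERTINVERSION } from '../modules/nf-core/manta/convertinversion/main'

workflow {
    ch_vcf = Channel.of([ [id:'HG002'], [caller:'manta'], file('manta.vcf.gz'), file('manta.vcf.gz.tbi') ])

    // reformats BND-pair inversions into single INV records, then re-indexes
    MANTA_CONVERTINVERSION(ch_vcf, [ file('genome.fasta'), file('genome.fasta.fai') ])
}
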
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: VCF file with reformatted inversions + pattern: "*.vcf.gz" + - tbi: + type: file + description: TBI file produces by Manta + pattern: "*.vcf.gz.tbi" +authors: + - "@FriederikeHanssen" diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index 7625b75..e5e14e8 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -1,7 +1,7 @@ -name: multiqc +name: MultiQC channels: - conda-forge - bioconda - defaults dependencies: - - bioconda::multiqc=1.19 + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 1b9f7c4..a908c80 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,8 +3,8 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : - 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" @@ -25,14 +25,12 @@ process MULTIQC { def args = task.ext.args ?: '' def config = multiqc_config ? "--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' """ multiqc \\ --force \\ $args \\ $config \\ $extra_config \\ - $logo \\ . cat <<-END_VERSIONS > versions.yml @@ -43,7 +41,7 @@ process MULTIQC { stub: """ - mkdir multiqc_data + touch multiqc_data touch multiqc_plots touch multiqc_report.html diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index 45a9bc3..a61223e 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,4 +1,5 @@ -name: multiqc +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: MultiQC description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: - QC diff --git a/modules/nf-core/survivor/filter/environment.yml b/modules/nf-core/survivor/filter/environment.yml new file mode 100644 index 0000000..56839ce --- /dev/null +++ b/modules/nf-core/survivor/filter/environment.yml @@ -0,0 +1,7 @@ +name: survivor_filter +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::survivor=1.0.7 diff --git a/modules/nf-core/survivor/filter/main.nf b/modules/nf-core/survivor/filter/main.nf new file mode 100644 index 0000000..d9c2557 --- /dev/null +++ b/modules/nf-core/survivor/filter/main.nf @@ -0,0 +1,64 @@ +process SURVIVOR_FILTER { + tag "$meta.id $meta2.caller" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
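
A hedged sketch of the MULTIQC call; the trailing optional inputs (config, extra config, logo) follow the standard nf-core module interface, which this hunk only partially shows, and the report source is hypothetical:

include { MULTIQC } from '../modules/nf-core/multiqc/main'

workflow {
    ch_multiqc_files = Channel.empty()
        .mix(BCFTOOLS_STATS.out.stats.map { meta, meta2, stats -> stats })

    MULTIQC(ch_multiqc_files.collect(), [], [], [])
}
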
+ 'https://depot.galaxyproject.org/singularity/survivor:1.0.7--h9a82719_1': + 'quay.io/biocontainers/survivor:1.0.7--h9a82719_1' }" + + input: + tuple val(meta),val(meta2), path(vcf_file), path(bed) // VCF file to filter and BED file with regions to ignore (NA to disable) + val(minsv) // Min SV size (-1 to disable) + val(maxsv) // Max SV size (-1 to disable) + val(minallelefreq) // Min allele frequency (0-1) + val(minnumreads) // Min number of reads support: RE flag (-1 to disable) + + output: + tuple val(meta),val(meta2), path("*.vcf"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bed_file = bed ? "${bed}" : "NA" + + if( "$vcf_file" == "${prefix}.vcf" ){ + error "Input and output names are the same, set prefix in module configuration to disambiguate!" + } + """ + SURVIVOR \\ + filter \\ + $vcf_file \\ + $bed_file \\ + $minsv \\ + $maxsv \\ + $minallelefreq \\ + $minnumreads \\ + ${prefix}.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + survivor: \$(echo \$(SURVIVOR 2>&1 | grep "Version" | sed 's/^Version: //')) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def bed_file = bed ? "${bed}" : "NA" + + if( "$vcf_file" == "${prefix}.vcf" ){ + error "Input and output names are the same, set prefix in module configuration to disambiguate!" + } + + """ + touch ${prefix}.vcf + cat <<-END_VERSIONS > versions.yml + "${task.process}": + survivor: \$(echo \$(SURVIVOR 2>&1 | grep "Version" | sed 's/^Version: //')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/survivor/filter/meta.yml b/modules/nf-core/survivor/filter/meta.yml new file mode 100644 index 0000000..4cb2905 --- /dev/null +++ b/modules/nf-core/survivor/filter/meta.yml @@ -0,0 +1,58 @@ +name: "survivor_filter" +description: Filter a vcf file based on size and/or regions to ignore +keywords: + - survivor + - filter + - vcf + - structural variants +tools: + - "survivor": + description: "Toolset for SV simulation, comparison and filtering" + homepage: "https://github.com/fritzsedlazeck/SURVIVOR/wiki" + documentation: "https://github.com/fritzsedlazeck/SURVIVOR/wiki" + tool_dev_url: "https://github.com/fritzsedlazeck/SURVIVOR" + doi: "10.1038/NCOMMS14061" + licence: "['MIT']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF file to filter + pattern: "*.{vcf}" + - bed: + type: file + description: BED file with regions to ignore (NA to disable) + - minsv: + type: integer + description: Min SV size (-1 to disable) + - maxsv: + type: integer + description: Max SV size (-1 to disable) + - minallelefreq: + type: float + description: Min allele frequency (0-1) + - minnumreads: + type: integer + description: Min number of reads support [RE flag (-1 to disable)] +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
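
A usage sketch with illustrative thresholds: keep SVs of at least 50 bp, no upper size bound, allele frequency of at least 0.01, and no read-support cut-off; the empty list stands in for the optional BED of regions to ignore:

include { SURVIVOR_FILTER } from '../modules/nf-core/survivor/filter/main'

workflow {
    ch_vcf = Channel.of([ [id:'HG002'], [caller:'delly'], file('calls.vcf'), [] ])

    SURVIVOR_FILTER(ch_vcf, 50, -1, 0.01, -1)
}
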
[ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - vcf:
+      type: file
+      description: Filtered VCF file
+      pattern: "*.{vcf}"
+authors:
+  - "@LlaneroHiboreo"
+maintainers:
+  - "@LlaneroHiboreo"
diff --git a/modules/nf-core/survivor/stats/environment.yml b/modules/nf-core/survivor/stats/environment.yml
new file mode 100644
index 0000000..4f64007
--- /dev/null
+++ b/modules/nf-core/survivor/stats/environment.yml
@@ -0,0 +1,7 @@
+name: survivor_stats
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::survivor=1.0.7
diff --git a/modules/nf-core/survivor/stats/main.nf b/modules/nf-core/survivor/stats/main.nf
new file mode 100644
index 0000000..528e94c
--- /dev/null
+++ b/modules/nf-core/survivor/stats/main.nf
@@ -0,0 +1,54 @@
+process SURVIVOR_STATS {
+    tag "$meta.id $meta2.caller"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/survivor:1.0.7--h9a82719_1':
+        'quay.io/biocontainers/survivor:1.0.7--h9a82719_1' }"
+
+    input:
+    tuple val(meta), val(meta2), path(vcf), path(index)
+    val(minsv)        // Min SV size (-1 to disable)
+    val(maxsv)        // Max SV size (-1 to disable)
+    val(minnumreads)  // Min number of reads support: RE flag (-1 to disable)
+
+    output:
+    tuple val(meta), val(meta2), path("*.stats"), emit: stats
+    path "versions.yml"                         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def name = vcf.getBaseName()
+
+    """
+    gzip -d $vcf
+
+    SURVIVOR \\
+        stats \\
+        $name \\
+        $minsv \\
+        $maxsv \\
+        $minnumreads \\
+        ${prefix}.stats
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        survivor: \$(echo \$(SURVIVOR 2>&1 | grep "Version" | sed 's/^Version: //'))
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    touch ${prefix}.stats
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        survivor: \$(echo \$(SURVIVOR 2>&1 | grep "Version" | sed 's/^Version: //'))
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/survivor/stats/meta.yml b/modules/nf-core/survivor/stats/meta.yml
new file mode 100644
index 0000000..ba450a7
--- /dev/null
+++ b/modules/nf-core/survivor/stats/meta.yml
@@ -0,0 +1,52 @@
+name: "survivor_stats"
+description: Report multiple stats over a VCF file
+keywords:
+  - survivor
+  - statistics
+  - vcf
+  - structural variants
+tools:
+  - "survivor":
+      description: "Toolset for SV simulation, comparison and filtering"
+      homepage: "https://github.com/fritzsedlazeck/SURVIVOR/wiki"
+      documentation: "https://github.com/fritzsedlazeck/SURVIVOR/wiki"
+      tool_dev_url: "https://github.com/fritzsedlazeck/SURVIVOR"
+      doi: "10.1038/NCOMMS14061"
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcf:
+      type: file
+      description: VCF file to compute stats on
+      pattern: "*.{vcf}"
+  - minsv:
+      type: integer
+      description: Min SV size (-1 to disable)
+  - maxsv:
+      type: integer
+      description: Max SV size (-1 to disable)
+  - minnumreads:
+      type: integer
+      description: Min number of reads support [RE flag (-1 to disable)]
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
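
A minimal sketch of the stats call; note that the script above gunzips the staged input in place, so the VCF is expected to arrive bgzipped, and -1 disables each of the three filters (channel contents are illustrative):

include { SURVIVOR_STATS } from '../modules/nf-core/survivor/stats/main'

workflow {
    ch_vcf = Channel.of([ [id:'HG002'], [caller:'manta'], file('calls.vcf.gz'), file('calls.vcf.gz.tbi') ])

    SURVIVOR_STATS(ch_vcf, -1, -1, -1)
}
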
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - stats: + type: file + description: File containing statistics given input VCF file + pattern: "*.{stats}" +authors: + - "@kubranarci" +maintainers: + - "@kubranarci" diff --git a/modules/nf-core/survivor/stats/tests/main.nf.test b/modules/nf-core/survivor/stats/tests/main.nf.test new file mode 100644 index 0000000..e83df57 --- /dev/null +++ b/modules/nf-core/survivor/stats/tests/main.nf.test @@ -0,0 +1,71 @@ +// nf-core modules test survivor/stats +nextflow_process { + + name "Test Process SURVIVOR_STATS" + script "../main.nf" + process "SURVIVOR_STATS" + + tag "modules" + tag "modules_nfcore" + tag "survivor" + tag "survivor/stats" + + test("homo_sapiens - haplotc - vcfgz") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_vcf_gz'], checkIfExists: true) + ] + input[1] = -1 + input[2] = -1 + input[3] = -1 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.stats, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - mutect - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_test2_paired_mutect2_calls_vcf_gz'], checkIfExists: true) + ] + input[1] = -1 + input[2] = -1 + input[3] = -1 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.stats, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/survivor/stats/tests/main.nf.test.snap b/modules/nf-core/survivor/stats/tests/main.nf.test.snap new file mode 100644 index 0000000..d629dfc --- /dev/null +++ b/modules/nf-core/survivor/stats/tests/main.nf.test.snap @@ -0,0 +1,34 @@ +{ + "homo_sapiens - haplotc - vcfgz": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.stats:md5,023a37c7d6fab374c9bdcf98c011022c" + ] + ], + [ + "versions.yml:md5,35bf44775e74ffec88173cd2093c2928" + ] + ], + "timestamp": "2024-01-15T15:37:42.562568997" + }, + "homo_sapiens - mutect - stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + [ + "versions.yml:md5,35bf44775e74ffec88173cd2093c2928" + ] + ], + "timestamp": "2024-01-15T15:27:09.132671858" + } +} \ No newline at end of file diff --git a/modules/nf-core/survivor/stats/tests/tags.yml b/modules/nf-core/survivor/stats/tests/tags.yml new file mode 100644 index 0000000..762cebe --- /dev/null +++ b/modules/nf-core/survivor/stats/tests/tags.yml @@ -0,0 +1,2 @@ +survivor/stats: + - "modules/nf-core/survivor/stats/**" diff --git a/modules/nf-core/svanalyzer/svbenchmark/environment.yml b/modules/nf-core/svanalyzer/svbenchmark/environment.yml new file mode 100644 index 0000000..12cac54 --- /dev/null +++ b/modules/nf-core/svanalyzer/svbenchmark/environment.yml @@ -0,0 +1,10 @@ +name: svanalyzer_svbenchmark + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - bioconda::edlib=1.2.3 + - bioconda::svanalyzer=0.35 diff --git a/modules/nf-core/svanalyzer/svbenchmark/main.nf b/modules/nf-core/svanalyzer/svbenchmark/main.nf new file mode 100644 index 0000000..afbcd9b --- /dev/null +++ b/modules/nf-core/svanalyzer/svbenchmark/main.nf @@ -0,0 +1,65 @@ + +process SVANALYZER_SVBENCHMARK { + tag "$meta.id $meta2.caller" + label 'process_medium' + + conda 
"bioconda::svanalyzer=0.35" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/svanalyzer:0.35--pl526_0': + 'quay.io/biocontainers/svanalyzer:0.35--pl526_0' }" + + input: + tuple val(meta),val(meta2), path(test), path(test_index), path(truth), path(truth_index), path(bed) + tuple path(fasta), path(fai) + + output: + tuple val(meta),val(meta2), path("*.falsenegatives.vcf"), emit: fns + tuple val(meta),val(meta2), path("*.falsepositives.vcf"), emit: fps + tuple val(meta),val(meta2), path("*.distances") , emit: distances + tuple val(meta),val(meta2), path("*.log") , emit: log + tuple val(meta),val(meta2), path("*.report") , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bed = bed ? "-includebed $bed" : "" + def VERSION = '0.35' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + + """ + svanalyzer \\ + benchmark \\ + $args \\ + --ref $fasta \\ + --test $test \\ + --truth $truth \\ + --prefix $prefix \\ + $bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svanalyzer: ${VERSION} + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '0.35' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + + """ + touch ${prefix}.falsenegatives.vcf + touch ${prefix}.falsepositives.vcf + touch ${prefix}.distances + touch ${prefix}.log + touch ${prefix}.report + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svanalyzer: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/svanalyzer/svbenchmark/meta.yml b/modules/nf-core/svanalyzer/svbenchmark/meta.yml new file mode 100644 index 0000000..ba72c3d --- /dev/null +++ b/modules/nf-core/svanalyzer/svbenchmark/meta.yml @@ -0,0 +1,80 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "svanalyzer_svbenchmark" +description: "SVbenchmark compares a set of “test” structural variants in VCF format to a known truth set (also in VCF format) and outputs estimates of sensitivity and specificity." +keywords: + - structural variant + - sv + - benchmarking +tools: + - "svanalyzer": + description: "SVanalyzer: tools for the analysis of structural variation in genomes" + homepage: "https://svanalyzer.readthedocs.io/en/latest/index.html" + documentation: "https://svanalyzer.readthedocs.io/en/latest/index.html" + tool_dev_url: "https://github.com/nhansen/SVanalyzer" + license: "['CC0']" + +input: + - meta: + type: map + description: | + Groovy Map containing test sample information + e.g. `[ id:'test' ]` + - meta2: + type: map + description: | + Groovy Map containing truth sample information + e.g. `[ id:'test2' ]` + - meta3: + type: map + description: | + Groovy Map containing reference genome information + e.g. 
`[ id:'test3' ]` + - test: + type: file + description: A VCF-formatted file of structural variants to test (required) + pattern: "*.{vcf,vcf.gz}" + - truth: + type: file + description: A VCF-formatted file of variants to compare against (required) + pattern: "*.{vcf,vcf.gz}" + - fasta: + type: file + description: The reference FASTA file for the supplied VCF file or files (required) + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + - bed: + type: file + description: BED File of regions from which to include variants. Used to filter both test and truth variants. + pattern: "*.{bed}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information inherited from test vcf + e.g. `[ id:'test']` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fns: + type: file + description: VCF file with False Negatives + pattern: "*.{vcf}" + - fps: + type: file + description: VCF file with False Positives + pattern: "*.{vcf}" + - distances: + type: file + description: TSV file with genomic distances and size differences between structural variants compared + pattern: "*.{distances}" + - log: + type: file + description: LOG file of the run + pattern: "*.{log}" + - report: + type: file + description: Text file reporting RECALL, PRECISION and F1. + pattern: "*.{report}" +authors: + - "@kubranarci" diff --git a/modules/nf-core/svanalyzer/svbenchmark/tests/main.nf.test b/modules/nf-core/svanalyzer/svbenchmark/tests/main.nf.test new file mode 100644 index 0000000..51466f7 --- /dev/null +++ b/modules/nf-core/svanalyzer/svbenchmark/tests/main.nf.test @@ -0,0 +1,85 @@ +nextflow_process { + + name "Test Process SVANALYZER_SVBENCHMARK" + script "../main.nf" + process "SVANALYZER_SVBENCHMARK" + tag "modules" + tag "modules_nfcore" + tag "svanalyzer" + tag "svanalyzer/svbenchmark" + + test("homo_sapiens - illumina - vcf.gz") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_test2_paired_mutect2_calls_vcf_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test2' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_vcf_gz'], checkIfExists: true) + ] + input[2] = [ + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + ] + input[3] = [ + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true) + ] + input[4] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.fns).match("fns") }, + { assert snapshot(process.out.fps).match("fps") }, + { assert snapshot(process.out.distances).match("distances") }, + { assert snapshot(process.out.report).match("report") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("homo_sapiens - illumina - vcf.gz - bed") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_test2_paired_mutect2_calls_vcf_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test2' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test2_haplotc_vcf_gz'], checkIfExists: true) + ] + input[2] = [ + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + ] + input[3] = [ + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta_fai'], checkIfExists: true) + ] + input[4] = [ +
file(params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.fns).match("bed_fns") }, + { assert snapshot(process.out.fps).match("bed_fps") }, + { assert snapshot(process.out.distances).match("bed_distances") }, + { assert snapshot(process.out.report).match("bed_report") }, + { assert snapshot(process.out.versions).match("bed_versions") } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/svanalyzer/svbenchmark/tests/main.nf.test.snap b/modules/nf-core/svanalyzer/svbenchmark/tests/main.nf.test.snap new file mode 100644 index 0000000..599697d --- /dev/null +++ b/modules/nf-core/svanalyzer/svbenchmark/tests/main.nf.test.snap @@ -0,0 +1,188 @@ +{ + "homo_sapiens - illumina - vcf.gz": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.falsenegatives.vcf:md5,7a574ad3c614f0c38a6dccb5901ddc05" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.falsepositives.vcf:md5,146f05125330a9c0c4581a5a581744a7" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.distances:md5,b6a908b1187f159429dc8c6beb6e6f57" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.log:md5,16fd250ae2adcb07c8f101f640fdce81" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.report:md5,d0c7db2466442cbdd4f87ad516e00d32" + ] + ], + "5": [ + "versions.yml:md5,6ac08491dbe591d24615c131658cf771" + ], + "distances": [ + [ + { + "id": "test" + }, + "test.distances:md5,b6a908b1187f159429dc8c6beb6e6f57" + ] + ], + "fns": [ + [ + { + "id": "test" + }, + "test.falsenegatives.vcf:md5,7a574ad3c614f0c38a6dccb5901ddc05" + ] + ], + "fps": [ + [ + { + "id": "test" + }, + "test.falsepositives.vcf:md5,146f05125330a9c0c4581a5a581744a7" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,16fd250ae2adcb07c8f101f640fdce81" + ] + ], + "report": [ + [ + { + "id": "test" + }, + "test.report:md5,d0c7db2466442cbdd4f87ad516e00d32" + ] + ], + "versions": [ + "versions.yml:md5,6ac08491dbe591d24615c131658cf771" + ] + } + ], + "timestamp": "2023-11-24T15:28:29.422156227" + }, + "homo_sapiens - illumina - vcf.gz - bed": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.falsenegatives.vcf:md5,c04105833d5f9acb5ec50af0b89a6b0f" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.falsepositives.vcf:md5,146f05125330a9c0c4581a5a581744a7" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.distances:md5,95f7b1679e805979f0c9a4a322cb77f4" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.log:md5,75f05aac48afd818f2d5e8a3144116ea" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.report:md5,aaa9225dd6e261951c020e23473375b9" + ] + ], + "5": [ + "versions.yml:md5,6ac08491dbe591d24615c131658cf771" + ], + "distances": [ + [ + { + "id": "test" + }, + "test.distances:md5,95f7b1679e805979f0c9a4a322cb77f4" + ] + ], + "fns": [ + [ + { + "id": "test" + }, + "test.falsenegatives.vcf:md5,c04105833d5f9acb5ec50af0b89a6b0f" + ] + ], + "fps": [ + [ + { + "id": "test" + }, + "test.falsepositives.vcf:md5,146f05125330a9c0c4581a5a581744a7" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,75f05aac48afd818f2d5e8a3144116ea" + ] + ], + "report": [ + [ + { + "id": "test" + }, + "test.report:md5,aaa9225dd6e261951c020e23473375b9" + ] + ], + "versions": [ + "versions.yml:md5,6ac08491dbe591d24615c131658cf771" + ] + } + ], + "timestamp": "2023-11-24T15:29:05.547215282" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/svanalyzer/svbenchmark/tests/tags.yml b/modules/nf-core/svanalyzer/svbenchmark/tests/tags.yml new file mode 100644 index 0000000..2a3bc9c --- /dev/null +++ b/modules/nf-core/svanalyzer/svbenchmark/tests/tags.yml @@ -0,0 +1,2 @@ +svanalyzer/svbenchmark: + - "modules/nf-core/svanalyzer/svbenchmark/**" diff --git a/modules/nf-core/svync/environment.yml b/modules/nf-core/svync/environment.yml new file mode 100644 index 0000000..fb7a1db --- /dev/null +++ b/modules/nf-core/svync/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "svync" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::svync=0.1.2" diff --git a/modules/nf-core/svync/main.nf b/modules/nf-core/svync/main.nf new file mode 100644 index 0000000..d28e526 --- /dev/null +++ b/modules/nf-core/svync/main.nf @@ -0,0 +1,40 @@ +process SVYNC { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/svync:0.1.2--h9ee0642_0': + 'quay.io/biocontainers/svync:0.1.2--h9ee0642_0' }" + + input: + tuple val(meta),val(meta2), path(vcf), path(tbi), path(config) + + output: + tuple val(meta),val(meta2), path("*.vcf.gz"), path("*.vcf.gz.tbi"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if ("$vcf" == "${prefix}.vcf.gz") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + + """ + svync \\ + $args \\ + --config $config \\ + --input $vcf \\ + | bgzip --threads $task.cpus $args2 > ${prefix}.vcf.gz + + tabix ${prefix}.vcf.gz + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svync: \$(svync --version | sed 's/svync version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/svync/meta.yml b/modules/nf-core/svync/meta.yml new file mode 100644 index 0000000..3846ae2 --- /dev/null +++ b/modules/nf-core/svync/meta.yml @@ -0,0 +1,59 @@ +--- +name: "svync" +description: A tool to standardize VCF files from structural variant callers +keywords: + - structural variants + - vcf + - standardization + - standardize + - sv +tools: + - "svync": + description: "A tool to standardize VCF files from structural variant callers" + homepage: "https://github.com/nvnieuwk/svync" + documentation: "https://github.com/nvnieuwk/svync" + tool_dev_url: "https://github.com/nvnieuwk/svync" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - vcf: + type: file + description: The input VCF file containing structural variants + pattern: "*.{vcf,vcf.gz}" + - tbi: + type: file + description: The index of the input VCF file containing structural variants + pattern: "*.tbi" + - meta2: + type: map + description: | + Groovy Map containing config meta information + - config: + type: file + description: The config stating how the standardization should happen + pattern: "*.{yml,yaml}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
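As an illustration only (the caller name, config path and VCF paths are placeholders), SVYNC could be fed one standardization config per caller:

include { SVYNC } from './modules/nf-core/svync/main'

workflow DEMO_SVYNC {
    ch_in = Channel.of([
        [id:'HG002'], [caller:'manta'],
        file('HG002.manta.vcf.gz'), file('HG002.manta.vcf.gz.tbi'),
        file('configs/manta.yaml')    // hypothetical caller-specific svync config
    ])
    SVYNC(ch_in)
    SVYNC.out.vcf.view()              // standardized, bgzipped and tabix-indexed VCF
}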
`[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: The standardized VCF file + pattern: "*.vcf.gz" + +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/svync/tests/main.nf.test b/modules/nf-core/svync/tests/main.nf.test new file mode 100644 index 0000000..cc618f9 --- /dev/null +++ b/modules/nf-core/svync/tests/main.nf.test @@ -0,0 +1,84 @@ +nextflow_process { + + name "Test Process SVYNC" + script "../main.nf" + process "SVYNC" + + tag "modules" + tag "modules_nfcore" + tag "svync" + + test("sarscov2 - vcf, config") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['simulated_sv'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['simulated_sv_tbi'], checkIfExists: true) + ] + + input[1] = Channel.of([ + "config.yaml", + "info:", + " SVMETHOD:", + " value: svync", + " description: Type of approach used to detect SV", + " number: 1", + " type: string", + "format:", + " GT:", + " value: 1/1", + " description: Genotype", + " number: 1", + " type: string" + ]) + .collectFile(newLine:true, sort:'index') + .map { [[], it] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf[0][0], + path(process.out.vcf[0][1]).linesGzip[20..30] + ).match() } + ) + } + + } + + test("sarscov2 - vcf, config - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['simulated_sv'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['simulated_sv_tbi'], checkIfExists: true) + ] + + input[1] = Channel.of(["config.yaml", "info:", " SVMETHOD:", " value: svync", " description: Type of approach used to detect SV", " number: 1", " type: string"]) + .collectFile(newLine:true, sort:'index') + .map { [[], it] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/svync/tests/main.nf.test.snap b/modules/nf-core/svync/tests/main.nf.test.snap new file mode 100644 index 0000000..a951b8a --- /dev/null +++ b/modules/nf-core/svync/tests/main.nf.test.snap @@ -0,0 +1,55 @@ +{ + "sarscov2 - vcf, config": { + "content": [ + { + "id": "test", + "single_end": false + }, + [ + "chr21\t6279548\t_2\tN\t\t.\tLowQual\tEND=6280162;IMPRECISE;SVLEN=614;SVMETHOD=svync;SVTYPE=DUP\tGT\t1/1", + "chr21\t6279548\t_3\tN\t\t.\tLowQual\tEND=6280162;IMPRECISE;SVLEN=614;SVMETHOD=svync;SVTYPE=INV\tGT\t1/1", + "chr21\t6497225\t_4\tN\t\t.\tLowQual\tEND=6497978;IMPRECISE;SVLEN=753;SVMETHOD=svync;SVTYPE=INV\tGT\t1/1", + "chr21\t6801798\t_5\tN\t\t.\tLowQual\tEND=6801868;IMPRECISE;SVLEN=70;SVMETHOD=svync;SVTYPE=DEL\tGT\t1/1", + "chr21\t6801868\t_6\tN\t\t.\tLowQual\tEND=6802434;IMPRECISE;SVLEN=566;SVMETHOD=svync;SVTYPE=INV\tGT\t1/1", + "chr21\t6802434\t_7\tN\t\t.\tLowQual\tEND=6802504;IMPRECISE;SVLEN=70;SVMETHOD=svync;SVTYPE=DEL\tGT\t1/1", + "chr21\t6907396\t_8\tN\t\t.\tLowQual\tEND=6907463;IMPRECISE;SVLEN=67;SVMETHOD=svync;SVTYPE=DEL\tGT\t1/1", + "chr21\t6907463\t_9\tN\t\t.\tLowQual\tEND=6908007;IMPRECISE;SVLEN=544;SVMETHOD=svync;SVTYPE=INV\tGT\t1/1", + "chr21\t6908007\t_10\tN\t\t.\tLowQual\tEND=6908074;IMPRECISE;SVLEN=67;SVMETHOD=svync;SVTYPE=DEL\tGT\t1/1", + 
"chr21\t7776662\t_11\tN\t\t.\tLowQual\tEND=7777323;IMPRECISE;SVLEN=661;SVMETHOD=svync;SVTYPE=INV\tGT\t1/1", + "chr21\t8245292\t_12\tN\t\t.\tLowQual\tEND=8245913;IMPRECISE;SVLEN=621;SVMETHOD=svync;SVTYPE=INV\tGT\t1/1" + ] + ], + "timestamp": "2024-01-23T11:22:21.730837098" + }, + "sarscov2 - vcf, config - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,dd982c7896f22ebaa0ea51d00472c96c" + ], + "vcf": [ + [ + { + "id": "test", + "single_end": false + }, + "test.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,dd982c7896f22ebaa0ea51d00472c96c" + ] + } + ], + "timestamp": "2024-01-24T12:23:01.99066345" + } +} \ No newline at end of file diff --git a/modules/nf-core/svync/tests/tags.yml b/modules/nf-core/svync/tests/tags.yml new file mode 100644 index 0000000..e63467d --- /dev/null +++ b/modules/nf-core/svync/tests/tags.yml @@ -0,0 +1,2 @@ +svync: + - "modules/nf-core/svync/**" diff --git a/modules/nf-core/tabix/bgzip/environment.yml b/modules/nf-core/tabix/bgzip/environment.yml new file mode 100644 index 0000000..4fe40c5 --- /dev/null +++ b/modules/nf-core/tabix/bgzip/environment.yml @@ -0,0 +1,7 @@ +name: tabix_bgzip +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::tabix=1.11 diff --git a/modules/nf-core/tabix/bgzip/main.nf b/modules/nf-core/tabix/bgzip/main.nf new file mode 100644 index 0000000..8e22f38 --- /dev/null +++ b/modules/nf-core/tabix/bgzip/main.nf @@ -0,0 +1,54 @@ +process TABIX_BGZIP { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta),val(meta2), path(input) + + output: + tuple val(meta),val(meta2), path("${output}") , emit: output + tuple val(meta),val(meta2), path("${output}.gzi"), emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + in_bgzip = ["gz", "bgz", "bgzf"].contains(input.getExtension()) + extension = in_bgzip ? input.getBaseName().tokenize(".")[-1] : input.getExtension() + output = in_bgzip ? "${prefix}.${extension}" : "${prefix}.${extension}.gz" + command = in_bgzip ? '-d' : '' + // Name the index according to $prefix, unless a name has been requested + if ((args.matches("(^| )-i\\b") || args.matches("(^| )--index(\$| )")) && !args.matches("(^| )-I\\b") && !args.matches("(^| )--index-name\\b")) { + args = args + " -I ${output}.gzi" + } + """ + bgzip $command -c $args -@${task.cpus} $input > ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + in_bgzip = ["gz", "bgz", "bgzf"].contains(input.getExtension()) + output = in_bgzip ? 
input.getBaseName() : "${prefix}.${input.getExtension()}.gz" + + """ + touch ${output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/bgzip/meta.yml b/modules/nf-core/tabix/bgzip/meta.yml new file mode 100644 index 0000000..621d49e --- /dev/null +++ b/modules/nf-core/tabix/bgzip/meta.yml @@ -0,0 +1,52 @@ +name: tabix_bgzip +description: Compresses/decompresses files +keywords: + - compress + - decompress + - bgzip + - tabix +tools: + - bgzip: + description: | + Bgzip compresses or decompresses files in a similar manner to, and compatible with, gzip. + homepage: https://www.htslib.org/doc/tabix.html + documentation: http://www.htslib.org/doc/bgzip.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: file to compress or to decompress +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - output: + type: file + description: Output compressed/decompressed file + pattern: "*." + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" + - "@nvnieuwk" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" + - "@nvnieuwk" diff --git a/modules/nf-core/tabix/bgziptabix/environment.yml b/modules/nf-core/tabix/bgziptabix/environment.yml new file mode 100644 index 0000000..028461c --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/environment.yml @@ -0,0 +1,7 @@ +name: tabix_bgziptabix +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::tabix=1.11 diff --git a/modules/nf-core/tabix/bgziptabix/main.nf b/modules/nf-core/tabix/bgziptabix/main.nf new file mode 100644 index 0000000..3a099b8 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/main.nf @@ -0,0 +1,47 @@ +process TABIX_BGZIPTABIX { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta),val(meta2), path(input) + + output: + tuple val(meta),val(meta2), path("*.gz"), path("*.tbi"), optional: true, emit: gz_tbi + tuple val(meta),val(meta2), path("*.gz"), path("*.csi"), optional: true, emit: gz_csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + bgzip --threads ${task.cpus} -c $args $input > ${prefix}.${input.getExtension()}.gz + tabix $args2 ${prefix}.${input.getExtension()}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${input.getExtension()}.gz + touch ${prefix}.${input.getExtension()}.gz.tbi + touch ${prefix}.${input.getExtension()}.gz.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/bgziptabix/meta.yml b/modules/nf-core/tabix/bgziptabix/meta.yml new file mode 100644 index 0000000..438aba4 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/meta.yml @@ -0,0 +1,53 @@ +name: tabix_bgziptabix +description: bgzip a sorted tab-delimited genome file and then create tabix index +keywords: + - bgzip + - compress + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tab: + type: file + description: TAB-delimited genome position file + pattern: "*.{bed,gff,sam,vcf}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gz: + type: file + description: Output compressed file + pattern: "*.{gz}" + - tbi: + type: file + description: tabix index file + pattern: "*.{gz.tbi}" + - csi: + type: file + description: tabix alternate index file + pattern: "*.{gz.csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@DLBPointon" +maintainers: + - "@maxulysse" + - "@DLBPointon" diff --git a/modules/nf-core/tabix/tabix/environment.yml b/modules/nf-core/tabix/tabix/environment.yml new file mode 100644 index 0000000..7167fb8 --- /dev/null +++ b/modules/nf-core/tabix/tabix/environment.yml @@ -0,0 +1,7 @@ +name: tabix_tabix +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::tabix=1.11 diff --git a/modules/nf-core/tabix/tabix/main.nf b/modules/nf-core/tabix/tabix/main.nf new file mode 100644 index 0000000..3c6dea7 --- /dev/null +++ b/modules/nf-core/tabix/tabix/main.nf @@ -0,0 +1,42 @@ +process TABIX_TABIX { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta),val(meta2), path(tab) + + output: + tuple val(meta),val(meta2), path("*.tbi"), optional:true, emit: tbi + tuple val(meta),val(meta2), path("*.csi"), optional:true, emit: csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + tabix $args $tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${tab}.tbi + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/tabix/meta.yml b/modules/nf-core/tabix/tabix/meta.yml new file mode 100644 index 0000000..ae5b4f4 --- /dev/null +++ b/modules/nf-core/tabix/tabix/meta.yml @@ -0,0 +1,49 @@ +name: tabix_tabix +description: create tabix index from a sorted bgzip tab-delimited genome file +keywords: + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tab: + type: file + description: TAB-delimited genome position file compressed with bgzip + pattern: "*.{bed.gz,gff.gz,sam.gz,vcf.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tbi: + type: file + description: tabix index file + pattern: "*.{tbi}" + - csi: + type: file + description: coordinate sorted index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" diff --git a/modules/nf-core/truvari/bench/environment.yml b/modules/nf-core/truvari/bench/environment.yml new file mode 100644 index 0000000..033354b --- /dev/null +++ b/modules/nf-core/truvari/bench/environment.yml @@ -0,0 +1,7 @@ +name: truvari_bench +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::truvari=4.1.0 diff --git a/modules/nf-core/truvari/bench/main.nf b/modules/nf-core/truvari/bench/main.nf new file mode 100644 index 0000000..b0831d0 --- /dev/null +++ b/modules/nf-core/truvari/bench/main.nf @@ -0,0 +1,61 @@ +process TRUVARI_BENCH { + tag "$meta.id $meta2.caller" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/truvari:4.1.0--pyhdfd78af_0': + 'quay.io/biocontainers/truvari:4.1.0--pyhdfd78af_0' }" + + input: + tuple val(meta),val(meta2), path(vcf), path(tbi), path(truth_vcf), path(truth_tbi), path(bed) + tuple path(fasta), path(fai) + + output: + tuple val(meta), path("*.fn.vcf.gz") , emit: fn_vcf + tuple val(meta), path("*.fn.vcf.gz.tbi") , emit: fn_tbi + tuple val(meta), path("*.fp.vcf.gz") , emit: fp_vcf + tuple val(meta), path("*.fp.vcf.gz.tbi") , emit: fp_tbi + tuple val(meta), path("*.tp-base.vcf.gz") , emit: tp_base_vcf + tuple val(meta), path("*.tp-base.vcf.gz.tbi") , emit: tp_base_tbi + tuple val(meta), path("*.tp-comp.vcf.gz") , emit: tp_comp_vcf + tuple val(meta), path("*.tp-comp.vcf.gz.tbi") , emit: tp_comp_tbi + tuple val(meta), path("*.summary.json") , emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions = bed ? "--includebed $bed" : "" + def convert_type = params.dup_to_ins ? "--dup-to-ins" : "" + + """ + truvari bench \\ + --base ${truth_vcf} \\ + --comp ${vcf} \\ + --reference ${fasta} \\ + --output ${prefix} \\ + --pctseq $params.similarity \\ + $convert_type \\ + ${regions} \\ + ${args} + + mv ${prefix}/fn.vcf.gz ./${prefix}.fn.vcf.gz + mv ${prefix}/fn.vcf.gz.tbi ./${prefix}.fn.vcf.gz.tbi + mv ${prefix}/fp.vcf.gz ./${prefix}.fp.vcf.gz + mv ${prefix}/fp.vcf.gz.tbi ./${prefix}.fp.vcf.gz.tbi + mv ${prefix}/tp-base.vcf.gz ./${prefix}.tp-base.vcf.gz + mv ${prefix}/tp-base.vcf.gz.tbi ./${prefix}.tp-base.vcf.gz.tbi + mv ${prefix}/tp-comp.vcf.gz ./${prefix}.tp-comp.vcf.gz + mv ${prefix}/tp-comp.vcf.gz.tbi ./${prefix}.tp-comp.vcf.gz.tbi + mv ${prefix}/summary.json ./${prefix}.summary.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + truvari: \$(echo \$(truvari version 2>&1) | sed 's/^Truvari v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/truvari/bench/meta.yml b/modules/nf-core/truvari/bench/meta.yml new file mode 100644 index 0000000..78f9abf --- /dev/null +++ b/modules/nf-core/truvari/bench/meta.yml @@ -0,0 +1,111 @@ +--- +name: "truvari_bench" +description: Given baseline and comparison sets of variants, calculate the recall/precision/f-measure +keywords: + - structural variants + - sv + - vcf + - benchmark + - comparison +tools: + - "truvari": + description: "Structural variant comparison tool for VCFs" + homepage: "https://github.com/ACEnglish/truvari" + documentation: "https://github.com/acenglish/truvari/wiki" + tool_dev_url: "https://github.com/ACEnglish/truvari" + doi: "10.1186/s13059-022-02840-6" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - vcf: + type: file + description: Input SV VCF file + pattern: "*.vcf.gz" + - tbi: + type: file + description: Input SV VCF index file + pattern: "*.vcf.gz.tbi" + - truth_vcf: + type: file + description: Input VCF file with truth SVs + pattern: "*.vcf.gz" + - truth_tbi: + type: file + description: Input VCF index file with truth SVs + pattern: "*.vcf.gz.tbi" + - bed: + type: file + description: BED file containing regions to compare + pattern: "*.bed" + - meta2: + type: map + description: | + Groovy Map containing fasta information + e.g.
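For orientation, a hedged sketch of driving TRUVARI_BENCH; sample maps and paths are hypothetical, and note the module above also reads params.similarity and params.dup_to_ins, so those would need to be set for this to run:

include { TRUVARI_BENCH } from './modules/nf-core/truvari/bench/main'

workflow DEMO_TRUVARI {
    ch_input = Channel.of([
        [id:'HG002'], [caller:'manta'],
        file('test.vcf.gz'), file('test.vcf.gz.tbi'),
        file('truth.vcf.gz'), file('truth.vcf.gz.tbi'),
        []                                        // optional BED for --includebed
    ])
    TRUVARI_BENCH(ch_input, [ file('genome.fa'), file('genome.fa.fai') ])
    TRUVARI_BENCH.out.summary.view()              // summary.json with the benchmark metrics
}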
`[ id:'test', single_end:false ]` + - fasta: + type: file + description: Reference FASTA file + pattern: "*.{fasta,fa,fna}" + - meta3: + type: map + description: | + Groovy Map containing fasta index information + e.g. `[ id:'test', single_end:false ]` + - fai: + type: file + description: Reference FASTA index file + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fn_vcf: + type: file + description: VCF file with false negatives + pattern: "*.fn.vcf.gz" + - fn_tbi: + type: file + description: VCF index file with false negatives + pattern: "*.fn.vcf.gz.tbi" + - fp_vcf: + type: file + description: VCF file with false positives + pattern: "*.fp.vcf.gz" + - fp_tbi: + type: file + description: VCF index file with false positives + pattern: "*.fp.vcf.gz.tbi" + - tp_base_vcf: + type: file + description: VCF file with base true positives + pattern: "*.tp-base.vcf.gz" + - tp_base_tbi: + type: file + description: VCF index file with base true positives + pattern: "*.tp-base.vcf.gz.tbi" + - tp_comp_vcf: + type: file + description: VCF file with compared true positives + pattern: "*.tp-comp.vcf.gz" + - tp_comp_tbi: + type: file + description: VCF index file with compared true positives + pattern: "*.tp-comp.vcf.gz.tbi" + - summary: + type: file + description: Summary JSON file with results from the benchmark + pattern: "*.summary.json" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/nextflow.config b/nextflow.config index b46c50e..1f4f8d7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,11 +12,42 @@ params { // TODO nf-core: Specify your pipeline's command line flags // Input options input = null + output = "results" + + // Preprocess specific parameters + // normalization includes breaking down multiallelic sites (-m any) + // deduplication removes duplicated variants at the same position + preprocess = "" + standardization = "" + + // Benchmarking method + method = 'truvari,svanalyzer' // not working for now: wittyer, vcfdist + + // min_sv_size affects truvari and svbenchmark in different ways! svbenchmark filters both base and comp calls + // although with truvari it is possible to filter only base, which makes more sense to me! + // I will use this parameter to filter SVs before benchmarking! + min_sv_size = 50 + + // Truvari params + dup_to_ins = "" // truvari cannot benchmark DUP type, convert DUP type to INS. Has to be used with similarity = 0 + + // Manta params + bnd_to_inv = "" // manta reports INV as BND + + // Gridss params + gridss_annotate = "" + + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false - + analysis = "germline" + + truth_config_ignore = false + sv_bed = null + snv_bed = null + sdf_file = null // MultiQC options multiqc_config = null @@ -43,7 +74,7 @@ custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null - + // Max resource options // Defaults only, expecting to be overwritten @@ -71,7 +102,7 @@ try { } // Load nf-core/variantbenchmarking custom profiles from different institutions. -// Warning: Uncomment only if a pipeline-specific institutional config already exists on nf-core/configs!
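To make the new options above concrete, a hypothetical override block (e.g. in a custom config passed with -c); the values shown are illustrative, not defaults:

params {
    input       = 'assets/samplesheet.csv'
    method      = 'truvari,svanalyzer'  // wittyer and vcfdist are not wired up yet
    min_sv_size = 50                    // filter out SVs shorter than 50 bp before benchmarking
    dup_to_ins  = true                  // truvari: convert DUP to INS; pair with similarity = 0
    analysis    = 'germline'
}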
+// Warning: Uncomment only if a pipeline-specific institutional config already exists on nf-core/configs! // try { // includeConfig "${params.custom_config_base}/pipeline/variantbenchmarking.config" // } catch (Exception e) { @@ -91,7 +122,6 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false - channels = ['conda-forge', 'bioconda', 'defaults'] apptainer.enabled = false } mamba { @@ -106,16 +136,16 @@ } docker { docker.enabled = true + docker.userEmulation = true conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' } arm { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { singularity.enabled = true @@ -169,23 +199,27 @@ executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } + test_hg37 { includeConfig 'conf/test_hg37.config' } + test_hg38 { includeConfig 'conf/test_hg38.config' } test_full { includeConfig 'conf/test_full.config' } + truth { includeConfig 'conf/truth.config' } + } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Singularity are enabled // Set to your registry if you have a mirror of containers -apptainer.registry = 'quay.io' -docker.registry = 'quay.io' -podman.registry = 'quay.io' -singularity.registry = 'quay.io' +apptainer.registry = '' +docker.registry = '' +podman.registry = '' +singularity.registry = '' // Nextflow plugins plugins { - id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet } + // Load igenomes.config if required if (!params.igenomes_ignore) { includeConfig 'conf/igenomes.config' @@ -229,9 +263,9 @@ dag { manifest { name = 'nf-core/variantbenchmarking' - author = """Kübra Narcı""" + author = """kuebra.narci@dkfz.de""" homePage = 'https://github.com/nf-core/variantbenchmarking' - description = """Nextflow variant benchmarking pipeline""" + description = """Master benchmarking pipeline for Structural Variant callers""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' version = '1.0dev' diff --git a/nextflow_schema.json b/nextflow_schema.json index 861d2ab..9cb22d3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,288 +1,296 @@ { - "$schema": "http://json-schema.org/draft-07/schema", - "$id": "https://raw.githubusercontent.com/nf-core/variantbenchmarking/master/nextflow_schema.json", - "title": "nf-core/variantbenchmarking pipeline parameters", - "description": "Nextflow variant benchmarking pipeline", - "type": "object", - "definitions": { - "input_output_options": { - "title": "Input/output options", - "type": "object", - "fa_icon": "fas fa-terminal", - "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], - "properties": { - "input": { - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your
experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/variantbenchmarking/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" - }, - "outdir": { - "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" - }, - "email": { - "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" - }, - "multiqc_title": { - "type": "string", - "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" - } - } + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/variantbenchmarking/master/nextflow_schema.json", + "title": "nf-core/variantbenchmarking pipeline parameters", + "description": "Master benchmarking pipeline for variant callers ", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": ["input", "outdir"], + "properties": { + "input": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/csv", + "schema": "assets/schema_input.json", + "pattern": "^\\S+\\.(csv|tsv|yaml)$", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/variantbenchmarking/usage#samplesheet-input).", + "fa_icon": "fas fa-file-csv" }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." - }, - "fasta": { - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. 
Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." - } - } + "outdir": { + "type": "string", + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" }, - "institutional_config_options": { - "title": "Institutional config options", - "type": "object", - "fa_icon": "fas fa-university", - "description": "Parameters used to describe centralised config profiles. These should not be edited.", - "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", - "properties": { - "custom_config_version": { - "type": "string", - "description": "Git commit id for Institutional configs.", - "default": "master", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "custom_config_base": { - "type": "string", - "description": "Base directory for Institutional configs.", - "default": "https://raw.githubusercontent.com/nf-core/configs/master", - "hidden": true, - "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", - "fa_icon": "fas fa-users-cog" - }, - "config_profile_name": { - "type": "string", - "description": "Institutional config name.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_description": { - "type": "string", - "description": "Institutional config description.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_contact": { - "type": "string", - "description": "Institutional config contact information.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, - "config_profile_url": { - "type": "string", - "description": "Institutional config URL link.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - } - } + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" }, - "max_job_request_options": { - "title": "Max job request options", - "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", - "description": "Set the top limit for requested resources for any single job.", - "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. 
These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", - "properties": { - "max_cpus": { - "type": "integer", - "description": "Maximum number of CPUs that can be requested for any single job.", - "default": 16, - "fa_icon": "fas fa-microchip", - "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" - }, - "max_memory": { - "type": "string", - "description": "Maximum amount of memory that can be requested for any single job.", - "default": "128.GB", - "fa_icon": "fas fa-memory", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" - }, - "max_time": { - "type": "string", - "description": "Maximum amount of time that can be requested for any single job.", - "default": "240.h", - "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", - "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" - } - } + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" + } + } + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + }, + "fasta": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", + "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", + "description": "Path to FASTA genome file.", + "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", + "fa_icon": "far fa-file-code" + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + } + } + }, + "institutional_config_options": { + "title": "Institutional config options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. 
These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "custom_config_version": { + "type": "string", + "description": "Git commit id for Institutional configs.", + "default": "master", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "custom_config_base": { + "type": "string", + "description": "Base directory for Institutional configs.", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true, + "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", + "fa_icon": "fas fa-users-cog" + }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_description": { + "type": "string", + "description": "Institutional config description.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_contact": { + "type": "string", + "description": "Institutional config contact information.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_url": { + "type": "string", + "description": "Institutional config URL link.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + } + } + }, + "max_job_request_options": { + "title": "Max job request options", + "type": "object", + "fa_icon": "fab fa-acquisitions-incorporated", + "description": "Set the top limit for requested resources for any single job.", + "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", + "properties": { + "max_cpus": { + "type": "integer", + "description": "Maximum number of CPUs that can be requested for any single job.", + "default": 16, + "fa_icon": "fas fa-microchip", + "hidden": true, + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + }, + "max_memory": { + "type": "string", + "description": "Maximum amount of memory that can be requested for any single job.", + "default": "128.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_memory '8.GB'`" }, - "generic_options": { - "title": "Generic options", - "type": "object", - "fa_icon": "fas fa-file-import", - "description": "Less common options for the pipeline, typically set in a config file.", - "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", - "properties": { - "help": { - "type": "boolean", - "description": "Display help text.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "version": { - "type": "boolean", - "description": "Display version and exit.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, - "publish_dir_mode": { - "type": "string", - "default": "copy", - "description": "Method used to save pipeline results to output directory.", - "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", - "fa_icon": "fas fa-copy", - "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], - "hidden": true - }, - "email_on_fail": { - "type": "string", - "description": "Email address for completion summary, only when pipeline fails.", - "fa_icon": "fas fa-exclamation-triangle", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", - "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", - "hidden": true - }, - "plaintext_email": { - "type": "boolean", - "description": "Send plain-text email instead of HTML.", - "fa_icon": "fas fa-remove-format", - "hidden": true - }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true - }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true - }, - "hook_url": { - "type": "string", - "description": "Incoming hook URL for messaging service", - "fa_icon": "fas fa-people-group", - "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", - "hidden": true - }, - "multiqc_config": { - "type": "string", - "format": "file-path", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "multiqc_logo": { - "type": "string", - "description": "Custom logo file to supply to MultiQC. 
File name must also be set in the MultiQC config file", - "fa_icon": "fas fa-image", - "hidden": true - }, - "multiqc_methods_description": { - "type": "string", - "description": "Custom MultiQC yaml file containing HTML including a methods description.", - "fa_icon": "fas fa-cog" - }, - "validate_params": { - "type": "boolean", - "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, - "fa_icon": "fas fa-check-square", - "hidden": true - }, - "validationShowHiddenParams": { - "type": "boolean", - "fa_icon": "far fa-eye-slash", - "description": "Show all params when using `--help`", - "hidden": true, - "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." - }, - "validationFailUnrecognisedParams": { - "type": "boolean", - "fa_icon": "far fa-check-circle", - "description": "Validation of parameters fails when an unrecognised parameter is found.", - "hidden": true, - "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." - }, - "validationLenientMode": { - "type": "boolean", - "fa_icon": "far fa-check-circle", - "description": "Validation of parameters in lenient more.", - "hidden": true, - "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." - } - } + "max_time": { + "type": "string", + "description": "Maximum amount of time that can be requested for any single job.", + "default": "240.h", + "fa_icon": "far fa-clock", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", + "hidden": true, + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" } + } }, - "allOf": [ - { - "$ref": "#/definitions/input_output_options" + "generic_options": { + "title": "Generic options", + "type": "object", + "fa_icon": "fas fa-file-import", + "description": "Less common options for the pipeline, typically set in a config file.", + "help_text": "These options are common to all nf-core pipelines and allow you to customise some of the core preferences for how the pipeline runs.\n\nTypically these options would be set in a Nextflow config file loaded for all pipeline runs, such as `~/.nextflow/config`.", + "properties": { + "help": { + "type": "boolean", + "description": "Display help text.", + "fa_icon": "fas fa-question-circle", + "hidden": true }, - { - "$ref": "#/definitions/reference_genome_options" + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true }, - { - "$ref": "#/definitions/institutional_config_options" + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "Method used to save pipeline results to output directory.", + "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. 
See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", + "fa_icon": "fas fa-copy", + "enum": [ + "symlink", + "rellink", + "link", + "copy", + "copyNoFollow", + "move" + ], + "hidden": true }, - { - "$ref": "#/definitions/max_job_request_options" + "email_on_fail": { + "type": "string", + "description": "Email address for completion summary, only when pipeline fails.", + "fa_icon": "fas fa-exclamation-triangle", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "help_text": "An email address to send a summary email to when the pipeline is completed - ONLY sent if the pipeline does not exit successfully.", + "hidden": true }, - { - "$ref": "#/definitions/generic_options" + "plaintext_email": { + "type": "boolean", + "description": "Send plain-text email instead of HTML.", + "fa_icon": "fas fa-remove-format", + "hidden": true + }, + "max_multiqc_email_size": { + "type": "string", + "description": "File size limit when attaching MultiQC reports to summary emails.", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "default": "25.MB", + "fa_icon": "fas fa-file-upload", + "hidden": true + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-palette", + "hidden": true + }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", + "hidden": true + }, + "multiqc_config": { + "type": "string", + "format": "file-path", + "description": "Custom config file to supply to MultiQC.", + "fa_icon": "fas fa-cog", + "hidden": true + }, + "multiqc_logo": { + "type": "string", + "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", + "fa_icon": "fas fa-image", + "hidden": true + }, + "multiqc_methods_description": { + "type": "string", + "description": "Custom MultiQC yaml file containing HTML including a methods description.", + "fa_icon": "fas fa-cog" + }, + "validate_params": { + "type": "boolean", + "description": "Boolean whether to validate parameters against the schema at runtime", + "default": true, + "fa_icon": "fas fa-check-square", + "hidden": true + }, + "validationShowHiddenParams": { + "type": "boolean", + "fa_icon": "far fa-eye-slash", + "description": "Show all params when using `--help`", + "hidden": true, + "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "validationFailUnrecognisedParams": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters fails when an unrecognised parameter is found.", + "hidden": true, + "help_text": "By default, when an unrecognised parameter is found, it returns a warning." + }, + "validationLenientMode": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters in lenient mode.", + "hidden": true, + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)."
} - ] + } + } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/reference_genome_options" + }, + { + "$ref": "#/definitions/institutional_config_options" + }, + { + "$ref": "#/definitions/max_job_request_options" + }, + { + "$ref": "#/definitions/generic_options" + } + ] } diff --git a/pyproject.toml b/pyproject.toml index 7d08e1c..0d62beb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,10 @@ -# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff. +# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. # Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. -[tool.ruff] +[tool.black] line-length = 120 -target-version = "py38" -select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"] -cache-dir = "~/.cache/ruff" +target_version = ["py37", "py38", "py39", "py310"] -[tool.ruff.isort] -known-first-party = ["nf_core"] - -[tool.ruff.per-file-ignores] -"__init__.py" = ["E402", "F401"] +[tool.isort] +profile = "black" +known_first_party = ["nf_core"] +multi_line_output = 3 diff --git a/subworkflows/local/germline_benchmark.nf b/subworkflows/local/germline_benchmark.nf new file mode 100644 index 0000000..e07f1a6 --- /dev/null +++ b/subworkflows/local/germline_benchmark.nf @@ -0,0 +1,103 @@ +// +// GERMLINE: SUBWORKFLOW FOR GERMLINE VARIANTS +// + +params.options = [:] + +include { TRUVARI_PHAB } from '../../modules/local/truvari_phab' addParams( options: params.options ) +include { TRUVARI_BENCH } from '../../modules/nf-core/truvari/bench' addParams( options: params.options ) +include { SVANALYZER_SVBENCHMARK } from '../../modules/nf-core/svanalyzer/svbenchmark' addParams( options: params.options ) +include { WITTYER } from '../../modules/local/wittyer' addParams( options: params.options ) +include { VCFDIST } from '../../modules/local/vcfdist' addParams( options: params.options ) +include { BAMSURGEON_EVALUATOR } from '../../modules/local/bamsurgeon_evaluator' addParams( options: params.options ) + +workflow GERMLINE_BENCHMARK { + take: + input_ch // channel: [val(meta),val(meta2), test_vcf, test_index , truth_vcf, truth_index] + ref // reference channel [ref.fa, ref.fa.fai] + truth_vcf // channel: [val(meta),val(meta2),truth_vcf, truth_index] + + main: + + versions=Channel.empty() + + // SV benchmarking + + if (params.method.contains('truvari')){ + + if(params.harmonize){ + // + // TRUVARI: TRUVARI_PHAB + // + TRUVARI_PHAB( + input_ch, + ref + ) + } + // + // MODULE: TRUVARI_BENCH + // + TRUVARI_BENCH( + input_ch, + ref + ) + versions = versions.mix(TRUVARI_BENCH.out.versions) + + } + + if (params.method.contains('svanalyzer')){ + // + // MODULE: SVANALYZER_SVBENCHMARK + // + // slower than truvari + SVANALYZER_SVBENCHMARK( + input_ch, + ref + ) + versions = versions.mix(SVANALYZER_SVBENCHMARK.out.versions) + + } + + if (params.method.contains('wittyer')){ + // + // MODULE: WITTYER + // + // BIG Advantage: reports by variant type + // Able to report CNV + WITTYER( + input_ch, + [] + ) + versions = versions.mix(WITTYER.out.versions) + } + + + if (params.method.contains('vcfdist')){ + // + // MODULE: VCFDIST + // + VCFDIST( + input_ch, + ref + ) + versions = versions.mix(VCFDIST.out.versions) + } + + if (params.method.contains('bamsurgeon')){ + // + // MODULE: BAMSURGEON_EVALUATOR + // + //https://github.com/adamewing/bamsurgeon/blob/master/scripts/evaluator.py + BAMSURGEON_EVALUATOR( + input_ch.map{it -> tuple(it[0],it[1], it[2], 
it[3], it[4], it[5])}, + ref, + "SV" + ) + versions = versions.mix(BAMSURGEON_EVALUATOR.out.versions) + } + + + + emit: + versions +} diff --git a/subworkflows/local/germline_benchmark_backup.nf b/subworkflows/local/germline_benchmark_backup.nf new file mode 100644 index 0000000..59607f8 --- /dev/null +++ b/subworkflows/local/germline_benchmark_backup.nf @@ -0,0 +1,162 @@ +// +// GERMLINE: SUBWORKFLOW FOR GERMLINE VARIANTS +// + +params.options = [:] + +include { TRUVARI_BENCH } from '../../modules/nf-core/truvari/bench' addParams( options: params.options ) +include { SVANALYZER_SVBENCHMARK } from '../../modules/nf-core/svanalyzer/svbenchmark' addParams( options: params.options ) +include { WITTYER } from '../../modules/local/wittyer' addParams( options: params.options ) +include { VCFDIST } from '../../modules/local/vcfdist' addParams( options: params.options ) +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_QUERY } from '../../modules/local/bcftools_view' addParams( options: params.options ) +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_TRUTH } from '../../modules/local/bcftools_view' addParams( options: params.options ) +include { ADDHEAD as ADDHEAD_TRUTH } from '../../modules/local/addhead' addParams( options: params.options ) +include { ADDHEAD as ADDHEAD_QUERY } from '../../modules/local/addhead' addParams( options: params.options ) +include { BCFTOOLS_ISEC as BCFTOOLS_ISEC_TRUTH } from '../../modules/nf-core/bcftools/isec' addParams( options: params.options ) +include { BCFTOOLS_ISEC as BCFTOOLS_ISEC_QUERY } from '../../modules/nf-core/bcftools/isec' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_1 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_2 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) + +workflow GERMLINE_BENCHMARK { + take: + input_ch // channel: [val(meta),val(meta2), test_vcf, test_index , truth_vcf, truth_index] + bed // channel: bed + ref // reference channel [ref.fa, ref.fa.fai] + truth_vcf // channel: [val(meta),val(meta2),truth_vcf, truth_index] + + main: + + versions=Channel.empty() + + // SV benchmarking + + if (params.method.contains('truvari')){ + // + // MODULE: TRUVARI_BENCH + // + TRUVARI_BENCH( + input_ch, + bed, + ref + ) + versions = versions.mix(TRUVARI_BENCH.out.versions) + } + + if (params.method.contains('svanalyzer')){ + // + // MODULE: SVANALYZER_SVBENCHMARK + // + // slower than truvari + SVANALYZER_SVBENCHMARK( + input_ch, + ref, + bed + ) + versions = versions.mix(SVANALYZER_SVBENCHMARK.out.versions) + + // get the original headers from the vcfs + // + // MODULE: BCFTOOLS_VIEW + // + BCFTOOLS_VIEW_QUERY( + input_ch.map{it -> tuple(it[0],it[1], it[2], it[3])} + ) + query_header = BCFTOOLS_VIEW_QUERY.out.header + + BCFTOOLS_VIEW_TRUTH( + truth_vcf + ) + truth_header = BCFTOOLS_VIEW_TRUTH.out.header + versions = versions.mix(BCFTOOLS_VIEW_QUERY.out.versions) + // + // MODULE: BCFTOOLS_REHEADER + // + SVANALYZER_SVBENCHMARK.out.fns.combine(truth_header, by:0) + .map{it -> tuple(it[0],it[1], it[2], it[4])} + .set{fns_header} + fns_header.view() + ADDHEAD_TRUTH( + fns_header + ) + + TABIX_TABIX_1( + ADDHEAD_TRUTH.out.vcf + ) + + ADDHEAD_TRUTH.out.vcf.join(TABIX_TABIX_1.out.tbi, by:1) + .map{it -> tuple( it[1], it[0], it[2], it[4])} + .set{fns_vcf} + + ////// + SVANALYZER_SVBENCHMARK.out.fps.combine(query_header, by:0) + .map{it -> tuple(it[0],it[1], it[2], it[4])} + .set{fps_header} + + ADDHEAD_QUERY( + fps_header + ) + versions = 
versions.mix(ADDHEAD_QUERY.out.versions) + + TABIX_TABIX_2( + ADDHEAD_QUERY.out.vcf + ) + + ADDHEAD_QUERY.out.vcf.join(TABIX_TABIX_2.out.tbi, by:1) + .map{it -> tuple( it[1], it[0], it[2], it[4])} + .set{fps_ch} + // + // MODULE: BCFTOOLS_ISEC + // + + // Find TP_comp (query) + input_ch.map{it -> tuple(it[0],it[1], it[2], it[3])} + .combine(fps_ch, by:0) + .map{it -> tuple(it[0],it[1], it[2], it[5], it[3], it[6])} + .set{query_ch} + + BCFTOOLS_ISEC_QUERY( + query_ch + ) + + // Combine the truth vcf with the reheadered FNs + truth_vcf.combine(fns_vcf, by:0) + .map{it -> tuple(it[0],it[1], it[2], it[5], it[3], it[6])} + .set{truth_ch} + + // Find TP_base (truth) + BCFTOOLS_ISEC_TRUTH( + truth_ch + ) + versions = versions.mix(BCFTOOLS_ISEC_TRUTH.out.versions) + + } + + if (params.method.contains('wittyer')){ + // + // MODULE: WITTYER + // + WITTYER( + input_ch, + bed, + [] + ) + versions = versions.mix(WITTYER.out.versions) + } + + + if (params.method.contains('vcfdist')){ + // + // MODULE: VCFDIST + // + VCFDIST( + input_ch, + ref, + bed + ) + versions = versions.mix(VCFDIST.out.versions) + } + + + emit: + versions +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 0aecf87..2c227a8 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,43 +2,55 @@ // Check input samplesheet and get read channels // -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' +params.options = [:] + +include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' addParams( options: params.options ) workflow INPUT_CHECK { take: samplesheet // file: /path/to/samplesheet.csv main: - SAMPLESHEET_CHECK ( samplesheet ) + SAMPLESHEET_CHECK (samplesheet) .csv .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } + .map{ create_vcf_channel(it) } + .set {ch_sample} emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] + ch_sample // channel: [ val(meta), val(meta2), test_vcf, svync_yaml ] + versions = SAMPLESHEET_CHECK.out.versions } -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { - // create meta map +def create_vcf_channel(LinkedHashMap row) { +// create meta map def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() + meta.id = params.sample + def meta2 = [:] + meta2.caller = row.caller // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" + def vcf_meta = [] + if (!file(row.test_vcf).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Test file does not exist!\n${row.test_vcf}" + } + + if (meta2.caller == "delly"){ + vcf_meta = [ meta, meta2, file(row.test_vcf), file("${projectDir}/assets/svync/delly.yaml")] + } + else if (meta2.caller == "gridss"){ + vcf_meta = [ meta, meta2, file(row.test_vcf), file("${projectDir}/assets/svync/gridss.yaml")] + } + else if (meta2.caller == "manta"){ + if (file("${projectDir}/assets/svync/manta.yaml").exists()){ + vcf_meta = [ meta, meta2, file(row.test_vcf), file("${projectDir}/assets/svync/manta.yaml")] + } + else { + // fall back to the generic config when the manta-specific one is missing + vcf_meta = [ meta, meta2, file(row.test_vcf), file("${projectDir}/assets/svync/default.yaml")] + } } - fastq_meta = [ meta, [ 
file(row.fastq_1), file(row.fastq_2) ] ] - } - return fastq_meta + else if (meta2.caller == "smoove"){ + vcf_meta = [ meta, meta2, file(row.test_vcf), file("${projectDir}/assets/svync/smoove.yaml")] + } + else{ + vcf_meta = [ meta, meta2, file(row.test_vcf), file("${projectDir}/assets/svync/default.yaml")] + } + return vcf_meta } diff --git a/subworkflows/local/prepare_regions.nf b/subworkflows/local/prepare_regions.nf new file mode 100644 index 0000000..050cf10 --- /dev/null +++ b/subworkflows/local/prepare_regions.nf @@ -0,0 +1,44 @@ +// +// PREPARE_STRATIFICATIONS: SUBWORKFLOW TO PREPARE BED FILES - HIGH CONFIDENCE AND OTHER LEVELS +// + +params.options = [:] + +include { MAIN_CHROMS } from '../../modules/local/main_chroms.nf' addParams( options: params.options ) +include { EXTRACT_MAIN } from '../../modules/local/extract_main.nf' addParams( options: params.options ) + + +workflow PREPARE_REGIONS { + take: + ref // reference channel [ref.fa, ref.fa.fai] + high_conf + + main: + + versions=Channel.empty() + + ref.map { it -> tuple([id: it[0].baseName], it[1]) } + .set{fasta} + + // this is not working! + + // get contig file including only main chroms + MAIN_CHROMS( + fasta + ) + main_chroms = MAIN_CHROMS.out.sizes + versions = versions.mix(MAIN_CHROMS.out.versions) + + high_conf.map { it -> tuple([id: it[0].baseName], it[0]) } + .set{bed} + bed.view() + EXTRACT_MAIN( + bed + ) + chr_list = EXTRACT_MAIN.out.chr_list + + emit: + main_chroms + chr_list + versions +} diff --git a/subworkflows/local/prepare_vcfs_test.nf b/subworkflows/local/prepare_vcfs_test.nf new file mode 100644 index 0000000..f01ef6e --- /dev/null +++ b/subworkflows/local/prepare_vcfs_test.nf @@ -0,0 +1,131 @@ +// +// PREPARE_VCFS: SUBWORKFLOW TO PREPARE INPUT VCFS +// + +params.options = [:] + +include { BCFTOOLS_VIEW } from '../../modules/local/bcftools_view' addParams( options: params.options ) +include { SURVIVOR_FILTER } from '../../modules/nf-core/survivor/filter' addParams( options: params.options ) +include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) +include { BCFTOOLS_NORM as BCFTOOLS_NORM_1 } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) +include { BCFTOOLS_NORM as BCFTOOLS_NORM_2 } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_1 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_2 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_1 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_2 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_3 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) +include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TEST } from '../../modules/nf-core/bcftools/reheader' addParams( options: params.options ) + + +workflow PREPARE_VCFS_TEST { + take: + input_ch // channel: [val(meta),val(meta2), vcf] + ref // reference channel [ref.fa, ref.fa.fai] + main_chroms // channel path(chrom sizes) + chr_list + + main: + + versions=Channel.empty() + + //ref.map { it -> tuple([id: it[0].baseName], it[1]) } + // .set{fasta} + // + // PREPARE_VCFS + // + // Reheader needed to standardize sample names + 
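    // For context, the reheadering amounts to roughly the following command per sample
    // (illustrative only; samples.txt is a hypothetical one-line file holding the
    // desired sample name, e.g. the value of params.sample):
    //
    //     bcftools reheader --samples samples.txt --output renamed.vcf.gz input.vcf.gz
    //
    // Giving test and truth VCFs one common sample name avoids sample-name
    // mismatches in downstream benchmarking tools that pair samples by name.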
BCFTOOLS_REHEADER_TEST( + input_ch, + ref + ) + versions = versions.mix(BCFTOOLS_REHEADER_TEST.out.versions) + + TABIX_BGZIPTABIX_1( + BCFTOOLS_REHEADER_TEST.out.vcf + ) + vcf_ch = TABIX_BGZIPTABIX_1.out.gz_tbi + + // + // BCFTOOLS_VIEW + // + // Filter out unwanted contigs + BCFTOOLS_VIEW( + vcf_ch + ) + versions = versions.mix(BCFTOOLS_VIEW.out.versions) + + TABIX_BGZIPTABIX_2( + BCFTOOLS_VIEW.out.vcf + ) + vcf_ch = TABIX_BGZIPTABIX_2.out.gz_tbi + + if (params.preprocess.contains("normalization")){ + // + // BCFTOOLS_NORM + // + // Split multi-allelic variants into separate records (-m -any) + BCFTOOLS_NORM_1( + vcf_ch, + ref, + [[],[]] + ) + versions = versions.mix(BCFTOOLS_NORM_1.out.versions) + + TABIX_TABIX_1( + BCFTOOLS_NORM_1.out.vcf + ) + + BCFTOOLS_NORM_1.out.vcf.join(TABIX_TABIX_1.out.tbi, by:1) + .map{it -> tuple( it[1], it[0], it[2], it[4])} + .set{vcf_ch} + } + if (params.min_sv_size > 0){ + + TABIX_BGZIP( + vcf_ch.map{it -> tuple( it[0], it[1], it[2])} + ) + versions = versions.mix(TABIX_BGZIP.out.versions) + + // + // MODULE: SURVIVOR_FILTER + // + // Filter out SVs smaller than min_sv_size + SURVIVOR_FILTER( + TABIX_BGZIP.out.output.map{it -> tuple( it[0], it[1], it[2], [])}, + params.min_sv_size, + -1, + -1, + -1 + ) + versions = versions.mix(SURVIVOR_FILTER.out.versions) + + TABIX_BGZIPTABIX_3( + SURVIVOR_FILTER.out.vcf + ) + vcf_ch = TABIX_BGZIPTABIX_3.out.gz_tbi + } + + if (params.preprocess.contains("deduplication")){ + // + // BCFTOOLS_NORM + // + // Deduplicate variants at the same position in the test VCF + BCFTOOLS_NORM_2( + vcf_ch, + ref, + [[],[]] + ) + versions = versions.mix(BCFTOOLS_NORM_2.out.versions) + + TABIX_TABIX_2( + BCFTOOLS_NORM_2.out.vcf + ) + + BCFTOOLS_NORM_2.out.vcf.join(TABIX_TABIX_2.out.tbi, by:1) + .map{it -> tuple( it[1], it[0], it[2], it[4])} + .set{vcf_ch} + } + emit: + vcf_ch + versions +} diff --git a/subworkflows/local/prepare_vcfs_truth.nf b/subworkflows/local/prepare_vcfs_truth.nf new file mode 100644 index 0000000..10cff46 --- /dev/null +++ b/subworkflows/local/prepare_vcfs_truth.nf @@ -0,0 +1,99 @@ +// +// PREPARE_VCFS: SUBWORKFLOW TO PREPARE INPUT VCFS +// + +params.options = [:] + +include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix.nf' addParams( options: params.options ) +include { BCFTOOLS_VIEW } from '../../modules/local/bcftools_view' addParams( options: params.options ) +include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) +include { BCFTOOLS_NORM as BCFTOOLS_NORM_1 } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) +include { BCFTOOLS_NORM as BCFTOOLS_NORM_2 } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_1 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_2 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_3 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TRUTH } from '../../modules/nf-core/bcftools/reheader' addParams( options: params.options ) + +workflow PREPARE_VCFS_TRUTH { + take: + truth_ch // channel: [val(meta), vcf] + ref // reference channel [ref.fa, ref.fa.fai] + + main: + + versions=Channel.empty() + + // + // PREPARE_VCFS + // + truth_ch.map { it -> tuple([id: params.sample],[caller:"truth"], it[0]) } + .set{truth} + + // BGZIP if needed and index truth + 
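    // Note on ordering: truvari and tabix-based region lookups require
    // bgzip-compressed, indexed VCFs, so a plain-text truth VCF is compressed
    // and indexed here before any normalization or benchmarking is attempted.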
BGZIP_TABIX( + truth + ) + versions = versions.mix(BGZIP_TABIX.out.versions) + vcf_ch = BGZIP_TABIX.out.gz_tbi + + // Reheader needed to standardize sample names + BCFTOOLS_REHEADER_TRUTH( + vcf_ch, + ref + ) + versions = versions.mix(BCFTOOLS_REHEADER_TRUTH.out.versions) + + TABIX_BGZIPTABIX( + BCFTOOLS_REHEADER_TRUTH.out.vcf + ) + vcf_ch = TABIX_BGZIPTABIX.out.gz_tbi + + if (params.preprocess.contains("normalization")){ + // + // MODULE: BCFTOOLS_NORM + // + // Normalize the truth VCF + // multi-allelic variants will be split. + BCFTOOLS_NORM_1( + vcf_ch, + ref, + [[],[]] + ) + versions = versions.mix(BCFTOOLS_NORM_1.out.versions) + + TABIX_TABIX_1( + BCFTOOLS_NORM_1.out.vcf + ) + versions = versions.mix(TABIX_TABIX_1.out.versions) + + BCFTOOLS_NORM_1.out.vcf.join(TABIX_TABIX_1.out.tbi, by:1) + .map{it -> tuple(it[1],it[0], it[2], it[4])} + .set{vcf_ch} + } + if (params.preprocess.contains("deduplication")){ + // + // MODULE: BCFTOOLS_NORM + // + // Deduplicate variants at the same position + BCFTOOLS_NORM_2( + vcf_ch, + ref, + [[],[]] + ) + versions = versions.mix(BCFTOOLS_NORM_2.out.versions) + + TABIX_TABIX_2( + BCFTOOLS_NORM_2.out.vcf + ) + versions = versions.mix(TABIX_TABIX_2.out.versions) + + BCFTOOLS_NORM_2.out.vcf.join(TABIX_TABIX_2.out.tbi, by:1) + .map{it -> tuple(it[1],it[0], it[2], it[4])} + .set{vcf_ch} + } + + + emit: + vcf_ch + versions +} diff --git a/subworkflows/local/report_vcf_statistics.nf b/subworkflows/local/report_vcf_statistics.nf new file mode 100644 index 0000000..a17a262 --- /dev/null +++ b/subworkflows/local/report_vcf_statistics.nf @@ -0,0 +1,52 @@ +// +// REPORT_VCF_STATISTICS: SUBWORKFLOW TO REPORT VCF STATS +// + +params.options = [:] + +include { SURVIVOR_STATS } from '../../modules/nf-core/survivor/stats' addParams( options: params.options ) +include { BCFTOOLS_STATS } from '../../modules/nf-core/bcftools/stats' addParams( options: params.options ) + +workflow REPORT_VCF_STATISTICS { + take: + input_ch // channel: [val(meta), vcf, index] + + main: + + versions=Channel.empty() + + // + // SURVIVOR_STATS + // + + SURVIVOR_STATS( + input_ch, + -1, + -1, + -1 + ) + survivor_stats = SURVIVOR_STATS.out.stats + versions = versions.mix(SURVIVOR_STATS.out.versions) + + // + // BCFTOOLS_STATS + // + BCFTOOLS_STATS( + input_ch, + [[],[]], + [[],[]], + [[],[]], + [[],[]], + [[],[]] + ) + bcftools_stats = BCFTOOLS_STATS.out.stats + versions = versions.mix(BCFTOOLS_STATS.out.versions) + + + // TODO: add a tool here to visualize SV statistics as a histogram (one possible shape is sketched below). 
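    // One possible shape for that hook, as a hypothetical local module
    // (SVPLOT_HISTOGRAM is a placeholder name, not an existing module):
    //
    //     include { SVPLOT_HISTOGRAM } from '../../modules/local/svplot_histogram' addParams( options: params.options )
    //
    //     SVPLOT_HISTOGRAM(
    //         SURVIVOR_STATS.out.stats    // per-sample SV type and size tables
    //     )
    //     versions = versions.mix(SVPLOT_HISTOGRAM.out.versions)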
+ + emit: + bcftools_stats + survivor_stats + versions +} diff --git a/subworkflows/local/somatic_benchmark.nf b/subworkflows/local/somatic_benchmark.nf new file mode 100644 index 0000000..cc1bc80 --- /dev/null +++ b/subworkflows/local/somatic_benchmark.nf @@ -0,0 +1,49 @@ +// +// SOMATIC: SUBWORKFLOW FOR SOMATIC VARIANTS +// + +params.options = [:] + +include { TRUVARI_BENCH } from '../../modules/nf-core/truvari/bench' addParams( options: params.options ) +include { SVANALYZER_SVBENCHMARK } from '../../modules/nf-core/svanalyzer/svbenchmark' addParams( options: params.options ) + +workflow SOMATIC_BENCHMARK { + take: + input_ch // channel: [val(meta), test_vcf,test_index, truth_vcf, truth_index, bed] + ref // reference channel [ref.fa, ref.fa.fai] + + main: + + versions=Channel.empty() + + // SV Benchmarking + // + // MODULE: TRUVARI_BENCH + // + TRUVARI_BENCH( + input_ch, + ref + ) + versions = versions.mix(TRUVARI_BENCH.out.versions) + + // SV Benchmarking + // + // MODULE: SVANALYZER_SVBENCHMARK + // + // note: slow + //SVANALYZER_SVBENCHMARK( + // bench.sv, + // ref, + // sv_bed + //) + //versions = versions.mix(SVANALYZER_SVBENCHMARK.out.versions) + + // Small Variant Benchmarking + + // SOMPY https://sites.google.com/view/seqc2/home/benchmarking-examples?authuser=0 + // is used for somatic variant benchmarking! + + + emit: + versions +} diff --git a/subworkflows/local/vcf_conversion.nf b/subworkflows/local/vcf_conversion.nf new file mode 100644 index 0000000..b6e355f --- /dev/null +++ b/subworkflows/local/vcf_conversion.nf @@ -0,0 +1,126 @@ +// +// VCF_CONVERSIONS: SUBWORKFLOW TO APPLY TOOL-SPECIFIC CONVERSIONS +// + +params.options = [:] + +include { MANTA_CONVERTINVERSION } from '../../modules/nf-core/manta/convertinversion' addParams( options: params.options ) +include { GRIDSS_ANNOTATION } from '../../modules/local/gridss_annotation' addParams( options: params.options ) +include { SVYNC } from '../../modules/nf-core/svync' addParams( options: params.options ) +include { AWK_SORT } from '../../modules/local/awk_sort.nf' addParams( options: params.options ) + +workflow VCF_CONVERSIONS { + take: + input_ch // channel: [val(meta),val(meta2), vcf, config.yml] + ref // reference channel [ref.fa, ref.fa.fai] + + main: + + out_vcf_ch = Channel.empty() + versions = Channel.empty() + + // + // MODULE: AWK_SORT + // + // sort and index input test files + + AWK_SORT( + input_ch.map{it -> tuple(it[0], it[1], it[2])} + ) + versions = versions.mix(AWK_SORT.out.versions) + vcf_ch = AWK_SORT.out.vcf + + // + // MODULE: SVYNC + // + // + if(params.standardization){ + + vcf_ch.branch{ + tool: it[1].caller == "delly" || it[1].caller == "gridss" || it[1].caller == "manta" || it[1].caller == "smoove" + other: true} + .set{main_vcf_ch} + + + input_ch.map{it -> tuple(it[0], it[1], it[3])} + .combine(vcf_ch, by:1) + .map{it -> tuple(it[1], it[0], it[4], it[5], it[2])} + .set{snd_ch} + + snd_ch.branch{ + tool: it[1].caller == "delly" || it[1].caller == "gridss" || it[1].caller == "manta" || it[1].caller == "smoove" + other: true} + .set{input} + + SVYNC( + input.tool + ) + out_vcf_ch = out_vcf_ch.mix(SVYNC.out.vcf) + out_vcf_ch = out_vcf_ch.mix(input.other) + }else{ + out_vcf_ch = vcf_ch + } + + out2_vcf_ch = Channel.empty() + + // Check tool-specific conversions + if(params.bnd_to_inv){ + + out_vcf_ch.branch{ + tool: it[1].caller == "manta" || it[1].caller == "dragen" + other: true} + .set{input} + // + // MANTA_CONVERTINVERSION + // + // NOTE: should also work for DRAGEN + // NOTE: currently not working 
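        // Background for the conversion below: Manta-family callers can represent
        // an inversion as a pair of mated breakends rather than a symbolic allele,
        // e.g. (schematic VCF records, illustrative coordinates only):
        //
        //     chr1  1000  bnd_1  N  N]chr1:5000]  .  .  SVTYPE=BND;MATEID=bnd_2
        //     chr1  5000  bnd_2  N  N]chr1:1000]  .  .  SVTYPE=BND;MATEID=bnd_1
        //
        // convertInversion.py collapses such mate pairs into single <INV> records,
        // which lets SV benchmarkers match inversions by SVTYPE.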
+ + MANTA_CONVERTINVERSION( + input.tool, + ref + ) + versions = versions.mix(MANTA_CONVERTINVERSION.out.versions) + + out2_vcf_ch = out2_vcf_ch.mix(MANTA_CONVERTINVERSION.out.vcf_tabi) + out2_vcf_ch = out2_vcf_ch.mix(input.other) + + // https://github.com/srbehera/DRAGEN_Analysis/blob/main/convertInversion.py + + } + else{ + out2_vcf_ch = out_vcf_ch + } + + out3_vcf_ch = Channel.empty() + + if (params.gridss_annotate){ + out2_vcf_ch.branch{ + tool: it[1].caller == "gridss" + other: true} + .set{input} + + // + // GRIDSS_ANNOTATION + // + // https://github.com/PapenfussLab/gridss/blob/7b1fedfed32af9e03ed5c6863d368a821a4c699f/example/simple-event-annotation.R#L9 + // GRIDSS simple event annotation + GRIDSS_ANNOTATION( + input.tool, + ref + ) + versions = versions.mix(GRIDSS_ANNOTATION.out.versions) + + out3_vcf_ch = out3_vcf_ch.mix(GRIDSS_ANNOTATION.out.vcf) + out3_vcf_ch = out3_vcf_ch.mix(input.other) + } + else{ + out3_vcf_ch = out2_vcf_ch + } + + // https://github.com/EUCANCan/variant-extractor/blob/main/examples/vcf_to_csv.py + + emit: + out3_vcf_ch + versions +} diff --git a/subworkflows/local/visualizations.nf b/subworkflows/local/visualizations.nf new file mode 100644 index 0000000..29f727c --- /dev/null +++ b/subworkflows/local/visualizations.nf @@ -0,0 +1,35 @@ +// +// VISUALIZATIONS: SUBWORKFLOW TO VISUALIZE BENCHMARK RESULTS (stub, not wired into the main workflow yet) +// + +params.options = [:] + +include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix.nf' addParams( options: params.options ) +include { BCFTOOLS_VIEW } from '../../modules/local/bcftools_view' addParams( options: params.options ) +include { BCFTOOLS_NORM as BCFTOOLS_NORM_1 } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) +include { BCFTOOLS_NORM as BCFTOOLS_NORM_2 } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_1 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_2 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_3 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { BGZIP_TABIX as BGZIP_TABIX_1 } from '../../modules/local/bgzip_tabix' addParams( options: params.options ) +include { BGZIP_TABIX as BGZIP_TABIX_2 } from '../../modules/local/bgzip_tabix' addParams( options: params.options ) + +workflow VISUALIZATIONS { + take: + truth_ch // channel: [val(meta), vcf] + ref // reference channel [ref.fa, ref.fa.fai] + main_chroms // channel: path(chrom sizes) + + main: + + versions=Channel.empty() + +// Check tool-specific conversions + + // https://github.com/PapenfussLab/gridss/blob/7b1fedfed32af9e03ed5c6863d368a821a4c699f/example/simple-event-annotation.R#L9 + // GRIDSS simple event annotation + + emit: + versions +} diff --git a/workflows/variantbenchmarking.nf b/workflows/variantbenchmarking.nf index 04a3bf2..6ec48fe 100644 --- a/workflows/variantbenchmarking.nf +++ b/workflows/variantbenchmarking.nf @@ -4,7 +4,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' @@ -15,6 +15,21 @@ log.info logo + paramsSummaryLog(workflow) + citation 
WorkflowVariantbenchmarking.initialise(params, log) +// check mandatory parameters +ref = Channel.fromPath([params.fasta,params.fai], checkIfExists: true).collect() + +// check high confidence files + +truth = params.truth ? Channel.fromPath(params.truth, checkIfExists: true).collect() + : Channel.empty() + +high_conf = params.high_conf ? Channel.fromPath(params.high_conf, checkIfExists: true).collect() + : Channel.empty() + +// TODO: GET FILES FROM IGENOMES ACCORDING TO META.ID + + + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -35,7 +50,16 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { SOMATIC_BENCHMARK } from '../subworkflows/local/somatic_benchmark' +include { GERMLINE_BENCHMARK } from '../subworkflows/local/germline_benchmark' +include { PREPARE_REGIONS } from '../subworkflows/local/prepare_regions' +include { PREPARE_VCFS_TRUTH } from '../subworkflows/local/prepare_vcfs_truth' +include { PREPARE_VCFS_TEST } from '../subworkflows/local/prepare_vcfs_test' +include { VCF_CONVERSIONS } from '../subworkflows/local/vcf_conversion' +include { REPORT_VCF_STATISTICS as REPORT_STATISTICS_TEST } from '../subworkflows/local/report_vcf_statistics' +include { REPORT_VCF_STATISTICS as REPORT_STATISTICS_TRUTH } from '../subworkflows/local/report_vcf_statistics' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -46,7 +70,6 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' @@ -70,17 +93,97 @@ workflow VARIANTBENCHMARKING { file(params.input) ) ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! 
There is currently no tooling to help you write a sample sheet schema + ch_input = INPUT_CHECK.out.ch_sample // - // MODULE: Run FastQC + // PREPARE_REGIONS: prepare stratifications and contigs // - FASTQC ( - INPUT_CHECK.out.reads + PREPARE_REGIONS( + ref, + high_conf ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + ch_versions = ch_versions.mix(PREPARE_REGIONS.out.versions) + + // + // SUBWORKFLOW: VCF_CONVERSIONS + // + // Standardize VCFs, tool-specific modifications + VCF_CONVERSIONS( + ch_input, + ref + ) + ch_versions = ch_versions.mix(VCF_CONVERSIONS.out.versions) + + // + // SUBWORKFLOW: Prepare and normalize input vcfs + // + PREPARE_VCFS_TRUTH( + truth, + ref + ) + ch_versions = ch_versions.mix(PREPARE_VCFS_TRUTH.out.versions) + + PREPARE_VCFS_TEST( + VCF_CONVERSIONS.out.out3_vcf_ch.map{it -> tuple(it[0], it[1], it[2], it[3])}, + ref, + PREPARE_REGIONS.out.main_chroms, + PREPARE_REGIONS.out.chr_list + ) + ch_versions = ch_versions.mix(PREPARE_VCFS_TEST.out.versions) + + // + // SUBWORKFLOW: GET STATISTICS OF FILES + // + REPORT_STATISTICS_TEST( + PREPARE_VCFS_TEST.out.vcf_ch + ) + REPORT_STATISTICS_TRUTH( + PREPARE_VCFS_TRUTH.out.vcf_ch + ) + ch_versions = ch_versions.mix(REPORT_STATISTICS_TEST.out.versions) + + // prepare benchmark set + + high_conf.map { it -> tuple([id: params.sample],[caller:"truth"], it[0]) } + .set{bed} + + PREPARE_VCFS_TEST.out.vcf_ch.combine(PREPARE_VCFS_TRUTH.out.vcf_ch, by:0) + .combine(bed, by:0) + .map{it -> tuple(it[0],it[1], it[2], it[3], it[5], it[6], it[8])} + .set{bench_ch} + + // + // SUBWORKFLOW: GERMLINE_BENCHMARK + // + // Benchmarking specific to germline samples + + GERMLINE_BENCHMARK( + bench_ch, + ref, + PREPARE_VCFS_TRUTH.out.vcf_ch + ) + ch_versions = ch_versions.mix(GERMLINE_BENCHMARK.out.versions) + + + if (params.analysis.contains("somatic")){ + + // SOMATIC VARIANT BENCHMARKING + SOMATIC_BENCHMARK( + bench_ch, + ref + ) + ch_versions = ch_versions.mix(SOMATIC_BENCHMARK.out.versions) + } + + // TODO: NEED A TOOL TO COLLECT METRICS AND ROCS LIKE DATAVZRD OR SQLITE DATABASE + + + // TODO: BENCHMARKING OF CNV + + + // TODO: TRIO ANALYSIS : MENDELIAN INCONSISTENCY CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -89,7 +192,7 @@ workflow VARIANTBENCHMARKING { // // MODULE: MultiQC // - workflow_summary = WorkflowVariantbenchmarking.paramsSummaryMultiqc(workflow, summary_params) + workflow_summary = WorkflowVariantbenchmarking.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) methods_description = WorkflowVariantbenchmarking.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) @@ -99,7 +202,7 @@ workflow VARIANTBENCHMARKING { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + //ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) MULTIQC ( ch_multiqc_files.collect(),
[Residue of an example software-versions table ("Process Name" / "Software Version"): CUSTOM_DUMPSOFTWAREVERSIONS - python 3.11.7, yaml 5.4.1; TOOL1 - tool1 0.11.9; TOOL2 - tool2 1.9; Workflow - Nextflow]
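For reference, the parameters this diff introduces could be exercised together with a config along these lines — an illustrative sketch only: the values are examples, and writing params.method, params.preprocess and params.analysis as comma-separated strings simply matches the .contains() checks used in the subworkflows above.

params {
    input           = 'assets/samplesheet.csv'      // test VCFs, one caller per row
    sample          = 'HG002'                       // sample id used when reheadering test and truth VCFs
    fasta           = '/path/to/genome.fa'
    fai             = '/path/to/genome.fa.fai'
    truth           = '/path/to/truth.vcf.gz'       // truth VCF
    high_conf       = '/path/to/high_conf.bed'      // high-confidence regions
    method          = 'truvari,svanalyzer'          // any of: truvari, svanalyzer, wittyer, vcfdist, bamsurgeon
    preprocess      = 'normalization,deduplication' // bcftools norm passes to apply
    min_sv_size     = 50                            // SURVIVOR filter threshold; 0 disables it
    analysis        = 'germline'                    // add 'somatic' to also trigger SOMATIC_BENCHMARK
    standardization = true                          // run svync on delly/gridss/manta/smoove calls
    bnd_to_inv      = false                         // Manta/DRAGEN BND-pair to INV conversion
    gridss_annotate = false                         // GRIDSS simple-event annotation
    harmonize       = false                         // truvari phab harmonization before benchmarking
}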