diff --git a/assets/samplesheet_HG002_hg37.csv b/assets/samplesheet_HG002_hg37.csv index c711b62..d12c054 100644 --- a/assets/samplesheet_HG002_hg37.csv +++ b/assets/samplesheet_HG002_hg37.csv @@ -1,5 +1,9 @@ -test_vcf,caller,vartype -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.chr21.vcf.gz",lumpy,sv -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf",manta,sv -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/full.svaba.germline.sv.chr21.vcf",svaba,sv +test_vcf,caller,vartype,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,dup_to_ins,pctsize,pctseq,pctovl,evaluationmode +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf.gz",manta,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.svaba.germline.sv.chr21.vcf.gz",svaba,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002-NA24385-50x.union_170414.split.chr21.vcf.gz",sbg_graph,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.jointVC.filter.chr21.vcf.gz",gatk_joint,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_ALLCHROM_hs37d5_novoalign_Ilmn150bp300X_FB.chr21.vcf.gz",freebayes,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/hg002.Assemblytics_structural_variants.sorted.chr21.vcf.gz",assemblytics,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts diff --git a/assets/samplesheet_full.csv b/assets/samplesheet_full.csv index 9e68b6b..e00e2e0 100644 --- a/assets/samplesheet_full.csv +++ b/assets/samplesheet_full.csv @@ -1,7 +1,6 @@ -test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.chr21.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.strelka.variants.chr21.vcf.gz,strelka,small,,,,,,,,,, -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.bcftools.chr21.vcf.gz,bcftools,small,,,,,,,,,, +test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,evaluationmode +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.strelka.variants.chr21.vcf.gz,strelka,small,,,,,,,,,,,, +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.bcftools.chr21.vcf.gz,bcftools,small,,,,,,,,,,,, diff --git a/assets/schema_input.json b/assets/schema_input.json index c0a22d0..632f5c8 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -36,96 +36,77 @@ "normdist": { "type": "number", "errorMessage": "normshift is a svbenchmark parameter. Has to be between 0-1. Disallow matches if alternate alleles have normalized edit distance greater than normdist (default 0.2)", - "meta": ["normdist"], - "default": 0.2, - "minimum": 0, - "maximum": 1 + "meta": ["normdist"] }, "normsizediff": { "type": "number", "errorMessage": "normsizediff is a svbenchmark parameter. Has to be between 0-1. Disallow matches if alternate alleles have normalized size difference greater than normsizediff (default 0.2)", - "meta": ["normsizediff"], - "default": 0.2, - "minimum": 0, - "maximum": 1 + "meta": ["normsizediff"] }, "maxdist": { "type": "integer", "errorMessage": "maxdist is a svbenchmark parameter. Disallow matches if positions of two variants are more than maxdist bases from each other (default 100,000).", - "meta": ["maxdist"], - "default": 100000 + "meta": ["maxdist"] }, "pctsize": { "type": "number", "errorMessage": "pctsize is a truvari parameter. Has to be between 0-1. Ratio of min(base_size, comp_size)/max(base_size, comp_size).", - "meta": ["pctsize"], - "default": 0.7, - "minimum": 0, - "maximum": 1 + "meta": ["pctsize"] }, "pctseq": { "type": "number", "errorMessage": "pctseq is a truvari parameter. Has to be between 0-1. Edit distance ratio between the REF/ALT haplotype sequences of base and comparison call. turn it off (0) for no sequence comparison.", - "meta": ["pctseq"], - "default": 0.7, - "minimum": 0, - "maximum": 1 + "meta": ["pctseq"] }, "pctovl": { "type": "number", "errorMessage": "pctovl is a truvari parameter. Has to be between 0-1. Ratio of two calls' (overlapping bases)/(longest span)", - "meta": ["pctovl"], - "default": 0, - "minimum": 0, - "maximum": 1 + "meta": ["pctovl"] }, "refdist": { "type": "integer", "errorMessage": "refdist is a truvari parameter. Maximum distance comparison calls must be within from base call's start/end ", - "meta": ["refdist"], - "default": 500 + "meta": ["refdist"] }, "chunksize": { "type": "integer", "errorMessage": "chunksize is a truvari parameter. Create chunks of all calls overlapping within ±`--chunksize` basepairs", - "meta": ["chunksize"], - "default": 500 + "meta": ["chunksize"] + }, + "dup_to_ins": { + "type": "boolean", + "errorMessage": "a truvari parameter. converts DUP to INS type", + "meta": ["dup_to_ins"] + }, + "typeignore": { + "type": "boolean", + "errorMessage": "a truvari parameter. Ignore SVTYPE matching", + "meta": ["typeignore"] }, "bpDistance": { "type": "integer", "errorMessage": "bpDistance is a wittyer parameter. Upper bound of boundary distance when comparing truth and query. By default it is 500bp for all types except for Insertions, which are 100bp.Please note that if you set this value in the command line, it overrides all the defaults, so Insertions and other types will have the same bpd.", - "meta": ["bpDistance"], - "default": 500, - "minimum": 0 + "meta": ["bpDistance"] }, "percentThreshold": { "type": "number", "errorMessage": "percentThreshold is a wittyer parameter. This is used for percentage thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use for large tandem repeats. For all other SVs, in order to match between query and truth, the distance between boundaries should be within a number thats proportional to total SV (default 0.25)", - "meta": ["percentThreshold"], - "default": 0.25, - "minimum": 0, - "maximum": 1 + "meta": ["percentThreshold"] }, "absoluteThreshold": { "type": "integer", "errorMessage": "absoluteThreshold is a wittyer parameter. This is used for absolute thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use. For all other SVs, this is the upper bound of boundary distance when comparing truth and query. (default 10000)", - "meta": ["absoluteThreshold"], - "default": 10000, - "minimum": 0 + "meta": ["absoluteThreshold"] }, "maxMatches": { "type": "integer", "errorMessage": "maxMatches is a wittyer parameter. This is used for matching behaviour. Negative value means to match any number (for large SVs it is not recommended).", - "meta": ["maxMatches"], - "default": 10 + "meta": ["maxMatches"] }, - "similarityThreshold": { - "type": "number", - "errorMessage": "similarityThreshold is a wittyer parameter. This is used for sequence similarity thresholding.", - "meta": ["similarityThreshold"], - "default": 0.7, - "minimum": 0, - "maximum": 1 + "evaluationmode": { + "type": "string", + "errorMessage": "evaluationmode is a wittyer parameter. It is by default requires genotype matching. simpleCounting:sc, CrossTypeAndSimpleCounting:cts, genotypematch:d ", + "meta": ["evaluationmode"] } }, "required": ["test_vcf", "caller", "vartype"] diff --git a/bin/check_diff.py b/bin/check_diff.py new file mode 100755 index 0000000..cff223a --- /dev/null +++ b/bin/check_diff.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +import pysam +from argparse import ArgumentParser + +def compare_vcf(file1, file2): + vcf1 = pysam.VariantFile(file1) + vcf2 = pysam.VariantFile(file2) + + variants1 = {rec.id for rec in vcf1.fetch()} + variants2 = {rec.id for rec in vcf2.fetch()} + + unique_to_file1 = variants1 - variants2 + unique_to_file2 = variants2 - variants1 + + print(f"Unique to {file1}: {unique_to_file1}") + print(f"Unique to {file2}: {unique_to_file2}") + +if __name__ == '__main__': + import os + import sys + + # Parse arguments + parser = ArgumentParser(description='Check differences') + parser.add_argument('file1', help='VCF file') + parser.add_argument('file2', help='Output VCF file') + + args = parser.parse_args() + compare_vcf(args.file1, args.file2) diff --git a/bin/merge_reports.py b/bin/merge_reports.py index a74fac2..058b431 100755 --- a/bin/merge_reports.py +++ b/bin/merge_reports.py @@ -35,7 +35,7 @@ def get_svbenchmark_resuls(file_paths): FP_pattern = re.compile(r'Number of false positives \(.*\): (\d+)') recall_pattern = re.compile(r'Recall \(.*\): (\d+\.\d+)%') precision_pattern = re.compile(r'Precision \(.*\): (\d+\.\d+)%') - f1_pattern = re.compile(r'F1 \(.*\): (\d+\.\d+)') + f1_pattern = re.compile(r'F1 \(.*\): ([\d\.]+(?:e[+-]?\d+)?)') # Iterate over each table file for file in file_paths: diff --git a/bin/normalize_vcf.py b/bin/normalize_vcf.py new file mode 100755 index 0000000..1b30391 --- /dev/null +++ b/bin/normalize_vcf.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +# Copyright 2022 - Barcelona Supercomputing Center +# Author: Rodrigo Martin +# BSC Dual License +''' +Generates a normalized VCF file from a VCF +Expected usage: + $ python normalize.py +Use --help for more information. +''' +import sys +from argparse import ArgumentParser + +import pysam + + +def _contig_to_int(contig): + contig = contig.lower().replace('chr', '') + if contig.isdigit(): + return int(contig) + else: + return 22 + ord(contig[0]) + + +if __name__ == '__main__': + import os + import sys + sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/') + from variant_extractor import VariantExtractor + from variant_extractor.variants import VariantType + + # Parse arguments + parser = ArgumentParser(description='Generate normalized VCF file from a VCF file') + parser.add_argument('vcf_file', help='VCF file') + parser.add_argument('output_vcf_file', help='Output VCF file') + parser.add_argument('fasta', help='Reference fasta file coupled with fai') + + args = parser.parse_args() + + # Extract header from original file + with open(args.vcf_file, 'r') as f: + with pysam.VariantFile(f) as input_file: + input_file.header.add_meta('cmdline', ' '.join(sys.argv)) + header_str = str(input_file.header) + + # Open output file, write as stream + with open(args.output_vcf_file, 'w') as output_vcf: + # Write header + output_vcf.write(header_str) + print(f'Reading {args.vcf_file}...') + # Open input file, read with variant_extractor + extractor = VariantExtractor(args.vcf_file,ensure_pairs=False,fasta_ref=args.fasta) + records = list(extractor) + # Sort record by chromosome and position + records.sort(key=lambda x: (_contig_to_int(x.contig), x.pos)) + for variant_record in records: + output_vcf.write(str(variant_record)+'\n') diff --git a/bin/plots.R b/bin/plots.R index ed8bb07..0ad4d47 100755 --- a/bin/plots.R +++ b/bin/plots.R @@ -44,14 +44,20 @@ generate_plots <- function(table, benchmark, type, filter, stats) { labs(title = title1, x = "Tool", y = "Value", color = "Tool") + facet_wrap(~variable, scales = "free_y") + theme_minimal() + - theme(legend.position = "right", panel.background = element_rect(fill = "white")) + theme( + legend.position = "right", + panel.background = element_rect(fill = "white"), + axis.text.x = element_text(angle = 30, hjust = 0.5)) # Visualize Precision, Recall, and F1 in separate plots with white background metric_plot <- ggplot(metric_data, aes(x = Tool, y = value, color = Tool, shape = variable, linetype = variable, group = interaction(variable, Tool))) + geom_point() + labs(title = title2, x = "Tool", y = "Value", color = "Metric", linetype = "Metric") + theme_minimal() + - theme(legend.position = "right", panel.background = element_rect(fill = "white")) + theme( + legend.position = "right", + panel.background = element_rect(fill = "white"), + axis.text.x = element_text(angle = 30, hjust = 0.5)) # Save the plots tryCatch({ diff --git a/bin/vcf_to_csv.py b/bin/vcf_to_csv.py new file mode 100755 index 0000000..eb1fb53 --- /dev/null +++ b/bin/vcf_to_csv.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python + +# Copyright 2024 - GHGA +# Author: Kuebra Narci +''' +Generates a CSV file from a VCF +Expected usage: + $ python vcf_to_csv.py +Use --help for more information. +''' +import sys +from argparse import ArgumentParser +import re + +import csv + +def parse_info_field(info): + """Parse the INFO field of a VCF line into a dictionary.""" + info_dict = {} + for entry in info.split(';'): + key_value = entry.split('=') + if len(key_value) == 2: + info_dict[key_value[0]] = key_value[1] + else: + info_dict[key_value[0]] = True + return info_dict + +def extract_gt_from_sample(sample, format_field): + """Extract GT value from the sample field.""" + format_fields = format_field.split(':') + sample_values = sample.split(':') + if 'GT' in format_fields: + gt_index = format_fields.index('GT') + return sample_values[gt_index] + return './.' # Default GT value if not present + +def vcf_to_csv(vcf_file, csv_file): + """Convert a VCF file to a CSV file.""" + with open(vcf_file, 'r') as vcf: + headers = [] + sample_headers = [] + records = [] + include_supp_vec = False + include_supp = False + include_type_inferred = False + include_svtype = False + include_svlen = False + + for line in vcf: + if line.startswith('##'): + continue # Skip meta-information lines + elif line.startswith('#'): + headers = line[1:].strip().split('\t') + sample_headers = headers[9:] # The sample headers start from the 10th column + else: + row = line.strip().split('\t') + info_dict = parse_info_field(row[7]) + + # Check for SUPP_VEC, SUPP, type_inferred, SVTYPE, and SVLEN in the INFO field + if 'SUPP_VEC' in info_dict: + include_supp_vec = True + if 'SUPP' in info_dict: + include_supp = True + if 'type_inferred' in info_dict: + include_type_inferred = True + if 'SVTYPE' in info_dict: + include_svtype = True + if 'SVLEN' in info_dict: + include_svlen = True + + records.append((row, info_dict)) + + # Write the header with optional fields + headers_to_write = headers[:7] # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER + if include_supp_vec: + headers_to_write.append("SUPP_VEC") + if include_supp: + headers_to_write.append("SUPP") + if include_type_inferred: + headers_to_write.append("type_inferred") + if include_svtype: + headers_to_write.append("SVTYPE") + if include_svlen: + headers_to_write.append("SVLEN") + headers_to_write.extend([f'{sample}_GT' for sample in sample_headers]) + + with open(csv_file, 'w', newline='') as csvf: + csv_writer = csv.writer(csvf) + csv_writer.writerow(headers_to_write) + + for row, info_dict in records: + row_to_write = row[:7] # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER + if include_supp_vec: + row_to_write.append(info_dict.get('SUPP_VEC', '')) + if include_supp: + row_to_write.append(info_dict.get('SUPP', '')) + if include_type_inferred: + row_to_write.append(info_dict.get('type_inferred', '')) + if include_svtype: + row_to_write.append(info_dict.get('SVTYPE', '')) + if include_svlen: + row_to_write.append(info_dict.get('SVLEN', '')) + format_field = row[8] + gt_values = [extract_gt_from_sample(sample, format_field) for sample in row[9:]] + row_to_write.extend(gt_values) + + csv_writer.writerow(row_to_write) + +if __name__ == '__main__': + + # Parse arguments + parser = ArgumentParser(description='Generates a CSV file from a VCF') + parser.add_argument('vcf_file', help='VCF file') + parser.add_argument('output', help='Output CSV file') + args = parser.parse_args() + + vcf_to_csv(args.vcf_file, args.output) diff --git a/conf/modules.config b/conf/modules.config index 6582b65..5e84df4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -33,7 +33,14 @@ process { enabled: false ] } - + withName: "VARIANT_EXTRACTOR" { + ext.prefix = { input.baseName - ".vcf" } + publishDir = [ + path: {"${params.outdir}/${meta.id}/preprocess"}, + pattern: "*{.vcf.gz}", + mode: params.publish_dir_mode + ] + } withName: "BCFTOOLS_DEDUP" { ext.prefix = { vcf.baseName - ".vcf" + ".dedup"} ext.args = {"--output-type z --rm-du exact -c w" } @@ -130,14 +137,6 @@ process { mode: params.publish_dir_mode ] } - withName: "TRUVARI_PHAB" { - ext.prefix = {"${meta.id}.harm"} - publishDir = [ - path: {"${params.outdir}/${meta.id}/truvari_phab/"}, - pattern: "*{.vcf.gz,vcf.gz.tbi}", - mode: params.publish_dir_mode - ] - } // BENCHMARK TOOLS withName: "RTGTOOLS_FORMAT" { ext.when = { params.method.split(',').contains('rtgtools') && !params.sdf } @@ -187,7 +186,16 @@ process { } withName: "TRUVARI_BENCH" { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} - ext.args = {"--pctsize ${meta.pctsize} --pctovl ${meta.pctovl} --pctseq ${meta.pctseq} --refdist ${meta.refdist} --chunksize ${meta.chunksize}"} + ext.args = {[ + "--sizemin 0 --sizefilt 0 --sizemax 5000000", + (meta.pctseq != null) ? "--pctsize ${meta.pctsize}" : '', + (meta.pctovl != null) ? "--pctovl ${meta.pctovl}" : '', + (meta.pctseq != null) ? "--pctseq ${meta.pctseq}" : '', + (meta.refdist != null) ? "--refdist ${meta.refdist}" : '', + (meta.chunksize != null) ? "--chunksize ${meta.chunksize}" : '', + (meta.dup_to_ins) ? "--dup-to-ins" : '', + (meta.typeignore) ? "--typeignore" : '' + ].join(' ').trim()} ext.when = { params.method.split(',').contains('truvari') } publishDir = [ path: {"${params.outdir}/${meta.id}/truvari_bench"}, @@ -197,7 +205,12 @@ process { } withName: SVANALYZER_SVBENCHMARK { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} - ext.args = {"-normshift ${meta.normshift} –normdist ${meta.normdist} –normsizediff ${meta.normsizediff} -maxdist ${meta.maxdist}"} + ext.args = {[ + (meta.normshift != null) ? "-normshift ${meta.normshift}" : '', + (meta.normdist != null) ? "-normdist ${meta.normdist}" : '', + (meta.normsizediff != null) ? "-normsizediff ${meta.normsizediff}" : '', + (meta.maxdist != null) ? "-maxdist ${meta.maxdist}" : '' + ].join(' ').trim()} ext.when = { params.method.split(',').contains('svanalyzer') } publishDir = [ path: {"${params.outdir}/${meta.id}/svanalyzer_bench"}, @@ -207,7 +220,14 @@ process { } withName: WITTYER { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} - ext.args = {"-em cts --ef [] --pt ${meta.percentThreshold} --at ${meta.absoluteThreshold} --bpd ${meta.bpDistance} --mm ${meta.maxMatches} --st ${meta.similarityThreshold}"} + ext.args = {[ + "--includedFilters=''", + (meta.evaluationmode != null) ? "-em ${meta.evaluationmode}" : '', + (meta.percentThreshold != null) ? "--pt ${meta.percentThreshold}" : '', + (meta.absoluteThreshold != null) ? "--at ${meta.absoluteThreshold}" : '', + (meta.bpDistance != null) ? "--bpd ${meta.bpDistance}" : '', + (meta.maxMatches != null) ? "--mm ${meta.maxMatches}" : '' + ].join(' ').trim()} ext.when = { params.method.split(',').contains('wittyer') } publishDir = [ path: {"${params.outdir}/${meta.id}/wittyer_bench"}, @@ -215,7 +235,6 @@ process { mode: params.publish_dir_mode ] } - withName: BAMSURGEON_EVALUATOR { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} publishDir = [ @@ -224,14 +243,6 @@ process { mode: params.publish_dir_mode ] } - withName: MANTA_CONVERTINVERSION { - ext.prefix = {"${params.sample}"} - publishDir = [ - path: {"${params.outdir}/${meta.id}/preprocess"}, - pattern: "*{.vcf.gz,vcf.gz.tbi}", - mode: params.publish_dir_mode - ] - } withName: SVYNC { ext.prefix = {"${params.sample}.stnd"} publishDir = [ @@ -244,7 +255,7 @@ process { ext.prefix = {"${meta.benchmark_tool}.${meta.vartype}"} publishDir = [ path: {"${params.outdir}/summary/tables"}, - pattern: "*{txt}", + pattern: "*{csv}", mode: params.publish_dir_mode ] } @@ -261,36 +272,16 @@ process { } withName: SURVIVOR_MERGE { ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"} - publishDir = [ - path: {"${params.outdir}/summary/merged_vcf"}, - pattern: "*{vcf}", - mode: params.publish_dir_mode - ] } withName: BCFTOOLS_MERGE { ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"} - ext.args = {"--output-type z --force-samples"} - publishDir = [ - path: {"${params.outdir}/summary/merged_vcf"}, - pattern: "*{vcf.gz}", - mode: params.publish_dir_mode - ] - } - withName: BCFTOOLS_QUERY_SV { - ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"} - ext.args = {"-f '%CHROM\t%POS\t%INFO/SVTYPE\t%INFO/SVLEN\t%INFO/SUPP_VEC\t%INFO/SUPP\t%ID[\t%SAMPLE]'" } - publishDir = [ - path: {"${params.outdir}/summary/merged_vcf"}, - pattern: "*{.txt}", - mode: params.publish_dir_mode - ] + ext.args = {"--output-type v --force-samples"} } - withName: BCFTOOLS_QUERY_SMALL { + withName: VCF_TO_CSV { ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"} - ext.args = {"-f '%CHROM\t%POS\t%REF\t%ALT\t%TYPE\t[%GT]\t%ID[\t%SAMPLE]'" } publishDir = [ - path: {"${params.outdir}/summary/merged_vcf"}, - pattern: "*{.txt}", + path: {"${params.outdir}/summary/comparisons"}, + pattern: "*{.csv}", mode: params.publish_dir_mode ] } @@ -299,7 +290,7 @@ process { // Don't publish results for these processes // process { - withName: 'TABIX_TABIX|TABIX_BGZIPTABIX|TABIX_BGZIP|BGZIP_TABIX' { + withName: 'TABIX_TABIX|TABIX_BGZIP|TABIX_BGZIPTABIX|BGZIP_TABIX|SURVIVOR_MERGE|BCFTOOLS_MERGE' { publishDir = [ path: { "${params.outdir}/test" }, enabled: false diff --git a/conf/test.config b/conf/test.config index 895d1b0..bf5efc6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,6 +30,8 @@ params { analysis = 'germline' method = 'happy,truvari,svanalyzer,wittyer,rtgtools' // preprocess = "normalization,deduplication" + sv_standardization = "homogenize" + //variant_filtering = "include" // null, include, exclude //expression = 'FILTER="."' diff --git a/conf/test_full.config b/conf/test_full.config index 52723f8..bd109be 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -27,8 +27,10 @@ params { // Processes analysis = 'germline' - method = 'happy,truvari,svanalyzer' // - preprocess = "normalization,deduplication,prepy" + method = 'happy,truvari,svanalyzer,rtgtools' // + preprocess = "normalization,deduplication" + min_sv_size = 30 + sv_standardization = "homogenize" //variant_filtering = "include" // null, include, exclude //expression = 'FILTER="."' diff --git a/conf/test_hg37.config b/conf/test_hg37.config index 1fe7511..d2d2f3a 100644 --- a/conf/test_hg37.config +++ b/conf/test_hg37.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/variantbenchmarking -profile test, --outdir + nextflow run nf-core/variantbenchmarking -profile test_hg37, --outdir ---------------------------------------------------------------------------------------- */ @@ -22,21 +22,20 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'assets/samplesheet_HG002.csv' + input = 'assets/samplesheet_HG002_hg37.csv' outdir = 'results' // Genome references genome = 'GRCh37' analysis = 'germline' //somatic - method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist + method = 'truvari,svanalyzer' - standardization = true - preprocess = "normalization, deduplication" - //bnd_to_inv = true + preprocess = "normalization,deduplication" + min_sv_size = 30 + sv_standardization = "homogenize" sample = "HG002" // available samples: SEQC2, HG002 - truth_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.vcf.gz" - high_conf_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.bed" + truth_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/truth/HG002_SVs_Tier1_v0.6.chr21.vcf.gz" + high_conf_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/truth/HG002_SVs_Tier1_v0.6.chr21.bed" } - diff --git a/conf/test_hg38.config b/conf/test_hg38.config index e09318c..784afb4 100644 --- a/conf/test_hg38.config +++ b/conf/test_hg38.config @@ -37,7 +37,7 @@ params { //expression = 'FILTER="PASS"' sample = "HG002" // available samples: SEQC2, HG002 - truth_sv = "HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.annotated.vcf.gz" + truth_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v01.ch21.vcf.gz" high_conf_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v01.ch21.bed" } diff --git a/modules/local/variant_extractor/environment.yml b/modules/local/variant_extractor/environment.yml new file mode 100644 index 0000000..216715b --- /dev/null +++ b/modules/local/variant_extractor/environment.yml @@ -0,0 +1,10 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::htslib=1.20 + - bioconda::pysam=0.22.1 + - bioconda::tabix=1.11 + - pip + - pip: + - variant-extractor==4.0.6 diff --git a/modules/local/variant_extractor/main.nf b/modules/local/variant_extractor/main.nf new file mode 100644 index 0000000..e2eb5e3 --- /dev/null +++ b/modules/local/variant_extractor/main.nf @@ -0,0 +1,50 @@ +process VARIANT_EXTRACTOR { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/htslib_pysam_tabix_pip_variant-extractor:a12ef217eccf6ba8': + 'community.wave.seqera.io/library/htslib_pysam_tabix_pip_variant-extractor:a12ef217eccf6ba8' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fasta_fai) + + output: + tuple val(meta), path("*.norm.vcf.gz") , emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + normalize_vcf.py \\ + $input \\ + ${prefix}.norm.vcf \\ + $fasta + + bgzip ${prefix}.norm.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.norm.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + +} diff --git a/modules/local/vcf_to_csv/environment.yml b/modules/local/vcf_to_csv/environment.yml new file mode 100644 index 0000000..a03f0fd --- /dev/null +++ b/modules/local/vcf_to_csv/environment.yml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::pysam=0.22.1 + - conda-forge::pandas=2.2.2 + - pip + - pip: + - variant-extractor==4.0.6 diff --git a/modules/local/vcf_to_csv/main.nf b/modules/local/vcf_to_csv/main.nf new file mode 100644 index 0000000..bff1b21 --- /dev/null +++ b/modules/local/vcf_to_csv/main.nf @@ -0,0 +1,45 @@ +process VCF_TO_CSV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/pysam_pandas_pip_variant-extractor:a2d9ec4a4efadbc3': + 'community.wave.seqera.io/library/pysam_pandas_pip_variant-extractor:a2d9ec4a4efadbc3' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.csv") , emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + vcf_to_csv.py \\ + $input \\ + ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + +} diff --git a/nextflow.config b/nextflow.config index 954b1b7..31794fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,7 +39,7 @@ params { sv_standardization = "" // harmonize,standardization,dup_to_ins,bnd_to_inv,gridss_annotate // Benchmarking method - method = 'truvari,svanalyzer,happy,rtgtools,wittyer' // --not working for now : vcfdist + method = 'truvari,svanalyzer,happy,rtgtools,wittyer' // Filtering parameters // minsize effects both truvari and variantbenchmarkingmark in different ways! svbechmark filters both base and comp calls diff --git a/subworkflows/local/compare_benchmark_results.nf b/subworkflows/local/compare_benchmark_results.nf index 89875f2..5e25586 100644 --- a/subworkflows/local/compare_benchmark_results.nf +++ b/subworkflows/local/compare_benchmark_results.nf @@ -5,13 +5,10 @@ params.options = [:] -include { SURVIVOR_MERGE } from '../../modules/nf-core/survivor/merge' addParams( options: params.options ) -include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { BCFTOOLS_MERGE } from '../../modules/nf-core/bcftools/merge' addParams( options: params.options ) -include { TABIX_BGZIP as TABIX_BGZIP_BENCH } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) -include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_SV } from '../../modules/nf-core/bcftools/query' addParams( options: params.options ) -include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_SMALL } from '../../modules/nf-core/bcftools/query' addParams( options: params.options ) +include { SURVIVOR_MERGE } from '../../modules/nf-core/survivor/merge' +include { BCFTOOLS_MERGE } from '../../modules/nf-core/bcftools/merge' +include { VCF_TO_CSV } from '../../modules/local/vcf_to_csv' +include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip' workflow COMPARE_BENCHMARK_RESULTS { take: @@ -22,6 +19,7 @@ workflow COMPARE_BENCHMARK_RESULTS { main: versions = Channel.empty() + merged_vcfs= Channel.empty() // Small Variants // @@ -34,19 +32,7 @@ workflow COMPARE_BENCHMARK_RESULTS { [] ) versions = versions.mix(BCFTOOLS_MERGE.out.versions) - - TABIX_TABIX( - BCFTOOLS_MERGE.out.merged_variants - ) - versions = versions.mix(TABIX_TABIX.out.versions) - - BCFTOOLS_QUERY_SMALL( - BCFTOOLS_MERGE.out.merged_variants.join(TABIX_TABIX.out.tbi), - [], - [], - [] - ) - versions = versions.mix(BCFTOOLS_QUERY_SMALL.out.versions) + merged_vcfs = merged_vcfs.mix(BCFTOOLS_MERGE.out.merged_variants) // SV part // @@ -54,12 +40,12 @@ workflow COMPARE_BENCHMARK_RESULTS { // // unzip vcfs - TABIX_BGZIP_BENCH( + TABIX_BGZIP( sv_ch ) - versions = versions.mix(TABIX_BGZIP_BENCH.out.versions) + versions = versions.mix(TABIX_BGZIP.out.versions) - TABIX_BGZIP_BENCH.out.output + TABIX_BGZIP.out.output .groupTuple() .set{vcf_ch} @@ -77,22 +63,12 @@ workflow COMPARE_BENCHMARK_RESULTS { 30 ) versions = versions.mix(SURVIVOR_MERGE.out.versions) + merged_vcfs = merged_vcfs.mix(SURVIVOR_MERGE.out.vcf) - TABIX_BGZIPTABIX( - SURVIVOR_MERGE.out.vcf - ) - versions = versions.mix(TABIX_BGZIPTABIX.out.versions) - - // add a plot to see better supported variants - // https://github.com/fritzsedlazeck/SURVIVOR/wiki - - BCFTOOLS_QUERY_SV( - TABIX_BGZIPTABIX.out.gz_tbi, - [], - [], - [] + VCF_TO_CSV( + merged_vcfs ) - versions = versions.mix(BCFTOOLS_QUERY_SV.out.versions) + versions = versions.mix(VCF_TO_CSV.out.versions) emit: versions diff --git a/subworkflows/local/prepare_vcfs_test.nf b/subworkflows/local/prepare_vcfs_test.nf index 0b8bdf8..e21af3a 100644 --- a/subworkflows/local/prepare_vcfs_test.nf +++ b/subworkflows/local/prepare_vcfs_test.nf @@ -3,21 +3,21 @@ // params.options = [:] -include { VCF_VARIANT_DEDUPLICATION } from '../local/vcf_variant_deduplication' addParams( options: params.options ) -include { VCF_VARIANT_FILTERING } from '../local/vcf_variant_filtering' addParams( options: params.options ) -include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' addParams( options: params.options ) -include { HAPPY_PREPY } from '../../modules/nf-core/happy/prepy/main' addParams( options: params.options ) -include { BCFTOOLS_NORM } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) -include { TABIX_TABIX as TABIX_TABIX_1 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { TABIX_TABIX as TABIX_TABIX_3 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_1 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_2 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_3 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_4 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CONTIGS } from '../../modules/nf-core/bcftools/view' addParams( options: params.options ) -include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_SNV } from '../../modules/nf-core/bcftools/view' addParams( options: params.options ) -include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_INDEL } from '../../modules/nf-core/bcftools/view' addParams( options: params.options ) -include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TEST } from '../../modules/nf-core/bcftools/reheader' addParams( options: params.options ) +include { VCF_VARIANT_DEDUPLICATION } from '../local/vcf_variant_deduplication' +include { VCF_VARIANT_FILTERING } from '../local/vcf_variant_filtering' +include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' +include { HAPPY_PREPY } from '../../modules/nf-core/happy/prepy/main' +include { BCFTOOLS_NORM } from '../../modules/nf-core/bcftools/norm' +include { TABIX_TABIX as TABIX_TABIX_1 } from '../../modules/nf-core/tabix/tabix' +include { TABIX_TABIX as TABIX_TABIX_3 } from '../../modules/nf-core/tabix/tabix' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_1 } from '../../modules/nf-core/tabix/bgziptabix' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_2 } from '../../modules/nf-core/tabix/bgziptabix' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_3 } from '../../modules/nf-core/tabix/bgziptabix' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_4 } from '../../modules/nf-core/tabix/bgziptabix' +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CONTIGS } from '../../modules/nf-core/bcftools/view' +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_SNV } from '../../modules/nf-core/bcftools/view' +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_INDEL } from '../../modules/nf-core/bcftools/view' +include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TEST } from '../../modules/nf-core/bcftools/reheader' workflow PREPARE_VCFS_TEST { diff --git a/subworkflows/local/prepare_vcfs_truth.nf b/subworkflows/local/prepare_vcfs_truth.nf index deca674..808070e 100644 --- a/subworkflows/local/prepare_vcfs_truth.nf +++ b/subworkflows/local/prepare_vcfs_truth.nf @@ -4,12 +4,12 @@ params.options = [:] -include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix.nf' addParams( options: params.options ) -include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { BCFTOOLS_NORM } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) -include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { VCF_VARIANT_DEDUPLICATION } from '../local/vcf_variant_deduplication' addParams( options: params.options ) -include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TRUTH } from '../../modules/nf-core/bcftools/reheader' addParams( options: params.options ) +include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix.nf' +include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' +include { BCFTOOLS_NORM } from '../../modules/nf-core/bcftools/norm' +include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' +include { VCF_VARIANT_DEDUPLICATION } from '../local/vcf_variant_deduplication' +include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TRUTH } from '../../modules/nf-core/bcftools/reheader' workflow PREPARE_VCFS_TRUTH { take: diff --git a/subworkflows/local/report_benchmark_statistics.nf b/subworkflows/local/report_benchmark_statistics.nf index 0c97692..64a6d04 100644 --- a/subworkflows/local/report_benchmark_statistics.nf +++ b/subworkflows/local/report_benchmark_statistics.nf @@ -4,8 +4,8 @@ params.options = [:] -include { MERGE_REPORTS } from '../../modules/local/merge_reports' addParams( options: params.options ) -include { PLOTS } from '../../modules/local/plots' addParams( options: params.options ) +include { MERGE_REPORTS } from '../../modules/local/merge_reports' +include { PLOTS } from '../../modules/local/plots' workflow REPORT_BENCHMARK_STATISTICS { take: @@ -21,8 +21,6 @@ workflow REPORT_BENCHMARK_STATISTICS { versions = versions.mix(MERGE_REPORTS.out.versions) - MERGE_REPORTS.out.summary.view() - PLOTS( MERGE_REPORTS.out.summary ) diff --git a/subworkflows/local/report_vcf_statistics.nf b/subworkflows/local/report_vcf_statistics.nf index b070e24..2624a35 100644 --- a/subworkflows/local/report_vcf_statistics.nf +++ b/subworkflows/local/report_vcf_statistics.nf @@ -4,8 +4,8 @@ params.options = [:] -include { SURVIVOR_STATS } from '../../modules/nf-core/survivor/stats' addParams( options: params.options ) -include { BCFTOOLS_STATS } from '../../modules/nf-core/bcftools/stats' addParams( options: params.options ) +include { SURVIVOR_STATS } from '../../modules/nf-core/survivor/stats' +include { BCFTOOLS_STATS } from '../../modules/nf-core/bcftools/stats' workflow REPORT_VCF_STATISTICS { take: diff --git a/subworkflows/local/small_germline_benchmark.nf b/subworkflows/local/small_germline_benchmark.nf index 19ebee9..456b269 100644 --- a/subworkflows/local/small_germline_benchmark.nf +++ b/subworkflows/local/small_germline_benchmark.nf @@ -4,9 +4,9 @@ params.options = [:] -include { RTGTOOLS_FORMAT } from '../../modules/nf-core/rtgtools/format/main' addParams( options: params.options ) -include { RTGTOOLS_VCFEVAL } from '../../modules/nf-core/rtgtools/vcfeval/main' addParams( options: params.options ) -include { HAPPY_HAPPY } from '../../modules/nf-core/happy/happy/main' addParams( options: params.options ) +include { RTGTOOLS_FORMAT } from '../../modules/nf-core/rtgtools/format/main' +include { RTGTOOLS_VCFEVAL } from '../../modules/nf-core/rtgtools/vcfeval/main' +include { HAPPY_HAPPY } from '../../modules/nf-core/happy/happy/main' workflow SMALL_GERMLINE_BENCHMARK { take: diff --git a/subworkflows/local/small_somatic_benchmark.nf b/subworkflows/local/small_somatic_benchmark.nf index 54de782..6220c15 100644 --- a/subworkflows/local/small_somatic_benchmark.nf +++ b/subworkflows/local/small_somatic_benchmark.nf @@ -4,8 +4,8 @@ params.options = [:] -include { HAPPY_SOMPY } from '../../modules/nf-core/happy/sompy/main' addParams( options: params.options ) -include { BAMSURGEON_EVALUATOR } from '../../modules/local/bamsurgeon_evaluator.nf' addParams( options: params.options ) +include { HAPPY_SOMPY } from '../../modules/nf-core/happy/sompy/main' +include { BAMSURGEON_EVALUATOR } from '../../modules/local/bamsurgeon_evaluator.nf' workflow SMALL_SOMATIC_BENCHMARK { take: diff --git a/subworkflows/local/sv_germline_benchmark.nf b/subworkflows/local/sv_germline_benchmark.nf index ef2c711..f917865 100644 --- a/subworkflows/local/sv_germline_benchmark.nf +++ b/subworkflows/local/sv_germline_benchmark.nf @@ -4,12 +4,11 @@ params.options = [:] -include { TRUVARI_PHAB } from '../../modules/local/truvari_phab' addParams( options: params.options ) -include { TRUVARI_BENCH } from '../../modules/nf-core/truvari/bench' addParams( options: params.options ) -include { SVANALYZER_SVBENCHMARK } from '../../modules/nf-core/svanalyzer/svbenchmark' addParams( options: params.options ) -include { WITTYER } from '../../modules/nf-core/wittyer' addParams( options: params.options ) -include { TABIX_BGZIP as TABIX_BGZIP_QUERY } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) -include { TABIX_BGZIP as TABIX_BGZIP_TRUTH } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) +include { TRUVARI_BENCH } from '../../modules/nf-core/truvari/bench' +include { SVANALYZER_SVBENCHMARK } from '../../modules/nf-core/svanalyzer/svbenchmark' +include { WITTYER } from '../../modules/nf-core/wittyer' +include { TABIX_BGZIP as TABIX_BGZIP_QUERY } from '../../modules/nf-core/tabix/bgzip' +include { TABIX_BGZIP as TABIX_BGZIP_TRUTH } from '../../modules/nf-core/tabix/bgzip' workflow SV_GERMLINE_BENCHMARK { take: @@ -24,20 +23,7 @@ workflow SV_GERMLINE_BENCHMARK { tagged_variants=Channel.empty() // SV benchmarking - if (params.method.contains('truvari')){ - - if(params.sv_standardization.contains('harmonize')){ - // - // TRUVARI: TRUVARI_PHAB - // - TRUVARI_PHAB( - input_ch, - fasta, - fai - ) - versions = versions.mix(TRUVARI_PHAB.out.versions) - } // // MODULE: TRUVARI_BENCH // diff --git a/subworkflows/local/sv_somatic_benchmark copy.nf b/subworkflows/local/sv_somatic_benchmark copy.nf index d300754..6954aa1 100644 --- a/subworkflows/local/sv_somatic_benchmark copy.nf +++ b/subworkflows/local/sv_somatic_benchmark copy.nf @@ -4,7 +4,7 @@ params.options = [:] -include { HAPPY_SOMPY } from '../../modules/nf-core/happy/sompy/main' addParams( options: params.options ) +include { HAPPY_SOMPY } from '../../modules/nf-core/happy/sompy/main' addParams( options: params.options ) include { BAMSURGEON_EVALUATOR } from '../../modules/local/bamsurgeon_evaluator.nf' addParams( options: params.options ) workflow SMALL_SOMATIC_BENCHMARK { diff --git a/subworkflows/local/sv_vcf_conversion.nf b/subworkflows/local/sv_vcf_conversion.nf index 4f6a235..c45af5e 100644 --- a/subworkflows/local/sv_vcf_conversion.nf +++ b/subworkflows/local/sv_vcf_conversion.nf @@ -6,10 +6,10 @@ import groovy.io.FileType params.options = [:] -include { MANTA_CONVERTINVERSION } from '../../modules/nf-core/manta/convertinversion' addParams( options: params.options ) -include { GRIDSS_ANNOTATION } from '../../modules/local/gridss_annotation' addParams( options: params.options ) -include { SVYNC } from '../../modules/nf-core/svync' addParams( options: params.options ) -include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' addParams( options: params.options ) +include { SVYNC } from '../../modules/nf-core/svync' +include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' +include { VARIANT_EXTRACTOR } from '../../modules/local/variant_extractor' +include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' workflow SV_VCF_CONVERSIONS { take: @@ -21,6 +21,25 @@ workflow SV_VCF_CONVERSIONS { main: versions = Channel.empty() + if (params.sv_standardization.contains("homogenize")){ + // uses VariantExtractor to homogenize variants + + VARIANT_EXTRACTOR( + input_ch, + fasta, + fai + ) + versions = versions.mix(VARIANT_EXTRACTOR.out.versions) + + // sort vcf + BCFTOOLS_SORT( + VARIANT_EXTRACTOR.out.output + ) + versions = versions.mix(BCFTOOLS_SORT.out.versions) + input_ch = BCFTOOLS_SORT.out.vcf + + } + // // MODULE: BGZIP_TABIX // @@ -77,60 +96,6 @@ workflow SV_VCF_CONVERSIONS { vcf_ch = out_vcf_ch.map{it -> tuple(it[0], it[1], it[2])} } - // Check tool spesific conversions - if(params.sv_standardization.contains("bnd_to_inv")){ - out_vcf_ch = Channel.empty() - - vcf_ch.branch{ - tool: it[0].id == "manta" || it[0].id == "dragen" - other: true} - .set{input} - // - // MANTA_CONVERTINVERSION - // - // NOTE: should also work for dragen - // Not working now!!!!! - - MANTA_CONVERTINVERSION( - input.tool.map{it -> tuple(it[0], it[1])}, - fasta - ) - versions = versions.mix(MANTA_CONVERTINVERSION.out.versions) - - out_ch = MANTA_CONVERTINVERSION.out.vcf.join(MANTA_CONVERTINVERSION.out.tbi) - out_vcf_ch = out_vcf_ch.mix(out_ch) - out_vcf_ch = out_vcf_ch.mix(input.other) - vcf_ch = out_vcf_ch - - // https://github.com/srbehera/DRAGEN_Analysis/blob/main/convertInversion.py - - } - - if (params.sv_standardization.contains("gridss_annotate")){ - out_vcf_ch = Channel.empty() - - vcf_ch.branch{ - tool: it[0].id == "gridss" - other: true} - .set{input} - - // - // GRIDSS_ANNOTATION - // - // https://github.com/PapenfussLab/gridss/blob/7b1fedfed32af9e03ed5c6863d368a821a4c699f/example/simple-event-annotation.R#L9 - // GRIDSS simple event annotation - GRIDSS_ANNOTATION( - input.tool, - fasta, - fai - ) - versions = versions.mix(GRIDSS_ANNOTATION.out.versions) - - out_vcf_ch = out_vcf_ch.mix(GRIDSS_ANNOTATION.out.vcf) - out_vcf_ch = out_vcf_ch.mix(input.other) - vcf_ch = out_vcf_ch - } - // https://github.com/EUCANCan/variant-extractor/blob/main/examples/vcf_to_csv.py emit: vcf_ch diff --git a/subworkflows/local/vcf_variant_deduplication.nf b/subworkflows/local/vcf_variant_deduplication.nf index b921b92..1a0b878 100644 --- a/subworkflows/local/vcf_variant_deduplication.nf +++ b/subworkflows/local/vcf_variant_deduplication.nf @@ -4,9 +4,9 @@ params.options = [:] -include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' addParams( options: params.options ) -include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { BCFTOOLS_NORM as BCFTOOLS_DEDUP } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) +include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' +include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' +include { BCFTOOLS_NORM as BCFTOOLS_DEDUP } from '../../modules/nf-core/bcftools/norm' workflow VCF_VARIANT_DEDUPLICATION { take: diff --git a/subworkflows/local/vcf_variant_filtering.nf b/subworkflows/local/vcf_variant_filtering.nf index 7232e57..b741c93 100644 --- a/subworkflows/local/vcf_variant_filtering.nf +++ b/subworkflows/local/vcf_variant_filtering.nf @@ -4,10 +4,10 @@ params.options = [:] -include { SURVIVOR_FILTER } from '../../modules/nf-core/survivor/filter' addParams( options: params.options ) -include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) -include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { BCFTOOLS_FILTER } from '../../modules/nf-core/bcftools/filter' addParams( options: params.options ) +include { SURVIVOR_FILTER } from '../../modules/nf-core/survivor/filter' +include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip' +include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' +include { BCFTOOLS_FILTER } from '../../modules/nf-core/bcftools/filter' workflow VCF_VARIANT_FILTERING {