Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

58 test variant extractor for svs #60

Merged
merged 9 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions assets/samplesheet_HG002_hg37.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
test_vcf,caller,vartype
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.chr21.vcf.gz",lumpy,sv
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf",manta,sv
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/full.svaba.germline.sv.chr21.vcf",svaba,sv
test_vcf,caller,vartype,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,dup_to_ins,pctsize,pctseq,pctovl,evaluationmode
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf.gz",manta,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.svaba.germline.sv.chr21.vcf.gz",svaba,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002-NA24385-50x.union_170414.split.chr21.vcf.gz",sbg_graph,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.jointVC.filter.chr21.vcf.gz",gatk_joint,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_ALLCHROM_hs37d5_novoalign_Ilmn150bp300X_FB.chr21.vcf.gz",freebayes,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/hg002.Assemblytics_structural_variants.sorted.chr21.vcf.gz",assemblytics,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
13 changes: 6 additions & 7 deletions assets/samplesheet_full.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.chr21.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.strelka.variants.chr21.vcf.gz,strelka,small,,,,,,,,,,
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.bcftools.chr21.vcf.gz,bcftools,small,,,,,,,,,,
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,evaluationmode
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.strelka.variants.chr21.vcf.gz,strelka,small,,,,,,,,,,,,
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.bcftools.chr21.vcf.gz,bcftools,small,,,,,,,,,,,,
71 changes: 26 additions & 45 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,96 +36,77 @@
"normdist": {
"type": "number",
"errorMessage": "normshift is a svbenchmark parameter. Has to be between 0-1. Disallow matches if alternate alleles have normalized edit distance greater than normdist (default 0.2)",
"meta": ["normdist"],
"default": 0.2,
"minimum": 0,
"maximum": 1
Comment on lines -41 to -42
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can keep these. they won't be validated if the value has not been given

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to be safe 🗡️ ahaha

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't worry it won't break anything :)

"meta": ["normdist"]
},
"normsizediff": {
"type": "number",
"errorMessage": "normsizediff is a svbenchmark parameter. Has to be between 0-1. Disallow matches if alternate alleles have normalized size difference greater than normsizediff (default 0.2)",
"meta": ["normsizediff"],
"default": 0.2,
"minimum": 0,
"maximum": 1
"meta": ["normsizediff"]
},
"maxdist": {
"type": "integer",
"errorMessage": "maxdist is a svbenchmark parameter. Disallow matches if positions of two variants are more than maxdist bases from each other (default 100,000).",
"meta": ["maxdist"],
"default": 100000
"meta": ["maxdist"]
},
"pctsize": {
"type": "number",
"errorMessage": "pctsize is a truvari parameter. Has to be between 0-1. Ratio of min(base_size, comp_size)/max(base_size, comp_size).",
"meta": ["pctsize"],
"default": 0.7,
"minimum": 0,
"maximum": 1
"meta": ["pctsize"]
},
"pctseq": {
"type": "number",
"errorMessage": "pctseq is a truvari parameter. Has to be between 0-1. Edit distance ratio between the REF/ALT haplotype sequences of base and comparison call. turn it off (0) for no sequence comparison.",
"meta": ["pctseq"],
"default": 0.7,
"minimum": 0,
"maximum": 1
"meta": ["pctseq"]
},
"pctovl": {
"type": "number",
"errorMessage": "pctovl is a truvari parameter. Has to be between 0-1. Ratio of two calls' (overlapping bases)/(longest span)",
"meta": ["pctovl"],
"default": 0,
"minimum": 0,
"maximum": 1
"meta": ["pctovl"]
},
"refdist": {
"type": "integer",
"errorMessage": "refdist is a truvari parameter. Maximum distance comparison calls must be within from base call's start/end ",
"meta": ["refdist"],
"default": 500
"meta": ["refdist"]
},
"chunksize": {
"type": "integer",
"errorMessage": "chunksize is a truvari parameter. Create chunks of all calls overlapping within ±`--chunksize` basepairs",
"meta": ["chunksize"],
"default": 500
"meta": ["chunksize"]
},
"dup_to_ins": {
"type": "boolean",
"errorMessage": "a truvari parameter. converts DUP to INS type",
"meta": ["dup_to_ins"]
},
"typeignore": {
"type": "boolean",
"errorMessage": "a truvari parameter. Ignore SVTYPE matching",
"meta": ["typeignore"]
},
"bpDistance": {
"type": "integer",
"errorMessage": "bpDistance is a wittyer parameter. Upper bound of boundary distance when comparing truth and query. By default it is 500bp for all types except for Insertions, which are 100bp.Please note that if you set this value in the command line, it overrides all the defaults, so Insertions and other types will have the same bpd.",
"meta": ["bpDistance"],
"default": 500,
"minimum": 0
"meta": ["bpDistance"]
},
"percentThreshold": {
"type": "number",
"errorMessage": "percentThreshold is a wittyer parameter. This is used for percentage thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use for large tandem repeats. For all other SVs, in order to match between query and truth, the distance between boundaries should be within a number thats proportional to total SV (default 0.25)",
"meta": ["percentThreshold"],
"default": 0.25,
"minimum": 0,
"maximum": 1
"meta": ["percentThreshold"]
},
"absoluteThreshold": {
"type": "integer",
"errorMessage": "absoluteThreshold is a wittyer parameter. This is used for absolute thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use. For all other SVs, this is the upper bound of boundary distance when comparing truth and query. (default 10000)",
"meta": ["absoluteThreshold"],
"default": 10000,
"minimum": 0
"meta": ["absoluteThreshold"]
},
"maxMatches": {
"type": "integer",
"errorMessage": "maxMatches is a wittyer parameter. This is used for matching behaviour. Negative value means to match any number (for large SVs it is not recommended).",
"meta": ["maxMatches"],
"default": 10
"meta": ["maxMatches"]
},
"similarityThreshold": {
"type": "number",
"errorMessage": "similarityThreshold is a wittyer parameter. This is used for sequence similarity thresholding.",
"meta": ["similarityThreshold"],
"default": 0.7,
"minimum": 0,
"maximum": 1
"evaluationmode": {
"type": "string",
"errorMessage": "evaluationmode is a wittyer parameter. It is by default requires genotype matching. simpleCounting:sc, CrossTypeAndSimpleCounting:cts, genotypematch:d ",
"meta": ["evaluationmode"]
}
},
"required": ["test_vcf", "caller", "vartype"]
Expand Down
29 changes: 29 additions & 0 deletions bin/check_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python

import pysam
from argparse import ArgumentParser

def compare_vcf(file1, file2):
    """Print the record IDs unique to each of two VCF files.

    Args:
        file1: path to the first VCF file.
        file2: path to the second VCF file.
    """
    # Context managers ensure both VCF handles are closed even on error.
    # NOTE(review): records without an ID all collapse to a single None
    # entry in these sets — confirm inputs carry unique IDs.
    with pysam.VariantFile(file1) as vcf1:
        variants1 = {rec.id for rec in vcf1.fetch()}
    with pysam.VariantFile(file2) as vcf2:
        variants2 = {rec.id for rec in vcf2.fetch()}

    unique_to_file1 = variants1 - variants2
    unique_to_file2 = variants2 - variants1

    print(f"Unique to {file1}: {unique_to_file1}")
    print(f"Unique to {file2}: {unique_to_file2}")

if __name__ == '__main__':
    # Parse command-line arguments and run the comparison.
    # (Removed unused local `import os` / `import sys`.)
    parser = ArgumentParser(description='Check differences')
    parser.add_argument('file1', help='First VCF file')
    parser.add_argument('file2', help='Second VCF file to compare against')

    args = parser.parse_args()
    compare_vcf(args.file1, args.file2)
2 changes: 1 addition & 1 deletion bin/merge_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def get_svbenchmark_resuls(file_paths):
FP_pattern = re.compile(r'Number of false positives \(.*\): (\d+)')
recall_pattern = re.compile(r'Recall \(.*\): (\d+\.\d+)%')
precision_pattern = re.compile(r'Precision \(.*\): (\d+\.\d+)%')
f1_pattern = re.compile(r'F1 \(.*\): (\d+\.\d+)')
f1_pattern = re.compile(r'F1 \(.*\): ([\d\.]+(?:e[+-]?\d+)?)')

# Iterate over each table file
for file in file_paths:
Expand Down
58 changes: 58 additions & 0 deletions bin/normalize_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python

# Copyright 2022 - Barcelona Supercomputing Center
# Author: Rodrigo Martin
# BSC Dual License
'''
Generates a normalized VCF file from a VCF
Expected usage:
$ python normalize.py <vcf_file> <output_vcf_file>
Use --help for more information.
'''
import sys
from argparse import ArgumentParser

import pysam


def _contig_to_int(contig):
contig = contig.lower().replace('chr', '')
if contig.isdigit():
return int(contig)
else:
return 22 + ord(contig[0])


if __name__ == '__main__':
    # Make the bundled variant_extractor package importable from ../src/.
    import os
    import sys
    sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/')
    from variant_extractor import VariantExtractor
    from variant_extractor.variants import VariantType  # NOTE(review): imported but unused here

    # Parse arguments
    parser = ArgumentParser(description='Generate normalized VCF file from a VCF file')
    parser.add_argument('vcf_file', help='VCF file')
    parser.add_argument('output_vcf_file', help='Output VCF file')
    parser.add_argument('fasta', help='Reference fasta file coupled with fai')

    args = parser.parse_args()

    # Extract header from original file, recording this command line in it.
    # NOTE(review): the input is opened in text mode — confirm callers pass
    # uncompressed VCFs here.
    with open(args.vcf_file, 'r') as f:
        with pysam.VariantFile(f) as input_file:
            input_file.header.add_meta('cmdline', ' '.join(sys.argv))
            header_str = str(input_file.header)

    # Open output file, write as stream
    with open(args.output_vcf_file, 'w') as output_vcf:
        # Write header
        output_vcf.write(header_str)
        print(f'Reading {args.vcf_file}...')
        # Open input file, read with variant_extractor
        # NOTE(review): all records are materialized in memory before sorting;
        # fine for per-chromosome test data, may be heavy for whole-genome VCFs.
        extractor = VariantExtractor(args.vcf_file,ensure_pairs=False,fasta_ref=args.fasta)
        records = list(extractor)
        # Sort record by chromosome and position (numeric contigs first,
        # per _contig_to_int).
        records.sort(key=lambda x: (_contig_to_int(x.contig), x.pos))
        for variant_record in records:
            output_vcf.write(str(variant_record)+'\n')
10 changes: 8 additions & 2 deletions bin/plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,20 @@ generate_plots <- function(table, benchmark, type, filter, stats) {
labs(title = title1, x = "Tool", y = "Value", color = "Tool") +
facet_wrap(~variable, scales = "free_y") +
theme_minimal() +
theme(legend.position = "right", panel.background = element_rect(fill = "white"))
theme(
legend.position = "right",
panel.background = element_rect(fill = "white"),
axis.text.x = element_text(angle = 30, hjust = 0.5))

# Visualize Precision, Recall, and F1 in separate plots with white background
metric_plot <- ggplot(metric_data, aes(x = Tool, y = value, color = Tool, shape = variable, linetype = variable, group = interaction(variable, Tool))) +
geom_point() +
labs(title = title2, x = "Tool", y = "Value", color = "Metric", linetype = "Metric") +
theme_minimal() +
theme(legend.position = "right", panel.background = element_rect(fill = "white"))
theme(
legend.position = "right",
panel.background = element_rect(fill = "white"),
axis.text.x = element_text(angle = 30, hjust = 0.5))

# Save the plots
tryCatch({
Expand Down
117 changes: 117 additions & 0 deletions bin/vcf_to_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python

# Copyright 2024 - GHGA
# Author: Kuebra Narci
'''
Generates a CSV file from a VCF
Expected usage:
$ python vcf_to_csv.py <vcf_file> <output>
Use --help for more information.
'''
import sys
from argparse import ArgumentParser
import re

import csv

def parse_info_field(info):
    """Parse the INFO field of a VCF line into a dictionary.

    Key=value entries map to their (string) value; flag entries with no
    '=' map to True.

    Args:
        info: raw INFO column text, e.g. 'IMPRECISE;SVTYPE=DEL;SVLEN=-50'.

    Returns:
        dict mapping INFO keys to string values or True for flags.
    """
    info_dict = {}
    for entry in info.split(';'):
        # Split on the first '=' only, so values that themselves contain
        # '=' are kept intact instead of being misread as flags.
        key, sep, value = entry.partition('=')
        if sep:
            info_dict[key] = value
        else:
            info_dict[key] = True
    return info_dict

def extract_gt_from_sample(sample, format_field):
    """Return the GT value from a sample column, or './.' when FORMAT has no GT."""
    keys = format_field.split(':')
    values = sample.split(':')
    try:
        return values[keys.index('GT')]
    except ValueError:
        # 'GT' not present in the FORMAT keys.
        return './.'

def vcf_to_csv(vcf_file, csv_file):
    """Convert a VCF file to a CSV file.

    Writes the first seven VCF columns (CHROM..FILTER), a column for each
    of the INFO keys SUPP_VEC, SUPP, type_inferred, SVTYPE and SVLEN that
    appears in at least one record, and one '<sample>_GT' column per
    sample in the VCF header.

    Args:
        vcf_file: path to an uncompressed VCF file.
        csv_file: path of the CSV file to write.
    """
    # INFO keys promoted to their own CSV columns, in output order.
    optional_keys = ['SUPP_VEC', 'SUPP', 'type_inferred', 'SVTYPE', 'SVLEN']

    headers = []
    sample_headers = []
    records = []
    seen_keys = set()

    with open(vcf_file, 'r') as vcf:
        for line in vcf:
            if line.startswith('##'):
                continue  # Skip meta-information lines
            elif line.startswith('#'):
                headers = line[1:].strip().split('\t')
                sample_headers = headers[9:]  # Sample columns start at the 10th field
            else:
                row = line.strip().split('\t')
                info_dict = parse_info_field(row[7])
                seen_keys.update(k for k in optional_keys if k in info_dict)
                records.append((row, info_dict))

    # Fixed columns plus any optional INFO columns observed in the data.
    headers_to_write = headers[:7]  # CHROM, POS, ID, REF, ALT, QUAL, FILTER
    headers_to_write.extend(k for k in optional_keys if k in seen_keys)
    headers_to_write.extend(f'{sample}_GT' for sample in sample_headers)

    with open(csv_file, 'w', newline='') as csvf:
        csv_writer = csv.writer(csvf)
        csv_writer.writerow(headers_to_write)

        for row, info_dict in records:
            row_to_write = row[:7]
            row_to_write.extend(info_dict.get(k, '') for k in optional_keys if k in seen_keys)
            # Sites-only VCFs have no FORMAT/sample columns; guard row[8]
            # so they don't raise IndexError.
            format_field = row[8] if len(row) > 8 else ''
            gt_values = [extract_gt_from_sample(sample, format_field) for sample in row[9:]]
            row_to_write.extend(gt_values)

            csv_writer.writerow(row_to_write)

if __name__ == '__main__':

    # Parse arguments
    parser = ArgumentParser(description='Generates a CSV file from a VCF')
    parser.add_argument('vcf_file', help='VCF file')
    parser.add_argument('output', help='Output CSV file')
    args = parser.parse_args()

    # Run the conversion.
    vcf_to_csv(args.vcf_file, args.output)
Loading