Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

58 test variant extractor for svs #60

Merged
merged 9 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions assets/samplesheet_HG002_hg37.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
test_vcf,caller,vartype
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.chr21.vcf.gz",lumpy,sv
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf",manta,sv
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/full.svaba.germline.sv.chr21.vcf",svaba,sv
test_vcf,caller,vartype,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,dup_to_ins,pctsize,pctseq,pctovl,evaluationmode
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf.gz",manta,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.svaba.germline.sv.chr21.vcf.gz",svaba,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002-NA24385-50x.union_170414.split.chr21.vcf.gz",sbg_graph,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.jointVC.filter.chr21.vcf.gz",gatk_joint,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_ALLCHROM_hs37d5_novoalign_Ilmn150bp300X_FB.chr21.vcf.gz",freebayes,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/hg002.Assemblytics_structural_variants.sorted.chr21.vcf.gz",assemblytics,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts
13 changes: 6 additions & 7 deletions assets/samplesheet_full.csv
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.chr21.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.strelka.variants.chr21.vcf.gz,strelka,small,,,,,,,,,,
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.bcftools.chr21.vcf.gz,bcftools,small,,,,,,,,,,
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,evaluationmode
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.strelka.variants.chr21.vcf.gz,strelka,small,,,,,,,,,,,,
https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.bcftools.chr21.vcf.gz,bcftools,small,,,,,,,,,,,,
71 changes: 26 additions & 45 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,96 +36,77 @@
"normdist": {
"type": "number",
"errorMessage": "normshift is a svbenchmark parameter. Has to be between 0-1. Disallow matches if alternate alleles have normalized edit distance greater than normdist (default 0.2)",
"meta": ["normdist"],
"default": 0.2,
"minimum": 0,
"maximum": 1
Comment on lines -41 to -42
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can keep these. they won't be validated if the value has not been given

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wanted to be safe 🗡️ ahaha

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't worry it won't break anything :)

"meta": ["normdist"]
},
"normsizediff": {
"type": "number",
"errorMessage": "normsizediff is a svbenchmark parameter. Has to be between 0-1. Disallow matches if alternate alleles have normalized size difference greater than normsizediff (default 0.2)",
"meta": ["normsizediff"],
"default": 0.2,
"minimum": 0,
"maximum": 1
"meta": ["normsizediff"]
},
"maxdist": {
"type": "integer",
"errorMessage": "maxdist is a svbenchmark parameter. Disallow matches if positions of two variants are more than maxdist bases from each other (default 100,000).",
"meta": ["maxdist"],
"default": 100000
"meta": ["maxdist"]
},
"pctsize": {
"type": "number",
"errorMessage": "pctsize is a truvari parameter. Has to be between 0-1. Ratio of min(base_size, comp_size)/max(base_size, comp_size).",
"meta": ["pctsize"],
"default": 0.7,
"minimum": 0,
"maximum": 1
"meta": ["pctsize"]
},
"pctseq": {
"type": "number",
"errorMessage": "pctseq is a truvari parameter. Has to be between 0-1. Edit distance ratio between the REF/ALT haplotype sequences of base and comparison call. turn it off (0) for no sequence comparison.",
"meta": ["pctseq"],
"default": 0.7,
"minimum": 0,
"maximum": 1
"meta": ["pctseq"]
},
"pctovl": {
"type": "number",
"errorMessage": "pctovl is a truvari parameter. Has to be between 0-1. Ratio of two calls' (overlapping bases)/(longest span)",
"meta": ["pctovl"],
"default": 0,
"minimum": 0,
"maximum": 1
"meta": ["pctovl"]
},
"refdist": {
"type": "integer",
"errorMessage": "refdist is a truvari parameter. Maximum distance comparison calls must be within from base call's start/end ",
"meta": ["refdist"],
"default": 500
"meta": ["refdist"]
},
"chunksize": {
"type": "integer",
"errorMessage": "chunksize is a truvari parameter. Create chunks of all calls overlapping within ±`--chunksize` basepairs",
"meta": ["chunksize"],
"default": 500
"meta": ["chunksize"]
},
"dup_to_ins": {
"type": "boolean",
"errorMessage": "a truvari parameter. converts DUP to INS type",
"meta": ["dup_to_ins"]
},
"typeignore": {
"type": "boolean",
"errorMessage": "a truvari parameter. Ignore SVTYPE matching",
"meta": ["typeignore"]
},
"bpDistance": {
"type": "integer",
"errorMessage": "bpDistance is a wittyer parameter. Upper bound of boundary distance when comparing truth and query. By default it is 500bp for all types except for Insertions, which are 100bp.Please note that if you set this value in the command line, it overrides all the defaults, so Insertions and other types will have the same bpd.",
"meta": ["bpDistance"],
"default": 500,
"minimum": 0
"meta": ["bpDistance"]
},
"percentThreshold": {
"type": "number",
"errorMessage": "percentThreshold is a wittyer parameter. This is used for percentage thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use for large tandem repeats. For all other SVs, in order to match between query and truth, the distance between boundaries should be within a number thats proportional to total SV (default 0.25)",
"meta": ["percentThreshold"],
"default": 0.25,
"minimum": 0,
"maximum": 1
"meta": ["percentThreshold"]
},
"absoluteThreshold": {
"type": "integer",
"errorMessage": "absoluteThreshold is a wittyer parameter. This is used for absolute thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use. For all other SVs, this is the upper bound of boundary distance when comparing truth and query. (default 10000)",
"meta": ["absoluteThreshold"],
"default": 10000,
"minimum": 0
"meta": ["absoluteThreshold"]
},
"maxMatches": {
"type": "integer",
"errorMessage": "maxMatches is a wittyer parameter. This is used for matching behaviour. Negative value means to match any number (for large SVs it is not recommended).",
"meta": ["maxMatches"],
"default": 10
"meta": ["maxMatches"]
},
"similarityThreshold": {
"type": "number",
"errorMessage": "similarityThreshold is a wittyer parameter. This is used for sequence similarity thresholding.",
"meta": ["similarityThreshold"],
"default": 0.7,
"minimum": 0,
"maximum": 1
"evaluationmode": {
"type": "string",
"errorMessage": "evaluationmode is a wittyer parameter. It is by default requires genotype matching. simpleCounting:sc, CrossTypeAndSimpleCounting:cts, genotypematch:d ",
"meta": ["evaluationmode"]
}
},
"required": ["test_vcf", "caller", "vartype"]
Expand Down
29 changes: 29 additions & 0 deletions bin/check_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python

import pysam
from argparse import ArgumentParser

def compare_vcf(file1, file2):
    """Print the record IDs unique to each of two VCF files.

    Args:
        file1: path to the first VCF file.
        file2: path to the second VCF file.
    """
    # Context managers ensure both VCF handles are closed even on error.
    # NOTE(review): records without an ID all collapse to a single None
    # entry in these sets — confirm inputs carry unique IDs.
    with pysam.VariantFile(file1) as vcf1:
        variants1 = {rec.id for rec in vcf1.fetch()}
    with pysam.VariantFile(file2) as vcf2:
        variants2 = {rec.id for rec in vcf2.fetch()}

    unique_to_file1 = variants1 - variants2
    unique_to_file2 = variants2 - variants1

    print(f"Unique to {file1}: {unique_to_file1}")
    print(f"Unique to {file2}: {unique_to_file2}")

if __name__ == '__main__':
    # Parse command-line arguments and run the comparison.
    # (Removed unused local `import os` / `import sys`.)
    parser = ArgumentParser(description='Check differences')
    parser.add_argument('file1', help='First VCF file')
    parser.add_argument('file2', help='Second VCF file to compare against')

    args = parser.parse_args()
    compare_vcf(args.file1, args.file2)
2 changes: 1 addition & 1 deletion bin/merge_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def get_svbenchmark_resuls(file_paths):
FP_pattern = re.compile(r'Number of false positives \(.*\): (\d+)')
recall_pattern = re.compile(r'Recall \(.*\): (\d+\.\d+)%')
precision_pattern = re.compile(r'Precision \(.*\): (\d+\.\d+)%')
f1_pattern = re.compile(r'F1 \(.*\): (\d+\.\d+)')
f1_pattern = re.compile(r'F1 \(.*\): ([\d\.]+(?:e[+-]?\d+)?)')

# Iterate over each table file
for file in file_paths:
Expand Down
58 changes: 58 additions & 0 deletions bin/normalize_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python

# Copyright 2022 - Barcelona Supercomputing Center
# Author: Rodrigo Martin
# BSC Dual License
'''
Generates a normalized VCF file from a VCF
Expected usage:
$ python normalize.py <vcf_file> <output_vcf_file>
Use --help for more information.
'''
import sys
from argparse import ArgumentParser

import pysam


def _contig_to_int(contig):
contig = contig.lower().replace('chr', '')
if contig.isdigit():
return int(contig)
else:
return 22 + ord(contig[0])


if __name__ == '__main__':
    # Make the bundled variant_extractor package importable from ../src/.
    import os
    import sys
    sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/')
    from variant_extractor import VariantExtractor
    from variant_extractor.variants import VariantType  # NOTE(review): imported but unused here

    # Parse arguments
    parser = ArgumentParser(description='Generate normalized VCF file from a VCF file')
    parser.add_argument('vcf_file', help='VCF file')
    parser.add_argument('output_vcf_file', help='Output VCF file')
    parser.add_argument('fasta', help='Reference fasta file coupled with fai')

    args = parser.parse_args()

    # Extract header from original file, recording this command line in it.
    # NOTE(review): the input is opened in text mode — confirm callers pass
    # uncompressed VCFs here.
    with open(args.vcf_file, 'r') as f:
        with pysam.VariantFile(f) as input_file:
            input_file.header.add_meta('cmdline', ' '.join(sys.argv))
            header_str = str(input_file.header)

    # Open output file, write as stream
    with open(args.output_vcf_file, 'w') as output_vcf:
        # Write header
        output_vcf.write(header_str)
        print(f'Reading {args.vcf_file}...')
        # Open input file, read with variant_extractor
        # NOTE(review): all records are materialized in memory before sorting;
        # fine for per-chromosome test data, may be heavy for whole-genome VCFs.
        extractor = VariantExtractor(args.vcf_file,ensure_pairs=False,fasta_ref=args.fasta)
        records = list(extractor)
        # Sort record by chromosome and position (numeric contigs first,
        # per _contig_to_int).
        records.sort(key=lambda x: (_contig_to_int(x.contig), x.pos))
        for variant_record in records:
            output_vcf.write(str(variant_record)+'\n')
10 changes: 8 additions & 2 deletions bin/plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,20 @@ generate_plots <- function(table, benchmark, type, filter, stats) {
labs(title = title1, x = "Tool", y = "Value", color = "Tool") +
facet_wrap(~variable, scales = "free_y") +
theme_minimal() +
theme(legend.position = "right", panel.background = element_rect(fill = "white"))
theme(
legend.position = "right",
panel.background = element_rect(fill = "white"),
axis.text.x = element_text(angle = 30, hjust = 0.5))

# Visualize Precision, Recall, and F1 in separate plots with white background
metric_plot <- ggplot(metric_data, aes(x = Tool, y = value, color = Tool, shape = variable, linetype = variable, group = interaction(variable, Tool))) +
geom_point() +
labs(title = title2, x = "Tool", y = "Value", color = "Metric", linetype = "Metric") +
theme_minimal() +
theme(legend.position = "right", panel.background = element_rect(fill = "white"))
theme(
legend.position = "right",
panel.background = element_rect(fill = "white"),
axis.text.x = element_text(angle = 30, hjust = 0.5))

# Save the plots
tryCatch({
Expand Down
117 changes: 117 additions & 0 deletions bin/vcf_to_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python

# Copyright 2024 - GHGA
# Author: Kuebra Narci
'''
Generates a CSV file from a VCF
Expected usage:
$ python vcf_to_csv.py <vcf_file> <output>
Use --help for more information.
'''
import sys
from argparse import ArgumentParser
import re

import csv

def parse_info_field(info):
    """Parse the INFO field of a VCF line into a dictionary.

    Key=value entries map to their (string) value; flag entries with no
    '=' map to True.

    Args:
        info: raw INFO column text, e.g. 'IMPRECISE;SVTYPE=DEL;SVLEN=-50'.

    Returns:
        dict mapping INFO keys to string values or True for flags.
    """
    info_dict = {}
    for entry in info.split(';'):
        # Split on the first '=' only, so values that themselves contain
        # '=' are kept intact instead of being misread as flags.
        key, sep, value = entry.partition('=')
        if sep:
            info_dict[key] = value
        else:
            info_dict[key] = True
    return info_dict

def extract_gt_from_sample(sample, format_field):
    """Return the GT value from a sample column, or './.' when FORMAT has no GT."""
    keys = format_field.split(':')
    values = sample.split(':')
    try:
        return values[keys.index('GT')]
    except ValueError:
        # 'GT' not present in the FORMAT keys.
        return './.'

def vcf_to_csv(vcf_file, csv_file):
    """Convert a VCF file to a CSV file.

    Writes the first seven VCF columns (CHROM..FILTER), a column for each
    of the INFO keys SUPP_VEC, SUPP, type_inferred, SVTYPE and SVLEN that
    appears in at least one record, and one '<sample>_GT' column per
    sample in the VCF header.

    Args:
        vcf_file: path to an uncompressed VCF file.
        csv_file: path of the CSV file to write.
    """
    # INFO keys promoted to their own CSV columns, in output order.
    optional_keys = ['SUPP_VEC', 'SUPP', 'type_inferred', 'SVTYPE', 'SVLEN']

    headers = []
    sample_headers = []
    records = []
    seen_keys = set()

    with open(vcf_file, 'r') as vcf:
        for line in vcf:
            if line.startswith('##'):
                continue  # Skip meta-information lines
            elif line.startswith('#'):
                headers = line[1:].strip().split('\t')
                sample_headers = headers[9:]  # Sample columns start at the 10th field
            else:
                row = line.strip().split('\t')
                info_dict = parse_info_field(row[7])
                seen_keys.update(k for k in optional_keys if k in info_dict)
                records.append((row, info_dict))

    # Fixed columns plus any optional INFO columns observed in the data.
    headers_to_write = headers[:7]  # CHROM, POS, ID, REF, ALT, QUAL, FILTER
    headers_to_write.extend(k for k in optional_keys if k in seen_keys)
    headers_to_write.extend(f'{sample}_GT' for sample in sample_headers)

    with open(csv_file, 'w', newline='') as csvf:
        csv_writer = csv.writer(csvf)
        csv_writer.writerow(headers_to_write)

        for row, info_dict in records:
            row_to_write = row[:7]
            row_to_write.extend(info_dict.get(k, '') for k in optional_keys if k in seen_keys)
            # Sites-only VCFs have no FORMAT/sample columns; guard row[8]
            # so they don't raise IndexError.
            format_field = row[8] if len(row) > 8 else ''
            gt_values = [extract_gt_from_sample(sample, format_field) for sample in row[9:]]
            row_to_write.extend(gt_values)

            csv_writer.writerow(row_to_write)

if __name__ == '__main__':

    # Parse arguments
    parser = ArgumentParser(description='Generates a CSV file from a VCF')
    parser.add_argument('vcf_file', help='VCF file')
    parser.add_argument('output', help='Output CSV file')
    args = parser.parse_args()

    # Run the conversion.
    vcf_to_csv(args.vcf_file, args.output)
Loading