From 34292480e1c19d115b5f681269b13e80aa65440f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Tue, 2 Jul 2024 16:00:45 +0200 Subject: [PATCH 1/9] add variant_extractor --- assets/samplesheet_HG002_hg37.csv | 10 ++-- bin/check_diff.py | 29 ++++++++++ bin/normalize_vcf.py | 58 +++++++++++++++++++ conf/modules.config | 8 +++ conf/test_hg37.config | 15 +++-- .../local/variant_extractor/environment.yml | 8 +++ modules/local/variant_extractor/main.nf | 48 +++++++++++++++ nextflow.config | 2 +- .../local/compare_benchmark_results.nf | 21 ++++--- subworkflows/local/sv_vcf_conversion.nf | 24 +++++++- 10 files changed, 201 insertions(+), 22 deletions(-) create mode 100755 bin/check_diff.py create mode 100755 bin/normalize_vcf.py create mode 100644 modules/local/variant_extractor/environment.yml create mode 100644 modules/local/variant_extractor/main.nf diff --git a/assets/samplesheet_HG002_hg37.csv b/assets/samplesheet_HG002_hg37.csv index c711b62..0d14775 100644 --- a/assets/samplesheet_HG002_hg37.csv +++ b/assets/samplesheet_HG002_hg37.csv @@ -1,5 +1,5 @@ -test_vcf,caller,vartype -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.chr21.vcf.gz",lumpy,sv -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf",manta,sv -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/full.svaba.germline.sv.chr21.vcf",svaba,sv +test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf.gz",manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/full.svaba.germline.sv.chr21.vcf.gz",svaba,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 diff --git a/bin/check_diff.py b/bin/check_diff.py new file mode 100755 index 0000000..cff223a --- /dev/null +++ b/bin/check_diff.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +import pysam +from argparse import ArgumentParser + +def compare_vcf(file1, file2): + vcf1 = pysam.VariantFile(file1) + vcf2 = pysam.VariantFile(file2) + + variants1 = {rec.id for rec in vcf1.fetch()} + variants2 = {rec.id for rec in vcf2.fetch()} + + unique_to_file1 = variants1 - variants2 + unique_to_file2 = variants2 - variants1 + + print(f"Unique to {file1}: {unique_to_file1}") + print(f"Unique to {file2}: {unique_to_file2}") + +if __name__ == '__main__': + import os + import sys + + # Parse arguments + parser = ArgumentParser(description='Check differences') + parser.add_argument('file1', help='VCF file') + parser.add_argument('file2', help='Output VCF file') + + args = parser.parse_args() + compare_vcf(args.file1, args.file2) diff --git a/bin/normalize_vcf.py b/bin/normalize_vcf.py new file mode 100755 index 0000000..1b30391 --- /dev/null +++ b/bin/normalize_vcf.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +# Copyright 2022 - Barcelona Supercomputing Center +# Author: Rodrigo Martin +# BSC Dual License +''' +Generates a normalized VCF file from a VCF +Expected usage: + $ python normalize.py +Use --help for more information. +''' +import sys +from argparse import ArgumentParser + +import pysam + + +def _contig_to_int(contig): + contig = contig.lower().replace('chr', '') + if contig.isdigit(): + return int(contig) + else: + return 22 + ord(contig[0]) + + +if __name__ == '__main__': + import os + import sys + sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/') + from variant_extractor import VariantExtractor + from variant_extractor.variants import VariantType + + # Parse arguments + parser = ArgumentParser(description='Generate normalized VCF file from a VCF file') + parser.add_argument('vcf_file', help='VCF file') + parser.add_argument('output_vcf_file', help='Output VCF file') + parser.add_argument('fasta', help='Reference fasta file coupled with fai') + + args = parser.parse_args() + + # Extract header from original file + with open(args.vcf_file, 'r') as f: + with pysam.VariantFile(f) as input_file: + input_file.header.add_meta('cmdline', ' '.join(sys.argv)) + header_str = str(input_file.header) + + # Open output file, write as stream + with open(args.output_vcf_file, 'w') as output_vcf: + # Write header + output_vcf.write(header_str) + print(f'Reading {args.vcf_file}...') + # Open input file, read with variant_extractor + extractor = VariantExtractor(args.vcf_file,ensure_pairs=False,fasta_ref=args.fasta) + records = list(extractor) + # Sort record by chromosome and position + records.sort(key=lambda x: (_contig_to_int(x.contig), x.pos)) + for variant_record in records: + output_vcf.write(str(variant_record)+'\n') diff --git a/conf/modules.config b/conf/modules.config index 6582b65..8d46489 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,6 +34,14 @@ process { ] } + withName: "VARIANT_EXTRACTOR" { + ext.prefix = { input.baseName - ".vcf" } + publishDir = [ + path: {"${params.outdir}/${meta.id}/preprocess"}, + pattern: "*{.vcf}", + mode: params.publish_dir_mode + ] + } withName: "BCFTOOLS_DEDUP" { ext.prefix = { vcf.baseName - ".vcf" + ".dedup"} ext.args = {"--output-type z --rm-du exact -c w" } diff --git a/conf/test_hg37.config b/conf/test_hg37.config index 1fe7511..3da09f4 100644 --- a/conf/test_hg37.config +++ b/conf/test_hg37.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/variantbenchmarking -profile test, --outdir + nextflow run nf-core/variantbenchmarking -profile test_hg37, --outdir ---------------------------------------------------------------------------------------- */ @@ -22,7 +22,7 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'assets/samplesheet_HG002.csv' + input = 'assets/samplesheet_HG002_hg37.csv' outdir = 'results' // Genome references @@ -30,13 +30,12 @@ params { analysis = 'germline' //somatic method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist - standardization = true - preprocess = "normalization, deduplication" - //bnd_to_inv = true + preprocess = "" + min_sv_size = 30 + sv_standardization = "homogenize" sample = "HG002" // available samples: SEQC2, HG002 - truth_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.vcf.gz" - high_conf_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.bed" + truth_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/truth/HG002_SVs_Tier1_v0.6.chr21.vcf.gz" + high_conf_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/truth/HG002_SVs_Tier1_v0.6.chr21.bed" } - diff --git a/modules/local/variant_extractor/environment.yml b/modules/local/variant_extractor/environment.yml new file mode 100644 index 0000000..ba302f8 --- /dev/null +++ b/modules/local/variant_extractor/environment.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::pysam=0.22.1 + - pip + - pip: + - variant-extractor==4.0.6 diff --git a/modules/local/variant_extractor/main.nf b/modules/local/variant_extractor/main.nf new file mode 100644 index 0000000..b6da3dc --- /dev/null +++ b/modules/local/variant_extractor/main.nf @@ -0,0 +1,48 @@ +process VARIANT_EXTRACTOR { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/pysam_pip_variant-extractor:23ef51c8f5e0524a': + 'community.wave.seqera.io/library/pysam_pip_variant-extractor:23ef51c8f5e0524a' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fasta_fai) + + output: + tuple val(meta), path("*.norm.vcf") , emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + normalize_vcf.py \\ + $input \\ + ${prefix}.norm.vcf \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.norm.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + +} diff --git a/nextflow.config b/nextflow.config index 954b1b7..31794fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,7 +39,7 @@ params { sv_standardization = "" // harmonize,standardization,dup_to_ins,bnd_to_inv,gridss_annotate // Benchmarking method - method = 'truvari,svanalyzer,happy,rtgtools,wittyer' // --not working for now : vcfdist + method = 'truvari,svanalyzer,happy,rtgtools,wittyer' // Filtering parameters // minsize effects both truvari and variantbenchmarkingmark in different ways! svbechmark filters both base and comp calls diff --git a/subworkflows/local/compare_benchmark_results.nf b/subworkflows/local/compare_benchmark_results.nf index 89875f2..4a7f814 100644 --- a/subworkflows/local/compare_benchmark_results.nf +++ b/subworkflows/local/compare_benchmark_results.nf @@ -7,11 +7,13 @@ params.options = [:] include { SURVIVOR_MERGE } from '../../modules/nf-core/survivor/merge' addParams( options: params.options ) include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) include { BCFTOOLS_MERGE } from '../../modules/nf-core/bcftools/merge' addParams( options: params.options ) +include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' addParams( options: params.options ) include { TABIX_BGZIP as TABIX_BGZIP_BENCH } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_SV } from '../../modules/nf-core/bcftools/query' addParams( options: params.options ) include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_SMALL } from '../../modules/nf-core/bcftools/query' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_SORT } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { TABIX_TABIX as TABIX_TABIX_MERGE } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) workflow COMPARE_BENCHMARK_RESULTS { take: @@ -35,13 +37,13 @@ workflow COMPARE_BENCHMARK_RESULTS { ) versions = versions.mix(BCFTOOLS_MERGE.out.versions) - TABIX_TABIX( + TABIX_TABIX_MERGE( BCFTOOLS_MERGE.out.merged_variants ) - versions = versions.mix(TABIX_TABIX.out.versions) + versions = versions.mix(TABIX_TABIX_MERGE.out.versions) BCFTOOLS_QUERY_SMALL( - BCFTOOLS_MERGE.out.merged_variants.join(TABIX_TABIX.out.tbi), + BCFTOOLS_MERGE.out.merged_variants.join(TABIX_TABIX_MERGE.out.tbi), [], [], [] @@ -78,16 +80,21 @@ workflow COMPARE_BENCHMARK_RESULTS { ) versions = versions.mix(SURVIVOR_MERGE.out.versions) - TABIX_BGZIPTABIX( + BCFTOOLS_SORT( SURVIVOR_MERGE.out.vcf ) - versions = versions.mix(TABIX_BGZIPTABIX.out.versions) + versions = versions.mix(BCFTOOLS_SORT.out.versions) + + TABIX_TABIX_SORT( + BCFTOOLS_SORT.out.vcf + ) + versions = versions.mix(TABIX_TABIX_SORT.out.versions) // add a plot to see better supported variants // https://github.com/fritzsedlazeck/SURVIVOR/wiki BCFTOOLS_QUERY_SV( - TABIX_BGZIPTABIX.out.gz_tbi, + BCFTOOLS_SORT.out.vcf.join(TABIX_TABIX_SORT.out.tbi), [], [], [] diff --git a/subworkflows/local/sv_vcf_conversion.nf b/subworkflows/local/sv_vcf_conversion.nf index 4f6a235..fe64d82 100644 --- a/subworkflows/local/sv_vcf_conversion.nf +++ b/subworkflows/local/sv_vcf_conversion.nf @@ -10,6 +10,8 @@ include { MANTA_CONVERTINVERSION } from '../../modules/nf-core/manta/convertinv include { GRIDSS_ANNOTATION } from '../../modules/local/gridss_annotation' addParams( options: params.options ) include { SVYNC } from '../../modules/nf-core/svync' addParams( options: params.options ) include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' addParams( options: params.options ) +include { VARIANT_EXTRACTOR } from '../../modules/local/variant_extractor' addParams( options: params.options ) +include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' addParams( options: params.options ) workflow SV_VCF_CONVERSIONS { take: @@ -21,6 +23,25 @@ workflow SV_VCF_CONVERSIONS { main: versions = Channel.empty() + if (params.sv_standardization.contains("homogenize")){ + // uses VariantExtractor to homogenize variants + + VARIANT_EXTRACTOR( + input_ch, + fasta, + fai + ) + versions = versions.mix(VARIANT_EXTRACTOR.out.versions) + + // sort vcf + BCFTOOLS_SORT( + VARIANT_EXTRACTOR.out.output + ) + versions = versions.mix(BCFTOOLS_SORT.out.versions) + input_ch = BCFTOOLS_SORT.out.vcf + + } + // // MODULE: BGZIP_TABIX // @@ -130,7 +151,8 @@ workflow SV_VCF_CONVERSIONS { out_vcf_ch = out_vcf_ch.mix(input.other) vcf_ch = out_vcf_ch } - // https://github.com/EUCANCan/variant-extractor/blob/main/examples/vcf_to_csv.py + // https://github.com/EUCANCan/variant-extractor/blob/main/examples/vcf_to_csv.py' + emit: vcf_ch From 28e3231afb38f067d81e22ecb87ca31f5db39ea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Wed, 3 Jul 2024 08:51:26 +0000 Subject: [PATCH 2/9] add vcf_to_csv.py --- bin/vcf_to_csv.py | 35 +++++++++++ conf/modules.config | 30 ++-------- conf/test_full.config | 6 +- modules/local/vcf_to_csv/environment.yml | 9 +++ modules/local/vcf_to_csv/main.nf | 48 +++++++++++++++ .../local/compare_benchmark_results.nf | 59 +++++-------------- subworkflows/local/prepare_vcfs_test.nf | 30 +++++----- subworkflows/local/prepare_vcfs_truth.nf | 12 ++-- .../local/report_benchmark_statistics.nf | 4 +- subworkflows/local/report_vcf_statistics.nf | 4 +- .../local/small_germline_benchmark.nf | 6 +- subworkflows/local/small_somatic_benchmark.nf | 4 +- subworkflows/local/sv_germline_benchmark.nf | 12 ++-- .../local/sv_somatic_benchmark copy.nf | 2 +- subworkflows/local/sv_vcf_conversion.nf | 14 ++--- .../local/vcf_variant_deduplication.nf | 6 +- subworkflows/local/vcf_variant_filtering.nf | 8 +-- 17 files changed, 167 insertions(+), 122 deletions(-) create mode 100755 bin/vcf_to_csv.py create mode 100644 modules/local/vcf_to_csv/environment.yml create mode 100644 modules/local/vcf_to_csv/main.nf diff --git a/bin/vcf_to_csv.py b/bin/vcf_to_csv.py new file mode 100755 index 0000000..6af803c --- /dev/null +++ b/bin/vcf_to_csv.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# Copyright 2022 - Barcelona Supercomputing Center +# Author: Rodrigo Martin +# BSC Dual License +''' +Generates a CSV file from an input VCF file +Expected usage: + $ python vcf_to_csv.py +Use --help for more information. +''' +from argparse import ArgumentParser + +if __name__ == '__main__': + import os + import sys + sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/') + from variant_extractor import VariantExtractor + + # Parse arguments + parser = ArgumentParser(description='Generate CSV file from a VCF file') + parser.add_argument('vcf_file', help='VCF file') + parser.add_argument('output_file', help='Output file') + parser.add_argument('-f', '--fasta-ref', help='FASTA reference file') + args = parser.parse_args() + + variants = [] + + print(f'Reading VCF file: {args.vcf_file}') + extractor = VariantExtractor(args.vcf_file, fasta_ref=args.fasta_ref) + df = extractor.to_dataframe() + # Insert id column in the first position + df.insert(0, 'id', '') + df['id'] = df['variant_record_obj'].apply(lambda x: x.id) + df.drop(['variant_record_obj'], axis=1, inplace=True) + df.to_csv(f'{args.output_file}', index=False) diff --git a/conf/modules.config b/conf/modules.config index 8d46489..f7faf25 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -269,36 +269,16 @@ process { } withName: SURVIVOR_MERGE { ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"} - publishDir = [ - path: {"${params.outdir}/summary/merged_vcf"}, - pattern: "*{vcf}", - mode: params.publish_dir_mode - ] } withName: BCFTOOLS_MERGE { ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"} - ext.args = {"--output-type z --force-samples"} - publishDir = [ - path: {"${params.outdir}/summary/merged_vcf"}, - pattern: "*{vcf.gz}", - mode: params.publish_dir_mode - ] - } - withName: BCFTOOLS_QUERY_SV { - ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"} - ext.args = {"-f '%CHROM\t%POS\t%INFO/SVTYPE\t%INFO/SVLEN\t%INFO/SUPP_VEC\t%INFO/SUPP\t%ID[\t%SAMPLE]'" } - publishDir = [ - path: {"${params.outdir}/summary/merged_vcf"}, - pattern: "*{.txt}", - mode: params.publish_dir_mode - ] + ext.args = {"--output-type v --force-samples"} } - withName: BCFTOOLS_QUERY_SMALL { + withName: VCF_TO_CSV { ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"} - ext.args = {"-f '%CHROM\t%POS\t%REF\t%ALT\t%TYPE\t[%GT]\t%ID[\t%SAMPLE]'" } publishDir = [ - path: {"${params.outdir}/summary/merged_vcf"}, - pattern: "*{.txt}", + path: {"${params.outdir}/summary/comparisons"}, + pattern: "*{.csv}", mode: params.publish_dir_mode ] } @@ -307,7 +287,7 @@ process { // Don't publish results for these processes // process { - withName: 'TABIX_TABIX|TABIX_BGZIPTABIX|TABIX_BGZIP|BGZIP_TABIX' { + withName: 'TABIX_TABIX|TABIX_BGZIP|TABIX_BGZIPTABIX|BGZIP_TABIX|SURVIVOR_MERGE|BCFTOOLS_MERGE' { publishDir = [ path: { "${params.outdir}/test" }, enabled: false diff --git a/conf/test_full.config b/conf/test_full.config index 52723f8..fa0f52d 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -27,8 +27,10 @@ params { // Processes analysis = 'germline' - method = 'happy,truvari,svanalyzer' // - preprocess = "normalization,deduplication,prepy" + method = 'happy,truvari,svanalyzer,rtgtools' // + preprocess = "normalization,deduplication" + min_sv_size = 30 + sv_standardization = "bed_to_ins" //variant_filtering = "include" // null, include, exclude //expression = 'FILTER="."' diff --git a/modules/local/vcf_to_csv/environment.yml b/modules/local/vcf_to_csv/environment.yml new file mode 100644 index 0000000..a03f0fd --- /dev/null +++ b/modules/local/vcf_to_csv/environment.yml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::pysam=0.22.1 + - conda-forge::pandas=2.2.2 + - pip + - pip: + - variant-extractor==4.0.6 diff --git a/modules/local/vcf_to_csv/main.nf b/modules/local/vcf_to_csv/main.nf new file mode 100644 index 0000000..c99b502 --- /dev/null +++ b/modules/local/vcf_to_csv/main.nf @@ -0,0 +1,48 @@ +process VCF_TO_CSV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/pysam_pandas_pip_variant-extractor:a2d9ec4a4efadbc3': + 'community.wave.seqera.io/library/pysam_pandas_pip_variant-extractor:a2d9ec4a4efadbc3' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fasta_fai) + + output: + tuple val(meta), path("*.csv") , emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + vcf_to_csv.py \\ + $input \\ + ${prefix}.csv \\ + -f $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ + +} diff --git a/subworkflows/local/compare_benchmark_results.nf b/subworkflows/local/compare_benchmark_results.nf index 4a7f814..b527bdb 100644 --- a/subworkflows/local/compare_benchmark_results.nf +++ b/subworkflows/local/compare_benchmark_results.nf @@ -5,15 +5,10 @@ params.options = [:] -include { SURVIVOR_MERGE } from '../../modules/nf-core/survivor/merge' addParams( options: params.options ) -include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { BCFTOOLS_MERGE } from '../../modules/nf-core/bcftools/merge' addParams( options: params.options ) -include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' addParams( options: params.options ) -include { TABIX_BGZIP as TABIX_BGZIP_BENCH } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) -include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_SV } from '../../modules/nf-core/bcftools/query' addParams( options: params.options ) -include { BCFTOOLS_QUERY as BCFTOOLS_QUERY_SMALL } from '../../modules/nf-core/bcftools/query' addParams( options: params.options ) -include { TABIX_TABIX as TABIX_TABIX_SORT } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { TABIX_TABIX as TABIX_TABIX_MERGE } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) +include { SURVIVOR_MERGE } from '../../modules/nf-core/survivor/merge' +include { BCFTOOLS_MERGE } from '../../modules/nf-core/bcftools/merge' +include { VCF_TO_CSV } from '../../modules/local/vcf_to_csv' +include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip' workflow COMPARE_BENCHMARK_RESULTS { take: @@ -24,6 +19,7 @@ workflow COMPARE_BENCHMARK_RESULTS { main: versions = Channel.empty() + merged_vcfs= Channel.empty() // Small Variants // @@ -36,19 +32,7 @@ workflow COMPARE_BENCHMARK_RESULTS { [] ) versions = versions.mix(BCFTOOLS_MERGE.out.versions) - - TABIX_TABIX_MERGE( - BCFTOOLS_MERGE.out.merged_variants - ) - versions = versions.mix(TABIX_TABIX_MERGE.out.versions) - - BCFTOOLS_QUERY_SMALL( - BCFTOOLS_MERGE.out.merged_variants.join(TABIX_TABIX_MERGE.out.tbi), - [], - [], - [] - ) - versions = versions.mix(BCFTOOLS_QUERY_SMALL.out.versions) + merged_vcfs = merged_vcfs.mix(BCFTOOLS_MERGE.out.merged_variants) // SV part // @@ -56,12 +40,12 @@ workflow COMPARE_BENCHMARK_RESULTS { // // unzip vcfs - TABIX_BGZIP_BENCH( + TABIX_BGZIP( sv_ch ) - versions = versions.mix(TABIX_BGZIP_BENCH.out.versions) + versions = versions.mix(TABIX_BGZIP.out.versions) - TABIX_BGZIP_BENCH.out.output + TABIX_BGZIP.out.output .groupTuple() .set{vcf_ch} @@ -79,27 +63,14 @@ workflow COMPARE_BENCHMARK_RESULTS { 30 ) versions = versions.mix(SURVIVOR_MERGE.out.versions) + merged_vcfs = merged_vcfs.mix(SURVIVOR_MERGE.out.vcf) - BCFTOOLS_SORT( - SURVIVOR_MERGE.out.vcf - ) - versions = versions.mix(BCFTOOLS_SORT.out.versions) - - TABIX_TABIX_SORT( - BCFTOOLS_SORT.out.vcf - ) - versions = versions.mix(TABIX_TABIX_SORT.out.versions) - - // add a plot to see better supported variants - // https://github.com/fritzsedlazeck/SURVIVOR/wiki - - BCFTOOLS_QUERY_SV( - BCFTOOLS_SORT.out.vcf.join(TABIX_TABIX_SORT.out.tbi), - [], - [], - [] + VCF_TO_CSV( + merged_vcfs, + fasta, + fai ) - versions = versions.mix(BCFTOOLS_QUERY_SV.out.versions) + versions = versions.mix(VCF_TO_CSV.out.versions) emit: versions diff --git a/subworkflows/local/prepare_vcfs_test.nf b/subworkflows/local/prepare_vcfs_test.nf index 0b8bdf8..e21af3a 100644 --- a/subworkflows/local/prepare_vcfs_test.nf +++ b/subworkflows/local/prepare_vcfs_test.nf @@ -3,21 +3,21 @@ // params.options = [:] -include { VCF_VARIANT_DEDUPLICATION } from '../local/vcf_variant_deduplication' addParams( options: params.options ) -include { VCF_VARIANT_FILTERING } from '../local/vcf_variant_filtering' addParams( options: params.options ) -include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' addParams( options: params.options ) -include { HAPPY_PREPY } from '../../modules/nf-core/happy/prepy/main' addParams( options: params.options ) -include { BCFTOOLS_NORM } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) -include { TABIX_TABIX as TABIX_TABIX_1 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { TABIX_TABIX as TABIX_TABIX_3 } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_1 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_2 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_3 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_4 } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CONTIGS } from '../../modules/nf-core/bcftools/view' addParams( options: params.options ) -include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_SNV } from '../../modules/nf-core/bcftools/view' addParams( options: params.options ) -include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_INDEL } from '../../modules/nf-core/bcftools/view' addParams( options: params.options ) -include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TEST } from '../../modules/nf-core/bcftools/reheader' addParams( options: params.options ) +include { VCF_VARIANT_DEDUPLICATION } from '../local/vcf_variant_deduplication' +include { VCF_VARIANT_FILTERING } from '../local/vcf_variant_filtering' +include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' +include { HAPPY_PREPY } from '../../modules/nf-core/happy/prepy/main' +include { BCFTOOLS_NORM } from '../../modules/nf-core/bcftools/norm' +include { TABIX_TABIX as TABIX_TABIX_1 } from '../../modules/nf-core/tabix/tabix' +include { TABIX_TABIX as TABIX_TABIX_3 } from '../../modules/nf-core/tabix/tabix' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_1 } from '../../modules/nf-core/tabix/bgziptabix' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_2 } from '../../modules/nf-core/tabix/bgziptabix' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_3 } from '../../modules/nf-core/tabix/bgziptabix' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_4 } from '../../modules/nf-core/tabix/bgziptabix' +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CONTIGS } from '../../modules/nf-core/bcftools/view' +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_SNV } from '../../modules/nf-core/bcftools/view' +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_INDEL } from '../../modules/nf-core/bcftools/view' +include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TEST } from '../../modules/nf-core/bcftools/reheader' workflow PREPARE_VCFS_TEST { diff --git a/subworkflows/local/prepare_vcfs_truth.nf b/subworkflows/local/prepare_vcfs_truth.nf index deca674..808070e 100644 --- a/subworkflows/local/prepare_vcfs_truth.nf +++ b/subworkflows/local/prepare_vcfs_truth.nf @@ -4,12 +4,12 @@ params.options = [:] -include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix.nf' addParams( options: params.options ) -include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { BCFTOOLS_NORM } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) -include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { VCF_VARIANT_DEDUPLICATION } from '../local/vcf_variant_deduplication' addParams( options: params.options ) -include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TRUTH } from '../../modules/nf-core/bcftools/reheader' addParams( options: params.options ) +include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix.nf' +include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' +include { BCFTOOLS_NORM } from '../../modules/nf-core/bcftools/norm' +include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' +include { VCF_VARIANT_DEDUPLICATION } from '../local/vcf_variant_deduplication' +include { BCFTOOLS_REHEADER as BCFTOOLS_REHEADER_TRUTH } from '../../modules/nf-core/bcftools/reheader' workflow PREPARE_VCFS_TRUTH { take: diff --git a/subworkflows/local/report_benchmark_statistics.nf b/subworkflows/local/report_benchmark_statistics.nf index 0c97692..e4d03aa 100644 --- a/subworkflows/local/report_benchmark_statistics.nf +++ b/subworkflows/local/report_benchmark_statistics.nf @@ -4,8 +4,8 @@ params.options = [:] -include { MERGE_REPORTS } from '../../modules/local/merge_reports' addParams( options: params.options ) -include { PLOTS } from '../../modules/local/plots' addParams( options: params.options ) +include { MERGE_REPORTS } from '../../modules/local/merge_reports' +include { PLOTS } from '../../modules/local/plots' workflow REPORT_BENCHMARK_STATISTICS { take: diff --git a/subworkflows/local/report_vcf_statistics.nf b/subworkflows/local/report_vcf_statistics.nf index b070e24..2624a35 100644 --- a/subworkflows/local/report_vcf_statistics.nf +++ b/subworkflows/local/report_vcf_statistics.nf @@ -4,8 +4,8 @@ params.options = [:] -include { SURVIVOR_STATS } from '../../modules/nf-core/survivor/stats' addParams( options: params.options ) -include { BCFTOOLS_STATS } from '../../modules/nf-core/bcftools/stats' addParams( options: params.options ) +include { SURVIVOR_STATS } from '../../modules/nf-core/survivor/stats' +include { BCFTOOLS_STATS } from '../../modules/nf-core/bcftools/stats' workflow REPORT_VCF_STATISTICS { take: diff --git a/subworkflows/local/small_germline_benchmark.nf b/subworkflows/local/small_germline_benchmark.nf index 19ebee9..456b269 100644 --- a/subworkflows/local/small_germline_benchmark.nf +++ b/subworkflows/local/small_germline_benchmark.nf @@ -4,9 +4,9 @@ params.options = [:] -include { RTGTOOLS_FORMAT } from '../../modules/nf-core/rtgtools/format/main' addParams( options: params.options ) -include { RTGTOOLS_VCFEVAL } from '../../modules/nf-core/rtgtools/vcfeval/main' addParams( options: params.options ) -include { HAPPY_HAPPY } from '../../modules/nf-core/happy/happy/main' addParams( options: params.options ) +include { RTGTOOLS_FORMAT } from '../../modules/nf-core/rtgtools/format/main' +include { RTGTOOLS_VCFEVAL } from '../../modules/nf-core/rtgtools/vcfeval/main' +include { HAPPY_HAPPY } from '../../modules/nf-core/happy/happy/main' workflow SMALL_GERMLINE_BENCHMARK { take: diff --git a/subworkflows/local/small_somatic_benchmark.nf b/subworkflows/local/small_somatic_benchmark.nf index 54de782..6220c15 100644 --- a/subworkflows/local/small_somatic_benchmark.nf +++ b/subworkflows/local/small_somatic_benchmark.nf @@ -4,8 +4,8 @@ params.options = [:] -include { HAPPY_SOMPY } from '../../modules/nf-core/happy/sompy/main' addParams( options: params.options ) -include { BAMSURGEON_EVALUATOR } from '../../modules/local/bamsurgeon_evaluator.nf' addParams( options: params.options ) +include { HAPPY_SOMPY } from '../../modules/nf-core/happy/sompy/main' +include { BAMSURGEON_EVALUATOR } from '../../modules/local/bamsurgeon_evaluator.nf' workflow SMALL_SOMATIC_BENCHMARK { take: diff --git a/subworkflows/local/sv_germline_benchmark.nf b/subworkflows/local/sv_germline_benchmark.nf index ef2c711..9f3e2d6 100644 --- a/subworkflows/local/sv_germline_benchmark.nf +++ b/subworkflows/local/sv_germline_benchmark.nf @@ -4,12 +4,12 @@ params.options = [:] -include { TRUVARI_PHAB } from '../../modules/local/truvari_phab' addParams( options: params.options ) -include { TRUVARI_BENCH } from '../../modules/nf-core/truvari/bench' addParams( options: params.options ) -include { SVANALYZER_SVBENCHMARK } from '../../modules/nf-core/svanalyzer/svbenchmark' addParams( options: params.options ) -include { WITTYER } from '../../modules/nf-core/wittyer' addParams( options: params.options ) -include { TABIX_BGZIP as TABIX_BGZIP_QUERY } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) -include { TABIX_BGZIP as TABIX_BGZIP_TRUTH } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) +include { TRUVARI_PHAB } from '../../modules/local/truvari_phab' +include { TRUVARI_BENCH } from '../../modules/nf-core/truvari/bench' +include { SVANALYZER_SVBENCHMARK } from '../../modules/nf-core/svanalyzer/svbenchmark' +include { WITTYER } from '../../modules/nf-core/wittyer' +include { TABIX_BGZIP as TABIX_BGZIP_QUERY } from '../../modules/nf-core/tabix/bgzip' +include { TABIX_BGZIP as TABIX_BGZIP_TRUTH } from '../../modules/nf-core/tabix/bgzip' workflow SV_GERMLINE_BENCHMARK { take: diff --git a/subworkflows/local/sv_somatic_benchmark copy.nf b/subworkflows/local/sv_somatic_benchmark copy.nf index d300754..6954aa1 100644 --- a/subworkflows/local/sv_somatic_benchmark copy.nf +++ b/subworkflows/local/sv_somatic_benchmark copy.nf @@ -4,7 +4,7 @@ params.options = [:] -include { HAPPY_SOMPY } from '../../modules/nf-core/happy/sompy/main' addParams( options: params.options ) +include { HAPPY_SOMPY } from '../../modules/nf-core/happy/sompy/main' addParams( options: params.options ) include { BAMSURGEON_EVALUATOR } from '../../modules/local/bamsurgeon_evaluator.nf' addParams( options: params.options ) workflow SMALL_SOMATIC_BENCHMARK { diff --git a/subworkflows/local/sv_vcf_conversion.nf b/subworkflows/local/sv_vcf_conversion.nf index fe64d82..50b42a5 100644 --- a/subworkflows/local/sv_vcf_conversion.nf +++ b/subworkflows/local/sv_vcf_conversion.nf @@ -6,12 +6,12 @@ import groovy.io.FileType params.options = [:] -include { MANTA_CONVERTINVERSION } from '../../modules/nf-core/manta/convertinversion' addParams( options: params.options ) -include { GRIDSS_ANNOTATION } from '../../modules/local/gridss_annotation' addParams( options: params.options ) -include { SVYNC } from '../../modules/nf-core/svync' addParams( options: params.options ) -include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' addParams( options: params.options ) -include { VARIANT_EXTRACTOR } from '../../modules/local/variant_extractor' addParams( options: params.options ) -include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' addParams( options: params.options ) +include { MANTA_CONVERTINVERSION } from '../../modules/nf-core/manta/convertinversion' +include { GRIDSS_ANNOTATION } from '../../modules/local/gridss_annotation' +include { SVYNC } from '../../modules/nf-core/svync' +include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' +include { VARIANT_EXTRACTOR } from '../../modules/local/variant_extractor' +include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' workflow SV_VCF_CONVERSIONS { take: @@ -103,7 +103,7 @@ workflow SV_VCF_CONVERSIONS { out_vcf_ch = Channel.empty() vcf_ch.branch{ - tool: it[0].id == "manta" || it[0].id == "dragen" + tool: it[0].id == "manta" other: true} .set{input} // diff --git a/subworkflows/local/vcf_variant_deduplication.nf b/subworkflows/local/vcf_variant_deduplication.nf index b921b92..1a0b878 100644 --- a/subworkflows/local/vcf_variant_deduplication.nf +++ b/subworkflows/local/vcf_variant_deduplication.nf @@ -4,9 +4,9 @@ params.options = [:] -include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' addParams( options: params.options ) -include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' addParams( options: params.options ) -include { BCFTOOLS_NORM as BCFTOOLS_DEDUP } from '../../modules/nf-core/bcftools/norm' addParams( options: params.options ) +include { BCFTOOLS_SORT } from '../../modules/nf-core/bcftools/sort' +include { TABIX_TABIX } from '../../modules/nf-core/tabix/tabix' +include { BCFTOOLS_NORM as BCFTOOLS_DEDUP } from '../../modules/nf-core/bcftools/norm' workflow VCF_VARIANT_DEDUPLICATION { take: diff --git a/subworkflows/local/vcf_variant_filtering.nf b/subworkflows/local/vcf_variant_filtering.nf index 7232e57..b741c93 100644 --- a/subworkflows/local/vcf_variant_filtering.nf +++ b/subworkflows/local/vcf_variant_filtering.nf @@ -4,10 +4,10 @@ params.options = [:] -include { SURVIVOR_FILTER } from '../../modules/nf-core/survivor/filter' addParams( options: params.options ) -include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip' addParams( options: params.options ) -include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' addParams( options: params.options ) -include { BCFTOOLS_FILTER } from '../../modules/nf-core/bcftools/filter' addParams( options: params.options ) +include { SURVIVOR_FILTER } from '../../modules/nf-core/survivor/filter' +include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip' +include { TABIX_BGZIPTABIX } from '../../modules/nf-core/tabix/bgziptabix' +include { BCFTOOLS_FILTER } from '../../modules/nf-core/bcftools/filter' workflow VCF_VARIANT_FILTERING { From ee9e8706dbed541cc4160026223af90bb4e61ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Wed, 3 Jul 2024 11:48:02 +0000 Subject: [PATCH 3/9] remove truvari phab --- conf/modules.config | 8 -------- subworkflows/local/sv_germline_benchmark.nf | 13 ------------- 2 files changed, 21 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f7faf25..d568182 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -138,14 +138,6 @@ process { mode: params.publish_dir_mode ] } - withName: "TRUVARI_PHAB" { - ext.prefix = {"${meta.id}.harm"} - publishDir = [ - path: {"${params.outdir}/${meta.id}/truvari_phab/"}, - pattern: "*{.vcf.gz,vcf.gz.tbi}", - mode: params.publish_dir_mode - ] - } // BENCHMARK TOOLS withName: "RTGTOOLS_FORMAT" { ext.when = { params.method.split(',').contains('rtgtools') && !params.sdf } diff --git a/subworkflows/local/sv_germline_benchmark.nf b/subworkflows/local/sv_germline_benchmark.nf index 9f3e2d6..cf00c11 100644 --- a/subworkflows/local/sv_germline_benchmark.nf +++ b/subworkflows/local/sv_germline_benchmark.nf @@ -4,7 +4,6 @@ params.options = [:] -include { TRUVARI_PHAB } from '../../modules/local/truvari_phab' include { TRUVARI_BENCH } from '../../modules/nf-core/truvari/bench' include { SVANALYZER_SVBENCHMARK } from '../../modules/nf-core/svanalyzer/svbenchmark' include { WITTYER } from '../../modules/nf-core/wittyer' @@ -26,18 +25,6 @@ workflow SV_GERMLINE_BENCHMARK { // SV benchmarking if (params.method.contains('truvari')){ - - if(params.sv_standardization.contains('harmonize')){ - // - // TRUVARI: TRUVARI_PHAB - // - TRUVARI_PHAB( - input_ch, - fasta, - fai - ) - versions = versions.mix(TRUVARI_PHAB.out.versions) - } // // MODULE: TRUVARI_BENCH // From e22485bf86d96b718dcc3981ca8789e41f0017b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Wed, 3 Jul 2024 13:16:47 +0000 Subject: [PATCH 4/9] benchmark params structure --- assets/samplesheet_full.csv | 14 +++++++------- assets/schema_input.json | 22 ++++++++++++++++------ conf/modules.config | 27 +++++++++++++++++++++++---- conf/test_full.config | 4 ++-- 4 files changed, 48 insertions(+), 19 deletions(-) diff --git a/assets/samplesheet_full.csv b/assets/samplesheet_full.csv index 9e68b6b..c1da7c0 100644 --- a/assets/samplesheet_full.csv +++ b/assets/samplesheet_full.csv @@ -1,7 +1,7 @@ -test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.chr21.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.strelka.variants.chr21.vcf.gz,strelka,small,,,,,,,,,, -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.bcftools.chr21.vcf.gz,bcftools,small,,,,,,,,,, +test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,evaluationmode +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.chr21.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.strelka.variants.chr21.vcf.gz,strelka,small,,,,,,,,,,,, +https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/sarek/HG002.bcftools.chr21.vcf.gz,bcftools,small,,,,,,,,,,,, diff --git a/assets/schema_input.json b/assets/schema_input.json index c0a22d0..29ade93 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -91,6 +91,18 @@ "meta": ["chunksize"], "default": 500 }, + "dup_to_ins": { + "type": "boolean", + "errorMessage": "a truvari parameter. converts DUP to INS type", + "meta": ["dup_to_ins"], + "default": false + }, + "typeignore": { + "type": "boolean", + "errorMessage": "a truvari parameter. Ignore SVTYPE matching", + "meta": ["typeignore"], + "default": false + }, "bpDistance": { "type": "integer", "errorMessage": "bpDistance is a wittyer parameter. Upper bound of boundary distance when comparing truth and query. By default it is 500bp for all types except for Insertions, which are 100bp.Please note that if you set this value in the command line, it overrides all the defaults, so Insertions and other types will have the same bpd.", @@ -119,13 +131,11 @@ "meta": ["maxMatches"], "default": 10 }, - "similarityThreshold": { - "type": "number", - "errorMessage": "similarityThreshold is a wittyer parameter. This is used for sequence similarity thresholding.", + "evaluationmode": { + "type": "string", + "errorMessage": "evaluationmode is a wittyer parameter. It is by default requires genotype matching. simpleCounting:sc, CrossTypeAndSimpleCounting:cts ", "meta": ["similarityThreshold"], - "default": 0.7, - "minimum": 0, - "maximum": 1 + "default": "d" } }, "required": ["test_vcf", "caller", "vartype"] diff --git a/conf/modules.config b/conf/modules.config index d568182..43a0f8c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -187,7 +187,15 @@ process { } withName: "TRUVARI_BENCH" { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} - ext.args = {"--pctsize ${meta.pctsize} --pctovl ${meta.pctovl} --pctseq ${meta.pctseq} --refdist ${meta.refdist} --chunksize ${meta.chunksize}"} + ext.args = {[ + (meta.pctsize) ? "--pctsize ${meta.pctsize}" : '', + (meta.pctovl) ? "--pctovl ${meta.pctovl}" : '', + (meta.pctseq) ? "--pctseq ${meta.pctseq}" : '', + (meta.refdist) ? "--refdist ${meta.refdist}" : '', + (meta.chunksize) ? "--chunksize ${meta.chunksize}" : '', + (meta.dup_to_ins) ? "--dup-to-ins" : '', + (meta.typeignore) ? "--typeignore" : '', + ].join(' ').trim()} ext.when = { params.method.split(',').contains('truvari') } publishDir = [ path: {"${params.outdir}/${meta.id}/truvari_bench"}, @@ -197,7 +205,12 @@ process { } withName: SVANALYZER_SVBENCHMARK { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} - ext.args = {"-normshift ${meta.normshift} –normdist ${meta.normdist} –normsizediff ${meta.normsizediff} -maxdist ${meta.maxdist}"} + ext.args = {[ + (meta.normshift) ? "-normshift ${meta.normshift}" : '', + (meta.normdist) ? "-normdist ${meta.normdist}" : '', + (meta.normsizediff) ? "-normsizediff ${meta.normsizediff}" : '', + (meta.maxdist) ? "-maxdist ${meta.maxdist}" : '', + ].join(' ').trim()} ext.when = { params.method.split(',').contains('svanalyzer') } publishDir = [ path: {"${params.outdir}/${meta.id}/svanalyzer_bench"}, @@ -207,7 +220,14 @@ process { } withName: WITTYER { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} - ext.args = {"-em cts --ef [] --pt ${meta.percentThreshold} --at ${meta.absoluteThreshold} --bpd ${meta.bpDistance} --mm ${meta.maxMatches} --st ${meta.similarityThreshold}"} + ext.args = {[ + "--ef []", + (meta.evaluationmode) ? "-em ${meta.evaluationmode}" : '', + (meta.percentThreshold) ? "--pt ${meta.percentThreshold}" : '', + (meta.absoluteThreshold) ? "--at ${meta.absoluteThreshold}" : '', + (meta.bpDistance) ? "--bpd ${meta.bpDistance}" : '', + (meta.maxMatches) ? "--mm ${meta.maxMatches}" : '', + ].join(' ').trim()} ext.when = { params.method.split(',').contains('wittyer') } publishDir = [ path: {"${params.outdir}/${meta.id}/wittyer_bench"}, @@ -215,7 +235,6 @@ process { mode: params.publish_dir_mode ] } - withName: BAMSURGEON_EVALUATOR { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} publishDir = [ diff --git a/conf/test_full.config b/conf/test_full.config index fa0f52d..f1e84f3 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -27,10 +27,10 @@ params { // Processes analysis = 'germline' - method = 'happy,truvari,svanalyzer,rtgtools' // + method = 'happy,truvari,svanalyzer,rtgtools,wittyer' // preprocess = "normalization,deduplication" min_sv_size = 30 - sv_standardization = "bed_to_ins" + sv_standardization = "" //variant_filtering = "include" // null, include, exclude //expression = 'FILTER="."' From 2413c0b8d462eb03bf40666d7f095ed67f922a34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Wed, 3 Jul 2024 15:04:47 +0000 Subject: [PATCH 5/9] remove unneccsary modules --- conf/modules.config | 9 ---- subworkflows/local/sv_vcf_conversion.nf | 57 ------------------------- 2 files changed, 66 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 43a0f8c..9a4db8a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -33,7 +33,6 @@ process { enabled: false ] } - withName: "VARIANT_EXTRACTOR" { ext.prefix = { input.baseName - ".vcf" } publishDir = [ @@ -243,14 +242,6 @@ process { mode: params.publish_dir_mode ] } - withName: MANTA_CONVERTINVERSION { - ext.prefix = {"${params.sample}"} - publishDir = [ - path: {"${params.outdir}/${meta.id}/preprocess"}, - pattern: "*{.vcf.gz,vcf.gz.tbi}", - mode: params.publish_dir_mode - ] - } withName: SVYNC { ext.prefix = {"${params.sample}.stnd"} publishDir = [ diff --git a/subworkflows/local/sv_vcf_conversion.nf b/subworkflows/local/sv_vcf_conversion.nf index 50b42a5..c45af5e 100644 --- a/subworkflows/local/sv_vcf_conversion.nf +++ b/subworkflows/local/sv_vcf_conversion.nf @@ -6,8 +6,6 @@ import groovy.io.FileType params.options = [:] -include { MANTA_CONVERTINVERSION } from '../../modules/nf-core/manta/convertinversion' -include { GRIDSS_ANNOTATION } from '../../modules/local/gridss_annotation' include { SVYNC } from '../../modules/nf-core/svync' include { BGZIP_TABIX } from '../../modules/local/bgzip_tabix' include { VARIANT_EXTRACTOR } from '../../modules/local/variant_extractor' @@ -98,61 +96,6 @@ workflow SV_VCF_CONVERSIONS { vcf_ch = out_vcf_ch.map{it -> tuple(it[0], it[1], it[2])} } - // Check tool spesific conversions - if(params.sv_standardization.contains("bnd_to_inv")){ - out_vcf_ch = Channel.empty() - - vcf_ch.branch{ - tool: it[0].id == "manta" - other: true} - .set{input} - // - // MANTA_CONVERTINVERSION - // - // NOTE: should also work for dragen - // Not working now!!!!! - - MANTA_CONVERTINVERSION( - input.tool.map{it -> tuple(it[0], it[1])}, - fasta - ) - versions = versions.mix(MANTA_CONVERTINVERSION.out.versions) - - out_ch = MANTA_CONVERTINVERSION.out.vcf.join(MANTA_CONVERTINVERSION.out.tbi) - out_vcf_ch = out_vcf_ch.mix(out_ch) - out_vcf_ch = out_vcf_ch.mix(input.other) - vcf_ch = out_vcf_ch - - // https://github.com/srbehera/DRAGEN_Analysis/blob/main/convertInversion.py - - } - - if (params.sv_standardization.contains("gridss_annotate")){ - out_vcf_ch = Channel.empty() - - vcf_ch.branch{ - tool: it[0].id == "gridss" - other: true} - .set{input} - - // - // GRIDSS_ANNOTATION - // - // https://github.com/PapenfussLab/gridss/blob/7b1fedfed32af9e03ed5c6863d368a821a4c699f/example/simple-event-annotation.R#L9 - // GRIDSS simple event annotation - GRIDSS_ANNOTATION( - input.tool, - fasta, - fai - ) - versions = versions.mix(GRIDSS_ANNOTATION.out.versions) - - out_vcf_ch = out_vcf_ch.mix(GRIDSS_ANNOTATION.out.vcf) - out_vcf_ch = out_vcf_ch.mix(input.other) - vcf_ch = out_vcf_ch - } - // https://github.com/EUCANCan/variant-extractor/blob/main/examples/vcf_to_csv.py' - emit: vcf_ch From 76aecda404365d1839e65fef7426574b036e1108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Thu, 4 Jul 2024 14:19:41 +0000 Subject: [PATCH 6/9] fix params in schema --- assets/samplesheet_HG002_hg37.csv | 10 +-- assets/schema_input.json | 61 +++++-------------- bin/vcf_to_csv.py | 2 +- conf/modules.config | 35 +++++------ conf/test_full.config | 4 +- conf/test_hg37.config | 4 +- conf/test_hg38.config | 2 +- .../local/report_benchmark_statistics.nf | 2 - subworkflows/local/sv_germline_benchmark.nf | 1 - 9 files changed, 45 insertions(+), 76 deletions(-) diff --git a/assets/samplesheet_HG002_hg37.csv b/assets/samplesheet_HG002_hg37.csv index 0d14775..5fc0da4 100644 --- a/assets/samplesheet_HG002_hg37.csv +++ b/assets/samplesheet_HG002_hg37.csv @@ -1,5 +1,5 @@ -test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf.gz",manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 -"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/full.svaba.germline.sv.chr21.vcf.gz",svaba,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000 +test_vcf,caller,vartype,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,dup_to_ins,pctsize,pctseq,pctovl,evaluationmode +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_delly_SV_hg19.chr21.vcf.gz",delly,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf.gz",manta,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.svaba.germline.sv.chr21.vcf.gz",svaba,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts diff --git a/assets/schema_input.json b/assets/schema_input.json index 29ade93..632f5c8 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -36,106 +36,77 @@ "normdist": { "type": "number", "errorMessage": "normshift is a svbenchmark parameter. Has to be between 0-1. Disallow matches if alternate alleles have normalized edit distance greater than normdist (default 0.2)", - "meta": ["normdist"], - "default": 0.2, - "minimum": 0, - "maximum": 1 + "meta": ["normdist"] }, "normsizediff": { "type": "number", "errorMessage": "normsizediff is a svbenchmark parameter. Has to be between 0-1. Disallow matches if alternate alleles have normalized size difference greater than normsizediff (default 0.2)", - "meta": ["normsizediff"], - "default": 0.2, - "minimum": 0, - "maximum": 1 + "meta": ["normsizediff"] }, "maxdist": { "type": "integer", "errorMessage": "maxdist is a svbenchmark parameter. Disallow matches if positions of two variants are more than maxdist bases from each other (default 100,000).", - "meta": ["maxdist"], - "default": 100000 + "meta": ["maxdist"] }, "pctsize": { "type": "number", "errorMessage": "pctsize is a truvari parameter. Has to be between 0-1. Ratio of min(base_size, comp_size)/max(base_size, comp_size).", - "meta": ["pctsize"], - "default": 0.7, - "minimum": 0, - "maximum": 1 + "meta": ["pctsize"] }, "pctseq": { "type": "number", "errorMessage": "pctseq is a truvari parameter. Has to be between 0-1. Edit distance ratio between the REF/ALT haplotype sequences of base and comparison call. turn it off (0) for no sequence comparison.", - "meta": ["pctseq"], - "default": 0.7, - "minimum": 0, - "maximum": 1 + "meta": ["pctseq"] }, "pctovl": { "type": "number", "errorMessage": "pctovl is a truvari parameter. Has to be between 0-1. Ratio of two calls' (overlapping bases)/(longest span)", - "meta": ["pctovl"], - "default": 0, - "minimum": 0, - "maximum": 1 + "meta": ["pctovl"] }, "refdist": { "type": "integer", "errorMessage": "refdist is a truvari parameter. Maximum distance comparison calls must be within from base call's start/end ", - "meta": ["refdist"], - "default": 500 + "meta": ["refdist"] }, "chunksize": { "type": "integer", "errorMessage": "chunksize is a truvari parameter. Create chunks of all calls overlapping within ±`--chunksize` basepairs", - "meta": ["chunksize"], - "default": 500 + "meta": ["chunksize"] }, "dup_to_ins": { "type": "boolean", "errorMessage": "a truvari parameter. converts DUP to INS type", - "meta": ["dup_to_ins"], - "default": false + "meta": ["dup_to_ins"] }, "typeignore": { "type": "boolean", "errorMessage": "a truvari parameter. Ignore SVTYPE matching", - "meta": ["typeignore"], - "default": false + "meta": ["typeignore"] }, "bpDistance": { "type": "integer", "errorMessage": "bpDistance is a wittyer parameter. Upper bound of boundary distance when comparing truth and query. By default it is 500bp for all types except for Insertions, which are 100bp.Please note that if you set this value in the command line, it overrides all the defaults, so Insertions and other types will have the same bpd.", - "meta": ["bpDistance"], - "default": 500, - "minimum": 0 + "meta": ["bpDistance"] }, "percentThreshold": { "type": "number", "errorMessage": "percentThreshold is a wittyer parameter. This is used for percentage thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use for large tandem repeats. For all other SVs, in order to match between query and truth, the distance between boundaries should be within a number thats proportional to total SV (default 0.25)", - "meta": ["percentThreshold"], - "default": 0.25, - "minimum": 0, - "maximum": 1 + "meta": ["percentThreshold"] }, "absoluteThreshold": { "type": "integer", "errorMessage": "absoluteThreshold is a wittyer parameter. This is used for absolute thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use. For all other SVs, this is the upper bound of boundary distance when comparing truth and query. (default 10000)", - "meta": ["absoluteThreshold"], - "default": 10000, - "minimum": 0 + "meta": ["absoluteThreshold"] }, "maxMatches": { "type": "integer", "errorMessage": "maxMatches is a wittyer parameter. This is used for matching behaviour. Negative value means to match any number (for large SVs it is not recommended).", - "meta": ["maxMatches"], - "default": 10 + "meta": ["maxMatches"] }, "evaluationmode": { "type": "string", - "errorMessage": "evaluationmode is a wittyer parameter. It is by default requires genotype matching. simpleCounting:sc, CrossTypeAndSimpleCounting:cts ", - "meta": ["similarityThreshold"], - "default": "d" + "errorMessage": "evaluationmode is a wittyer parameter. It is by default requires genotype matching. simpleCounting:sc, CrossTypeAndSimpleCounting:cts, genotypematch:d ", + "meta": ["evaluationmode"] } }, "required": ["test_vcf", "caller", "vartype"] diff --git a/bin/vcf_to_csv.py b/bin/vcf_to_csv.py index 6af803c..dfc0386 100755 --- a/bin/vcf_to_csv.py +++ b/bin/vcf_to_csv.py @@ -26,7 +26,7 @@ variants = [] print(f'Reading VCF file: {args.vcf_file}') - extractor = VariantExtractor(args.vcf_file, fasta_ref=args.fasta_ref) + extractor = VariantExtractor(args.vcf_file,ensure_pairs=False,fasta_ref=args.fasta_ref) df = extractor.to_dataframe() # Insert id column in the first position df.insert(0, 'id', '') diff --git a/conf/modules.config b/conf/modules.config index 9a4db8a..be829a5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -187,13 +187,14 @@ process { withName: "TRUVARI_BENCH" { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} ext.args = {[ - (meta.pctsize) ? "--pctsize ${meta.pctsize}" : '', - (meta.pctovl) ? "--pctovl ${meta.pctovl}" : '', - (meta.pctseq) ? "--pctseq ${meta.pctseq}" : '', - (meta.refdist) ? "--refdist ${meta.refdist}" : '', - (meta.chunksize) ? "--chunksize ${meta.chunksize}" : '', + "--sizemin 0 --sizefilt 0 --sizemax 5000000", + (meta.pctseq != null) ? "--pctsize ${meta.pctsize}" : '', + (meta.pctovl != null) ? "--pctovl ${meta.pctovl}" : '', + (meta.pctseq != null) ? "--pctseq ${meta.pctseq}" : '', + (meta.refdist != null) ? "--refdist ${meta.refdist}" : '', + (meta.chunksize != null) ? "--chunksize ${meta.chunksize}" : '', (meta.dup_to_ins) ? "--dup-to-ins" : '', - (meta.typeignore) ? "--typeignore" : '', + (meta.typeignore) ? "--typeignore" : '' ].join(' ').trim()} ext.when = { params.method.split(',').contains('truvari') } publishDir = [ @@ -205,10 +206,10 @@ process { withName: SVANALYZER_SVBENCHMARK { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} ext.args = {[ - (meta.normshift) ? "-normshift ${meta.normshift}" : '', - (meta.normdist) ? "-normdist ${meta.normdist}" : '', - (meta.normsizediff) ? "-normsizediff ${meta.normsizediff}" : '', - (meta.maxdist) ? "-maxdist ${meta.maxdist}" : '', + (meta.normshift != null) ? "-normshift ${meta.normshift}" : '', + (meta.normdist != null) ? "-normdist ${meta.normdist}" : '', + (meta.normsizediff != null) ? "-normsizediff ${meta.normsizediff}" : '', + (meta.maxdist != null) ? "-maxdist ${meta.maxdist}" : '' ].join(' ').trim()} ext.when = { params.method.split(',').contains('svanalyzer') } publishDir = [ @@ -220,12 +221,12 @@ process { withName: WITTYER { ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"} ext.args = {[ - "--ef []", - (meta.evaluationmode) ? "-em ${meta.evaluationmode}" : '', - (meta.percentThreshold) ? "--pt ${meta.percentThreshold}" : '', - (meta.absoluteThreshold) ? "--at ${meta.absoluteThreshold}" : '', - (meta.bpDistance) ? "--bpd ${meta.bpDistance}" : '', - (meta.maxMatches) ? "--mm ${meta.maxMatches}" : '', + "--includedFilters=''", + (meta.evaluationmode != null) ? "-em ${meta.evaluationmode}" : '', + (meta.percentThreshold != null) ? "--pt ${meta.percentThreshold}" : '', + (meta.absoluteThreshold != null) ? "--at ${meta.absoluteThreshold}" : '', + (meta.bpDistance != null) ? "--bpd ${meta.bpDistance}" : '', + (meta.maxMatches != null) ? "--mm ${meta.maxMatches}" : '' ].join(' ').trim()} ext.when = { params.method.split(',').contains('wittyer') } publishDir = [ @@ -254,7 +255,7 @@ process { ext.prefix = {"${meta.benchmark_tool}.${meta.vartype}"} publishDir = [ path: {"${params.outdir}/summary/tables"}, - pattern: "*{txt}", + pattern: "*{csv}", mode: params.publish_dir_mode ] } diff --git a/conf/test_full.config b/conf/test_full.config index f1e84f3..bd109be 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -27,10 +27,10 @@ params { // Processes analysis = 'germline' - method = 'happy,truvari,svanalyzer,rtgtools,wittyer' // + method = 'happy,truvari,svanalyzer,rtgtools' // preprocess = "normalization,deduplication" min_sv_size = 30 - sv_standardization = "" + sv_standardization = "homogenize" //variant_filtering = "include" // null, include, exclude //expression = 'FILTER="."' diff --git a/conf/test_hg37.config b/conf/test_hg37.config index 3da09f4..ea9e3dc 100644 --- a/conf/test_hg37.config +++ b/conf/test_hg37.config @@ -28,9 +28,9 @@ params { // Genome references genome = 'GRCh37' analysis = 'germline' //somatic - method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist + method = 'truvari,svanalyzer,wittyer' - preprocess = "" + preprocess = "normalization,deduplication" min_sv_size = 30 sv_standardization = "homogenize" diff --git a/conf/test_hg38.config b/conf/test_hg38.config index e09318c..784afb4 100644 --- a/conf/test_hg38.config +++ b/conf/test_hg38.config @@ -37,7 +37,7 @@ params { //expression = 'FILTER="PASS"' sample = "HG002" // available samples: SEQC2, HG002 - truth_sv = "HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.annotated.vcf.gz" + truth_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v01.ch21.vcf.gz" high_conf_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v01.ch21.bed" } diff --git a/subworkflows/local/report_benchmark_statistics.nf b/subworkflows/local/report_benchmark_statistics.nf index e4d03aa..64a6d04 100644 --- a/subworkflows/local/report_benchmark_statistics.nf +++ b/subworkflows/local/report_benchmark_statistics.nf @@ -21,8 +21,6 @@ workflow REPORT_BENCHMARK_STATISTICS { versions = versions.mix(MERGE_REPORTS.out.versions) - MERGE_REPORTS.out.summary.view() - PLOTS( MERGE_REPORTS.out.summary ) diff --git a/subworkflows/local/sv_germline_benchmark.nf b/subworkflows/local/sv_germline_benchmark.nf index cf00c11..f917865 100644 --- a/subworkflows/local/sv_germline_benchmark.nf +++ b/subworkflows/local/sv_germline_benchmark.nf @@ -23,7 +23,6 @@ workflow SV_GERMLINE_BENCHMARK { tagged_variants=Channel.empty() // SV benchmarking - if (params.method.contains('truvari')){ // // MODULE: TRUVARI_BENCH From 2264f129ef080c42399360569e0a2021ae6c44a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Fri, 5 Jul 2024 13:04:24 +0000 Subject: [PATCH 7/9] add new samples and fix r plot --- assets/samplesheet_HG002_hg37.csv | 4 ++++ bin/merge_reports.py | 2 +- bin/plots.R | 10 ++++++++-- conf/test_hg37.config | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/assets/samplesheet_HG002_hg37.csv b/assets/samplesheet_HG002_hg37.csv index 5fc0da4..d12c054 100644 --- a/assets/samplesheet_HG002_hg37.csv +++ b/assets/samplesheet_HG002_hg37.csv @@ -3,3 +3,7 @@ test_vcf,caller,vartype,refdist,chunksize,normshift,normdist,normsizediff,maxdis "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_lumpy_SV_hg19.sorted.vcf.gz",lumpy,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_manta_SV_hg19_genotype.chr21.vcf.gz",manta,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.svaba.germline.sv.chr21.vcf.gz",svaba,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002-NA24385-50x.union_170414.split.chr21.vcf.gz",sbg_graph,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002.jointVC.filter.chr21.vcf.gz",gatk_joint,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/HG002_ALLCHROM_hs37d5_novoalign_Ilmn150bp300X_FB.chr21.vcf.gz",freebayes,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts +"https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg37/test/hg002.Assemblytics_structural_variants.sorted.chr21.vcf.gz",assemblytics,sv,100000,100000,0.3,0.3,0.3,100000,true,true,0.3,0,0,cts diff --git a/bin/merge_reports.py b/bin/merge_reports.py index a74fac2..058b431 100755 --- a/bin/merge_reports.py +++ b/bin/merge_reports.py @@ -35,7 +35,7 @@ def get_svbenchmark_resuls(file_paths): FP_pattern = re.compile(r'Number of false positives \(.*\): (\d+)') recall_pattern = re.compile(r'Recall \(.*\): (\d+\.\d+)%') precision_pattern = re.compile(r'Precision \(.*\): (\d+\.\d+)%') - f1_pattern = re.compile(r'F1 \(.*\): (\d+\.\d+)') + f1_pattern = re.compile(r'F1 \(.*\): ([\d\.]+(?:e[+-]?\d+)?)') # Iterate over each table file for file in file_paths: diff --git a/bin/plots.R b/bin/plots.R index ed8bb07..0ad4d47 100755 --- a/bin/plots.R +++ b/bin/plots.R @@ -44,14 +44,20 @@ generate_plots <- function(table, benchmark, type, filter, stats) { labs(title = title1, x = "Tool", y = "Value", color = "Tool") + facet_wrap(~variable, scales = "free_y") + theme_minimal() + - theme(legend.position = "right", panel.background = element_rect(fill = "white")) + theme( + legend.position = "right", + panel.background = element_rect(fill = "white"), + axis.text.x = element_text(angle = 30, hjust = 0.5)) # Visualize Precision, Recall, and F1 in separate plots with white background metric_plot <- ggplot(metric_data, aes(x = Tool, y = value, color = Tool, shape = variable, linetype = variable, group = interaction(variable, Tool))) + geom_point() + labs(title = title2, x = "Tool", y = "Value", color = "Metric", linetype = "Metric") + theme_minimal() + - theme(legend.position = "right", panel.background = element_rect(fill = "white")) + theme( + legend.position = "right", + panel.background = element_rect(fill = "white"), + axis.text.x = element_text(angle = 30, hjust = 0.5)) # Save the plots tryCatch({ diff --git a/conf/test_hg37.config b/conf/test_hg37.config index ea9e3dc..d2d2f3a 100644 --- a/conf/test_hg37.config +++ b/conf/test_hg37.config @@ -28,7 +28,7 @@ params { // Genome references genome = 'GRCh37' analysis = 'germline' //somatic - method = 'truvari,svanalyzer,wittyer' + method = 'truvari,svanalyzer' preprocess = "normalization,deduplication" min_sv_size = 30 From 81ff2a13eaded66adf889248b518ad95283f2032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Fri, 5 Jul 2024 14:05:51 +0000 Subject: [PATCH 8/9] bgzip added to variant_extractor --- assets/samplesheet_full.csv | 1 - conf/modules.config | 2 +- conf/test.config | 2 ++ modules/local/variant_extractor/environment.yml | 2 ++ modules/local/variant_extractor/main.nf | 10 ++++++---- modules/local/vcf_to_csv/main.nf | 1 + 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/assets/samplesheet_full.csv b/assets/samplesheet_full.csv index c1da7c0..e00e2e0 100644 --- a/assets/samplesheet_full.csv +++ b/assets/samplesheet_full.csv @@ -1,5 +1,4 @@ test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist,typeignore,evaluationmode -https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.chr21.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/manta.HG002.chr21.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/Ashkenazim_HG002.filtered.sv.chr21.vcf.gz,merged,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/test/HG002_DRAGEN_SV_hg19.chr21.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000,true,cts diff --git a/conf/modules.config b/conf/modules.config index be829a5..5e84df4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -37,7 +37,7 @@ process { ext.prefix = { input.baseName - ".vcf" } publishDir = [ path: {"${params.outdir}/${meta.id}/preprocess"}, - pattern: "*{.vcf}", + pattern: "*{.vcf.gz}", mode: params.publish_dir_mode ] } diff --git a/conf/test.config b/conf/test.config index 895d1b0..bf5efc6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -30,6 +30,8 @@ params { analysis = 'germline' method = 'happy,truvari,svanalyzer,wittyer,rtgtools' // preprocess = "normalization,deduplication" + sv_standardization = "homogenize" + //variant_filtering = "include" // null, include, exclude //expression = 'FILTER="."' diff --git a/modules/local/variant_extractor/environment.yml b/modules/local/variant_extractor/environment.yml index ba302f8..216715b 100644 --- a/modules/local/variant_extractor/environment.yml +++ b/modules/local/variant_extractor/environment.yml @@ -2,7 +2,9 @@ channels: - conda-forge - bioconda dependencies: + - bioconda::htslib=1.20 - bioconda::pysam=0.22.1 + - bioconda::tabix=1.11 - pip - pip: - variant-extractor==4.0.6 diff --git a/modules/local/variant_extractor/main.nf b/modules/local/variant_extractor/main.nf index b6da3dc..e2eb5e3 100644 --- a/modules/local/variant_extractor/main.nf +++ b/modules/local/variant_extractor/main.nf @@ -4,8 +4,8 @@ process VARIANT_EXTRACTOR { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'oras://community.wave.seqera.io/library/pysam_pip_variant-extractor:23ef51c8f5e0524a': - 'community.wave.seqera.io/library/pysam_pip_variant-extractor:23ef51c8f5e0524a' }" + 'oras://community.wave.seqera.io/library/htslib_pysam_tabix_pip_variant-extractor:a12ef217eccf6ba8': + 'community.wave.seqera.io/library/htslib_pysam_tabix_pip_variant-extractor:a12ef217eccf6ba8' }" input: tuple val(meta), path(input) @@ -13,8 +13,8 @@ process VARIANT_EXTRACTOR { tuple val(meta3), path(fasta_fai) output: - tuple val(meta), path("*.norm.vcf") , emit: output - path "versions.yml" , emit: versions + tuple val(meta), path("*.norm.vcf.gz") , emit: output + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -28,6 +28,8 @@ process VARIANT_EXTRACTOR { ${prefix}.norm.vcf \\ $fasta + bgzip ${prefix}.norm.vcf + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') diff --git a/modules/local/vcf_to_csv/main.nf b/modules/local/vcf_to_csv/main.nf index c99b502..bf1368a 100644 --- a/modules/local/vcf_to_csv/main.nf +++ b/modules/local/vcf_to_csv/main.nf @@ -28,6 +28,7 @@ process VCF_TO_CSV { ${prefix}.csv \\ -f $fasta + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') From 3deb09d837761cbc950ed2fc1b8d166a227f3c6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BCbra=20Narc=C4=B1?= Date: Mon, 8 Jul 2024 13:52:58 +0000 Subject: [PATCH 9/9] vcf_to_csv.py changed --- bin/vcf_to_csv.py | 126 +++++++++++++++--- modules/local/vcf_to_csv/main.nf | 6 +- .../local/compare_benchmark_results.nf | 4 +- 3 files changed, 106 insertions(+), 30 deletions(-) diff --git a/bin/vcf_to_csv.py b/bin/vcf_to_csv.py index dfc0386..eb1fb53 100755 --- a/bin/vcf_to_csv.py +++ b/bin/vcf_to_csv.py @@ -1,35 +1,117 @@ #!/usr/bin/env python -# Copyright 2022 - Barcelona Supercomputing Center -# Author: Rodrigo Martin -# BSC Dual License + +# Copyright 2024 - GHGA +# Author: Kuebra Narci ''' -Generates a CSV file from an input VCF file +Generates a CSV file from a VCF Expected usage: - $ python vcf_to_csv.py + $ python vcf_to_csv.py Use --help for more information. ''' +import sys from argparse import ArgumentParser +import re + +import csv + +def parse_info_field(info): + """Parse the INFO field of a VCF line into a dictionary.""" + info_dict = {} + for entry in info.split(';'): + key_value = entry.split('=') + if len(key_value) == 2: + info_dict[key_value[0]] = key_value[1] + else: + info_dict[key_value[0]] = True + return info_dict + +def extract_gt_from_sample(sample, format_field): + """Extract GT value from the sample field.""" + format_fields = format_field.split(':') + sample_values = sample.split(':') + if 'GT' in format_fields: + gt_index = format_fields.index('GT') + return sample_values[gt_index] + return './.' # Default GT value if not present + +def vcf_to_csv(vcf_file, csv_file): + """Convert a VCF file to a CSV file.""" + with open(vcf_file, 'r') as vcf: + headers = [] + sample_headers = [] + records = [] + include_supp_vec = False + include_supp = False + include_type_inferred = False + include_svtype = False + include_svlen = False + + for line in vcf: + if line.startswith('##'): + continue # Skip meta-information lines + elif line.startswith('#'): + headers = line[1:].strip().split('\t') + sample_headers = headers[9:] # The sample headers start from the 10th column + else: + row = line.strip().split('\t') + info_dict = parse_info_field(row[7]) + + # Check for SUPP_VEC, SUPP, type_inferred, SVTYPE, and SVLEN in the INFO field + if 'SUPP_VEC' in info_dict: + include_supp_vec = True + if 'SUPP' in info_dict: + include_supp = True + if 'type_inferred' in info_dict: + include_type_inferred = True + if 'SVTYPE' in info_dict: + include_svtype = True + if 'SVLEN' in info_dict: + include_svlen = True + + records.append((row, info_dict)) + + # Write the header with optional fields + headers_to_write = headers[:7] # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER + if include_supp_vec: + headers_to_write.append("SUPP_VEC") + if include_supp: + headers_to_write.append("SUPP") + if include_type_inferred: + headers_to_write.append("type_inferred") + if include_svtype: + headers_to_write.append("SVTYPE") + if include_svlen: + headers_to_write.append("SVLEN") + headers_to_write.extend([f'{sample}_GT' for sample in sample_headers]) + + with open(csv_file, 'w', newline='') as csvf: + csv_writer = csv.writer(csvf) + csv_writer.writerow(headers_to_write) + + for row, info_dict in records: + row_to_write = row[:7] # Only keep CHROM, POS, ID, REF, ALT, QUAL, FILTER + if include_supp_vec: + row_to_write.append(info_dict.get('SUPP_VEC', '')) + if include_supp: + row_to_write.append(info_dict.get('SUPP', '')) + if include_type_inferred: + row_to_write.append(info_dict.get('type_inferred', '')) + if include_svtype: + row_to_write.append(info_dict.get('SVTYPE', '')) + if include_svlen: + row_to_write.append(info_dict.get('SVLEN', '')) + format_field = row[8] + gt_values = [extract_gt_from_sample(sample, format_field) for sample in row[9:]] + row_to_write.extend(gt_values) + + csv_writer.writerow(row_to_write) if __name__ == '__main__': - import os - import sys - sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/') - from variant_extractor import VariantExtractor # Parse arguments - parser = ArgumentParser(description='Generate CSV file from a VCF file') + parser = ArgumentParser(description='Generates a CSV file from a VCF') parser.add_argument('vcf_file', help='VCF file') - parser.add_argument('output_file', help='Output file') - parser.add_argument('-f', '--fasta-ref', help='FASTA reference file') + parser.add_argument('output', help='Output CSV file') args = parser.parse_args() - variants = [] - - print(f'Reading VCF file: {args.vcf_file}') - extractor = VariantExtractor(args.vcf_file,ensure_pairs=False,fasta_ref=args.fasta_ref) - df = extractor.to_dataframe() - # Insert id column in the first position - df.insert(0, 'id', '') - df['id'] = df['variant_record_obj'].apply(lambda x: x.id) - df.drop(['variant_record_obj'], axis=1, inplace=True) - df.to_csv(f'{args.output_file}', index=False) + vcf_to_csv(args.vcf_file, args.output) diff --git a/modules/local/vcf_to_csv/main.nf b/modules/local/vcf_to_csv/main.nf index bf1368a..bff1b21 100644 --- a/modules/local/vcf_to_csv/main.nf +++ b/modules/local/vcf_to_csv/main.nf @@ -9,8 +9,6 @@ process VCF_TO_CSV { input: tuple val(meta), path(input) - tuple val(meta2), path(fasta) - tuple val(meta3), path(fasta_fai) output: tuple val(meta), path("*.csv") , emit: output @@ -25,9 +23,7 @@ process VCF_TO_CSV { """ vcf_to_csv.py \\ $input \\ - ${prefix}.csv \\ - -f $fasta - + ${prefix}.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/compare_benchmark_results.nf b/subworkflows/local/compare_benchmark_results.nf index b527bdb..5e25586 100644 --- a/subworkflows/local/compare_benchmark_results.nf +++ b/subworkflows/local/compare_benchmark_results.nf @@ -66,9 +66,7 @@ workflow COMPARE_BENCHMARK_RESULTS { merged_vcfs = merged_vcfs.mix(SURVIVOR_MERGE.out.vcf) VCF_TO_CSV( - merged_vcfs, - fasta, - fai + merged_vcfs ) versions = versions.mix(VCF_TO_CSV.out.versions)