Skip to content

Commit

Permalink
Bug fix when concatenating multiple files and first is empty. Fixes #77
Browse files Browse the repository at this point in the history
…. Provide ability to exclude samples from VCF files using GATK SelectVariants
  • Loading branch information
chapmanb committed Aug 1, 2013
1 parent 8c410b0 commit 1f5a620
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 27 deletions.
8 changes: 6 additions & 2 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
## 0.7.1 (in progress)

- Bug fix for concatenating files when first file in empty.

## 0.7.0 (July 30, 2013)

- RNA-seq pipeline updated: deprecate Tophat 1 in favor of Tophat 2. Perform
automatic adapter trimming of common adapter sequences. STAR aligner support.
- RNA-seq pipeline updated: deprecate Tophat 1 in favor of Tophat 2. Perform
automatic adapter trimming of common adapter sequences. STAR aligner support.
RNA-SeQC support for RNA-seq specific quality control. Transcript quantitation
with htseq-count.
- Updated installation and upgrade procedures, to make it easier to build an
Expand Down
2 changes: 1 addition & 1 deletion bcbio/pipeline/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__="0.7.0"
__version__="0.7.1a"
8 changes: 4 additions & 4 deletions bcbio/variation/cortex.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from bcbio.pipeline import config_utils
from bcbio.pipeline.shared import subset_variant_regions
from bcbio.utils import file_exists, safe_makedir
from bcbio.variation.genotype import write_empty_vcf
from bcbio.variation import vcfutils

def run_cortex(align_bams, items, ref_file, assoc_files, region=None,
out_file=None):
Expand Down Expand Up @@ -58,7 +58,7 @@ def run_cortex(align_bams, items, ref_file, assoc_files, region=None,
_combine_variants(regional_vcfs, combine_file, ref_file, config)
_select_final_variants(combine_file, out_file, config)
else:
write_empty_vcf(out_file)
vcfutils.write_empty_vcf(out_file)
return out_file

def _passes_cortex_depth(line, min_depth):
Expand Down Expand Up @@ -138,7 +138,7 @@ def _run_cortex_on_region(region, align_bam, ref_file, work_dir, out_file_base,
if not file_exists(out_file):
fastq = _get_fastq_in_region(region, align_bam, out_vcf_base)
if _count_fastq_reads(fastq, min_reads) < min_reads:
write_empty_vcf(out_file)
vcfutils.write_empty_vcf(out_file)
else:
local_ref, genome_size = _get_local_ref(region, ref_file, out_vcf_base)
indexes = _index_local_ref(local_ref, cortex_dir, stampy_dir, kmers)
Expand All @@ -150,7 +150,7 @@ def _run_cortex_on_region(region, align_bam, ref_file, work_dir, out_file_base,
if cortex_out:
_remap_cortex_out(cortex_out, region, out_file)
else:
write_empty_vcf(out_file)
vcfutils.write_empty_vcf(out_file)
finally:
if os.path.exists(base_dir):
shutil.rmtree(base_dir)
Expand Down
10 changes: 2 additions & 8 deletions bcbio/variation/genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def unified_genotyper(align_bams, items, ref_file, assoc_files,
region, out_file)
if (not isinstance(region, (list, tuple)) and
not all(has_aligned_reads(x, region) for x in align_bams)):
write_empty_vcf(out_file)
vcfutils.write_empty_vcf(out_file)
else:
with file_transaction(out_file) as tx_out_file:
params += ["-T", "UnifiedGenotyper",
Expand All @@ -90,7 +90,7 @@ def haplotype_caller(align_bams, items, ref_file, assoc_files,
assert broad_runner.gatk_type() == "restricted", \
"Require full version of GATK 2.4+ for haplotype calling"
if not all(has_aligned_reads(x, region) for x in align_bams):
write_empty_vcf(out_file)
vcfutils.write_empty_vcf(out_file)
else:
with file_transaction(out_file) as tx_out_file:
params += ["-T", "HaplotypeCaller",
Expand Down Expand Up @@ -130,12 +130,6 @@ def _gatk_location_hack(args):
extra_args.extend(exclude_args[chrom])
return args + extra_args

def write_empty_vcf(out_file):
with open(out_file, "w") as out_handle:
out_handle.write("##fileformat=VCFv4.1\n"
"## No variants; no reads aligned in region\n"
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

# ## Variant filtration -- shared functionality

def _get_coverage_params(config):
Expand Down
7 changes: 3 additions & 4 deletions bcbio/variation/mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@
from bcbio.utils import file_exists
from bcbio.distributed.transaction import file_transaction
from bcbio.variation.realign import has_aligned_reads
from bcbio.variation.genotype import write_empty_vcf
from bcbio.pipeline.shared import subset_variant_regions
from bcbio.variation import bamprep
from bcbio.variation import bamprep, vcfutils

_PASS_EXCEPTIONS = set(["java.lang.RuntimeException: "
"java.lang.IllegalArgumentException: "
Expand Down Expand Up @@ -114,7 +113,7 @@ def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
if (not isinstance(region, (list, tuple)) and
not all(has_aligned_reads(x, region) for x in align_bams)):

write_empty_vcf(out_file)
vcfutils.write_empty_vcf(out_file)
return

with file_transaction(out_file) as tx_out_file:
Expand All @@ -130,7 +129,7 @@ def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
# will be ignored. All the other exceptions will be raised
# correctly.
if java_exception in _PASS_EXCEPTIONS:
write_empty_vcf(tx_out_file)
vcfutils.write_empty_vcf(tx_out_file)
return
else:
raise
Expand Down
5 changes: 2 additions & 3 deletions bcbio/variation/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
from bcbio.log import logger
from bcbio.pipeline import config_utils
from bcbio.pipeline.shared import subset_variant_regions
from bcbio.variation.genotype import write_empty_vcf
from bcbio.variation import bamprep, realign
from bcbio.variation import bamprep, realign, vcfutils

def shared_variantcall(call_fn, name, align_bams, ref_file, config,
assoc_files, region=None, out_file=None):
Expand All @@ -31,7 +30,7 @@ def shared_variantcall(call_fn, name, align_bams, ref_file, config,
if ((variant_regions is not None and isinstance(target_regions, basestring)
and not os.path.isfile(target_regions))
or not all(realign.has_aligned_reads(x, region) for x in align_bams)):
write_empty_vcf(out_file)
vcfutils.write_empty_vcf(out_file)
else:
with file_transaction(out_file) as tx_out_file:
call_fn(align_bams, ref_file, config, target_regions,
Expand Down
5 changes: 2 additions & 3 deletions bcbio/variation/varscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@

from bcbio.pipeline import config_utils
from bcbio.provenance import do, programs
from bcbio.variation import samtools
from bcbio.variation.genotype import write_empty_vcf
from bcbio.variation import samtools, vcfutils

import pysam

Expand Down Expand Up @@ -59,4 +58,4 @@ def _varscan_work(align_bams, ref_file, config, target_regions, out_file):
# VarScan can create completely empty files in regions without
# variants, so we create a correctly formatted empty file
if os.path.getsize(out_file) == 0:
write_empty_vcf(out_file)
vcfutils.write_empty_vcf(out_file)
51 changes: 50 additions & 1 deletion bcbio/variation/vcfutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@
from bcbio.pipeline import shared
from bcbio.variation import bamprep

def write_empty_vcf(out_file):
with open(out_file, "w") as out_handle:
out_handle.write("##fileformat=VCFv4.1\n"
"## No variants; no reads aligned in region\n"
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")


def split_snps_indels(broad_runner, orig_file, ref_file):
"""Split a variant call file into SNPs and INDELs for processing.
"""
Expand All @@ -32,6 +39,38 @@ def split_snps_indels(broad_runner, orig_file, ref_file):
broad_runner.run_gatk(cur_params)
return snp_file, indel_file

def _get_exclude_samples(in_file, to_exclude):
"""Identify samples in the exclusion list which are actually in the VCF.
"""
out = []
with open(in_file) as in_handle:
for line in in_handle:
if line.startswith("#CHROM"):
parts = line.strip().split("\t")
for s in parts[9:]:
if s in to_exclude:
out.append(s)
return out

def exclude_samples(in_file, out_file, to_exclude, ref_file, config):
"""Exclude specific samples from an input VCF file using GATK SelectVariants.
"""
to_exclude = _get_exclude_samples(in_file, to_exclude)
# can use the input sample, all exclusions already gone
if len(to_exclude) == 0:
out_file = in_file
elif not utils.file_exists(out_file):
broad_runner = broad.runner_from_config(config)
with file_transaction(out_file) as tx_out_file:
params = ["-T", "SelectVariants",
"-R", ref_file,
"--variant", in_file,
"--out", tx_out_file]
for x in to_exclude:
params += ["-xl_sn", x]
broad_runner.run_gatk(params)
return out_file

def combine_variant_files(orig_files, out_file, ref_file, config,
quiet_out=True, region=None):
"""Combine multiple VCF files into a single output file.
Expand Down Expand Up @@ -94,6 +133,13 @@ def _sort_by_region(fnames, regions, ref_file, config):
sitems.sort()
return [x[1] for x in sitems]

def _has_variants(in_file):
with open(in_file) as in_handle:
for line in in_handle:
if not line.startswith("#"):
return True
return False

def concat_variant_files(orig_files, out_file, regions, ref_file, config):
"""Concatenate multiple variant files from regions into a single output file.
Expand All @@ -103,14 +149,17 @@ def concat_variant_files(orig_files, out_file, regions, ref_file, config):
if not utils.file_exists(out_file):
with file_transaction(out_file) as tx_out_file:
with open(tx_out_file, "w") as out_handle:
for i, orig_file in enumerate(_sort_by_region(orig_files, regions, ref_file, config)):
for i, orig_file in enumerate(f for f in _sort_by_region(orig_files, regions, ref_file, config)
if _has_variants(f)):
with open(orig_file) as in_handle:
for line in in_handle:
if line.startswith("#"):
if i == 0:
out_handle.write(line)
else:
out_handle.write(line)
if os.path.getsize(tx_out_file) == 0:
write_empty_vcf(tx_out_file)
return out_file

# ## Parallel VCF file combining
Expand Down
16 changes: 15 additions & 1 deletion tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def setUp(self):
self.data_dir = os.path.join(os.path.dirname(__file__), "data")

@attr(speed=1)
def test_parallel_vcf_combine(self):
def test_1_parallel_vcf_combine(self):
"""Parallel combination of VCF files, split by chromosome.
"""
var_dir = os.path.join(self.data_dir, "variants")
Expand All @@ -62,3 +62,17 @@ def test_parallel_vcf_combine(self):
if os.path.exists(out_file):
os.remove(out_file)
vcfutils.parallel_combine_variants(files, out_file, ref_file, config, run_parallel)

@attr(speed=1)
def test_2_vcf_exclusion(self):
"""Exclude samples from VCF files.
"""
fname = os.path.join(self.data_dir, "variants", "S1_S2-combined.vcf")
ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
config = load_config(os.path.join(self.data_dir, "automated",
"post_process-sample.yaml"))
out_file = "%s-exclude%s" % os.path.splitext(fname)
to_exclude = ["S1"]
if os.path.exists(out_file):
os.remove(out_file)
vcfutils.exclude_samples(fname, out_file, to_exclude, ref_file, config)

0 comments on commit 1f5a620

Please sign in to comment.