Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tag DP_ACGT to GVCF #118

Merged
merged 2 commits into from
Aug 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 57 additions & 2 deletions python/clockwork/gvcf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from collections import Counter
import datetime
import logging
import subprocess

from cluster_vcf_records import vcf_file_read, vcf_record
import pyfastaq
Expand All @@ -15,6 +18,7 @@ def _combine_minos_and_samtools_header(minos_header, samtools_header, ref_seqs):
new_header = [
'##FILTER=<ID=NO_DATA,Description="No information from minos or samtools">',
'##INFO=<ID=CALLER,Number=1,Description="Origin of call, one of minos, samtools, or none if there was no depth">',
'##FORMAT=<ID=DP_ACGT,Number=8,Type=Integer,Description="Number of A-forward, A-reverse, C-forward, C-reverse, G-forward, G-reverse, T-forward, T-reverse bases">',
]
new_header.extend(
[f"##contig=<ID={k},length={len(v)}>" for k, v in ref_seqs.items()]
Expand Down Expand Up @@ -126,16 +130,61 @@ def _finish_contig(ref_pos, ref_seq, minos_record, minos_iter, filehandle):
ref_pos += 1


def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, out_vcf):
def _samtools_acgt_depths(bam_file):
command = f"samtools mpileup --no-output-ins --no-output-ins --no-output-del --no-output-del --no-output-ends -aa {bam_file}"
logging.info(f"Gathering per-position A/C/G/T depths ({command})")
p = subprocess.Popen(
command, shell=True, stdout=subprocess.PIPE, universal_newlines=True
)
acgt_depths = {}
acgt = ("A", "a", "C", "c", "G", "g", "T", "t")

for line in p.stdout:
ref, pos, _, _, depths, _ = line.split("\t")
pos = int(pos) - 1
if ref not in acgt_depths:
acgt_depths[ref] = {}
assert pos not in acgt_depths[ref]
counts = Counter(depths)
acgt_depths[ref][pos] = ",".join(str(counts[x]) for x in acgt)

p.wait()
if p.returncode != 0:
raise Exception(f"Error running samtools mpileup: {command}")

logging.info(f"Finished per-position A/C/G/T depths ({command})")
return acgt_depths


def _add_acgt_depths_to_minos(minos_records, acgt_depths):
for ref in minos_records:
if ref not in acgt_depths:
continue
depths = acgt_depths[ref]

for record in minos_records[ref]:
if record.POS in depths:
record.set_format_key_value("DP_ACGT", depths[record.POS])


def gvcf_from_minos_vcf_and_samtools_gvcf(
ref_fasta, minos_vcf, samtools_vcf, out_vcf, bam_file=None
):
minos_header, minos_records = vcf_file_read.vcf_file_to_dict(minos_vcf)
samtools_header = vcf_file_read.get_header_lines_from_vcf_file(samtools_vcf)
if bam_file is not None:
acgt_depths = _samtools_acgt_depths(bam_file)
_add_acgt_depths_to_minos(minos_records, acgt_depths)
else:
acgt_depths = {}

ref_seqs = {}
pyfastaq.tasks.file_to_dict(ref_fasta, ref_seqs)
used_ref_seqs = set()
ref_seq = None
ref_pos = -1
minos_record = None
logging.info(f"Making GVCF file {out_vcf} from {minos_vcf} {samtools_vcf}")

with open(out_vcf, "w") as f_out, open(samtools_vcf) as f_samtools:
print(
Expand All @@ -161,6 +210,12 @@ def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, ou
continue

samtools_record = vcf_record.VcfRecord(line)
try:
samtools_record.set_format_key_value(
"DP_ACGT", acgt_depths[samtools_record.CHROM][samtools_record.POS]
)
except KeyError:
pass

# If we've found a new CHROM in the samtools VCF file
if ref_seq is None or ref_seq.id != samtools_record.CHROM:
Expand Down Expand Up @@ -203,7 +258,7 @@ def gvcf_from_minos_vcf_and_samtools_gvcf(ref_fasta, minos_vcf, samtools_vcf, ou
print(samtools_record, file=f_out)
ref_pos = samtools_record.POS + 1

if ref_seq is not None: # happens if the samtools VCF was empty
if ref_seq is not None: # happens if the samtools VCF was empty
_finish_contig(ref_pos, ref_seq, minos_record, minos_iter, f_out)
_print_non_samtools_seqs(ref_seqs, used_ref_seqs, minos_records, f_out)

Expand Down
6 changes: 5 additions & 1 deletion python/clockwork/tasks/gvcf_from_minos_and_samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,9 @@

def run(options):
gvcf.gvcf_from_minos_vcf_and_samtools_gvcf(
options.ref_fasta, options.minos_vcf, options.samtools_vcf, options.outfile,
options.ref_fasta,
options.minos_vcf,
options.samtools_vcf,
options.outfile,
bam_file=options.bam_file,
)
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="total read depth from gramtools">
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases">
##FORMAT=<ID=DPF,Number=1,Type=Float,Description="Depth Fraction, defined as DP divided by mean depth">
##FORMAT=<ID=DP_ACGT,Number=8,Type=Integer,Description="Number of A-forward, A-reverse, C-forward, C-reverse, G-forward, G-reverse, T-forward, T-reverse bases">
##FORMAT=<ID=FQ,Number=1,Type=Float,Description="Phred probability of all samples being the same">
##FORMAT=<ID=FRS,Number=1,Type=Float,Description="Fraction of reads that support the genotype call">
##FORMAT=<ID=G3,Number=3,Type=Float,Description="ML estimate of genotype frequencies">
Expand Down
50 changes: 37 additions & 13 deletions python/clockwork/var_call_one_sample_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,13 @@ def run(
"Must give same number of forward and reverse reads files. Got:\nForward:{reads1_list}\nReverse:{reads2_list}"
)
if not check_reads_files(reads1_list + reads2_list):
raise Exception("Reads file(s) not found or are empty. See previous error messages. Cannot continue")
raise Exception(
"Reads file(s) not found or are empty. See previous error messages. Cannot continue"
)
if not os.path.exists(ref_dir):
raise FileNotFoundError(f"Reference directory not found: {ref_dir}. Cannot continue")
raise FileNotFoundError(
f"Reference directory not found: {ref_dir}. Cannot continue"
)

os.mkdir(outdir)

Expand All @@ -51,10 +55,7 @@ def run(
trimmed_reads_1.append(os.path.join(outdir, f"trimmed_reads.{i}.1.fq.gz"))
trimmed_reads_2.append(os.path.join(outdir, f"trimmed_reads.{i}.2.fq.gz"))
read_trim.run_trimmomatic(
reads1_list[i],
reads2_list[i],
trimmed_reads_1[-1],
trimmed_reads_2[-1],
reads1_list[i], reads2_list[i], trimmed_reads_1[-1], trimmed_reads_2[-1]
)

refdir = reference_dir.ReferenceDir(directory=ref_dir)
Expand All @@ -77,7 +78,9 @@ def run(
utils.syscall(cmd)
samtools_vcf_has_vars = utils.vcf_has_records(samtools_vcf)
if not samtools_vcf_has_vars:
logging.warn("SAMTOOLS VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA")
logging.warn(
"SAMTOOLS VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA"
)

samtools_gvcf = os.path.join(outdir, "samtools.gvcf")
cmd = f"bcftools mpileup -I --output-type u -f {refdir.ref_fasta} {rmdup_bam} | bcftools call -c -O v -o {samtools_gvcf}"
Expand All @@ -94,7 +97,9 @@ def run(
try:
ctx.run(run_mccortex_view_kmers=False)
except:
logging.error("ERROR RUNNING CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK CORTEX LOGS")
logging.error(
"ERROR RUNNING CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK CORTEX LOGS"
)

ctx_vcf_dir = os.path.join(cortex_dir, "cortex.out", "vcfs")
try:
Expand All @@ -108,24 +113,43 @@ def run(

cortex_vcf_has_vars = False
if len(cortex_vcfs) != 1:
logging.error("NO VCF FILE MADE BY CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK THE QUALITY OF INPUT DATA, AND CORTEX LOGS")
logging.error(
"NO VCF FILE MADE BY CORTEX. WILL TRY TO CONTINUE USING SAMTOOLS VCF ONLY (IF SAMTOOLS FOUND VARIANTS), BUT PLEASE CHECK THE QUALITY OF INPUT DATA, AND CORTEX LOGS"
)
cortex_vcf = ""
else:
cortex_vcf = os.path.join(outdir, "cortex.vcf")
os.rename(cortex_vcfs[0], cortex_vcf)
cortex_vcf_has_vars = utils.vcf_has_records(cortex_vcf)
if not cortex_vcf_has_vars:
logging.warn("CORTEX VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA")
logging.warn(
"CORTEX VCF FILE HAS ZERO VARIANTS. PLEASE CHECK THE QUALITY OF INPUT DATA"
)
if not debug:
utils.syscall(f"rm -rf {cortex_dir}")

final_vcf = os.path.join(outdir, "final.vcf")

if not (samtools_vcf_has_vars and cortex_vcf_has_vars):
logging.error("NO VARIANTS FOUND BY CORTEX OR SAMTOOLS. WRITING HEADER-ONLY FINAL VCF FILE INSTEAD OF RUNNING MINOS")
logging.error(
"NO VARIANTS FOUND BY CORTEX OR SAMTOOLS. WRITING HEADER-ONLY FINAL VCF FILE INSTEAD OF RUNNING MINOS"
)
with open(final_vcf, "w") as f:
print("##fileformat=VCFv4.2", file=f)
print("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "sample", sep="\t", file=f)
print(
"#CHROM",
"POS",
"ID",
"REF",
"ALT",
"QUAL",
"FILTER",
"INFO",
"FORMAT",
"sample",
sep="\t",
file=f,
)
minos_vcf_has_vars = False
else:
minos_dir = os.path.join(outdir, "minos")
Expand All @@ -137,7 +161,7 @@ def run(

final_gvcf = os.path.join(outdir, "final.gvcf")
gvcf.gvcf_from_minos_vcf_and_samtools_gvcf(
refdir.ref_fasta, final_vcf, samtools_gvcf, final_gvcf,
refdir.ref_fasta, final_vcf, samtools_gvcf, final_gvcf, bam_file=rmdup_bam
)
if not debug:
os.unlink(samtools_gvcf)
Expand Down
40 changes: 12 additions & 28 deletions python/scripts/clockwork
Original file line number Diff line number Diff line change
Expand Up @@ -477,20 +477,17 @@ subparser_gvcf_from_minos_and_samtools = subparsers.add_parser(
)

subparser_gvcf_from_minos_and_samtools.add_argument(
"ref_fasta",
help="Reference genome FASTA file",
"ref_fasta", help="Reference genome FASTA file"
)
subparser_gvcf_from_minos_and_samtools.add_argument(
"minos_vcf",
help="VCF file made by minos",
"minos_vcf", help="VCF file made by minos"
)
subparser_gvcf_from_minos_and_samtools.add_argument(
"samtools_vcf",
help="VCF file made by samtools",
"samtools_vcf", help="VCF file made by samtools"
)
subparser_gvcf_from_minos_and_samtools.add_argument("outfile", help="Output gVCF file")
subparser_gvcf_from_minos_and_samtools.add_argument(
"outfile",
help="Output gVCF file",
"--bam_file", help="BAM file. If used, will add DP_ACGT to VCF"
)
subparser_gvcf_from_minos_and_samtools.set_defaults(
func=clockwork.tasks.gvcf_from_minos_and_samtools.run
Expand All @@ -506,13 +503,9 @@ subparser_gvcf_to_fasta = subparsers.add_parser(
)

subparser_gvcf_to_fasta.add_argument(
"gvcf",
help="Input gVCF file made by gvcf_from_minos_and_samtools",
)
subparser_gvcf_to_fasta.add_argument(
"outfile",
help="Output FASTA file",
"gvcf", help="Input gVCF file made by gvcf_from_minos_and_samtools"
)
subparser_gvcf_to_fasta.add_argument("outfile", help="Output FASTA file")
subparser_gvcf_to_fasta.add_argument(
"--ignore_minos_pass",
action="store_true",
Expand Down Expand Up @@ -840,9 +833,7 @@ subparser_samtools_qc = subparsers.add_parser(
description="Runs BWA MEM, picard MarkDuplicates, then samtools stats and plots",
)

subparser_samtools_qc.add_argument(
"ref_fasta", help="Name of reference fasta file"
)
subparser_samtools_qc.add_argument("ref_fasta", help="Name of reference fasta file")
subparser_samtools_qc.add_argument("reads1", help="Name of reads file 1")
subparser_samtools_qc.add_argument("reads2", help="Name of reads file 2")
subparser_samtools_qc.add_argument(
Expand Down Expand Up @@ -967,19 +958,13 @@ subparser_variant_call_one_sample.add_argument(
metavar="INT",
)
subparser_variant_call_one_sample.add_argument(
"--force",
action="store_true",
help="Overwrite outdir if it already exists",
"--force", action="store_true", help="Overwrite outdir if it already exists"
)
subparser_variant_call_one_sample.add_argument(
"--keep_bam",
action="store_true",
help="Keep BAM file of rmdup reads",
"--keep_bam", action="store_true", help="Keep BAM file of rmdup reads"
)
subparser_variant_call_one_sample.add_argument(
"--debug",
action="store_true",
help="Debug mode: do not clean up any files",
"--debug", action="store_true", help="Debug mode: do not clean up any files"
)
subparser_variant_call_one_sample.add_argument(
"--no_trim",
Expand All @@ -990,8 +975,7 @@ subparser_variant_call_one_sample.add_argument(
"ref_dir", help="Directory of reference files, made by clockwork reference_prepare"
)
subparser_variant_call_one_sample.add_argument(
"outdir",
help="Output directory (must not exist, will be created)",
"outdir", help="Output directory (must not exist, will be created)"
)
subparser_variant_call_one_sample.add_argument(
"reads_files",
Expand Down
Loading