forked from hall-lab/bamkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bamtofastq.py
executable file
·187 lines (160 loc) · 6.7 KB
/
bamtofastq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env python
# for tgi cluster:
#/gapp/x64linux/opt/pythonbrew/venvs/Python-2.7.6/gemini/bin/python
# for uva cluster:
import pysam
import sys
import argparse
from argparse import RawTextHelpFormatter
import string
from string import *
__author__ = "Ira Hall (ihall@genome.wustl.edu) and Colby Chiang (cc2qe@virginia.edu)"
__version__ = "$Revision: 0.0.1 $"
__date__ = "$Date: 2014-09-04 14:31 $"
def bamtofastq(bamlist, is_sam, readgroup, rename, header):
for bamfile in bamlist:
# get file and header
if bamfile == None:
if is_sam:
bam = pysam.Samfile("-", "r", check_sq=False)
else:
bam = pysam.Samfile('-', 'rb', check_sq=False)
else:
if is_sam:
bam = pysam.Samfile(bamfile, 'r', check_sq=False)
else:
bam = pysam.Samfile(bamfile, "rb", check_sq=False)
# parse readgroup string
try:
rg_list = readgroup.split(',')
except AttributeError:
rg_list = None
d = {}
counter = 0
header_written = False
for al in bam:
# print the header to file if requested
if (header is not None
and not header_written):
header.write(bam.text)
header.close()
header_written = True
# must be primary read alignment, not secondary or supplementary
if (al.is_secondary or al.flag & 2048 == 2048):
continue
# skip unpaired reads
if (not al.is_paired):
continue
# must be in a user specified readgroup
if rg_list and al.opt('RG') not in rg_list:
continue
# ensures the read is not hard-clipped. important
# when the BAM doesn't have shorter hits flagged as
# secondary
if al.cigar is not None and 5 in [x[0] for x in al.cigar]:
continue
# add read name to dictionary if not already there
key = al.qname
if key not in d:
d.setdefault(key,al)
# print matched read pairs
else:
# RG:Z:ID
try:
RG1 = d[key].opt('RG')
except KeyError:
RG1 = ""
try:
RG2 = al.opt('RG')
except KeyError:
RG2 = ""
counter += 1
if rename:
al.qname = RG2 + '.' + str(counter)
d[key].qname = RG1 + '.' + str(counter)
if al.is_read1:
printfastq_rg(al,1,RG2)
printfastq_rg(d[key],2,RG1)
else:
printfastq_rg(d[key],1,RG1)
printfastq_rg(al,2,RG2)
del d[key]
if len(d) != 0:
sys.stderr.write('Warning: %s unmatched name groups\n' % len(d))
#===================================================================================================================================================
# functions
#===================================================================================================================================================
def get_args():
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="\
bamtofastq.py\n\
author: " + __author__ + "\n\
version: " + __version__ + "\n\
description: Convert a coordinate sorted BAM file to FASTQ\n\
(ignores unpaired reads)")
# parser.add_argument('-i', '--input', metavar='BAM', required=False, help='Input BAM file')
parser.add_argument('-r', '--readgroup', metavar='STR', default=None, required=False, help='Read group(s) to extract (comma separated)')
parser.add_argument('-n', '--rename', required=False, action='store_true', help='Rename reads')
parser.add_argument('-S', '--is_sam', required=False, action='store_true', help='Input is SAM format')
parser.add_argument('-H', '--header', metavar='FILE', type=argparse.FileType('w'), default=None, required=False,
help='Write BAM header to file')
parser.add_argument('input', metavar='BAM', nargs='*', type=str,
help='Input BAM file(s). If absent then defaults to stdin.')
# parse the arguments
args = parser.parse_args()
# bail if no BAM file
if len(args.input) == 0:
if sys.stdin.isatty():
parser.print_help()
exit(1)
else:
args.input = [None]
# send back the user input
return args
def printfastq(al,read):
if(al.is_reverse):
print "@" + str(al.qname) + "/" + str(read) + "\n" + str(revcomp(al.seq)) + "\n" + "+" + "\n" + str(al.qual[::-1])
else:
print "@" + str(al.qname) + "/" + str(read) + "\n" + str(al.seq) + "\n" + "+" + "\n" + str(al.qual)
def printfastq_rg(al,read, rg):
if(al.is_reverse):
print "@" + str(al.qname) + "/" + str(read) + " " + "RG:Z:" + str(rg) + "\n" + str(revcomp(al.seq)) + "\n" + "+" + "\n" + str(al.qual[::-1])
else:
print "@" + str(al.qname) + "/" + str(read) + " " + "RG:Z:" + str(rg) + "\n" + str(al.seq) + "\n" + "+" + "\n" + str(al.qual)
def revcomp(seq):
seq1 = seq.translate(maketrans("AGCTagct", "TCGAtcga"))
seq2 = seq1[::-1]
return seq2
#===================================================================================================================================================
# driver
#===================================================================================================================================================
class Usage(Exception):
def __init__(self, msg):
self.msg = msg
def main():
args = get_args()
bamtofastq(args.input,
args.is_sam,
args.readgroup,
args.rename,
args.header)
if __name__ == "__main__":
try:
sys.exit(main())
except IOError, e:
if e.errno != 32: # ignore SIGPIPE
raise
# new features needed
# don't forget to select for primary alignments before using on bwa-mem files - DONE
# query memory usage to print out
# write output files based on library name
# include sample name as part of output file
# include sequencing center as part of output file
# stop opening and closing file the whole time - DONE
# close files at end
# add strict error checking ; need to catch all mistakes!!!!
# option for interleaved versus two fastq formats
# option for compressed versus uncompressed
# write out sequencing center - DONE
# need to make read-group based
# need to actually align output:
# need to have output directory prefix - default blank