-
Notifications
You must be signed in to change notification settings - Fork 6
/
bamgroupreads.py
executable file
·175 lines (150 loc) · 6.14 KB
/
bamgroupreads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python
# for tgi cluster:
#/gapp/x64linux/opt/pythonbrew/venvs/Python-2.7.6/gemini/bin/python
# for uva cluster:
import pysam
import sys
import argparse
from argparse import RawTextHelpFormatter
import string
from string import *
__author__ = "Colby Chiang (cc2qe@virginia.edu)"
__version__ = "$Revision: 0.0.1 $"
__date__ = "$Date: 2014-12-15 11:43 $"
def bamgroupreads(bamfile, readgroup, reset_dups, fix_flags, is_sam, bam_out, uncompressed_out):
# set input file
if bamfile == None:
if is_sam:
in_bam = pysam.Samfile("-", "r")
else:
in_bam = pysam.Samfile('-', 'rb')
else:
if is_sam:
in_bam = pysam.Samfile(bamfile, 'r')
else:
in_bam = pysam.Samfile(bamfile, "rb")
# set output file
if uncompressed_out:
out_bam = pysam.Samfile('-', 'wbu', template=in_bam)
elif bam_out:
out_bam = pysam.Samfile('-', 'wb', template=in_bam)
else:
out_bam = pysam.Samfile('-', 'wh', template=in_bam)
# parse readgroup string
try:
rg_list = readgroup.split(',')
except AttributeError:
rg_list = None
d = {}
for al in in_bam:
# must be in a user specified readgroup
if rg_list and al.opt('RG') not in rg_list:
continue
# add read name to dictionary if not already there
key = al.qname
if key not in d:
d.setdefault(key,Namegroup(al))
# print matched read pairs
else:
d[key].add_alignment(al)
if d[key].is_complete():
for al in d[key].alignments:
if reset_dups:
# unset the duplicate flag
al.is_duplicate = 0
if fix_flags:
# fix the secondary mate flag
proper_pair = False
duplicate = False
read1_unmapped = False
read2_unmapped = False
# gather info on the read cluster flags
for flagcheck in d[key].alignments:
if flagcheck.is_proper_pair:
proper_pair = True
if flagcheck.is_duplicate:
duplicate = True
if (legacy and flagcheck.is_secondary
or not legacy and flagcheck.flag & 2048 == 2048):
continue
if flagcheck.is_read1:
read1_unmapped = flagcheck.is_unmapped
elif flagcheck.is_read2:
read2_unmapped = flagcheck.is_unmapped
# set new info on the read cluster
if al.is_read1:
al.mate_is_unmapped = read2_unmapped
elif al.is_read2:
al.mate_is_unmapped = read1_unmapped
al.is_proper_pair = proper_pair
al.is_duplicate = duplicate
try:
out_bam.write(al)
except ValueError: # happens when pipe is closed
sys.exit(0)
del d[key]
if len(d) != 0:
sys.stderr.write('Warning: %s unmatched name groups\n' % len(d))
# ============================================
# functions
# ============================================
# class that holds reads from a sequence fragment
class Namegroup():
def __init__(self, al):
self.alignments = list()
self.name = al.qname
self.sa = 0
self.num_prim = 0
self.add_alignment(al)
def add_alignment(self, al):
self.alignments.append(al)
if not (legacy and al.is_secondary
or not legacy and al.flag & 2048 == 2048):
self.num_prim += 1
try:
self.sa += len(al.opt('SA').rstrip(';').split(';'))
# print self.sa
except KeyError:
pass
def is_complete(self):
return self.num_prim == 2 and len(self.alignments) == self.sa + 2
def get_args():
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="\
bamgroupreads.py\n\
author: " + __author__ + "\n\
version: " + __version__ + "\n\
description: Group BAM file by read IDs without sorting")
parser.add_argument('-i', '--input', metavar='BAM', required=False, help='Input BAM file')
parser.add_argument('-r', '--readgroup', metavar='STR', default=None, required=False, help='Read group(s) to extract (comma separated)')
parser.add_argument('-d', '--reset_dups', required=False, action='store_true', help='Reset duplicate flags')
parser.add_argument('-f', '--fix_flags', required=False, action='store_true', help='Fix mate flags for secondary reads')
parser.add_argument('-S', required=False, action='store_true', help='Input is SAM format')
parser.add_argument('-b', required=False, action='store_true', help='Output BAM format')
parser.add_argument('-u', required=False, action='store_true', help='Output uncompressed BAM format (implies -b)')
parser.add_argument('-M', action='store_true', dest='legacy', required=False, help='split reads are flagged as secondary, not supplementary. For compatibility with legacy BWA-MEM "-M" flag')
# parse the arguments
args = parser.parse_args()
global legacy
legacy = args.legacy
# bail if no BAM file
if args.input is None:
if sys.stdin.isatty():
parser.print_help()
exit(1)
# send back the user input
return args
# ============================================
# driver
# ============================================
class Usage(Exception):
def __init__(self, msg):
self.msg = msg
def main():
args = get_args()
bamgroupreads(args.input, args.readgroup, args.reset_dups, args.fix_flags, args.S, args.b, args.u)
if __name__ == "__main__":
try:
sys.exit(main())
except IOError, e:
if e.errno != 32: # ignore SIGPIPE
raise