-
Notifications
You must be signed in to change notification settings - Fork 16
/
gtf.h
320 lines (269 loc) · 9.07 KB
/
gtf.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#ifndef __APIGENOME_GTF_H
#define __APIGENOME_GTF_H
/////////////////////
// GTF format
// 1. seqname
// 2. source
// 3. feature - gene, transcript, exon, cds
// 4. start
// 5. end
// 6. score
// 7. strand
// 8. frame
// 9. attribute
////////////////////
/////////////////////////////////////////////////////////////////////////
/// GTF class should be able to (minimally)
/// 1. Read a GTF file and organize a structure
/// 2. Efficiently find genes and regions overlapping with certain position
///////////////////////////////////////////////////////////////////////////
#include <map>
#include <set>
#include <vector>
#include <cstring>
#include <climits>
#include "Error.h"
#include "gtf_interval_tree.h"
// A single genomic int32_t interval
class posLocus {
public:
int32_t beg1; // includes 1-based, excludes 0-based
int32_t end0; // excludes 1-based, includes 0-based
// constructor
// b : beg1 value
// e : end0 value
posLocus(int32_t b, int32_t e) : beg1(b), end0(e) {}
// compare between genomeLocus
// l : rhs argument of the same type
// returns true iff this < l
inline bool operator< (const posLocus& l) const {
if ( beg1 == l.beg1 ) { return end0 < l.end0; }
else return beg1 < l.beg1;
}
// compare between genomeLocus
// l : rhs argument of the same type
// returns true iff this == l
inline bool operator== (const posLocus& l) const {
return ( ( beg1 == l.beg1 ) && ( end0 == l.end0 ) );
}
// returns length of the interval
int32_t length() const { return end0-beg1+1; }
// returns the total overlapping intervals
int32_t overlapBases(int32_t _beg1, int32_t _end0) const {
if ( ( beg1 <= end0 ) && ( _beg1 <= end0 ) ) {
return ( _beg1 < end0 ? _beg1 : end0 ) - ( beg1 < end0 ? end0 : beg1 ) + 1;
}
else return 0;
}
int32_t overlapBases(const posLocus& l) const {
return overlapBases(l.beg1, l.end0);
}
bool overlaps(int32_t _beg1, int32_t _end0) const {
if ( ( beg1 <= _end0 ) && ( _beg1 <= end0 ) )
return true;
else
return false;
}
// check overlap with other locus
bool overlaps (const posLocus& l) const {
return overlaps(l.beg1, l.end0);
}
// merge two locus if possible
bool merge (const posLocus& l) {
if ( ( beg1-1 <= l.end0 ) && ( l.beg1-1 <= end0 ) ) {
if ( l.beg1 < beg1 ) beg1 = l.beg1;
if ( l.end0 > end0 ) end0 = l.end0;
return true;
}
else {
return false;
}
}
// check whether the interval contains a particular position in 1-based coordinate
bool contains1(int32_t pos1 = INT_MAX) const {
return ( ( pos1 >= beg1 ) && ( pos1 <= end0 ) );
}
// check whether the interval contains a particular position in 0-based coordinate
bool contains0(int32_t pos0) const { return contains1(pos0+1); }
// parse a string in [chr]:[beg1]-[end0] format
static bool parseRegion(const char* region, std::string& chrom, int32_t& beg1, int32_t& end0) {
char buf[255];
strcpy(buf,region);
const char* pcolon = strchr(region,':');
const char* pminus = strchr(pcolon+1,'-');
if ( pcolon == NULL )
error("Cannot parse %s in genomeLocus::genomeLocus()");
chrom.assign(region,0,pcolon-region);
beg1 = atoi(pcolon+1);
if ( pminus == NULL ) end0 = INT_MAX;
else {
end0 = atoi(pminus+1);
if ( end0 == 0 ) end0 = INT_MAX;
}
return true;
}
// parse a string in [chr]:[beg1]-[end0] format
static bool parseBegLenStrand(const char* region, std::string& chrom, int32_t& beg1, int32_t& end0, bool& fwdStrand) {
char buf[255];
strcpy(buf,region);
const char* pcolon1 = strchr(region,':');
const char* pcolon2 = strchr(pcolon1+1,':');
const char* pcolon3 = strchr(pcolon2+1,':');
if ( pcolon1 == NULL )
error("Cannot parse %s in genomeLocus::genomeLocus()");
chrom.assign(region,0,pcolon1-region);
beg1 = atoi(pcolon1+1);
end0 = beg1 + atoi(pcolon2+1) - 1;
fwdStrand = ((pcolon3 != NULL) && (pcolon3[1] == '-')) ? false : true;
return true;
}
};
class gtfGene;
class gtfTranscript;
class gtfCDS;
class gtfElement;
class gtfExonElement;
class gtfElement {
public:
gtfElement* parent;
std::string type;
posLocus locus;
gtfElement(int32_t _start, int32_t _end, const char* _type, gtfElement* _parent);
//virtual void printElement();
};
class gtfEntryParser {
public:
std::map<std::string,std::string> dict;
std::string empty;
std::string get(const char* key) {
std::map<std::string,std::string>::iterator it = dict.find(key);
if ( it != dict.end() ) { return it->second; }
else { return empty; }
}
gtfEntryParser(const char* s) {
const char* p = s; // cursor
const char* c = p; // beginning of item
char delim = ';';
char space = ' ';
std::string key, val;
while( *p != '\0' ) {
if ( *p == delim ) { // delimiter found
// c to p-1 is a valid string
int32_t n = p - c;
if ( ( c[0] == '"' ) && ( c[n-1] == '"' ) ) {
val.assign(c+1,n-2); // take n-2 characters
}
else { // if no quotes
val.assign(c,n);
}
dict[key] = val; // fill in the dict
c = p+1; // set the start of next item
while ( ( *c != '\0' ) && ( *c == space ) ) ++c; // consume whitespaces
p = c;
}
else if ( *p == space ) {
// c to p-1 is a valid string
int32_t n = p - c;
key.assign(c, n);
c = p+1; // set the start of next item
while ( ( *c != '\0' ) && ( *c == space ) ) ++c; // consume whitespaces
p = c; // update p
}
else {
++p; // advance
}
}
}
};
struct gtfComp {
template<typename T>
bool operator()(const T* lhs, const T* rhs) const {
if ( lhs->locus == rhs->locus ) {
return ((int64_t)rhs - (int64_t)lhs > 0);
}
else return ( lhs->locus < rhs->locus );
}
};
class gtfCDS : public gtfElement {
public:
uint8_t frame;
gtfCDS(int32_t _start, int32_t _end, const char* sframe, gtfElement* _parent);
//virtual void printElement();
};
class gtfGene : public gtfElement {
public:
std::string geneId;
std::string geneName;
std::string geneType;
std::set<gtfTranscript*,gtfComp> transcripts;
std::string seqname;
bool fwdStrand;
gtfGene(const char* _seqname, int32_t _start, int32_t _end, bool _fwdStrand, std::string& _gid, std::string& _gname, std::string& _gtype);
//virtual void printElement();
};
class gtfTranscript : public gtfElement {
public:
std::string transcriptId;
std::string transcriptType;
std::set<gtfElement*,gtfComp> exons;
std::set<gtfCDS*, gtfComp> CDSs;
std::set<gtfElement*,gtfComp> UTRs;
std::set<gtfElement*,gtfComp> start_codons;
std::set<gtfElement*,gtfComp> stop_codons;
gtfTranscript(int32_t _start, int32_t _end, std::string& _tid, std::string& _ttype, gtfElement* _parent);
//virtual void printElement();
};
// An object that represent a GTF file
class gtf {
// transcript : gene
public:
gtf(const char* gtfFile, std::vector<std::string>* pGenetypes = NULL, bool addChrPrefix = false, bool removeChrPrefix = false, bool createGeneTranscript = false);
~gtf();
int32_t maxGeneLength;
int32_t maxTranscriptLength;
int32_t maxExonLength;
int32_t maxCDSLength;
int32_t maxUTRLength;
int32_t maxStartCodonLength;
int32_t maxStopCodonLength;
typedef std::multimap<posLocus, gtfElement*> gtf_chr_t;
typedef std::map<std::string, gtf_chr_t >::iterator gtf_chr_it_t;
std::map<std::string, gtf_chr_t > mmap;
typedef gtfIntervalTree<int32_t,gtfElement*> gtf_ivt_t;
//typedef gtfInterval<int32_t,gtfElement*> gtf_iv_t;
std::map<std::string, gtf_ivt_t> chr2ivt;
std::map<std::string, gtfGene*> gid2Gene;
std::map<std::string, gtfTranscript*> tid2Transcript;
std::map<std::string, gtfGene*> tid2Gene;
typedef std::multimap<posLocus, gtfElement*>::iterator gtf_elem_it_t;
gtf_chr_it_t curChrIt;
gtf_elem_it_t curElemIt;
void rewind();
bool next();
bool isend() const;
bool addGene(const char* seqname, int32_t start, int32_t end,
const char* strand,
std::string& gid, std::string& gname, std::string& gtype);
bool addTranscript(const char* seqname, int32_t start, int32_t end,
const char* strand,
std::string& gid, std::string& tid, std::string& ttype);
bool addExon(const char* seqname, int32_t start, int32_t end,
const char* strand,
std::string& tid);
bool addUTR(const char* seqname, int32_t start, int32_t end,
const char* strand,
std::string& tid);
bool addCDS(const char* seqname, int32_t start, int32_t end,
const char* strand, const char* frame,
std::string& tid);
bool addStartCodon(const char* seqname, int32_t start, int32_t end,
const char* strand,
std::string& tid);
bool addStopCodon(const char* seqname, int32_t start, int32_t end,
const char* strand,
std::string& tid);
bool checkTranscriptSanity(std::string& tid, const char* seqname, const char* strand);
int32_t findOverlappingElements(const char* seqname, int32_t start, int32_t end, std::set<gtfElement*>& results);
int32_t findOverlappingElements(const char* seqname, int32_t start, int32_t end, bool fwdStrand, std::set<gtfElement*>& results);
};
#endif