forked from deweylab/RSEM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
RefSeq.h
140 lines (111 loc) · 3.34 KB
/
RefSeq.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#ifndef REFSEQ
#define REFSEQ
#include<cassert>
#include<fstream>
#include<string>
#include<vector>
#include "utils.h"
//Each Object can only be used once
class RefSeq {
public:
RefSeq() {
fullLen = totLen = 0;
name = ""; seq = "";
fmasks.clear();
}
//Constructor , seq : the forward strand of the reference
//tag does not contain ">"
//polyALen : length of polyA tail we add
RefSeq(const std::string& name, const std::string& seq, int polyALen) {
fullLen = seq.length();
totLen = fullLen + polyALen;
this->name = name;
this->seq = seq;
this->seq.append(polyALen, 'A');
assert(fullLen > 0 && totLen >= fullLen);
int len = (fullLen - 1) / NBITS + 1;
fmasks.assign(len, 0);
// set mask if poly(A) tail is added
if (polyALen > 0) {
for (int i = std::max(fullLen - OLEN + 1, 0); i < fullLen; i++) setMask(i);
}
}
RefSeq(const RefSeq& o) {
fullLen = o.fullLen;
totLen = o.totLen;
name = o.name;
seq = o.seq;
fmasks = o.fmasks;
}
RefSeq& operator= (const RefSeq &rhs) {
if (this != &rhs) {
fullLen = rhs.fullLen;
totLen = rhs.totLen;
name = rhs.name;
seq = rhs.seq;
fmasks = rhs.fmasks;
}
return *this;
}
~RefSeq() {}
bool read(std::ifstream&, int = 0);
void write(std::ofstream&);
int getFullLen() const { return fullLen; }
int getTotLen() const { return totLen; }
const std::string& getName() const { return name; }
std::string getSeq() const { return seq; }
std::string getRSeq() const {
std::string rseq = "";
for (int i = totLen - 1; i >= 0; i--) rseq.push_back(getCharacter(get_rbase_id(seq[i])));
return rseq;
}
//get the sequence dir 0 : + 1 : -
std::string getSeq(int dir) const {
return (dir == 0 ? getSeq() : getRSeq());
}
int get_id(int pos, int dir) const {
assert(pos >= 0 && pos < totLen);
return (dir == 0 ? get_base_id(seq[pos]) : get_rbase_id(seq[totLen - pos - 1]));
}
bool getMask(int seedPos) const {
assert(seedPos >= 0 && seedPos < totLen);
return fmasks[seedPos / NBITS] & mask_codes[seedPos % NBITS];
}
void setMask(int seedPos) {
assert(seedPos >= 0 && seedPos < totLen);
fmasks[seedPos / NBITS] |= mask_codes[seedPos % NBITS];
}
private:
int fullLen; // fullLen : the original length of an isoform
int totLen; // totLen : the total length, included polyA tails, if any
std::string name; // the tag
std::string seq; // the raw sequence, in forward strand
std::vector<unsigned int> fmasks; // record masks for forward strand, each position occupies 1 bit
};
//internal read; option 0 : read all 1 : do not read seqences
bool RefSeq::read(std::ifstream& fin, int option) {
std::string line;
if (!(fin>>fullLen>>totLen)) return false;
assert(fullLen > 0 && totLen >= fullLen);
getline(fin, line);
if (!getline(fin, name)) return false;
if (!getline(fin, seq)) return false;
int len = (fullLen - 1) / NBITS + 1; // assume each cell contains NBITS bits
fmasks.assign(len, 0);
for (int i = 0; i < len; i++)
if (!(fin>>fmasks[i])) return false;
getline(fin, line);
assert(option == 0 || option == 1);
if (option == 1) { seq = ""; }
return true;
}
//write to file in "internal" format
void RefSeq::write(std::ofstream& fout) {
fout<<fullLen<<" "<<totLen<<std::endl;
fout<<name<<std::endl;
fout<<seq<<std::endl;
int len = fmasks.size();
for (int i = 0; i < len - 1; i++) fout<<fmasks[i]<<" ";
fout<<fmasks[len - 1]<<std::endl;
}
#endif