Skip to content

Commit

Permalink
Merge pull request #110 from BUStools/devel
Browse files Browse the repository at this point in the history
version 0.44.1
  • Loading branch information
Yenaled authored Oct 21, 2024
2 parents 1dd7335 + ead6061 commit 1b3948c
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 48 deletions.
6 changes: 5 additions & 1 deletion src/Common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include "roaring.hh"
#include "hash.hpp"

#define BUSTOOLS_VERSION "0.44.0"
#define BUSTOOLS_VERSION "0.44.1"

#define u_map_ std::unordered_map
enum CAPTURE_TYPE : char
Expand Down Expand Up @@ -113,6 +113,10 @@ struct Bustools_opt
bool text_dumppad = false;
bool text_showall = false;

/* extract */
bool extract_exclude = false;
bool extract_include = true;

/* linker */
int start, end;

Expand Down
4 changes: 2 additions & 2 deletions src/bustools_count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ void bustools_count(Bustools_opt &opt) {
// v[i..j-1] share the same UMI
ecs.resize(0);
for (size_t k = i; k < j; k++) {
ecs.push_back(v[k].ec);
if (k == i || v[k].ec != v[k-1].ec) ecs.push_back(v[k].ec);
}

if (opt.umi_gene_collapse) {
Expand Down Expand Up @@ -295,7 +295,7 @@ void bustools_count(Bustools_opt &opt) {
ecs.resize(0);
uint32_t counts = 0;
for (size_t k = i; k < j; k++) {
ecs.push_back(v[k].ec);
if (k == i || v[k].ec != v[k-1].ec) ecs.push_back(v[k].ec);
counts += v[k].count;
}

Expand Down
162 changes: 117 additions & 45 deletions src/bustools_extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,15 @@ inline bool open_fastqs(

for (int i = 0; i < opt.nFastqs; ++i) {
gzclose(outFastq[i]);
outFastq[i] = gzopen(std::string(opt.output + "/" + std::to_string(iFastq + 1) + ".fastq.gz").c_str(), "w");
outFastq[i] = gzopen(std::string(opt.output + "/" + std::to_string(iFastq + 1) + ".fastq.gz").c_str(), "w1");
gzclose(inFastq[i]);
inFastq[i] = gzopen(opt.fastq[iFastq].c_str(), "r");

if (seq[i]) {
kseq_destroy(seq[i]);
}
seq[i] = kseq_init(inFastq[i]);
if (kseq_read(seq[i]) < 0) {
return false;
}


++iFastq;
}
return true;
Expand Down Expand Up @@ -59,46 +56,11 @@ void bustools_extract(const Bustools_opt &opt) {
std::vector<kseq_t *> seq(opt.nFastqs, nullptr);
uint32_t iRead = 0;
size_t iFastq = 0;
if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
std::cerr << "Error reading FASTQ " << opt.fastq[iFastq] << std::endl;
goto end_extract;
}
uint32_t lastFlag = 0;

while (true) {
in.read((char *) p, N * sizeof(BUSData));
size_t rc = in.gcount() / sizeof(BUSData);
if (rc == 0) {
break;
}
nr += rc;
for (size_t i = 0; i < rc; ++i) {
while (iRead < p[i].flags) {
for (const auto &s : seq) {
int err_kseq_read = kseq_read(s);
if (err_kseq_read == -1) { // Reached EOF
if (iFastq == opt.fastq.size()) { // Done with all files
std::cerr << "Warning: number of reads in FASTQs was less than number of reads in BUS file" << std::endl;
goto end_extract;
} else {
if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl;
goto end_extract;
}
}
} else if (err_kseq_read == -2) {
std::cerr << "Error: truncated FASTQ" << std::endl;
goto end_extract;
}
}
++iRead;
}

if (iRead > p[i].flags) {
std::cerr << "BUS file not sorted by flag" << std::endl;
goto end_extract;
}

for (int i = 0; i < opt.nFastqs; ++i) {
auto write_seq_to_file = [&opt, &buf, &outFastq] (std::vector<kseq_t *> &seq) {
for (int i = 0; i < opt.nFastqs; ++i) {
int bufLen = 1; // Already have @ character in buffer

memcpy(buf + bufLen, seq[i]->name.s, seq[i]->name.l);
Expand Down Expand Up @@ -127,14 +89,124 @@ void bustools_extract(const Bustools_opt &opt) {

buf[bufLen++] = '\n';

if (gzwrite(outFastq[i], buf, bufLen) != bufLen) {
std::cerr << "Error writing to FASTQ" << std::endl;
if (gzwrite(outFastq[i], buf, bufLen) != bufLen) {
return false;
}
}
return true;
};

bool tail = false;
bool finished = false;
size_t iFlag = 0;
size_t rc = 0;

if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
std::cerr << "Error reading FASTQ " << opt.fastq[iFastq] << std::endl;
goto end_extract;
}

// fill in the first N BUS records
in.read((char *) p, N * sizeof(BUSData));
rc = in.gcount() / sizeof(BUSData);
nr += rc;
tail = rc==0;

while (true) {
// fill the next read

for (int si = 0; si < seq.size(); ++si) {
const auto &s = seq[si];
int err_kseq_read = kseq_read(s);
if (err_kseq_read == -1) { // Reached EOF
if (si != 0) {
std::cerr << "Error: truncated FASTQ" << std::endl;
goto end_extract;
} else {
// let's make sure that all the files are also EOF
for (int sii = 1; sii < seq.size(); ++sii) {
int err_kseq_read2 = kseq_read(seq[sii]);
if (err_kseq_read2 != -1) {
std::cerr << "Error: truncated FASTQ" << std::endl;
goto end_extract;
}
}
}
// check if we are done with all files
if (iFastq == opt.fastq.size()) { // Done with all files
finished = true;
break;
} else {
if (!open_fastqs(outFastq, inFastq, seq, opt, iFastq)) {
std::cerr << "Error: cannot read FASTQ " << opt.fastq[iFastq] << std::endl;
goto end_extract;
}

// read the first read
err_kseq_read = kseq_read(seq[si]);
if (err_kseq_read == -1) {
finished = true;
break;
}
}
}

if (err_kseq_read == -2) {
std::cerr << "Error: truncated FASTQ" << std::endl;
goto end_extract;
}
}

if (finished) {
break;
}

// inclusion, check if the current read matches the next unproccessed flag
if (opt.extract_include && iRead == p[iFlag].flags) {
if (!write_seq_to_file(seq)) {
std::cerr << "Error writing to FASTQ" << std::endl;
goto end_extract;
}
}

// exclusion, make sure the current read does not match the next unproccessed flag or that we are in tail mode
if (opt.extract_exclude && (iRead < p[iFlag].flags || tail)) {
if (!write_seq_to_file(seq)) {
std::cerr << "Error writing to FASTQ" << std::endl;
goto end_extract;
}
}

// if we have not exhausted the
if (!tail && iRead == p[iFlag].flags) {
// read the next flag from the next bus record
iFlag++;

if (iFlag == rc) {
// read the next batch of bus
in.read((char *) p, N * sizeof(BUSData));
rc = in.gcount() / sizeof(BUSData);
nr += rc;
iFlag = 0;
tail = rc==0;
}
}

++iRead;

if (finished) {
if (iFlag < rc) {
std::cerr << "Warning: number of reads in FASTQs was less than number of reads in BUS file" << std::endl;
goto end_extract;
}
break;
}


}



std::cerr << "Read in " << nr << " BUS records" << std::endl;

end_extract:
Expand Down
17 changes: 17 additions & 0 deletions src/bustools_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -989,6 +989,8 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt)
{"fastq", required_argument, 0, 'f'},
{"nFastqs", required_argument, 0, 'N'},
{"pipe", no_argument, 0, 'p'},
{"exclude", no_argument, 0, 'x'},
{"include", no_argument, 0, 'i'},
{0, 0, 0, 0}};

int option_index = 0, c;
Expand All @@ -1012,6 +1014,13 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt)
case '?':
opt.parse_error = true;
break;
case 'x':
opt.extract_exclude = true;
opt.extract_include = false;
break;
case 'i':
opt.extract_include = true;
break;
default:
break;
}
Expand Down Expand Up @@ -2551,6 +2560,12 @@ bool check_ProgramOptions_extract(Bustools_opt &opt)
ret = false;
}
}

if (opt.extract_exclude && opt.extract_include)
{
std::cerr << "Error: cannot specify both --exclude and --include" << std::endl;
ret = false;
}

return ret;
}
Expand Down Expand Up @@ -2904,6 +2919,8 @@ void Bustools_extract_Usage()
<< "-o, --output Output directory for FASTQ files" << std::endl
<< "-f, --fastq FASTQ file(s) from which to extract reads (comma-separated list)" << std::endl
<< "-N, --nFastqs Number of FASTQ file(s) per run" << std::endl
<< "-x, --exclude Exclude reads in the BUS file from the specified FASTQ file(s)" << std::endl
<< "-i, --include Include reads in the BUS file from the specified FASTQ file(s)" << std::endl
<< std::endl;
}

Expand Down

0 comments on commit 1b3948c

Please sign in to comment.