Merge pull request #151 from smithlabcode/selectsites-zipped-output-and-summary

selectsites: zipped output and summary
andrewdavidsmith authored Sep 29, 2023
2 parents f4a445c + 04261d0 commit f6fe880
Showing 1 changed file with 160 additions and 34 deletions.
194 changes: 160 additions & 34 deletions src/utils/selectsites.cpp
@@ -43,11 +43,97 @@ using std::ios_base;
using std::runtime_error;
using std::ifstream;
using std::unordered_map;
using std::tuple;
using std::tie;

using bamxx::bgzf_file;

namespace fs = std::filesystem;

struct quick_buf : public std::ostringstream,
public std::basic_stringbuf<char> {
// ADS: By user ecatmur on SO; very fast. Seems to work...
quick_buf() {
// ...but this seems to depend on data layout
static_cast<std::basic_ios<char>&>(*this).rdbuf(this);
}
void clear() {
// reset buffer pointers (member functions)
setp(pbase(), pbase());
}
char const* c_str() {
/* between c_str and insertion make sure to clear() */
*pptr() = '\0';
return pbase();
}
};
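
// Illustrative usage sketch: quick_buf redirects its own rdbuf to itself, so
// pbase() points at the formatted characters and c_str()/tellp() expose them
// without an extra copy. The pattern used in get_sites_in_region below is:
//
//   quick_buf buf;
//   buf.clear();                           // reset put pointers before reuse
//   buf << the_site << '\n';               // format one record
//   out.write(buf.c_str(), buf.tellp());   // raw bytes and size, no copy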

struct selectsites_summary {

// command_line is the command used to produce this summary file and
// the corresponding results
string command_line{};

// n_target_regions is the number of target regions provided as
// input to the command
uint64_t n_target_regions{};

// target_region_size is the sum of the sizes of each target region
uint64_t target_region_size{};

// n_target_regions_collapsed is the number of target regions after
// having collapsed the input target regions merging those that
// overlap
uint64_t n_target_regions_collapsed{};

// target_region_collapsed_size is the sum of the sizes of target
// regions after collapsing
uint64_t target_region_collapsed_size{};

// n_sites_total is the total number of sites available in the input
// counts file. This value is displayed as zero if the command
// specified to process the sites on disk.
uint64_t n_sites_total{};

// n_sites_selected is the total number of sites in the output file
uint64_t n_sites_selected{};

template<typename T> static auto measure_target_regions(const T &t)
-> tuple<uint64_t, uint64_t> {
return {
std::size(t),
accumulate(cbegin(t), cend(t), 0ul,
[](const uint64_t a, const typename T::value_type &v) {
return a + v.get_width();
}),
};
}

auto tostring() const -> string {
std::ostringstream oss;
oss << "command_line: " << command_line << '\n'
<< "n_target_regions: " << n_target_regions << '\n'
<< "target_region_size: " << target_region_size << '\n'
<< "n_target_regions_collapsed: " << n_target_regions_collapsed << '\n'
<< "target_region_collapsed_size: " << target_region_collapsed_size
<< '\n'
<< "n_sites_total: " << n_sites_total << '\n'
<< "n_sites_selected: " << n_sites_selected;
return oss.str();
}
};
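
// For reference, tostring() above produces one "key: value" pair per line;
// with hypothetical values the summary file looks like:
//
//   command_line: "selectsites -o out.counts targets.bed input.counts"
//   n_target_regions: 1200
//   target_region_size: 350000
//   n_target_regions_collapsed: 1150
//   target_region_collapsed_size: 340000
//   n_sites_total: 29000000
//   n_sites_selected: 81000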

static auto
write_stats_output(const selectsites_summary &summary,
const string &summary_file) -> void {
if (!summary_file.empty()) {
std::ofstream out_summary(summary_file);
if (!out_summary) throw runtime_error("bad summary output file");
out_summary << summary.tostring() << endl;
}
}
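
// The summary is written only when a summary file was requested on the
// command line; an empty summary_file makes this a no-op.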


static void
collapsebed(vector<GenomicRegion> &regions) {
size_t j = 0;
@@ -73,17 +159,21 @@ contains(const GenomicRegion &r, const MSite &s) {
(r.get_start() <= s.pos && s.pos < r.get_end()));
}

-template<class T> static void
+static auto
process_all_sites(const bool VERBOSE, const string &sites_file,
const unordered_map<string, vector<GenomicRegion>> &regions,
-                  T &out) {
+                  bgzf_file &out) -> tuple<uint64_t, uint64_t> {
bgzf_file in(sites_file, "r");
if (!in) throw runtime_error("cannot open file: " + sites_file);

uint64_t n_sites_total = 0u;
uint64_t n_sites_selected = 0u;

MSite the_site, prev_site;
vector<GenomicRegion>::const_iterator i, i_lim;
bool chrom_is_relevant = false;
while (read_site(in, the_site)) {
++n_sites_total;
if (the_site.chrom != prev_site.chrom) {
if (VERBOSE) cerr << "processing " << the_site.chrom << endl;
const auto r = regions.find(the_site.chrom);
@@ -96,41 +186,58 @@ process_all_sites(const bool VERBOSE, const string &sites_file,
if (chrom_is_relevant) {
while (i != i_lim && precedes(*i, the_site))
++i;
-      if (i != i_lim && contains(*i, the_site))
+      if (i != i_lim && contains(*i, the_site)) {
++n_sites_selected;
write_site(out, the_site);
}
}
std::swap(prev_site, the_site);
}
return {n_sites_selected, n_sites_total};
}
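
// Note on the scan above: regions_lookup holds, for each chromosome, regions
// that are sorted and collapsed (non-overlapping), and sites are assumed to
// arrive in sorted order, so the iterator i only moves forward; the work is
// linear in the number of sites plus regions per chromosome. The returned
// pair is {n_sites_selected, n_sites_total}, unpacked in that order at the
// call site.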

-static void
+static auto
get_sites_in_region(ifstream &site_in, const GenomicRegion &region,
-                    std::ostream &out) {
+                    bgzf_file &out) -> uint64_t {
quick_buf buf;

const string chrom{region.get_chrom()};
const size_t start_pos = region.get_start();
const size_t end_pos = region.get_end();
find_offset_for_msite(chrom, start_pos, site_in);

MSite the_site;
uint64_t n_sites_selected = 0u;
// ADS: should only happen once that "the_site.chrom < chrom" and
// this is only needed because of end state of binary search on disk
while (site_in >> the_site &&
(the_site.chrom < chrom ||
(the_site.chrom == chrom && the_site.pos < end_pos)))
-    if (start_pos <= the_site.pos) out << the_site << endl;
+    if (start_pos <= the_site.pos) {
// ADS: don't like this -- always needing to "clear()" this
// struct is bad...
buf.clear();
buf << the_site << '\n';
if (!out.write(buf.c_str(), buf.tellp()))
throw runtime_error("error writing output");
++n_sites_selected;
}
return n_sites_selected;
}
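
// Note: find_offset_for_msite positions site_in, via a binary search over the
// uncompressed counts file, at (or just before) the first site of chrom with
// position >= start_pos; the loop then reads sequentially until it passes
// end_pos, so only sites near the target region are parsed from disk.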

-static void

+static auto
process_with_sites_on_disk(const string &sites_file,
vector<GenomicRegion> &regions,
-                           std::ostream &out) {
+                           bgzf_file &out) -> uint64_t {
ifstream in(sites_file);
if (!in)
throw runtime_error("cannot open file: " + sites_file);

-  for (size_t i = 0; i < regions.size() && in; ++i)
-    get_sites_in_region(in, regions[i], out);
+  uint64_t n_sites_selected = 0ul;
+  for (auto i = 0u; i < size(regions) && in; ++i)
+    n_sites_selected += get_sites_in_region(in, regions[i], out);
return n_sites_selected;
}
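
// On this path only n_sites_selected is reported; n_sites_total is left at
// its default of zero in the summary, as noted in selectsites_summary above.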


@@ -164,15 +271,28 @@ is_compressed_file(const string &filename) {
}


static auto
get_command_line(const int argc, const char **const argv) -> string {
if (argc == 0) return string();
std::ostringstream cmd;
cmd << '"';
copy(argv, argv + (argc - 1), std::ostream_iterator<const char *>(cmd, " "));
cmd << argv[argc-1] << '"';
return cmd.str();
}
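
// Illustrative behavior, with a hypothetical argv: given
//   argv = {"selectsites", "-o", "out.counts", "targets.bed", "in.counts"}
// the result is the single quoted string
//   "selectsites -o out.counts targets.bed in.counts"
// i.e. arguments joined by single spaces and wrapped in double quotes, which
// is how command_line later appears in the summary output.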


int
main_selectsites(int argc, const char **argv) {

try {

bool VERBOSE = false;
-    bool load_entire_file = false;
+    bool keep_file_on_disk = false;
bool compress_output = false;

-    string outfile;
+    string outfile("-");
string summary_file;

const string description =
"Select sites inside a set of genomic intervals. "
@@ -184,9 +304,11 @@ main_selectsites(int argc, const char **argv) {
"<intervals-bed> <sites-file>", 2);
opt_parse.add_opt("output", 'o', "output file (default: stdout)",
false, outfile);
opt_parse.add_opt("preload", 'p',
"preload sites (use for large target intervals)",
false, load_entire_file);
opt_parse.add_opt("disk", 'd', "process sites on disk "
"(fast if target intervals are few)",
false, keep_file_on_disk);
opt_parse.add_opt("summary", 'S', "write summary to this file", false, summary_file);
opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output);
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
opt_parse.set_show_defaults();
vector<string> leftover_args;
@@ -212,11 +334,14 @@ main_selectsites(int argc, const char **argv) {
const string sites_file = leftover_args.back();
/****************** END COMMAND LINE OPTIONS *****************/

selectsites_summary summary;
summary.command_line = get_command_line(argc, argv);

if (!fs::is_regular_file(sites_file))
throw runtime_error("bad input sites file: " + sites_file);

if (is_compressed_file(sites_file)) {
-      load_entire_file = true;
+      keep_file_on_disk = false;
if (VERBOSE)
cerr << "input file is so must be loaded" << endl;
}
@@ -225,34 +350,35 @@ main_selectsites(int argc, const char **argv) {
ReadBEDFile(regions_file, regions);
if (!check_sorted(regions))
throw runtime_error("regions not sorted in file: " + regions_file);
std::tie(summary.n_target_regions, summary.target_region_size) =
selectsites_summary::measure_target_regions(regions);

const size_t n_orig_regions = regions.size();
collapsebed(regions);
if (VERBOSE && n_orig_regions != regions.size())
cerr << "[number of regions merged due to overlap: "
<< n_orig_regions - regions.size() << "]" << endl;

std::tie(summary.n_target_regions_collapsed,
summary.target_region_collapsed_size) =
selectsites_summary::measure_target_regions(regions);

unordered_map<string, vector<GenomicRegion>> regions_lookup;
-    if ((outfile.empty() || !has_gz_ext(outfile)) && load_entire_file)
-      regions_by_chrom(regions, regions_lookup);
+    if (!keep_file_on_disk) regions_by_chrom(regions, regions_lookup);

-    if (outfile.empty() || !has_gz_ext(outfile)) {
-      std::ofstream of;
-      if (!outfile.empty()) of.open(outfile);
-      std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf());
-      if (!outfile.empty() && !out)
-        throw runtime_error("failed to open output file: " + outfile);
+    // open the output file
+    const string output_mode = compress_output ? "w" : "wu";
+    bamxx::bgzf_file out(outfile, output_mode);
+    if (!out) throw runtime_error("error opening output file: " + outfile);

-      if (load_entire_file)
-        process_all_sites(VERBOSE, sites_file, regions_lookup, out);
-      else
+    if (keep_file_on_disk)
+      summary.n_sites_selected =
        process_with_sites_on_disk(sites_file, regions, out);
-    }
-    else {
-      // not supporting search on disk for gz file
-      bgzf_file out(outfile, "w");
-      process_all_sites(VERBOSE, sites_file, regions_lookup, out);
-    }
+    else
+      std::tie(summary.n_sites_selected, summary.n_sites_total) =
+        process_all_sites(VERBOSE, sites_file, regions_lookup, out);

write_stats_output(summary, summary_file);
}
catch (const runtime_error &e) {
cerr << e.what() << endl;