Skip to content

Commit

Permalink
Merge pull request #234 from smithlabcode/xcounts-utils
Browse files Browse the repository at this point in the history
Adding xcounts_utils.cpp source file
  • Loading branch information
andrewdavidsmith committed Jun 23, 2024
2 parents 4c39184 + 3f3e770 commit 0bc95db
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 65 deletions.
2 changes: 2 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ libdnmtools_a_SOURCES = \
src/common/dnmtools_gaussinv.cpp \
src/common/bam_record_utils.cpp \
src/common/counts_header.cpp \
src/common/xcounts_utils.cpp \
src/common/BetaBin.cpp \
src/common/EmissionDistribution.cpp \
src/common/Epiread.cpp \
Expand All @@ -172,6 +173,7 @@ libdnmtools_a_SOURCES += \
src/common/dnmtools_gaussinv.hpp \
src/common/bam_record_utils.hpp \
src/common/counts_header.hpp \
src/common/xcounts_utils.hpp \
src/common/BetaBin.hpp \
src/common/EmissionDistribution.hpp \
src/common/Epiread.hpp \
Expand Down
77 changes: 12 additions & 65 deletions src/analysis/cpgbins.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include "MSite.hpp"
#include "OptionParser.hpp"
#include "bsutils.hpp"
#include "counts_header.hpp"
#include "xcounts_utils.hpp"
#include "smithlab_utils.hpp"

#include <bamxx.hpp>
Expand Down Expand Up @@ -121,69 +121,16 @@ get_chrom_names(const string &chrom_sizes_file) {
return chrom_names;
}

struct xsym_entry {
uint64_t pos{};
uint32_t n_meth{};
uint32_t n_unmeth{};
};

static unordered_map<string, vector<xsym_entry>>
read_xsym_by_chrom(const uint32_t n_threads, const string &xsym_file) {
bamxx::bam_tpool tp(n_threads);
bamxx::bgzf_file in(xsym_file, "r");
if (!in) throw runtime_error("failed to open input file");

// set the threads for the input file decompression
if (n_threads > 1 && in.is_bgzf()) tp.set_io(in);

kstring_t line{0, 0, nullptr};
string chrom_name;
uint64_t pos = 0;

unordered_map<string, vector<xsym_entry>> sites_by_chrom;

vector<xsym_entry> curr_chrom;

while (bamxx::getline(in, line)) {
if (is_counts_header_line(line.s)) continue; // ADS: early loop exit

if (!std::isdigit(line.s[0])) { // check if we have a chrom line
if (!chrom_name.empty()) {
sites_by_chrom.insert({chrom_name, curr_chrom});
curr_chrom.clear();
}
chrom_name = string{line.s};
pos = 0;
continue;
}

uint32_t pos_step = 0, n_meth = 0, n_unmeth = 0;
const auto end_line = line.s + line.l;
auto res = std::from_chars(line.s, end_line, pos_step);
res = std::from_chars(res.ptr + 1, end_line, n_meth);
res = std::from_chars(res.ptr + 1, end_line, n_unmeth);

const auto curr_pos = pos + pos_step;

curr_chrom.push_back({curr_pos, n_meth, n_unmeth});

pos = curr_pos;
}

if (!chrom_name.empty()) sites_by_chrom.insert({chrom_name, curr_chrom});

return sites_by_chrom;
}

void
update(LevelsCounter &lc, const xsym_entry &xse) {
const uint64_t n_reads = xse.n_meth + xse.n_unmeth;
static void
update(LevelsCounter &lc, const xcounts_entry &xce) {
const uint64_t n_reads = xce.n_meth + xce.n_unmeth;
if (n_reads > 0) {
++lc.sites_covered;
lc.max_depth = std::max(lc.max_depth, n_reads);
lc.total_c += xse.n_meth;
lc.total_t += xse.n_unmeth;
const auto meth = static_cast<double>(xse.n_unmeth) / n_reads;
lc.total_c += xce.n_meth;
lc.total_t += xce.n_unmeth;
const auto meth = static_cast<double>(xce.n_unmeth) / n_reads;
lc.total_meth += meth;
double lower = 0.0, upper = 0.0;
wilson_ci_for_binomial(lc.alpha, n_reads, meth, lower, upper);
Expand All @@ -196,7 +143,7 @@ update(LevelsCounter &lc, const xsym_entry &xse) {
static void
process_chrom(const bool report_more_info, const char level_code,
const string &chrom_name, const uint64_t chrom_size,
const uint64_t bin_size, const vector<xsym_entry> &sites,
const uint64_t bin_size, const vector<xcounts_entry> &sites,
ostream &out) {
GenomicRegion r(chrom_name, 0, 0, "CpG", 0.0, '+');

Expand Down Expand Up @@ -302,17 +249,17 @@ Columns (beyond the first 6) in the BED format output:
return EXIT_FAILURE;
}
const string chrom_sizes_file = leftover_args.front();
const string xsym_file = leftover_args.back();
const string xcounts_file = leftover_args.back();
/****************** END COMMAND LINE OPTIONS *****************/

if (!fs::is_regular_file(chrom_sizes_file))
throw runtime_error("chromosome sizes file not a regular file: " +
chrom_sizes_file);

if (!fs::is_regular_file(xsym_file))
throw runtime_error("xsym file not a regular file: " + xsym_file);
if (!fs::is_regular_file(xcounts_file))
throw runtime_error("xsym file not a regular file: " + xcounts_file);

const auto sites_by_chrom = read_xsym_by_chrom(n_threads, xsym_file);
const auto sites_by_chrom = read_xcounts_by_chrom(n_threads, xcounts_file);
const auto chrom_names = get_chrom_names(chrom_sizes_file);
const auto chrom_sizes = get_chrom_sizes(chrom_sizes_file);

Expand Down
130 changes: 130 additions & 0 deletions src/common/xcounts_utils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/* xcounts_utils: code for doing things with xcounts format and some
* for counts format that is common to several tools.
*
* Copyright (C) 2023-2024 Andrew D. Smith
*
* Authors: Andrew D. Smith
*
* This program is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/

#include "xcounts_utils.hpp"

#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cassert>
#include <cstdint>
#include <sstream>
#include <charconv>
#include <unordered_map>

#include "counts_header.hpp"
#include "bam_record_utils.hpp"

#include "bamxx.hpp"
#include "dnmt_error.hpp"

using std::vector;
using std::string;
using std::to_string;
using std::unordered_map;
using std::runtime_error;

using bamxx::bgzf_file;


// careful: this could get big
unordered_map<string, vector<xcounts_entry>>
read_xcounts_by_chrom(const uint32_t n_threads, const string &xcounts_file) {
bamxx::bam_tpool tp(n_threads);
bamxx::bgzf_file in(xcounts_file, "r");
if (!in) throw runtime_error("failed to open input file");

// set the threads for the input file decompression
if (n_threads > 1 && in.is_bgzf()) tp.set_io(in);

kstring_t line{0, 0, nullptr};
string chrom_name;
uint64_t pos = 0;

unordered_map<string, vector<xcounts_entry>> sites_by_chrom;

vector<xcounts_entry> curr_chrom;

while (bamxx::getline(in, line)) {
if (is_counts_header_line(line.s)) continue; // ADS: early loop exit

if (!std::isdigit(line.s[0])) { // check if we have a chrom line
if (!chrom_name.empty()) {
sites_by_chrom.insert({chrom_name, curr_chrom});
curr_chrom.clear();
}
chrom_name = string{line.s};
pos = 0;
continue;
}

uint32_t pos_step = 0, n_meth = 0, n_unmeth = 0;
const auto end_line = line.s + line.l;
auto res = std::from_chars(line.s, end_line, pos_step);
res = std::from_chars(res.ptr + 1, end_line, n_meth);
res = std::from_chars(res.ptr + 1, end_line, n_unmeth);

const auto curr_pos = pos + pos_step;

curr_chrom.push_back({curr_pos, n_meth, n_unmeth});

pos = curr_pos;
}

if (!chrom_name.empty()) sites_by_chrom.insert({chrom_name, curr_chrom});

return sites_by_chrom;
}


bool
get_is_xcounts_file(const std::string &filename) {
static constexpr auto max_lines_to_check = 1000ul;
bamxx::bgzf_file in(filename, "r");
if (!in) throw dnmt_error{"failed to open input file: " + filename};

kstring_t line{0, 0, nullptr};

bool found_header = false;
bool found_chrom = false;

auto line_count = 0ul;
while (line_count++ < max_lines_to_check && bamxx::getline(in, line)) {
if (is_counts_header_line(line.s)) {
found_header = true;
}
else if (!std::isdigit(line.s[0])) { // check if we have a chrom line
if (!found_header)
return false;
found_chrom = true;
}
else {
if (!found_chrom) return false;
int64_t pos_step = 0, n_meth = 0, n_unmeth = 0;
const auto end_line = line.s + line.l;
auto res = std::from_chars(line.s, end_line, pos_step);
if (res.ec != std::errc()) return false;
res = std::from_chars(res.ptr + 1, end_line, n_meth);
if (res.ec != std::errc()) return false;
res = std::from_chars(res.ptr + 1, end_line, n_unmeth);
if (res.ec != std::errc()) return false;
}
}
return true;
}
39 changes: 39 additions & 0 deletions src/common/xcounts_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/* xcounts_utils: code for doing things with xcounts format and some
* for counts format that is common to several tools.
*
* Copyright (C) 2023-2024 Andrew D. Smith
*
* Authors: Andrew D. Smith
*
* This program is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/

#ifndef XCOUNTS_UTILS_HPP
#define XCOUNTS_UTILS_HPP

#include <string>
#include <vector>
#include <cstdint>
#include <unordered_map>

struct xcounts_entry {
uint64_t pos{}; // absolute position
uint32_t n_meth{};
uint32_t n_unmeth{};
};

std::unordered_map<std::string, std::vector<xcounts_entry>>
read_xcounts_by_chrom(const uint32_t n_threads, const std::string &xcounts_file);

bool
get_is_xcounts_file(const std::string &filename);

#endif

0 comments on commit 0bc95db

Please sign in to comment.