Skip to content

Commit

Permalink
Merge pull request #220 from smithlabcode/levels-ignore-header
Browse files Browse the repository at this point in the history
Skip counts file header in levels command
  • Loading branch information
andrewdavidsmith committed Jun 3, 2024
2 parents 8cc8a65 + 04dc969 commit 43da2b1
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 40 deletions.
2 changes: 1 addition & 1 deletion src/analysis/hmr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ load_cpgs(const string &cpgs_file, vector<MSite> &cpgs,
bgzf_file in(cpgs_file, "r");
if (!in) throw runtime_error("failed opening file: " + cpgs_file);

if (has_counts_header(cpgs_file))
if (get_has_counts_header(cpgs_file))
skip_counts_header(in);

MSite prev_site, the_site;
Expand Down
73 changes: 42 additions & 31 deletions src/analysis/levels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@
* General Public License for more details.
*/

#include "LevelsCounter.hpp"
#include "MSite.hpp"
#include "OptionParser.hpp"
#include "bsutils.hpp"
#include "counts_header.hpp"
#include "smithlab_os.hpp"
#include "smithlab_utils.hpp"

#include <bamxx.hpp>

#include <cmath>
#include <exception>
#include <fstream>
Expand All @@ -36,20 +46,11 @@
#include <string>
#include <vector>

#include <bamxx.hpp>
#include "LevelsCounter.hpp"
#include "MSite.hpp"
#include "OptionParser.hpp"
#include "bsutils.hpp"
#include "smithlab_os.hpp"
#include "smithlab_utils.hpp"

using std::cerr;
using std::cout;
using std::endl;
using std::ios_base;
using std::runtime_error;
using std::string;
using std::to_string;
using std::vector;

using bamxx::bgzf_file;
Expand All @@ -58,14 +59,17 @@ enum class counts_file_format { ordinary, asym_cpg, sym_cpg };

static counts_file_format
guess_counts_file_format(const string &filename) {
static const size_t n_lines_to_check = 10000;
static const uint64_t n_lines_to_check = 10000;

const bool has_counts_header = get_has_counts_header(filename);
bgzf_file in(filename, "r");
if (!in) throw std::runtime_error("bad input file: " + filename);
if (!in) throw ios_base::failure{"bad input file: " + filename};
if (has_counts_header)
skip_counts_header(in);

bool found_non_cpg = false, found_asym_cpg = false;
MSite curr_site, prev_site;
size_t line_count = 0;
uint64_t line_count = 0;

while (read_site(in, curr_site) && (!found_non_cpg || !found_asym_cpg) &&
line_count++ < n_lines_to_check) {
Expand All @@ -81,7 +85,7 @@ guess_counts_file_format(const string &filename) {
int
main_levels(int argc, const char **argv) {
try {
bool VERBOSE = false;
bool verbose = false;
bool relaxed_mode = false;
string outfile;

Expand All @@ -97,7 +101,7 @@ main_levels(int argc, const char **argv) {
opt_parse.add_opt("relaxed", '\0',
"run on data that appears to have sites filtered", false,
relaxed_mode);
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose);
vector<string> leftover_args;
opt_parse.parse(argc, argv, leftover_args);
if (opt_parse.help_requested()) {
Expand All @@ -121,22 +125,25 @@ main_levels(int argc, const char **argv) {
/****************** END COMMAND LINE OPTIONS *****************/

if (!is_msite_file(meth_file))
throw runtime_error("malformed counts file: " + meth_file);
throw runtime_error{"malformed counts file: " + meth_file};

const counts_file_format guessed_format =
guess_counts_file_format(meth_file);
if (guessed_format != counts_file_format::ordinary) {
if (VERBOSE)
if (verbose)
cerr << "input might be only CpG sites ("
<< (guessed_format == counts_file_format::asym_cpg ? "not " : "")
<< "symmetric)" << endl;
if (!relaxed_mode)
throw runtime_error(
"unexpected input format (consider using \"-relaxed\")");
throw runtime_error{
"unexpected input format (consider using -relaxed)"};
}

const bool has_counts_header = get_has_counts_header(meth_file);
bgzf_file in(meth_file, "r");
if (!in) throw std::runtime_error("bad input file: " + meth_file);
if (has_counts_header)
skip_counts_header(in);

LevelsCounter cpg("cpg");
LevelsCounter cpg_symmetric("cpg_symmetric");
Expand All @@ -149,7 +156,7 @@ main_levels(int argc, const char **argv) {

while (read_site(in, site)) {
if (site.chrom != prev_site.chrom)
if (VERBOSE) cerr << "processing " << site.chrom << endl;
if (verbose) cerr << "processing " << site.chrom << endl;

cytosines.update(site);

Expand All @@ -160,27 +167,31 @@ main_levels(int argc, const char **argv) {
cpg_symmetric.update(site);
}
}
else if (site.is_chh()) chh.update(site);
else if (site.is_ccg()) ccg.update(site);
else if (site.is_cxg()) cxg.update(site);
else throw runtime_error("bad site context: " + site.context);
else if (site.is_chh())
chh.update(site);
else if (site.is_ccg())
ccg.update(site);
else if (site.is_cxg())
cxg.update(site);
else
throw runtime_error{"bad site context: " + site.context};

prev_site = site;
}

std::ofstream of;
if (!outfile.empty()) {
of.open(outfile);
if (!of) throw runtime_error("bad output file: " + outfile);
if (!of) throw ios_base::failure{"bad output file: " + outfile};
}
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());

out << cytosines << endl
<< cpg << endl
<< cpg_symmetric << endl
<< chh << endl
<< ccg << endl
<< cxg << endl;
out << cytosines << '\n'
<< cpg << '\n'
<< cpg_symmetric << '\n'
<< chh << '\n'
<< ccg << '\n'
<< cxg << '\n';
}
catch (const std::exception &e) {
cerr << e.what() << endl;
Expand Down
12 changes: 6 additions & 6 deletions src/analysis/pmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ get_optimized_boundary_likelihoods(const vector<string> &cpgs_file,
vector<bgzf_file*> in(cpgs_file.size());
for (size_t i = 0; i < cpgs_file.size(); ++i) {
in[i] = new bgzf_file(cpgs_file[i], "r");
if (has_counts_header(cpgs_file[i]))
if (get_has_counts_header(cpgs_file[i]))
skip_counts_header(*in[i]);
}

Expand Down Expand Up @@ -350,7 +350,7 @@ find_exact_boundaries(const vector<string> &cpgs_file,
vector<bgzf_file*> in(cpgs_file.size());
for (size_t i = 0; i < cpgs_file.size(); ++i) {
in[i] = new bgzf_file(cpgs_file[i], "r");
if (has_counts_header(cpgs_file[i]))
if (get_has_counts_header(cpgs_file[i]))
skip_counts_header(*in[i]);
}

Expand Down Expand Up @@ -768,7 +768,7 @@ check_if_array_data(const string &infile) {
bgzf_file in(infile, "r");
if (!in) throw std::runtime_error("bad file: " + infile);

if (has_counts_header(infile))
if (get_has_counts_header(infile))
skip_counts_header(in);

std::string line;
Expand All @@ -792,7 +792,7 @@ load_array_data(const size_t bin_size,
bgzf_file in(cpgs_file, "r");
if (!in) throw std::runtime_error("bad sites file: " + cpgs_file);

if (has_counts_header(cpgs_file))
if (get_has_counts_header(cpgs_file))
skip_counts_header(in);

string curr_chrom;
Expand Down Expand Up @@ -888,7 +888,7 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file,
bgzf_file in(cpgs_file, "r");
if (!in) throw runtime_error("bad sites file: " + cpgs_file);

if (has_counts_header(cpgs_file))
if (get_has_counts_header(cpgs_file))
skip_counts_header(in);

// keep track of the chroms we've seen
Expand Down Expand Up @@ -966,7 +966,7 @@ load_read_counts(const string &cpgs_file, const size_t bin_size,
bgzf_file in(cpgs_file, "r");
if (!in) throw runtime_error("bad methcounts file: " + cpgs_file);

if (has_counts_header(cpgs_file))
if (get_has_counts_header(cpgs_file))
skip_counts_header(in);

// keep track of where we are and what we've seen
Expand Down
2 changes: 1 addition & 1 deletion src/common/counts_header.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ write_counts_header_line(string line, bgzf_file &out) {
}

bool
has_counts_header(const string &filename) {
get_has_counts_header(const string &filename) {
bgzf_file in(filename, "r");
if (!in) return false;
string line;
Expand Down
2 changes: 1 addition & 1 deletion src/common/counts_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ bamxx::bgzf_file &
skip_counts_header(bamxx::bgzf_file &in);

bool
has_counts_header(const std::string &filename);
get_has_counts_header(const std::string &filename);

inline bool
is_counts_header_version_line(const std::string &line) {
Expand Down

0 comments on commit 43da2b1

Please sign in to comment.