From cfded6ebd0bf6425859fa9c2f1c41679378d1581 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 17 Jun 2024 15:25:38 -0700 Subject: [PATCH 1/4] src/analysis/cpgbins.cpp: adding a source file for the cpgbins command, which summarizes methylation at CpG sites in genomic bins tililng the genome --- src/analysis/cpgbins.cpp | 370 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 370 insertions(+) create mode 100644 src/analysis/cpgbins.cpp diff --git a/src/analysis/cpgbins.cpp b/src/analysis/cpgbins.cpp new file mode 100644 index 0000000..4d9ed55 --- /dev/null +++ b/src/analysis/cpgbins.cpp @@ -0,0 +1,370 @@ +/* cpgbins: average CpG methylation in genomic bins + * + * Copyright (C) 2024 Andrew D. Smith + * + * Authors: Andrew D. Smith and Masaru Nakajima + * + * This program is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include "GenomicRegion.hpp" +#include "LevelsCounter.hpp" +#include "MSite.hpp" +#include "OptionParser.hpp" +#include "bsutils.hpp" +#include "counts_header.hpp" +#include "smithlab_utils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using std::cerr; +using std::cout; +using std::distance; +using std::endl; +using std::ifstream; +using std::map; +using std::ostream; +using std::pair; +using std::runtime_error; +using std::size; +using std::string; +using std::to_string; +using std::unordered_map; +using std::vector; + +using bamxx::bgzf_file; + +namespace fs = std::filesystem; + +static string +format_levels_counter(const LevelsCounter &lc) { + // ... + // (7) weighted mean methylation + // (8) unweighted mean methylation + // (9) fractional methylation + // (10) number of sites in the region + // (11) number of sites covered at least once + // (12) number of observations in reads indicating methylation + // (13) total number of observations from reads in the region + std::ostringstream oss; + // clang-format off + oss << lc.mean_meth_weighted() << '\t' + << lc.mean_meth() << '\t' + << lc.fractional_meth() << '\t' + << lc.total_sites << '\t' + << lc.sites_covered << '\t' + << lc.total_c << '\t' + << (lc.total_c + lc.total_t); + // clang-format on + return oss.str(); +} + +inline auto +getline(bgzf_file &file, kstring_t &line) -> bgzf_file & { + if (file.f == nullptr) return file; + const int x = bgzf_getline(file.f, '\n', &line); + if (x == -1) { + file.destroy(); + free(line.s); + line = {0, 0, nullptr}; + } + if (x < -1) { + // ADS: this is an error condition and should be handled + // differently from the EOF above. + file.destroy(); + free(line.s); + line = {0, 0, nullptr}; + throw runtime_error{"failed reading bgzf file"}; + } + return file; +} + +static unordered_map +get_chrom_sizes(const string &chrom_sizes_file) { + unordered_map chrom_sizes; + + ifstream in(chrom_sizes_file); + if (!in) throw runtime_error("failed to open file: " + chrom_sizes_file); + + string line; + while (getline(in, line)) { + std::istringstream iss(line); + string chrom_name{}; + uint64_t chrom_size{}; + if (!(iss >> chrom_name >> chrom_size)) + throw runtime_error("bad line in " + chrom_sizes_file + ":\n" + line); + string dummy; + if (iss >> dummy) throw runtime_error("too many columns: " + line); + + if (chrom_sizes.find(chrom_name) != cend(chrom_sizes)) + throw runtime_error("repeated entry " + chrom_name + " in " + + chrom_sizes_file); + + chrom_sizes[chrom_name] = chrom_size; + } + return chrom_sizes; +} + +static vector +get_chrom_names(const string &chrom_sizes_file) { + ifstream in(chrom_sizes_file); + if (!in) throw runtime_error("failed to open file: " + chrom_sizes_file); + + vector chrom_names; + + string line; + while (getline(in, line)) { + std::istringstream iss(line); + string chrom_name{}; + if (!(iss >> chrom_name)) + throw runtime_error("bad line in " + chrom_sizes_file + ":\n" + line); + + chrom_names.push_back(chrom_name); + } + return chrom_names; +} + +struct xsym_entry { + uint64_t pos{}; + uint32_t n_meth{}; + uint32_t n_unmeth{}; +}; + +static unordered_map> +read_xsym_by_chrom(const size_t n_threads, const string &xsym_file) { + bamxx::bam_tpool tp(n_threads); + bamxx::bgzf_file in(xsym_file, "r"); + if (!in) throw runtime_error("failed to open input file"); + + // set the threads for the input file decompression + if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); + + kstring_t line{0, 0, nullptr}; + string chrom_name; + uint64_t pos = 0; + + unordered_map> sites_by_chrom; + + vector curr_chrom; + + while (getline(in, line)) { + if (is_counts_header_line(line.s)) continue; // ADS: early loop exit + + if (!std::isdigit(line.s[0])) { // check if we have a chrom line + if (!chrom_name.empty()) { + sites_by_chrom.insert({chrom_name, curr_chrom}); + curr_chrom.clear(); + } + chrom_name = string{line.s}; + pos = 0; + continue; + } + + uint32_t pos_step = 0, n_meth = 0, n_unmeth = 0; + const auto end_line = line.s + line.l; + auto res = std::from_chars(line.s, end_line, pos_step); + res = std::from_chars(res.ptr + 1, end_line, n_meth); + res = std::from_chars(res.ptr + 1, end_line, n_unmeth); + + const auto curr_pos = pos + pos_step; + + curr_chrom.push_back({curr_pos, n_meth, n_unmeth}); + + pos = curr_pos; + } + + if (!chrom_name.empty()) sites_by_chrom.insert({chrom_name, curr_chrom}); + + return sites_by_chrom; +} + +void +update(LevelsCounter &lc, const xsym_entry &xse) { + const uint64_t n_reads = xse.n_meth + xse.n_unmeth; + if (n_reads > 0) { + ++lc.sites_covered; + lc.max_depth = std::max(lc.max_depth, n_reads); + lc.total_c += xse.n_meth; + lc.total_t += xse.n_unmeth; + const auto meth = static_cast(xse.n_unmeth) / n_reads; + lc.total_meth += meth; + double lower = 0.0, upper = 0.0; + wilson_ci_for_binomial(lc.alpha, n_reads, meth, lower, upper); + lc.called_meth += (lower > 0.5); + lc.called_unmeth += (upper < 0.5); + } + ++lc.total_sites; +} + +static void +process_chrom(const bool report_more_info, const char level_code, + const string &chrom_name, const uint64_t chrom_size, + const uint64_t bin_size, const vector &sites, + ostream &out) { + GenomicRegion r(chrom_name, 0, 0, "CpG", 0.0, '+'); + + size_t j = 0; + for (auto i = 0ul; i < chrom_size; i += bin_size) { + while (j < size(sites) && sites[j].pos < i) ++j; + + LevelsCounter lc; + while (j < size(sites) && sites[j].pos < i + bin_size) + update(lc, sites[j++]); + + const double score = + level_code == 'w' + ? lc.mean_meth_weighted() + : (level_code == 'u' ? lc.mean_meth() : lc.fractional_meth()); + r.set_start(i); + r.set_end(std::min(i + bin_size, chrom_size)); + r.set_score(score); + out << r; + if (report_more_info) out << '\t' << format_levels_counter(lc); + out << '\n'; + } +} + +static void +process_chrom(const bool report_more_info, const string &chrom_name, + const uint64_t chrom_size, const uint64_t bin_size, + ostream &out) { + GenomicRegion r(chrom_name, 0, 0, "CpG", 0.0, '+'); + LevelsCounter lc; + for (auto i = 0ul; i < chrom_size; i += bin_size) { + r.set_start(i); + r.set_end(std::min(i + bin_size, chrom_size)); + out << r; + if (report_more_info) out << '\t' << format_levels_counter(lc); + out << '\n'; + } +} + +int +main_cpgbins(int argc, const char **argv) { + try { + static const string description = R"""( +Compute average site methylation levels in each non-overlapping +genomic bin of the specified size. The 5th column (the "score" column +in BED format) is determined by the '-l' or '-level' argument. + +Columns (beyond the first 6) in the BED format output: +(7) weighted mean methylation +(8) unweighted mean methylation +(9) fractional methylation +(10) number of sites in the region +(11) number of sites covered at least once +(12) number of observations in reads indicating methylation +(13) total number of observations from reads in the region +)"""; + + static const string default_name_prefix = "X"; + + bool verbose = false; + bool print_numeric_only = false; + bool report_more_info = false; + size_t n_threads = 1; + + uint64_t bin_size = 1000; + + string level_code = "w"; + + string outfile; + + /****************** COMMAND LINE OPTIONS ********************/ + OptionParser opt_parse(strip_path(argv[0]), description, + " "); + opt_parse.set_show_defaults(); + opt_parse.add_opt("output", 'o', "name of output file (default: stdout)", + false, outfile); + opt_parse.add_opt("bin", 'b', "bin size in base pairs", false, bin_size); + opt_parse.add_opt("numeric", 'N', "print numeric values only (not NAs)", + false, print_numeric_only); + opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); + opt_parse.add_opt("level", 'l', + "the level to report as score column " + "in bed format output (w, u or f)", + false, level_code); + opt_parse.add_opt("more-levels", 'M', "report more methylation information", + false, report_more_info); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); + vector leftover_args; + opt_parse.parse(argc, argv, leftover_args); + if (argc == 1 || opt_parse.help_requested() || + opt_parse.about_requested()) { + cerr << opt_parse.help_message() << opt_parse.about_message_raw() << endl; + return EXIT_SUCCESS; + } + if (opt_parse.option_missing()) { + cerr << opt_parse.option_missing_message() << endl; + return EXIT_FAILURE; + } + if (leftover_args.size() != 2) { + cerr << opt_parse.help_message() << endl; + return EXIT_FAILURE; + } + if (level_code != "w" && level_code != "u" && level_code != "f") { + cerr << "selected level must be in {w, u, f}" << endl; + return EXIT_FAILURE; + } + const string chrom_sizes_file = leftover_args.front(); + const string xsym_file = leftover_args.back(); + /****************** END COMMAND LINE OPTIONS *****************/ + + if (!fs::is_regular_file(chrom_sizes_file)) + throw runtime_error("chromosome sizes file not a regular file: " + + chrom_sizes_file); + + if (!fs::is_regular_file(xsym_file)) + throw runtime_error("xsym file not a regular file: " + xsym_file); + + const auto sites_by_chrom = read_xsym_by_chrom(n_threads, xsym_file); + const auto chrom_names = get_chrom_names(chrom_sizes_file); + const auto chrom_sizes = get_chrom_sizes(chrom_sizes_file); + + std::ofstream of; + if (!outfile.empty()) { + of.open(outfile); + if (!of) throw runtime_error("failed to open outfile: " + outfile); + } + std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); + + for (const auto &chrom_name : chrom_names) { + const auto sites = sites_by_chrom.find(chrom_name); + const auto chrom_size = chrom_sizes.find(chrom_name); + if (chrom_size == cend(chrom_sizes)) + throw runtime_error("failed chrom size lookup"); + if (sites != cend(sites_by_chrom)) + process_chrom(report_more_info, level_code[0], chrom_name, + chrom_size->second, bin_size, sites->second, out); + else + process_chrom(report_more_info, chrom_name, chrom_size->second, + bin_size, out); + } + } + catch (const std::exception &e) { + cerr << e.what() << endl; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} From fe1ca6885d691c5ec62416d09c04956d62648659 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 17 Jun 2024 15:25:47 -0700 Subject: [PATCH 2/4] Makefile.am: adding source file for cpgbins command to summarize methylation at CpG sites in genomic bins tiling the genome --- Makefile.am | 1 + src/dnmtools.cpp | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 5538c92..fe5e1e1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -205,6 +205,7 @@ dnmtools_SOURCES += src/analysis/bsrate.cpp dnmtools_SOURCES += src/analysis/methentropy.cpp dnmtools_SOURCES += src/analysis/methcounts.cpp dnmtools_SOURCES += src/analysis/roimethstat.cpp +dnmtools_SOURCES += src/analysis/cpgbins.cpp dnmtools_SOURCES += src/analysis/multimethstat.cpp dnmtools_SOURCES += src/analysis/hmr.cpp dnmtools_SOURCES += src/analysis/hmr-rep.cpp diff --git a/src/dnmtools.cpp b/src/dnmtools.cpp index 02cb9ca..49b8df2 100644 --- a/src/dnmtools.cpp +++ b/src/dnmtools.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2022-2023 Andrew D. Smith and Guilherme Sena +/* Copyright (C) 2022-2024 Andrew D. Smith and Guilherme Sena * * Authors: Andrew D. Smith and Guilherme Sena * @@ -90,6 +90,8 @@ main_pmd(int argc, const char **argv); int main_roimethstat(int argc, const char **argv); int +main_cpgbins(int argc, const char **argv); +int main_mlml(int argc, const char **argv); int main_dmr(int argc, const char **argv); @@ -175,6 +177,7 @@ main(int argc, const char **argv) { {"entropy", "compute methylation entropy in sliding window", main_methentropy}, {"pmd", "identify partially methylated domains", main_pmd}, {"roi", "get average CpG methylation in each of a set of genomic interval", main_roimethstat}, + {"cpgbins", "get average CpG methylation in genomic bins", main_cpgbins}, {"multistat", "same as roi except for multiple samples/input files", main_multimethstat}, {"mlml", "program to estimate hydroxymethylation levels", main_mlml}}}}, From 09fc4e987a33ab5e682841b44538d0c90541167f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 17 Jun 2024 15:28:39 -0700 Subject: [PATCH 3/4] src/analysis/cpgbins.cpp: cleaning up unused headers and usings --- src/analysis/cpgbins.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/analysis/cpgbins.cpp b/src/analysis/cpgbins.cpp index 4d9ed55..51e8762 100644 --- a/src/analysis/cpgbins.cpp +++ b/src/analysis/cpgbins.cpp @@ -31,11 +31,9 @@ #include #include #include -#include #include #include #include -#include #include using std::cerr; @@ -43,13 +41,10 @@ using std::cout; using std::distance; using std::endl; using std::ifstream; -using std::map; using std::ostream; -using std::pair; using std::runtime_error; using std::size; using std::string; -using std::to_string; using std::unordered_map; using std::vector; From 473f37a244ca9c931e2deeda77e37eba6622b9a0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 17 Jun 2024 15:47:58 -0700 Subject: [PATCH 4/4] src/analysis/cpgbins.cpp: adding functionality to report denominator of score column in the name column of the bed format output --- src/analysis/cpgbins.cpp | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/analysis/cpgbins.cpp b/src/analysis/cpgbins.cpp index 51e8762..97be547 100644 --- a/src/analysis/cpgbins.cpp +++ b/src/analysis/cpgbins.cpp @@ -147,7 +147,7 @@ struct xsym_entry { }; static unordered_map> -read_xsym_by_chrom(const size_t n_threads, const string &xsym_file) { +read_xsym_by_chrom(const uint32_t n_threads, const string &xsym_file) { bamxx::bam_tpool tp(n_threads); bamxx::bgzf_file in(xsym_file, "r"); if (!in) throw runtime_error("failed to open input file"); @@ -219,7 +219,7 @@ process_chrom(const bool report_more_info, const char level_code, ostream &out) { GenomicRegion r(chrom_name, 0, 0, "CpG", 0.0, '+'); - size_t j = 0; + uint64_t j = 0; for (auto i = 0ul; i < chrom_size; i += bin_size) { while (j < size(sites) && sites[j].pos < i) ++j; @@ -227,13 +227,16 @@ process_chrom(const bool report_more_info, const char level_code, while (j < size(sites) && sites[j].pos < i + bin_size) update(lc, sites[j++]); - const double score = - level_code == 'w' - ? lc.mean_meth_weighted() - : (level_code == 'u' ? lc.mean_meth() : lc.fractional_meth()); r.set_start(i); r.set_end(std::min(i + bin_size, chrom_size)); - r.set_score(score); + r.set_score(level_code == 'w' ? lc.mean_meth_weighted() + : (level_code == 'u' ? lc.mean_meth() + : lc.fractional_meth())); + r.set_name("CpG_" + + std::to_string((level_code == 'w' + ? lc.coverage() + : (level_code == 'u' ? lc.sites_covered + : lc.total_called())))); out << r; if (report_more_info) out << '\t' << format_levels_counter(lc); out << '\n'; @@ -244,13 +247,14 @@ static void process_chrom(const bool report_more_info, const string &chrom_name, const uint64_t chrom_size, const uint64_t bin_size, ostream &out) { - GenomicRegion r(chrom_name, 0, 0, "CpG", 0.0, '+'); + GenomicRegion r(chrom_name, 0, 0, "CpG_0", 0.0, '+'); LevelsCounter lc; + const string lc_formatted = format_levels_counter(lc); for (auto i = 0ul; i < chrom_size; i += bin_size) { r.set_start(i); r.set_end(std::min(i + bin_size, chrom_size)); out << r; - if (report_more_info) out << '\t' << format_levels_counter(lc); + if (report_more_info) out << '\t' << lc_formatted; out << '\n'; } } @@ -276,14 +280,10 @@ Columns (beyond the first 6) in the BED format output: static const string default_name_prefix = "X"; bool verbose = false; - bool print_numeric_only = false; bool report_more_info = false; - size_t n_threads = 1; - + uint32_t n_threads = 1; uint64_t bin_size = 1000; - string level_code = "w"; - string outfile; /****************** COMMAND LINE OPTIONS ********************/ @@ -293,8 +293,6 @@ Columns (beyond the first 6) in the BED format output: opt_parse.add_opt("output", 'o', "name of output file (default: stdout)", false, outfile); opt_parse.add_opt("bin", 'b', "bin size in base pairs", false, bin_size); - opt_parse.add_opt("numeric", 'N', "print numeric values only (not NAs)", - false, print_numeric_only); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); opt_parse.add_opt("level", 'l', "the level to report as score column "