diff --git a/src/FalcoConfig.cpp b/src/FalcoConfig.cpp index b83c306..9959d3b 100644 --- a/src/FalcoConfig.cpp +++ b/src/FalcoConfig.cpp @@ -826,7 +826,7 @@ const string FalcoConfig::html_template = " {{adaptercontentce}}" "" " {{kmercontentcs}}" -"
  • {{kmercontentname}}
  • " +"
  • {{kmercontentname}}
  • " " {{kmercontentce}}" "" "" @@ -932,10 +932,10 @@ const string FalcoConfig::html_template = "" "{{kmercontentcs}}" "
    " -"

    " +"

    " " {{kmercontentname}} : {{passkmercontent}}" "

    " -" {{kmercontentdata}}" +"
    " "
    " "{{kmercontentce}}" "" @@ -1052,5 +1052,16 @@ const string FalcoConfig::html_template = " yaxis : {title : '% sequences with adapter before position'}" " } );" "}" +"if (document.getElementById('kmerlineplot') !== null) {" +" Plotly.newPlot('kmerlineplot', [" +" {{kmercontentdata}}" +" ], {" +" margin: { t: 0 }, " +" showlegend: true," +" xaxis : {title : 'Base position'}," +" yaxis : {title : 'log2(obs/ exp max)'}" +" } );" +"}" + "" ""; diff --git a/src/FalcoConfig.hpp b/src/FalcoConfig.hpp index 46fbca9..87b10e1 100644 --- a/src/FalcoConfig.hpp +++ b/src/FalcoConfig.hpp @@ -166,10 +166,10 @@ namespace Constants { const size_t bit_shift_adapter = log2exact(max_adapters); // we shift 14 bits when reading a kmer, two bits per base - const size_t bit_shift_kmer = 2 * Constants::kmer_size; + const size_t bit_shift_kmer = bit_shift_base*kmer_size; // mask to get only the first 2*k bits of the sliding window - const size_t kmer_mask = (1ll << (2*Constants::kmer_size)) - 1; + const size_t kmer_mask = (1ull << (bit_shift_kmer)) - 1; }; #endif diff --git a/src/FastqStats.cpp b/src/FastqStats.cpp index 94212e2..0fad09c 100644 --- a/src/FastqStats.cpp +++ b/src/FastqStats.cpp @@ -69,7 +69,7 @@ FastqStats::FastqStats() { position_quality_count.fill(0); pos_kmer_count.fill(0); pos_adapter_count.fill(0); - kmer_count = vector(kNumBases*(kmer_mask + 1), 0); + kmer_count = vector(kNumBases*(Constants::kmer_mask + 1), 0); } // Initialize as many gc models as fast bases diff --git a/src/FastqStats.hpp b/src/FastqStats.hpp index 209e23d..a2a100f 100644 --- a/src/FastqStats.hpp +++ b/src/FastqStats.hpp @@ -145,13 +145,6 @@ struct FastqStats { static const size_t kBitShiftNucleotide = log2exact(kNumNucleotides); static const size_t kBitShiftQuality = log2exact(kNumQualityValues); - /************ KMER CONSTANTS **********/ - - // we shift 14 bits when reading a kmer, two bits per base - static const size_t kBitShiftKmer = 2 * Constants::kmer_size; - - // mask to get only the first 2*k bits of the sliding window - static const size_t kmer_mask = (1ll << (2*Constants::kmer_size)) - 1; /************ ADAPTER CONSTANTS **********/ // bit shift for adapters, log(100) = 7 diff --git a/src/Module.cpp b/src/Module.cpp index 0f5b2fc..957de63 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -1985,7 +1985,7 @@ ModuleKmerContent::summarize_module(FastqStats &stats) { for (size_t kmer = 0; kmer < num_kmers; ++kmer) { for (size_t i = kmer_size - 1; i < num_kmer_bases; ++i) { observed_count = stats.kmer_count[ - (i << FastqStats::kBitShiftKmer) | kmer + (i << Constants::bit_shift_kmer) | kmer ]; total_kmer_counts[kmer] += observed_count; } @@ -1995,40 +1995,51 @@ ModuleKmerContent::summarize_module(FastqStats &stats) { double dividend = static_cast(num_seen_kmers); for (size_t kmer = 0; kmer < num_kmers; ++kmer) { for (size_t i = kmer_size - 1; i < num_kmer_bases; ++i) { - observed_count = stats.kmer_count[ - (i << FastqStats::kBitShiftKmer) | kmer - ]; + observed_count = + stats.kmer_count[(i << Constants::bit_shift_kmer) | kmer]; + expected_count = pos_kmer_count[i] / dividend; obs_exp_ratio = observed_count / expected_count; if (i == 0 || obs_exp_ratio > obs_exp_max[kmer]) { obs_exp_max[kmer] = obs_exp_ratio; - where_obs_exp_is_max[kmer] = i; + where_obs_exp_is_max[kmer] = i + 1 - kmer_size; } } - if (obs_exp_max[kmer] > 5) { + if (obs_exp_max[kmer] > MIN_OBS_EXP_TO_REPORT) { kmers_to_report.push_back(make_pair(kmer, obs_exp_max[kmer])); } } sort (begin(kmers_to_report), end(kmers_to_report), - [](pair &a, pair &b) { + [](const pair &a, const pair &b) { return a.second > b.second; }); - } void ModuleKmerContent::make_grade() { - grade = "fail"; + const size_t lim = min(kmers_to_report.size(), MAX_KMERS_TO_REPORT); + grade = "pass"; + + // the worst kmer is at the top + if (lim > 0) { + const size_t kmer = kmers_to_report[0].first; + const double obs_exp = obs_exp_max[kmer]; + if (obs_exp >= grade_error) + grade = "fail"; + else if (obs_exp >= grade_warn) + grade = "warn"; + } } void ModuleKmerContent::write_module(ostream &os) { os << "#Sequence\tCount\tPValue\tObs/Exp Max\tMax Obs/Exp Position\n"; - for (size_t i = 0; i < 20 && i < kmers_to_report.size(); ++i) { - size_t kmer = kmers_to_report[i].first; + const size_t lim = min(kmers_to_report.size(), MAX_KMERS_TO_REPORT); + for (size_t i = 0; i < lim; ++i) { + const size_t kmer = kmers_to_report[i].first; os << size_t_to_seq(kmer, kmer_size) << "\t" << total_kmer_counts[kmer] << "\t" << "0.0" << "\t" @@ -2039,5 +2050,44 @@ ModuleKmerContent::write_module(ostream &os) { string ModuleKmerContent::make_html_data() { - return "K-mer content module currently not implemented!"; + bool seen_first = false; + ostringstream data; + const size_t lim = min(kmers_to_report.size(), MAX_KMERS_TO_PLOT); + + // get xlim to plot: whatever the largest position with some + // reported k-mer is + size_t xlim = 0; + for (size_t i = 0; i < lim; ++i) + xlim = max(xlim, where_obs_exp_is_max[kmers_to_report[i].first]); + + for (size_t i = 0; i < lim; ++i) { + const size_t kmer = kmers_to_report[i].first; + const double log_obs_exp = log(kmers_to_report[i].second)/log(2); + if (!seen_first) + seen_first = true; + else + data << ","; + data << "{"; + + // X values : read position + data << "x : ["; + for (size_t j = 0; j < xlim; ++j) { + data << j+1; + if (j < xlim - 1) data << ","; + } + data << "]"; + + // Y values : A peak wherever the k-mer is seen the most + data << ", y : ["; + for (size_t j = 0; j < xlim; ++j) { + data << ((j == (where_obs_exp_is_max[kmer])) ? (log_obs_exp) : 0); + if (i < xlim - 1) + data << ","; + } + + data << "]"; + data << ", type : 'line', "; + data << "name : \"" << size_t_to_seq(kmer, Constants::kmer_size) << "\"}"; + } + return data.str(); } diff --git a/src/Module.hpp b/src/Module.hpp index 9442199..ce3cf1c 100644 --- a/src/Module.hpp +++ b/src/Module.hpp @@ -377,6 +377,9 @@ class ModuleKmerContent : public Module { std::vector> kmers_to_report; public: static const std::string module_name; + static const size_t MIN_OBS_EXP_TO_REPORT = 5; + static const size_t MAX_KMERS_TO_REPORT = 20; + static const size_t MAX_KMERS_TO_PLOT = 10; ModuleKmerContent(const FalcoConfig &config); ~ModuleKmerContent(){} void summarize_module(FastqStats &stats); diff --git a/src/StreamReader.cpp b/src/StreamReader.cpp index 094af24..9eddca2 100644 --- a/src/StreamReader.cpp +++ b/src/StreamReader.cpp @@ -256,10 +256,10 @@ StreamReader::process_sequence_base_from_buffer(FastqStats &stats) { cur_kmer = ((cur_kmer << Constants::bit_shift_base) | base_ind); // registers k-mer if seen at least k nucleotides since the last n - if (do_kmer && (num_bases_after_n == Constants::kmer_size)) { + if (do_kmer && (num_bases_after_n >= Constants::kmer_size)) { stats.kmer_count[(read_pos << Constants::bit_shift_kmer) - | (cur_kmer & Constants::bit_shift_kmer)]++; + | (cur_kmer & Constants::kmer_mask)]++; stats.pos_kmer_count[read_pos]++; }