Skip to content

Commit

Permalink
implemented k-mer content, no p-values yet
Browse files Browse the repository at this point in the history
  • Loading branch information
guilhermesena1 committed Sep 9, 2021
1 parent 40e437e commit e115b55
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 27 deletions.
17 changes: 14 additions & 3 deletions src/FalcoConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,7 @@ const string FalcoConfig::html_template =
" {{adaptercontentce}}"
""
" {{kmercontentcs}}"
" <li><a class=\"{{passkmercontent}}\" href=\"#adaptercontent\">{{kmercontentname}}</a></li>"
" <li><a class=\"{{passkmercontent}}\" href=\"#kmercontent\">{{kmercontentname}}</a></li>"
" {{kmercontentce}}"
""
""
Expand Down Expand Up @@ -932,10 +932,10 @@ const string FalcoConfig::html_template =
""
"{{kmercontentcs}}"
"<div class=\"module\">"
" <h2 class=\"{{passkmercontent}}\" id=\"adaptercontent\">"
" <h2 class=\"{{passkmercontent}}\" id=\"kmercontent\">"
" {{kmercontentname}} : {{passkmercontent}}"
" </h2>"
" {{kmercontentdata}}"
" <div id=\"kmerlineplot\"></div>"
"</div>"
"{{kmercontentce}}"
""
Expand Down Expand Up @@ -1052,5 +1052,16 @@ const string FalcoConfig::html_template =
" yaxis : {title : '% sequences with adapter before position'}"
" } );"
"}"
"if (document.getElementById('kmerlineplot') !== null) {"
" Plotly.newPlot('kmerlineplot', ["
" {{kmercontentdata}}"
" ], {"
" margin: { t: 0 }, "
" showlegend: true,"
" xaxis : {title : 'Base position'},"
" yaxis : {title : 'log2(obs/ exp max)'}"
" } );"
"}"

"</script>"
"</html>";
4 changes: 2 additions & 2 deletions src/FalcoConfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,10 @@ namespace Constants {
const size_t bit_shift_adapter = log2exact(max_adapters);

// we shift 14 bits when reading a kmer, two bits per base
const size_t bit_shift_kmer = 2 * Constants::kmer_size;
const size_t bit_shift_kmer = bit_shift_base*kmer_size;

// mask to get only the first 2*k bits of the sliding window
const size_t kmer_mask = (1ll << (2*Constants::kmer_size)) - 1;
const size_t kmer_mask = (1ull << (bit_shift_kmer)) - 1;
};

#endif
2 changes: 1 addition & 1 deletion src/FastqStats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ FastqStats::FastqStats() {
position_quality_count.fill(0);
pos_kmer_count.fill(0);
pos_adapter_count.fill(0);
kmer_count = vector<size_t>(kNumBases*(kmer_mask + 1), 0);
kmer_count = vector<size_t>(kNumBases*(Constants::kmer_mask + 1), 0);
}

// Initialize as many gc models as fast bases
Expand Down
7 changes: 0 additions & 7 deletions src/FastqStats.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,6 @@ struct FastqStats {
static const size_t kBitShiftNucleotide = log2exact(kNumNucleotides);
static const size_t kBitShiftQuality = log2exact(kNumQualityValues);

/************ KMER CONSTANTS **********/

// we shift 14 bits when reading a kmer, two bits per base
static const size_t kBitShiftKmer = 2 * Constants::kmer_size;

// mask to get only the first 2*k bits of the sliding window
static const size_t kmer_mask = (1ll << (2*Constants::kmer_size)) - 1;

/************ ADAPTER CONSTANTS **********/
// bit shift for adapters, log(100) = 7
Expand Down
74 changes: 62 additions & 12 deletions src/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1985,7 +1985,7 @@ ModuleKmerContent::summarize_module(FastqStats &stats) {
for (size_t kmer = 0; kmer < num_kmers; ++kmer) {
for (size_t i = kmer_size - 1; i < num_kmer_bases; ++i) {
observed_count = stats.kmer_count[
(i << FastqStats::kBitShiftKmer) | kmer
(i << Constants::bit_shift_kmer) | kmer
];
total_kmer_counts[kmer] += observed_count;
}
Expand All @@ -1995,40 +1995,51 @@ ModuleKmerContent::summarize_module(FastqStats &stats) {
double dividend = static_cast<double>(num_seen_kmers);
for (size_t kmer = 0; kmer < num_kmers; ++kmer) {
for (size_t i = kmer_size - 1; i < num_kmer_bases; ++i) {
observed_count = stats.kmer_count[
(i << FastqStats::kBitShiftKmer) | kmer
];
observed_count =
stats.kmer_count[(i << Constants::bit_shift_kmer) | kmer];

expected_count = pos_kmer_count[i] / dividend;
obs_exp_ratio = observed_count / expected_count;

if (i == 0 || obs_exp_ratio > obs_exp_max[kmer]) {
obs_exp_max[kmer] = obs_exp_ratio;
where_obs_exp_is_max[kmer] = i;
where_obs_exp_is_max[kmer] = i + 1 - kmer_size;
}
}

if (obs_exp_max[kmer] > 5) {
if (obs_exp_max[kmer] > MIN_OBS_EXP_TO_REPORT) {
kmers_to_report.push_back(make_pair(kmer, obs_exp_max[kmer]));
}
}

sort (begin(kmers_to_report), end(kmers_to_report),
[](pair<size_t, double> &a, pair<size_t, double> &b) {
[](const pair<size_t, double> &a, const pair<size_t, double> &b) {
return a.second > b.second;
});

}

void
ModuleKmerContent::make_grade() {
grade = "fail";
const size_t lim = min(kmers_to_report.size(), MAX_KMERS_TO_REPORT);
grade = "pass";

// the worst kmer is at the top
if (lim > 0) {
const size_t kmer = kmers_to_report[0].first;
const double obs_exp = obs_exp_max[kmer];
if (obs_exp >= grade_error)
grade = "fail";
else if (obs_exp >= grade_warn)
grade = "warn";
}
}

void
ModuleKmerContent::write_module(ostream &os) {
os << "#Sequence\tCount\tPValue\tObs/Exp Max\tMax Obs/Exp Position\n";
for (size_t i = 0; i < 20 && i < kmers_to_report.size(); ++i) {
size_t kmer = kmers_to_report[i].first;
const size_t lim = min(kmers_to_report.size(), MAX_KMERS_TO_REPORT);
for (size_t i = 0; i < lim; ++i) {
const size_t kmer = kmers_to_report[i].first;
os << size_t_to_seq(kmer, kmer_size) << "\t"
<< total_kmer_counts[kmer] << "\t"
<< "0.0" << "\t"
Expand All @@ -2039,5 +2050,44 @@ ModuleKmerContent::write_module(ostream &os) {

string
ModuleKmerContent::make_html_data() {
return "<b>K-mer content module currently not implemented!";
bool seen_first = false;
ostringstream data;
const size_t lim = min(kmers_to_report.size(), MAX_KMERS_TO_PLOT);

// get xlim to plot: whatever the largest position with some
// reported k-mer is
size_t xlim = 0;
for (size_t i = 0; i < lim; ++i)
xlim = max(xlim, where_obs_exp_is_max[kmers_to_report[i].first]);

for (size_t i = 0; i < lim; ++i) {
const size_t kmer = kmers_to_report[i].first;
const double log_obs_exp = log(kmers_to_report[i].second)/log(2);
if (!seen_first)
seen_first = true;
else
data << ",";
data << "{";

// X values : read position
data << "x : [";
for (size_t j = 0; j < xlim; ++j) {
data << j+1;
if (j < xlim - 1) data << ",";
}
data << "]";

// Y values : A peak wherever the k-mer is seen the most
data << ", y : [";
for (size_t j = 0; j < xlim; ++j) {
data << ((j == (where_obs_exp_is_max[kmer])) ? (log_obs_exp) : 0);
if (i < xlim - 1)
data << ",";
}

data << "]";
data << ", type : 'line', ";
data << "name : \"" << size_t_to_seq(kmer, Constants::kmer_size) << "\"}";
}
return data.str();
}
3 changes: 3 additions & 0 deletions src/Module.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,9 @@ class ModuleKmerContent : public Module {
std::vector<std::pair<size_t, double>> kmers_to_report;
public:
static const std::string module_name;
static const size_t MIN_OBS_EXP_TO_REPORT = 5;
static const size_t MAX_KMERS_TO_REPORT = 20;
static const size_t MAX_KMERS_TO_PLOT = 10;
ModuleKmerContent(const FalcoConfig &config);
~ModuleKmerContent(){}
void summarize_module(FastqStats &stats);
Expand Down
4 changes: 2 additions & 2 deletions src/StreamReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,10 +256,10 @@ StreamReader::process_sequence_base_from_buffer(FastqStats &stats) {
cur_kmer = ((cur_kmer << Constants::bit_shift_base) | base_ind);

// registers k-mer if seen at least k nucleotides since the last n
if (do_kmer && (num_bases_after_n == Constants::kmer_size)) {
if (do_kmer && (num_bases_after_n >= Constants::kmer_size)) {

stats.kmer_count[(read_pos << Constants::bit_shift_kmer)
| (cur_kmer & Constants::bit_shift_kmer)]++;
| (cur_kmer & Constants::kmer_mask)]++;
stats.pos_kmer_count[read_pos]++;
}

Expand Down

0 comments on commit e115b55

Please sign in to comment.