Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit ff3e4f83f56c4eee8e3889374cc502185c961afa
Merge: ff9d5683 aed19c2
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Feb 8 20:13:20 2023 +0100

    Merge branch 'master' into dev

commit ff9d5683b47f1fa4c5391e36735e15b4e82465bc
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Feb 8 20:11:07 2023 +0100

    Updated version.

commit 4143bb6095fb316ea0188b9c985b722d4fa02b15
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Feb 8 18:35:23 2023 +0100

    Tuned score only culling.

commit d1cd241fcb0ba622c1f5b1b1ddb3e0028760d77e
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Feb 8 11:50:41 2023 +0100

    Removed kmer filter.

commit d63cc38216fba58687593f72830f7545b2ee6e21
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Feb 8 11:30:27 2023 +0100

    Fixed clang errors.

commit abcec00aefce0fc42cd6b1cf9f29b504a0a3b824
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Feb 7 20:50:55 2023 +0100

    Fixed DAA errors.

commit b76ab78f94f7fcb55bb9a43c42aaf2909800304b
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Feb 7 17:32:06 2023 +0100

    Fixed recluster issues.

commit 5b6896da00435b46dfa349cac7309d2b2d55f702
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Feb 7 17:15:08 2023 +0100

    Fixed warnings.

commit b45676d30d7ec00f193bcde7348cfa91240c2685
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Feb 7 12:48:53 2023 +0100

    Fixed warnings.

commit 53d8b4ac96c671afc22730904023b34651983d7a
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Feb 7 12:25:52 2023 +0100

    Fixed include.

commit 27bc731ac68e2af950b4b479d4fb0e462a8d4a7a
Author: Benjamin Buchfink <[email protected]>
Date:   Sun Feb 5 12:23:32 2023 +0100

    Fixed compilation errors.

commit a89ba6dac5383db6b0b0eb677d1e869e7383bc3a
Merge: ac6df028 8f119f1
Author: Benjamin Buchfink <[email protected]>
Date:   Fri Feb 3 18:34:31 2023 +0100

    Conflicts
  • Loading branch information
bbuchfink committed Feb 8, 2023
1 parent aed19c2 commit 9438db3
Show file tree
Hide file tree
Showing 34 changed files with 82 additions and 223 deletions.
4 changes: 0 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -322,17 +322,13 @@ set(OBJECTS
src/masking/motifs.cpp
src/align/alt_hsp.cpp
src/data/fasta/fasta_file.cpp
# src/cluster/incremental/run.cpp
src/cluster/output.cpp
# src/cluster/incremental/config.cpp
src/cluster/realign.cpp
src/cluster/reassign.cpp
src/util/tsv/read_tsv.cpp
src/tools/greedy_vertex_cover.cpp
src/cluster/cascaded/recluster.cpp
src/cluster/helpers.cpp
src/util/kmer/filter.cpp
src/align/kmer_filter.cpp
src/search/kmer_ranking.cpp
src/chaining/hamming_ext.cpp
src/dna/smith_watermann.cpp
Expand Down
9 changes: 8 additions & 1 deletion src/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
[2.1.1]
- Fixed compilation errors on non-x86 systems.
- Fixed compilation errors on non-x86 systems and for the clang compiler.
- Fixed an error message that appeared when running the `recluster` workflow.
- Fixed a bug that could cause an `invalid varint encoding` error when using the
DAA format.
- Fixed a bug that could cause corrupted DAA output.
- Fixed a bug that caused an error in the `view` workflow.
- Adjusted the hit culling heuristic of the frameshift alignment mode to be less
aggressive.

[2.1.0]
- Added the `cluster` workflow to cluster protein sequences.
Expand Down
13 changes: 1 addition & 12 deletions src/align/extend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,18 +259,11 @@ pair<vector<Match>, Stats> extend(BlockId query_id, Search::Hit* begin, Search::
stat.inc(Statistics::TIME_LOAD_HIT_TARGETS, timer.microseconds());
timer.finish();

vector<Match> trivial_matches;
if (config.filter_kmer_len) {
tie(l, trivial_matches) = kmer_filter(cfg.query->seqs()[query_id], Bias_correction(cfg.query->seqs()[query_id]).int8.data(), *cfg.target, l);
stat.inc(Statistics::TRIVIAL_ALN, trivial_matches.size());
}

const int64_t target_count = (int64_t)l.target_block_ids.size();
if (target_count == 0 && !config.swipe_all) {
if (add_self_aln(cfg))
return { {Match::self_match(query_id, cfg.query->seqs()[query_id])}, Stats() };
culling(trivial_matches, cfg);
return { trivial_matches, Stats() };
return { {}, Stats() };
}
const int64_t chunk_size = ranking_chunk_size(target_count, cfg.target->seqs().letters(), cfg.max_target_seqs);

Expand All @@ -284,10 +277,6 @@ pair<vector<Match>, Stats> extend(BlockId query_id, Search::Hit* begin, Search::
}

pair<vector<Match>, Stats> r = extend(query_id, cfg, stat, flags, l);
if (!trivial_matches.empty()) {
r.first.insert(r.first.end(), make_move_iterator(trivial_matches.begin()), make_move_iterator(trivial_matches.end()));
culling(r.first, cfg);
}
return r;
}

Expand Down
2 changes: 1 addition & 1 deletion src/align/global_ranking/table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ void update_table(Search::Config& cfg) {
}
};
vector<thread> threads;
for (size_t i = 0; i < config.threads_; ++i)
for (int i = 0; i < config.threads_; ++i)
threads.emplace_back(worker);
for (thread& t : threads)
t.join();
Expand Down
42 changes: 0 additions & 42 deletions src/align/kmer_filter.cpp

This file was deleted.

25 changes: 18 additions & 7 deletions src/align/legacy/query_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../../output/target_culling.h"
#include "../../util/util.h"

using namespace std;
using std::tie;
using std::list;
using std::vector;
using std::unique_ptr;
using std::min;
using std::max;
using std::set;
using std::string;

bool Target::envelopes(const ApproxHsp &t, double p) const
{
Expand Down Expand Up @@ -92,7 +99,7 @@ QueryMapper::QueryMapper(size_t query_id, Search::Hit* begin, Search::Hit* end,
void QueryMapper::init()
{
if(config.log_query)
cout << "Query = " << metadata.query->ids()[query_id] << '\t' << query_id << endl;
std::cout << "Query = " << metadata.query->ids()[query_id] << '\t' << query_id << std::endl;
if (Stats::CBS::hauser(config.comp_based_stats))
for (int i = 0; i < align_mode.query_contexts; ++i)
query_cb.emplace_back(query_seq(i));
Expand Down Expand Up @@ -181,21 +188,25 @@ void QueryMapper::rank_targets(double ratio, double factor, const int64_t max_ta

void QueryMapper::score_only_culling(const int64_t max_target_seqs)
{
static const double COV_INCLUDE_CUTOFF = 0.1;
std::stable_sort(targets.begin(), targets.end(), config.toppercent == 100.0 ? Target::compare_evalue : Target::compare_score);
unique_ptr<TargetCulling> target_culling(TargetCulling::get(max_target_seqs));
const unsigned query_len = (unsigned)query_seq(0).length();
PtrVector<Target>::iterator i;
for (i = targets.begin(); i<targets.end();) {
if (!score_matrix.report_cutoff((*i)->filter_score, (*i)->filter_evalue))
break;
const int c = target_culling->cull(**i);
if (c == TargetCulling::FINISHED)
int code;
double cov;
tie(code, cov) = target_culling->cull(**i);
if (code == TargetCulling::FINISHED)
break;
else if (c == TargetCulling::NEXT) {
else if (code == TargetCulling::NEXT) {
i = targets.erase(i, i + 1);
}
else {
target_culling->add(**i);
if (cov < COV_INCLUDE_CUTOFF)
target_culling->add(**i);
++i;
}
}
Expand Down Expand Up @@ -229,7 +240,7 @@ bool QueryMapper::generate_output(TextBuffer &buffer, Statistics &stat, const Se
if (targets[i].hsps.size() == 0)
continue;

const int c = target_culling->cull(targets[i]);
const int c = target_culling->cull(targets[i]).first;
if (c == TargetCulling::NEXT)
continue;
else if (c == TargetCulling::FINISHED)
Expand Down
4 changes: 2 additions & 2 deletions src/align/load_hits.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ static SeedHitList load_hits(It begin, It end, const SequenceSet& ref_seqs) {
target_len = (unsigned)ref_seqs[target].length();
list.target_block_ids.push_back(target);
}
list.seed_hits.push_back({ (int)i->seed_offset_, (int)l.second, i->score_, i->query_ % align_mode.query_contexts });
list.seed_hits.push_back({ (int)i->seed_offset_, (int)l.second, i->score_, (unsigned)i->query_ % align_mode.query_contexts });
score = std::max(score, i->score_);
}
}
Expand All @@ -105,7 +105,7 @@ static SeedHitList load_hits(It begin, It end, const SequenceSet& ref_seqs) {
target = t;
target_len = (unsigned)ref_seqs[target].length();
}
list.seed_hits.push_back({ (int)i->seed_offset_, (int)(subject_offset - *(it - 1)), i->score_, i->query_ % align_mode.query_contexts });
list.seed_hits.push_back({ (int)i->seed_offset_, (int)(subject_offset - *(it - 1)), i->score_, (unsigned)i->query_ % align_mode.query_contexts });
score = std::max(score, i->score_);
}
}
Expand Down
1 change: 0 additions & 1 deletion src/align/target.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ std::vector<Match> align(std::vector<Target> &targets, const int64_t previous_ma
std::vector<Target> full_db_align(const Sequence *query_seq, const Bias_correction *query_cb, DP::Flags flags, const HspValues hsp_values, Statistics &stat, const Block& target_block);
void recompute_alt_hsps(std::vector<Match>::iterator begin, std::vector<Match>::iterator end, const Sequence* query, const int query_source_len, const Bias_correction* query_cb, const HspValues v, Statistics& stats);
void apply_filters(std::vector<Match>::iterator begin, std::vector<Match>::iterator end, int source_query_len, const char* query_title, const double query_self_aln_score, const Sequence& query_seq, const Search::Config& cfg);
std::pair<SeedHitList, std::vector<Match>> kmer_filter(Sequence query, const int8_t* query_cbs, const Block& targets, const SeedHitList& l);

std::pair<std::vector<Match>, Stats> extend(
BlockId query_id,
Expand Down
2 changes: 1 addition & 1 deletion src/basic/basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/util.h"
#include "../stats/standard_matrix.h"

const char* Const::version_string = "2.1.0";
const char* Const::version_string = "2.1.1";
using std::string;
using std::vector;
using std::count;
Expand Down
6 changes: 2 additions & 4 deletions src/basic/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
#endif
;

auto& general = parser.add_group("General options", { makedb, blastp, blastx, cluster, view, prep_db, getseq, dbinfo, makeidx, CLUSTER_REALIGN, GREEDY_VERTEX_COVER, DEEPCLUST });
auto& general = parser.add_group("General options", { makedb, blastp, blastx, cluster, view, prep_db, getseq, dbinfo, makeidx, CLUSTER_REALIGN, GREEDY_VERTEX_COVER, DEEPCLUST, RECLUSTER });
general.add()
("threads", 'p', "number of CPU threads", threads_)
("db", 'd', "database file", database)
Expand Down Expand Up @@ -613,8 +613,6 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("reassign-max", 0, "", reassign_max)
("add-self-aln", 0, "", add_self_aln)
("weighted-gvc", 0, "", weighted_gvc)
("filter-kmer-len", 0, "", filter_kmer_len)
("filter-kmer-cutoff", 0, "", filter_kmer_cutoff)
("hamming-ext", 0, "", hamming_ext)
("diag-filter-id", 0, "", diag_filter_id)
("diag-filter-cov", 0, "", diag_filter_cov)
Expand Down Expand Up @@ -644,7 +642,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("no_8bit_extension", 0, "", no_8bit_extension)
("anchored-swipe", 0, "", anchored_swipe)
("no_chaining_merge_hsps", 0, "", no_chaining_merge_hsps)
("no_recluster_bd", 0, "", no_recluster_bd)
("recluster_bd", 0, "", recluster_bd)
("pipeline-short", 0, "", pipeline_short)
("graph-algo", 0, "", graph_algo, string("gvc"))
#ifndef KEEP_TARGET_ID
Expand Down
4 changes: 1 addition & 3 deletions src/basic/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,6 @@ struct Config
bool mode_faster;
double member_cover;
bool weighted_gvc;
int filter_kmer_len;
double filter_kmer_cutoff;
bool kmer_ranking;
bool hamming_ext;
double diag_filter_id;
Expand All @@ -334,7 +332,7 @@ struct Config
bool no_8bit_extension;
bool anchored_swipe;
bool no_chaining_merge_hsps;
bool no_recluster_bd;
bool recluster_bd;
bool pipeline_short;
string graph_algo;

Expand Down
2 changes: 1 addition & 1 deletion src/basic/const.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct Const
{

enum {
build_version = 154,
build_version = 155,
#ifdef SINGLE_THREADED
seedp_bits = 0,
#else
Expand Down
2 changes: 1 addition & 1 deletion src/basic/value.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ struct ValueTraits
};

#define AMINO_ACID_ALPHABET "ARNDCQEGHILKMFPSTWYVBJZX*_"
#define AMINO_ACID_COUNT (sizeof(AMINO_ACID_ALPHABET) - 1)
#define AMINO_ACID_COUNT (int(sizeof(AMINO_ACID_ALPHABET) - 1))

constexpr Letter MASK_LETTER = 23;
constexpr Letter STOP_LETTER = 24;
Expand Down
8 changes: 4 additions & 4 deletions src/cluster/cascaded/recluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,9 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
config.iterate = vector<string>();
config.output_format = { "edge" };
config.self = false;
config.query_cover = config.no_recluster_bd ? config.member_cover : 0;
config.query_cover = config.recluster_bd ? 0 : config.member_cover;
config.subject_cover = 0;
config.query_or_target_cover = config.no_recluster_bd ? 0 : config.member_cover;
config.query_or_target_cover = config.recluster_bd ? config.member_cover : 0;
config.sensitivity = from_string<Sensitivity>(cluster_steps(config.approx_min_id).back());
//tie(config.chunk_size, config.lowmem_) = block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), Search::iterated_sens.at(config.sensitivity).front(), false);
config.lowmem_ = 1;
Expand All @@ -104,7 +104,7 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
return out;

shared_ptr<SequenceFile> unmapped;
if (config.no_recluster_bd) {
if (!config.recluster_bd) {
timer.go("Creating database of unmapped sequences");
unmapped.reset(unaligned->sub_db(unmapped_members.cbegin(), unmapped_members.cend()));
}
Expand Down Expand Up @@ -134,7 +134,7 @@ static vector<OId> recluster(shared_ptr<SequenceFile>& db, const vector<OId>& cl
timer.finish();
message_stream << "#Centroid sequences covered: " << n << endl;
timer.go("Making sequence list for reclustering");
for (int64_t i = 0; i < unmapped_members.size(); ++i)
for (int64_t i = 0; i < (int64_t)unmapped_members.size(); ++i)
unmapped_members[i] = unal_members[unmapped_members[i]];
const vector<OId> members = cluster_members(centroid_list.begin(), end, clusters);
unmapped_members.reserve(unmapped_members.size() + members.size());
Expand Down
4 changes: 2 additions & 2 deletions src/cluster/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,10 +209,10 @@ vector<BlockId> len_sorted_clust(const FlatArray<Util::Algo::Edge<SuperBlockId>>
for (int64_t i = 0; i < edges.size(); ++i) {
if (v[i] != -1)
continue;
v[i] = i;
v[i] = (BlockId)i;
for (auto it = edges.cbegin(i); it != edges.cend(i); ++it)
if (v[it->node2] == -1)
v[it->node2] = i;
v[it->node2] = (BlockId)i;
}
return v;
}
Expand Down
2 changes: 1 addition & 1 deletion src/cluster/incremental/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ Config::Config() :
problem_size_self(0)
{
sens.push_back(config.sensitivity);
for (int i = 0; i < sens.size() - 1; ++i)
for (int i = 0; i < (int)sens.size() - 1; ++i)
cache.emplace_back(new Block);
}

Expand Down
10 changes: 5 additions & 5 deletions src/cluster/incremental/run.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ static void search_vs_centroids(Block& block, const int round, Config& cfg) {
if (cfg.verbosity >= 2)
cfg.message_stream << "CLUSTER " << clustered << " assigned to clusters, " << unaligned.seqs().size() << " unaligned." << endl;

if (round + 1 < cfg.sens.size())
if (round + 1 < (int)cfg.sens.size())
cfg.cache[round]->append(unaligned);
else
self_align(unaligned, cfg);
Expand Down Expand Up @@ -172,7 +172,7 @@ void Algo::run() {

search_vs_centroids(*block, 0, cfg);

for (int i = 0; i < cfg.cache.size(); ++i)
for (int i = 0; i < (int)cfg.cache.size(); ++i)
if (cfg.cache[i]->seqs().letters() >= std::min(cache_limit, (int64_t)cfg.centroids->letters())) {
cfg.cache[i]->seqs().finish_reserve();
search_vs_centroids(*cfg.cache[i], i + 1, cfg);
Expand All @@ -190,7 +190,7 @@ void Algo::run() {
}
}

for (int i = 0; i < cfg.cache.size(); ++i)
for (int i = 0; i < (int)cfg.cache.size(); ++i)
if (cfg.cache[i]->seqs().letters() > 0) {
cfg.cache[i]->seqs().finish_reserve();
search_vs_centroids(*cfg.cache[i], i + 1, cfg);
Expand All @@ -204,7 +204,7 @@ void Algo::run() {
//cfg.output_file->write(buf.data(), buf.size());
//buf.clear();
//}
for (int64_t i = 0; i < cfg.oid2centroid.size(); ++i)
for (int64_t i = 0; i < (int64_t)cfg.oid2centroid.size(); ++i)
cfg.oid2centroid[i] = cfg.centroid2oid[cfg.oid2centroid[i]];
output_mem<CentroidId>(*cfg.output_file, *cfg.db, cfg.oid2centroid);

Expand All @@ -220,7 +220,7 @@ void Algo::run() {
table("Input sequences", cfg.db->sequence_count());
table("Number of clusters", cfg.centroids->sequence_count());

for (int i = 0; i < cfg.sens.size(); ++i) {
for (int i = 0; i < (int)cfg.sens.size(); ++i) {
table("Time (" + to_string(cfg.sens[i]) + ")", cfg.time_search[i], "s");
table("Problem size (" + to_string(cfg.sens[i]) + ")", cfg.problem_size[i]);
}
Expand Down
2 changes: 2 additions & 0 deletions src/data/dmnd/dmnd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ Serializer& operator<<(Serializer &s, const ReferenceHeader2 &h)

Deserializer& operator>>(Deserializer &d, ReferenceHeader2 &h)
{
#ifdef EXTRA
int32_t db_type;
#endif
d.read_record().read(h.hash, sizeof(h.hash))
>> h.taxon_array_offset
>> h.taxon_array_size
Expand Down
3 changes: 2 additions & 1 deletion src/data/sequence_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -829,12 +829,13 @@ void db_info() {

void prep_db() {
config.database.require();
if (is_blast_db(config.database))
if (is_blast_db(config.database)) {
#ifdef WITH_BLASTDB
BlastDB::prep_blast_db(config.database);
#else
;
#endif
}
#ifdef EXTRA
else if (DatabaseFile::is_diamond_db(auto_append_extension_if_exists(config.database, DatabaseFile::FILE_EXTENSION)))
DatabaseFile::prep_db();
Expand Down
Loading

0 comments on commit 9438db3

Please sign in to comment.