Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 6b1658e2764e4c1e31afe9d35a7a805fb6596a6b
Merge: 2f1bafe4 40e440f
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Mar 9 20:08:58 2023 +0100

    Merge branch 'master' into dev

commit 2f1bafe4b5cda4195fe344c3ef5d37ce825d6509
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Mar 9 20:05:33 2023 +0100

    Updated version.

commit f6cdba737cd926945ed0623cb855fca599aa01f1
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Mar 9 17:17:50 2023 +0100

    Mem use for extend stage.

commit b0d8841d46dcca30fd712c2ab27b7af67decfad9
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Mar 9 15:16:06 2023 +0100

    Changed estimate for minimizers.

commit 1ee97297be8a63fd9e51ae08e071ffe00e15117e
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Mar 8 18:41:41 2023 +0100

    Added memory for extend stage.

commit 33de8f8c61c058c92ead3c2de5b8646c6accd0da
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Mar 8 17:06:01 2023 +0100

    Remove capping of chunk size.

commit 0fa174446118484f7f49c3e3fc25782449f3221d
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Mar 7 17:06:08 2023 +0100

    Fixed translation bug.

commit 76fa63ee6dc542c7a0fdbaadc21b67e7e59e551a
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Mar 7 15:34:12 2023 +0100

    Added heartbeat option.

commit e556f044cb82e44bb23b621169926b2163e35077
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Mar 7 13:27:34 2023 +0100

    Disabled rounding.

commit e1993c244388b964f17bf07a387bde8bd15f11b9
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Mar 7 11:48:27 2023 +0100

    Use variable super block sizes.

commit 724f0489077165feb11bd32c6248e9c3b4cb01af
Author: Benjamin Buchfink <[email protected]>
Date:   Mon Mar 6 14:01:12 2023 +0100

    Fixed issues in the greedy-vertex-cover (gvc) workflow.

commit e397ebd7fbf98e575c001467519999ebe0a0e6cf
Author: Benjamin Buchfink <[email protected]>
Date:   Mon Mar 6 12:24:53 2023 +0100

    Removed register declarations.
  • Loading branch information
bbuchfink committed Mar 9, 2023
1 parent 40e440f commit 1247253
Show file tree
Hide file tree
Showing 18 changed files with 73 additions and 30 deletions.
7 changes: 7 additions & 0 deletions src/ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
for the `prepdb` workflow.
- Fixed a bug that caused a segmentation fault when using BLAST databases.
- Added line numbers for error messages when reading taxonomy mapping files.
- Fixed a bug that could cause a crash when using the `greedy-vertex-cover`
workflow without the `--out` and `--centroid-out` options.
- Fixed a bug that caused the `greedy-vertex-cover` workflow to only produce a
trivial clustering.
- Fixed a bug that caused the last codon of the -2 reading frame to be translated
incorrectly.
- Reduced the memory use of the clustering workflow.

[2.1.4]
- Leading spaces are now trimmed and tabulator characters escaped as `\t`
Expand Down
4 changes: 2 additions & 2 deletions src/align/align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,8 +329,8 @@ void align_queries(Consumer* output_file, Search::Config& cfg)
OutputWriter writer{ output_file };
output_sink.reset(new ReorderQueue<TextBuffer*, OutputWriter>(query_range.first, writer));
unique_ptr<thread> heartbeat;
//if (config.verbosity >= 3 && config.load_balancing == Config::query_parallel && !config.swipe_all)
//heartbeat.reset(new thread(heartbeat_worker, query_range.second, &cfg));
if (config.verbosity >= 3 && config.load_balancing == Config::query_parallel && !config.swipe_all && config.heartbeat)
heartbeat.reset(new thread(heartbeat_worker, query_range.second, &cfg));
size_t n_threads = config.threads_align == 0 ? config.threads_ : config.threads_align;
if (config.load_balancing == Config::target_parallel || (config.swipe_all && (cfg.target->seqs().size() >= cfg.query->seqs().size())))
n_threads = 1;
Expand Down
2 changes: 1 addition & 1 deletion src/align/extend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ pair<vector<Match>, Stats> extend(
aligned_targets = move(v.first);

i0 = i1;
i1 += std::min((ptrdiff_t)std::min(chunk_size, MAX_CHUNK_SIZE), l.target_scores.cend() - i1);
i1 += std::min((ptrdiff_t)chunk_size, l.target_scores.cend() - i1);
previous_tail_score = tail_score;
if (new_hits)
tail_score = (i1 - 1)->score;
Expand Down
2 changes: 1 addition & 1 deletion src/basic/basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/util.h"
#include "../stats/standard_matrix.h"

const char* Const::version_string = "2.1.4";
const char* Const::version_string = "2.1.5";
using std::string;
using std::vector;
using std::count;
Expand Down
8 changes: 5 additions & 3 deletions src/basic/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,16 @@ pair<double, int> block_size(int64_t memory_limit, Sensitivity s, bool lin) {
const double m = (double)memory_limit / 1e9;
const int min = std::max(Search::sensitivity_traits[(int)align_mode.sequence_type].at(s).minimizer_window, 1),
c = m < 40.0 && s <= Sensitivity::MORE_SENSITIVE && min == 1 ? 4 : 1;
const double min_factor = std::min(1 / (double)min * 2, 1.0);
const double max = s <= Sensitivity::DEFAULT ? 12.0 :
(s <= Sensitivity::MORE_SENSITIVE ? 4.0 : 0.4);
double b = m / (18.0 / c / min + 2.0);
if (b > 4)
double b = m / (18.0 * min_factor / c + 2.0);
/*if (b > 4)
b = floor(b);
else if (b > 0.4)
b = floor(b * 10) / 10;
else
b = floor(b * 1000) / 1000;
b = floor(b * 1000) / 1000;*/
if (!config.no_block_size_limit && !lin)
b = std::min(b, max);
if (s >= Sensitivity::VERY_SENSITIVE)
Expand Down Expand Up @@ -448,6 +449,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("query-parallel-limit", 0, "", query_parallel_limit, 3000000u)
("log-evalue-scale", 0, "", log_evalue_scale, 1.0 / std::log(2.0))
("bootstrap", 0, "", bootstrap)
("heartbeat", 0, "", heartbeat)
("mp-self", 0, "", mp_self)
#ifdef EXTRA
("zdrop", 'z', "zdrop for gapped dna alignment", zdrop, 40)
Expand Down
1 change: 1 addition & 0 deletions src/basic/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,7 @@ struct Config
bool linsearch;
int64_t tsv_read_size;
int zdrop;
bool heartbeat;

SequenceType dbtype;

Expand Down
2 changes: 1 addition & 1 deletion src/basic/const.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct Const
{

enum {
build_version = 158,
build_version = 159,
#ifdef SINGLE_THREADED
seedp_bits = 0,
#else
Expand Down
29 changes: 26 additions & 3 deletions src/cluster/cascaded/wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../data/fasta/fasta_file.h"
#include "../run/workflow.h"
#include "../util/system/system.h"
#include "../../search/search.h"

using std::shared_ptr;
using std::endl;
Expand All @@ -40,6 +41,8 @@ using std::get;
using std::ignore;
using std::back_inserter;
using std::unique_ptr;
using std::bind;
using std::function;
using namespace Util::Tsv;

namespace Cluster {
Expand Down Expand Up @@ -67,11 +70,28 @@ struct Config {
task_timer total_time;
int64_t seqs_processed;
int64_t letters_processed;
std::vector<OId> oid2centroid;
std::vector<OId> centroid2oid;
std::unique_ptr<File> oid_to_centroid_oid;
};

// Estimate of the peak per-sequence memory use (in bytes) of the cascaded
// clustering workflow. The result is the maximum over the two pipeline
// stages (search and extension), as expressed by the std::max below.
// len    - sequence length in letters
// id_len - length of the sequence title
// c      - seed sampling factor (>= 1)
// min    - minimizer window size; values <= 1 mean minimizers are disabled
//          (the caller clamps the trait value with std::max(..., 1))
int64_t seq_mem_use(Loc len, Loc id_len, int c, int min) {
	// Effective seed spacing. Guards the min == 1 case, which the caller can
	// produce and which previously hit assert(min > 1) / division by zero.
	const int64_t w = std::max(min / 2, 1);
	// Use 64-bit arithmetic throughout: the old int accumulator could
	// overflow for large sequences before the final division by 2.
	const int64_t l = len;
	// Extension stage estimate; the /2 at the end matches the original
	// empirical halving factor. NOTE(review): the factor looks empirical —
	// confirm against measured RSS.
	const int64_t extend_stage = ((l / w) * (15 // trace point buffer
		+ 16) // SeedHitList
		+ 12 // SeedHitList
		+ 2 * l) // SequenceSet
		/ 2;
	// Search stage estimate: sequence data, title and per-sequence
	// bookkeeping structures.
	const int64_t search_stage =
		l + 8 // SequenceSet
		+ id_len + 8 // Seqtitle
		+ l * 9 / c / w // SeedArray
		+ 8 // super_block_id_to_oid
		+ 8 // BestCentroid / clustering
		+ 4; // unaligned
	return std::max(search_stage, extend_stage);
}

struct BestCentroid : public Consumer, public vector<OId> {
BestCentroid(OId block_size) :
vector<OId>(block_size, -1)
Expand Down Expand Up @@ -145,7 +165,8 @@ void Cascaded::run() {
throw std::runtime_error("Clustering is not supported for BLAST databases.");
timer.finish();
message_stream << "Input database: " << db->file_name() << " (" << db->sequence_count() << " sequences, " << db->letters() << " letters)" << endl;
const int64_t block_size = (int64_t)(::block_size(Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT)), Sensitivity::FASTER, true).first * 1e9);
const int64_t mem_limit = Util::String::interpret_number(config.memory_limit.get(DEFAULT_MEMORY_LIMIT));
const int64_t block_size = (int64_t)(::block_size(mem_limit, Sensitivity::FASTER, true).first * 1e9);
unique_ptr<Util::Tsv::File> out(open_out_tsv());

if (block_size >= (double)db->letters() && db->sequence_count() < numeric_limits<SuperBlockId>::max()) {
Expand All @@ -157,7 +178,9 @@ void Cascaded::run() {
timer.go("Length sorting the input file");
Config cfg(db);
config.db_size = db->letters();
vector<tuple<FastaFile*, vector<OId>, Util::Tsv::File*>> super_blocks = db->length_sort(block_size);
const int minimizer_window = std::max(Search::sensitivity_traits[(int)align_mode.sequence_type].at(Sensitivity::FASTER).minimizer_window, 1);
auto seq_size = function<int64_t(Loc)>(bind(seq_mem_use, std::placeholders::_1, 0, 1, minimizer_window));
vector<tuple<FastaFile*, vector<OId>, Util::Tsv::File*>> super_blocks = db->length_sort(mem_limit / 2, seq_size);
timer.finish();
config.freq_masking = true;
int i = 0;
Expand Down
2 changes: 1 addition & 1 deletion src/cluster/helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ void init_thresholds() {
}
else {
config.diag_filter_id = 85.0;
config.diag_filter_cov = config.member_cover > 50.0 ? config.member_cover - 5.0 : 0.0;
config.diag_filter_cov = config.member_cover > 50.0 ? config.member_cover - 10.0 : 0.0;
}
}

Expand Down
18 changes: 13 additions & 5 deletions src/data/sequence_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ using std::numeric_limits;
using std::tuple;
using std::get;
using std::unique_ptr;
using std::function;
using namespace Util::Tsv;

static constexpr int64_t CHECK_FOR_DNA_COUNT = 10;
Expand Down Expand Up @@ -861,7 +862,8 @@ void SequenceFile::add_seqid_mapping(const std::string& id, OId oid) {
}
}

vector<tuple<FastaFile*, vector<OId>, File*>> SequenceFile::length_sort(int64_t block_size) {
vector<tuple<FastaFile*, vector<OId>, File*>> SequenceFile::length_sort(int64_t block_size, function<int64_t(Loc)>& seq_size) {
static const int64_t MIN_BLOCK_SIZE = 1;
vector<tuple<FastaFile*, vector<OId>, File*>> files;
init_seq_access();
vector<Letter> seq;
Expand All @@ -874,19 +876,25 @@ vector<tuple<FastaFile*, vector<OId>, File*>> SequenceFile::length_sort(int64_t
}
ips4o::parallel::sort(lengths.begin(), lengths.end(), greater<pair<Loc, OId>>(), config.threads_);

int64_t letters = 0, seqs = 0;
int64_t size = 0, seqs = 0, letters = 0;
int block = 0;
for (auto i = lengths.begin(); i != lengths.end(); ++i) {
size += seq_size(i->first);
letters += i->first;
++seqs;
i->first = block;
if (letters >= block_size || seqs >= numeric_limits<SuperBlockId>::max()) {
if ((size >= block_size && letters >= MIN_BLOCK_SIZE) || seqs >= numeric_limits<SuperBlockId>::max()) {
log_stream << "Super block " << block << " seqs=" << seqs << " letters=" << letters << endl;
++block;
letters = 0;
size = 0;
seqs = 0;
letters = 0;
}
}
if (letters > 0) ++block;
if (size > 0) {
log_stream << "Super block " << block << " seqs=" << seqs << " letters=" << letters << endl;
++block;
}
ips4o::parallel::sort(lengths.begin(), lengths.end(), [](const pair<Loc, OId>& p1, const pair<Loc, OId>& p2) { return p1.second < p2.second; }, config.threads_);

files.reserve(block);
Expand Down
2 changes: 1 addition & 1 deletion src/data/sequence_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ struct SequenceFile {
FastaFile* sub_db(It begin, It end, const std::string& file_name = std::string());
template<typename It>
void sub_db(It begin, It end, FastaFile* out);
std::vector<std::tuple<FastaFile*, std::vector<OId>, Util::Tsv::File*>> length_sort(int64_t block_size);
std::vector<std::tuple<FastaFile*, std::vector<OId>, Util::Tsv::File*>> length_sort(int64_t block_size, std::function<int64_t(Loc)>& seq_size);
Util::Tsv::File& seqid_file();

void init_dict(const size_t query_block, const size_t target_block);
Expand Down
6 changes: 3 additions & 3 deletions src/lib/blast/blastn_score.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ BlastScoreBlkProteinMatrixRead(BlastScoreBlk* sbp, FILE *fp)
char a1chars[BLASTAA_SIZE], a2chars[BLASTAA_SIZE];
long lineno = 0;
double xscore;
register int index1, index2;
int index1, index2;
int x_index, u_index, o_index, c_index;
const char kCommentChar = '#';
const char* kTokenStr = " \t\n\r";
Expand Down Expand Up @@ -500,7 +500,7 @@ BlastScoreBlkNucleotideMatrixRead(BlastScoreBlk* sbp, FILE *fp)
Int4 numFreqs = 0;
Int4 alphaSize = 0;
double fval;
register int index1, index2;
int index1, index2;
char fbuf[512+3];
char alphabet[24];
char *cp,*ncp,*lp;
Expand Down Expand Up @@ -1374,7 +1374,7 @@ BlastKarlinLHtoK(Blast_ScoreFreq* sfp, double lambda, double H)
/*highest and lowest possible alignment scores for current length*/
Int4 lowAlignmentScore, highAlignmentScore;
Int4 first, last; /*loop indices for dynamic program*/
register double innerSum;
double innerSum;
double oldsum, oldsum2; /* values of innerSum on previous
iterations*/
double outerSum; /* holds sum over j of (innerSum
Expand Down
2 changes: 1 addition & 1 deletion src/test/test_cases.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ const vector<uint64_t> ref_hashes = {
0xe787dcb23cc5b120,
0x5aa4baf48a888be9,
0x21f14583e88a13ac,
0xe5eb09929f0cc182,
0xea4b4492a522c624,
0x713deb9a5ae4b9e,
};

Expand Down
10 changes: 6 additions & 4 deletions src/tools/greedy_vertex_cover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ void greedy_vertex_cover() {
}
lines += n;
});
Util::Tsv::File(Util::Tsv::Schema(), config.edges).read(config.threads_, fn);
Util::Tsv::File(Util::Tsv::Schema(), config.edges).read(INT64_MAX, config.threads_, fn);
timer.finish();
message_stream << "#Lines: " << lines << endl;

Expand Down Expand Up @@ -115,7 +115,7 @@ void greedy_vertex_cover() {
edges.insert(edges.end(), e.cbegin(), e.cend());
}
});
Util::Tsv::File(Util::Tsv::Schema(), config.edges).read(config.threads_, fn2);
Util::Tsv::File(Util::Tsv::Schema(), config.edges).read(INT64_MAX, config.threads_, fn2);
timer.finish();
log_rss();

Expand Down Expand Up @@ -149,8 +149,10 @@ void greedy_vertex_cover() {
if (!config.output_file.empty())
*out << acc[r[i]] << '\t' << acc[i] << endl;
}
centroid_out->close();
out->close();
if(centroid_out)
centroid_out->close();
if (out)
out->close();
timer.finish();
message_stream << "#Centroids: " << c << endl;
}
2 changes: 1 addition & 1 deletion src/util/sequence/translate.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ struct Translator
}
if(r) {
proteins[1][i] = getAminoAcid(dnaSequence, pos);
proteins[4][i] = getAminoAcidReverse(dnaSequence, r);
proteins[4][i] = getAminoAcidReverse(dnaSequence, r - 1);
}
return n;
}
Expand Down
2 changes: 1 addition & 1 deletion src/util/string/string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ int64_t interpret_number(const std::string& s) {
throw std::runtime_error(string("Invalid size specifier (") + c + ") in number: " + s + ". Permitted values: 'G'");
ss >> c;
if (!ss.eof())
throw std::runtime_error("Invalid number format: " + s);
throw std::runtime_error("Invalid number format: " + s);
return int64_t(n * 1e9);
}

Expand Down
2 changes: 1 addition & 1 deletion src/util/tsv/file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ void File::read(int threads, std::function<void(int64_t chunk, const Table&)>& c
table.append(begin, end);
callback(chunk, table);
});
read(threads, f);
read(INT64_MAX, threads, f);
}

Table File::read(int64_t max_size, int threads) {
Expand Down
2 changes: 1 addition & 1 deletion src/util/tsv/file.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ struct File {
}

template<typename F, typename Out>
void read(F& f, Out& out) {
void read(int threads, F& f, Out& out) {
/*using T = typename std::result_of<F(const Record&)>::type;
auto f2 = [](std::vector<T>* v) {
for (std::vector<T>::const_iterator i = v->cbegin(); i != v->cend(); ++i)
Expand Down

0 comments on commit 1247253

Please sign in to comment.