Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 70409de0e2af0a16a7136bab31086ef7eab04d9b
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Jun 20 09:25:18 2023 +0200

    Updated version.

commit 4099a6d4e2d61fa0ca32e1ea2935bf7fda871601
Author: Benjamin Buchfink <[email protected]>
Date:   Mon Jun 19 17:03:26 2023 +0200

    Added alias taxid.

commit 38b77f968b2013944480e4c855be7ee8f6fe6c57
Author: Benjamin Buchfink <[email protected]>
Date:   Mon Jun 19 16:10:41 2023 +0200

    Added qlen for sam format.

commit 9db6be8ccd7293d501f4640cf317c72c0532c926
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Jun 13 16:57:07 2023 +0200

    Delete iostream

commit f91883fac79b43c29989b3091dec6d981457164b
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Jun 13 12:53:17 2023 +0200

    Use array for root.

commit 2d9667af766c8379b1f734fd6263ab8ece2efd2a
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Jun 13 12:53:10 2023 +0200

    Use array as root.

commit 95023132611bf4dfa2667ea172a2aa5542a7cabe
Author: emile151 <[email protected]>
Date:   Tue Jun 13 12:25:34 2023 +0200

    Dev (#15)

    * json format reworked in blast_tab

    * json format reworked in blast_tab

    * json format reworked in blast_tab

    * json format reworked in blast_tab

    * json format reworked in blast_tab

    * json format reworked in blast_tab

    * json format reworked in blast_tab (comments fixed)

    * json format reworked in blast_tab (comments fixed)

    * json format reworked in blast_tab (comments fixed)

    * json format reworked in blast_tab (comments fixed)

    * json format reworked in blast_tab (comments fixed)

    * json format reworked in blast_tab (comments fixed)

    * json format reworked in blast_tab (comments fixed)

    * json format reworked in blast_tab (comments fixed)

    * Merge conflict resolved

    * json format reworked in blast_tab (comments fixed)

commit 608bc06a082c528eecdb2642205208906f1a756d
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Jun 13 11:23:31 2023 +0200

    Added seed masking for --algo 1.

commit aac15b8ba2489c76c8810714ae572f3e3278a1d8
Author: Benjamin Buchfink <[email protected]>
Date:   Mon Jun 12 17:23:31 2023 +0200

    Filter low complex seeds in enum_seeds.

commit ddca6fa9cce1eb65d17893930d969b8bd6228e1c
Merge: 5c10a7dd 14f3550
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Jun 7 09:33:45 2023 +0200

    Merge branch 'master' into dev
  • Loading branch information
bbuchfink committed Jun 20, 2023
1 parent 14f3550 commit 03e7f4a
Show file tree
Hide file tree
Showing 30 changed files with 266 additions and 151 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,5 @@ Makefile
diamond
src/extra/
.vs/
CMakeSettings.json
.unison
/CMakeSettings.json
6 changes: 6 additions & 0 deletions src/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
[2.1.8]
- Fixed an issue that could cause reduced performance when running in
query-indexed mode.
- Added support for the JSON output format (option `-f json-flat`).
- Added the option `--sam-query-len` to output query length in SAM format.

[2.1.7]
- Fixed a bug that caused taxonomy names not to be loaded correctly for the
`makedb` workflow.
Expand Down
2 changes: 1 addition & 1 deletion src/align/align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ void align_queries(Consumer* output_file, Search::Config& cfg)

timer.go("Computing alignments");
HitIterator hit_it(query_range.first, query_range.second, hit_buf->data(), hit_buf->data() + hit_buf->size());
OutputWriter writer{ output_file };
OutputWriter writer{output_file, (*cfg.output_format == OutputFormat::json) ? ',' : char(0)};
output_sink.reset(new ReorderQueue<TextBuffer*, OutputWriter>(query_range.first, writer));
unique_ptr<thread> heartbeat;
if (config.verbosity >= 3 && config.load_balancing == Config::query_parallel && !config.swipe_all && config.heartbeat)
Expand Down
3 changes: 2 additions & 1 deletion src/align/output.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../output/daa/daa_write.h"
#include "../util/sequence/sequence.h"


using std::vector;

namespace Extension {
Expand All @@ -51,7 +52,7 @@ TextBuffer* generate_output(vector<Match> &targets, const Extension::Stats& stat
}
else if (aligned || config.report_unaligned)
f->print_query_intro(info);

for (int i = 0; i < (int)targets.size(); ++i) {

if (targets[i].hsp.empty())
Expand Down
2 changes: 1 addition & 1 deletion src/basic/basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/util.h"
#include "../stats/standard_matrix.h"

const char* Const::version_string = "2.1.7";
const char* Const::version_string = "2.1.8";
using std::string;
using std::vector;
using std::count;
Expand Down
12 changes: 7 additions & 5 deletions src/basic/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,8 +315,9 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
\t100 = DIAMOND alignment archive (DAA)\n\
\t101 = SAM\n\
\t102 = Taxonomic classification\n\
\t103 = PAF\n\n\
\tValue 6 may be followed by a space-separated list of these keywords:\n\n\
\t103 = PAF\n\
\t104 = JSON (flat)\n\n\
\tValues 6 and 104 may be followed by a space-separated list of these keywords:\n\n\
\tqseqid means Query Seq - id\n\
\tqlen means Query sequence length\n\
\tsseqid means Subject Seq - id\n\
Expand Down Expand Up @@ -422,11 +423,12 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
("multiprocessing", 0, "enable distributed-memory parallel processing", multiprocessing)
("mp-init", 0, "initialize multiprocessing run", mp_init)
("mp-recover", 0, "enable continuation of interrupted multiprocessing run", mp_recover)
("mp-query-chunk", 0, "process only a single query chunk as specified", mp_query_chunk, -1)
("mp-query-chunk", 0, "process only a single query chunk as specified", mp_query_chunk, -1)
("culling-overlap", 0, "minimum range overlap with higher scoring hit to delete a hit (default=50%)", inner_culling_overlap, 50.0)
("taxon-k", 0, "maximum number of targets to report per species", taxon_k, (uint64_t)0)
("range-cover", 0, "percentage of query range to be covered for range culling (default=50%)", query_range_cover, 50.0)
("range-cover", 0, "percentage of query range to be covered for range culling (default=50%)", query_range_cover, 50.0)
("xml-blord-format", 0, "Use gnl|BL_ORD_ID| style format in XML output", xml_blord_format)
("sam-query-len", 0, "add the query length to the SAM format (tag ZQ)", sam_qlen_field)
("stop-match-score", 0, "Set the match score of stop codons against each other.", stop_match_score, 1)
("target-indexed", 0, "Enable target-indexed mode", target_indexed)
("unaligned-targets", 0, "", unaligned_targets)
Expand Down Expand Up @@ -669,7 +671,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa

if (verbosity >= 1 || command == regression_test) {
ostream& header_out = command == Config::help ? cout : cerr;
header_out << Const::program_name << " v" << Const::version_string << "." << (unsigned)Const::build_version << " (C) Max Planck Society for the Advancement of Science" << endl;
header_out << Const::program_name << " v" << Const::version_string << "." << (unsigned)Const::build_version << " (C) Max Planck Society for the Advancement of Science, Benjamin Buchfink, University of Tuebingen" << endl;
header_out << "Documentation, support and updates available at http://www.diamondsearch.org" << endl;
header_out << "Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)" << endl << endl;
}
Expand Down
1 change: 1 addition & 0 deletions src/basic/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ struct Config
int zdrop;
bool heartbeat;
bool no_parse_seqids;
bool sam_qlen_field;

SequenceType dbtype;

Expand Down
2 changes: 1 addition & 1 deletion src/basic/const.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct Const
{

enum {
build_version = 161,
build_version = 162,
#ifdef SINGLE_THREADED
seedp_bits = 0,
#else
Expand Down
40 changes: 24 additions & 16 deletions src/basic/seed_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,31 +159,39 @@ struct SketchIterator
template<uint64_t B>
struct HashedSeedIterator
{
HashedSeedIterator(const Sequence &seq, const Shape &sh):
ptr_(seq.data()),
end_(ptr_ + seq.length()),
HashedSeedIterator(Letter* seq, Loc len, const Shape &sh):
long_mask(sh.long_mask()),
ptr_(seq),
end_(ptr_ + len),
last_(0)
{
for (int i = 0; (i < sh.length_ - 1) && ptr_ < end_; ++i) {
for (int i = 0; i < sh.length_ && ptr_ < end_; ++i)
last_ = (last_ << B) | Reduction::reduction(letter_mask(*(ptr_++)));
}
}
bool good() const
{
return ptr_ < end_;
return ptr_ <= end_;
}
bool get(uint64_t &seed, uint64_t mask)
{
last_ <<= B;
const Letter l = letter_mask(*(ptr_++));
if (!is_amino_acid(l))
return false;
last_ |= Reduction::reduction(l);
seed = MurmurHash()(last_ & mask);
return true;
/// Returns the MurmurHash of the current rolling seed value after applying long_mask.
uint64_t operator*() const {
	const uint64_t masked_seed = last_ & long_mask;
	return MurmurHash()(masked_seed);
}
/// Advances the iterator to the next seed position and returns *this.
///
/// For every letter consumed the rolling value last_ is shifted left by B bits.
/// A letter for which is_amino_acid() is false contributes no reduction bits and
/// the scan simply continues with the following letter. When no letter is left,
/// ptr_ is moved beyond end_ so that a `ptr_ <= end_` validity check fails.
HashedSeedIterator& operator++() {
	while (ptr_ < end_) {
		last_ <<= B;
		const Letter l = letter_mask(*(ptr_++));
		if (!is_amino_acid(l))
			continue;
		last_ |= Reduction::reduction(l);
		return *this;
	}
	// Input exhausted: step past end_ to mark the iterator as finished.
	++ptr_;
	// The exhausted path must also return the iterator; flowing off the end of a
	// value-returning function is undefined behavior.
	return *this;
}
/// Returns the position sh.length_ letters before the current read pointer.
Letter* seq_ptr(const Shape& sh) const {
	Letter* const seed_start = ptr_ - sh.length_;
	return seed_start;
}
private:
const Letter *ptr_, *end_;
const uint64_t long_mask;
Letter *ptr_, *end_;
uint64_t last_;
};

Expand Down
17 changes: 8 additions & 9 deletions src/data/enum_seeds.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ Search::SeedStats enum_seeds_minimizer(SequenceSet* seqs, F* f, unsigned begin,
template<typename F, uint64_t BITS, typename Filter>
void enum_seeds_hashed(SequenceSet* seqs, F* f, unsigned begin, unsigned end, const Filter* filter, const EnumCfg& cfg)
{
uint64_t key;
for (unsigned i = begin; i < end; ++i) {
if (cfg.skip && (*cfg.skip)[i / align_mode.query_contexts])
continue;
Expand All @@ -80,16 +79,16 @@ void enum_seeds_hashed(SequenceSet* seqs, F* f, unsigned begin, unsigned end, co
for (size_t shape_id = cfg.shape_begin; shape_id < cfg.shape_end; ++shape_id) {
const Shape& sh = shapes[shape_id];
if (seq.length() < sh.length_) continue;
const uint64_t shape_mask = sh.long_mask();
//const __m128i shape_mask = sh.long_mask_sse_;
HashedSeedIterator<BITS> it(seq, sh);
Loc j = 0;
HashedSeedIterator<BITS> it(seqs->ptr(i), seqs->length(i), sh);
while (it.good()) {
if (it.get(key, shape_mask)) {
if (filter->contains(key, shape_id))
(*f)(key, seqs->position(i, j), i, shape_id);
}
++j;
const uint64_t key = *it;
if (filter->contains(key, shape_id))
if (!cfg.filter_low_complexity_seeds || Search::seed_is_complex(it.seq_ptr(sh), sh, cfg.seed_cut))
(*f)(key, seqs->position(i, it.seq_ptr(sh) - seq.data()), i, shape_id);
else if (cfg.mask_low_complexity_seeds)
*it.seq_ptr(sh) |= SEED_MASK;
++it;
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/data/flags.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ using SeedLoc = PackedLoc;

struct EnumCfg {
const std::vector<size_t>* partition;
const size_t shape_begin, shape_end;
size_t shape_begin, shape_end;
const SeedEncoding code;
const std::vector<bool>* const skip;
const bool filter_masked_seeds, mask_seeds;
const double seed_cut;
const MaskingAlgo soft_masking;
const Loc minimizer_window;
const bool filter_low_complexity_seeds, mask_low_complexity_seeds;
};
19 changes: 11 additions & 8 deletions src/data/seed_histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ size_t SeedHistogram::max_chunk_size(const int index_chunks) const
}

template<typename Filter>
SeedHistogram::SeedHistogram(Block& seqs, bool serial, const Filter* filter, SeedEncoding code, const std::vector<bool>* skip, const bool mask_seeds, const double seed_cut, const MaskingAlgo soft_masking, Loc minimizer_window) :
SeedHistogram::SeedHistogram(Block& seqs, bool serial, const Filter* filter, EnumCfg& enum_cfg) :
data_(shapes.count()),
p_(seqs.seqs().partition(config.threads_))
{
Expand Down Expand Up @@ -76,17 +76,20 @@ SeedHistogram::SeedHistogram(Block& seqs, bool serial, const Filter* filter, See
PtrVector<Callback> cb;
for (size_t i = 0; i < p_.size() - 1; ++i)
cb.push_back(new Callback(i, data_));
enum_cfg.partition = &p_;
if (serial)
for (unsigned s = 0; s < shapes.count(); ++s) {
const EnumCfg cfg{ &p_,s,s + 1, code, skip, false, mask_seeds, seed_cut, soft_masking, minimizer_window };
enum_seeds(seqs, cb, filter, cfg);
enum_cfg.shape_begin = s;
enum_cfg.shape_end = s + 1;
enum_seeds(seqs, cb, filter, enum_cfg);
}
else {
const EnumCfg cfg{ &p_, 0, shapes.count(), code, skip, false, mask_seeds, seed_cut, soft_masking, minimizer_window };
enum_seeds(seqs, cb, filter, cfg);
enum_cfg.shape_begin = 0;
enum_cfg.shape_end = shapes.count();
enum_seeds(seqs, cb, filter, enum_cfg);
}
}

template SeedHistogram::SeedHistogram(Block&, bool, const NoFilter*, SeedEncoding, const std::vector<bool>*, const bool, const double, const MaskingAlgo, Loc);
template SeedHistogram::SeedHistogram(Block&, bool, const SeedSet*, SeedEncoding, const std::vector<bool>*, const bool, const double, const MaskingAlgo, Loc);
template SeedHistogram::SeedHistogram(Block&, bool, const HashedSeedSet*, SeedEncoding, const std::vector<bool>*, const bool, const double, const MaskingAlgo, Loc);
template SeedHistogram::SeedHistogram(Block&, bool, const NoFilter*, EnumCfg&);
template SeedHistogram::SeedHistogram(Block&, bool, const SeedSet*, EnumCfg&);
template SeedHistogram::SeedHistogram(Block&, bool, const HashedSeedSet*, EnumCfg&);
2 changes: 1 addition & 1 deletion src/data/seed_histogram.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ struct SeedHistogram
SeedHistogram();

template<typename Filter>
SeedHistogram(Block& seqs, bool serial, const Filter* filter, SeedEncoding code, const std::vector<bool>* skip, const bool mask_seeds, const double seed_cut, const MaskingAlgo soft_masking, Loc minimizer_window);
SeedHistogram(Block& seqs, bool serial, const Filter* filter, EnumCfg& enum_cfg);

const ShapeHistogram& get(unsigned sid) const
{ return data_[sid]; }
Expand Down
4 changes: 2 additions & 2 deletions src/data/seed_set.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ SeedSet::SeedSet(Block &seqs, double max_coverage, const std::vector<bool>* skip
PtrVector<Seed_set_callback> v;
v.push_back(new Seed_set_callback(data_, size_t(max_coverage*pow(Reduction::reduction.size(), shapes[0].length_))));
const auto p = seqs.seqs().partition(1);
const EnumCfg cfg{ &p, 0, 1, SeedEncoding::CONTIGUOUS, skip, true, false, seed_cut, soft_masking, 0 };
const EnumCfg cfg{ &p, 0, 1, SeedEncoding::CONTIGUOUS, skip, true, false, seed_cut, soft_masking, 0, false, false };
enum_seeds(seqs, v, &no_filter, cfg);
coverage_ = (double)v.back().coverage / pow(Reduction::reduction.size(), shapes[0].length_);
}
Expand All @@ -99,7 +99,7 @@ HashedSeedSet::HashedSeedSet(Block &seqs, const std::vector<bool>* skip, const d
PtrVector<Hashed_seed_set_callback> v;
v.push_back(new Hashed_seed_set_callback(data_));
const auto p = seqs.seqs().partition(1);
const EnumCfg cfg{ &p, 0, shapes.count(), SeedEncoding::HASHED, skip, false, false, seed_cut, soft_masking, 0 };
const EnumCfg cfg{ &p, 0, shapes.count(), SeedEncoding::HASHED, skip, false, false, seed_cut, soft_masking, 0, false, false };
enum_seeds(seqs, v, &no_filter, cfg);

vector<size_t> sizes;
Expand Down
2 changes: 1 addition & 1 deletion src/dna/dna_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ ref_buffer_(ref_buffer)

const EnumCfg enum_ref{&ref_hst.partition(), 0, 1, cfg.seed_encoding, nullptr, false, false,
cfg.seed_complexity_cut,
MaskingAlgo::NONE, cfg.minimizer_window};
MaskingAlgo::NONE, cfg.minimizer_window, false, false };


seed_arr_.reset(new SeedArray(*cfg.target, ref_hst.get(0), range, ref_buffer, &no_filter, enum_ref));
Expand Down
Loading

0 comments on commit 03e7f4a

Please sign in to comment.