Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 5c10a7ddd2e443a85bfca2418c2f4d9da2a606e7
Author: Benjamin Buchfink <[email protected]>
Date:   Tue May 30 17:48:52 2023 +0200

    Updated changelog.

commit 253ddffe7fba443ddd0e7390bd49156c0e82169e
Author: Benjamin Buchfink <[email protected]>
Date:   Tue May 30 16:33:29 2023 +0200

    Updated version.

commit 67e9ea316f89df8605fe5c019d1e09be647e531e
Author: Benjamin Buchfink <[email protected]>
Date:   Tue May 30 16:16:57 2023 +0200

    Fixed CLI.

commit 5b94bd8edea81f20b3c963e99fe2be8ca66d7ae5
Author: Benjamin Buchfink <[email protected]>
Date:   Fri May 26 17:01:36 2023 +0200

    CLI.

commit 910d2971d9b7cf9bfc342047586c0ab260f2b326
Author: Benjamin Buchfink <[email protected]>
Date:   Wed May 24 12:05:37 2023 +0200

    Added tantan license.

commit 916c9a26c2df0ee7641f0e4e8a4668c981500f29
Author: Benjamin Buchfink <[email protected]>
Date:   Tue May 23 16:55:12 2023 +0200

    Fixed changelog.

commit a99803c54dda3af26280c52a15ffb41037d04899
Author: Benjamin Buchfink <[email protected]>
Date:   Tue May 23 15:28:43 2023 +0200

    Fixed message.

commit 0470b1655dbdc4c2820302ade7f84dd8115f9e46
Merge: dae33fc2 c7b19e9
Author: Benjamin Buchfink <[email protected]>
Date:   Tue May 23 15:28:31 2023 +0200

    Merged w/master.

commit dae33fc29827c2abe70b2dcfa096d29a1f45129a
Author: Benjamin Buchfink <[email protected]>
Date:   Tue May 23 15:05:21 2023 +0200

    Fixed message.

commit da332d2315409df89d9179ab5c97f9d065ec6ca6
Author: emile151 <[email protected]>
Date:   Wed May 10 10:27:32 2023 +0200

    Dev (#13)

    * Created Documentation function:
    Selects the corresponding possible options depending on the mode selected

    * Arranged print Doc

    * Changed the print_help function to print only possible modes

    * Changed the print_help function to print only possible modes

    * Fixed remarks, prints Documentation when two arguments

    * print_documentation adjusted

commit 504540eb3328423c77215728965b9ed19086fea1
Merge: ac48249f 83cc65a5
Author: Benjamin Buchfink <[email protected]>
Date:   Tue May 2 09:52:18 2023 +0200

    Merge branch 'dev' of https://github.com/bbuchfink/diamond_dev into dev

commit ac48249f477d3ac6a3d6cf3d58acdbbb292a920c
Author: Benjamin Buchfink <[email protected]>
Date:   Mon May 1 21:43:50 2023 +0200

    Added model-seqs command.

commit 83cc65a569b86afc2cc0cebf17f46d7258d2cab9
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Apr 27 17:12:03 2023 +0200

    Added stats for accession parsing.

commit 77c2ffc5cf4bfc207e6df68e4689bbdd5193038b
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Apr 27 16:18:50 2023 +0200

    Allow `--tmpdir` for other workflows.

commit 9db87473bf9dc4a07285b6459420dfbb7ceca983
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Apr 13 10:13:51 2023 +0200

    Fixed error.

commit 27bdbeddcbf323f6e591b8ef34dee4a7f641b857
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Apr 13 10:13:13 2023 +0200

    Fixed errors.

commit c1676e276f6a78345b607a61144e0552fb7ae1fe
Author: Benjamin Buchfink <[email protected]>
Date:   Thu Apr 13 09:14:24 2023 +0200

    Updated changelog.

commit 168a21e1120f263e35959c787f61ad09151667d9
Author: Benjamin Buchfink <[email protected]>
Date:   Wed Apr 12 11:11:48 2023 +0200

    Added tests.

commit 9ab6a5d09c0894c81ea2ab21f55183920a9cce26
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Apr 4 16:58:26 2023 +0200

    Updated test script.

commit 98ecdfbc3cc4215e667254cae772d5ae0daec5ed
Author: Benjamin Buchfink <[email protected]>
Date:   Tue Apr 4 09:30:33 2023 +0200

    Added test script.

commit c475d4b5ea487d089b502e6bbf91747f216b3e9b
Author: Benjamin Buchfink <[email protected]>
Date:   Fri Mar 31 17:15:24 2023 +0200

    Implemented tokenizers for fasta and fastq.

commit ba34eb0186a1fa366e863544a9f158c2d0411c13
Merge: 9542bcaa 82e9959
Author: Benjamin Buchfink <[email protected]>
Date:   Fri Mar 31 15:02:53 2023 +0200

    Merge branch 'master' into dev
  • Loading branch information
bbuchfink committed May 30, 2023
1 parent c7b19e9 commit c0a759d
Show file tree
Hide file tree
Showing 33 changed files with 1,133 additions and 134 deletions.
13 changes: 7 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,7 @@ if(WITH_DNA)
target_link_libraries(diamond wfa2)

target_compile_options(wfa2 PRIVATE -DCMAKE_BUILD_TYPE=Release -DEXTRA_FLAGS="-ftree-vectorize -msse2 -mfpmath=sse -ftree-vectorizer-verbose=5")

endif()
endif()

if(EIGEN_BLAS)
add_definitions(-DEIGEN_USE_BLAS -DEIGEN_USE_LAPACKE)
Expand Down Expand Up @@ -485,7 +484,9 @@ target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS diamond DESTINATION bin)

enable_testing()
add_test(
NAME diamond
COMMAND diamond test
)
SET(TD ${CMAKE_SOURCE_DIR}/src/test)
SET(SP -DTEST_DIR=${CMAKE_SOURCE_DIR}/src/test -P ${CMAKE_SOURCE_DIR}/src/test/test.cmake)
add_test(NAME blastp COMMAND ${CMAKE_COMMAND} -DNAME=blastp "-DARGS=blastp -q ${TD}/1.faa -d ${TD}/2.faa" ${SP})
add_test(NAME blastp-mid-sens COMMAND ${CMAKE_COMMAND} -DNAME=blastp-mid-sens "-DARGS=blastp -q ${TD}/3.faa -d ${TD}/4.faa --mid-sensitive" ${SP})
add_test(NAME blastp-f0 COMMAND ${CMAKE_COMMAND} -DNAME=blastp-f0 "-DARGS=blastp -q ${TD}/1.faa -d ${TD}/2.faa -f 0" ${SP})
add_test(NAME diamond COMMAND diamond test)
8 changes: 8 additions & 0 deletions src/ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
- Fixed a bug that caused taxonomy names not to be loaded correctly for the
`makedb` workflow.
- Fixed a bug that caused a crash when using the `--target-indexed` option.
- Fixed an error when using the `--tmpdir` option for the makedb workflow.
- Added a warning message when sequence accessions are shortened due to parsing
rules for the `makedb` workflow.
- Added the option `--no-parse-seqids` to disable parsing of sequence
accessions.
- Changed the command line help to print options separated by command.
- Fixed an issue where the `--ignore-warnings` option could not be used for
the `makedb` workflow.

[2.1.6]
- Fixed compatibility issues on older systems without support for AVX2.
Expand Down
2 changes: 1 addition & 1 deletion src/basic/basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "../util/util.h"
#include "../stats/standard_matrix.h"

const char* Const::version_string = "2.1.6";
const char* Const::version_string = "2.1.7";
using std::string;
using std::vector;
using std::count;
Expand Down
217 changes: 134 additions & 83 deletions src/basic/config.cpp

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion src/basic/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,8 +354,10 @@ struct Config
match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta = 18, dbinfo = 19, test_extra = 20, test_io = 21, db_annot_stats = 22, read_sim = 23, info = 24, seed_stat = 25,
smith_waterman = 26, cluster = 27, translate = 28, filter_blasttab = 29, show_cbs = 30, simulate_seqs = 31, split = 32, upgma = 33, upgma_mc = 34, regression_test = 35,
reverse_seqs = 36, compute_medoids = 37, mutate = 38, rocid = 40, makeidx = 41, find_shapes, prep_db, composition, JOIN, HASH_SEQS, LIST_SEEDS, CLUSTER_REALIGN,
GREEDY_VERTEX_COVER, INDEX_FASTA, FETCH_SEQ, CLUSTER_REASSIGN, blastn, RECLUSTER, LENGTH_SORT, MERGE_DAA, DEEPCLUST, LINCLUST, WORD_COUNT, CUT
GREEDY_VERTEX_COVER, INDEX_FASTA, FETCH_SEQ, CLUSTER_REASSIGN, blastn, RECLUSTER, LENGTH_SORT, MERGE_DAA, DEEPCLUST, LINCLUST, WORD_COUNT, CUT, MODEL_SEQS
};


unsigned command;

enum class Algo { AUTO = -1, DOUBLE_INDEXED = 0, QUERY_INDEXED = 1, CTG_SEED };
Expand Down
2 changes: 1 addition & 1 deletion src/basic/const.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ struct Const
{

enum {
build_version = 160,
build_version = 161,
#ifdef SINGLE_THREADED
seedp_bits = 0,
#else
Expand Down
2 changes: 2 additions & 0 deletions src/data/blastdb/blastdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,10 @@ BlastDB::BlastDB(const std::string& file_name, Metadata metadata, Flags flags, c
sequence_count_(db_->GetNumOIDs()),
sparse_sequence_count_(db_->GetNumSeqs())
{
#ifndef EXTRA
if (flag_any(metadata, Metadata::TAXON_NODES | Metadata::TAXON_MAPPING | Metadata::TAXON_SCIENTIFIC_NAMES | Metadata::TAXON_RANKS))
throw std::runtime_error("Taxonomy features are not supported for the BLAST database format.");
#endif
vector<string> paths;
CSeqDB::FindVolumePaths(file_name, CSeqDB::eProtein, paths);
for (const string& db : paths)
Expand Down
9 changes: 7 additions & 2 deletions src/data/dmnd/dmnd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ void DatabaseFile::make_db()
const FASTA_format format;
vector<SeqInfo> pos_array;
ExternalSorter<pair<string, OId>> accessions;
AccessionParsing acc_stats;
try {
while (true) {
timer.go("Loading sequences");
Expand All @@ -285,7 +286,7 @@ void DatabaseFile::make_db()
if (!config.prot_accession2taxid.empty()) {
timer.go("Writing accessions");
for (size_t i = 0; i < n; ++i) {
vector<string> acc = accession_from_title(block->ids()[i]);
vector<string> acc = accession_from_title(block->ids()[i], acc_stats);
for (const string& s : acc)
accessions.push(std::make_pair(s, total_seqs + i));
}
Expand Down Expand Up @@ -317,9 +318,13 @@ void DatabaseFile::make_db()
pos_array.shrink_to_fit();
timer.finish();

if (!config.prot_accession2taxid.empty() && !config.no_parse_seqids)
message_stream << endl << "Accession parsing rules triggered for database seqids (use --no-parse-seqids to disable):" << endl << acc_stats << endl;

Util::Table stats;
stats("Database sequences", n_seqs);
stats("Database letters", letters);

taxonomy.init();
if (!config.prot_accession2taxid.empty()) {
header2.taxon_array_offset = out->tell();
Expand Down Expand Up @@ -397,7 +402,7 @@ void DatabaseFile::skip_seq()
}

bool DatabaseFile::is_diamond_db(const string &file_name) {
if (file_name == "-")
if (file_name == "-" || file_name.empty())
return false;
InputFile db_file(file_name);
uint64_t magic_number = 0;
Expand Down
2 changes: 1 addition & 1 deletion src/data/fasta/fasta_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ struct FastaFile : public SequenceFile
std::list<TextInputFile> file_;
std::list<TextInputFile>::iterator file_ptr_;
std::unique_ptr<OutputFile> out_file_;
std::unique_ptr<const Sequence_file_format> format_;
std::unique_ptr<const SequenceFileFormat> format_;
OId oid_;
int64_t seqs_, letters_;

Expand Down
17 changes: 13 additions & 4 deletions src/data/taxon_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,14 @@ static int mapping_file_format(const string& header) {
throw std::runtime_error("Accession mapping file header has to be in one of these formats:\naccession\taccession.version\ttaxid\tgi\naccession.version\ttaxid");
}

static void load_mapping_file(ExternalSorter<pair<string, TaxId>>& sorter)
static AccessionParsing load_mapping_file(ExternalSorter<pair<string, TaxId>>& sorter)
{
TaxId taxid;
TextInputFile f(config.prot_accession2taxid);
f.getline();
int format = mapping_file_format(f.line);
string accession, last;
AccessionParsing stats;

while (!f.eof() && (f.getline(), !f.line.empty())) {
try {
Expand All @@ -79,12 +80,16 @@ static void load_mapping_file(ExternalSorter<pair<string, TaxId>>& sorter)

if (!config.no_parse_seqids) {
size_t i = accession.find(":PDB=");
if (i != string::npos)
if (i != string::npos) {
accession.erase(i);
++stats.pdb_suffix;
}

i = accession.find_last_of('.');
if (i != string::npos)
if (i != string::npos) {
accession.erase(i);
++stats.suffix_after_dot;
}
}

if (accession != last)
Expand All @@ -93,13 +98,14 @@ static void load_mapping_file(ExternalSorter<pair<string, TaxId>>& sorter)
last = accession;
}
f.close();
return stats;
}

void TaxonList::build(OutputFile &db, ExternalSorter<pair<string, OId>>& acc2oid, OId seqs, Util::Table& stats)
{
TaskTimer timer("Loading taxonomy mapping file");
ExternalSorter<pair<string, TaxId>> acc2taxid;
load_mapping_file(acc2taxid);
const AccessionParsing acc_stats = load_mapping_file(acc2taxid);

timer.go("Joining accession mapping");
acc2taxid.init_read();
Expand Down Expand Up @@ -135,4 +141,7 @@ void TaxonList::build(OutputFile &db, ExternalSorter<pair<string, OId>>& acc2oid
stats("Entries in accession to taxid file", acc2taxid.count());
stats("Database accessions mapped to taxid" , acc_matched);
stats("Database sequences mapped to taxid", mapped_seqs);

if (!config.no_parse_seqids)
message_stream << endl << "Accession parsing rules triggered for mapping file seqids (use --no-parse-seqids to disable):" << endl << acc_stats << endl;
}
32 changes: 26 additions & 6 deletions src/data/taxonomy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,27 +61,35 @@ Rank::Rank(const char *s) {

Taxonomy taxonomy;

string get_accession(const string &title)
string get_accession(const string &title, AccessionParsing& stat)
{
if (config.no_parse_seqids)
return title;
size_t i;
string t(title);
if (t.compare(0, 6, "UniRef") == 0)
if (t.compare(0, 6, "UniRef") == 0) {
t.erase(0, t.find('_', 0) + 1);
++stat.uniref_prefix;
}
else if ((i = t.find_first_of('|', 0)) != string::npos) {
if (t.compare(0, 3, "gi|") == 0) {
t.erase(0, t.find_first_of('|', i + 1) + 1);
i = t.find_first_of('|', 0);
++stat.gi_prefix;
}
t.erase(0, i + 1);
++stat.prefix_before_pipe;
i = t.find_first_of('|', 0);
if (i != string::npos)
if (i != string::npos) {
t.erase(i);
++stat.suffix_after_pipe;
}
}
i = t.find_last_of('.');
if (i != string::npos)
if (i != string::npos) {
t.erase(i);
++stat.suffix_after_dot;
}
return t;
}

Expand Down Expand Up @@ -116,10 +124,22 @@ void Taxonomy::init()
}
}

vector<string> accession_from_title(const char *title)
vector<string> accession_from_title(const char *title, AccessionParsing& stat)
{
vector<string> t(seq_titles(title));
for (vector<string>::iterator i = t.begin(); i < t.end(); ++i)
*i = get_accession(Util::Seq::seqid(i->c_str(), false));
*i = get_accession(Util::Seq::seqid(i->c_str(), false), stat);
return t;
}

std::ostream& operator<<(std::ostream& s, const AccessionParsing& stat) {
Util::Table t;
t("UniRef prefix", stat.uniref_prefix);
t("gi|xxx| prefix", stat.gi_prefix);
t("xxx| prefix", stat.prefix_before_pipe);
t("|xxx suffix", stat.suffix_after_pipe);
t(".xxx suffix", stat.suffix_after_dot);
t(":PDB= suffix", stat.pdb_suffix);
s << t;
return s;
}
17 changes: 15 additions & 2 deletions src/data/taxonomy.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,21 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
#include "taxonomy_nodes.h"
#include "../util/data_structures/bit_vector.h"

std::string get_accession(const std::string &t);
std::vector<std::string> accession_from_title(const char *title);
struct AccessionParsing {
AccessionParsing():
uniref_prefix(0),
gi_prefix(0),
prefix_before_pipe(0),
suffix_after_pipe(0),
suffix_after_dot(0),
pdb_suffix(0)
{}
friend std::ostream& operator<<(std::ostream& s, const AccessionParsing& stat);
int64_t uniref_prefix, gi_prefix, prefix_before_pipe, suffix_after_pipe, suffix_after_dot, pdb_suffix;
};

std::string get_accession(const std::string &t, AccessionParsing& stat);
std::vector<std::string> accession_from_title(const char *title, AccessionParsing& stat);

struct Taxonomy
{
Expand Down
Loading

0 comments on commit c0a759d

Please sign in to comment.