Squashed commit of the following:

commit 5c10a7ddd2e443a85bfca2418c2f4d9da2a606e7 Author: Benjamin Buchfink <[email protected]> Date: Tue May 30 17:48:52 2023 +0200 Updated changelog. commit 253ddffe7fba443ddd0e7390bd49156c0e82169e Author: Benjamin Buchfink <[email protected]> Date: Tue May 30 16:33:29 2023 +0200 Updated version. commit 67e9ea316f89df8605fe5c019d1e09be647e531e Author: Benjamin Buchfink <[email protected]> Date: Tue May 30 16:16:57 2023 +0200 Fixed CLI. commit 5b94bd8edea81f20b3c963e99fe2be8ca66d7ae5 Author: Benjamin Buchfink <[email protected]> Date: Fri May 26 17:01:36 2023 +0200 CLI. commit 910d2971d9b7cf9bfc342047586c0ab260f2b326 Author: Benjamin Buchfink <[email protected]> Date: Wed May 24 12:05:37 2023 +0200 Added tantan license. commit 916c9a26c2df0ee7641f0e4e8a4668c981500f29 Author: Benjamin Buchfink <[email protected]> Date: Tue May 23 16:55:12 2023 +0200 Fixed changelog. commit a99803c54dda3af26280c52a15ffb41037d04899 Author: Benjamin Buchfink <[email protected]> Date: Tue May 23 15:28:43 2023 +0200 Fixed message. commit 0470b1655dbdc4c2820302ade7f84dd8115f9e46 Merge: dae33fc2 c7b19e9 Author: Benjamin Buchfink <[email protected]> Date: Tue May 23 15:28:31 2023 +0200 Merged w/master. commit dae33fc29827c2abe70b2dcfa096d29a1f45129a Author: Benjamin Buchfink <[email protected]> Date: Tue May 23 15:05:21 2023 +0200 Fixed message. 
commit da332d2315409df89d9179ab5c97f9d065ec6ca6 Author: emile151 <[email protected]> Date: Wed May 10 10:27:32 2023 +0200 Dev (#13) * Created Documentation function: Selects the corresponding possible options depending on the mode selected * Arranged print Doc * Changed the print_help function to print only possible modes * Changed the print_help function to print only possible modes * Fixed remarks, prints Documentation when two arguments * print_documentation adjusted commit 504540eb3328423c77215728965b9ed19086fea1 Merge: ac48249f 83cc65a5 Author: Benjamin Buchfink <[email protected]> Date: Tue May 2 09:52:18 2023 +0200 Merge branch 'dev' of https://github.com/bbuchfink/diamond_dev into dev commit ac48249f477d3ac6a3d6cf3d58acdbbb292a920c Author: Benjamin Buchfink <[email protected]> Date: Mon May 1 21:43:50 2023 +0200 Added model-seqs command. commit 83cc65a569b86afc2cc0cebf17f46d7258d2cab9 Author: Benjamin Buchfink <[email protected]> Date: Thu Apr 27 17:12:03 2023 +0200 Added stats for accession parsing. commit 77c2ffc5cf4bfc207e6df68e4689bbdd5193038b Author: Benjamin Buchfink <[email protected]> Date: Thu Apr 27 16:18:50 2023 +0200 Allow `--tmpdir` for other workflows. commit 9db87473bf9dc4a07285b6459420dfbb7ceca983 Author: Benjamin Buchfink <[email protected]> Date: Thu Apr 13 10:13:51 2023 +0200 Fixed error. commit 27bdbeddcbf323f6e591b8ef34dee4a7f641b857 Author: Benjamin Buchfink <[email protected]> Date: Thu Apr 13 10:13:13 2023 +0200 Fixed errors. commit c1676e276f6a78345b607a61144e0552fb7ae1fe Author: Benjamin Buchfink <[email protected]> Date: Thu Apr 13 09:14:24 2023 +0200 Updated changelog. commit 168a21e1120f263e35959c787f61ad09151667d9 Author: Benjamin Buchfink <[email protected]> Date: Wed Apr 12 11:11:48 2023 +0200 Added tests. commit 9ab6a5d09c0894c81ea2ab21f55183920a9cce26 Author: Benjamin Buchfink <[email protected]> Date: Tue Apr 4 16:58:26 2023 +0200 Updated test script. 
commit 98ecdfbc3cc4215e667254cae772d5ae0daec5ed Author: Benjamin Buchfink <[email protected]> Date: Tue Apr 4 09:30:33 2023 +0200 Added test script. commit c475d4b5ea487d089b502e6bbf91747f216b3e9b Author: Benjamin Buchfink <[email protected]> Date: Fri Mar 31 17:15:24 2023 +0200 Implemented tokenizers for fasta and fastq. commit ba34eb0186a1fa366e863544a9f158c2d0411c13 Merge: 9542bcaa 82e9959 Author: Benjamin Buchfink <[email protected]> Date: Fri Mar 31 15:02:53 2023 +0200 Merge branch 'master' into dev
bbuchfink · May 30, 2023 · c0a759d · c0a759d
1 parent c7b19e9
commit c0a759d
Show file tree

Hide file tree

Showing 33 changed files with 1,133 additions and 134 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -455,8 +455,7 @@ if(WITH_DNA)
   target_link_libraries(diamond  wfa2)
 
   target_compile_options(wfa2 PRIVATE -DCMAKE_BUILD_TYPE=Release -DEXTRA_FLAGS="-ftree-vectorize -msse2 -mfpmath=sse -ftree-vectorizer-verbose=5")
-
-  endif()
+endif()
 
 if(EIGEN_BLAS)
   add_definitions(-DEIGEN_USE_BLAS -DEIGEN_USE_LAPACKE)
@@ -485,7 +484,9 @@ target_link_libraries(diamond ${ZLIB_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
 install(TARGETS diamond DESTINATION bin)
 
 enable_testing()
-add_test(
-        NAME diamond
-        COMMAND diamond test
-)
+SET(TD ${CMAKE_SOURCE_DIR}/src/test)
+SET(SP -DTEST_DIR=${CMAKE_SOURCE_DIR}/src/test -P ${CMAKE_SOURCE_DIR}/src/test/test.cmake)
+add_test(NAME blastp COMMAND ${CMAKE_COMMAND} -DNAME=blastp "-DARGS=blastp -q ${TD}/1.faa -d ${TD}/2.faa" ${SP})
+add_test(NAME blastp-mid-sens COMMAND ${CMAKE_COMMAND} -DNAME=blastp-mid-sens "-DARGS=blastp -q ${TD}/3.faa -d ${TD}/4.faa --mid-sensitive" ${SP})
+add_test(NAME blastp-f0 COMMAND ${CMAKE_COMMAND} -DNAME=blastp-f0 "-DARGS=blastp -q ${TD}/1.faa -d ${TD}/2.faa -f 0" ${SP})
+add_test(NAME diamond COMMAND diamond test)
diff --git a/src/ChangeLog b/src/ChangeLog
@@ -2,6 +2,14 @@
 - Fixed a bug that caused taxonomy names not to be loaded correctly for the
   `makedb` workflow.
 - Fixed a bug that caused a crash when using the `--target-indexed` option.
+- Fixed an error when using the `--tmpdir` option for the `makedb` workflow.
+- Added a warning message when sequence accessions are shortened due to parsing
+  rules for the `makedb` workflow.
+- Added the option `--no-parse-seqids` to disable parsing of sequence
+  accessions.
+- Changed the command line help to print options separated by command.
+- Fixed an issue that prevented the `--ignore-warnings` option from being used
+  for the `makedb` workflow.
 
 [2.1.6]
 - Fixed compatibility issues on older systems without support for AVX2.

diff --git a/src/basic/basic.cpp b/src/basic/basic.cpp
@@ -29,7 +29,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "../util/util.h"
 #include "../stats/standard_matrix.h"
 
-const char* Const::version_string = "2.1.6";
+const char* Const::version_string = "2.1.7";
 using std::string;
 using std::vector;
 using std::count;

diff --git a/src/basic/config.cpp b/src/basic/config.cpp
diff --git a/src/basic/config.h b/src/basic/config.h
@@ -354,8 +354,10 @@ struct Config
 		match_file_stat = 14, model_seqs = 15, opt = 16, mask = 17, fastq2fasta = 18, dbinfo = 19, test_extra = 20, test_io = 21, db_annot_stats = 22, read_sim = 23, info = 24, seed_stat = 25,
 		smith_waterman = 26, cluster = 27, translate = 28, filter_blasttab = 29, show_cbs = 30, simulate_seqs = 31, split = 32, upgma = 33, upgma_mc = 34, regression_test = 35,
 		reverse_seqs = 36, compute_medoids = 37, mutate = 38, rocid = 40, makeidx = 41, find_shapes, prep_db, composition, JOIN, HASH_SEQS, LIST_SEEDS, CLUSTER_REALIGN,
-		GREEDY_VERTEX_COVER, INDEX_FASTA, FETCH_SEQ, CLUSTER_REASSIGN, blastn, RECLUSTER, LENGTH_SORT, MERGE_DAA, DEEPCLUST, LINCLUST, WORD_COUNT, CUT
+		GREEDY_VERTEX_COVER, INDEX_FASTA, FETCH_SEQ, CLUSTER_REASSIGN, blastn, RECLUSTER, LENGTH_SORT, MERGE_DAA, DEEPCLUST, LINCLUST, WORD_COUNT, CUT, MODEL_SEQS
 	};
+
+
 	unsigned	command;
 
 	enum class Algo { AUTO = -1, DOUBLE_INDEXED = 0, QUERY_INDEXED = 1, CTG_SEED };

diff --git a/src/basic/const.h b/src/basic/const.h
@@ -25,7 +25,7 @@ struct Const
 {
 
 	enum {
-		build_version = 160,
+		build_version = 161,
 #ifdef SINGLE_THREADED
 		seedp_bits = 0,
 #else

diff --git a/src/data/blastdb/blastdb.cpp b/src/data/blastdb/blastdb.cpp
@@ -96,8 +96,10 @@ BlastDB::BlastDB(const std::string& file_name, Metadata metadata, Flags flags, c
 	sequence_count_(db_->GetNumOIDs()),
 	sparse_sequence_count_(db_->GetNumSeqs())
 {
+#ifndef EXTRA
 	if (flag_any(metadata, Metadata::TAXON_NODES | Metadata::TAXON_MAPPING | Metadata::TAXON_SCIENTIFIC_NAMES | Metadata::TAXON_RANKS))
 		throw std::runtime_error("Taxonomy features are not supported for the BLAST database format.");
+#endif
 	vector<string> paths;
 	CSeqDB::FindVolumePaths(file_name, CSeqDB::eProtein, paths);
 	for (const string& db : paths)

diff --git a/src/data/dmnd/dmnd.cpp b/src/data/dmnd/dmnd.cpp
@@ -260,6 +260,7 @@ void DatabaseFile::make_db()
 	const FASTA_format format;
 	vector<SeqInfo> pos_array;
 	ExternalSorter<pair<string, OId>> accessions;
+	AccessionParsing acc_stats;
 	try {
 		while (true) {
 			timer.go("Loading sequences");
@@ -285,7 +286,7 @@ void DatabaseFile::make_db()
 			if (!config.prot_accession2taxid.empty()) {
 				timer.go("Writing accessions");
 				for (size_t i = 0; i < n; ++i) {
-					vector<string> acc = accession_from_title(block->ids()[i]);
+					vector<string> acc = accession_from_title(block->ids()[i], acc_stats);
 					for (const string& s : acc)
 						accessions.push(std::make_pair(s, total_seqs + i));
 				}
@@ -317,9 +318,13 @@ void DatabaseFile::make_db()
 	pos_array.shrink_to_fit();
 	timer.finish();
 
+	if (!config.prot_accession2taxid.empty() && !config.no_parse_seqids)
+		message_stream << endl << "Accession parsing rules triggered for database seqids (use --no-parse-seqids to disable):" << endl << acc_stats << endl;
+
 	Util::Table stats;
 	stats("Database sequences", n_seqs);
 	stats("Database letters", letters);
+
 	taxonomy.init();
 	if (!config.prot_accession2taxid.empty()) {
 		header2.taxon_array_offset = out->tell();
@@ -397,7 +402,7 @@ void DatabaseFile::skip_seq()
 }
 
 bool DatabaseFile::is_diamond_db(const string &file_name) {
-	if (file_name == "-")
+	if (file_name == "-" || file_name.empty())
 		return false;
 	InputFile db_file(file_name);
 	uint64_t magic_number = 0;

diff --git a/src/data/fasta/fasta_file.h b/src/data/fasta/fasta_file.h
@@ -62,7 +62,7 @@ struct FastaFile : public SequenceFile
 	std::list<TextInputFile> file_;
 	std::list<TextInputFile>::iterator file_ptr_;
 	std::unique_ptr<OutputFile> out_file_;
-	std::unique_ptr<const Sequence_file_format> format_;
+	std::unique_ptr<const SequenceFileFormat> format_;
 	OId oid_;
 	int64_t seqs_, letters_;
 

diff --git a/src/data/taxon_list.cpp b/src/data/taxon_list.cpp
@@ -55,13 +55,14 @@ static int mapping_file_format(const string& header) {
 	throw std::runtime_error("Accession mapping file header has to be in one of these formats:\naccession\taccession.version\ttaxid\tgi\naccession.version\ttaxid");
 }
 
-static void load_mapping_file(ExternalSorter<pair<string, TaxId>>& sorter)
+static AccessionParsing load_mapping_file(ExternalSorter<pair<string, TaxId>>& sorter)
 {
 	TaxId taxid;
 	TextInputFile f(config.prot_accession2taxid);
 	f.getline();
 	int format = mapping_file_format(f.line);
 	string accession, last;
+	AccessionParsing stats;
 
 	while (!f.eof() && (f.getline(), !f.line.empty())) {
 		try {
@@ -79,12 +80,16 @@ static void load_mapping_file(ExternalSorter<pair<string, TaxId>>& sorter)
 
 		if (!config.no_parse_seqids) {
 			size_t i = accession.find(":PDB=");
-			if (i != string::npos)
+			if (i != string::npos) {
 				accession.erase(i);
+				++stats.pdb_suffix;
+			}
 
 			i = accession.find_last_of('.');
-			if (i != string::npos)
+			if (i != string::npos) {
 				accession.erase(i);
+				++stats.suffix_after_dot;
+			}
 		}
 
 		if (accession != last)
@@ -93,13 +98,14 @@ static void load_mapping_file(ExternalSorter<pair<string, TaxId>>& sorter)
 		last = accession;
 	}
 	f.close();
+	return stats;
 }
 
 void TaxonList::build(OutputFile &db, ExternalSorter<pair<string, OId>>& acc2oid, OId seqs, Util::Table& stats)
 {
 	TaskTimer timer("Loading taxonomy mapping file");
 	ExternalSorter<pair<string, TaxId>> acc2taxid;
-	load_mapping_file(acc2taxid);
+	const AccessionParsing acc_stats = load_mapping_file(acc2taxid);
 
 	timer.go("Joining accession mapping");
 	acc2taxid.init_read();
@@ -135,4 +141,7 @@ void TaxonList::build(OutputFile &db, ExternalSorter<pair<string, OId>>& acc2oid
 	stats("Entries in accession to taxid file", acc2taxid.count());
 	stats("Database accessions mapped to taxid" , acc_matched);
 	stats("Database sequences mapped to taxid", mapped_seqs);
+
+	if (!config.no_parse_seqids)
+		message_stream << endl << "Accession parsing rules triggered for mapping file seqids (use --no-parse-seqids to disable):" << endl << acc_stats << endl;
 }
diff --git a/src/data/taxonomy.cpp b/src/data/taxonomy.cpp
@@ -61,27 +61,35 @@ Rank::Rank(const char *s) {
 
 Taxonomy taxonomy;
 
-string get_accession(const string &title)
+string get_accession(const string &title, AccessionParsing& stat)
 {
 	if (config.no_parse_seqids)
 		return title;
 	size_t i;
 	string t(title);
-	if (t.compare(0, 6, "UniRef") == 0)
+	if (t.compare(0, 6, "UniRef") == 0) {
 		t.erase(0, t.find('_', 0) + 1);
+		++stat.uniref_prefix;
+	}
 	else if ((i = t.find_first_of('|', 0)) != string::npos) {
 		if (t.compare(0, 3, "gi|") == 0) {
 			t.erase(0, t.find_first_of('|', i + 1) + 1);
 			i = t.find_first_of('|', 0);
+			++stat.gi_prefix;
 		}
 		t.erase(0, i + 1);
+		++stat.prefix_before_pipe;
 		i = t.find_first_of('|', 0);
-		if (i != string::npos)
+		if (i != string::npos) {
 			t.erase(i);
+			++stat.suffix_after_pipe;
+		}
 	}
 	i = t.find_last_of('.');
-	if (i != string::npos)
+	if (i != string::npos) {
 		t.erase(i);
+		++stat.suffix_after_dot;
+	}
 	return t;
 }
 
@@ -116,10 +124,22 @@ void Taxonomy::init()
 	}
 }
 
-vector<string> accession_from_title(const char *title)
+vector<string> accession_from_title(const char *title, AccessionParsing& stat)
 {
 	vector<string> t(seq_titles(title));
 	for (vector<string>::iterator i = t.begin(); i < t.end(); ++i)
-		*i = get_accession(Util::Seq::seqid(i->c_str(), false));
+		*i = get_accession(Util::Seq::seqid(i->c_str(), false), stat);
 	return t;
 }
+
+std::ostream& operator<<(std::ostream& s, const AccessionParsing& stat) { // Writes the accession-parsing counters of `stat` to `s` and returns `s`.
+	Util::Table t; // presumably one labeled row per call below; Util::Table is defined elsewhere - confirm
+	t("UniRef prefix", stat.uniref_prefix); // incremented in get_accession when a title starts with "UniRef"
+	t("gi|xxx| prefix", stat.gi_prefix); // incremented in get_accession when a title starts with "gi|"
+	t("xxx| prefix", stat.prefix_before_pipe); // incremented when text up to and including a '|' was erased
+	t("|xxx suffix", stat.suffix_after_pipe); // incremented when text from a further '|' onward was erased
+	t(".xxx suffix", stat.suffix_after_dot); // incremented when text from the last '.' onward was erased
+	t(":PDB= suffix", stat.pdb_suffix); // incremented by the mapping file loader when ":PDB=" and the rest was erased
+	s << t; // the table is streamed in one step; its layout is decided by Util::Table
+	return s; // returning the stream allows chaining, e.g. `message_stream << acc_stats << endl`
+}
diff --git a/src/data/taxonomy.h b/src/data/taxonomy.h
@@ -31,8 +31,21 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #include "taxonomy_nodes.h"
 #include "../util/data_structures/bit_vector.h"
 
-std::string get_accession(const std::string &t);
-std::vector<std::string> accession_from_title(const char *title);
+struct AccessionParsing { // Tally of how often each accession-shortening rule fired while parsing seqids.
+	AccessionParsing(): // Every counter starts at zero.
+		uniref_prefix(0), // title began with "UniRef"; everything through the first '_' was erased
+		gi_prefix(0), // title began with "gi|"; the leading gi field was erased
+		prefix_before_pipe(0), // title contained '|'; text up to and including a '|' was erased
+		suffix_after_pipe(0), // a further '|' remained; it and everything after it was erased
+		suffix_after_dot(0), // accession contained '.'; the last '.' and everything after it was erased
+		pdb_suffix(0) // mapping file accession contained ":PDB="; that marker and the rest was erased
+	{}
+	friend std::ostream& operator<<(std::ostream& s, const AccessionParsing& stat); // prints the counters via a Util::Table
+	int64_t uniref_prefix, gi_prefix, prefix_before_pipe, suffix_after_pipe, suffix_after_dot, pdb_suffix; // one counter per rule; updated by get_accession and by the mapping file loader
+};
+
+std::string get_accession(const std::string &t, AccessionParsing& stat);
+std::vector<std::string> accession_from_title(const char *title, AccessionParsing& stat);
 
 struct Taxonomy
 {