From 7af564dd69f11244886809b3de7db28f9835b64a Mon Sep 17 00:00:00 2001 From: Kirill Bessonov Date: Mon, 3 Jun 2024 18:00:57 -0400 Subject: [PATCH] towards relase 3.1.9 and issue fixes #160 and #163 --- .github/workflows/github-actions.yaml | 41 ++++++++++++ README.md | 2 +- mob_suite/conda/meta.yaml | 2 +- mob_suite/mob_init.py | 2 + mob_suite/mob_recon.py | 66 ++++++++++--------- mob_suite/tests/test_mobrecon.py | 41 +++++++++++- .../test_mobtyper_vs_biomarker_report.py | 2 +- mob_suite/utils.py | 31 +++++---- setup.py | 4 +- 9 files changed, 141 insertions(+), 50 deletions(-) create mode 100644 .github/workflows/github-actions.yaml diff --git a/.github/workflows/github-actions.yaml b/.github/workflows/github-actions.yaml new file mode 100644 index 0000000..5bd4093 --- /dev/null +++ b/.github/workflows/github-actions.yaml @@ -0,0 +1,41 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python application + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +permissions: + contents: read + +jobs: + build: + + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install mash ncbi-blast+ libcurl4-openssl-dev libssl-dev ca-certificates -y + sudo apt-get install python3-pip python3-dev python3-numpy python3-pandas -y + python3 -m pip install --upgrade pip setuptools + pip3 install pytest Cython + if [ -f requirements.txt ]; then + pip3 install -r requirements.txt; + else + pip3 install -e . + fi + mob_init + - name: Test with pytest + run: | + pytest -o log_cli=true --basetemp=tmp-pytest diff --git a/README.md b/README.md index 83c8759..2682d2e 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Provides in silico predictions of the replicon family, relaxase type, mate-pair + Python >= 3.7 + ete3 >= 3.1.3 (due to updated taxonomy database init) + pandas >= 0.22.0,<=1.05 -+ biopython >= 1.80 ++ biopython >= 1.80,<2 + pytables >= 3.3 + pycurl >= 7.43 + numpy >= 1.11.1 diff --git a/mob_suite/conda/meta.yaml b/mob_suite/conda/meta.yaml index da5faf2..5698bfb 100644 --- a/mob_suite/conda/meta.yaml +++ b/mob_suite/conda/meta.yaml @@ -25,7 +25,7 @@ requirements: - numpy >=1.11.1,<1.23.5 - pytables >=3.3,<4 - pandas >=0.22.0,<=1.0.5 - - biopython >=1.70,<2 + - biopython >=1.80,<2 - pycurl >=7.43,<8 - scipy >=1.1,<2 - ete3 >=3.0,<4 diff --git a/mob_suite/mob_init.py b/mob_suite/mob_init.py index d6c08a7..aee6f7f 100644 --- a/mob_suite/mob_init.py +++ b/mob_suite/mob_init.py @@ -88,6 +88,8 @@ def download_to_file(url, file): c.setopt(c.FOLLOWLOCATION, True) c.setopt(c.WRITEDATA, f) c.setopt(c.NOPROGRESS, False) + c.setopt(pycurl.SSL_VERIFYPEER, 0) + c.setopt(pycurl.SSL_VERIFYHOST, 0) c.perform() c.close() diff --git a/mob_suite/mob_recon.py b/mob_suite/mob_recon.py index 975542a..f19de38 100644 --- a/mob_suite/mob_recon.py +++ b/mob_suite/mob_recon.py @@ -424,25 +424,23 @@ def calc_cluster_scores(reference_hit_coverage): return OrderedDict(sorted(iter(list(cluster_scores.items())), key=lambda x: x[1], reverse=True)) -def get_contig_link_counts(reference_hit_coverage, cluster_contig_links): +def get_contig_link_counts(reference_hit_coverage, cluster_contig_links, contig_info): cluster_scores = calc_cluster_scores(reference_hit_coverage) - contig_link_counts = {} + # for each contig how many clusters a given contig is associated with, + # their cluster ids and their scores plus the contig size and # of links to plasmid clusters contig_clust_assoc = {} for clust_id in cluster_contig_links: contigs = cluster_contig_links[clust_id] for contig_id in contigs: - if not contig_id in contig_link_counts: - contig_link_counts[contig_id] = 0 - contig_clust_assoc[contig_id] = {} - contig_link_counts[contig_id] += 1 - contig_clust_assoc[contig_id][clust_id] = cluster_scores[clust_id] - - for contig_id in contig_clust_assoc: - contig_clust_assoc[contig_id] = OrderedDict( - sorted(iter(contig_clust_assoc[contig_id].items()), key=lambda x: x[1], reverse=True)) - - return OrderedDict(sorted(iter(contig_link_counts.items()), key=lambda x: x[1], reverse=False)) + if not contig_id in contig_clust_assoc: + contig_clust_assoc[contig_id] = {'links2clusters':0, 'contig_size': 0, 'cluster_ids':{}} + contig_clust_assoc[contig_id]['links2clusters'] += 1 + contig_clust_assoc[contig_id]['contig_size'] = int(contig_info[contig_id]['size']) + contig_clust_assoc[contig_id]['cluster_ids'][clust_id] = cluster_scores[clust_id] + contig_link_counts=sorted(contig_clust_assoc.items(),key=lambda x: (x[1]['links2clusters'], -x[1]['contig_size']), reverse=False) + return OrderedDict([(i[0], i[1]['links2clusters']) for i in contig_link_counts]) + def get_contigs_by_key_value(contig_info, column_key, reasons): @@ -472,20 +470,19 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_ replicon_contigs = get_contigs_with_value_set(contig_info, 'rep_type(s)') relaxase_contigs = get_contigs_with_value_set(contig_info, 'relaxase_type(s)') - contig_list = list(contig_reference_coverage.keys()) + #contig_list = list(contig_reference_coverage.keys()) for contig_id in filtered_contigs: if contig_id in contig_reference_coverage: del(contig_reference_coverage[contig_id]) if contig_id in contig_reference_coverage: del (contig_reference_coverage[contig_id]) - + cluster_contig_links = get_seq_links(contig_reference_coverage, reference_sequence_meta) - contig_link_counts = get_contig_link_counts(reference_hit_coverage, cluster_contig_links) - cluster_scores = calc_cluster_scores(reference_hit_coverage) + contig_link_counts = get_contig_link_counts(reference_hit_coverage, cluster_contig_links, contig_info) + cluster_scores = calc_cluster_scores(reference_hit_coverage) #sort cluster scores in descending order contig_cluster_scores = {} - for clust_id in cluster_contig_links: for contig_id in cluster_contig_links[clust_id]: if contig_id in filtered_contigs: @@ -494,11 +491,11 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_ if not contig_id in contig_cluster_scores: contig_cluster_scores[contig_id] = {} contig_cluster_scores[contig_id][clust_id] = float(score) - + for contig_id in contig_cluster_scores: contig_cluster_scores[contig_id] = OrderedDict( sorted(iter(contig_cluster_scores[contig_id].items()), key=lambda x: x[1], reverse=True)) - + black_list_clusters = {} group_membership = {} # assign circular contigs with replicon or relaxase first @@ -521,8 +518,8 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_ ref_hit_id] del (contig_reference_coverage[contig_id]) break - - cluster_scores = calc_cluster_scores(reference_hit_coverage) + + cluster_scores = calc_cluster_scores(reference_hit_coverage) #re-sort cluster scores in descending order # find plasmids well covered by contigs high_confidence_references = {} @@ -533,8 +530,8 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_ if coverage > 0.8: high_confidence_references[ref_id] = score - - # Assign contigs according to highly coverged plasmids + + # Assign contigs according to highly coverged plasmid clusters high_confidence_references = OrderedDict( sorted(iter(high_confidence_references.items()), key=lambda x: x[1], reverse=True)) @@ -556,20 +553,21 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_ reference_hit_coverage[ref_hit_id]['score'] -= contig_reference_coverage[contig_id][ ref_hit_id] del (contig_reference_coverage[contig_id]) - cluster_scores = calc_cluster_scores(reference_hit_coverage) - + cluster_scores = calc_cluster_scores(reference_hit_coverage) #re-sort cluster scores in descending order + # Assign low linkage contigs first + # Note a given contig can be mapped to MULTIPLE plasmid clusters, so order of contigs presented for mapping matters for c_id in contig_link_counts: count = contig_link_counts[c_id] if count > 5: break scores = contig_cluster_scores[c_id] - for clust_id in scores: + for clust_id in scores: score = scores[clust_id] if clust_id in black_list_clusters: continue - for contig_id in cluster_contig_links[clust_id]: + for idx, contig_id in enumerate(cluster_contig_links[clust_id]): #debug remove the enumerate if contig_id not in group_membership: group_membership[contig_id] = clust_id # update cluster scores to remove contigs already assigned @@ -579,8 +577,10 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_ reference_hit_coverage[ref_hit_id]['score'] -= contig_reference_coverage[contig_id][ ref_hit_id] del (contig_reference_coverage[contig_id]) + else: #debug + print(f"Not {contig_id} to cluster {clust_id} assigned as already present in cluster {group_membership[contig_id]}") break - + clusters_with_biomarkers = {} for clust_id in cluster_contig_links: contigs = cluster_contig_links[clust_id] @@ -620,8 +620,10 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_ if clust_id not in cluster_links: cluster_links[clust_id] = [] cluster_links[clust_id].append(contig_id) + recon_cluster_dists = get_reconstructed_cluster_dists(mash_db, 0.1, cluster_links, out_dir, contig_seqs, num_threads) + cluster_md5 = {} for clust_id in cluster_contig_links: contigs = cluster_contig_links[clust_id] @@ -712,6 +714,7 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_ recon_cluster_dists = get_reconstructed_cluster_dists(mash_db, 0.1, cluster_links, out_dir, contig_seqs, num_threads) + clusters_with_biomarkers = {} for clust_id in cluster_links: contigs = cluster_links[clust_id] @@ -1142,7 +1145,7 @@ def main(): if contig_info[id]['circularity_status'] == '': contig_info[id]['circularity_status'] = 'not tested' - + # Blast reference databases identify_biomarkers(contig_info, fixed_fasta, tmp_dir, min_length, logging, \ @@ -1279,6 +1282,7 @@ def main(): contig_blast_df = contig_blast_df[contig_blast_df.sseqid.isin(list(reference_sequence_meta.keys()))] contig_blast_df.reset_index(drop=True) logging.info("Assigning contigs to plasmid groups") + #contig_blast_df.to_csv("contig_blast_df_back.txt", index=False, sep="\t") #debug contig_info = assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_info, tmp_dir, contig_seqs, mash_db, primary_distance, secondary_distance, num_threads) @@ -1394,7 +1398,7 @@ def main(): if prefix is not None: mobtyper_report = os.path.join(out_dir, "{}.mobtyper_results.txt".format(prefix)) build_mobtyper_report(contig_memberships['plasmid'], out_dir, mobtyper_report,contig_seqs, ncbi, lit, ETE3DBTAXAFILE, database_dir) - + results.sort(key = lambda d: d['contig_id']) #sort by contig identifiers writeReport(results, MOB_RECON_INFO_HEADER, contig_report) logging.info("Writting chromosome sequences to {}".format(chromosome_file)) diff --git a/mob_suite/tests/test_mobrecon.py b/mob_suite/tests/test_mobrecon.py index 9d9d446..6319be3 100644 --- a/mob_suite/tests/test_mobrecon.py +++ b/mob_suite/tests/test_mobrecon.py @@ -1,5 +1,9 @@ -import sys, os, logging +import sys, os, logging, json, pandas, pytest, random, hashlib import mob_suite.mob_recon +import mob_suite.utils +from mob_suite.constants import MOB_CLUSTER_INFO_HEADER, default_database_dir +from Bio import SeqIO + #test all mob_recon functions including aggregation of results TEST_ROOT = os.path.dirname(__file__) ROOT_MODULE = os.path.dirname(mob_suite.__file__) @@ -57,4 +61,37 @@ def test_mob_recon_typical_run(): assert os.path.exists(os.path.join(TEST_ROOT,"run_test/mob_recon_SRR3703080/contig_report.txt")) == True, "File does not exits" assert os.path.exists(os.path.join(TEST_ROOT,"run_test/mob_recon_SRR3703080/plasmid_AA474.fasta")) == True, "File does not exists" - \ No newline at end of file + +def test_contig_order_on_final_results(tmpdir): + check_if_output_dir_exists_and_create() + in_fasta = ROOT_MODULE + "/example/SRR3703080_illumina_unicycler.fasta" + records=SeqIO.index(in_fasta, "fasta") + for iter in range(0,2): + contigs_list = [r for r in records.keys()] + random.shuffle(contigs_list) + with open(os.path.join(tmpdir, f"shuffled_{iter}.fasta"), "w") as output_handle: + SeqIO.write([records[k] for k in contigs_list], output_handle, "fasta") + args = [ + "--infile", os.path.join(tmpdir, f"shuffled_{iter}.fasta"), + "--outdir", TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_{iter}", + "--debug", + "--force" + ] + sys.argv[1:] = args + mob_suite.mob_recon.main() + + df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/contig_report.txt", sep="\t")[1:] #remove sample_id column + df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/contig_report.txt", sep="\t")[1:] #remove sample_id column + assert all(df1==df2), "Two contig_reports.txt are not equal" + + df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/mobtyper_results.txt", sep="\t")[1:] + df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/mobtyper_results.txt", sep="\t")[1:] + assert all(df1==df2), "Two mobtyper_results.txt are not equal" + + df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/biomarkers.blast.txt", sep="\t")[1:] + df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/biomarkers.blast.txt", sep="\t")[1:] + assert all(df1==df2), "Two biomarkers.blast.txt are not equal" + + hash_1 = hashlib.md5(open(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/plasmid_AA474.fasta",'rb').read()).hexdigest() + hash_2 = hashlib.md5(open(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/plasmid_AA474.fasta",'rb').read()).hexdigest() + assert hash_1==hash_2, "Two plasmid_AA474.fasta files are not equal" \ No newline at end of file diff --git a/mob_suite/tests/test_mobtyper_vs_biomarker_report.py b/mob_suite/tests/test_mobtyper_vs_biomarker_report.py index ab613b0..33ba955 100644 --- a/mob_suite/tests/test_mobtyper_vs_biomarker_report.py +++ b/mob_suite/tests/test_mobtyper_vs_biomarker_report.py @@ -12,7 +12,7 @@ TEST_ROOT = os.path.dirname(__file__) PACKAGE_ROOT = os.path.dirname(mob_suite.__file__) LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]' -NRANDOMSAMPLES2TEST = 300 +NRANDOMSAMPLES2TEST = 10 COMPLEX_CASES_LIST = ['CP011291', 'CP045773', 'KU932024', 'NC_002134', 'CP033122', 'CP031575', 'CP021102', 'CP033949', 'CP025336', 'CP025336', 'CP028546'] diff --git a/mob_suite/utils.py b/mob_suite/utils.py index c7a00cd..7032608 100644 --- a/mob_suite/utils.py +++ b/mob_suite/utils.py @@ -668,7 +668,7 @@ def verify_init(logger, database_dir): mob_init_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mob_init.py') status_file = os.path.join(database_dir, 'status.txt') if not os.path.isfile(status_file): - logger.info('MOB-databases need to be initialized, this will take some time') + logger.info(f'MOB-databases need to be initialized at {database_dir}, this will take some time ...') p = Popen([sys.executable, mob_init_path, '-d', database_dir], stdout=PIPE, stderr=PIPE, @@ -822,20 +822,26 @@ def recursive_filter_overlap_records(blast_df, overlap_threshold, contig_id_col, def fix_fasta_header(in_fasta, out_fasta): ids = {} fh = open(out_fasta, 'w') - incr=0 + md5_contigid_len = [] with open(in_fasta, "r") as handle: for record in SeqIO.parse(handle, "fasta"): - id = str(record.description) - status = 'false' - if 'circular=true' in id or '_circ' in id: - status = 'true' - seq = str(record.seq.upper()) + id = str(record.id) + seq = str(record.seq) md5 = calc_md5(seq) - new_id = "{}_{}_circular={}".format(incr,md5,status) - ids[new_id] = id - fh.write(">{}\n{}\n".format(new_id,seq)) - incr+=1 + md5_contigid_len.append((md5, id, len(seq))) handle.close() + #sort contigs by length and then by md5 checksum to guarantee unique ordering + records_index_dict = SeqIO.index(in_fasta, "fasta") + for incr, (md5, id, seq_len) in enumerate(sorted(md5_contigid_len, key=lambda x: (x[2], x[0]), reverse = True)): + record = records_index_dict[id] + id = str(record.description) + status = 'false' + if 'circular=true' in id or '_circ' in id: + status = 'true' + seq = str(record.seq.upper()) + new_id = f"{incr}_{md5}_circular={status}_{seq_len}"#.format(incr,md5,status) + ids[new_id] = id + fh.write(">{}\n{}\n".format(new_id,seq)) fh.close() return ids @@ -905,7 +911,8 @@ def init_console_logger(lvl=2): return logging.getLogger(__name__) -def writeReport(data_list, header, outfile): +def writeReport(data_list, header, outfile): + with open(outfile, 'w') as fh: fh.write("{}\n".format("\t".join(header))) for i in range(0, len(data_list)): diff --git a/setup.py b/setup.py index f4a1436..f6e027c 100644 --- a/setup.py +++ b/setup.py @@ -48,8 +48,8 @@ def read(fname): install_requires=[ 'numpy>=1.11.1,<1.23.5', 'tables>=3.3.0,<4', - 'pandas>=0.22.0,<=1.0.5', - 'biopython>=1.8,<2', + 'pandas>=0.22.0,<=1.5.3', + 'biopython>=1.8,<2', 'pycurl>=7.43.0,<8', 'scipy>=1.1.0,<2', 'ete3>=3.1.3,<4',