Merge pull request #166 from kbessonov1984/master
towards release 3.1.9 and issue fixes #160 and #163
kbessonov1984 committed Jun 3, 2024
2 parents 1400ab4 + 7af564d commit eb86064
Showing 9 changed files with 141 additions and 50 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/github-actions.yaml
@@ -0,0 +1,41 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python application
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-22.04
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install mash ncbi-blast+ libcurl4-openssl-dev libssl-dev ca-certificates -y
+        sudo apt-get install python3-pip python3-dev python3-numpy python3-pandas -y
+        python3 -m pip install --upgrade pip setuptools
+        pip3 install pytest Cython
+        if [ -f requirements.txt ]; then
+          pip3 install -r requirements.txt;
+        else
+          pip3 install -e .
+        fi
+        mob_init
+    - name: Test with pytest
+      run: |
+        pytest -o log_cli=true --basetemp=tmp-pytest
2 changes: 1 addition & 1 deletion README.md
@@ -44,7 +44,7 @@ Provides in silico predictions of the replicon family, relaxase type, mate-pair
 + Python >= 3.7
 + ete3 >= 3.1.3 (due to updated taxonomy database init)
 + pandas >= 0.22.0,<=1.05
-+ biopython >= 1.80
++ biopython >= 1.80,<2
 + pytables >= 3.3
 + pycurl >= 7.43
 + numpy >= 1.11.1
2 changes: 1 addition & 1 deletion mob_suite/conda/meta.yaml
@@ -25,7 +25,7 @@ requirements:
     - numpy >=1.11.1,<1.23.5
     - pytables >=3.3,<4
     - pandas >=0.22.0,<=1.0.5
-    - biopython >=1.70,<2
+    - biopython >=1.80,<2
     - pycurl >=7.43,<8
     - scipy >=1.1,<2
     - ete3 >=3.0,<4
2 changes: 2 additions & 0 deletions mob_suite/mob_init.py
@@ -88,6 +88,8 @@ def download_to_file(url, file):
         c.setopt(c.FOLLOWLOCATION, True)
         c.setopt(c.WRITEDATA, f)
         c.setopt(c.NOPROGRESS, False)
+        c.setopt(pycurl.SSL_VERIFYPEER, 0)
+        c.setopt(pycurl.SSL_VERIFYHOST, 0)
         c.perform()
         c.close()

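The two added setopt calls disable TLS peer and hostname verification for the database download, which lets mob_init fetch from mirrors with broken certificate chains at the cost of man-in-the-middle protection. A minimal standalone sketch of the resulting pycurl download pattern (the function name and arguments here are illustrative, not mob-suite API):

import pycurl

def fetch_insecure(url, path):
    # Stream url to path, skipping TLS verification as in the patched download_to_file
    with open(path, 'wb') as f:
        c = pycurl.Curl()
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.FOLLOWLOCATION, True)   # follow HTTP redirects
        c.setopt(pycurl.WRITEDATA, f)           # write the response body to the open file
        c.setopt(pycurl.NOPROGRESS, False)      # keep the progress meter on
        c.setopt(pycurl.SSL_VERIFYPEER, 0)      # do not verify the server certificate chain
        c.setopt(pycurl.SSL_VERIFYHOST, 0)      # do not verify the certificate hostname
        c.perform()
        c.close()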
66 changes: 35 additions & 31 deletions mob_suite/mob_recon.py
@@ -424,25 +424,23 @@ def calc_cluster_scores(reference_hit_coverage):
     return OrderedDict(sorted(iter(list(cluster_scores.items())), key=lambda x: x[1], reverse=True))
 
 
-def get_contig_link_counts(reference_hit_coverage, cluster_contig_links):
+def get_contig_link_counts(reference_hit_coverage, cluster_contig_links, contig_info):
     cluster_scores = calc_cluster_scores(reference_hit_coverage)
 
-    contig_link_counts = {}
+    # for each contig how many clusters a given contig is associated with,
+    # their cluster ids and their scores plus the contig size and # of links to plasmid clusters
     contig_clust_assoc = {}
     for clust_id in cluster_contig_links:
         contigs = cluster_contig_links[clust_id]
         for contig_id in contigs:
-            if not contig_id in contig_link_counts:
-                contig_link_counts[contig_id] = 0
-                contig_clust_assoc[contig_id] = {}
-            contig_link_counts[contig_id] += 1
-            contig_clust_assoc[contig_id][clust_id] = cluster_scores[clust_id]
-
-    for contig_id in contig_clust_assoc:
-        contig_clust_assoc[contig_id] = OrderedDict(
-            sorted(iter(contig_clust_assoc[contig_id].items()), key=lambda x: x[1], reverse=True))
-
-    return OrderedDict(sorted(iter(contig_link_counts.items()), key=lambda x: x[1], reverse=False))
+            if not contig_id in contig_clust_assoc:
+                contig_clust_assoc[contig_id] = {'links2clusters':0, 'contig_size': 0, 'cluster_ids':{}}
+            contig_clust_assoc[contig_id]['links2clusters'] += 1
+            contig_clust_assoc[contig_id]['contig_size'] = int(contig_info[contig_id]['size'])
+            contig_clust_assoc[contig_id]['cluster_ids'][clust_id] = cluster_scores[clust_id]
+    contig_link_counts=sorted(contig_clust_assoc.items(),key=lambda x: (x[1]['links2clusters'], -x[1]['contig_size']), reverse=False)
+    return OrderedDict([(i[0], i[1]['links2clusters']) for i in contig_link_counts])
 
 
 def get_contigs_by_key_value(contig_info, column_key, reasons):
@@ -472,20 +470,19 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
     replicon_contigs = get_contigs_with_value_set(contig_info, 'rep_type(s)')
     relaxase_contigs = get_contigs_with_value_set(contig_info, 'relaxase_type(s)')
 
-    contig_list = list(contig_reference_coverage.keys())
+    #contig_list = list(contig_reference_coverage.keys())
     for contig_id in filtered_contigs:
-        if contig_id in contig_reference_coverage:
-            del(contig_reference_coverage[contig_id])
+        if contig_id in contig_reference_coverage:
+            del (contig_reference_coverage[contig_id])
 
 
     cluster_contig_links = get_seq_links(contig_reference_coverage, reference_sequence_meta)
-    contig_link_counts = get_contig_link_counts(reference_hit_coverage, cluster_contig_links)
-    cluster_scores = calc_cluster_scores(reference_hit_coverage)
+    contig_link_counts = get_contig_link_counts(reference_hit_coverage, cluster_contig_links, contig_info)
+    cluster_scores = calc_cluster_scores(reference_hit_coverage) #sort cluster scores in descending order
 
     contig_cluster_scores = {}
 
     for clust_id in cluster_contig_links:
         for contig_id in cluster_contig_links[clust_id]:
             if contig_id in filtered_contigs:
@@ -494,11 +491,11 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
             if not contig_id in contig_cluster_scores:
                 contig_cluster_scores[contig_id] = {}
             contig_cluster_scores[contig_id][clust_id] = float(score)
 
     for contig_id in contig_cluster_scores:
         contig_cluster_scores[contig_id] = OrderedDict(
             sorted(iter(contig_cluster_scores[contig_id].items()), key=lambda x: x[1], reverse=True))
 
     black_list_clusters = {}
     group_membership = {}
     # assign circular contigs with replicon or relaxase first
@@ -521,8 +518,8 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
                         ref_hit_id]
                     del (contig_reference_coverage[contig_id])
             break
 
-    cluster_scores = calc_cluster_scores(reference_hit_coverage)
+    cluster_scores = calc_cluster_scores(reference_hit_coverage) #re-sort cluster scores in descending order
 
     # find plasmids well covered by contigs
     high_confidence_references = {}
@@ -533,8 +530,8 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
 
         if coverage > 0.8:
             high_confidence_references[ref_id] = score
 
-    # Assign contigs according to highly coverged plasmids
+    # Assign contigs according to highly coverged plasmid clusters
     high_confidence_references = OrderedDict(
         sorted(iter(high_confidence_references.items()), key=lambda x: x[1], reverse=True))
 
@@ -556,20 +553,21 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
                     reference_hit_coverage[ref_hit_id]['score'] -= contig_reference_coverage[contig_id][
                         ref_hit_id]
                 del (contig_reference_coverage[contig_id])
-    cluster_scores = calc_cluster_scores(reference_hit_coverage)
 
+    cluster_scores = calc_cluster_scores(reference_hit_coverage) #re-sort cluster scores in descending order
     # Assign low linkage contigs first
+    # Note a given contig can be mapped to MULTIPLE plasmid clusters, so order of contigs presented for mapping matters
     for c_id in contig_link_counts:
         count = contig_link_counts[c_id]
         if count > 5:
             break
         scores = contig_cluster_scores[c_id]
         for clust_id in scores:
             score = scores[clust_id]
             if clust_id in black_list_clusters:
                 continue
 
-            for contig_id in cluster_contig_links[clust_id]:
+            for idx, contig_id in enumerate(cluster_contig_links[clust_id]): #debug remove the enumerate
                 if contig_id not in group_membership:
                     group_membership[contig_id] = clust_id
                     # update cluster scores to remove contigs already assigned
@@ -579,8 +577,10 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
                         reference_hit_coverage[ref_hit_id]['score'] -= contig_reference_coverage[contig_id][
                             ref_hit_id]
                     del (contig_reference_coverage[contig_id])
+                else: #debug
+                    print(f"Not {contig_id} to cluster {clust_id} assigned as already present in cluster {group_membership[contig_id]}")
             break
 
     clusters_with_biomarkers = {}
     for clust_id in cluster_contig_links:
         contigs = cluster_contig_links[clust_id]
@@ -620,8 +620,10 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
             if clust_id not in cluster_links:
                 cluster_links[clust_id] = []
             cluster_links[clust_id].append(contig_id)
+
     recon_cluster_dists = get_reconstructed_cluster_dists(mash_db, 0.1, cluster_links, out_dir, contig_seqs,
                                                           num_threads)
+
     cluster_md5 = {}
     for clust_id in cluster_contig_links:
         contigs = cluster_contig_links[clust_id]
@@ -712,6 +714,7 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
 
     recon_cluster_dists = get_reconstructed_cluster_dists(mash_db, 0.1, cluster_links, out_dir, contig_seqs,
                                                           num_threads)
+
    clusters_with_biomarkers = {}
    for clust_id in cluster_links:
        contigs = cluster_links[clust_id]
@@ -1142,7 +1145,7 @@ def main():
 
         if contig_info[id]['circularity_status'] == '':
             contig_info[id]['circularity_status'] = 'not tested'
 
     # Blast reference databases
 
     identify_biomarkers(contig_info, fixed_fasta, tmp_dir, min_length, logging, \
@@ -1279,6 +1282,7 @@ def main():
     contig_blast_df = contig_blast_df[contig_blast_df.sseqid.isin(list(reference_sequence_meta.keys()))]
     contig_blast_df.reset_index(drop=True)
     logging.info("Assigning contigs to plasmid groups")
+    #contig_blast_df.to_csv("contig_blast_df_back.txt", index=False, sep="\t") #debug
     contig_info = assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_info, tmp_dir,
                                              contig_seqs, mash_db, primary_distance, secondary_distance,
                                              num_threads)
@@ -1394,7 +1398,7 @@ def main():
     if prefix is not None:
         mobtyper_report = os.path.join(out_dir, "{}.mobtyper_results.txt".format(prefix))
         build_mobtyper_report(contig_memberships['plasmid'], out_dir, mobtyper_report,contig_seqs, ncbi, lit, ETE3DBTAXAFILE, database_dir)
-
+    results.sort(key = lambda d: d['contig_id']) #sort by contig identifiers
     writeReport(results, MOB_RECON_INFO_HEADER, contig_report)
 
     logging.info("Writting chromosome sequences to {}".format(chromosome_file))
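The get_contig_link_counts rewrite above is the core of the #160/#163 determinism fix: contigs are now handed to the cluster-assignment loop in a stable order (fewest plasmid-cluster links first, larger contigs first on ties) rather than in whatever order the input FASTA supplied them, and the added results.sort() makes the final report order stable as well. A toy sketch of the new sorting rule, with invented contig names and values:

from collections import OrderedDict

# Hypothetical data in the shape get_contig_link_counts builds internally
contig_clust_assoc = {
    'ctg_a': {'links2clusters': 2, 'contig_size': 5000},
    'ctg_b': {'links2clusters': 1, 'contig_size': 9000},
    'ctg_c': {'links2clusters': 1, 'contig_size': 12000},
}

# Ascending by link count; ties broken by larger contig first (note the negated size)
ordered = sorted(contig_clust_assoc.items(),
                 key=lambda x: (x[1]['links2clusters'], -x[1]['contig_size']))
print(OrderedDict((cid, d['links2clusters']) for cid, d in ordered))
# OrderedDict([('ctg_c', 1), ('ctg_b', 1), ('ctg_a', 2)])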
41 changes: 39 additions & 2 deletions mob_suite/tests/test_mobrecon.py
@@ -1,5 +1,9 @@
-import sys, os, logging
+import sys, os, logging, json, pandas, pytest, random, hashlib
 import mob_suite.mob_recon
+import mob_suite.utils
+from mob_suite.constants import MOB_CLUSTER_INFO_HEADER, default_database_dir
+from Bio import SeqIO
 
+#test all mob_recon functions including aggregation of results
 TEST_ROOT = os.path.dirname(__file__)
 ROOT_MODULE = os.path.dirname(mob_suite.__file__)
@@ -57,4 +61,37 @@ def test_mob_recon_typical_run():
     assert os.path.exists(os.path.join(TEST_ROOT,"run_test/mob_recon_SRR3703080/contig_report.txt")) == True, "File does not exits"
     assert os.path.exists(os.path.join(TEST_ROOT,"run_test/mob_recon_SRR3703080/plasmid_AA474.fasta")) == True, "File does not exists"
 
 
+def test_contig_order_on_final_results(tmpdir):
+    check_if_output_dir_exists_and_create()
+    in_fasta = ROOT_MODULE + "/example/SRR3703080_illumina_unicycler.fasta"
+    records=SeqIO.index(in_fasta, "fasta")
+    for iter in range(0,2):
+        contigs_list = [r for r in records.keys()]
+        random.shuffle(contigs_list)
+        with open(os.path.join(tmpdir, f"shuffled_{iter}.fasta"), "w") as output_handle:
+            SeqIO.write([records[k] for k in contigs_list], output_handle, "fasta")
+        args = [
+            "--infile", os.path.join(tmpdir, f"shuffled_{iter}.fasta"),
+            "--outdir", TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_{iter}",
+            "--debug",
+            "--force"
+        ]
+        sys.argv[1:] = args
+        mob_suite.mob_recon.main()
+
+    df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/contig_report.txt", sep="\t")[1:] #remove sample_id column
+    df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/contig_report.txt", sep="\t")[1:] #remove sample_id column
+    assert all(df1==df2), "Two contig_reports.txt are not equal"
+
+    df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/mobtyper_results.txt", sep="\t")[1:]
+    df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/mobtyper_results.txt", sep="\t")[1:]
+    assert all(df1==df2), "Two mobtyper_results.txt are not equal"
+
+    df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/biomarkers.blast.txt", sep="\t")[1:]
+    df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/biomarkers.blast.txt", sep="\t")[1:]
+    assert all(df1==df2), "Two biomarkers.blast.txt are not equal"
+
+    hash_1 = hashlib.md5(open(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/plasmid_AA474.fasta",'rb').read()).hexdigest()
+    hash_2 = hashlib.md5(open(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/plasmid_AA474.fasta",'rb').read()).hexdigest()
+    assert hash_1==hash_2, "Two plasmid_AA474.fasta files are not equal"
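Once the package and its databases are initialized, this determinism check can likely be run on its own with pytest -k test_contig_order_on_final_results from the repository root (the -k selector assumes the test name shown above).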
2 changes: 1 addition & 1 deletion mob_suite/tests/test_mobtyper_vs_biomarker_report.py
@@ -12,7 +12,7 @@
 TEST_ROOT = os.path.dirname(__file__)
 PACKAGE_ROOT = os.path.dirname(mob_suite.__file__)
 LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
-NRANDOMSAMPLES2TEST = 300
+NRANDOMSAMPLES2TEST = 10
 COMPLEX_CASES_LIST = ['CP011291', 'CP045773', 'KU932024', 'NC_002134', 'CP033122', 'CP031575', 'CP021102',
                       'CP033949', 'CP025336', 'CP025336', 'CP028546']
 
31 changes: 19 additions & 12 deletions mob_suite/utils.py
@@ -668,7 +668,7 @@ def verify_init(logger, database_dir):
     mob_init_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mob_init.py')
     status_file = os.path.join(database_dir, 'status.txt')
     if not os.path.isfile(status_file):
-        logger.info('MOB-databases need to be initialized, this will take some time')
+        logger.info(f'MOB-databases need to be initialized at {database_dir}, this will take some time ...')
         p = Popen([sys.executable, mob_init_path, '-d', database_dir],
                   stdout=PIPE,
                   stderr=PIPE,
@@ -822,20 +822,26 @@ def recursive_filter_overlap_records(blast_df, overlap_threshold, contig_id_col,
 def fix_fasta_header(in_fasta, out_fasta):
     ids = {}
     fh = open(out_fasta, 'w')
-    incr=0
+    md5_contigid_len = []
     with open(in_fasta, "r") as handle:
         for record in SeqIO.parse(handle, "fasta"):
-            id = str(record.description)
-            status = 'false'
-            if 'circular=true' in id or '_circ' in id:
-                status = 'true'
-            seq = str(record.seq.upper())
+            id = str(record.id)
+            seq = str(record.seq)
             md5 = calc_md5(seq)
-            new_id = "{}_{}_circular={}".format(incr,md5,status)
-            ids[new_id] = id
-            fh.write(">{}\n{}\n".format(new_id,seq))
-            incr+=1
+            md5_contigid_len.append((md5, id, len(seq)))
     handle.close()
+    #sort contigs by length and then by md5 checksum to guarantee unique ordering
+    records_index_dict = SeqIO.index(in_fasta, "fasta")
+    for incr, (md5, id, seq_len) in enumerate(sorted(md5_contigid_len, key=lambda x: (x[2], x[0]), reverse = True)):
+        record = records_index_dict[id]
+        id = str(record.description)
+        status = 'false'
+        if 'circular=true' in id or '_circ' in id:
+            status = 'true'
+        seq = str(record.seq.upper())
+        new_id = f"{incr}_{md5}_circular={status}_{seq_len}"#.format(incr,md5,status)
+        ids[new_id] = id
+        fh.write(">{}\n{}\n".format(new_id,seq))
     fh.close()
     return ids

@@ -905,7 +911,7 @@ def init_console_logger(lvl=2):
     return logging.getLogger(__name__)
 
 
-def writeReport(data_list, header, outfile):
+def writeReport(data_list, header, outfile):
+
     with open(outfile, 'w') as fh:
         fh.write("{}\n".format("\t".join(header)))
         for i in range(0, len(data_list)):
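The fix_fasta_header rewrite above is the other half of the determinism fix: contigs are renamed only after sorting by (length, md5) in descending order, so identical assemblies produce identical numbered headers regardless of input contig order, and the sequence length is now embedded in each header. A minimal sketch of the same renaming scheme, with the hashing helper inlined (mob-suite's own calc_md5 plays that role):

import hashlib

def deterministic_headers(contigs):
    # contigs: dict of original_id -> sequence string
    # Returns new_id -> original_id, numbering contigs by descending (length, md5)
    keyed = [(hashlib.md5(seq.encode()).hexdigest(), cid, len(seq))
             for cid, seq in contigs.items()]
    ids = {}
    for incr, (md5, cid, seq_len) in enumerate(
            sorted(keyed, key=lambda x: (x[2], x[0]), reverse=True)):
        status = 'true' if ('circular=true' in cid or '_circ' in cid) else 'false'
        ids[f"{incr}_{md5}_circular={status}_{seq_len}"] = cid
    return ids

# e.g. deterministic_headers({'ctgA': 'ACGT' * 100, 'ctgB_circ': 'TTGCA' * 50})
# numbers ctgA first (400 bp > 250 bp), giving a header like 0_<md5>_circular=false_400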
4 changes: 2 additions & 2 deletions setup.py
@@ -48,8 +48,8 @@ def read(fname):
     install_requires=[
         'numpy>=1.11.1,<1.23.5',
         'tables>=3.3.0,<4',
-        'pandas>=0.22.0,<=1.0.5',
-        'biopython>=1.8,<2',
+        'pandas>=0.22.0,<=1.5.3',
+        'biopython>=1.8,<2',
         'pycurl>=7.43.0,<8',
         'scipy>=1.1.0,<2',
         'ete3>=3.1.3,<4',
