Merge pull request #166 from kbessonov1984/master
towards release 3.1.9 and issue fixes #160 and #163
kbessonov1984 committed Jun 3, 2024
2 parents 1400ab4 + 7af564d commit eb86064
Showing 9 changed files with 141 additions and 50 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/github-actions.yaml
@@ -0,0 +1,41 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python application
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+
+    runs-on: ubuntu-22.04
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v3
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install mash ncbi-blast+ libcurl4-openssl-dev libssl-dev ca-certificates -y
+        sudo apt-get install python3-pip python3-dev python3-numpy python3-pandas -y
+        python3 -m pip install --upgrade pip setuptools
+        pip3 install pytest Cython
+        if [ -f requirements.txt ]; then
+          pip3 install -r requirements.txt;
+        else
+          pip3 install -e .
+        fi
+        mob_init
+    - name: Test with pytest
+      run: |
+        pytest -o log_cli=true --basetemp=tmp-pytest
2 changes: 1 addition & 1 deletion README.md
@@ -44,7 +44,7 @@ Provides in silico predictions of the replicon family, relaxase type, mate-pair
 + Python >= 3.7
 + ete3 >= 3.1.3 (due to updated taxonomy database init)
 + pandas >= 0.22.0,<=1.05
-+ biopython >= 1.80
++ biopython >= 1.80,<2
 + pytables >= 3.3
 + pycurl >= 7.43
 + numpy >= 1.11.1
2 changes: 1 addition & 1 deletion mob_suite/conda/meta.yaml
@@ -25,7 +25,7 @@ requirements:
     - numpy >=1.11.1,<1.23.5
     - pytables >=3.3,<4
     - pandas >=0.22.0,<=1.0.5
-    - biopython >=1.70,<2
+    - biopython >=1.80,<2
     - pycurl >=7.43,<8
     - scipy >=1.1,<2
     - ete3 >=3.0,<4
2 changes: 2 additions & 0 deletions mob_suite/mob_init.py
@@ -88,6 +88,8 @@ def download_to_file(url, file):
         c.setopt(c.FOLLOWLOCATION, True)
         c.setopt(c.WRITEDATA, f)
         c.setopt(c.NOPROGRESS, False)
+        c.setopt(pycurl.SSL_VERIFYPEER, 0)
+        c.setopt(pycurl.SSL_VERIFYHOST, 0)
         c.perform()
         c.close()

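The two added setopt calls disable TLS peer and hostname verification for the database download, which lets mob_init fetch from mirrors with broken certificate chains at the cost of man-in-the-middle protection. A minimal standalone sketch of the resulting pycurl download pattern (the function name and arguments here are illustrative, not mob-suite API):

import pycurl

def fetch_insecure(url, path):
    # Stream url to path, skipping TLS verification as in the patched download_to_file
    with open(path, 'wb') as f:
        c = pycurl.Curl()
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.FOLLOWLOCATION, True)   # follow HTTP redirects
        c.setopt(pycurl.WRITEDATA, f)           # write the response body to the open file
        c.setopt(pycurl.NOPROGRESS, False)      # keep the progress meter on
        c.setopt(pycurl.SSL_VERIFYPEER, 0)      # do not verify the server certificate chain
        c.setopt(pycurl.SSL_VERIFYHOST, 0)      # do not verify the certificate hostname
        c.perform()
        c.close()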
66 changes: 35 additions & 31 deletions mob_suite/mob_recon.py
@@ -424,25 +424,23 @@ def calc_cluster_scores(reference_hit_coverage):
     return OrderedDict(sorted(iter(list(cluster_scores.items())), key=lambda x: x[1], reverse=True))
 
 
-def get_contig_link_counts(reference_hit_coverage, cluster_contig_links):
+def get_contig_link_counts(reference_hit_coverage, cluster_contig_links, contig_info):
     cluster_scores = calc_cluster_scores(reference_hit_coverage)
 
-    contig_link_counts = {}
+    # for each contig how many clusters a given contig is associated with,
+    # their cluster ids and their scores plus the contig size and # of links to plasmid clusters
     contig_clust_assoc = {}
     for clust_id in cluster_contig_links:
         contigs = cluster_contig_links[clust_id]
         for contig_id in contigs:
-            if not contig_id in contig_link_counts:
-                contig_link_counts[contig_id] = 0
-                contig_clust_assoc[contig_id] = {}
-            contig_link_counts[contig_id] += 1
-            contig_clust_assoc[contig_id][clust_id] = cluster_scores[clust_id]
-
-    for contig_id in contig_clust_assoc:
-        contig_clust_assoc[contig_id] = OrderedDict(
-            sorted(iter(contig_clust_assoc[contig_id].items()), key=lambda x: x[1], reverse=True))
-
-    return OrderedDict(sorted(iter(contig_link_counts.items()), key=lambda x: x[1], reverse=False))
+            if not contig_id in contig_clust_assoc:
+                contig_clust_assoc[contig_id] = {'links2clusters':0, 'contig_size': 0, 'cluster_ids':{}}
+            contig_clust_assoc[contig_id]['links2clusters'] += 1
+            contig_clust_assoc[contig_id]['contig_size'] = int(contig_info[contig_id]['size'])
+            contig_clust_assoc[contig_id]['cluster_ids'][clust_id] = cluster_scores[clust_id]
+    contig_link_counts=sorted(contig_clust_assoc.items(),key=lambda x: (x[1]['links2clusters'], -x[1]['contig_size']), reverse=False)
+    return OrderedDict([(i[0], i[1]['links2clusters']) for i in contig_link_counts])
 
 
 def get_contigs_by_key_value(contig_info, column_key, reasons):
@@ -472,20 +470,19 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
     replicon_contigs = get_contigs_with_value_set(contig_info, 'rep_type(s)')
     relaxase_contigs = get_contigs_with_value_set(contig_info, 'relaxase_type(s)')
 
-    contig_list = list(contig_reference_coverage.keys())
+    #contig_list = list(contig_reference_coverage.keys())
     for contig_id in filtered_contigs:
-        if contig_id in contig_reference_coverage:
-            del(contig_reference_coverage[contig_id])
+        if contig_id in contig_reference_coverage:
+            del (contig_reference_coverage[contig_id])
 
 
     cluster_contig_links = get_seq_links(contig_reference_coverage, reference_sequence_meta)
-    contig_link_counts = get_contig_link_counts(reference_hit_coverage, cluster_contig_links)
-    cluster_scores = calc_cluster_scores(reference_hit_coverage)
+    contig_link_counts = get_contig_link_counts(reference_hit_coverage, cluster_contig_links, contig_info)
+    cluster_scores = calc_cluster_scores(reference_hit_coverage) #sort cluster scores in descending order
 
     contig_cluster_scores = {}
 
     for clust_id in cluster_contig_links:
         for contig_id in cluster_contig_links[clust_id]:
             if contig_id in filtered_contigs:
@@ -494,11 +491,11 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
             if not contig_id in contig_cluster_scores:
                 contig_cluster_scores[contig_id] = {}
             contig_cluster_scores[contig_id][clust_id] = float(score)
 
     for contig_id in contig_cluster_scores:
         contig_cluster_scores[contig_id] = OrderedDict(
             sorted(iter(contig_cluster_scores[contig_id].items()), key=lambda x: x[1], reverse=True))
 
     black_list_clusters = {}
     group_membership = {}
     # assign circular contigs with replicon or relaxase first
@@ -521,8 +518,8 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
                         ref_hit_id]
                     del (contig_reference_coverage[contig_id])
             break
 
-    cluster_scores = calc_cluster_scores(reference_hit_coverage)
+    cluster_scores = calc_cluster_scores(reference_hit_coverage) #re-sort cluster scores in descending order
 
     # find plasmids well covered by contigs
     high_confidence_references = {}
@@ -533,8 +530,8 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
 
         if coverage > 0.8:
             high_confidence_references[ref_id] = score
 
-    # Assign contigs according to highly coverged plasmids
+    # Assign contigs according to highly coverged plasmid clusters
     high_confidence_references = OrderedDict(
         sorted(iter(high_confidence_references.items()), key=lambda x: x[1], reverse=True))
 
@@ -556,20 +553,21 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
                     reference_hit_coverage[ref_hit_id]['score'] -= contig_reference_coverage[contig_id][
                         ref_hit_id]
                 del (contig_reference_coverage[contig_id])
-    cluster_scores = calc_cluster_scores(reference_hit_coverage)
 
+    cluster_scores = calc_cluster_scores(reference_hit_coverage) #re-sort cluster scores in descending order
     # Assign low linkage contigs first
+    # Note a given contig can be mapped to MULTIPLE plasmid clusters, so order of contigs presented for mapping matters
     for c_id in contig_link_counts:
         count = contig_link_counts[c_id]
         if count > 5:
             break
         scores = contig_cluster_scores[c_id]
         for clust_id in scores:
             score = scores[clust_id]
             if clust_id in black_list_clusters:
                 continue
 
-            for contig_id in cluster_contig_links[clust_id]:
+            for idx, contig_id in enumerate(cluster_contig_links[clust_id]): #debug remove the enumerate
                 if contig_id not in group_membership:
                     group_membership[contig_id] = clust_id
                     # update cluster scores to remove contigs already assigned
@@ -579,8 +577,10 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
                         reference_hit_coverage[ref_hit_id]['score'] -= contig_reference_coverage[contig_id][
                             ref_hit_id]
                     del (contig_reference_coverage[contig_id])
+                else: #debug
+                    print(f"Not {contig_id} to cluster {clust_id} assigned as already present in cluster {group_membership[contig_id]}")
             break
 
     clusters_with_biomarkers = {}
     for clust_id in cluster_contig_links:
         contigs = cluster_contig_links[clust_id]
@@ -620,8 +620,10 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
             if clust_id not in cluster_links:
                 cluster_links[clust_id] = []
             cluster_links[clust_id].append(contig_id)
+
     recon_cluster_dists = get_reconstructed_cluster_dists(mash_db, 0.1, cluster_links, out_dir, contig_seqs,
                                                           num_threads)
+
     cluster_md5 = {}
     for clust_id in cluster_contig_links:
         contigs = cluster_contig_links[clust_id]
@@ -712,6 +714,7 @@ def assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_
 
     recon_cluster_dists = get_reconstructed_cluster_dists(mash_db, 0.1, cluster_links, out_dir, contig_seqs,
                                                           num_threads)
+
    clusters_with_biomarkers = {}
    for clust_id in cluster_links:
        contigs = cluster_links[clust_id]
@@ -1142,7 +1145,7 @@ def main():
 
         if contig_info[id]['circularity_status'] == '':
             contig_info[id]['circularity_status'] = 'not tested'
 
     # Blast reference databases
 
     identify_biomarkers(contig_info, fixed_fasta, tmp_dir, min_length, logging, \
@@ -1279,6 +1282,7 @@ def main():
     contig_blast_df = contig_blast_df[contig_blast_df.sseqid.isin(list(reference_sequence_meta.keys()))]
     contig_blast_df.reset_index(drop=True)
     logging.info("Assigning contigs to plasmid groups")
+    #contig_blast_df.to_csv("contig_blast_df_back.txt", index=False, sep="\t") #debug
     contig_info = assign_contigs_to_clusters(contig_blast_df, reference_sequence_meta, contig_info, tmp_dir,
                                              contig_seqs, mash_db, primary_distance, secondary_distance,
                                              num_threads)
@@ -1394,7 +1398,7 @@ def main():
     if prefix is not None:
         mobtyper_report = os.path.join(out_dir, "{}.mobtyper_results.txt".format(prefix))
         build_mobtyper_report(contig_memberships['plasmid'], out_dir, mobtyper_report,contig_seqs, ncbi, lit, ETE3DBTAXAFILE, database_dir)
-
+    results.sort(key = lambda d: d['contig_id']) #sort by contig identifiers
     writeReport(results, MOB_RECON_INFO_HEADER, contig_report)
 
     logging.info("Writting chromosome sequences to {}".format(chromosome_file))
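The get_contig_link_counts rewrite above is the core of the #160/#163 determinism fix: contigs are now handed to the cluster-assignment loop in a stable order (fewest plasmid-cluster links first, larger contigs first on ties) rather than in whatever order the input FASTA supplied them, and the added results.sort() makes the final report order stable as well. A toy sketch of the new sorting rule, with invented contig names and values:

from collections import OrderedDict

# Hypothetical data in the shape get_contig_link_counts builds internally
contig_clust_assoc = {
    'ctg_a': {'links2clusters': 2, 'contig_size': 5000},
    'ctg_b': {'links2clusters': 1, 'contig_size': 9000},
    'ctg_c': {'links2clusters': 1, 'contig_size': 12000},
}

# Ascending by link count; ties broken by larger contig first (note the negated size)
ordered = sorted(contig_clust_assoc.items(),
                 key=lambda x: (x[1]['links2clusters'], -x[1]['contig_size']))
print(OrderedDict((cid, d['links2clusters']) for cid, d in ordered))
# OrderedDict([('ctg_c', 1), ('ctg_b', 1), ('ctg_a', 2)])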
41 changes: 39 additions & 2 deletions mob_suite/tests/test_mobrecon.py
@@ -1,5 +1,9 @@
-import sys, os, logging
+import sys, os, logging, json, pandas, pytest, random, hashlib
 import mob_suite.mob_recon
+import mob_suite.utils
+from mob_suite.constants import MOB_CLUSTER_INFO_HEADER, default_database_dir
+from Bio import SeqIO
 
+#test all mob_recon functions including aggregation of results
 TEST_ROOT = os.path.dirname(__file__)
 ROOT_MODULE = os.path.dirname(mob_suite.__file__)
@@ -57,4 +61,37 @@ def test_mob_recon_typical_run():
     assert os.path.exists(os.path.join(TEST_ROOT,"run_test/mob_recon_SRR3703080/contig_report.txt")) == True, "File does not exits"
     assert os.path.exists(os.path.join(TEST_ROOT,"run_test/mob_recon_SRR3703080/plasmid_AA474.fasta")) == True, "File does not exists"
 
 
+def test_contig_order_on_final_results(tmpdir):
+    check_if_output_dir_exists_and_create()
+    in_fasta = ROOT_MODULE + "/example/SRR3703080_illumina_unicycler.fasta"
+    records=SeqIO.index(in_fasta, "fasta")
+    for iter in range(0,2):
+        contigs_list = [r for r in records.keys()]
+        random.shuffle(contigs_list)
+        with open(os.path.join(tmpdir, f"shuffled_{iter}.fasta"), "w") as output_handle:
+            SeqIO.write([records[k] for k in contigs_list], output_handle, "fasta")
+        args = [
+            "--infile", os.path.join(tmpdir, f"shuffled_{iter}.fasta"),
+            "--outdir", TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_{iter}",
+            "--debug",
+            "--force"
+        ]
+        sys.argv[1:] = args
+        mob_suite.mob_recon.main()
+
+    df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/contig_report.txt", sep="\t")[1:] #remove sample_id column
+    df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/contig_report.txt", sep="\t")[1:] #remove sample_id column
+    assert all(df1==df2), "Two contig_reports.txt are not equal"
+
+    df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/mobtyper_results.txt", sep="\t")[1:]
+    df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/mobtyper_results.txt", sep="\t")[1:]
+    assert all(df1==df2), "Two mobtyper_results.txt are not equal"
+
+    df1 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/biomarkers.blast.txt", sep="\t")[1:]
+    df2 = pandas.read_csv(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/biomarkers.blast.txt", sep="\t")[1:]
+    assert all(df1==df2), "Two biomarkers.blast.txt are not equal"
+
+    hash_1 = hashlib.md5(open(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_0/plasmid_AA474.fasta",'rb').read()).hexdigest()
+    hash_2 = hashlib.md5(open(TEST_ROOT+f"/run_test/mob_recon_SRR3703080_shuffled_1/plasmid_AA474.fasta",'rb').read()).hexdigest()
+    assert hash_1==hash_2, "Two plasmid_AA474.fasta files are not equal"
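Once the package and its databases are initialized, this determinism check can likely be run on its own with pytest -k test_contig_order_on_final_results from the repository root (the -k selector assumes the test name shown above).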
2 changes: 1 addition & 1 deletion mob_suite/tests/test_mobtyper_vs_biomarker_report.py
@@ -12,7 +12,7 @@
 TEST_ROOT = os.path.dirname(__file__)
 PACKAGE_ROOT = os.path.dirname(mob_suite.__file__)
 LOG_FORMAT = '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
-NRANDOMSAMPLES2TEST = 300
+NRANDOMSAMPLES2TEST = 10
 COMPLEX_CASES_LIST = ['CP011291', 'CP045773', 'KU932024', 'NC_002134', 'CP033122', 'CP031575', 'CP021102',
                       'CP033949', 'CP025336', 'CP025336', 'CP028546']
 
31 changes: 19 additions & 12 deletions mob_suite/utils.py
@@ -668,7 +668,7 @@ def verify_init(logger, database_dir):
     mob_init_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mob_init.py')
     status_file = os.path.join(database_dir, 'status.txt')
     if not os.path.isfile(status_file):
-        logger.info('MOB-databases need to be initialized, this will take some time')
+        logger.info(f'MOB-databases need to be initialized at {database_dir}, this will take some time ...')
         p = Popen([sys.executable, mob_init_path, '-d', database_dir],
                   stdout=PIPE,
                   stderr=PIPE,
@@ -822,20 +822,26 @@ def recursive_filter_overlap_records(blast_df, overlap_threshold, contig_id_col,
 def fix_fasta_header(in_fasta, out_fasta):
     ids = {}
     fh = open(out_fasta, 'w')
-    incr=0
+    md5_contigid_len = []
     with open(in_fasta, "r") as handle:
         for record in SeqIO.parse(handle, "fasta"):
-            id = str(record.description)
-            status = 'false'
-            if 'circular=true' in id or '_circ' in id:
-                status = 'true'
-            seq = str(record.seq.upper())
+            id = str(record.id)
+            seq = str(record.seq)
             md5 = calc_md5(seq)
-            new_id = "{}_{}_circular={}".format(incr,md5,status)
-            ids[new_id] = id
-            fh.write(">{}\n{}\n".format(new_id,seq))
-            incr+=1
+            md5_contigid_len.append((md5, id, len(seq)))
     handle.close()
+    #sort contigs by length and then by md5 checksum to guarantee unique ordering
+    records_index_dict = SeqIO.index(in_fasta, "fasta")
+    for incr, (md5, id, seq_len) in enumerate(sorted(md5_contigid_len, key=lambda x: (x[2], x[0]), reverse = True)):
+        record = records_index_dict[id]
+        id = str(record.description)
+        status = 'false'
+        if 'circular=true' in id or '_circ' in id:
+            status = 'true'
+        seq = str(record.seq.upper())
+        new_id = f"{incr}_{md5}_circular={status}_{seq_len}"#.format(incr,md5,status)
+        ids[new_id] = id
+        fh.write(">{}\n{}\n".format(new_id,seq))
     fh.close()
     return ids

@@ -905,7 +911,7 @@ def init_console_logger(lvl=2):
     return logging.getLogger(__name__)
 
 
-def writeReport(data_list, header, outfile):
+def writeReport(data_list, header, outfile):
+
     with open(outfile, 'w') as fh:
         fh.write("{}\n".format("\t".join(header)))
         for i in range(0, len(data_list)):
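The fix_fasta_header rewrite above is the other half of the determinism fix: contigs are renamed only after sorting by (length, md5) in descending order, so identical assemblies produce identical numbered headers regardless of input contig order, and the sequence length is now embedded in each header. A minimal sketch of the same renaming scheme, with the hashing helper inlined (mob-suite's own calc_md5 plays that role):

import hashlib

def deterministic_headers(contigs):
    # contigs: dict of original_id -> sequence string
    # Returns new_id -> original_id, numbering contigs by descending (length, md5)
    keyed = [(hashlib.md5(seq.encode()).hexdigest(), cid, len(seq))
             for cid, seq in contigs.items()]
    ids = {}
    for incr, (md5, cid, seq_len) in enumerate(
            sorted(keyed, key=lambda x: (x[2], x[0]), reverse=True)):
        status = 'true' if ('circular=true' in cid or '_circ' in cid) else 'false'
        ids[f"{incr}_{md5}_circular={status}_{seq_len}"] = cid
    return ids

# e.g. deterministic_headers({'ctgA': 'ACGT' * 100, 'ctgB_circ': 'TTGCA' * 50})
# numbers ctgA first (400 bp > 250 bp), giving a header like 0_<md5>_circular=false_400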
4 changes: 2 additions & 2 deletions setup.py
@@ -48,8 +48,8 @@ def read(fname):
     install_requires=[
         'numpy>=1.11.1,<1.23.5',
         'tables>=3.3.0,<4',
-        'pandas>=0.22.0,<=1.0.5',
-        'biopython>=1.8,<2',
+        'pandas>=0.22.0,<=1.5.3',
+        'biopython>=1.8,<2',
         'pycurl>=7.43.0,<8',
         'scipy>=1.1.0,<2',
         'ete3>=3.1.3,<4',
