Changes for pub (#106)

* fix: fasta write compression * fix: option for compressed sg_object to gbk output * fix: file compression suffix * fix: account for input max proteins being greater than actual proteins * fix: npatlas import * fix: lint * fix: pytest * chore: remove temporary function * style: flake8 * feat: add args for blast options * feat: add genomes to the neo4j database * fix: antismash as nodes * fix: jaccard score * fix: bgc comparison scoring * fix: blast arg in bgc search * fix: neo4j escaping I had no idea Neo4j removed '\'. Which means SMILES (and other) strings have to be stored as escaped '\\' https://neo4j.com/docs/cypher-manual/current/queries/expressions/#string * feat: BGC search, adaptive width * fix: genbank output * fix: to conform to antismash and gbk output * fix: ability to modify clustermap assembly name without modifying uid * fix: bgc search scoring * fix: clustermap.js links color based on 0-1 scale * fix: tests for release * fix: metavar deprecation * style: black, isort, flake8 * fix: make pytest * style: clean and recreate package and class diagrams * chore: bump dependencies * style: black
socialgene · Jul 23, 2024 · bbab371 · bbab371
1 parent 02f8342
commit bbab371
Show file tree

Hide file tree

Showing 50 changed files with 1,828 additions and 989 deletions.
diff --git a/Makefile b/Makefile
@@ -56,7 +56,7 @@ install:
 
 ## pytest	:	Run Python package unit tests
 pytest: clean install
-	pytest tests -v --ignore=socialgene/entrypoints/export_protein_loci_assembly_tables.py 	 --cov=./socialgene --cov-report=xml:./coverage.xml --cov-report html
+	pytest tests/python -v --ignore=socialgene/entrypoints/export_protein_loci_assembly_tables.py 	 --cov=./socialgene --cov-report=xml:./coverage.xml --cov-report html tests/python
 	xdg-open htmlcov/index.html
 
 ## pytestnf :	Run Nextflow pytest tests (first runs clean, install python and  nextflow test run)

diff --git a/Take an input BGC.md b/Take an input BGC.md
diff --git a/Untitled-2.cql b/Untitled-2.cql
diff --git a/classes_sgpy.png b/classes_sgpy.png
diff --git a/packages_sgpy.png b/packages_sgpy.png
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,14 +26,14 @@ classifiers = [  #! Update me
   "Programming Language :: Python :: 3",
 ]
 dependencies = [
-        "rdkit==2023.9.4",
+        "rdkit==2024.3.3",
         "pandas>=2.2",
-        "numpy>=1.26",
-        "neo4j>=5.17.0",
-        "biopython>=1.83",
-        "textdistance>=4.6.1",
-        "rich",
-        "requests"
+        "numpy>=1.26", # rdkit requires <2.0
+        "neo4j>=5.22.0",
+        "biopython>=1.84",
+        "textdistance>=4.6.3",
+        "rich>=13.7.1",
+        "requests>=2.32.3"
 ]
 
 [project.license]

diff --git a/socialgene/addons/antismash/__init__.py b/socialgene/addons/antismash/__init__.py
diff --git a/socialgene/addons/antismash/nr.py b/socialgene/addons/antismash/nr.py
@@ -0,0 +1,40 @@
+from socialgene.neo4j.neo4j_element import Node, Relationship
+
+
+class Product(Node):
+    # Node definition for the product of an antiSMASH BGC (per `description`).
+    # Declares the single Neo4j label "product".
+    neo4j_label = ["product"]
+    description = "Represents the product of an antiSMASH BGC"
+    # The only declared property is an integer `uid`.
+    property_specification = {
+        "uid": int,
+    }
+    # `uid` is the only property listed for a uniqueness constraint.
+    # NOTE(review): how `Node` consumes these class attributes is not visible here — confirm.
+    constraints_unique = ["uid"]
+
+
+class Category(Node):
+    # Node definition for the category of an antiSMASH BGC (per `description`).
+    # Declares the single Neo4j label "category".
+    neo4j_label = ["category"]
+    description = "Represents the category of an antiSMASH BGC"
+    # The only declared property is an integer `uid`.
+    property_specification = {
+        "uid": int,
+    }
+    # `uid` is the only property listed for a uniqueness constraint.
+    constraints_unique = ["uid"]
+
+
+class ProductToCategory(Relationship):
+    # Relationship definition labelled "IS_A", pointing from a Product node
+    # (start) to a Category node (end). No relationship properties are declared.
+    neo4j_label = "IS_A"
+    description = "Connects an antiSMASH product to category "
+    start_class = Product
+    end_class = Category
+
+
+class GeneClusterToProduct(Relationship):
+    # Relationship definition labelled "IS_A" that carries the tool name and
+    # the start/end and core_start/core_end coordinates as properties.
+    # NOTE(review): the class name and `description` say this links a BGC to a
+    # product, but `start_class`/`end_class` below are Product -> Category,
+    # exactly the same endpoints as ProductToCategory. This looks like a
+    # copy-paste leftover — confirm the intended start/end node classes.
+    # NOTE(review): the label "IS_A" is also shared with ProductToCategory —
+    # confirm that reusing it here is intentional.
+    neo4j_label = "IS_A"
+    description = "Connects an antiSMASH BGC to product "
+    start_class = Product
+    end_class = Category
+    # Declared relationship properties and their Python types.
+    property_specification = {
+        "tool": str,
+        "start": int,
+        "end": int,
+        "core_start": int,
+        "core_end": int,
+    }
diff --git a/socialgene/addons/chebi/__init__.py b/socialgene/addons/chebi/__init__.py
@@ -1,3 +0,0 @@
-# from socialgene.neo4j.neo4j_element import Node, Relationship
-# try to use classes already created that inherit from Nodes() or Relationship()
-# to do that create a mixin class in a separate file and then import that

diff --git a/socialgene/addons/chemistry/cli_similarity.py b/socialgene/addons/chemistry/cli_similarity.py
@@ -39,16 +39,16 @@ def get_db_inchis():
         res = db.run(
             f"""
             MATCH (c1:{cmpd_label})
-            RETURN c1.inchi as chem
+            RETURN c1.CanonicalSmiles as chem
             """,
         ).value()
-    return res
+    return [i.replace("\\\\", "\\") for i in res if i is not None]
 
 
 def inchi_list_to_compound_dict(x):
+    # Build a dict that maps every element of `x` to a tuple of
+    # (ChemicalCompound built from that element, that compound's `.node`).
+    # Duplicate elements in `x` collapse onto one key; the last one wins.
+    # NOTE(review): the name says "inchi", but in this same change
+    # get_db_inchis() returns CanonicalSmiles values. If those are what is
+    # passed in here, the name is stale — confirm against the caller.
     res = {}
     for i in x:
-        temp = ChemicalCompound(i)
+        temp = ChemicalCompound(i, sanitize=True)
+        # Keyed by the raw input string, not by anything derived from the compound.
         res[i] = (temp, temp.node)
     return res
 

diff --git a/socialgene/addons/chemistry/nr.py b/socialgene/addons/chemistry/nr.py
@@ -31,7 +31,6 @@ class ChemicalCompoundNode(Node):
         "MolMR": float,
         "AnonymousGraph": str,
         "ElementGraph": str,
-        "CanonicalSmiles": str,
         "MurckoScaffold": str,
         "ExtendedMurcko": str,
         "MolFormula": str,
@@ -50,7 +49,7 @@ class ChemicalCompoundNode(Node):
         "inchi": str,
         "CanonicalSmiles": str,
     }
-    constraints_unique = ["inchi", "CanonicalSmiles"]
+    constraints_unique = ["inchi"]
 
 
 class TanimotoSimilarity(Relationship):
@@ -74,29 +73,19 @@ class McsSimilarity(Relationship):
 
 
 class ChemicalSubstructure(ChemicalCompoundNode):
+    # Node definition for a chemical substructure; still subclasses
+    # ChemicalCompoundNode, but after this change it declares only the
+    # "substructure" label instead of extending the parent's label list.
+    # The removed ("-") lines interleaved below belong to the deleted
+    # ChemicalFragment and ContainsFragment definitions.
-    neo4j_label = ChemicalCompoundNode.neo4j_label + ["substructure"]
+    neo4j_label = ["substructure"]
     description = "Represents a chemical substructure"
-
-
-class ChemicalFragment(Node):
-    neo4j_label = ["chemical_fragment"]
-    description = "Represents a chemical fragment as defined by rdkit.Chem.Descriptors"
-    required_properties = ["uid"]
-    properties = {
+    required_properties = ["inchi", "CanonicalSmiles"]
+    # `uid` is declared here but is not listed in required_properties or
+    # constraints_unique.
+    property_specification = {
         "uid": str,
+        "inchi": str,
+        "CanonicalSmiles": str,
     }
-    constraints_unique = ["uid"]
-
-
-class ContainsFragment(Relationship):
-    neo4j_label = "CONTAINS"
-    description = "Connects a chemical compound to a chemical fragment"
-    start_class = ChemicalCompoundNode
-    end_class = ChemicalFragment
+    # NOTE(review): this same commit reduces ChemicalCompoundNode's
+    # constraints_unique to ["inchi"] only, while this subclass still lists
+    # both properties — confirm the difference is intentional.
+    constraints_unique = ["inchi", "CanonicalSmiles"]
 
 
 class ContainsSubstructure(Relationship):
+    # Relationship definition pointing from a ChemicalCompoundNode (start) to
+    # a ChemicalSubstructure (end). This change renames its Neo4j label from
+    # "CONTAINS" to "SUBSTRUCTURE".
-    neo4j_label = "CONTAINS"
+    neo4j_label = "SUBSTRUCTURE"
     description = "Connects a chemical compound to a chemical substructure"
     start_class = ChemicalCompoundNode
     end_class = ChemicalSubstructure
diff --git a/socialgene/addons/chemistry/substruct.py b/socialgene/addons/chemistry/substruct.py
@@ -0,0 +1,64 @@
+# from socialgene.neo4j.neo4j import GraphDriver
+# import logging
+# import os
+# from socialgene.base.chem import ChemicalCompound
+# from rich.progress import Progress
+# from socialgene.addons.chemistry.nr import ChemicalSubstructure, ContainsSubstructure, ChemicalCompoundNode
+# from rdkit import Chem
+# from multiprocessing import Pool
+# from rdkit.Chem.MolStandardize import rdMolStandardize
+# from itertools import batched
+# logging.getLogger("neo4j").setLevel(logging.WARNING)
+# logging.getLogger().setLevel(logging.INFO)
+
+# if __name__ == "__main__":
+
+#     #inspect(a)
+#     with GraphDriver() as db:
+#         results = db.run(
+#             """
+#             MATCH (n:chemical_compound) RETURN n.inchi as inchi
+#             """
+#         ).value()
+
+
+#     nodes=set()
+#     rels=set()
+
+#     def process_subgraph(batched_results):
+#         nodes=set()
+#         rels=set()
+#         for i in batched_results:
+#             sgmol = ChemicalCompound(i)
+#             subgraphs = Chem.FindAllSubgraphsOfLengthN(sgmol.mol, 5)
+#             for subgraph in subgraphs:
+#                 sub_mol = Chem.PathToSubmol(sgmol.mol, subgraph, useQuery=True)
+#                 sub_mol = rdMolStandardize.Cleanup(sub_mol)
+#                 sub_mol = rdMolStandardize.Normalize(sub_mol)
+#                 r = rdMolStandardize.Reionizer()
+#                 sub_mol = r.reionize(sub_mol)
+#                 Chem.RemoveStereochemistry( sub_mol )
+#                 #sub_mol=Chem.MolToSmiles(sub_mol, canonical=True)
+#                 node = ChemicalSubstructure()
+#                 temp_compound = ChemicalCompound(sub_mol)
+#                 node.fill_from_dict(temp_compound.base_properties | temp_compound.hash_dict)
+#                 del temp_compound
+#                 nodes.add(node)
+#                 cn = ChemicalCompoundNode()
+#                 cn.fill_from_dict(sgmol.base_properties | sgmol.hash_dict)
+#                 rel = ContainsSubstructure(cn, node)
+#                 rels.add(rel)
+#         nodes[0].add_multiple_to_neo4j(nodes)
+#         rels[0].add_multiple_to_neo4j(rels)
+
+#     def update_progress(*args):
+#         pg.update(task, advance=1)
+
+#     with Progress(transient=True) as pg:
+#         task = pg.add_task("Progress...", total=len(results))
+#         batched_results = batched(results, 100)
+#         with Pool() as p:
+#             for i in p.imap_unordered(process_subgraph, batched_results):
+#                 nodes.update(i[0])
+#                 rels.update(i[1])
+#                 update_progress()
diff --git a/socialgene/addons/culture_collections/__init__.py b/socialgene/addons/culture_collections/__init__.py
diff --git a/socialgene/addons/culture_collections/nr.py b/socialgene/addons/culture_collections/nr.py