ababaian · LicoriceLin · Oct 16, 2023 · Oct 16, 2023
diff --git a/palmfold.py b/palmfold.py
@@ -3,260 +3,45 @@
 #
 # RNA virus RdRp structural classifier.
 #
-from os import path, mkdir, listdir, rename, remove
-from sys import stderr
-
 import argparse
-import subprocess
 import logging as log
 import gzip
 import shutil
-
-class PalmStructs:
-
-    # Initialization Routine ------------------------------
-    # Ensures ./pol dir-tree is populated
-    # with the correct files (palmprint pdb, pdb, fasta)
-    def __init__(self, polpath):
-        # Check if the .pol directory exists
-        if not path.exists(polpath):
-            log.error("The palmprint directory %s does not exist", polpath)
-            exit(1)
-        self.polpath = polpath
-
-        # Verify the reference RdRp models and fasta files exist
-        self.rdrps = []
-        rdrp_filelist = path.join(polpath, "rdrp.model.list")
-
-        if not path.exists(rdrp_filelist):
-            log.error('The "rdrp.model.list" file not found in %s,', polpath)
-            exit(1)
-
-        # Check each input RdRp model fasta, full_pdb, and palmprint_pdb
-        with open(rdrp_filelist) as rdrp_fl:
-            log.info("  Verifying RdRp Model List")
-            # Verify each rdrp file in directory tree
-            for line in rdrp_fl:
-                line = line.strip()
-                if len(line) > 0:
-                    name = line
-                    log.info("    %s", name)
-                    # verify fasta
-                    if not path.exists(path.join(polpath, "fa", f"{name}.fa")):
-                        log.warning("Fasta file missing for: %s", name)
-                        continue
-                    # verify full sequence
-                    gz = path.join(polpath, "full_length", f"{name}.pdb.gz")
-                    if not path.exists(gz):
-                        log.warning("Full length structure missing for: %s", name)
-                        continue
-                    # verify palmprint pdb structucture (required)
-                    if not path.exists(path.join(polpath, "palmprint", f"{name}.pdb")):
-                        log.error("<palmprint>.pdb missing for: %s", name)
-                        exit(1)
-                    self.rdrps.append(name)
-
-
-        # Verify the reference XdXp models and fasta files exist
-        self.xdxps = []
-        xdxp_filelist = path.join(polpath, "xdxp.model.list")
-
-        if not path.exists(xdxp_filelist):
-            log.error('The "xdxp.model.list" file not found in %s,', polpath)
-            exit(1)
-
-        # Check each input XdXp model fasta, full_pdb, and palmprint_pdb
-        with open(xdxp_filelist) as xdxp_fl:
-            log.info("  Verifying XdXp Model List")
-            # Verify each XdXp file in directory tree
-            for line in xdxp_fl:
-                line = line.strip()
-                if len(line) > 0:
-                    name = line
-                    log.info("    %s", name)
-                    # verify fasta
-                    if not path.exists(path.join(polpath, "fa", f"{name}.fa")):
-                        log.warning("Fasta file missing for: %s", name)
-                        continue
-                    # verify full sequence
-                    gz = path.join(polpath, "full_length", f"{name}.pdb.gz")
-                    if not path.exists(gz):
-                        log.warning("Full length structure missing for: %s", name)
-                        continue
-                    # verify palmprint pdb structucture (required)
-                    if not path.exists(path.join(polpath, "palmprint", f"{name}.pdb")):
-                        log.error("<palmprint>.pdb missing for: %s", name)
-                        exit(1)
-                    self.xdxps.append(name)
-
-        self.all_domains = self.rdrps + self.xdxps
-
-    # Classification Routine ------------------------------
-    # Runs TMalign of input against all palmprint pdb
-    # Extracts scores and re-aligns if a RdRp palmprint detected
-    def align(self, pdbpath, outpath, name, tm_threshold):
-        # TODO: this looks like an inefficient way to find files in path
-        # it introduces a bug where two files "ABC.pdb" and "ABCD.pdb" will
-        # have the same start and endings, and cause multiple returns and skip
-        # of ABC.pdb
-        pdb_file = [file for file in listdir(pdbpath) if file.startswith(f"{name}") and file.endswith(".pdb")]
-        if len(pdb_file) == 0:
-            log.warning("No <input>.pdb file found  %s", name)
-            return
-        elif len(pdb_file) > 1:
-            log.warning("Multiple <input>.pdb files found for %s", name)
-            log.warning("Abiguous situation, skip molecule")
-            return
-
-        # Define input/output files
-        out_name = path.join(outpath, path.splitext(pdb_file[0])[0])
-        out_tsv = path.join(out_name + ".tm")
-        out_ppfa = path.join(out_name + ".pp.fa")
-        out_rcfa = path.join(out_name + ".rc.fa")
-        out_rpdb = path.join(out_name + "_realigned.pdb")
-        pdb_file = path.join(pdbpath, pdb_file[0])
-
-        # Reset maximum scores observed
-        max_rdrp_score = max_xdxp_score = 0
-        max_rdrp = max_xdxp = None
-
-        # Initialize output TSV file
-        log.info("  Writing tsv output to %s", out_tsv)
-        with open(out_tsv, "w") as out:
-            print(
-                "PDBchain1\tPDBchain2\tTM1\tTM2\t" +
-                "RMSD\tID1\tID2\tIDali\tL1\tL2\tLali",
-                file=out
-                )
-
-            # TODO: these should be wrapped in functions to not repeat code
-            # Run RdRp-palmprint TMalign and identify max_RdRp_Score
-            log.info("  Run TMalign against RdRp palmprints")
-            for domain in self.rdrps:
-                # TMalign command
-                tmalign_cmd = f"TMalign -outfmt 2 {pdb_file} {path.join(self.polpath, 'palmprint', f'{domain}.pdb')}"
-                log.info("    %s", tmalign_cmd)
-
-                # Runs TMalign and captures output
-                ret_val = subprocess.run(
-                    tmalign_cmd.split(" "),
-                    capture_output=True,
-                    text=True,
-                    close_fds=False)
-                # Parse the captured output for scores
-                values = ret_val.stdout.split("\n")[1]
-                values = [name, domain] + [float(x) for x in values.split("\t")[2:]]
-                # Append TMalign output to TSV output
-                print("\t".join([str(v) for v in values]), file=out)
-                # Save best scores
-                score = values[3]
-                if score >= max_rdrp_score:
-                    max_rdrp_score = score
-                    max_rdrp = domain
-
-            # Run XdXp-palmprint TMalign and identify max_XdXp_Score
-            log.info("  Run TMalign against XdXp palmprints")
-            for domain in self.xdxps:
-                # Run TM-Align
-                tmalign_cmd = f"TMalign -outfmt 2 {pdb_file} {path.join(self.polpath, 'palmprint', f'{domain}.pdb')}"
-                log.info("    %s", tmalign_cmd)
-
-                # Runs TMalign and captures output
-                ret_val = subprocess.run(
-                    tmalign_cmd.split(" "),
-                    capture_output=True,
-                    text=True,
-                    close_fds=False)
-                # Parse the captured output for scores
-                values = ret_val.stdout.split("\n")[1]
-                values = [name, domain] + [float(x) for x in values.split("\t")[2:]]
-                # Append TMalign output to TSV output
-                print("\t".join([str(v) for v in values]), file=out)
-                # Save best scores
-                score = values[3]
-                if score >= max_xdxp_score:
-                    max_xdxp_score = score
-                    max_xdxp = domain
-
-        # Does maxRdRP_score pass CUTOFF value
-        log.info("")
-        log.info("  Max RdRp palmprint: %s - %s", max_rdrp, max_rdrp_score)
-        log.info("  Max XdXp palmprint: %s - %s", max_xdxp, max_xdxp_score)
-        log.info("")
-
-        # RdRp classification
-        # RdRp TMscore above re-alignment threshold
-        if max(max_rdrp_score, max_xdxp_score) >= tm_threshold:
-            log.info("  + Polymerase re-alignment threshold score reached: %s", tm_threshold)
-            if max_rdrp_score > max_xdxp_score:
-                # RdRp+ classification
-                log.info("  ++ RdRp-classification for %s", name)
-            else:
-                log.info("  %s classified as non-RdRp polymerase", name)
-
-            log.info("")
-            log.info("  Re-aligning:")
-
-            # Run TMalign to re-align input structure to top-RdRp palmprint
-            realign_cmd = f"TMalign -outfmt 1 {pdb_file} {path.join(self.polpath, 'palmprint', f'{domain}.pdb')} -o {pdb_file}_tmpalign"
-            log.info("  %s", realign_cmd)
-            with open(f"{pdb_file}.tmp", "w") as output:
-                subprocess.run(realign_cmd.split(" "), stdout=output)
-
-            # Move the realign
-            rename(f"{pdb_file}_tmpalign.pdb", f"{out_rpdb}")
-
-            # Run palmgrab.py to sub-select palmprint/core fasta seq
-            log.info("")
-            log.info("  Run palmgrab.py to extract sub-sequence fasta")
-            log.info("  cmd:")
-            scriptdir = path.dirname(path.realpath(__file__))
-            log.info('  python3 ' + path.join(scriptdir, "palmgrab.py ") + f"{pdb_file}.tmp " + f"{out_ppfa} " + f"{out_rcfa}")
-
-            subprocess.run(['python3', path.join(scriptdir, "palmgrab.py"), f"{pdb_file}.tmp", f"{out_ppfa}", f"{out_rcfa}"])
-            log.info("  done")
-            log.info("")
-
-            for f in listdir(pdbpath):
-                if "_tmpalign" in f:
-                    remove(path.join(pdbpath, f))
-            remove(f"{pdb_file}.tmp")
-
-
+from pathlib import Path
+from palmfoldutils import PalmStructs
 # Initialization Routine ==================================
-def main(inputpath, outpath, palmprints, threshold):
+def main(inputpath:Path, outpath:Path, palmprints:Path, threshold:float):
     # Verify Input Path exists
-    if not path.exists(inputpath):
+    if not inputpath.is_dir():
         log.error("The input directory %s does not exist", inputpath)
         exit(1)
 
     # Verify Output Path exists
-    if not path.exists(outpath):
+    if not outpath.is_dir():
         log.info("Output directory %s does not exist. Creating", outpath)
-        mkdir(outpath)
+        outpath.mkdir()
     else:
         log.info("Output directory %s exists. Overwriting", outpath)
 
     # Extract protein filenames
     names = [
-        filename[:-4]
-        for filename in listdir(inputpath)
-        if filename.endswith(".pdb")
+        filename.stem
+        for filename in inputpath.iterdir()
+        if filename.suffix==".pdb"
     ]
 
     namesgz = [
-        filename[:-7]
-        for filename in listdir(inputpath)
-        if filename.endswith(".pdb.gz")
+        filename.name[:-7]
+        for filename in inputpath.iterdir()
+        if filename.name.endswith(".pdb.gz")
     ]
 
     log.info(" %s %s", names, namesgz)
-
+    
     # Verify Input Path contains PDB files
-    if not names:
-        if not namesgz:
-            log.error("Input directory %s doesn't contain any .pdb(.gz) files.", inputpath)
-            exit(1)
+    if not names and not namesgz:
+        log.error("Input directory %s doesn't contain any .pdb(.gz) files.", inputpath)
+        exit(1)
 
     # Create the Palmprint datastructure
     log.info("Initializing palmfold...")
@@ -267,11 +52,12 @@ def main(inputpath, outpath, palmprints, threshold):
         log.info("")
         log.info("  Analyzing %s...", protgz)
         # Decompress gz
-        with gzip.open(path.join(inputpath, protgz + ".pdb.gz"), 'rb') as f_in:
-            with open(path.join(inputpath, protgz + ".pdb"), 'wb') as f_out:
+        # TODO should also use `TemporaryDirectory`
+        with gzip.open(inputpath/f"{protgz}.pdb.gz", 'rb') as f_in:
+            with open(inputpath/f"{protgz}.pdb", 'wb') as f_out:
                 shutil.copyfileobj(f_in, f_out)
         ps.align(inputpath, outpath, protgz, threshold)
-        remove(path.join(inputpath, protgz + ".pdb"))
+        (inputpath/f"{protgz}.pdb").unlink()
 
     for prot in names:
         log.info("")
@@ -284,16 +70,16 @@ def main(inputpath, outpath, palmprints, threshold):
         description='palmfold: RNA virus RdRp structural classifier.'
         )
     parser.add_argument(
-        '--inputpath', '-i', type=str, default='./pdb',
+        '--inputpath', '-i', type=Path, default='./pdb',
         help='Directory containing "*.pdb(.gz)"" files to be classified. [./pdb]'
         )
     parser.add_argument(
-        '--palmprints', '-p', type=str, default='./pol',
+        '--palmprints', '-p', type=Path, default=Path(__file__).parent/'pol',
         help='path to "~/palmfold/pol/" containing palmprint pdb models. [./pol]'
         )
     ## TODO: implement output directory
     parser.add_argument(
-        '--outpath', '-o', type=str, default='./out',
+        '--outpath', '-o', type=Path, default='./out',
         help='Directory for writing output files. [./out]'
         )
     parser.add_argument(

diff --git a/palmfoldutils/__init__.py b/palmfoldutils/__init__.py
@@ -0,0 +1 @@
+from .palmstructs import PalmStructs
diff --git a/palmgrab.py → palmfoldutils/palmgrab.py b/palmgrab.py → palmfoldutils/palmgrab.py
@@ -1,21 +1,21 @@
-import os
 import sys
-import glob
 from Bio import SeqIO
 from Bio.Seq import Seq
-from Bio.SeqIO import FastaIO
 from Bio.SeqRecord import SeqRecord
-
-def palmgrab(inputfa, palmout, rdrpout) :
+from pathlib import Path
+def palmgrab(inputfa:Path, palmout:Path, rdrpout:Path) :
+    inputfa=Path(inputfa) #for compatibility
+    palmout=Path(palmout)
+    rdrpout=Path(rdrpout)
     PfinList = [] # palmprint fasta
     RfinList = [] # rdrpcore  fasta
     # Resulttrunc.append(SeqRecord(Seq(splitseq), id=name))
-    if os.path.exists(inputfa):
+    if inputfa.is_file():
         #for input in glob.glob(os.path.join(os.path.join(inputfa,'tmfa'),'*.fa')) :
         # Import 2 TMalign fasta output
         seqList = []
         fasta_sequences = SeqIO.parse(open(inputfa), 'fasta')
-        pdb_id = os.path.basename(inputfa.split('.fa')[0])
+        pdb_id = inputfa.stem #os.path.basename(inputfa.split('.fa')[0])
 
         for fasta in fasta_sequences:
             name, sequence = fasta.id, str(fasta.seq)