diff --git a/palmfold.py b/palmfold.py index 57a5df0..77ac33e 100755 --- a/palmfold.py +++ b/palmfold.py @@ -3,260 +3,45 @@ # # RNA virus RdRp structural classifier. # -from os import path, mkdir, listdir, rename, remove -from sys import stderr - import argparse -import subprocess import logging as log import gzip import shutil - -class PalmStructs: - - # Initialization Routine ------------------------------ - # Ensures ./pol dir-tree is populated - # with the correct files (palmprint pdb, pdb, fasta) - def __init__(self, polpath): - # Check if the .pol directory exists - if not path.exists(polpath): - log.error("The palmprint directory %s does not exist", polpath) - exit(1) - self.polpath = polpath - - # Verify the reference RdRp models and fasta files exist - self.rdrps = [] - rdrp_filelist = path.join(polpath, "rdrp.model.list") - - if not path.exists(rdrp_filelist): - log.error('The "rdrp.model.list" file not found in %s,', polpath) - exit(1) - - # Check each input RdRp model fasta, full_pdb, and palmprint_pdb - with open(rdrp_filelist) as rdrp_fl: - log.info(" Verifying RdRp Model List") - # Verify each rdrp file in directory tree - for line in rdrp_fl: - line = line.strip() - if len(line) > 0: - name = line - log.info(" %s", name) - # verify fasta - if not path.exists(path.join(polpath, "fa", f"{name}.fa")): - log.warning("Fasta file missing for: %s", name) - continue - # verify full sequence - gz = path.join(polpath, "full_length", f"{name}.pdb.gz") - if not path.exists(gz): - log.warning("Full length structure missing for: %s", name) - continue - # verify palmprint pdb structucture (required) - if not path.exists(path.join(polpath, "palmprint", f"{name}.pdb")): - log.error(".pdb missing for: %s", name) - exit(1) - self.rdrps.append(name) - - - # Verify the reference XdXp models and fasta files exist - self.xdxps = [] - xdxp_filelist = path.join(polpath, "xdxp.model.list") - - if not path.exists(xdxp_filelist): - log.error('The "xdxp.model.list" file not found in %s,', polpath) - exit(1) - - # Check each input XdXp model fasta, full_pdb, and palmprint_pdb - with open(xdxp_filelist) as xdxp_fl: - log.info(" Verifying XdXp Model List") - # Verify each XdXp file in directory tree - for line in xdxp_fl: - line = line.strip() - if len(line) > 0: - name = line - log.info(" %s", name) - # verify fasta - if not path.exists(path.join(polpath, "fa", f"{name}.fa")): - log.warning("Fasta file missing for: %s", name) - continue - # verify full sequence - gz = path.join(polpath, "full_length", f"{name}.pdb.gz") - if not path.exists(gz): - log.warning("Full length structure missing for: %s", name) - continue - # verify palmprint pdb structucture (required) - if not path.exists(path.join(polpath, "palmprint", f"{name}.pdb")): - log.error(".pdb missing for: %s", name) - exit(1) - self.xdxps.append(name) - - self.all_domains = self.rdrps + self.xdxps - - # Classification Routine ------------------------------ - # Runs TMalign of input against all palmprint pdb - # Extracts scores and re-aligns if a RdRp palmprint detected - def align(self, pdbpath, outpath, name, tm_threshold): - # TODO: this looks like an inefficient way to find files in path - # it introduces a bug where two files "ABC.pdb" and "ABCD.pdb" will - # have the same start and endings, and cause multiple returns and skip - # of ABC.pdb - pdb_file = [file for file in listdir(pdbpath) if file.startswith(f"{name}") and file.endswith(".pdb")] - if len(pdb_file) == 0: - log.warning("No .pdb file found %s", name) - return - elif len(pdb_file) > 1: - log.warning("Multiple .pdb files found for %s", name) - log.warning("Abiguous situation, skip molecule") - return - - # Define input/output files - out_name = path.join(outpath, path.splitext(pdb_file[0])[0]) - out_tsv = path.join(out_name + ".tm") - out_ppfa = path.join(out_name + ".pp.fa") - out_rcfa = path.join(out_name + ".rc.fa") - out_rpdb = path.join(out_name + "_realigned.pdb") - pdb_file = path.join(pdbpath, pdb_file[0]) - - # Reset maximum scores observed - max_rdrp_score = max_xdxp_score = 0 - max_rdrp = max_xdxp = None - - # Initialize output TSV file - log.info(" Writing tsv output to %s", out_tsv) - with open(out_tsv, "w") as out: - print( - "PDBchain1\tPDBchain2\tTM1\tTM2\t" + - "RMSD\tID1\tID2\tIDali\tL1\tL2\tLali", - file=out - ) - - # TODO: these should be wrapped in functions to not repeat code - # Run RdRp-palmprint TMalign and identify max_RdRp_Score - log.info(" Run TMalign against RdRp palmprints") - for domain in self.rdrps: - # TMalign command - tmalign_cmd = f"TMalign -outfmt 2 {pdb_file} {path.join(self.polpath, 'palmprint', f'{domain}.pdb')}" - log.info(" %s", tmalign_cmd) - - # Runs TMalign and captures output - ret_val = subprocess.run( - tmalign_cmd.split(" "), - capture_output=True, - text=True, - close_fds=False) - # Parse the captured output for scores - values = ret_val.stdout.split("\n")[1] - values = [name, domain] + [float(x) for x in values.split("\t")[2:]] - # Append TMalign output to TSV output - print("\t".join([str(v) for v in values]), file=out) - # Save best scores - score = values[3] - if score >= max_rdrp_score: - max_rdrp_score = score - max_rdrp = domain - - # Run XdXp-palmprint TMalign and identify max_XdXp_Score - log.info(" Run TMalign against XdXp palmprints") - for domain in self.xdxps: - # Run TM-Align - tmalign_cmd = f"TMalign -outfmt 2 {pdb_file} {path.join(self.polpath, 'palmprint', f'{domain}.pdb')}" - log.info(" %s", tmalign_cmd) - - # Runs TMalign and captures output - ret_val = subprocess.run( - tmalign_cmd.split(" "), - capture_output=True, - text=True, - close_fds=False) - # Parse the captured output for scores - values = ret_val.stdout.split("\n")[1] - values = [name, domain] + [float(x) for x in values.split("\t")[2:]] - # Append TMalign output to TSV output - print("\t".join([str(v) for v in values]), file=out) - # Save best scores - score = values[3] - if score >= max_xdxp_score: - max_xdxp_score = score - max_xdxp = domain - - # Does maxRdRP_score pass CUTOFF value - log.info("") - log.info(" Max RdRp palmprint: %s - %s", max_rdrp, max_rdrp_score) - log.info(" Max XdXp palmprint: %s - %s", max_xdxp, max_xdxp_score) - log.info("") - - # RdRp classification - # RdRp TMscore above re-alignment threshold - if max(max_rdrp_score, max_xdxp_score) >= tm_threshold: - log.info(" + Polymerase re-alignment threshold score reached: %s", tm_threshold) - if max_rdrp_score > max_xdxp_score: - # RdRp+ classification - log.info(" ++ RdRp-classification for %s", name) - else: - log.info(" %s classified as non-RdRp polymerase", name) - - log.info("") - log.info(" Re-aligning:") - - # Run TMalign to re-align input structure to top-RdRp palmprint - realign_cmd = f"TMalign -outfmt 1 {pdb_file} {path.join(self.polpath, 'palmprint', f'{domain}.pdb')} -o {pdb_file}_tmpalign" - log.info(" %s", realign_cmd) - with open(f"{pdb_file}.tmp", "w") as output: - subprocess.run(realign_cmd.split(" "), stdout=output) - - # Move the realign - rename(f"{pdb_file}_tmpalign.pdb", f"{out_rpdb}") - - # Run palmgrab.py to sub-select palmprint/core fasta seq - log.info("") - log.info(" Run palmgrab.py to extract sub-sequence fasta") - log.info(" cmd:") - scriptdir = path.dirname(path.realpath(__file__)) - log.info(' python3 ' + path.join(scriptdir, "palmgrab.py ") + f"{pdb_file}.tmp " + f"{out_ppfa} " + f"{out_rcfa}") - - subprocess.run(['python3', path.join(scriptdir, "palmgrab.py"), f"{pdb_file}.tmp", f"{out_ppfa}", f"{out_rcfa}"]) - log.info(" done") - log.info("") - - for f in listdir(pdbpath): - if "_tmpalign" in f: - remove(path.join(pdbpath, f)) - remove(f"{pdb_file}.tmp") - - +from pathlib import Path +from palmfoldutils import PalmStructs # Initialization Routine ================================== -def main(inputpath, outpath, palmprints, threshold): +def main(inputpath:Path, outpath:Path, palmprints:Path, threshold:float): # Verify Input Path exists - if not path.exists(inputpath): + if not inputpath.is_dir(): log.error("The input directory %s does not exist", inputpath) exit(1) # Verify Output Path exists - if not path.exists(outpath): + if not outpath.is_dir(): log.info("Output directory %s does not exist. Creating", outpath) - mkdir(outpath) + outpath.mkdir() else: log.info("Output directory %s exists. Overwriting", outpath) # Extract protein filenames names = [ - filename[:-4] - for filename in listdir(inputpath) - if filename.endswith(".pdb") + filename.stem + for filename in inputpath.iterdir() + if filename.suffix==".pdb" ] namesgz = [ - filename[:-7] - for filename in listdir(inputpath) - if filename.endswith(".pdb.gz") + filename.name[:-7] + for filename in inputpath.iterdir() + if filename.name.endswith(".pdb.gz") ] log.info(" %s %s", names, namesgz) - + # Verify Input Path contains PDB files - if not names: - if not namesgz: - log.error("Input directory %s doesn't contain any .pdb(.gz) files.", inputpath) - exit(1) + if not names and not namesgz: + log.error("Input directory %s doesn't contain any .pdb(.gz) files.", inputpath) + exit(1) # Create the Palmprint datastructure log.info("Initializing palmfold...") @@ -267,11 +52,12 @@ def main(inputpath, outpath, palmprints, threshold): log.info("") log.info(" Analyzing %s...", protgz) # Decompress gz - with gzip.open(path.join(inputpath, protgz + ".pdb.gz"), 'rb') as f_in: - with open(path.join(inputpath, protgz + ".pdb"), 'wb') as f_out: + # TODO should also use `TemporaryDirectory` + with gzip.open(inputpath/f"{protgz}.pdb.gz", 'rb') as f_in: + with open(inputpath/f"{protgz}.pdb", 'wb') as f_out: shutil.copyfileobj(f_in, f_out) ps.align(inputpath, outpath, protgz, threshold) - remove(path.join(inputpath, protgz + ".pdb")) + (inputpath/f"{protgz}.pdb").unlink() for prot in names: log.info("") @@ -284,16 +70,16 @@ def main(inputpath, outpath, palmprints, threshold): description='palmfold: RNA virus RdRp structural classifier.' ) parser.add_argument( - '--inputpath', '-i', type=str, default='./pdb', + '--inputpath', '-i', type=Path, default='./pdb', help='Directory containing "*.pdb(.gz)"" files to be classified. [./pdb]' ) parser.add_argument( - '--palmprints', '-p', type=str, default='./pol', + '--palmprints', '-p', type=Path, default=Path(__file__).parent/'pol', help='path to "~/palmfold/pol/" containing palmprint pdb models. [./pol]' ) ## TODO: implement output directory parser.add_argument( - '--outpath', '-o', type=str, default='./out', + '--outpath', '-o', type=Path, default='./out', help='Directory for writing output files. [./out]' ) parser.add_argument( diff --git a/palmfoldutils/__init__.py b/palmfoldutils/__init__.py new file mode 100644 index 0000000..c723af2 --- /dev/null +++ b/palmfoldutils/__init__.py @@ -0,0 +1 @@ +from .palmstructs import PalmStructs \ No newline at end of file diff --git a/palmgrab.py b/palmfoldutils/palmgrab.py similarity index 91% rename from palmgrab.py rename to palmfoldutils/palmgrab.py index 90d1897..39d3b65 100644 --- a/palmgrab.py +++ b/palmfoldutils/palmgrab.py @@ -1,21 +1,21 @@ -import os import sys -import glob from Bio import SeqIO from Bio.Seq import Seq -from Bio.SeqIO import FastaIO from Bio.SeqRecord import SeqRecord - -def palmgrab(inputfa, palmout, rdrpout) : +from pathlib import Path +def palmgrab(inputfa:Path, palmout:Path, rdrpout:Path) : + inputfa=Path(inputfa) #for compatibility + palmout=Path(palmout) + rdrpout=Path(rdrpout) PfinList = [] # palmprint fasta RfinList = [] # rdrpcore fasta # Resulttrunc.append(SeqRecord(Seq(splitseq), id=name)) - if os.path.exists(inputfa): + if inputfa.is_file(): #for input in glob.glob(os.path.join(os.path.join(inputfa,'tmfa'),'*.fa')) : # Import 2 TMalign fasta output seqList = [] fasta_sequences = SeqIO.parse(open(inputfa), 'fasta') - pdb_id = os.path.basename(inputfa.split('.fa')[0]) + pdb_id = inputfa.stem #os.path.basename(inputfa.split('.fa')[0]) for fasta in fasta_sequences: name, sequence = fasta.id, str(fasta.seq) diff --git a/palmfoldutils/palmstructs.py b/palmfoldutils/palmstructs.py new file mode 100644 index 0000000..b9cf330 --- /dev/null +++ b/palmfoldutils/palmstructs.py @@ -0,0 +1,263 @@ +from .palmgrab import palmgrab +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Callable +import logging as log +import subprocess + +TMALIGN_=Path(__file__).parent/'TMalign' +TMALIGN=TMALIGN_ if TMALIGN_.is_file() else Path('TMalign') + +def check_tmalign_version(): + ''' + TODO + warning: the given link of TMalign binary + doesn't have -outfmt parameter, + + new-link below works well: + https://zhanggroup.org/TM-align/TMtools20190822.tar.gz + + ''' + return + + +def tm_align(pdb_file:Path,ref_file:Path): + ''' + run TMalign between `pdb_file` and `ref_file` + only output the splited score lines. + + TODO: if we only need the score, + consider this package rather than call a subprecess: + https://github.com/jvkersch/tmtools + ''' + tmalign_cmd = f"{TMALIGN} -outfmt 2 {pdb_file} {ref_file}" + log.info(" %s", tmalign_cmd) + + # Runs TMalign and captures output + ret_val = subprocess.run( + tmalign_cmd.split(" "), + capture_output=True, + text=True, + close_fds=False) + # Parse the captured output for scores + values = ret_val.stdout.split("\n")[1] + values = [pdb_file.stem, ref_file.stem] + [float(x) for x in values.split("\t")[2:]] + return values + + +class PalmStructs: + # Initialization Routine ------------------------------ + # Ensures ./pol dir-tree is populated + # with the correct files (palmprint pdb, pdb, fasta) + def __init__(self, polpath:Path): + ''' + parameters: + ----------- + polpath:Path(str-OK) + Ensures the library contains \ + the correct tress: + + `rdrp.model.list` + + `xdxp.model.list` + + `palmprint\` + + (optional)`fa\` & `full_length` + + properties: + ---------- + `polpath` + + `all_domains` + + `rdrps & xdxps` + ''' + # Check if the .pol directory exists + polpath=Path(polpath) + if not polpath.is_dir(): + log.error("The palmprint directory %s does not exist", polpath) + exit(1) + self.polpath = polpath + + # Verify the reference RdRp models and fasta files exist + self.rdrps = [] + rdrp_filelist = polpath / "rdrp.model.list" + + if not rdrp_filelist.is_file(): + log.error('The "rdrp.model.list" file not found in %s,', polpath) + exit(1) + + # Check each input RdRp model fasta, full_pdb, and palmprint_pdb + with open(rdrp_filelist) as rdrp_fl: + log.info(" Verifying RdRp Model List") + # Verify each rdrp file in directory tree + for line in rdrp_fl: + line = line.strip() + if len(line) > 0: + name = line + log.info(" %s", name) + # verify fasta + if not (polpath/"fa"/f"{name}.fa").is_file(): + log.warning("Fasta file missing for: %s", name) + continue + # verify full sequence + if not (polpath/"full_length"/f"{name}.pdb.gz").is_file(): + log.warning("Full length structure missing for: %s", name) + continue + # verify palmprint pdb structucture (required) + if not (polpath/"palmprint"/f"{name}.pdb").is_file(): + log.error(".pdb missing for: %s", name) + exit(1) + self.rdrps.append(name) + + + # Verify the reference XdXp models and fasta files exist + self.xdxps = [] + xdxp_filelist = polpath/"xdxp.model.list" + + if not xdxp_filelist.is_file(): + log.error('The "xdxp.model.list" file not found in %s,', polpath) + exit(1) + + # Check each input XdXp model fasta, full_pdb, and palmprint_pdb + with open(xdxp_filelist) as xdxp_fl: + log.info(" Verifying XdXp Model List") + # Verify each XdXp file in directory tree + for line in xdxp_fl: + line = line.strip() + if len(line) > 0: + name = line + log.info(" %s", name) + # verify fasta + if not (polpath/"fa"/f"{name}.fa").is_file(): + log.warning("Fasta file missing for: %s", name) + continue + # verify full sequence + if not (polpath/"full_length"/f"{name}.pdb.gz").is_file(): + log.warning("Full length structure missing for: %s", name) + continue + # verify palmprint pdb structucture (required) + if not (polpath/"palmprint"/f"{name}.pdb").is_file(): + log.error(".pdb missing for: %s", name) + exit(1) + self.xdxps.append(name) + + self.all_domains = self.rdrps + self.xdxps + + # Classification Routine ------------------------------ + # Runs TMalign of input against all palmprint pdb + # Extracts scores and re-aligns if a RdRp palmprint detected + def align(self, pdbpath:Path, outpath:Path, name:str, tm_threshold:float=0.4): + ''' + fetch `pdbpath`/`name`.pdb file as target. + align it to each palm in `self.polpath`/{rdrp,xdxp}.model.list. + record the align score in `outpath`/{name}.tm + + if the highest score surpass given `tm_threshold` + realign the target pdb to highest matching (`outpath`/{name}_realign.pdb) + write the seq of palmprint/core (`outpath`/{name}_{pp,rc}.fa) + + parameters: + ----------- + pdbpath,outpath: pathlib.Path (str compatible). + + name: str + should not contains suffix. + + tm_threshold: float=0.4 + only when highest score + will classification/realignment be executed. + ''' + pdbpath,outpath=Path(pdbpath),Path(outpath) # for compatibility + + pdb_file = pdbpath / f'{name}.pdb' + if not pdb_file.is_file(): + log.warning("No .pdb file found %s", pdb_file) + return + + # Define input/output files + outf:Callable[[str], Path] = lambda x: outpath / (pdb_file.stem+x) + + out_tsv,out_ppfa,out_rcfa,out_rpdb = ( + outf('.tm'),outf(".pp.fa"),outf(".rc.fa"),outf("_realigned.pdb")) + + # Reset maximum scores observed + max_rdrp_score = max_xdxp_score = 0 + max_rdrp = max_xdxp = None + + # Initialize output TSV file + log.info(" Writing tsv output to %s", out_tsv) + with open(out_tsv, "w") as out: + print( + "PDBchain1\tPDBchain2\tTM1\tTM2\t" + + "RMSD\tID1\tID2\tIDali\tL1\tL2\tLali", + file=out + ) + + log.info(" Run TMalign against RdRp palmprints") + for domain in self.rdrps: + values = tm_align(pdb_file,self.polpath/'palmprint'/f'{domain}.pdb') + print("\t".join([str(v) for v in values]), file=out) + score= values[3] + if score >= max_rdrp_score: + max_rdrp_score = score + max_rdrp = domain + + # Run XdXp-palmprint TMalign and identify max_XdXp_Score + log.info(" Run TMalign against XdXp palmprints") + for domain in self.xdxps: + # Run TM-Align + values = tm_align(pdb_file,self.polpath/'palmprint'/f'{domain}.pdb') + print("\t".join([str(v) for v in values]), file=out) + score= values[3] + if score >= max_xdxp_score: + max_xdxp_score = score + max_xdxp = domain + + # Does maxRdRP_score pass CUTOFF value + #TODO write top matches in a log file. (now only in stdout in verbose mode) + log.info("") + log.info(" Max RdRp palmprint: %s - %s", max_rdrp, max_rdrp_score) + log.info(" Max XdXp palmprint: %s - %s", max_xdxp, max_xdxp_score) + log.info("") + + # RdRp classification + # RdRp TMscore above re-alignment threshold + if max(max_rdrp_score, max_xdxp_score) >= tm_threshold: + # MAIN TODO output a pse file with: + # 1.original pdb & best match palm pdb, aligned. + # 2.highlighted by colors: motif 1-5 + # 3.highlighted by licorice: ion-clamps, aromatic/polar gates. + + log.info(" + Polymerase re-alignment threshold score reached: %s", tm_threshold) + if max_rdrp_score > max_xdxp_score: + # RdRp+ classification + log.info(" ++ RdRp-classification for %s", name) + ## fix a bug here + domain=max_rdrp + else: + log.info(" %s classified as non-RdRp polymerase", name) + domain=max_xdxp + log.info("") + log.info(" Re-aligning:") + + # Run TMalign to re-align input structure to top-RdRp palmprint + + realign_cmd = (f"{TMALIGN} -outfmt 1 {pdb_file} " + f"{self.polpath/'palmprint'/f'{domain}.pdb'} -o {out_rpdb.with_suffix('')}") + log.info(" %s", realign_cmd) + + with TemporaryDirectory() as tdir: + temp_file = Path(tdir)/f'{name}.pdb' + log.info('execute `palmgrab` to sub-select palmprint/core fasta seq') + with open(temp_file,'w+') as f: + subprocess.run(realign_cmd.split(" "), stdout=f) + palmgrab(temp_file,f"{out_ppfa}",f"{out_rcfa}") + + for item in outpath.iterdir(): + if '_realigned' in item.stem and item.suffix!='.pdb': + item.unlink() + log.info(" done") + log.info("") +