Skip to content

Commit

Permalink
Merge pull request #74 from moshi4/develop
Browse files Browse the repository at this point in the history
Bump to v1.5.0
  • Loading branch information
moshi4 authored Dec 21, 2024
2 parents 17e7c00 + 609a116 commit 569a22d
Show file tree
Hide file tree
Showing 9 changed files with 231 additions and 35 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
os: [ubuntu-24.04, macos-latest]
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- name: Checkout
Expand All @@ -32,7 +32,7 @@ jobs:
- name: Install external tool dependencies
run: |
sudo apt update -y
sudo apt install -y ncbi-blast+ mmseqs2 mummer progressivemauve
sudo apt install -y ncbi-blast+ mmseqs2 mummer progressivemauve last-align
if: ${{ matrix.os=='ubuntu-latest' }}

- name: Run ruff lint check
Expand Down
2 changes: 1 addition & 1 deletion src/pygenomeviz/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pygenomeviz.genomeviz import GenomeViz

__version__ = "1.4.2"
__version__ = "1.5.0"

__all__ = [
"GenomeViz",
Expand Down
2 changes: 2 additions & 0 deletions src/pygenomeviz/align/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pygenomeviz.align.tool import (
AlignToolBase,
Blast,
Last,
MMseqs,
MUMmer,
ProgressiveMauve,
Expand All @@ -11,6 +12,7 @@
"AlignCoord",
"AlignToolBase",
"Blast",
"Last",
"MMseqs",
"MUMmer",
"ProgressiveMauve",
Expand Down
71 changes: 42 additions & 29 deletions src/pygenomeviz/align/coord.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import csv
import io
from dataclasses import astuple, dataclass
from functools import cached_property
from pathlib import Path

from pygenomeviz.typing import SeqType


@dataclass
@dataclass(frozen=True)
class AlignCoord:
"""Alignment Coordinates DataClass (0-based coordinate)"""

Expand All @@ -23,58 +24,58 @@ class AlignCoord:
identity: float | None = None
evalue: float | None = None

@property
@cached_property
def query_length(self) -> int:
"""Query length"""
return abs(self.query_end - self.query_start)

@property
@cached_property
def query_strand(self) -> int:
"""Query strand"""
return 1 if self.query_end >= self.query_start else -1

@property
@cached_property
def query_link(self) -> tuple[str, str, int, int]:
"""Query (name, start, end) link"""
return (self.query_id, self.query_name, self.query_start, self.query_end)

@property
@cached_property
def query_block(self) -> tuple[int, int, int]:
"""Query (start, end, strand) block"""
if self.query_start < self.query_end:
return (self.query_start, self.query_end, self.query_strand)
else:
return (self.query_end, self.query_start, self.query_strand)

@property
@cached_property
def ref_length(self) -> int:
"""Reference length"""
return abs(self.query_end - self.query_start)

@property
@cached_property
def ref_strand(self) -> int:
"""Reference strand"""
return 1 if self.ref_end >= self.ref_start else -1

@property
@cached_property
def ref_link(self) -> tuple[str, str, int, int]:
"""Reference (name, start, end) link"""
return (self.ref_id, self.ref_name, self.ref_start, self.ref_end)

@property
@cached_property
def ref_block(self) -> tuple[int, int, int]:
"""Reference (start, end, strand) block"""
if self.ref_start < self.ref_end:
return (self.ref_start, self.ref_end, self.ref_strand)
else:
return (self.ref_end, self.ref_start, self.ref_strand)

@property
@cached_property
def is_inverted(self) -> bool:
"""Check inverted or not"""
return self.query_strand * self.ref_strand < 0

@property
@cached_property
def as_tsv_format(self) -> str:
"""TSV format text"""
return "\t".join(
Expand All @@ -100,12 +101,12 @@ def parse_blast_file(
query_id: str,
ref_id: str,
) -> list[AlignCoord]:
"""Parse blast result file (outfmt=6)
"""Parse blast format result file (outfmt=6)
Parameters
----------
blast_file : str | Path
Blast result file
Blast format result file
query_id : str
Query ID
ref_id : str
Expand All @@ -120,16 +121,20 @@ def parse_blast_file(
with open(blast_file) as f:
reader = csv.reader(f, delimiter="\t")
for row in reader:
if row[0].startswith("#"):
continue
qseqid, sseqid = row[0], row[1]
# Blast pident: 0 <= pident <= 100
# Last pident: 0 <= pident <= 100
# MMseqs pident: 0 <= pident <= 1.0
pident = float(row[2])
if 0 <= pident <= 1.0:
# Convert [0-1] to [0-100] pident (e.g. 0.95215 -> 95.21)
pident = int(float(pident) * 10000) / 100
qstart, qend, sstart, send = map(int, row[6:10])
qstart, sstart = qstart - 1, sstart - 1 # 1-based to 0-based coordinate
evalue = float(row[10])
# No evalue column exist in Last output
evalue = float(row[10]) if len(row) >= 11 else None

align_coords.append(
AlignCoord(
Expand Down Expand Up @@ -419,31 +424,39 @@ def filter_overlap(align_coords: list[AlignCoord]) -> list[AlignCoord]:
def __contains__(self, target_ac: AlignCoord) -> bool:
"""Check whether target is completely overlapping with self"""
# Check query-ref is same value or not
if not (
self.query_id == target_ac.query_id
and self.query_name == target_ac.query_name
and self.ref_id == target_ac.ref_id
and self.ref_name == target_ac.ref_name
if (
self.query_id != target_ac.query_id
or self.query_name != target_ac.query_name
or self.ref_id != target_ac.ref_id
or self.ref_name != target_ac.ref_name
):
return False

# Check same query-ref coord overlap
ac1, ac2 = target_ac, self
ac1_qmin = min(ac1.query_start, ac1.query_end)
ac1_qmax = max(ac1.query_start, ac1.query_end)
ac2_qmin = min(ac2.query_start, ac2.query_end)
ac2_qmax = max(ac2.query_start, ac2.query_end)
ac1_rmin = min(ac1.ref_start, ac1.ref_end)
ac1_rmax = max(ac1.ref_start, ac1.ref_end)
ac2_rmin = min(ac2.ref_start, ac2.ref_end)
ac2_rmax = max(ac2.ref_start, ac2.ref_end)
if (
ac2_qmin <= ac1_qmin <= ac1_qmax <= ac2_qmax
and ac2_rmin <= ac1_rmin <= ac1_rmax <= ac2_rmax
ac2._qmin <= ac1._qmin <= ac1._qmax <= ac2._qmax
and ac2._rmin <= ac1._rmin <= ac1._rmax <= ac2._rmax
):
return True
else:
return False

@cached_property
def _qmin(self) -> int:
return min(self.query_start, self.query_end)

@cached_property
def _qmax(self) -> int:
return max(self.query_start, self.query_end)

@cached_property
def _rmin(self) -> int:
return min(self.ref_start, self.ref_end)

@cached_property
def _rmax(self) -> int:
return max(self.ref_start, self.ref_end)

def __eq__(self, target_ac: AlignCoord) -> bool:
return self.as_tsv_format == target_ac.as_tsv_format
4 changes: 3 additions & 1 deletion src/pygenomeviz/align/tool/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from pygenomeviz.align.tool.base import AlignToolBase
from pygenomeviz.align.tool.blast import Blast
from pygenomeviz.align.tool.last import Last
from pygenomeviz.align.tool.mmseqs import MMseqs
from pygenomeviz.align.tool.mummer import MUMmer
from pygenomeviz.align.tool.pmauve import ProgressiveMauve

__all__ = [
"AlignToolBase",
"Blast",
"Last",
"MMseqs",
"MUMmer",
"ProgressiveMauve",
"AlignToolBase",
]
128 changes: 128 additions & 0 deletions src/pygenomeviz/align/tool/last.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
from __future__ import annotations

import logging
import os
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Sequence

from pygenomeviz.align import AlignCoord
from pygenomeviz.align.tool import AlignToolBase
from pygenomeviz.parser import Fasta, Genbank


class Last(AlignToolBase):
"""Last Alignment Class
This class is experimental. API may change in the future release.
"""

def __init__(
self,
seqs: Sequence[str | Path | Fasta | Genbank],
*,
outdir: str | Path | None = None,
threads: int | None = None,
cmd_opts: str | None = None,
logger: logging.Logger | None = None,
quiet: bool = True,
):
"""
Parameters
----------
seqs : Sequence[str | Path | Fasta | Genbank]
List of fasta or genbank
(file suffix must be `.fa`, `.fna`, `.fasta`, `.gb`, `.gbk`, `.gbff`)
outdir : str | Path | None, optional
Temporary result directory. If None, tmp directory is auto created.
threads : int | None, optional
Threads parameter for lastal run
cmd_opts : str | None, optional
`lastal` additional command options
logger : logging.Logger | None, optional
Logger object. If None, logger instance newly created.
quiet : bool, optional
If True, don't display log message
"""
super().__init__(logger, quiet)

self._seqs = self._parse_input_gbk_and_fasta_seqs(seqs)
self._outdir = None if outdir is None else Path(outdir)
self._threads = self.max_threads if threads is None else threads
self._cmd_opts = cmd_opts

@classmethod
def get_tool_name(cls) -> str:
"""Tool name"""
return "Last"

@classmethod
def get_binary_names(cls) -> list[str]:
"""Binary names"""
return ["lastdb", "lastal", "last-split", "maf-convert"]

def run(self) -> list[AlignCoord]:
"""Run genome alignment"""
# Last genome alignment parameters are selected based on this page
# https://gitlab.com/mcfrith/last/-/blob/main/doc/last-cookbook.rst
with TemporaryDirectory() as tmpdir:
outdir = self._outdir if self._outdir else tmpdir
outdir = Path(outdir)
os.makedirs(outdir, exist_ok=True)
genome_files: list[Path] = self._write_genome_files(outdir)

self._logger.info(f"{'='*10} Start Last Alignment {'='*10}")
align_coords = []
for idx in range(len(genome_files) - 1):
qfile, rfile = genome_files[idx], genome_files[idx + 1]
qname, rname = qfile.stem, rfile.stem
self._logger.info(f"{idx+1:02d}. Last Alignment '{qname}' vs '{rname}'")
# Make Last database
lastdb = outdir / f"{rname}_lastdb"
cmd = f"lastdb '{lastdb}' '{rfile}' -P {self._threads}"
self.run_cmd(cmd, self._logger)
# Run Last alignment
outfile_prefix = f"{idx+1:02d}_{qname}_vs_{rname}"
maf_outfile1 = outdir / f"{outfile_prefix}_many-to-one.maf"
cmd = f"lastal '{lastdb}' '{qfile}' -P {self._threads} -D 1e9 --split-f=MAF+" # noqa
if self._cmd_opts:
cmd = f"{cmd} {self._cmd_opts}"
self.run_cmd(cmd, self._logger, maf_outfile1)
# Convert many-to-one -> one-to-one
maf_outfile2 = outdir / f"{outfile_prefix}_one-to-one.maf"
cmd = f"last-split -r '{maf_outfile1}'"
self.run_cmd(cmd, self._logger, maf_outfile2)
# Convert MAF -> BlastTab format
blast_outfile = outdir / f"{outfile_prefix}.tsv"
cmd = f"maf-convert -n blasttab '{maf_outfile2}'"
self.run_cmd(cmd, self._logger, blast_outfile)

align_coords.extend(
AlignCoord.parse_blast_file(blast_outfile, qname, rname)
)
self._logger.info(f"{'='*10} Finish Last Alignment {'='*10}")

return align_coords

def _write_genome_files(self, outdir: str | Path) -> list[Path]:
"""Write genome fasta files to output directory
Parameters
----------
outdir : str | Path
Target output directory
Returns
-------
genome_files : list[Path]
Genome fasta files
"""
genome_files: list[Path] = []
for seq in self._seqs:
genome_file = Path(outdir) / f"{seq.name}.fna"
cls_name = seq.__class__.__name__
log_msg = f"Convert {cls_name} object to genome fasta file '{genome_file}'"
self._logger.info(log_msg)
seq.write_genome_fasta(genome_file)
genome_files.append(genome_file)
return genome_files
6 changes: 5 additions & 1 deletion src/pygenomeviz/scripts/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import argparse
import importlib.util
import os
import signal
import subprocess as sp
import sys
import textwrap
Expand Down Expand Up @@ -43,7 +44,10 @@ def main() -> None:
os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"
os.environ["STREAMLIT_SERVER_MAX_UPLOAD_SIZE"] = "100"
os.environ["PGV_GUI_LOCAL"] = "true"
sp.run(f"streamlit run {app_path} --server.port {port}", shell=True)
try:
sp.run(f"streamlit run {app_path} --server.port {port}", shell=True)
except KeyboardInterrupt:
sys.exit(-signal.SIGINT)


def get_args(cli_args: list[str] | None = None) -> argparse.Namespace:
Expand Down
Loading

0 comments on commit 569a22d

Please sign in to comment.