HerwigLab · YalanBi · May 6, 2025 · Mar 19, 2025 · Mar 19, 2025 · Mar 24, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,24 +1,41 @@
 # Change Log
 
 ## TODO: ideas, issues and planed extensions or changes that are not yet implemented
+
 * optimize add_qc_metrics for run after new samples have been added - should not recompute everything
-* planned new feature: during import of long reads, (optionally) correct for short exon alignment issues. 
+* planned new feature: during import of long reads, (optionally) correct for short exon alignment issues.
 * separate new read import and classification of isoforms.
 
+## [2.0.0]
+
+* supported the analysis of Oxford Nanopore data
+* new option: TSS identification according to the reference annotation
+* new gene model characteristics, simplex coordinates and relative entropy
+* new visualisation using triangle plot
+* fixed bugs in external gtf import to reconstruct transcriptome
+* improved the coordination analysis of TSS and PAS
+* supported the import of SQANTI QC report and filtering based on it
+* supported the export of positive and negative TSS for ML
+* supported proteogenomic approaches at the interface of transcriptomics and proteomics
+* improved code readability and filter tag construction
+* ...
+
 ## [0.3.5]
+
 * fixed a bug in domain plots, which was introduced in 0.3.4
 * fixed a bug in iter_genes/iter_transcripts with region='chr', and no positions specified
 * new option in plot_domains to depict noncoding transcripts, controlled with the coding_only parameter
 
 ## [0.3.4]
+
 * fixing #8: AssertationError when unifying TSS/PAS between transcript
-* improved domain plots: ORF start and end do not appear like exon exon boundaries. 
-* API change: separated ORF prediction from QC metrics calculation. 
+* improved domain plots: ORF start and end do not appear like exon exon boundaries.
+* API change: separated ORF prediction from QC metrics calculation.
 * new feature: count number of upstream start codons in Gene.add_orfs() (called by default when adding QC metrics to transcriptome)
-* new feature: calculate Fickett testcode and hexamer score for longest ORFs, to separate coding and noncoding genes. 
-
+* new feature: calculate Fickett testcode and hexamer score for longest ORFs, to separate coding and noncoding genes.
 
 ## [0.3.3]
+
 * fixed bug in filter_ref_transcripts with no query
 * export gtf with long read transcripts as well uncovered as reference transcripts
 * fix warning in plot_diff_results
@@ -28,11 +45,13 @@
 * improved documentation: syntax highlighting, code style, additional explanations on filtering
 
 ## [0.3.2]
+
 * restructured tutorials
 * new feature: add domains to differential splicing result tables.
 * new feature: min_coverage and max_coverage for iter_genes function.
 
 ## [0.3.1]
+
 * new feature: add protein domains from 3 different sources and depict them with Gene.plot_domains()
 * new feature: restrict gene and transcript iterators on list of genes of interest
 * new feature: filter_transcripts function for genes
@@ -43,39 +62,45 @@
     * order of events is now according to gene strand: A upstream of B
 
 ## [0.3.0]
+
 * new feature: find longest ORF and infer NMD of lr transcripts (and annotation)
 * new feature: allow for several TSS/PAS per intron chain and unify them across intron chains
 * changed default parameter of filter_query in run_isotools script to "FSM or not (INTERNAL_PRIMING or RTTS)"
 
 ## [0.2.11.1]
-* bugfix: KeyError during transcriptome reconstruction in _add_chimeric. 
+
+* bugfix: KeyError during transcriptome reconstruction in _add_chimeric.
 * bugfix: default colors in plot_diff_results.
 
 ## [0.2.11]
+
 * added function to import samples from csv/gtf to import transcriptome reconstruction / quantification from other tools.
 * dropped requirement for gtf files to be tabix indexed.
 
 
 ## [0.2.10]
+
 * fixed get_overlap - important for correct assignment of mono exonic genes to reference
 * added parameter to control for minimal mapping quality in add_sample_from_bam. This allows for filtering out ambiguous reads, which have mapping quality of 0
 * fixed plot_diff_result (Key error due to incorrect parsing of group names)
-* New function estimate_tpm_threshold, to estimate the minimal abundance level of observable transcripts, given a sequencing depth. 
-* New function coordination_test, to test coordination of splicing events within a gene. 
+* New function estimate_tpm_threshold, to estimate the minimal abundance level of observable transcripts, given a sequencing depth.
+* New function coordination_test, to test coordination of splicing events within a gene.
 * Optional log or linear scale for the coverage axis in sashimi plots.
 
 ## [0.2.9]
+
 * added DIE test
 * adjusted classification of novel exonic TSS/PAS to ISM
-* improved assignment of reference genes in case of equal number of matching splice sites to several reference genes. 
+* improved assignment of reference genes in case of equal number of matching splice sites to several reference genes.
 * added parameter to control for minimal exonic overlap to reference genes in add_sample_from_bam.
 * changed computation of direct repeats. Added wobble and max_mm parameters.
-* exposed parameters to end user in the add_qc_metrics function. 
+* exposed parameters to end user in the add_qc_metrics function.
 * added options for additional fields in gtf output
 * improved options for graphical output with the command line script
 * fixed plot_bar default color scheme
 
 ## [0.2.8]
+
 * fix: version information lost when pickeling reference.
 * fix missing gene name
 * added pt_size parameter to plot_embedding and plot_diff_results function
@@ -84,11 +109,13 @@
 
 
 ## [0.2.7]
+
 * added command line script run_isotools.py
-* added test data for unit tests 
+* added test data for unit tests
 
 
 ## [0.2.6]
+
 * Added unit tests
 * Fixed bug in novel splicing subcategory assignment
 * new feature: rarefaction analysis
@@ -98,55 +125,63 @@
     * added optional progress bar to iter_genes/transcripts
 
 ## [0.2.5]
+
 * New feature: distinguish noncanonical and canonical novel splice sites for direct repeat hist
 * New feature: option to drop partially aligned reads with the min_align_fraction parameter in add_sample_from_bam
 
 ## [0.2.4]
+
 * New feature: added option to save read names during bam import
 * new feature: gzip compressed gtf output
 
 ## [0.2.3]
+
 * Changed assignment of transcripts to genes if no splice sites match.
 * Fix: more flexible import of reference files, gene name not required (but id is), introducing "infer_genes" from exon entries of gtf files.
 * New function: Transcriptome.remove_filter(filter=[tags])
 
 ## [0.2.2]
+
 * Fix: export to gtf with filter features
 
 ## [0.2.1]
+
 * Fix: import reference from gtf file
 * New feature: Import multiple samples from single bam tagged by barcode (e.g. from single cell data)
 * Fix: issue with zero base exons after shifting fuzzy junctions
 
-
 ## [0.2.0]
+
 * restructure to meet PyPI recommendations
 * New feature: isoseq.altsplice_test accepts more than 2 groups, and computes ML parameters for all groups
 
 ## [0.1.5]
+
 * New feature: restrict tests on provided splice_types
 * New feature: provide position to find given alternative splicing events
 
 ## [0.1.4]
+
 * Fix: Issue with noncanonical splicing detection introduced in 0.1.3
 * Fix: crash with secondary alignments in bam files during import.
 * New feature: Report and skip if alignment outside chromosome (uLTRA issue)
 * Fix: import of chimeric reads (secondary alignments have no SA tag)
-* Fix: Transcripts per sample in sample table: During import count only used transcripts, do not count chimeric transcripts twice. 
+* Fix: Transcripts per sample in sample table: During import count only used transcripts, do not count chimeric transcripts twice.
 * Change: sample_table reports chimeric_reads and nonchimeric_reads (instead of total_reads)
 * Change: import of long read bam is more verbose in info mode
 * Fix: Bug: import of chained chimeric alignments overwrites read coverage when merging to existing transcript
 * Fix: remove_samples actually removes the samples from the sample_table
 * Change: refactored add_biases to add_qc_metrics
 * fix: property of transcripts included {sample_name:0}
 * save the TSS and PAS positions
-* New: use_satag parameter for add_sample_from_bam 
+* New: use_satag parameter for add_sample_from_bam
 * Change: use median TSS/PAS (of all reads with same splice pattern) as transcript start/end (e.g. exons[0][0]/exons[-1][1])
 * Fix: Novel exon skipping annotation now finds all exonic regions that are skipped.
 * change: Default filter of FRAGMENTS now only tags reads that do not use a reference TSS or PAS
+
 ## [0.1.3]
-* Fix: improved performance of noncanonical splicing detection by avoiding redundant lookups. 
 
+* Fix: improved performance of noncanonical splicing detection by avoiding redundant lookups.
 
 ## [0.1.2] - 2020-05-03
 
@@ -157,7 +192,6 @@
 * New: Do not distinguish intronic/exonic novel splice sites. Report distance to shortest splice site of same type.
 * Fix: Sashimi plots ignored mono exons
 
-
 ## [0.1.1] - 2020-04-12
 
 * Fix: fixed bug in TSS/PAS events affecting start/end positions and known flag.
@@ -170,23 +204,23 @@
 * moved examples in documentation
 
 ## [0.0.2] - 2020-03-22
+
 * Change: refactored SpliceGraph to SegmentGraph to better comply with common terms in literature
-* New: added a basic implementation of an actual SpliceGraph (as commonly defined in literature) 
+* New: added a basic implementation of an actual SpliceGraph (as commonly defined in literature)
     * based on sorted dict
     * not used so far, but maybe useful in importing the long read bam files since it can be extended easily
-* New: added decorators "experimental" and "deprecated" to mark unsafe functions 
+* New: added decorators "experimental" and "deprecated" to mark unsafe functions
 * Change: in differential splicing changed the alternative fraction, to match the common PSI (% spliced in) definition
 * Change: narrowed definition of mutually exclusive exons: the alternatives now need to to feature exactly one ME exon and rejoin at node C
 * Change: for ME exons now the beginning of node C is returned as "end" of the splice bubble
-* New: differential splicing result contains "novel", indicating that the the alternative is in the annotation 
+* New: differential splicing result contains "novel", indicating that the alternative is in the annotation
 * New: added alternative TSS/alternative PAS to the differential splicing test
 * Change: removed obsolete weights from splice graph and added strand
 * Change: unified parameters and column names of results of Transcriptome.find_splice_bubbles() and Transcriptome.altsplice_test()
-* Fix: add_short_read_coverage broken if short reads are already there. 
-
+* Fix: add_short_read_coverage broken if short reads are already there.
 
 ## [0.0.1] - 2020-02-25
+
 * first shared version
 * New: added option to export alternative splicing events for MISO and rMATS
 * New: added change log
-
diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-0.3.5_rc11
+2.0.0
diff --git a/setup.cfg b/setup.cfg
@@ -1,15 +1,15 @@
 [metadata]
 name = isotools
 version = file: VERSION.txt
-author = Matthias Lienhard
-author_email = [email protected]
+author = Matthias Lienhard, Yalan Bi
+author_email = [email protected], [email protected]
 description = Framework for the analysis of long read transcriptome sequencing data
 long_description = file: README.md
 long_description_content_type = text/markdown
 license_files = LICENSE.txt
-url = https://github.com/MatthiasLienhard/isotools
+url = https://github.com/HerwigLab/IsoTools2
 project_urls =
-    Bug Tracker = https://github.com/MatthiasLienhard/isotools/issues
+    Bug Tracker = https://github.com/HerwigLab/IsoTools2/issues
 classifiers =
     Programming Language :: Python :: 3
     License :: OSI Approved :: MIT License

diff --git a/src/isotools/_gene_plots.py b/src/isotools/_gene_plots.py
@@ -222,7 +222,6 @@ def sashimi_plot(self, samples=None, title='Long read sashimi plot', ax=None, ju
 
     The Sashimi plot depicts the genomic long read sequencing coverage of one or more samples as blocks, and junction coverage as arcs.
 
-
     :param samples: Names of the samples to be depicted (as a list).
     :param title: Specify the title of the axis.
     :param ax: Specify the axis.
@@ -242,7 +241,7 @@ def sashimi_plot(self, samples=None, title='Long read sashimi plot', ax=None, ju
     :param min_cov_th: Coverage threshold for a junction to be considdered at all.
     :param text_width: Scaling factor for the horizontal space that gets reserved for labels on the arcs.
         This affects the height of the arcs.
-    :param arc_type: Label the junction arcs with  the "coverage" (e.g. number of supporting reads),
+    :param arc_type: Label the junction arcs with the "coverage" (e.g. number of supporting reads),
         "fraction" (e.g. fraction of supporting reads in %), or "both".
     :param text_height: Scaling factor for the vertical space that gets reserved for labels on the arcs.
         This affects the height of the arcs.'''

diff --git a/src/isotools/_transcriptome_io.py b/src/isotools/_transcriptome_io.py
@@ -1007,7 +1007,7 @@ def _read_gtf_file(file_name, chromosomes, infer_genes=False, progress_bar=True)
                 logger.warning('skipping line with unknown strand:\n%s', line)
                 # add this entry to skipped
                 # keys are feature types (ls[2], e.g. gene, transcript, exon) and values are sets of feature ids that are searched in ls[-1]
-                feature_id = [i.split(' ')[-1].strip('"') for i in ls[-1].split(sep=';') if f'{ls[2]}_id' or f'{ls[2]}_number' in i]
+                feature_id = [i.split(' ')[-1].strip('"') for i in ls[-1].split(sep=';') if f'{ls[2]}_id' in i or f'{ls[2]}_number' in i]
                 if len(feature_id) == 1:
                     skipped[ls[2]].add(feature_id[0])
                 else:
@@ -1149,7 +1149,7 @@ def import_ref_transcripts(fn, transcriptome: Transcriptome, file_format, chromo
 
     if skipped:
         logger.info('skipped the following categories: %s', skipped.keys())
-    
+
     logger.debug('construct interval trees for genes...')
     genes: dict[str, IntervalTree[Gene]] = {}
     for chrom in gene_infos:
@@ -1521,23 +1521,29 @@ def write_gtf(self: Transcriptome, fn, source='isotools', gzip=False, **filter_a
             f.write('\n'.join(('\t'.join(str(field) for field in line) for line in lines)) + '\n')
 
 
-def write_fasta(self: Transcriptome, genome_fn, fn, gzip=False, reference=False, protein=False, **filter_args):
+def write_fasta(self: Transcriptome, genome_fn, fn, gzip=False, reference=False, protein=False, coverage=None, **filter_args):
     '''
     Exports the transcript sequences in fasta format to a file.
 
     :param genome_fn: Path to the genome in fastA format.
     :param reference: Specify whether the sequence is fetched for reference transcripts (True), or long read transcripts (False, default).
     :param protein: Return protein sequences (ORF) instead of transcript sequences.
+    :param coverage: By default, the coverage is not added to the header of the fasta. If set, the allowed values are: 'all', or 'sample'.
+        'all' - total coverage for all samples; 'sample' - coverage by sample.
     :param fn: The filename to write the fasta.
     :param gzip: Compress the output as gzip.
     :param filter_args: Additional filter arguments (e.g. "region", "gois", "query") are passed to iter_transcripts.
     '''
 
+    if coverage:
+        assert coverage in ['all', 'sample'], 'if coverage is set, it must be "all", or "sample"'
+
     with openfile(fn, gzip) as f:
         logger.info('writing %sfasta file to %s', "gzip compressed " if gzip else "", fn)
         for gene, transcript_ids, _ in self.iter_transcripts(genewise=True, **filter_args):
             tr_seqs = gene.get_sequence(genome_fn, transcript_ids, reference=reference, protein=protein)
-            f.write('\n'.join(f'>{gene.id}_{k} gene={gene.name}\n{v}' for k,v in tr_seqs.items()) + '\n')
+            if len(tr_seqs) > 0:
+                f.write('\n'.join(f'>{gene.id}_{k} gene={gene.name}{(" coverage=" + (str(gene.coverage[:, k].sum()) if coverage == "all" else str(gene.coverage[:, k])) if coverage else "")}\n{v}' for k, v in tr_seqs.items()) + '\n')
 
 
 def export_alternative_splicing(self: Transcriptome, out_dir, out_format='mats', reference=False, min_total=100,

diff --git a/src/isotools/gene.py b/src/isotools/gene.py
@@ -334,7 +334,7 @@ def get_sequence(self, genome_fh, transcript_ids=None, reference=False, protein=
         :param transcript_ids: List of transcript ids for which the sequence are requested.
         :param reference: Specify whether the sequence is fetched for reference transcripts (True)
             or long read transcripts (False, default).
-        :param protein: Return protein sequences instead of transcript sequences.
+        :param protein: Return translated protein coding sequences instead of full transcript sequences.
         :returns: A dictionary of transcript ids and their sequences.
         '''
 
@@ -359,6 +359,7 @@ def get_sequence(self, genome_fh, transcript_ids=None, reference=False, protein=
             transcript_seqs = {i: reverse_complement(ts) for i, ts in transcript_seqs.items()}
         if not protein:
             return transcript_seqs
+
         prot_seqs = {}
         for i, transcript in trL:
             orf = transcript.get("CDS", transcript.get("ORF"))

diff --git a/src/isotools/plots.py b/src/isotools/plots.py
@@ -373,8 +373,12 @@ def plot_str_var_number(str_var_count, group_name:'str', n_multi=10, fig_size=(1
     fig, axs = plt.subplots(1, 3, figsize=fig_size)
 
     group_tab = str_var_count.loc[:, str_var_count.columns.str.startswith(group_name)]
+    feature_list = group_tab.columns.str.split('_').str[-1].unique().tolist()
 
-    for i, feature in enumerate(group_tab.columns.str.split('_').str[-1].unique()):
+    # update group_tab to avoid cases where group_name is a prefix of another group name
+    group_tab = group_tab.loc[:, [f'{group_name}_{f}' for f in feature_list]]
+
+    for i, feature in enumerate(feature_list):
         n_feature_tab = group_tab.filter(regex=feature).value_counts(dropna=True).to_frame().sort_index().reset_index()
         n_feature_tab.columns = ['n_feature', 'n_gene']
 
@@ -450,10 +454,6 @@ def triangle_plot(str_var_tab, ax=None, colors=None, tax_title=None):
     else:
         _, tax = ternary.figure(scale=scale)
 
-    for gn in groups:
-        vals = coords.loc[:, coords.columns.str.startswith(gn)]
-        tax.scatter(vals.to_numpy()[:, [2, 1, 0]], color=color_scheme[gn], alpha=.5, label=gn)
-
     tax.boundary(linewidth=1.5)
     tax.gridlines(multiple=0.25, linewidth=0.5)
 
@@ -476,6 +476,10 @@ def triangle_plot(str_var_tab, ax=None, colors=None, tax_title=None):
 
     tax.set_background_color(color="whitesmoke", alpha=0.7)
 
+    for gn in groups:
+        vals = coords.loc[:, coords.columns.str.startswith(gn)]
+        tax.scatter(vals.to_numpy()[:, [2, 1, 0]], color=color_scheme[gn], alpha=.7, label=gn)
+
     if isinstance(colors, dict):
         tax.legend(title=None, fontsize=10, facecolor='white', frameon=True)