Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion isovar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "1.5.0"
__version__ = "1.5.1"


from .allele_read import AlleleRead
Expand Down
12 changes: 9 additions & 3 deletions isovar/allele_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,20 @@ class AlleleRead(ValueObject):
Extremely simplified representation of a read at a locus: just the allele
at the locus and sequence before/after. We're ignoring the base qualities
and any additional information about splicing, clipping or alignment.

When overlapping mates from the same fragment are merged upstream,
`source_read_count` retains the number of raw reads represented by this
fragment-level allele observation.
"""
__slots__ = ["prefix", "allele", "suffix", "name", "sequence"]
__slots__ = ["prefix", "allele", "suffix", "name", "sequence", "source_read_count"]

def __init__(self, prefix, allele, suffix, name):
def __init__(self, prefix, allele, suffix, name, source_read_count=1):
self.prefix = prefix
self.allele = allele
self.suffix = suffix
self.name = name
self.sequence = prefix + allele + suffix
self.source_read_count = source_read_count

def __len__(self):
return len(self.sequence)
Expand Down Expand Up @@ -95,4 +100,5 @@ def from_locus_read(cls, locus_read):
prefix,
nucleotides_at_variant_locus,
suffix,
name=read_name)
name=read_name,
source_read_count=locus_read.source_read_count)
22 changes: 18 additions & 4 deletions isovar/dataframe_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,11 @@ def allele_counts_dataframe(read_evidence_generator):
return dataframe_from_generator(
element_class=ReadEvidence,
variant_and_elements_generator=read_evidence_generator,
# DataFrameBuilder will take the length of these fields' values
converters={
"ref_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads),
"alt_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads),
"other_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads),
},
rename_dict={
"ref_reads": "num_ref_reads",
"alt_reads": "num_alt_reads",
Expand All @@ -99,6 +103,7 @@ def allele_reads_to_dataframe(variants_and_allele_reads):
"""
df_builder = DataFrameBuilder(
AlleleRead,
exclude={"source_read_count"},
extra_column_fns={
"gene": lambda v, _: ";".join(v.gene_names),
})
Expand All @@ -116,6 +121,7 @@ def locus_reads_dataframe(alignments, chromosome, base0_start, base0_end, *args,
"""
df_builder = DataFrameBuilder(
LocusRead,
exclude={"source_read_count"},
variant_columns=False,
converters={
"reference_positions": list_to_string,
Expand Down Expand Up @@ -187,7 +193,10 @@ def translations_generator_to_dataframe(translations_generator):
},
extra_column_fns={
"untrimmed_variant_sequence_read_count": (
lambda _, t: len(t.untrimmed_variant_sequence.reads)),
lambda _, t: sum(
getattr(read, "source_read_count", 1)
for read in t.untrimmed_variant_sequence.reads
)),
})


Expand All @@ -197,7 +206,12 @@ def read_evidence_generator_to_dataframe(read_evidence_generator):
"""
return dataframe_from_generator(
element_class=ReadEvidence,
variant_and_elements_generator=read_evidence_generator)
variant_and_elements_generator=read_evidence_generator,
converters={
"ref_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads),
"alt_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads),
"other_reads": lambda reads: sum(getattr(read, "source_read_count", 1) for read in reads),
})


def isovar_results_to_dataframe(isovar_results):
Expand All @@ -213,4 +227,4 @@ def isovar_results_to_dataframe(isovar_results):
records = []
for isovar_result in isovar_results:
records.append(isovar_result.to_record())
return pd.DataFrame.from_records(records)
return pd.DataFrame.from_records(records)
13 changes: 9 additions & 4 deletions isovar/isovar_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
from .common import safediv
from .alignment_score import alignment_score


def _sum_source_read_count(reads):
    """
    Total number of raw reads represented by a collection of read objects.

    Each element contributes the value of its `source_read_count` attribute;
    an element which lacks that attribute is counted as a single read.

    Parameters
    ----------
    reads : iterable
        Read-like objects. Any iterable is accepted and it is traversed once.

    Returns int (0 when `reads` is empty)
    """
    return sum(getattr(read, "source_read_count", 1) for read in reads)


class IsovarResult(object):
"""
This object represents all information gathered about a variant,
Expand Down Expand Up @@ -983,7 +988,7 @@ def num_ref_reads(self):
"""
Number of reads which support the reference allele.
"""
return len(self.ref_reads)
return _sum_source_read_count(self.ref_reads)

@cached_property
def num_ref_fragments(self):
Expand All @@ -997,7 +1002,7 @@ def num_alt_reads(self):
"""
Number of reads which support the alt allele.
"""
return len(self.alt_reads)
return _sum_source_read_count(self.alt_reads)

@cached_property
def num_alt_fragments(self):
Expand All @@ -1011,7 +1016,7 @@ def num_other_reads(self):
"""
Number of reads which support neither the reference nor alt alleles.
"""
return len(self.other_reads)
return _sum_source_read_count(self.other_reads)

@cached_property
def num_other_fragments(self):
Expand Down Expand Up @@ -1147,4 +1152,4 @@ def num_phased_variants_in_protein_sequence(self):

Returns int
"""
return len(self.phased_variants_in_protein_sequence)
return len(self.phased_variants_in_protein_sequence)
16 changes: 13 additions & 3 deletions isovar/locus_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,17 @@ class LocusRead(ValueObject):
"""
Minimal set of information extracted from SAM/BAM alignment file at a particular
locus to later figure out the allele at this locus.

Overlapping paired-end reads from the same fragment may be merged into a
single LocusRead. In that case `source_read_count` records how many raw
alignments were collapsed into this fragment-level view.
"""
__slots__ = [
"name",
"sequence",
"reference_positions",
"quality_scores",
"source_read_count",
"reference_base0_start_inclusive",
"reference_base0_end_exclusive",
"read_base0_start_inclusive",
Expand All @@ -44,7 +49,8 @@ def __init__(
reference_base0_start_inclusive,
reference_base0_end_exclusive,
read_base0_start_inclusive,
read_base0_end_exclusive):
read_base0_end_exclusive,
source_read_count=1):
"""
Parameters
----------
Expand All @@ -61,6 +67,11 @@ def __init__(
quality_scores : array of int
Base qualities for every character in the sequence

source_read_count : int
Number of raw reads represented by this LocusRead. Usually 1, but
overlapping paired-end mates from the same fragment may be merged
into a single LocusRead while keeping this count.

reference_base0_start_inclusive : int
Start index of reference locus which is overlapped
by this read (base 0, inclusive)
Expand Down Expand Up @@ -134,9 +145,8 @@ def __init__(
self.sequence = sequence
self.reference_positions = reference_positions
self.quality_scores = quality_scores
self.source_read_count = source_read_count
self.reference_base0_start_inclusive = reference_base0_start_inclusive
self.reference_base0_end_exclusive = reference_base0_end_exclusive
self.read_base0_start_inclusive = read_base0_start_inclusive
self.read_base0_end_exclusive = read_base0_end_exclusive


2 changes: 1 addition & 1 deletion isovar/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def run_isovar(
threads=decompression_threads)

if read_collector is None:
read_collector = ReadCollector()
read_collector = ReadCollector(merge_overlapping_fragments=True)

if protein_sequence_creator is None:
protein_sequence_creator = ProteinSequenceCreator()
Expand Down
6 changes: 5 additions & 1 deletion isovar/protein_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
logger = get_logger(__name__)


def _sum_source_read_count(reads):
    """
    Total number of raw reads represented by a collection of read objects.

    Each element contributes the value of its `source_read_count` attribute;
    an element which lacks that attribute is counted as a single read.

    Parameters
    ----------
    reads : iterable
        Read-like objects. Any iterable is accepted and it is traversed once.

    Returns int (0 when `reads` is empty)
    """
    return sum(getattr(read, "source_read_count", 1) for read in reads)


class ProteinSequence(TranslationKey):
"""
Translated amino acid sequence aggregated across possibly multiple
Expand Down Expand Up @@ -181,7 +185,7 @@ def num_supporting_reads(self):

Returns int
"""
return len(self.supporting_reads)
return _sum_source_read_count(self.supporting_reads)

@property
def num_mismatches_before_variant(self):
Expand Down
Loading
Loading