Skip to content

Commit

Permalink
#7: Use pysam instead of PyFasta
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Oct 31, 2019
1 parent e316740 commit 6fc5024
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 18 deletions.
35 changes: 21 additions & 14 deletions pyreference/reference.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
from __future__ import print_function, absolute_import

import HTSeq
from functools import reduce
from deprecation import deprecated
from functools import reduce
import gzip
import json
from lazy import lazy
import operator
import os
from pyfasta.fasta import Fasta
from pyreference import settings
from pyreference.gene import Gene
from pyreference.mirna import MiRNA
from pyreference.pyreference_config import load_params_from_config
from pyreference.settings import BEST_REGION_TYPE_ORDER
from pyreference.settings import CHROM, START, END, STRAND
from pyreference.transcript import Transcript
from pyreference.utils.genomics_utils import HTSeqInterval_to_pyfasta_feature, \
get_unique_features_from_genomic_array_of_sets_iv, fasta_to_hash
from pyreference.utils.genomics_utils import get_unique_features_from_genomic_array_of_sets_iv, fasta_to_hash, \
HTSeqInterval_to_feature_dict, reverse_complement
from pysam import FastaFile # @UnresolvedImport
import six
import sys

Expand Down Expand Up @@ -237,30 +238,36 @@ def genome(self):
if not os.path.exists(self._genome_sequence_fasta):
raise IOError("Genome sequence file '%s' not found" % self.reference_fasta)

# @see: https://pypi.python.org/pypi/pyfasta
key_fn = lambda key : key.split()[0] # Use first value before whitespace as keys
return Fasta(self._genome_sequence_fasta, key_fn=key_fn)
return FastaFile(self._genome_sequence_fasta)

def get_sequence_from_iv(self, iv, upper_case=True):
pyfasta_feature = HTSeqInterval_to_pyfasta_feature(iv)
return self.get_sequence_from_pyfasta_feature(pyfasta_feature, upper_case=upper_case)
feature_dict = HTSeqInterval_to_feature_dict(iv)
return self.get_sequence_from_feature(feature_dict, upper_case=upper_case)


def get_sequence_from_pyfasta_feature(self, pyfasta_feature, upper_case=True):
def get_sequence_from_feature(self, feature_dict, upper_case=True):
'''Repetitive regions are sometimes represented as lower case.
If upper_case=True, return the sequence as upper case (Default).
If false, do not convert case, i.e retain lower case where it was present.'''

seq = self.genome.sequence(pyfasta_feature, one_based=False)
if upper_case:
chrom = str(feature_dict[CHROM])
start = feature_dict[START]
end = feature_dict[END]
strand = str(feature_dict[STRAND])
seq = self.genome.fetch(reference=chrom,
start=start,
end=end)
if strand == '-':
seq = reverse_complement(seq)

if upper_case:
seq = seq.upper()
return seq


def get_sequence_from_features(self, features):
sequences = []
for feature in features:
sequences.append(self.get_sequence_from_pyfasta_feature(feature))
sequences.append(self.get_sequence_from_feature(feature))

if sequences:
sequence = reduce(operator.add, sequences)
Expand Down
2 changes: 1 addition & 1 deletion pyreference/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

CODING_FEATURES = {"CDS", "start_codon", "stop_codon"}

# We want our features to match PyFasta so they can be used without conversion
# Keys used in dictionary (serialized to JSON)
CHROM = "chr"
START = "start"
END = "stop"
Expand Down
9 changes: 8 additions & 1 deletion pyreference/utils/genomics_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
@author: dlawrence
'''

from Bio.Alphabet import DNAAlphabet
from Bio.Seq import Seq
from Bio import SeqIO
import HTSeq

Expand All @@ -14,7 +17,7 @@
from pyreference.settings import CHROM, START, END, STRAND


def HTSeqInterval_to_pyfasta_feature(iv):
def HTSeqInterval_to_feature_dict(iv):
return {CHROM : iv.chrom, START : iv.start, END : iv.end, STRAND : iv.strand}

def dict_to_iv(data):
Expand Down Expand Up @@ -101,3 +104,7 @@ def fasta_to_hash(fasta):
indexed_fasta[record.id] = str(record.seq)
return indexed_fasta


def reverse_complement(dna_sequence):
seq = Seq(dna_sequence, DNAAlphabet())
return str(seq.reverse_complement())
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ configargparse
deprecation
HTSeq
lazy
pyfasta
pysam
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
'deprecation',
'HTSeq',
'lazy',
'pyfasta',
'pysam',
],
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
scripts=['bin/pyreference_gtf_to_json.py'],)
Expand Down

0 comments on commit 6fc5024

Please sign in to comment.