Skip to content

Commit ae9afba

Browse files
authored
Merge pull request #509 from padix-key/application
Amend: Increase performance of fetching via sra-tools (#504)
2 parents 76ff66e + 847502c commit ae9afba

File tree

2 files changed

+47
-15
lines changed

2 files changed

+47
-15
lines changed

doc/examples/scripts/sequence/quality_control.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,12 @@
3232
FIG_SIZE = (8.0, 6.0)
3333

3434

35+
app = sra.FastqDumpApp("ERR266411")
36+
app.start()
37+
app.join()
3538
# Each run can have multiple reads per spot
3639
# by selecting index 0 we take only the first read for every spot
37-
sequences_and_scores = sra.FastqDumpApp.fetch("ERR266411")[0]
40+
sequences_and_scores = app.get_sequences_and_scores()[0]
3841
sequence_codes = np.stack([
3942
sequence.code for sequence, _ in sequences_and_scores.values()
4043
])
@@ -102,7 +105,7 @@
102105

103106
########################################################################
104107
# This is a typical distribution.
105-
#
108+
#
106109
# Now we want to see the appearance of each base over the length of the
107110
# sequence reads.
108111
# In a random library one would expect, that :math:`p(A) \approx p(T)`
@@ -209,7 +212,7 @@
209212
ax.bar(
210213
np.arange(0, len(duplication_level_freq)),
211214
duplication_level_freq,
212-
width=0.6,
215+
width=0.6,
213216
color=biotite.colors["dimorange"]
214217
)
215218
ax.set_xlim(0.5, len(duplication_level_freq) + 0.5)
@@ -218,6 +221,8 @@
218221
ax.set_ylabel("Sequence percentage (%)")
219222
fig.tight_layout()
220223

224+
plt.show()
225+
221226
########################################################################
222227
# The dataset has quite an unusual repetition profile:
223228
# Usually one would expect, that most sequences occur only once and the

src/biotite/application/sra/app.py

+39-12
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
__all__ = ["FastaDumpApp", "FastqDumpApp"]
88

99
import abc
10+
from os.path import join
1011
from subprocess import Popen, SubprocessError, PIPE, TimeoutExpired
1112
import glob
12-
from tempfile import gettempprefix, NamedTemporaryFile
13-
from ..localapp import cleanup_tempfile
13+
from tempfile import TemporaryDirectory
1414
from ..application import Application, AppState, AppStateError, \
1515
requires_state
1616
from ...sequence.seqtypes import NucleotideSequence
@@ -54,13 +54,11 @@ def __init__(self, uid, output_path_prefix=None,
5454
self._prefetch_path = prefetch_path
5555
self._fasterq_dump_path = fasterq_dump_path
5656
self._uid = uid
57+
self._sra_dir = TemporaryDirectory(suffix="_sra")
5758
if output_path_prefix is None:
58-
self._prefix = gettempprefix()
59+
self._prefix = join(self._sra_dir.name, self._uid)
5960
else:
6061
self._prefix = output_path_prefix
61-
self._sra_file = NamedTemporaryFile(
62-
"w", suffix=".sra", delete=False
63-
)
6462
self._prefetch_process = None
6563
self._fasterq_dump_process = None
6664

@@ -94,10 +92,14 @@ def join(self, timeout=None):
9492

9593

9694
def run(self):
95+
# Prefetch into a temp directory with file name equaling UID
96+
# This ensures that the ID in the header is not the temp prefix
97+
sra_file_name = join(self._sra_dir.name, self._uid)
9798
command = (
98-
f"{self._prefetch_path} -q -o {self._sra_file.name} {self._uid}; "
99+
f"{self._prefetch_path} -q -O {self._sra_dir.name} "
100+
f"{self.get_prefetch_options()} {self._uid}; "
99101
f"{self._fasterq_dump_path} -q -o {self._prefix}.fastq "
100-
f"{self.get_fastq_dump_options()} {self._sra_file.name}"
102+
f"{self.get_fastq_dump_options()} {sra_file_name}"
101103
)
102104
self._process = Popen(
103105
command, stdout=PIPE, stderr=PIPE, shell=True, encoding="UTF-8"
@@ -120,8 +122,8 @@ def evaluate(self):
120122
if exit_code != 0:
121123
err_msg = self._stderr.replace("\n", " ")
122124
raise SubprocessError(
123-
f"'{self._bin_path}' returned with exit code {exit_code}: "
124-
f"{err_msg}"
125+
f"'prefetch' or 'fasterq-dump' returned with exit code "
126+
f"{exit_code}: {err_msg}"
125127
)
126128

127129
self._file_names = (
@@ -130,6 +132,8 @@ def evaluate(self):
130132
# For entries with multiple reads per spot
131133
glob.glob(self._prefix + "_*.fastq")
132134
)
135+
print(self._prefix)
136+
print(self._file_names)
133137
# Only load FASTQ files into memory when needed
134138
self._fastq_files = None
135139

@@ -142,9 +146,24 @@ def wait_interval(self):
142146
def clean_up(self):
143147
if self.get_app_state() == AppState.CANCELLED:
144148
self._process.kill()
145-
cleanup_tempfile(self._sra_file)
149+
# Directory with temp files does not need to be deleted,
150+
# as temp dir is automatically deleted upon object destruction
146151

147152

153+
@requires_state(AppState.CREATED)
def get_prefetch_options(self):
    """
    Provide extra command line options for the `prefetch` call.

    PROTECTED: Override when inheriting.

    Returns
    -------
    options: str
        The additional options.
        They are inserted as-is into the `prefetch` command line;
        this default implementation adds none.
    """
    no_extra_options = ""
    return no_extra_options
166+
148167
@requires_state(AppState.CREATED)
149168
def get_fastq_dump_options(self):
150169
"""
@@ -155,7 +174,7 @@ def get_fastq_dump_options(self):
155174
Returns
156175
-------
157176
options: str
158-
The additional options
177+
The additional options.
159178
"""
160179
return ""
161180

@@ -359,6 +378,14 @@ def __init__(self, uid, output_path_prefix=None, prefetch_path="prefetch",
359378
self._fasta_files = None
360379

361380

381+
@requires_state(AppState.CREATED)
def get_prefetch_options(self):
    """
    Get additional options for the `prefetch` call.

    Returns
    -------
    options: str
        The additional options; currently none.
    """
    # An empty string must be returned instead of None:
    # the return value is formatted into the shell command string,
    # where None would show up as the literal argument 'None'
    #
    # TODO: Return '--eliminate-quals'
    #       when https://github.com/ncbi/sra-tools/issues/883 is resolved
    return ""
387+
388+
362389
@requires_state(AppState.CREATED)
def get_fastq_dump_options(self):
    # Extra option that is inserted into the `fasterq-dump` command line
    fasta_flag = "--fasta"
    return fasta_flag

0 commit comments

Comments
 (0)