Skip to content

Commit 74b3230

Browse files
Corrects behavior with ANISOU lines in pdb_selaltloc (#131)
1 parent 10a7a37 commit 74b3230

File tree

5 files changed

+371
-56
lines changed

5 files changed

+371
-56
lines changed

pdbtools/pdb_selaltloc.py

+85-56
Original file line numberDiff line numberDiff line change
@@ -149,10 +149,15 @@ def select_altloc(fhandle, selloc=None, byocc=False):
149149
prev_resname = ''
150150
prev_resnum = ''
151151

152+
# uses the same function names in the loop below. However, depending
153+
# on the input options, the functions used are different. One is
154+
# specific for byocc=True, and other specific for occ char selection
152155
flush_func_multi_residues = flush_resloc_occ if byocc else flush_resloc
156+
153157
flush_func_single_residues = \
154158
flush_resloc_occ_same_residue if byocc else flush_resloc_id_same_residue
155159

160+
# defines records and terminators
156161
records = ('ATOM', 'HETATM', 'ANISOU')
157162
terminators = ('TER', 'END', 'CONECT', 'END', 'ENDMDL')
158163

@@ -170,16 +175,19 @@ def select_altloc(fhandle, selloc=None, byocc=False):
170175
# if we see the altloc group has changed, we should flush
171176
# the lines observed for the previous altloc group
172177

173-
# uses for loop instead of "yield from" to maintain compatibility
174-
# with older python version
178+
# uses "for loop" instead of "yield from" to maintain
179+
# compatibility with older Python versions
175180
if partial_altloc(altloc_lines):
176181
flush_func = flush_func_single_residues
177182
else:
178183
flush_func = flush_func_multi_residues
179184

180-
for __line in flush_func(selloc=selloc, altloc_lines=altloc_lines, res_per_loc=res_per_loc):
185+
for __line in flush_func(selloc=selloc, altloc_lines=altloc_lines):
181186
yield __line
182187

188+
altloc_lines = {}
189+
res_per_loc = {}
190+
183191
# saves the line per altloc identifier
184192
current_loc = altloc_lines.setdefault(altloc, [])
185193
current_loc.append(line)
@@ -200,9 +208,12 @@ def select_altloc(fhandle, selloc=None, byocc=False):
200208
flush_func = flush_func_single_residues
201209
else:
202210
flush_func = flush_func_multi_residues
203-
for __line in flush_func(selloc=selloc, altloc_lines=altloc_lines, res_per_loc=res_per_loc):
211+
for __line in flush_func(selloc=selloc, altloc_lines=altloc_lines):
204212
yield __line
205213

214+
altloc_lines = {}
215+
res_per_loc = {}
216+
206217
prev_altloc = ''
207218
prev_resname = ''
208219
prev_resnum = ''
@@ -224,9 +235,12 @@ def select_altloc(fhandle, selloc=None, byocc=False):
224235
else:
225236
flush_func = flush_func_multi_residues
226237

227-
for __line in flush_func(selloc=selloc, altloc_lines=altloc_lines, res_per_loc=res_per_loc):
238+
for __line in flush_func(selloc=selloc, altloc_lines=altloc_lines):
228239
yield __line
229240

241+
altloc_lines = []
242+
res_per_loc = {}
243+
230244

231245
def is_another_altloc_group(
232246
altloc,
@@ -238,7 +252,7 @@ def is_another_altloc_group(
238252
altloc_lines,
239253
rploc,
240254
):
241-
"""Detect if current line because to another altloc group."""
255+
"""Detect if current line belongs to a new altloc group."""
242256
a0 = prev_altloc
243257
a1 = altloc
244258
ra0 = prev_resname
@@ -266,7 +280,7 @@ def is_another_altloc_group(
266280
return is_another
267281

268282

269-
def flush_resloc(selloc, altloc_lines, res_per_loc):
283+
def flush_resloc(selloc, altloc_lines):
270284
"""Flush the captured altloc lines."""
271285
# only the selected altloc is yielded
272286
if selloc in altloc_lines:
@@ -280,12 +294,8 @@ def flush_resloc(selloc, altloc_lines, res_per_loc):
280294
for line2flush in lines2flush:
281295
yield line2flush
282296

283-
# clears the altloc group dictionary. Ready for the next one!
284-
altloc_lines.clear()
285-
res_per_loc.clear()
286297

287-
288-
def flush_resloc_occ(altloc_lines, res_per_loc, **kw):
298+
def flush_resloc_occ(altloc_lines, **kw):
289299
"""Flush the captured altloc lines by highest occupancy."""
290300
# only the altloc with the highest occupancy is yielded
291301
highest = 0.00
@@ -303,30 +313,21 @@ def flush_resloc_occ(altloc_lines, res_per_loc, **kw):
303313
for line2flush in altloc_lines[altloc]:
304314
yield line2flush[:16] + ' ' + line2flush[17:]
305315

306-
# clears the altloc group dictionary. Ready for the next one!
307-
altloc_lines.clear()
308-
res_per_loc.clear()
309-
310316

311-
def flush_resloc_id_same_residue(selloc, altloc_lines, res_per_loc):
317+
def flush_resloc_id_same_residue(selloc, altloc_lines):
312318
"""Flush altloc if altloc are atoms in the same residue - by ID."""
313319
# places all lines in a single list
314-
all_lines = []
315-
for altloc, lines in altloc_lines.items():
316-
all_lines.extend(lines)
317-
318-
# organize by atoms
319-
atoms = {}
320-
for line in all_lines:
321-
atom_number = int(line[6:11])
322-
atom = line[12:16]
323-
alist = atoms.setdefault((atom_number, atom), [])
324-
alist.append(line)
320+
sorted_atoms = _get_sort_atoms(altloc_lines)
325321

326-
sorted_atoms = sorted(list(atoms.items()), key=lambda x: x[0][0])
322+
for atom, linet in sorted_atoms:
323+
to_yield = []
324+
# remember linet is a tuple, where the first item is the atom number
325+
lines = linet[1]
327326

328-
to_yield = []
329-
for atom, lines in sorted_atoms:
327+
# here we don't need to care about anisou lines as in
328+
# `flush_resloc_occ_same_residue` because ATOM/HETATM and ANISOU
329+
# are already sorted by definition and lines are yielded from the
330+
# altloc record
330331
for line in lines:
331332
if line[16] == selloc:
332333
to_yield.append(line)
@@ -338,41 +339,69 @@ def flush_resloc_id_same_residue(selloc, altloc_lines, res_per_loc):
338339
for line in lines:
339340
yield line
340341

341-
altloc_lines.clear()
342-
res_per_loc.clear()
343342

344-
345-
def flush_resloc_occ_same_residue(altloc_lines, res_per_loc, **kw):
343+
def flush_resloc_occ_same_residue(altloc_lines, **kw):
    """Flush altloc if altloc are atoms in the same residue - by occ.

    For every atom returned by `_get_sort_atoms`, yields the ATOM/HETATM
    line with the highest occupancy (columns 55-60) and, when the atom
    has ANISOU records, the ANISOU line found at the same position as
    the selected ATOM/HETATM line. The altloc character (column 17) of
    every yielded line is replaced by a blank.

    Parameters
    ----------
    altloc_lines : dict
        Altloc identifier -> list of record lines of the altloc group.
    **kw
        Ignored. Accepted so that this function can be called with the
        same keyword arguments as the by-id flush functions.

    Raises
    ------
    ValueError
        If an atom has ANISOU lines but their number differs from the
        number of its ATOM/HETATM lines. Because this is a generator,
        the error is raised while iterating, not at call time.
    """
    def _occupancy(line):
        return float(line[54:60])

    for _, (_, lines) in _get_sort_atoms(altloc_lines):
        atom_lines = [ln for ln in lines if ln.startswith(("ATOM", "HETATM"))]
        anisou_lines = [ln for ln in lines if ln.startswith("ANISOU")]

        if not anisou_lines:
            # max() returns the first maximal item, which is the same
            # line a stable descending sort would place first
            best_atom = max(atom_lines, key=_occupancy)
            yield best_atom[:16] + ' ' + best_atom[17:]
            continue

        if len(atom_lines) != len(anisou_lines):
            emsg = (
                "There is an error with this PDB. "
                "We expect one ANISOU line per ATOM/HETATM lines. "
                "But the number of ATOM/HETATM and ANISOU lines differ."
            )
            raise ValueError(emsg)

        # ATOM/HETATM and ANISOU lines are paired by position; the
        # occupancy is read from the ATOM/HETATM line of each pair
        best_atom, best_anisou = max(
            zip(atom_lines, anisou_lines),
            key=lambda pair: _occupancy(pair[0]),
        )

        # ATOM/HETATM
        yield best_atom[:16] + ' ' + best_atom[17:]
        # ANISOU
        yield best_anisou[:16] + ' ' + best_anisou[17:]
378+
379+
def _get_sort_atoms(altloc_lines):
380+
# this function is used by both:
381+
# flush_resloc_occ_same_residue
382+
# flush_resloc_id_same_residue
348383
all_lines = []
349384
for altloc, lines in altloc_lines.items():
350385
all_lines.extend(lines)
351386

352387
# organize by atoms
353388
atoms = {}
389+
# key in the dictionary are unique identifiers of the same residue
354390
for line in all_lines:
391+
res_number = int(line[22:26])
392+
res_name = line[17:20].strip()
393+
atom_name = line[12:16]
355394
atom_number = int(line[6:11])
356-
atom = line[12:16]
357-
alist = atoms.setdefault((atom_number, atom), [])
358-
alist.append(line)
359-
360-
sorted_atoms = sorted(list(atoms.items()), key=lambda x: x[0][0])
361-
362-
A = {
363-
'ATOM': 1,
364-
'HETA': 1,
365-
'ANIS': 0,
366-
}
367-
368-
for atom, lines in sorted_atoms:
369-
lines.sort(key=lambda x: (A[x[:4]], float(x[54:60])), reverse=True)
370-
yield lines[0][:16] + ' ' + lines[0][17:]
371-
if lines[1:] and lines[1].startswith('ANISOU'):
372-
yield lines[1][:16] + ' ' + lines[1][17:]
373-
374-
altloc_lines.clear()
375-
res_per_loc.clear()
395+
chain_id = line[21]
396+
key = (res_number, res_name, atom_name, chain_id)
397+
# the atom number is saved so that the original order can be kept
398+
alist = atoms.setdefault(key, (atom_number, []))
399+
alist[1].append(line)
400+
401+
# entries at this point are not sorted. Sorts entries by residue
402+
# number followed by atom number
403+
sorted_atoms = sorted(list(atoms.items()), key=lambda x: (x[0][0], x[1][0]))
404+
return sorted_atoms
376405

377406

378407
def all_same_residue(altloc_lines):

tests/data/anisou.pdb

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
ATOM 1 N ALA A 31 -12.806 6.423 23.735 1.00 70.81 N
2+
ANISOU 1 N ALA A 31 7836 7867 11203 2151 -675 -66 N
3+
ATOM 2 CA ALA A 31 -13.433 7.788 23.746 1.00 65.66 C
4+
ANISOU 2 CA ALA A 31 7296 7485 10167 1829 -642 -81 C
5+
ATOM 3 C ALA A 31 -12.448 8.891 24.124 1.00 63.44 C
6+
ANISOU 3 C ALA A 31 6818 7632 9654 1744 -656 20 C
7+
ATOM 4 O ALA A 31 -11.549 8.680 24.937 1.00 65.52 O
8+
ANISOU 4 O ALA A 31 6891 8010 9994 1853 -759 238 O
9+
ATOM 5 CB ALA A 31 -14.628 7.834 24.691 1.00 66.76 C
10+
ANISOU 5 CB ALA A 31 7636 7487 10242 1630 -748 143 C
11+
ATOM 6 N THR A 32 -12.659 10.075 23.550 1.00 58.59 N
12+
ANISOU 6 N THR A 32 6249 7244 8768 1532 -569 -122 N
13+
ATOM 7 CA THR A 32 -11.806 11.232 23.788 1.00 55.42 C
14+
ANISOU 7 CA THR A 32 5678 7217 8161 1398 -585 -53 C
15+
ATOM 8 C THR A 32 -12.119 11.857 25.148 1.00 52.59 C
16+
ANISOU 8 C THR A 32 5384 6947 7651 1206 -743 183 C
17+
ATOM 9 O THR A 32 -13.166 11.585 25.770 1.00 49.84 O
18+
ANISOU 9 O THR A 32 5232 6412 7293 1142 -803 278 O
19+
ATOM 10 CB THR A 32 -11.965 12.321 22.696 1.00 51.87 C
20+
ANISOU 10 CB THR A 32 5260 6948 7500 1220 -449 -254 C
21+
ATOM 11 OG1 THR A 32 -13.294 12.844 22.720 1.00 48.11 O
22+
ANISOU 11 OG1 THR A 32 5047 6344 6890 1015 -452 -282 O
23+
ATOM 12 CG2 THR A 32 -11.660 11.781 21.303 1.00 54.23 C
24+
ANISOU 12 CG2 THR A 32 5483 7248 7875 1398 -280 -508 C

tests/data/anisou_altloc.pdb

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
ATOM 1 N AALA A 31 -12.806 6.423 23.735 0.40 70.81 N
2+
ANISOU 1 N AALA A 31 7836 7867 11203 2151 -675 -66 N
3+
ATOM 1 N BALA A 31 -12.806 6.423 23.735 0.60 70.81 N
4+
ANISOU 1 N BALA A 31 7836 7867 11203 2151 -675 -66 N
5+
ATOM 2 CA AALA A 31 -13.433 7.788 23.746 0.60 65.66 C
6+
ANISOU 2 CA AALA A 31 7296 7485 10167 1829 -642 -81 C
7+
ATOM 2 CA BALA A 31 -13.433 7.788 23.746 0.40 65.66 C
8+
ANISOU 2 CA BALA A 31 7296 7485 10167 1829 -642 -81 C
9+
ATOM 3 C ALA A 31 -12.448 8.891 24.124 1.00 63.44 C
10+
ANISOU 3 C ALA A 31 6818 7632 9654 1744 -656 20 C
11+
ATOM 4 O ALA A 31 -11.549 8.680 24.937 1.00 65.52 O
12+
ANISOU 4 O ALA A 31 6891 8010 9994 1853 -759 238 O
13+
ATOM 5 CB ALA A 31 -14.628 7.834 24.691 1.00 66.76 C
14+
ANISOU 5 CB ALA A 31 7636 7487 10242 1630 -748 143 C
15+
ATOM 6 N THR A 32 -12.659 10.075 23.550 1.00 58.59 N
16+
ANISOU 6 N THR A 32 6249 7244 8768 1532 -569 -122 N
17+
ATOM 7 CA THR A 32 -11.806 11.232 23.788 1.00 55.42 C
18+
ANISOU 7 CA THR A 32 5678 7217 8161 1398 -585 -53 C
19+
ATOM 8 C THR A 32 -12.119 11.857 25.148 1.00 52.59 C
20+
ANISOU 8 C THR A 32 5384 6947 7651 1206 -743 183 C
21+
ATOM 9 O THR A 32 -13.166 11.585 25.770 1.00 49.84 O
22+
ANISOU 9 O THR A 32 5232 6412 7293 1142 -803 278 O
23+
ATOM 10 CB ATHR A 32 -11.965 12.321 22.696 0.45 51.87 C
24+
ANISOU 10 CB ATHR A 32 5260 6948 7500 1220 -449 -254 C
25+
ATOM 10 CB BTHR A 32 -11.965 12.321 22.696 0.55 51.87 C
26+
ANISOU 10 CB BTHR A 32 5260 6948 7500 1220 -449 -254 C
27+
ATOM 11 OG1 THR A 32 -13.294 12.844 22.720 1.00 48.11 O
28+
ANISOU 11 OG1 THR A 32 5047 6344 6890 1015 -452 -282 O
29+
ATOM 12 CG2 THR A 32 -11.660 11.781 21.303 1.00 54.23 C
30+
ANISOU 12 CG2 THR A 32 5483 7248 7875 1398 -280 -508 C

tests/data/anisou_missing.pdb

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
ATOM 1 N ALA A 31 -12.806 6.423 23.735 1.00 70.81 N
2+
ANISOU 1 N ALA A 31 7836 7867 11203 2151 -675 -66 N
3+
ATOM 2 CA ALA A 31 -13.433 7.788 23.746 1.00 65.66 C
4+
ANISOU 2 CA ALA A 31 7296 7485 10167 1829 -642 -81 C
5+
ATOM 3 C ALA A 31 -12.448 8.891 24.124 1.00 63.44 C
6+
ATOM 4 O ALA A 31 -11.549 8.680 24.937 1.00 65.52 O
7+
ANISOU 4 O ALA A 31 6891 8010 9994 1853 -759 238 O

0 commit comments

Comments
 (0)