Skip to content

Commit

Permalink
Markers: Filter HMM hits
Browse files Browse the repository at this point in the history
  • Loading branch information
jakobnissen committed Aug 25, 2023
1 parent f071a5d commit 295873a
Showing 1 changed file with 9 additions and 11 deletions.
20 changes: 9 additions & 11 deletions vamb/parsemarkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ def process_chunk(
name_to_id: dict[MarkerName, MarkerID],
finder: pyrodigal.OrfFinder,
) -> list[tuple[ContigID, np.ndarray]]:
markers: defaultdict[ContigID, list[MarkerID]] = defaultdict(list)
alphabet = alphabet = pyhmmer.easel.Alphabet.amino()
markers: defaultdict[ContigID, set[MarkerID]] = defaultdict(set)
alphabet = pyhmmer.easel.Alphabet.amino()
digitized: list[pyhmmer.easel.DigitalSequence] = []
for record in chunk:
for gene in finder.find_genes(record.sequence):
Expand All @@ -198,18 +198,16 @@ def process_chunk(
).digitize(alphabet)
digitized.append(seq)

for top_hits in pyhmmer.hmmsearch(hmms, digitized):
marker_bytes = top_hits.query_name
# The query name should always be defined here. I don't know what could
# cause it to not be defined.
assert marker_bytes is not None
marker_name = MarkerName(marker_bytes.decode())
for (hmm, top_hits) in zip(hmms, pyhmmer.hmmsearch(hmms, digitized)):
marker_name = MarkerName(hmm.name.decode())
marker_id = name_to_id[marker_name]
score_cutoff = hmm.cutoffs.trusted1
assert score_cutoff is not None
for hit in top_hits:
# TODO: Criteria for removing spurious hits?
markers[ContigID(int(hit.name.decode()))].append(marker_id)
if hit.score >= score_cutoff:
markers[ContigID(int(hit.name.decode()))].add(marker_id)

return [(name, np.array(ids, dtype=np.uint32)) for (name, ids) in markers.items()]
return [(name, np.array(list(ids), dtype=np.uint32)) for (name, ids) in markers.items()]


# We avoid moving data between processes as much as possible, so each process
Expand Down

0 comments on commit 295873a

Please sign in to comment.