Skip to content

Commit

Permalink
Fix FASTA parsing with Windows newlines
Browse files Browse the repository at this point in the history
When iterating over the lines of a file, Python includes the trailing `\n`, so
lines with Windows newlines end with `\r\n`. Regex considers `$` to mean the end of the line,
not the end of the string. Vamb's header regex check did not take the trailing
newline into account, but this happened not to matter since
* Only the identifier was used in Vamb's internals, so the newline in the
  description was not noticed, and
* It still passed the regex header check, as the regex matching stopped at the
  `\n` character.
However, `\r\n` threw a wrench in all that.
Fix it by simply removing the trailing `\r?\n` in the FASTA parser.
  • Loading branch information
jakobnissen committed Jul 24, 2023
1 parent 79919d8 commit 6d02489
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
23 changes: 23 additions & 0 deletions test/test_parsecontigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,26 @@ def test_save_load(self):
self.assertTrue(np.all(md1.lengths == md2.lengths))
self.assertTrue(np.all(md1.refhash == md2.refhash))
self.assertTrue(np.all(md1.minlength == md2.minlength))

def test_windows_newlines(self):
    """Parse identical FASTA records written with CRLF and LF newlines.

    Ten random records are written to two in-memory buffers that differ
    only in their line terminator (`\\r\\n` versus `\\n`). The compositions
    built from the two buffers must have the same metadata refhash and
    element-wise equal matrices.
    """
    rng = random.Random()
    crlf_buf = io.BytesIO()
    lf_buf = io.BytesIO()
    # Each target pairs a buffer with the newline style it is written in.
    targets = ((crlf_buf, b"\r\n"), (lf_buf, b"\n"))

    for _ in range(10):
        # One random record per iteration, mirrored into both buffers.
        record = testtools.make_randseq(rng, 10, 20)
        header_line = b">" + record.header.encode()
        for buf, newline in targets:
            buf.write(header_line)
            buf.write(newline)
            buf.write(record.sequence)
            buf.write(newline)

    # Rewind so that parsing starts from the first record.
    crlf_buf.seek(0)
    lf_buf.seek(0)
    comp_crlf = Composition.from_file(crlf_buf)
    comp_lf = Composition.from_file(lf_buf)

    self.assertEqual(comp_crlf.metadata.refhash, comp_lf.metadata.refhash)
    self.assertTrue(np.all(comp_crlf.matrix == comp_lf.matrix))
5 changes: 3 additions & 2 deletions vamb/vambtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,8 @@ def byte_iterfasta(
)
raise TypeError(errormsg) from None

header = probeline[1:]
# 13 is the byte value of \r, meaning we remove either \r\n or \n
header = probeline[1 : -(1 + (probeline[-2] == 13))]
buffer: list[bytes] = list()

# Iterate over lines
Expand All @@ -320,7 +321,7 @@ def byte_iterfasta(
elif line.startswith(b">"):
yield FastaEntry(header, bytearray().join(buffer))
buffer.clear()
header = line[1:]
header = line[1 : -(1 + (line[-2] == 13))]

else:
buffer.append(line)
Expand Down

0 comments on commit 6d02489

Please sign in to comment.