diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index d517bf7b2f..93a8e1ab23 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -33,74 +33,6 @@ def load_sequences():
     return sequences
 
 
-class TimeMinHashSuite:
-    def setup(self):
-        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
-        self.protein_mh = MinHash(
-            MINHASH_NUM, MINHASH_K, is_protein=True, track_abundance=False
-        )
-        self.sequences = load_sequences()
-
-        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
-        for seq in self.sequences:
-            self.populated_mh.add_sequence(seq)
-
-    def time_add_sequence(self):
-        mh = self.mh
-        sequences = self.sequences
-        for seq in sequences:
-            mh.add_sequence(seq)
-
-    def time_add_protein(self):
-        mh = self.protein_mh
-        sequences = self.sequences
-        for seq in sequences:
-            mh.add_protein(seq)
-
-    def time_get_mins(self):
-        mh = self.populated_mh
-        for i in range(GET_MINS_RANGE):
-            mh.get_mins()
-
-    def time_add_hash(self):
-        mh = self.mh
-        for i in range(ADD_HASH_RANGE):
-            mh.add_hash(i)
-
-    def time_add_many(self):
-        mh = self.mh
-        mh.add_many(list(range(ADD_MANY_RANGE)))
-
-    def time_similarity(self):
-        mh = self.mh
-        other_mh = self.populated_mh
-        for i in range(SIMILARITY_TIMES):
-            mh.similarity(other_mh)
-
-    def time_count_common(self):
-        mh = self.mh
-        other_mh = self.populated_mh
-        for i in range(COUNT_COMMON_TIMES):
-            mh.count_common(other_mh)
-
-    def time_merge(self):
-        mh = self.mh
-        other_mh = self.populated_mh
-        for i in range(MERGE_TIMES):
-            mh.merge(other_mh)
-
-    def time_copy(self):
-        mh = self.populated_mh
-        for i in range(COPY_TIMES):
-            mh.__copy__()
-
-    def time_concat(self):
-        mh = self.mh
-        other_mh = self.populated_mh
-        for i in range(CONCAT_TIMES):
-            mh += other_mh
-
-
 class PeakmemMinHashSuite:
     def setup(self):
         self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
@@ -134,33 +66,6 @@ def peakmem_add_many(self):
 ####################
 
 
-class TimeMinAbundanceSuite(TimeMinHashSuite):
-    def setup(self):
-        TimeMinHashSuite.setup(self)
-        self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
-
-        self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
-        for seq in self.sequences:
-            self.populated_mh.add_sequence(seq)
-
-    def time_get_mins_abundance(self):
-        mh = self.populated_mh
-        for i in range(GET_MINS_RANGE):
-            mh.get_mins(with_abundance=True)
-
-    def time_set_abundances(self):
-        mh = self.mh
-        mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(SET_ABUNDANCES_RANGE):
-            mh.set_abundances(mins)
-
-    def time_set_abundances_noclear(self):
-        mh = self.mh
-        mins = self.populated_mh.get_mins(with_abundance=True)
-        for i in range(SET_ABUNDANCES_RANGE):
-            mh.set_abundances(mins, clear=False)
-
-
 class PeakmemMinAbundanceSuite(PeakmemMinHashSuite):
     def setup(self):
         PeakmemMinHashSuite.setup(self)
@@ -170,35 +75,6 @@ def setup(self):
 ####################
 
 
-class TimeZipStorageSuite:
-    def setup(self):
-        import zipfile
-
-        self.zipfile = NamedTemporaryFile()
-
-        with zipfile.ZipFile(
-            self.zipfile, mode="w", compression=zipfile.ZIP_STORED
-        ) as storage:
-            for i in range(ZIP_STORAGE_WRITE):
-                # just so we have lots of entries
-                storage.writestr(str(i), b"0")
-            # one big-ish entry
-            storage.writestr("sig1", b"9" * 1_000_000)
-
-    def time_load_from_zipstorage(self):
-        with ZipStorage(self.zipfile.name) as storage:
-            for i in range(ZIP_STORAGE_LOAD):
-                storage.load("sig1")
-
-    def time_load_small_from_zipstorage(self):
-        with ZipStorage(self.zipfile.name) as storage:
-            for i in range(ZIP_STORAGE_LOAD):
-                storage.load("99999")
-
-    def teardown(self):
-        self.zipfile.close()
-
-
 class PeakmemZipStorageSuite:
     def setup(self):
         import zipfile
diff --git a/pyproject.toml b/pyproject.toml
index ccd826f794..b6a7a27eb3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,6 +103,7 @@ test = [
   "pytest>=6.2.4,<8.3.0",
   "pytest-cov>=4,<6.0",
   "pytest-xdist>=3.1",
+  "pytest-benchmark>=4.0",
   "pyyaml>=6,<7",
   "diff-cover>=7.3",
   "covdefaults>=2.2.2",
diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py
new file mode 100644
index 0000000000..cc15efc5a9
--- /dev/null
+++ b/tests/test_benchmarks.py
@@ -0,0 +1,217 @@
+"""Micro-benchmarks for MinHash and ZipStorage, run with pytest-benchmark."""
+
+import random
+from tempfile import NamedTemporaryFile
+
+import pytest
+
+from sourmash.sbt_storage import ZipStorage
+from sourmash.minhash import MinHash
+
+RANDOM_SEQ_SIZE = 3000
+RANDOM_SEQ_NUMBER = 300
+
+MINHASH_NUM = 500
+MINHASH_K = 21
+
+GET_MINS_RANGE = 500
+ADD_HASH_RANGE = 10_000
+ADD_MANY_RANGE = 1000
+SIMILARITY_TIMES = 500
+COUNT_COMMON_TIMES = 500
+MERGE_TIMES = 500
+COPY_TIMES = 500
+CONCAT_TIMES = 500
+SET_ABUNDANCES_RANGE = 500
+ZIP_STORAGE_WRITE = 100_000
+ZIP_STORAGE_LOAD = 20
+
+
+def load_sequences():
+    """Return ten random A/C/G/T strings of RANDOM_SEQ_NUMBER characters."""
+    sequences = []
+    for _ in range(10):
+        random_seq = random.sample(
+            "A,C,G,T".split(",") * RANDOM_SEQ_SIZE, RANDOM_SEQ_NUMBER
+        )
+        sequences.append("".join(random_seq))
+    return sequences
+
+
+@pytest.fixture
+def mh():
+    """Empty MinHash without abundance tracking."""
+    return MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
+
+
+@pytest.fixture
+def mh_protein():
+    """Empty protein MinHash without abundance tracking."""
+    return MinHash(
+        MINHASH_NUM, MINHASH_K, is_protein=True, track_abundance=False
+    )
+
+
+@pytest.fixture
+def sequences():
+    """Freshly generated random sequences for each test."""
+    return load_sequences()
+
+
+@pytest.fixture
+def populated_mh(sequences):
+    """MinHash (no abundance) with every fixture sequence added."""
+    populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False)
+    for seq in sequences:
+        populated_mh.add_sequence(seq)
+    return populated_mh
+
+
+def test_add_sequence(benchmark, mh, sequences):
+    @benchmark
+    def bench():
+        for seq in sequences:
+            mh.add_sequence(seq)
+
+
+def test_add_protein(benchmark, mh_protein, sequences):
+    @benchmark
+    def bench():
+        for seq in sequences:
+            mh_protein.add_protein(seq)
+
+
+def test_get_mins(benchmark, populated_mh):
+    @benchmark
+    def bench():
+        for _ in range(GET_MINS_RANGE):
+            populated_mh.get_mins()
+
+
+def test_add_hash(benchmark, mh):
+    @benchmark
+    def bench():
+        for i in range(ADD_HASH_RANGE):
+            mh.add_hash(i)
+
+
+def test_add_many(benchmark, mh):
+    benchmark(mh.add_many, list(range(ADD_MANY_RANGE)))
+
+
+def test_similarity(benchmark, mh, populated_mh):
+    @benchmark
+    def bench():
+        for _ in range(SIMILARITY_TIMES):
+            mh.similarity(populated_mh)
+
+
+def test_count_common(benchmark, mh, populated_mh):
+    @benchmark
+    def bench():
+        for _ in range(COUNT_COMMON_TIMES):
+            mh.count_common(populated_mh)
+
+
+def test_merge(benchmark, mh, populated_mh):
+    @benchmark
+    def bench():
+        for _ in range(MERGE_TIMES):
+            mh.merge(populated_mh)
+
+
+def test_copy(benchmark, populated_mh):
+    @benchmark
+    def bench():
+        for _ in range(COPY_TIMES):
+            populated_mh.__copy__()
+
+
+def test_concat(benchmark, mh, populated_mh):
+    @benchmark
+    def bench():
+        nonlocal mh
+        for _ in range(CONCAT_TIMES):
+            mh += populated_mh
+
+
+####################
+
+
+@pytest.fixture
+def mh_abund():
+    """Empty MinHash with abundance tracking enabled."""
+    return MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
+
+
+@pytest.fixture
+def populated_mh_abund(sequences):
+    """Abundance-tracking MinHash with every fixture sequence added."""
+    populated = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True)
+    for seq in sequences:
+        populated.add_sequence(seq)
+    return populated
+
+
+def test_get_mins_abundance(benchmark, populated_mh_abund):
+    @benchmark
+    def bench():
+        for _ in range(GET_MINS_RANGE):
+            populated_mh_abund.get_mins(with_abundance=True)
+
+
+def test_set_abundances(benchmark, mh_abund, populated_mh_abund):
+    @benchmark
+    def bench():
+        # get_mins() stays in the timed body, matching the asv benchmark.
+        mins = populated_mh_abund.get_mins(with_abundance=True)
+        for _ in range(SET_ABUNDANCES_RANGE):
+            mh_abund.set_abundances(mins)
+
+
+def test_set_abundances_noclear(benchmark, mh_abund, populated_mh_abund):
+    @benchmark
+    def bench():
+        mins = populated_mh_abund.get_mins(with_abundance=True)
+        for _ in range(SET_ABUNDANCES_RANGE):
+            mh_abund.set_abundances(mins, clear=False)
+
+
+####################
+
+
+@pytest.fixture
+def zipstore():
+    """Yield a temporary zip file with many tiny entries and one large one."""
+    import zipfile
+
+    zf = NamedTemporaryFile()
+
+    with zipfile.ZipFile(
+        zf, mode="w", compression=zipfile.ZIP_STORED
+    ) as storage:
+        for i in range(ZIP_STORAGE_WRITE):
+            # just so we have lots of entries
+            storage.writestr(str(i), b"0")
+        # one big-ish entry
+        storage.writestr("sig1", b"9" * 1_000_000)
+
+    yield zf
+
+    zf.close()
+
+
+def test_load_from_zipstorage(benchmark, zipstore):
+    @benchmark
+    def bench():
+        with ZipStorage(zipstore.name) as storage:
+            for _ in range(ZIP_STORAGE_LOAD):
+                storage.load("sig1")
+
+
+def test_load_small_from_zipstorage(benchmark, zipstore):
+    @benchmark
+    def bench():
+        with ZipStorage(zipstore.name) as storage:
+            for _ in range(ZIP_STORAGE_LOAD):
+                storage.load("99999")
diff --git a/tox.ini b/tox.ini
index ecf66a2bcd..c1237267da 100644
--- a/tox.ini
+++ b/tox.ini
@@ -111,6 +111,34 @@ commands =
     asv machine --yes
     asv continuous latest HEAD {posargs}
 
+[testenv:benchmarks]
+description = run pytest-benchmark for benchmarking
+changedir = {toxinidir}
+commands =
+    pytest \
+      --cov "{envsitepackagesdir}/sourmash" \
+      --cov-config "{toxinidir}/tox.ini" \
+      --cov-report= \
+      --junitxml {toxworkdir}/junit.benchmarks.xml \
+      --benchmark-only \
+      -n 0 \
+      {posargs:tests}
+
+[testenv:codspeed]
+description = run codspeed for benchmarking
+deps =
+    pytest-codspeed
+changedir = {toxinidir}
+commands =
+    pytest \
+      --cov "{envsitepackagesdir}/sourmash" \
+      --cov-config "{toxinidir}/tox.ini" \
+      --cov-report= \
+      --junitxml {toxworkdir}/junit.codspeed.xml \
+      --codspeed \
+      -k benchmarks \
+      {posargs:tests}
+
 [testenv:docs]
 description = invoke sphinx-build to build the HTML docs
 basepython = python3.10
@@ -180,7 +208,7 @@ commands =
     coverage xml -i -o {toxworkdir}/coverage.xml
     coverage html -i -d {toxworkdir}/htmlcov
     diff-cover --compare-branch {env:DIFF_AGAINST:origin/latest} {toxworkdir}/coverage.xml
-depends = py312, py311, py310, pypy3
+depends = py312, py311, py310, pypy3, codspeed
 pass_env = {[testenv]pass_env}
     DIFF_AGAINST
 set_env = COVERAGE_FILE={toxworkdir}/.coverage
@@ -233,7 +261,7 @@ source = src/sourmash/
 python =
     3.10: py310, docs, package_description, coverage
     3.11: py311, coverage
-    3.12: py312, coverage
+    3.12: py312, coverage, codspeed
 
 [flake8]
 max-complexity = 22