From 3222c42b2d3041bd4c0ed01bda66277b378c5656 Mon Sep 17 00:00:00 2001
From: Tomoya Iwata <iwata.tomoya@classmethod.jp>
Date: Mon, 30 Dec 2024 15:21:07 +0900
Subject: [PATCH 1/5] tests: parametrize benchmark tests

In the previous implementation, the timeout value had to be adjusted
every time a benchmark test was added.
By parametrizing the benchmark tests, the time required for each test
becomes predictable, eliminating the need to adjust the timeout value.

Signed-off-by: Tomoya Iwata <iwata.tomoya@classmethod.jp>
---
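
A minimal, self-contained sketch of the parametrization pattern this
patch introduces (the binary names below are made up, assuming pytest
evaluates get_executables() once at collection time):

    import pytest

    def get_executables():
        # Stand-in for the real helper, which runs `cargo bench --no-run
        # --message-format json` and collects each "executable" field.
        return ["bench_block_io", "bench_page_fault"]

    @pytest.mark.parametrize("executable", get_executables())
    def test_one_benchmark(executable):
        # pytest generates one test item per benchmark binary, so each
        # item's runtime stays bounded as new benchmarks are added.
        assert executable
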
 .../performance/test_benchmarks.py            | 135 +++++++++++-------
 1 file changed, 86 insertions(+), 49 deletions(-)

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 6e6541a688d..4e3e5b7228a 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -5,6 +5,7 @@
 import json
 import logging
 import platform
+import re
 import shutil
 from pathlib import Path
 
@@ -17,78 +18,114 @@
 LOGGER = logging.getLogger(__name__)
 
 
+def get_executables():
+    """
+    Get a list of binaries for benchmarking
+    """
+
+    # Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the
+    # usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we
+    # extract the path to the compiled benchmark binary.
+    _, stdout, _ = cargo(
+        "bench",
+        f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run",
+    )
+
+    executables = []
+    for line in stdout.split("\n"):
+        if line:
+            msg = json.loads(line)
+            executable = msg.get("executable")
+            if executable:
+                executables.append(executable)
+
+    return executables
+
+
 @pytest.mark.no_block_pr
-@pytest.mark.timeout(900)
-def test_no_regression_relative_to_target_branch():
+@pytest.mark.timeout(600)
+@pytest.mark.parametrize("executable", get_executables())
+def test_no_regression_relative_to_target_branch(executable):
     """
     Run the microbenchmarks in this repository, comparing results from pull
     request target branch against what's achieved on HEAD
     """
+    run_criterion = get_run_criterion(executable)
+    compare_results = get_compare_results(executable)
     git_ab_test(run_criterion, compare_results)
 
 
-def run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
+def get_run_criterion(executable):
     """
-    Executes all benchmarks by running "cargo bench --no-run", finding the executables, and running them pinned to some CPU
+    Get a function that executes the specified benchmarks, running them pinned to some CPU
     """
-    baseline_name = "a_baseline" if is_a else "b_baseline"
-
-    with contextlib.chdir(firecracker_checkout):
-        # Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the
-        # usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we
-        # extract the path to the compiled benchmark binary.
-        _, stdout, _ = cargo(
-            "bench",
-            f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run",
-        )
 
-        executables = []
-        for line in stdout.split("\n"):
-            if line:
-                msg = json.loads(line)
-                executable = msg.get("executable")
-                if executable:
-                    executables.append(executable)
+    def _run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
+        baseline_name = "a_baseline" if is_a else "b_baseline"
 
-        for executable in executables:
+        with contextlib.chdir(firecracker_checkout):
             utils.check_output(
                 f"CARGO_TARGET_DIR=build/cargo_target taskset -c 1 {executable} --bench --save-baseline {baseline_name}"
             )
 
-    return firecracker_checkout / "build" / "cargo_target" / "criterion"
+        return firecracker_checkout / "build" / "cargo_target" / "criterion"
+
+    return _run_criterion
+
 
+def get_compare_results(executable):
+    """
+    Get a function that compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main
+    """
 
-def compare_results(location_a_baselines: Path, location_b_baselines: Path):
-    """Compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main"""
-    for benchmark in location_b_baselines.glob("*"):
-        data = json.loads(
-            (benchmark / "b_baseline" / "estimates.json").read_text("utf-8")
+    def _compare_results(location_a_baselines: Path, location_b_baselines: Path):
+
+        list_result = utils.check_output(
+            f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --list"
         )
 
-        average_ns = data["mean"]["point_estimate"]
+        # Convert an entry like `page_fault #2: benchmark` into `page_fault_2`,
+        # because criterion creates a directory of that name under `cargo_target/criterion/`.
+        bench_marks = [
+            re.sub(r"\s#(?P<sub_id>[1-9]+)", r"_\g<sub_id>", i.split(":")[0])
+            for i in list_result.stdout.split("\n")
+            if i.endswith(": benchmark")
+        ]
+
+        for benchmark in bench_marks:
+            data = json.loads(
+                (
+                    location_b_baselines / benchmark / "b_baseline" / "estimates.json"
+                ).read_text("utf-8")
+            )
 
-        LOGGER.info("%s mean: %iµs", benchmark.name, average_ns / 1000)
+            average_ns = data["mean"]["point_estimate"]
 
-    # Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here
-    # to do the comparison
-    for benchmark in location_a_baselines.glob("*"):
-        shutil.copytree(
-            benchmark / "a_baseline",
-            location_b_baselines / benchmark.name / "a_baseline",
+            LOGGER.info("%s mean: %iµs", benchmark, average_ns / 1000)
+
+        # Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here
+        # to do the comparison
+
+        for benchmark in bench_marks:
+            shutil.copytree(
+                location_a_baselines / benchmark / "a_baseline",
+                location_b_baselines / benchmark / "a_baseline",
+            )
+
+        bench_result = utils.check_output(
+            f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --baseline a_baseline --load-baseline b_baseline",
+            True,
+            Path.cwd().parent,
         )
 
-    _, stdout, _ = cargo(
-        "bench",
-        f"--all --target {platform.machine()}-unknown-linux-musl",
-        "--baseline a_baseline --load-baseline b_baseline",
-    )
+        regressions_only = "\n\n".join(
+            result
+            for result in bench_result.stdout.split("\n\n")
+            if "Performance has regressed." in result
+        )
 
-    regressions_only = "\n\n".join(
-        result
-        for result in stdout.split("\n\n")
-        if "Performance has regressed." in result
-    )
+        # If this string is anywhere in stdout, then at least one of our benchmarks
+        # is now performing worse with the PR changes.
+        assert not regressions_only, "\n" + regressions_only
 
-    # If this string is anywhere in stdout, then at least one of our benchmarks
-    # is now performing worse with the PR changes.
-    assert not regressions_only, "\n" + regressions_only
+    return _compare_results

From b95900e0bca410722add733a8bb2370e3080ecbf Mon Sep 17 00:00:00 2001
From: Tomoya Iwata <iwata.tomoya@classmethod.jp>
Date: Sun, 12 Jan 2025 17:38:50 +0900
Subject: [PATCH 2/5] tests: use splitlines

use `splitlines()` instead of `split("\n")`.

Signed-off-by: Tomoya Iwata <iwata.tomoya@classmethod.jp>
---
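
For reference, splitlines() does not produce the trailing empty string
that split("\n") yields when the output ends with a newline:

    >>> "a\nb\n".split("\n")
    ['a', 'b', '']
    >>> "a\nb\n".splitlines()
    ['a', 'b']
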
 tests/integration_tests/performance/test_benchmarks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 4e3e5b7228a..1f43cd2ab79 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -32,7 +32,7 @@ def get_executables():
     )
 
     executables = []
-    for line in stdout.split("\n"):
+    for line in stdout.splitlines():
         if line:
             msg = json.loads(line)
             executable = msg.get("executable")

From 215af23e54aa51859c6a0fcd07e0625615026f84 Mon Sep 17 00:00:00 2001
From: Tomoya Iwata <iwata.tomoya@classmethod.jp>
Date: Sun, 12 Jan 2025 17:43:01 +0900
Subject: [PATCH 3/5] tests: delete specific timeout parameter for performance
 test

There is no longer a need to set an individual timeout value,
because the performance tests are now parametrized.

Signed-off-by: Tomoya Iwata <iwata.tomoya@classmethod.jp>
---
 tests/integration_tests/performance/test_benchmarks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 1f43cd2ab79..49e31df065b 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -43,7 +43,6 @@ def get_executables():
 
 
 @pytest.mark.no_block_pr
-@pytest.mark.timeout(600)
 @pytest.mark.parametrize("executable", get_executables())
 def test_no_regression_relative_to_target_branch(executable):
     """

From d42d39ff88c6fec1a42463633eb9afc5fe518043 Mon Sep 17 00:00:00 2001
From: Tomoya Iwata <iwata.tomoya@classmethod.jp>
Date: Sun, 19 Jan 2025 12:47:20 +0900
Subject: [PATCH 4/5] tests: build and run benchmark tests in each branch

In the previous implementation, the same binary built in the PR branch
was executed twice, which was not a correct A/B test.
This has been fixed.

Signed-off-by: Tomoya Iwata <iwata.tomoya@classmethod.jp>
---
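
A rough illustration of the name mapping done in _compare_results():
`cargo bench --list` reports duplicated bench targets as e.g.
`page_fault #2: benchmark`, while criterion stores their reports under
directory names such as `page_fault_2` (the helper name below is
hypothetical, the benchmark names illustrative):

    import re

    def target_to_criterion_dir(target: str) -> str:
        # "page_fault"    -> "page_fault"
        # "page_fault #2" -> "page_fault_2"
        return re.sub(r"\s#(?P<sub_id>[0-9]*)", r"_\g<sub_id>", target)

    assert target_to_criterion_dir("page_fault") == "page_fault"
    assert target_to_criterion_dir("page_fault #2") == "page_fault_2"
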
 .../performance/test_benchmarks.py            | 86 ++++++++-----------
 1 file changed, 37 insertions(+), 49 deletions(-)

diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 49e31df065b..6d3988c23d9 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 """Optional benchmarks-do-not-regress test"""
 import contextlib
-import json
 import logging
 import platform
 import re
 import shutil
 from pathlib import Path
+from typing import Callable, List
 
 import pytest
 
@@ -18,43 +18,40 @@
 LOGGER = logging.getLogger(__name__)
 
 
-def get_executables():
+def get_benchmark_names() -> List[str]:
     """
-    Get a list of binaries for benchmarking
+    Get a list of benchmark test names
     """
 
-    # Passing --message-format json to cargo tells it to print its log in a json format. At the end, instead of the
-    # usual "placed executable <...> at <...>" we'll get a json object with an 'executable' key, from which we
-    # extract the path to the compiled benchmark binary.
     _, stdout, _ = cargo(
         "bench",
-        f"--all --quiet --target {platform.machine()}-unknown-linux-musl --message-format json --no-run",
+        f"--workspace --quiet --target {platform.machine()}-unknown-linux-musl",
+        "--list",
     )
 
-    executables = []
-    for line in stdout.splitlines():
-        if line:
-            msg = json.loads(line)
-            executable = msg.get("executable")
-            if executable:
-                executables.append(executable)
+    # Convert an entry like `page_fault #2: benchmark` into the bench target name `page_fault`.
+    benchmark_names = [
+        re.sub(r"\s#([0-9]*)", "", i.split(":")[0])
+        for i in stdout.split("\n")
+        if i.endswith(": benchmark")
+    ]
 
-    return executables
+    return list(set(benchmark_names))
 
 
 @pytest.mark.no_block_pr
-@pytest.mark.parametrize("executable", get_executables())
-def test_no_regression_relative_to_target_branch(executable):
+@pytest.mark.parametrize("benchname", get_benchmark_names())
+def test_no_regression_relative_to_target_branch(benchname):
     """
     Run the microbenchmarks in this repository, comparing results from pull
     request target branch against what's achieved on HEAD
     """
-    run_criterion = get_run_criterion(executable)
-    compare_results = get_compare_results(executable)
+    run_criterion = get_run_criterion(benchname)
+    compare_results = get_compare_results(benchname)
     git_ab_test(run_criterion, compare_results)
 
 
-def get_run_criterion(executable):
+def get_run_criterion(benchmark_name) -> Callable[[Path, bool], Path]:
     """
     Get function that executes specified benchmarks, and running them pinned to some CPU
     """
@@ -64,7 +61,7 @@ def _run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
 
         with contextlib.chdir(firecracker_checkout):
             utils.check_output(
-                f"CARGO_TARGET_DIR=build/cargo_target taskset -c 1 {executable} --bench --save-baseline {baseline_name}"
+                f"taskset -c 1 cargo bench --workspace --quiet -- {benchmark_name} --exact --save-baseline {baseline_name}"
             )
 
         return firecracker_checkout / "build" / "cargo_target" / "criterion"
@@ -72,54 +69,45 @@ def _run_criterion(firecracker_checkout: Path, is_a: bool) -> Path:
     return _run_criterion
 
 
-def get_compare_results(executable):
+def get_compare_results(benchmark_name) -> Callable[[Path, Path], None]:
     """
     Get a function that compares the two recorded criterion baselines for regressions, assuming that "A" is the baseline from main
     """
 
     def _compare_results(location_a_baselines: Path, location_b_baselines: Path):
 
-        list_result = utils.check_output(
-            f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --list"
+        _, stdout, _ = cargo(
+            "bench",
+            f"--workspace --target {platform.machine()}-unknown-linux-musl --quiet",
+            f"--exact {benchmark_name} --list",
         )
 
         # Convert an entry like `page_fault #2: benchmark` into `page_fault_2`,
         # because criterion creates a directory of that name under `cargo_target/criterion/`.
-        bench_marks = [
-            re.sub(r"\s#(?P<sub_id>[1-9]+)", r"_\g<sub_id>", i.split(":")[0])
-            for i in list_result.stdout.split("\n")
+        bench_mark_targets = [
+            re.sub(r"\s#(?P<sub_id>[0-9]*)", r"_\g<sub_id>", i.split(":")[0])
+            for i in stdout.split("\n")
             if i.endswith(": benchmark")
         ]
 
-        for benchmark in bench_marks:
-            data = json.loads(
-                (
-                    location_b_baselines / benchmark / "b_baseline" / "estimates.json"
-                ).read_text("utf-8")
-            )
-
-            average_ns = data["mean"]["point_estimate"]
-
-            LOGGER.info("%s mean: %iµs", benchmark, average_ns / 1000)
-
-        # Assumption: location_b_baseline = cargo_target of current working directory. So just copy the a_baselines here
-        # to do the comparison
-
-        for benchmark in bench_marks:
+        # If a benchmark test has multiple targets, the results of a single benchmark test are written to multiple directories,
+        # for example `page_fault` and `page_fault_2`.
+        # We need to copy the benchmark results into each of these directories.
+        for bench_mark_target in bench_mark_targets:
             shutil.copytree(
-                location_a_baselines / benchmark / "a_baseline",
-                location_b_baselines / benchmark / "a_baseline",
+                location_a_baselines / bench_mark_target / "a_baseline",
+                location_b_baselines / bench_mark_target / "a_baseline",
             )
 
-        bench_result = utils.check_output(
-            f"CARGO_TARGET_DIR=build/cargo_target {executable} --bench --baseline a_baseline --load-baseline b_baseline",
-            True,
-            Path.cwd().parent,
+        _, stdout, _ = cargo(
+            "bench",
+            f"--workspace --target {platform.machine()}-unknown-linux-musl",
+            f"{benchmark_name} --baseline a_baseline --load-baseline b_baseline",
         )
 
         regressions_only = "\n\n".join(
             result
-            for result in bench_result.stdout.split("\n\n")
+            for result in stdout.split("\n\n")
             if "Performance has regressed." in result
         )
 

From 033ca8f4fcaea4e4e9f44e907542f6ed101e4b0a Mon Sep 17 00:00:00 2001
From: Tomoya Iwata <iwata.tomoya@classmethod.jp>
Date: Sun, 19 Jan 2025 22:19:38 +0900
Subject: [PATCH 5/5] tests: optimize a/b test fixture

In the previous implementation, git clone was executed
for each parameter of the parametrized test.
This has a large overhead, so the fixture is adjusted to
be called only once per class.

Signed-off-by: Tomoya Iwata <iwata.tomoya@classmethod.jp>
---
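
A minimal sketch of the class-scoped fixture pattern applied here
(names and paths below are illustrative; in the patch, the generator
lives in framework/ab_test.py, while here everything is inlined):
pytest.fixture() can wrap an existing generator function, and scoping
it to the class makes the expensive setup run once for all
parametrized cases:

    import pytest

    def make_ab_dirs():
        # Stand-in for git_clone_ab_dirs, which clones the repository twice.
        yield ("/tmp/dir_a", "/tmp/dir_b")

    # Evaluated once per test class rather than once per parameter.
    ab_dirs_one_time = pytest.fixture(make_ab_dirs, scope="class")

    class TestExample:
        @pytest.mark.parametrize("n", [1, 2, 3])
        def test_uses_dirs(self, n, ab_dirs_one_time):
            dir_a, dir_b = ab_dirs_one_time
            assert dir_a != dir_b
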
 tests/framework/ab_test.py                    | 30 ++++++++++++++--
 .../performance/test_benchmarks.py            | 36 ++++++++++++++-----
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/tests/framework/ab_test.py b/tests/framework/ab_test.py
index 2ef3e2350a7..65773c6f693 100644
--- a/tests/framework/ab_test.py
+++ b/tests/framework/ab_test.py
@@ -22,9 +22,9 @@
 does not block PRs). If not, it fails, preventing PRs from introducing new vulnerable dependencies.
 """
 import statistics
-from pathlib import Path
+from pathlib import Path, PosixPath
 from tempfile import TemporaryDirectory
-from typing import Callable, List, Optional, TypeVar
+from typing import Callable, Generator, List, Optional, Tuple, TypeVar
 
 import scipy
 
@@ -98,6 +98,32 @@ def git_ab_test(
         return result_a, result_b, comparison
 
 
+def git_clone_ab_dirs(
+    a_revision: str = DEFAULT_A_REVISION,
+    b_revision: Optional[str] = None,
+) -> Generator[Tuple[PosixPath, PosixPath], None, None]:
+    """
+    Prepare cloned Git repository to run A/B tests.
+
+    :param a_revision: The revision to checkout for the "A" part of the test. Defaults to the pull request target branch
+                       if run in CI, and "main" otherwise.
+    :param b_revision: The git revision to check out for "B" part of the test. Defaults to whatever is currently checked
+                       out (in which case no temporary directory will be created).
+    """
+
+    with TemporaryDirectory() as tmp_dir:
+        dir_a = git_clone(Path(tmp_dir) / a_revision, a_revision)
+
+        if b_revision:
+            dir_b = git_clone(Path(tmp_dir) / b_revision, b_revision)
+        else:
+            # By default, pytest execution happens inside the `tests` subdirectory. Pass the repository root, as
+            # documented.
+            dir_b = Path.cwd().parent
+
+        yield (dir_a, dir_b)
+
+
 DEFAULT_A_DIRECTORY = FC_WORKSPACE_DIR / "build" / "main"
 DEFAULT_B_DIRECTORY = FC_WORKSPACE_DIR / "build" / "cargo_target" / DEFAULT_TARGET_DIR
 
diff --git a/tests/integration_tests/performance/test_benchmarks.py b/tests/integration_tests/performance/test_benchmarks.py
index 6d3988c23d9..81afd37a0fb 100644
--- a/tests/integration_tests/performance/test_benchmarks.py
+++ b/tests/integration_tests/performance/test_benchmarks.py
@@ -12,10 +12,11 @@
 import pytest
 
 from framework import utils
-from framework.ab_test import git_ab_test
+from framework.ab_test import binary_ab_test, git_clone_ab_dirs
 from host_tools.cargo_build import cargo
 
 LOGGER = logging.getLogger(__name__)
+git_clone_ab_dirs_one_time = pytest.fixture(git_clone_ab_dirs, scope="class")
 
 
 def get_benchmark_names() -> List[str]:
@@ -39,16 +40,33 @@ def get_benchmark_names() -> List[str]:
     return list(set(benchmark_names))
 
 
-@pytest.mark.no_block_pr
-@pytest.mark.parametrize("benchname", get_benchmark_names())
-def test_no_regression_relative_to_target_branch(benchname):
+class TestBenchMarks:
     """
-    Run the microbenchmarks in this repository, comparing results from pull
-    request target branch against what's achieved on HEAD
+    This class is used to prevent fixtures from being executed once for each
+    parameter of a parametrized test.
     """
-    run_criterion = get_run_criterion(benchname)
-    compare_results = get_compare_results(benchname)
-    git_ab_test(run_criterion, compare_results)
+
+    @pytest.mark.no_block_pr
+    @pytest.mark.parametrize("benchname", get_benchmark_names())
+    def test_no_regression_relative_to_target_branch(
+        self, benchname, git_clone_ab_dirs_one_time
+    ):
+        """
+        Run the microbenchmarks in this repository, comparing results from pull
+        request target branch against what's achieved on HEAD
+        """
+
+        dir_a = git_clone_ab_dirs_one_time[0]
+        dir_b = git_clone_ab_dirs_one_time[1]
+        run_criterion = get_run_criterion(benchname)
+        compare_results = get_compare_results(benchname)
+
+        binary_ab_test(
+            test_runner=run_criterion,
+            comparator=compare_results,
+            a_directory=dir_a,
+            b_directory=dir_b,
+        )
 
 
 def get_run_criterion(benchmark_name) -> Callable[[Path, bool], Path]: