Add a new marker to check for memory leaks

Users have indicated that it will be very useful if the plugin exposes a way to detect memory leaks in tests. This is possible, but is a bit tricky as the interpreter can allocate memory for internal caches, as well as user functions. To make this more reliable, the new marker will take two parameters: * The limit of memory per location to consider an allocation. If the memory leaked by any allocation location in the test is higher than this value, the test will fail. * An optional callable function that can be used to filter out locations. This will allow users to remove false positives. Signed-off-by: Pablo Galindo <[email protected]>
bloomberg · Aug 16, 2023 · e261803 · e261803
1 parent b25d4b8
commit e261803
Show file tree

Hide file tree

Showing 4 changed files with 324 additions and 22 deletions.
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -40,6 +40,9 @@ validations on tests when this plugin is enabled.
 ``limit_memory``
 ----------------
 
+.. py:function:: limit_memory(memory_limit: str)
+
+    Fail the execution of the test if the test allocates more memory than allowed.
+
 When this marker is applied to a test, it will cause the test to fail if the execution
 of the test allocates more memory than allowed. It takes a single argument with a
 string indicating the maximum memory that the test can allocate.
@@ -61,3 +64,63 @@ Example of usage:
     @pytest.mark.limit_memory("24 MB")
     def test_foobar():
         pass  # do some stuff that allocates memory
+
+
+``limit_leaks``
+---------------
+
+.. py:function:: limit_leaks(location_limit: str, *, filter_fn: Callable[[Iterable['memray.PythonStackElement']], bool] | None = None)
+
+    Fail the execution of the test if any location in the test leaks more memory than allowed.
+
+.. important::
+    To detect leaks, Memray needs to intercept calls to the Python allocators and use native
+    traces. This adds significant overhead, and will slow your test down.
+
+When this marker is applied to a test, it will cause the test to fail if any allocation location in
+the execution of the test leaks more memory than allowed. It takes a single positional argument with a
+string indicating the maximum memory **per allocation location** that the test is allowed to leak.
+
+Leaks are defined as memory that is allocated **in the marked test** that is not freed before leaving the test body.
+
+.. important::
+    It's recommended to run your API or code in a loop when utilizing this plugin. This practice helps in distinguishing
+    genuine leaks from the "noise" generated by internal caches and other incidental allocations.
+
+The format for the string is ``<NUMBER> ([KMGTP]B|B)``. The marker will raise
+``ValueError`` if the string format cannot be parsed correctly.
+
+The marker also takes an optional keyword-only argument ``filter_fn``. This argument represents a filtering
+function that will be called with the traceback of every allocation location whose cumulative leaked memory
+reaches the provided limit. The function must return *True* if the allocation must be taken into account
+and *False* otherwise. This function can be used to discard some false positives detected by the marker.
+
+.. tip::
+
+   You can pass the ``--memray-bin-path`` argument to ``pytest`` to specify
+   a directory where Memray will store the binary files with the results. You
+   can then use the ``memray`` CLI to further investigate the allocations and the
+   leaks using any Memray reporters you'd like. Check `the memray docs
+   <https://bloomberg.github.io/memray/getting_started.html>`_ for more
+   information.
+
+Example of usage:
+
+.. code-block:: python
+
+    @pytest.mark.limit_leaks("1 MB")
+    def test_foobar():
+        # Run the function to test in a loop to ensure
+        # we can differentiate leaks from memory allocated
+        # in internal caches
+        for _ in range(100):
+            do_some_stuff()
+
+.. warning::
+   Is **very** challenging to write tests that do not "leak" memory in some way.
+   interpreter caches but there are some that cannot be correctly detected so
+   you may need to allow some small amount of leaked memory per location or use the
+   ``filtering_fn`` argument to filter out false positive leak reports caused by
+   objects that the interpreter plans to reuse later. These caches are
+   implementation details of the interpreter, so the amount of memory
+   allocated, the location of the allocation, and the allocator that was used
+   can all change from one Python version to another.
diff --git a/src/pytest_memray/marks.py b/src/pytest_memray/marks.py
@@ -1,10 +1,15 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Tuple
 from typing import cast
+from typing import Callable
+from typing import Iterable
+from typing import Optional
 
 from memray import AllocationRecord
+from memray import FileReader
 from pytest import Config
 
 from .utils import parse_memory_string
@@ -13,25 +18,23 @@
 
 PytestSection = Tuple[str, str]
 
+StackElement = Tuple[str, str, int]
+LeaksFilteringFunction = Callable[[Iterable[StackElement]], bool]
+
 
 @dataclass
 class _MemoryInfo:
-    """Type that holds all memray-related info for a failed test."""
+    """Type that holds memory-related info for a failed test."""
 
     max_memory: float
     total_allocated_memory: int
     allocations: list[AllocationRecord]
     num_stacks: int
     native_stacks: bool
 
-    @property
-    def section(self) -> PytestSection:
-        """Return a tuple in the format expected by section reporters."""
-        total_memory_str = sizeof_fmt(self.total_allocated_memory)
-        max_memory_str = sizeof_fmt(self.max_memory)
+    def _generate_section_text(self, limit_text: str, header_text: str) -> str:
         text_lines = [
-            f"Test is using {total_memory_str} out of limit of {max_memory_str}",
-            "List of allocations: ",
+            f"{header_text} {sizeof_fmt(self.total_allocated_memory)} out of limit of {sizeof_fmt(self.max_memory)}"
         ]
         for record in self.allocations:
             size = record.size
@@ -51,28 +54,93 @@ def section(self) -> PytestSection:
                 text_lines.append(f"{padding*2}{function}:{file}:{line}")
                 stacks_left -= 1
 
-        return "memray-max-memory", "\n".join(text_lines)
+        return "\n".join(text_lines)
+
+    @property
+    def section(self) -> PytestSection:
+        """Return a tuple in the format expected by section reporters."""
+        return (
+            "memray-max-memory",
+            self._generate_section_text("Test is using", "List of allocations:"),
+        )
 
     @property
     def long_repr(self) -> str:
         """Generate a longrepr user-facing error message."""
-        total_memory_str = sizeof_fmt(self.total_allocated_memory)
-        max_memory_str = sizeof_fmt(self.max_memory)
-        return f"Test was limited to {max_memory_str} but allocated {total_memory_str}"
+        return f"Test was limited to {sizeof_fmt(self.max_memory)} but allocated {sizeof_fmt(self.total_allocated_memory)}"
+
+
+@dataclass
+class _LeakedInfo(_MemoryInfo):
+    """Type that holds leaked memory-related info for a failed test."""
+
+    @property
+    def section(self) -> PytestSection:
+        """Return a tuple in the format expected by section reporters."""
+        return (
+            "memray-leaked-memory",
+            self._generate_section_text("Test leaked", "List of leaked allocations:"),
+        )
+
+    @property
+    def long_repr(self) -> str:
+        """Generate a longrepr user-facing error message."""
+        return (
+            f"Test was allowed to leak {sizeof_fmt(self.max_memory)} "
+            "per location but at least one location leaked more"
+        )
 
 
 def limit_memory(
-    limit: str, *, _allocations: list[AllocationRecord], _config: Config
+    limit: str, *, _result_file: Path, _config: Config
 ) -> _MemoryInfo | None:
     """Limit memory used by the test."""
+    # Read the high-watermark allocation records (threads merged) from the
+    # results file instead of receiving them from the caller.
+    reader = FileReader(_result_file)
+    func = reader.get_high_watermark_allocation_records
+    allocations: list[AllocationRecord] = list((func(merge_threads=True)))
     max_memory = parse_memory_string(limit)
-    total_allocated_memory = sum(record.size for record in _allocations)
+    total_allocated_memory = sum(record.size for record in allocations)
+    # Strictly below the limit means nothing to report; reaching the limit
+    # exactly already produces a result.
     if total_allocated_memory < max_memory:
         return None
     num_stacks: int = cast(int, value_or_ini(_config, "stacks"))
     native_stacks: bool = cast(bool, value_or_ini(_config, "native"))
     return _MemoryInfo(
-        max_memory, total_allocated_memory, _allocations, num_stacks, native_stacks
+        max_memory, total_allocated_memory, allocations, num_stacks, native_stacks
+    )
+
+
+def limit_leaks(
+    location_limit: str,
+    *,
+    filter_fn: Optional[LeaksFilteringFunction] = None,
+    _result_file: Path,
+    _config: Config,
+) -> _MemoryInfo | None:
+    reader = FileReader(_result_file)
+    func = reader.get_leaked_allocation_records
+    allocations: list[AllocationRecord] = list((func(merge_threads=True)))
+
+    memory_limit = parse_memory_string(location_limit)
+
+    leaked_allocations = list(
+        allocation
+        for allocation in allocations
+        if (
+            allocation.size >= memory_limit
+            and (filter_fn is None or filter_fn(allocation.hybrid_stack_trace()))
+        )
+    )
+    if not leaked_allocations:
+        return None
+    total_leaked_memory = sum(allocation.size for allocation in leaked_allocations)
+
+    num_stacks: int = cast(int, value_or_ini(_config, "stacks"))
+    return _LeakedInfo(
+        memory_limit,
+        total_leaked_memory,
+        leaked_allocations,
+        num_stacks,
+        native_stacks=True,
     )
 
 

diff --git a/src/pytest_memray/plugin.py b/src/pytest_memray/plugin.py
@@ -17,6 +17,7 @@
 from typing import List
 from typing import Tuple
 from typing import cast
+from typing import Protocol
 
 from _pytest.terminal import TerminalReporter
 from memray import AllocationRecord
@@ -34,12 +35,28 @@
 from pytest import hookimpl
 
 from .marks import limit_memory
+from .marks import _MemoryInfo
+from .marks import limit_leaks
 from .utils import WriteEnabledDirectoryAction
 from .utils import positive_int
 from .utils import sizeof_fmt
 from .utils import value_or_ini
 
-MARKERS = {"limit_memory": limit_memory}
+
+class PluginFn(Protocol):
+    def __call__(
+        *args: Any,
+        _result_file: Path,
+        _config: Config,
+        **kwargs: Any,
+    ) -> _MemoryInfo | None:
+        ...
+
+
+MARKERS = {
+    "limit_memory": limit_memory,
+    "limit_leaks": limit_leaks,
+}
 
 N_TOP_ALLOCS = 5
 N_HISTOGRAM_BINS = 5
@@ -134,6 +151,11 @@ def pytest_pyfunc_call(self, pyfuncitem: Function) -> object | None:
             yield
             return
 
+        if len(markers) > 1:
+            raise ValueError(
+                "Only one memray marker can be applied at the same time to the same test"
+            )
+
         def _build_bin_path() -> Path:
             if self._tmp_dir is None and not os.getenv("MEMRAY_RESULT_PATH"):
                 of_id = pyfuncitem.nodeid.replace("::", "-")
@@ -151,6 +173,9 @@ def _build_bin_path() -> Path:
             value_or_ini(self.config, "trace_python_allocators")
         )
 
+        if markers and "limit_leaks" in markers:
+            native = trace_python_allocators = True
+
         @functools.wraps(func)
         def wrapper(*args: Any, **kwargs: Any) -> object | None:
             test_result: object | Any = None
@@ -198,19 +223,17 @@ def pytest_runtest_makereport(
             return None
 
         for marker in item.iter_markers():
-            marker_fn = MARKERS.get(marker.name)
-            if not marker_fn:
+            maybe_marker_fn = MARKERS.get(marker.name)
+            if not maybe_marker_fn:
                 continue
+            marker_fn: PluginFn = cast(PluginFn, maybe_marker_fn)
             result = self.results.get(item.nodeid)
             if not result:
                 continue
-            reader = FileReader(result.result_file)
-            func = reader.get_high_watermark_allocation_records
-            allocations = list((func(merge_threads=True)))
             res = marker_fn(
                 *marker.args,
                 **marker.kwargs,
-                _allocations=allocations,
+                _result_file=result.result_file,
                 _config=self.config,
             )
             if res: