conftest.py

import random

import numpy as np
import pytest
import torch

from common_utils import (
    CUDA_NOT_AVAILABLE_MSG,
    IN_FBCODE,
    IN_OSS_CI,
    IN_RE_WORKER,
    MPS_NOT_AVAILABLE_MSG,
    OSS_CI_GPU_NO_CUDA_MSG,
)


def pytest_configure(config):
    # register additional markers (see pytest_collection_modifyitems)
    config.addinivalue_line("markers", "needs_cuda: mark for tests that rely on a CUDA device")
    config.addinivalue_line("markers", "needs_mps: mark for tests that rely on an MPS device")
    config.addinivalue_line("markers", "dont_collect: mark for tests that should not be collected")
    config.addinivalue_line("markers", "opcheck_only_one: only opcheck one parametrization")
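
# Illustrative usage sketch (hypothetical test module, not part of this file): a test can
# opt into one of the markers registered above, and pytest_collection_modifyitems below
# will then skip or deselect it on machines without the corresponding device:
#
#     @pytest.mark.needs_cuda
#     def test_runs_on_gpu():
#         assert torch.cuda.is_available()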


def pytest_collection_modifyitems(items):
    # This hook is called by pytest after it has collected the tests (google its name to check out its doc!)
    # We can ignore some tests as we see fit here, or add marks, such as a skip mark.
    #
    # Typically, here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the
    # tests that don't need CUDA, because those tests are extensively tested in the CPU CI instances already.
    # This is true for both OSS CI and the fbcode internal CI.
    # In the fbcode CI, we have an additional constraint: we try to avoid skipping tests. So instead of relying on
    # pytest.mark.skip, in fbcode we literally just remove those tests from the `items` list, and it's as if
    # these tests never existed.

    out_items = []
    for item in items:
        # The needs_cuda mark will exist if the test was explicitly decorated with
        # the @needs_cuda decorator. It will also exist if it was parametrized with a
        # parameter that has the mark: for example if a test is parametrized with
        # @pytest.mark.parametrize('device', cpu_and_cuda())
        # the "instances" of the tests where device == 'cuda' will have the 'needs_cuda' mark,
        # and the ones with device == 'cpu' won't have the mark.
        needs_cuda = item.get_closest_marker("needs_cuda") is not None
        needs_mps = item.get_closest_marker("needs_mps") is not None

        if needs_cuda and not torch.cuda.is_available():
            # In general, we skip cuda tests on machines without a GPU
            # There are special cases though, see below
            item.add_marker(pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG))
        if needs_mps and not torch.backends.mps.is_available():
            item.add_marker(pytest.mark.skip(reason=MPS_NOT_AVAILABLE_MSG))

        if IN_FBCODE:
            # fbcode doesn't like skipping tests, so instead we just don't collect the test
            # so that they don't even "exist", hence the continue statements.
            if not needs_cuda and IN_RE_WORKER:
                # The RE workers are the machines with GPU, we don't want them to run CPU-only tests.
                continue
            if needs_cuda and not torch.cuda.is_available():
                # On the test machines without a GPU, we want to ignore the tests that need cuda.
                # TODO: something more robust would be to do that only in a sandcastle instance,
                # so that we can still see the test being skipped when testing locally from a devvm
                continue
            if needs_mps and not torch.backends.mps.is_available():
                # Same as above, but for MPS
                continue
        elif IN_OSS_CI:
            # Here we're not in fbcode, so we can safely collect and skip tests.
            if not needs_cuda and torch.cuda.is_available():
                # Similar to what happens in RE workers: we don't need the OSS CI GPU machines
                # to run the CPU-only tests.
                item.add_marker(pytest.mark.skip(reason=OSS_CI_GPU_NO_CUDA_MSG))

        if item.get_closest_marker("dont_collect") is not None:
            # currently, this is only used for some tests we're sure we don't want to run on fbcode
            continue

        out_items.append(item)

    items[:] = out_items
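
# Illustrative sketch (hypothetical re-implementation; the real cpu_and_cuda() helper is
# defined elsewhere, not in this file): a parametrization helper can attach the needs_cuda
# mark to the "cuda" instances only by returning pytest.param values:
#
#     def cpu_and_cuda():
#         return ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda))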


def pytest_addoption(parser):
    parser.addoption("--debug-images", action="store_true", help="Enable debug mode for saving images.")
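
# Illustrative sketch (hypothetical fixture, not defined in this file): tests can read the
# flag registered above through pytest's config, e.g.:
#
#     @pytest.fixture
#     def debug_images(request):
#         return request.config.getoption("--debug-images")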


def pytest_sessionfinish(session, exitstatus):
    # This hook is called after all tests have run, and just before returning an exit status.
    # We here change exit code 5 into 0.
    #
    # 5 is issued when no tests were actually run, e.g. if you use `pytest -k some_regex_that_is_never_matched`.
    #
    # Having no tests run for a given test rule is a common scenario in fbcode, and typically happens on
    # the GPU test machines which don't run the CPU-only tests (see pytest_collection_modifyitems above). For
    # example `test_transforms.py` doesn't contain any CUDA test at the time of
    # writing, so on a GPU test machine, testpilot would invoke pytest on this file and no test would be run.
    # This would result in pytest returning 5, causing testpilot to raise an error.
    # To avoid this, we transform this 5 into a 0 to make testpilot happy.
    if exitstatus == 5:
        session.exitstatus = 0
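
# Illustrative note (assumes a pytest version that exposes pytest.ExitCode): exit code 5
# corresponds to pytest.ExitCode.NO_TESTS_COLLECTED, so an equivalent, more readable
# check would be:
#
#     if exitstatus == pytest.ExitCode.NO_TESTS_COLLECTED:
#         session.exitstatus = 0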


@pytest.fixture(autouse=True)
def prevent_leaking_rng():
    # Prevent each test from leaking the rng to all other tests when they call
    # torch.manual_seed() or random.seed() or np.random.seed().
    # Note: the numpy rngs should never leak anyway, as we never use
    # np.random.seed() and instead rely on np.random.RandomState instances (see
    # issue #4247). We still do it as an extra precaution.

    torch_rng_state = torch.get_rng_state()
    builtin_rng_state = random.getstate()
    numpy_rng_state = np.random.get_state()
    if torch.cuda.is_available():
        cuda_rng_state = torch.cuda.get_rng_state()

    yield

    torch.set_rng_state(torch_rng_state)
    random.setstate(builtin_rng_state)
    np.random.set_state(numpy_rng_state)
    if torch.cuda.is_available():
        torch.cuda.set_rng_state(cuda_rng_state)
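
# Illustrative sketch (hypothetical test, not part of this file): without the autouse
# fixture above, a test like the following would pin torch's global RNG seed for every
# test that runs after it in the same process:
#
#     def test_with_fixed_seed():
#         torch.manual_seed(0)
#         assert torch.rand(1).shape == (1,)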