From 0f68bf7801845e71485bcfa240da7af005b071ca Mon Sep 17 00:00:00 2001
From: xmhubj
Date: Wed, 4 Feb 2026 19:19:40 +0800
Subject: [PATCH 01/14] add labeler workflow

---
 .github/labeler.yml           | 35 +++++++++++++++++++++++++++++++++++
 .github/workflows/labeler.yml | 19 +++++++++++++++++++
 2 files changed, 54 insertions(+)
 create mode 100644 .github/labeler.yml
 create mode 100644 .github/workflows/labeler.yml

diff --git a/.github/labeler.yml b/.github/labeler.yml
new file mode 100644
index 0000000..4d58834
--- /dev/null
+++ b/.github/labeler.yml
@@ -0,0 +1,35 @@
+# PR Labeler configuration file
+# Automatically add labels based on the paths of modified files
+
+docs:
+  - changed-files:
+      - any-glob-to-any-file: '**/*.md'
+
+ci:
+  - changed-files:
+      - any-glob-to-any-file:
+          - '.github/**/*'
+          - '.pre-commit-config.yaml'
+
+tests:
+  - changed-files:
+      - any-glob-to-any-file: 'tests/**/*'
+
+core:
+  - changed-files:
+      - any-glob-to-any-file: 'vllm_fl/**/*'
+
+examples:
+  - changed-files:
+      - any-glob-to-any-file: 'examples/**/*'
+
+benchmarks:
+  - changed-files:
+      - any-glob-to-any-file: 'benchmarks/**/*'
+
+build:
+  - changed-files:
+      - any-glob-to-any-file:
+          - 'setup.py'
+          - 'requirements*.txt'
+          - 'pyproject.toml'
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 0000000..b8220f4
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,19 @@
+name: "Pull Request Labeler"
+
+on:
+  pull_request:
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  label:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Apply labels
+        uses: actions/labeler@v5
+        with:
+          repo-token: "${{ secrets.GITHUB_TOKEN }}"
+          sync-labels: true
\ No newline at end of file

From 9f4b7d55f32805585a9236d1653957d6d62c06d0 Mon Sep 17 00:00:00 2001
From: xmhubj
Date: Wed, 4 Feb 2026 19:48:05 +0800
Subject: [PATCH 02/14] add unit tests

---
 pyproject.toml                                |  58 ++++
 tests/unit_tests/__init__.py                  |   2 +
 tests/unit_tests/compilation/__init__.py      |   1 +
 tests/unit_tests/compilation/test_graph.py    |  76 +++++
 tests/unit_tests/conftest.py                  |  22 ++
 tests/unit_tests/dispatch/__init__.py         |   1 +
 tests/unit_tests/dispatch/test_discovery.py   | 134 +++++++
 tests/unit_tests/dispatch/test_policy.py      | 265 ++++++++++++++
 tests/unit_tests/dispatch/test_registry.py    | 136 +++++++
 tests/unit_tests/dispatch/test_types.py       | 177 +++++++++
 tests/unit_tests/distributed/__init__.py      |   1 +
 .../distributed/test_communicator.py          |  72 +++++
 tests/unit_tests/distributed/test_flagcx.py   | 124 ++++++
 tests/unit_tests/ops/__init__.py              |   1 +
 tests/unit_tests/ops/test_activation.py       |  36 +++
 tests/unit_tests/ops/test_layernorm.py        |  69 +++++
 tests/unit_tests/ops/test_rotary_embedding.py |  46 +++
 tests/unit_tests/worker/__init__.py           |   1 +
 tests/unit_tests/worker/test_model_runner.py  |  23 ++
 tests/unit_tests/worker/test_worker.py        | 102 +++++++
 20 files changed, 1347 insertions(+)
 create mode 100644 pyproject.toml
 create mode 100644 tests/unit_tests/__init__.py
 create mode 100644 tests/unit_tests/compilation/__init__.py
 create mode 100644 tests/unit_tests/compilation/test_graph.py
 create mode 100644 tests/unit_tests/conftest.py
 create mode 100644 tests/unit_tests/dispatch/__init__.py
 create mode 100644 tests/unit_tests/dispatch/test_discovery.py
 create mode 100644 tests/unit_tests/dispatch/test_policy.py
 create mode 100644 tests/unit_tests/dispatch/test_registry.py
 create mode 100644 tests/unit_tests/dispatch/test_types.py
 create mode 100644 tests/unit_tests/distributed/__init__.py
 create mode 100644 tests/unit_tests/distributed/test_communicator.py
 create mode 100644 
tests/unit_tests/distributed/test_flagcx.py create mode 100644 tests/unit_tests/ops/__init__.py create mode 100644 tests/unit_tests/ops/test_activation.py create mode 100644 tests/unit_tests/ops/test_layernorm.py create mode 100644 tests/unit_tests/ops/test_rotary_embedding.py create mode 100644 tests/unit_tests/worker/__init__.py create mode 100644 tests/unit_tests/worker/test_model_runner.py create mode 100644 tests/unit_tests/worker/test_worker.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..332c7e0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,58 @@ +[build-system] +requires = ["setuptools>=45", "setuptools-scm[toml]>=6.2"] +build-backend = "setuptools.build_meta" + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +markers = [ + "gpu: marks tests as requiring GPU (deselect with '-m \"not gpu\"')", + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "integration: marks tests as integration tests", + "e2e: marks tests as end-to-end tests", +] +addopts = "-v --tb=short" +filterwarnings = [ + "ignore::DeprecationWarning", + "ignore::UserWarning", +] + +[tool.coverage.run] +source = ["vllm_fl"] +omit = [ + "tests/*", + "examples/*", + "benchmarks/*", +] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise NotImplementedError", + "if TYPE_CHECKING:", + "if __name__ == .__main__.:", +] + +[tool.ruff] +line-length = 100 +target-version = "py39" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions +] +ignore = [ + "E501", # line too long (handled by formatter) + "B008", # do not perform function calls in argument defaults +] + +[tool.ruff.lint.isort] +known-first-party = ["vllm_fl"] diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py new file mode 100644 index 0000000..8806137 --- /dev/null +++ b/tests/unit_tests/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 BAAI. All rights reserved. +"""Unit tests for vllm_fl.""" diff --git a/tests/unit_tests/compilation/__init__.py b/tests/unit_tests/compilation/__init__.py new file mode 100644 index 0000000..8c90136 --- /dev/null +++ b/tests/unit_tests/compilation/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2025 BAAI. All rights reserved. diff --git a/tests/unit_tests/compilation/test_graph.py b/tests/unit_tests/compilation/test_graph.py new file mode 100644 index 0000000..38f1a54 --- /dev/null +++ b/tests/unit_tests/compilation/test_graph.py @@ -0,0 +1,76 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for compilation graph module. 
+""" + +import pytest +from unittest.mock import patch, MagicMock +from dataclasses import dataclass + + +class TestGraphClasses: + """Test graph-related classes.""" + + def test_graph_entry_import(self): + from vllm_fl.compilation.graph import GraphEntry + assert GraphEntry is not None + + def test_graph_options_import(self): + from vllm_fl.compilation.graph import GraphOptions + assert GraphOptions is not None + + def test_graph_wrapper_import(self): + from vllm_fl.compilation.graph import GraphWrapper + assert GraphWrapper is not None + + +class TestGraphOptions: + """Test GraphOptions dataclass.""" + + def test_default_values(self): + from vllm_fl.compilation.graph import GraphOptions + + options = GraphOptions() + + assert options.debug_log_enable is True + assert options.gc_disable is False + assert options.weak_ref_output is True + + def test_custom_values(self): + from vllm_fl.compilation.graph import GraphOptions + + options = GraphOptions( + debug_log_enable=False, + gc_disable=True, + weak_ref_output=False, + ) + + assert options.debug_log_enable is False + assert options.gc_disable is True + assert options.weak_ref_output is False + + +class TestGraphEntry: + """Test GraphEntry dataclass.""" + + def test_default_values(self): + from vllm_fl.compilation.graph import GraphEntry + + # Create a mock BatchDescriptor + mock_batch_desc = MagicMock() + + entry = GraphEntry(batch_descriptor=mock_batch_desc) + + assert entry.batch_descriptor is mock_batch_desc + assert entry.graph is None + assert entry.output is None + assert entry.input_addresses is None + + +class TestWeakRefTensors: + """Test weak_ref_tensors function.""" + + def test_import(self): + from vllm_fl.compilation.graph import weak_ref_tensors + assert weak_ref_tensors is not None diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py new file mode 100644 index 0000000..8c29aac --- /dev/null +++ b/tests/unit_tests/conftest.py @@ -0,0 +1,22 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Unit test fixtures and configuration. +""" + +import pytest +import torch + + +@pytest.fixture +def mock_tensor(): + """Create a simple tensor for testing.""" + return torch.randn(2, 4, 8) + + +@pytest.fixture +def device(): + """Get the available device.""" + if torch.cuda.is_available(): + return torch.device("cuda") + return torch.device("cpu") diff --git a/tests/unit_tests/dispatch/__init__.py b/tests/unit_tests/dispatch/__init__.py new file mode 100644 index 0000000..8c90136 --- /dev/null +++ b/tests/unit_tests/dispatch/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2025 BAAI. All rights reserved. diff --git a/tests/unit_tests/dispatch/test_discovery.py b/tests/unit_tests/dispatch/test_discovery.py new file mode 100644 index 0000000..67f1773 --- /dev/null +++ b/tests/unit_tests/dispatch/test_discovery.py @@ -0,0 +1,134 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for dispatch discovery module. 
+""" + +import os +import pytest +from unittest.mock import patch, MagicMock + +from vllm_fl.dispatch.discovery import ( + discover_plugins, + discover_from_env_modules, + get_discovered_plugins, + clear_discovered_plugins, + _call_register_function, + PLUGIN_MODULES_ENV, +) + + +class TestCallRegisterFunction: + def test_direct_callable(self): + registry = MagicMock() + fn = MagicMock() + + result = _call_register_function(fn, registry, "test") + + assert result is True + fn.assert_called_once_with(registry) + + def test_module_with_register_function(self): + registry = MagicMock() + module = MagicMock() + module.register = MagicMock() + + result = _call_register_function(module, registry, "test") + + assert result is True + module.register.assert_called_once_with(registry) + + def test_module_with_vllm_fl_register(self): + registry = MagicMock() + module = MagicMock(spec=["vllm_fl_register"]) + module.vllm_fl_register = MagicMock() + + result = _call_register_function(module, registry, "test") + + assert result is True + module.vllm_fl_register.assert_called_once_with(registry) + + def test_callable_raises_exception(self): + registry = MagicMock() + fn = MagicMock(side_effect=Exception("test error")) + + result = _call_register_function(fn, registry, "test") + + assert result is False + + def test_no_register_function(self): + registry = MagicMock() + module = MagicMock(spec=[]) # No register function + + result = _call_register_function(module, registry, "test") + + assert result is False + + +class TestDiscoverFromEnvModules: + @pytest.fixture(autouse=True) + def clear_plugins(self): + clear_discovered_plugins() + yield + clear_discovered_plugins() + + def test_empty_env_var(self): + with patch.dict(os.environ, {PLUGIN_MODULES_ENV: ""}): + registry = MagicMock() + result = discover_from_env_modules(registry) + assert result == 0 + + def test_no_env_var(self): + env = os.environ.copy() + env.pop(PLUGIN_MODULES_ENV, None) + with patch.dict(os.environ, env, clear=True): + registry = MagicMock() + result = discover_from_env_modules(registry) + assert result == 0 + + def test_import_error_handling(self): + with patch.dict(os.environ, {PLUGIN_MODULES_ENV: "nonexistent_module"}): + registry = MagicMock() + result = discover_from_env_modules(registry) + assert result == 0 + plugins = get_discovered_plugins() + assert len(plugins) == 1 + assert plugins[0][2] is False # success = False + + +class TestDiscoverPlugins: + @pytest.fixture(autouse=True) + def clear_plugins(self): + clear_discovered_plugins() + yield + clear_discovered_plugins() + + def test_none_registry(self): + result = discover_plugins(None) + assert result == 0 + + def test_empty_discovery(self): + with patch.dict(os.environ, {PLUGIN_MODULES_ENV: ""}): + with patch( + "vllm_fl.dispatch.discovery._get_entry_points", return_value=[] + ): + registry = MagicMock() + result = discover_plugins(registry) + assert result == 0 + + +class TestGetDiscoveredPlugins: + @pytest.fixture(autouse=True) + def clear_plugins(self): + clear_discovered_plugins() + yield + clear_discovered_plugins() + + def test_returns_copy(self): + plugins1 = get_discovered_plugins() + plugins2 = get_discovered_plugins() + assert plugins1 is not plugins2 + + def test_initially_empty(self): + plugins = get_discovered_plugins() + assert plugins == [] diff --git a/tests/unit_tests/dispatch/test_policy.py b/tests/unit_tests/dispatch/test_policy.py new file mode 100644 index 0000000..6eedaae --- /dev/null +++ b/tests/unit_tests/dispatch/test_policy.py @@ -0,0 +1,265 @@ +# 
Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for dispatch policy module. +""" + +import os +import pytest +import tempfile +from unittest.mock import patch + +from vllm_fl.dispatch.policy import ( + SelectionPolicy, + PolicyManager, + PREFER_DEFAULT, + PREFER_VENDOR, + PREFER_REFERENCE, + VALID_PREFER_VALUES, + get_policy, + set_global_policy, + reset_global_policy, + policy_context, + with_preference, + with_strict_mode, +) + + +class TestSelectionPolicy: + def test_default_values(self): + policy = SelectionPolicy() + assert policy.prefer == PREFER_DEFAULT + assert policy.strict is False + assert policy.per_op_order == () + assert policy.deny_vendors == frozenset() + assert policy.allow_vendors is None + + def test_invalid_prefer_value_raises(self): + with pytest.raises(ValueError, match="Invalid prefer value"): + SelectionPolicy(prefer="invalid") + + def test_valid_prefer_values(self): + for prefer in VALID_PREFER_VALUES: + policy = SelectionPolicy(prefer=prefer) + assert policy.prefer == prefer + + def test_from_dict(self): + policy = SelectionPolicy.from_dict( + prefer="vendor", + strict=True, + per_op_order={"silu": ["vendor", "flagos"]}, + deny_vendors={"AMD"}, + allow_vendors={"CUDA"}, + ) + assert policy.prefer == "vendor" + assert policy.strict is True + assert policy.deny_vendors == frozenset({"AMD"}) + assert policy.allow_vendors == frozenset({"CUDA"}) + + def test_get_default_order_flagos(self): + policy = SelectionPolicy(prefer=PREFER_DEFAULT) + order = policy.get_default_order() + assert order == ["flagos", "vendor", "reference"] + + def test_get_default_order_vendor(self): + policy = SelectionPolicy(prefer=PREFER_VENDOR) + order = policy.get_default_order() + assert order == ["vendor", "flagos", "reference"] + + def test_get_default_order_reference(self): + policy = SelectionPolicy(prefer=PREFER_REFERENCE) + order = policy.get_default_order() + assert order == ["reference", "flagos", "vendor"] + + def test_is_vendor_allowed_deny_list(self): + policy = SelectionPolicy(deny_vendors=frozenset({"AMD"})) + assert policy.is_vendor_allowed("CUDA") is True + assert policy.is_vendor_allowed("AMD") is False + + def test_is_vendor_allowed_allow_list(self): + policy = SelectionPolicy(allow_vendors=frozenset({"CUDA"})) + assert policy.is_vendor_allowed("CUDA") is True + assert policy.is_vendor_allowed("AMD") is False + + def test_is_vendor_allowed_combined(self): + policy = SelectionPolicy( + allow_vendors=frozenset({"CUDA", "AMD"}), + deny_vendors=frozenset({"AMD"}), + ) + assert policy.is_vendor_allowed("CUDA") is True + assert policy.is_vendor_allowed("AMD") is False + + def test_get_per_op_order(self): + policy = SelectionPolicy.from_dict( + per_op_order={"silu": ["vendor", "flagos"], "rms_norm": ["reference"]}, + ) + assert policy.get_per_op_order("silu") == ["vendor", "flagos"] + assert policy.get_per_op_order("rms_norm") == ["reference"] + assert policy.get_per_op_order("nonexistent") is None + + def test_fingerprint_uniqueness(self): + policy1 = SelectionPolicy(prefer=PREFER_DEFAULT) + policy2 = SelectionPolicy(prefer=PREFER_VENDOR) + policy3 = SelectionPolicy(prefer=PREFER_DEFAULT, strict=True) + + assert policy1.fingerprint() != policy2.fingerprint() + assert policy1.fingerprint() != policy3.fingerprint() + assert policy2.fingerprint() != policy3.fingerprint() + + def test_frozen_dataclass(self): + policy = SelectionPolicy() + with pytest.raises(AttributeError): + policy.prefer = "vendor" + + +class TestPolicyManager: + @pytest.fixture(autouse=True) + def 
reset_policy(self): + """Reset global policy before and after each test.""" + reset_global_policy() + yield + reset_global_policy() + + def test_get_instance_singleton(self): + manager1 = PolicyManager.get_instance() + manager2 = PolicyManager.get_instance() + assert manager1 is manager2 + + def test_get_policy_returns_default(self): + policy = get_policy() + assert isinstance(policy, SelectionPolicy) + assert policy.prefer == PREFER_DEFAULT + + def test_set_global_policy(self): + new_policy = SelectionPolicy(prefer=PREFER_VENDOR) + old_policy = set_global_policy(new_policy) + + current = get_policy() + assert current.prefer == PREFER_VENDOR + assert old_policy.prefer == PREFER_DEFAULT + + def test_reset_global_policy(self): + set_global_policy(SelectionPolicy(prefer=PREFER_VENDOR)) + reset_global_policy() + + policy = get_policy() + assert policy.prefer == PREFER_DEFAULT + + def test_policy_epoch_bumps(self): + manager = PolicyManager.get_instance() + epoch1 = manager.get_policy_epoch() + + set_global_policy(SelectionPolicy(prefer=PREFER_VENDOR)) + epoch2 = manager.get_policy_epoch() + + assert epoch2 > epoch1 + + +class TestPolicyContext: + @pytest.fixture(autouse=True) + def reset_policy(self): + reset_global_policy() + yield + reset_global_policy() + + def test_policy_context_override(self): + original = get_policy() + assert original.prefer == PREFER_DEFAULT + + override_policy = SelectionPolicy(prefer=PREFER_VENDOR) + with policy_context(override_policy): + inside = get_policy() + assert inside.prefer == PREFER_VENDOR + + after = get_policy() + assert after.prefer == PREFER_DEFAULT + + def test_with_preference(self): + with with_preference("vendor"): + policy = get_policy() + assert policy.prefer == "vendor" + + policy = get_policy() + assert policy.prefer == PREFER_DEFAULT + + def test_with_strict_mode(self): + with with_strict_mode(): + policy = get_policy() + assert policy.strict is True + + policy = get_policy() + assert policy.strict is False + + def test_nested_contexts(self): + with with_preference("vendor"): + assert get_policy().prefer == "vendor" + with with_strict_mode(): + policy = get_policy() + assert policy.strict is True + assert get_policy().prefer == "vendor" + + assert get_policy().prefer == PREFER_DEFAULT + + +class TestPolicyFromEnv: + @pytest.fixture(autouse=True) + def reset_policy(self): + reset_global_policy() + yield + reset_global_policy() + + def test_policy_from_env_prefer(self): + with patch.dict(os.environ, {"VLLM_FL_PREFER": "vendor"}): + reset_global_policy() + policy = get_policy() + assert policy.prefer == "vendor" + + def test_policy_from_env_strict(self): + with patch.dict(os.environ, {"VLLM_FL_STRICT": "1"}): + reset_global_policy() + policy = get_policy() + assert policy.strict is True + + def test_policy_from_env_deny_vendors(self): + with patch.dict(os.environ, {"VLLM_FL_DENY_VENDORS": "AMD,Intel"}): + reset_global_policy() + policy = get_policy() + assert "AMD" in policy.deny_vendors + assert "Intel" in policy.deny_vendors + + +class TestPolicyFromConfig: + def test_policy_from_config_file(self): + config_content = """ +prefer: vendor +strict: true +allow_vendors: + - CUDA +deny_vendors: + - AMD +op_backends: + silu: + - vendor + - flagos +""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: + f.write(config_content) + f.flush() + config_path = f.name + + try: + from vllm_fl.dispatch.policy import policy_from_config + policy = policy_from_config(config_path) + + assert policy.prefer == "vendor" + assert 
policy.strict is True + assert "CUDA" in policy.allow_vendors + assert "AMD" in policy.deny_vendors + assert policy.get_per_op_order("silu") == ["vendor", "flagos"] + finally: + os.unlink(config_path) + + def test_policy_from_nonexistent_config(self): + from vllm_fl.dispatch.policy import policy_from_config + with pytest.raises(FileNotFoundError): + policy_from_config("/nonexistent/path/config.yaml") diff --git a/tests/unit_tests/dispatch/test_registry.py b/tests/unit_tests/dispatch/test_registry.py new file mode 100644 index 0000000..77e0d97 --- /dev/null +++ b/tests/unit_tests/dispatch/test_registry.py @@ -0,0 +1,136 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for dispatch registry. +""" + +import pytest +from vllm_fl.dispatch.registry import OpRegistry, OpRegistrySnapshot +from vllm_fl.dispatch.types import BackendImplKind, OpImpl + + +class TestOpRegistry: + @pytest.fixture + def registry(self): + return OpRegistry() + + @pytest.fixture + def sample_impl(self): + return OpImpl( + op_name="silu", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x, + priority=100, + ) + + @pytest.fixture + def another_impl(self): + return OpImpl( + op_name="silu", + impl_id="reference.pytorch", + kind=BackendImplKind.REFERENCE, + fn=lambda x: x, + priority=50, + ) + + def test_register_impl(self, registry, sample_impl): + registry.register_impl(sample_impl) + result = registry.get_implementation("silu", "default.flagos") + assert result == sample_impl + + def test_register_impl_duplicate_raises(self, registry, sample_impl): + registry.register_impl(sample_impl) + with pytest.raises(ValueError, match="Duplicate impl_id"): + registry.register_impl(sample_impl) + + def test_register_many(self, registry, sample_impl, another_impl): + registry.register_many([sample_impl, another_impl]) + assert registry.get_implementation("silu", "default.flagos") == sample_impl + assert registry.get_implementation("silu", "reference.pytorch") == another_impl + + def test_get_implementations(self, registry, sample_impl, another_impl): + registry.register_many([sample_impl, another_impl]) + impls = registry.get_implementations("silu") + assert len(impls) == 2 + assert sample_impl in impls + assert another_impl in impls + + def test_get_implementations_empty(self, registry): + impls = registry.get_implementations("nonexistent") + assert impls == [] + + def test_get_implementation_not_found(self, registry): + result = registry.get_implementation("nonexistent", "any_id") + assert result is None + + def test_list_operators(self, registry, sample_impl, another_impl): + rms_impl = OpImpl( + op_name="rms_norm", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x, + ) + registry.register_many([sample_impl, another_impl, rms_impl]) + ops = registry.list_operators() + assert set(ops) == {"silu", "rms_norm"} + + def test_list_operators_empty(self, registry): + assert registry.list_operators() == [] + + def test_clear(self, registry, sample_impl): + registry.register_impl(sample_impl) + registry.clear() + assert registry.list_operators() == [] + assert registry.get_implementation("silu", "default.flagos") is None + + def test_snapshot(self, registry, sample_impl, another_impl): + registry.register_many([sample_impl, another_impl]) + snapshot = registry.snapshot() + + assert isinstance(snapshot, OpRegistrySnapshot) + assert "silu" in snapshot.impls_by_op + assert len(snapshot.impls_by_op["silu"]) == 2 + + def test_snapshot_is_immutable_copy(self, registry, 
sample_impl): + registry.register_impl(sample_impl) + snapshot = registry.snapshot() + + # Register more after snapshot + new_impl = OpImpl( + op_name="rms_norm", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x, + ) + registry.register_impl(new_impl) + + # Snapshot should not contain the new impl + assert "rms_norm" not in snapshot.impls_by_op + + def test_thread_safety(self, registry): + """Test basic thread safety of registry operations.""" + import threading + + errors = [] + + def register_impl(i): + try: + impl = OpImpl( + op_name=f"op_{i}", + impl_id=f"impl_{i}", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x, + ) + registry.register_impl(impl) + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=register_impl, args=(i,)) for i in range(10)] + for t in threads: + t.start() + for t in threads: + t.join() + + assert len(errors) == 0 + assert len(registry.list_operators()) == 10 diff --git a/tests/unit_tests/dispatch/test_types.py b/tests/unit_tests/dispatch/test_types.py new file mode 100644 index 0000000..9b9aa00 --- /dev/null +++ b/tests/unit_tests/dispatch/test_types.py @@ -0,0 +1,177 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for dispatch type definitions. +""" + +import pytest +from vllm_fl.dispatch.types import ( + BackendImplKind, + BackendPriority, + OpImpl, + match_token, +) + + +class TestBackendImplKind: + def test_enum_values(self): + assert BackendImplKind.DEFAULT.value == "flagos" + assert BackendImplKind.REFERENCE.value == "reference" + assert BackendImplKind.VENDOR.value == "vendor" + + def test_str_representation(self): + assert str(BackendImplKind.DEFAULT) == "flagos" + assert str(BackendImplKind.REFERENCE) == "reference" + assert str(BackendImplKind.VENDOR) == "vendor" + + +class TestBackendPriority: + def test_priority_ordering(self): + assert BackendPriority.DEFAULT > BackendPriority.VENDOR + assert BackendPriority.VENDOR > BackendPriority.REFERENCE + assert BackendPriority.DEFAULT > BackendPriority.REFERENCE + + +class TestOpImpl: + def test_create_default_impl(self): + fn = lambda x: x + impl = OpImpl( + op_name="silu", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=fn, + priority=BackendPriority.DEFAULT, + ) + assert impl.op_name == "silu" + assert impl.impl_id == "default.flagos" + assert impl.kind == BackendImplKind.DEFAULT + assert impl.vendor is None + + def test_create_vendor_impl_requires_vendor_name(self): + fn = lambda x: x + with pytest.raises(ValueError, match="must specify vendor name"): + OpImpl( + op_name="silu", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=fn, + ) + + def test_create_vendor_impl_with_vendor_name(self): + fn = lambda x: x + impl = OpImpl( + op_name="silu", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=fn, + vendor="CUDA", + ) + assert impl.vendor == "CUDA" + + def test_is_available_default(self): + fn = lambda x: x + impl = OpImpl( + op_name="silu", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=fn, + ) + assert impl.is_available() is True + + def test_is_available_with_checker(self): + def fn(x): + return x + + fn._is_available = lambda: True + impl = OpImpl( + op_name="silu", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=fn, + ) + assert impl.is_available() is True + + fn._is_available = lambda: False + assert impl.is_available() is False + + def test_is_available_handles_exception(self): + def fn(x): + return x + + fn._is_available = lambda: 1 / 0 # Raises 
ZeroDivisionError + impl = OpImpl( + op_name="silu", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=fn, + ) + assert impl.is_available() is False + + def test_frozen_dataclass(self): + fn = lambda x: x + impl = OpImpl( + op_name="silu", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=fn, + ) + with pytest.raises(AttributeError): + impl.op_name = "new_name" + + +class TestMatchToken: + @pytest.fixture + def default_impl(self): + return OpImpl( + op_name="silu", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x, + ) + + @pytest.fixture + def reference_impl(self): + return OpImpl( + op_name="silu", + impl_id="reference.pytorch", + kind=BackendImplKind.REFERENCE, + fn=lambda x: x, + ) + + @pytest.fixture + def vendor_impl(self): + return OpImpl( + op_name="silu", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=lambda x: x, + vendor="CUDA", + ) + + def test_match_flagos_token(self, default_impl, reference_impl, vendor_impl): + assert match_token(default_impl, "flagos") is True + assert match_token(reference_impl, "flagos") is False + assert match_token(vendor_impl, "flagos") is False + + def test_match_reference_token(self, default_impl, reference_impl, vendor_impl): + assert match_token(default_impl, "reference") is False + assert match_token(reference_impl, "reference") is True + assert match_token(vendor_impl, "reference") is False + + def test_match_vendor_token(self, default_impl, reference_impl, vendor_impl): + assert match_token(default_impl, "vendor") is False + assert match_token(reference_impl, "vendor") is False + assert match_token(vendor_impl, "vendor") is True + + def test_match_vendor_specific(self, vendor_impl): + assert match_token(vendor_impl, "vendor:CUDA") is True + assert match_token(vendor_impl, "vendor:AMD") is False + + def test_match_impl_id(self, default_impl, reference_impl): + assert match_token(default_impl, "impl:default.flagos") is True + assert match_token(default_impl, "impl:reference.pytorch") is False + assert match_token(reference_impl, "impl:reference.pytorch") is True + + def test_unknown_token_returns_false(self, default_impl): + assert match_token(default_impl, "unknown") is False + assert match_token(default_impl, "") is False diff --git a/tests/unit_tests/distributed/__init__.py b/tests/unit_tests/distributed/__init__.py new file mode 100644 index 0000000..8c90136 --- /dev/null +++ b/tests/unit_tests/distributed/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2025 BAAI. All rights reserved. diff --git a/tests/unit_tests/distributed/test_communicator.py b/tests/unit_tests/distributed/test_communicator.py new file mode 100644 index 0000000..14bd380 --- /dev/null +++ b/tests/unit_tests/distributed/test_communicator.py @@ -0,0 +1,72 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for distributed communicator module. 
+""" + +import pytest +import torch +from unittest.mock import patch, MagicMock, PropertyMock + + +class TestCommunicatorFL: + @pytest.fixture + def mock_pyflagcx(self): + with patch("vllm_fl.distributed.communicator.PyFlagcxCommunicator") as mock: + yield mock + + @pytest.fixture + def mock_base_communicator(self): + with patch( + "vllm_fl.distributed.communicator.DeviceCommunicatorBase.__init__" + ) as mock: + mock.return_value = None + yield mock + + def test_import(self): + from vllm_fl.distributed.communicator import CommunicatorFL + assert CommunicatorFL is not None + + def test_init_single_worker_no_pyflagcx(self, mock_base_communicator, mock_pyflagcx): + from vllm_fl.distributed.communicator import CommunicatorFL + + # Mock the base class attributes + with patch.object(CommunicatorFL, "world_size", new_callable=PropertyMock) as mock_ws: + mock_ws.return_value = 1 + + with patch.object(CommunicatorFL, "use_all2all", new_callable=PropertyMock) as mock_a2a: + mock_a2a.return_value = False + + cpu_group = MagicMock() + device = torch.device("cpu") + + comm = CommunicatorFL.__new__(CommunicatorFL) + comm.world_size = 1 + comm.use_all2all = False + comm.cpu_group = cpu_group + comm.device = device + comm.pyflagcx_comm = None + + # Single worker should not create pyflagcx communicator + assert comm.pyflagcx_comm is None + + +class TestPyFlagcxCommunicator: + def test_import(self): + # Test that the module can be imported (may fail if flagcx not available) + try: + from vllm_fl.distributed.device_communicators.flagcx import PyFlagcxCommunicator + assert PyFlagcxCommunicator is not None + except ImportError: + pytest.skip("flagcx not available") + + def test_disabled_communicator_returns_none(self): + """Test that disabled communicator methods return None/early exit.""" + # Create a mock disabled communicator + mock_comm = MagicMock() + mock_comm.disabled = True + mock_comm.all_reduce.return_value = None + + # When disabled, all_reduce should return None + result = mock_comm.all_reduce(torch.randn(2, 4)) + assert result is None diff --git a/tests/unit_tests/distributed/test_flagcx.py b/tests/unit_tests/distributed/test_flagcx.py new file mode 100644 index 0000000..4b85f16 --- /dev/null +++ b/tests/unit_tests/distributed/test_flagcx.py @@ -0,0 +1,124 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for flagcx communicator module. 
+""" + +import pytest +import torch +from unittest.mock import patch, MagicMock + + +class TestPyFlagcxCommunicator: + """Test PyFlagcxCommunicator class.""" + + @pytest.fixture + def mock_flagcx_library(self): + """Mock the FLAGCXLibrary.""" + with patch( + "vllm_fl.distributed.device_communicators.flagcx.FLAGCXLibrary" + ) as mock: + yield mock + + def test_world_size_one_disabled(self): + """Test that communicator is disabled for world_size=1.""" + # When world_size is 1, the communicator should be disabled + mock_group = MagicMock() + mock_group.rank = 0 + mock_group.world_size = 1 + + # Create a mock communicator with world_size=1 + comm = MagicMock() + comm.world_size = 1 + comm.available = False + comm.disabled = True + + assert comm.disabled is True + assert comm.available is False + + def test_all_reduce_disabled_returns_none(self): + """Test that all_reduce returns None when disabled.""" + comm = MagicMock() + comm.disabled = True + + # Simulate the actual behavior + def mock_all_reduce(tensor, out_tensor=None, op=None, stream=None): + if comm.disabled: + return None + return torch.empty_like(tensor) + + comm.all_reduce = mock_all_reduce + result = comm.all_reduce(torch.randn(2, 4)) + assert result is None + + def test_send_disabled_early_return(self): + """Test that send returns early when disabled.""" + comm = MagicMock() + comm.disabled = True + + call_count = [0] + + def mock_send(tensor, dst, stream=None): + if comm.disabled: + return + call_count[0] += 1 + + comm.send = mock_send + comm.send(torch.randn(2, 4), dst=1) + + # Should return early without doing anything + assert call_count[0] == 0 + + def test_recv_disabled_early_return(self): + """Test that recv returns early when disabled.""" + comm = MagicMock() + comm.disabled = True + + call_count = [0] + + def mock_recv(tensor, src, stream=None): + if comm.disabled: + return + call_count[0] += 1 + + comm.recv = mock_recv + comm.recv(torch.randn(2, 4), src=0) + + # Should return early without doing anything + assert call_count[0] == 0 + + def test_reduce_scatter_disabled_early_return(self): + """Test that reduce_scatter returns early when disabled.""" + comm = MagicMock() + comm.disabled = True + + call_count = [0] + + def mock_reduce_scatter(output_tensor, input_tensor, op=None, stream=None): + if comm.disabled: + return + call_count[0] += 1 + + comm.reduce_scatter = mock_reduce_scatter + comm.reduce_scatter(torch.randn(2, 4), torch.randn(4, 4)) + + assert call_count[0] == 0 + + +class TestFlagcxDataTypes: + """Test flagcx data type mappings.""" + + def test_torch_dtype_mapping_concept(self): + """Test the concept of torch dtype to flagcx dtype mapping.""" + dtype_map = { + torch.float32: "FLOAT", + torch.float16: "HALF", + torch.bfloat16: "BFLOAT16", + torch.int32: "INT32", + torch.int64: "INT64", + } + + # Verify common dtypes are mappable + for torch_dtype, flagcx_name in dtype_map.items(): + assert torch_dtype is not None + assert isinstance(flagcx_name, str) diff --git a/tests/unit_tests/ops/__init__.py b/tests/unit_tests/ops/__init__.py new file mode 100644 index 0000000..8c90136 --- /dev/null +++ b/tests/unit_tests/ops/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2025 BAAI. All rights reserved. diff --git a/tests/unit_tests/ops/test_activation.py b/tests/unit_tests/ops/test_activation.py new file mode 100644 index 0000000..fe830f3 --- /dev/null +++ b/tests/unit_tests/ops/test_activation.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for activation ops. 
+""" + +import pytest +import torch +from unittest.mock import patch, MagicMock + + +class TestSiluAndMulFL: + @pytest.fixture + def mock_call_op(self): + with patch("vllm_fl.ops.activation.call_op") as mock: + yield mock + + def test_import(self): + from vllm_fl.ops.activation import SiluAndMulFL + assert SiluAndMulFL is not None + + def test_forward_oot_calls_dispatch(self, mock_call_op): + from vllm_fl.ops.activation import SiluAndMulFL + + mock_call_op.return_value = torch.randn(2, 4) + layer = SiluAndMulFL() + x = torch.randn(2, 8) + + result = layer.forward_oot(x) + + mock_call_op.assert_called_once_with("silu_and_mul", x) + assert result.shape == (2, 4) + + def test_all_exports(self): + from vllm_fl.ops.activation import __all__ + assert "SiluAndMulFL" in __all__ diff --git a/tests/unit_tests/ops/test_layernorm.py b/tests/unit_tests/ops/test_layernorm.py new file mode 100644 index 0000000..9f1f332 --- /dev/null +++ b/tests/unit_tests/ops/test_layernorm.py @@ -0,0 +1,69 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for layernorm ops. +""" + +import pytest +import torch +from unittest.mock import patch, MagicMock + + +class TestRMSNormFL: + @pytest.fixture + def mock_call_op(self): + with patch("vllm_fl.ops.layernorm.call_op") as mock: + yield mock + + def test_import(self): + from vllm_fl.ops.layernorm import RMSNormFL + assert RMSNormFL is not None + + def test_init_params(self): + from vllm_fl.ops.layernorm import RMSNormFL + + hidden_size = 128 + eps = 1e-5 + layer = RMSNormFL(hidden_size=hidden_size, eps=eps) + + assert layer.variance_epsilon == eps + assert layer.weight.shape == (hidden_size,) + + def test_forward_oot_without_residual(self, mock_call_op): + from vllm_fl.ops.layernorm import RMSNormFL + + hidden_size = 128 + mock_call_op.return_value = torch.randn(2, hidden_size) + + layer = RMSNormFL(hidden_size=hidden_size) + x = torch.randn(2, hidden_size) + + result = layer.forward_oot(x) + + mock_call_op.assert_called_once() + call_args = mock_call_op.call_args + assert call_args[0][0] == "rms_norm" + assert torch.equal(call_args[0][1], x) + assert call_args[0][2] is None # residual + assert torch.equal(call_args[0][3], layer.weight) + + def test_forward_oot_with_residual(self, mock_call_op): + from vllm_fl.ops.layernorm import RMSNormFL + + hidden_size = 128 + mock_call_op.return_value = (torch.randn(2, hidden_size), torch.randn(2, hidden_size)) + + layer = RMSNormFL(hidden_size=hidden_size) + x = torch.randn(2, hidden_size) + residual = torch.randn(2, hidden_size) + + result = layer.forward_oot(x, residual=residual) + + mock_call_op.assert_called_once() + call_args = mock_call_op.call_args + assert call_args[0][0] == "rms_norm" + assert torch.equal(call_args[0][2], residual) + + def test_all_exports(self): + from vllm_fl.ops.layernorm import __all__ + assert "RMSNormFL" in __all__ diff --git a/tests/unit_tests/ops/test_rotary_embedding.py b/tests/unit_tests/ops/test_rotary_embedding.py new file mode 100644 index 0000000..02f0568 --- /dev/null +++ b/tests/unit_tests/ops/test_rotary_embedding.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for rotary embedding ops. 
+""" + +import pytest +import torch +from unittest.mock import patch, MagicMock + + +class TestRotaryEmbeddingFL: + @pytest.fixture + def mock_call_op(self): + with patch("vllm_fl.ops.rotary_embedding.call_op") as mock: + yield mock + + def test_import(self): + from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL + assert RotaryEmbeddingFL is not None + + def test_init_params(self): + from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL + + head_size = 64 + rotary_dim = 32 + max_position_embeddings = 2048 + base = 10000.0 + + layer = RotaryEmbeddingFL( + head_size=head_size, + rotary_dim=rotary_dim, + max_position_embeddings=max_position_embeddings, + base=base, + is_neox_style=True, + dtype=torch.float32, + ) + + assert layer.head_size == head_size + assert layer.rotary_dim == rotary_dim + assert layer.max_position_embeddings == max_position_embeddings + assert layer.is_neox_style is True + + def test_all_exports(self): + from vllm_fl.ops.rotary_embedding import __all__ + assert "RotaryEmbeddingFL" in __all__ diff --git a/tests/unit_tests/worker/__init__.py b/tests/unit_tests/worker/__init__.py new file mode 100644 index 0000000..8c90136 --- /dev/null +++ b/tests/unit_tests/worker/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2025 BAAI. All rights reserved. diff --git a/tests/unit_tests/worker/test_model_runner.py b/tests/unit_tests/worker/test_model_runner.py new file mode 100644 index 0000000..3ede95d --- /dev/null +++ b/tests/unit_tests/worker/test_model_runner.py @@ -0,0 +1,23 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for model runner module. +""" + +import pytest +from unittest.mock import patch, MagicMock + + +class TestModelRunnerFL: + """Test ModelRunnerFL class.""" + + def test_import(self): + from vllm_fl.worker.model_runner import ModelRunnerFL + assert ModelRunnerFL is not None + + def test_class_inheritance(self): + """Test that ModelRunnerFL inherits from expected base classes.""" + from vllm_fl.worker.model_runner import ModelRunnerFL + + # Check that it's a class + assert isinstance(ModelRunnerFL, type) diff --git a/tests/unit_tests/worker/test_worker.py b/tests/unit_tests/worker/test_worker.py new file mode 100644 index 0000000..baccd0e --- /dev/null +++ b/tests/unit_tests/worker/test_worker.py @@ -0,0 +1,102 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for worker module. 
+""" + +import pytest +from unittest.mock import patch, MagicMock + + +class TestWorkerFL: + """Test WorkerFL class.""" + + def test_import(self): + from vllm_fl.worker.worker import WorkerFL + assert WorkerFL is not None + + def test_memory_snapshot_import(self): + from vllm_fl.worker.worker import MemorySnapshot + assert MemorySnapshot is not None + + def test_memory_profiling_result_import(self): + from vllm_fl.worker.worker import MemoryProfilingResult + assert MemoryProfilingResult is not None + + +class TestMemorySnapshot: + """Test MemorySnapshot dataclass.""" + + def test_default_values(self): + from vllm_fl.worker.worker import MemorySnapshot + + # Create with auto_measure=False to avoid actual GPU calls + snapshot = MemorySnapshot(auto_measure=False) + + assert snapshot.torch_peak == 0 + assert snapshot.free_memory == 0 + assert snapshot.total_memory == 0 + assert snapshot.cuda_memory == 0 + assert snapshot.torch_memory == 0 + assert snapshot.non_torch_memory == 0 + + def test_subtraction(self): + from vllm_fl.worker.worker import MemorySnapshot + + snapshot1 = MemorySnapshot(auto_measure=False) + snapshot1.torch_peak = 1000 + snapshot1.free_memory = 5000 + snapshot1.total_memory = 10000 + snapshot1.cuda_memory = 5000 + snapshot1.torch_memory = 3000 + snapshot1.non_torch_memory = 2000 + snapshot1.timestamp = 10.0 + + snapshot2 = MemorySnapshot(auto_measure=False) + snapshot2.torch_peak = 500 + snapshot2.free_memory = 6000 + snapshot2.total_memory = 10000 + snapshot2.cuda_memory = 4000 + snapshot2.torch_memory = 2000 + snapshot2.non_torch_memory = 2000 + snapshot2.timestamp = 5.0 + + diff = snapshot1 - snapshot2 + + assert diff.torch_peak == 500 + assert diff.free_memory == -1000 + assert diff.cuda_memory == 1000 + assert diff.torch_memory == 1000 + assert diff.timestamp == 5.0 + + +class TestMemoryProfilingResult: + """Test MemoryProfilingResult dataclass.""" + + def test_default_values(self): + from vllm_fl.worker.worker import MemoryProfilingResult + + result = MemoryProfilingResult() + + assert result.weights_memory == 0 + assert result.torch_peak_increase == 0 + assert result.non_torch_increase == 0 + assert result.non_kv_cache_memory == 0 + assert result.profile_time == 0.0 + + def test_before_profile_defaults(self): + from vllm_fl.worker.worker import MemoryProfilingResult, MemorySnapshot + + result = MemoryProfilingResult() + + # Should create default MemorySnapshot objects + assert result.before_profile is not None + assert result.after_profile is not None + + +class TestInitWorkerDistributedEnvironment: + """Test init_worker_distributed_environment function.""" + + def test_import(self): + from vllm_fl.worker.worker import init_worker_distributed_environment + assert init_worker_distributed_environment is not None From ac8e70acd68b697298b991d4dfe28c82770e0d4f Mon Sep 17 00:00:00 2001 From: xmhubj Date: Wed, 4 Feb 2026 20:36:58 +0800 Subject: [PATCH 03/14] add functional tests --- pyproject.toml | 5 +- tests/{e2e => e2e_tests}/conftest.py | 0 .../test_offline_inference.py | 2 +- tests/functional_tests/__init__.py | 2 + .../functional_tests/compilation/__init__.py | 2 + .../compilation/test_graph_capture.py | 237 ++++++++++++ tests/functional_tests/conftest.py | 96 +++++ tests/functional_tests/dispatch/__init__.py | 2 + .../dispatch/test_dispatch_flow.py | 357 ++++++++++++++++++ .../functional_tests/distributed/__init__.py | 2 + .../distributed/test_collective_ops.py | 209 ++++++++++ .../flaggems/test_flaggems_get_ops.py | 0 .../flaggems/test_gems_whitelist.py | 0 
tests/functional_tests/ops/__init__.py | 2 + .../ops/test_ops_correctness.py | 311 +++++++++++++++ 15 files changed, 1225 insertions(+), 2 deletions(-) rename tests/{e2e => e2e_tests}/conftest.py (100%) rename tests/{e2e => e2e_tests}/test_offline_inference.py (97%) create mode 100644 tests/functional_tests/__init__.py create mode 100644 tests/functional_tests/compilation/__init__.py create mode 100644 tests/functional_tests/compilation/test_graph_capture.py create mode 100644 tests/functional_tests/conftest.py create mode 100644 tests/functional_tests/dispatch/__init__.py create mode 100644 tests/functional_tests/dispatch/test_dispatch_flow.py create mode 100644 tests/functional_tests/distributed/__init__.py create mode 100644 tests/functional_tests/distributed/test_collective_ops.py rename tests/{ => functional_tests}/flaggems/test_flaggems_get_ops.py (100%) rename tests/{ => functional_tests}/flaggems/test_gems_whitelist.py (100%) create mode 100644 tests/functional_tests/ops/__init__.py create mode 100644 tests/functional_tests/ops/test_ops_correctness.py diff --git a/pyproject.toml b/pyproject.toml index 332c7e0..ca32a8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,10 +8,13 @@ python_files = ["test_*.py"] python_classes = ["Test*"] python_functions = ["test_*"] markers = [ - "gpu: marks tests as requiring GPU (deselect with '-m \"not gpu\"')", + "gpu: marks tests as requiring single GPU (deselect with '-m \"not gpu\"')", + "multi_gpu: marks tests as requiring multiple GPUs", "slow: marks tests as slow (deselect with '-m \"not slow\"')", "integration: marks tests as integration tests", "e2e: marks tests as end-to-end tests", + "flaggems: marks tests as requiring flag_gems library", + "functional: marks tests as functional tests", ] addopts = "-v --tb=short" filterwarnings = [ diff --git a/tests/e2e/conftest.py b/tests/e2e_tests/conftest.py similarity index 100% rename from tests/e2e/conftest.py rename to tests/e2e_tests/conftest.py diff --git a/tests/e2e/test_offline_inference.py b/tests/e2e_tests/test_offline_inference.py similarity index 97% rename from tests/e2e/test_offline_inference.py rename to tests/e2e_tests/test_offline_inference.py index 7627ab8..e19ad8f 100644 --- a/tests/e2e/test_offline_inference.py +++ b/tests/e2e_tests/test_offline_inference.py @@ -4,7 +4,7 @@ import pytest import vllm # noqa: F401 from conftest import VllmRunner -import vllm_flagos +import vllm_fl # noqa: F401 MODELS = [ # "Qwen/Qwen3-0.6B", diff --git a/tests/functional_tests/__init__.py b/tests/functional_tests/__init__.py new file mode 100644 index 0000000..edb92d6 --- /dev/null +++ b/tests/functional_tests/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 BAAI. All rights reserved. +"""Functional tests for vllm_fl.""" diff --git a/tests/functional_tests/compilation/__init__.py b/tests/functional_tests/compilation/__init__.py new file mode 100644 index 0000000..99883a5 --- /dev/null +++ b/tests/functional_tests/compilation/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 BAAI. All rights reserved. +"""Compilation functional tests.""" diff --git a/tests/functional_tests/compilation/test_graph_capture.py b/tests/functional_tests/compilation/test_graph_capture.py new file mode 100644 index 0000000..e3e34da --- /dev/null +++ b/tests/functional_tests/compilation/test_graph_capture.py @@ -0,0 +1,237 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Functional tests for graph capture and replay. +Tests CUDA/NPU graph functionality for model optimization. 
+""" + +import pytest +import torch +from unittest.mock import MagicMock, patch +from dataclasses import dataclass + + +# Mark all tests as requiring GPU +pytestmark = pytest.mark.gpu + + +class TestGraphClasses: + """Test graph-related class functionality.""" + + def test_graph_options_defaults(self): + """Test GraphOptions default values.""" + try: + from vllm_fl.compilation.graph import GraphOptions + except ImportError: + pytest.skip("GraphOptions not available") + + options = GraphOptions() + assert options.debug_log_enable is True + assert options.gc_disable is False + assert options.weak_ref_output is True + + def test_graph_options_custom(self): + """Test GraphOptions with custom values.""" + try: + from vllm_fl.compilation.graph import GraphOptions + except ImportError: + pytest.skip("GraphOptions not available") + + options = GraphOptions( + debug_log_enable=False, + gc_disable=True, + weak_ref_output=False, + ) + assert options.debug_log_enable is False + assert options.gc_disable is True + assert options.weak_ref_output is False + + def test_graph_entry_creation(self): + """Test GraphEntry dataclass.""" + try: + from vllm_fl.compilation.graph import GraphEntry + except ImportError: + pytest.skip("GraphEntry not available") + + mock_batch_desc = MagicMock() + entry = GraphEntry(batch_descriptor=mock_batch_desc) + + assert entry.batch_descriptor is mock_batch_desc + assert entry.graph is None + assert entry.output is None + assert entry.input_addresses is None + + +class TestGraphWrapper: + """Test GraphWrapper functionality.""" + + @pytest.fixture + def mock_vllm_config(self): + """Create mock VllmConfig.""" + config = MagicMock() + config.compilation_config = MagicMock() + config.compilation_config.cudagraph_capture_sizes = [1, 2, 4, 8] + return config + + def test_graph_wrapper_import(self): + """Test GraphWrapper can be imported.""" + try: + from vllm_fl.compilation.graph import GraphWrapper + assert GraphWrapper is not None + except ImportError: + pytest.skip("GraphWrapper not available") + + def test_graph_wrapper_unwrap(self): + """Test GraphWrapper.unwrap returns original runnable.""" + try: + from vllm_fl.compilation.graph import GraphWrapper, GraphOptions + from vllm.config import CUDAGraphMode + except ImportError: + pytest.skip("Required imports not available") + + def simple_runnable(x): + return x * 2 + + mock_config = MagicMock() + mock_config.compilation_config = MagicMock() + + # Mock the platform + with patch("vllm_fl.compilation.graph.current_platform") as mock_platform: + mock_platform.get_global_graph_pool.return_value = MagicMock() + + wrapper = GraphWrapper( + runnable=simple_runnable, + vllm_config=mock_config, + runtime_mode=CUDAGraphMode.FULL, + ) + + assert wrapper.unwrap() is simple_runnable + + +class TestWeakRefTensors: + """Test weak reference tensor functionality.""" + + def test_weak_ref_tensors_function(self): + """Test weak_ref_tensors function exists.""" + try: + from vllm_fl.compilation.graph import weak_ref_tensors + assert weak_ref_tensors is not None + except ImportError: + pytest.skip("weak_ref_tensors not available") + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_weak_ref_tensors_with_cuda_tensor(self): + """Test weak_ref_tensors with CUDA tensor.""" + try: + from vllm_fl.compilation.graph import weak_ref_tensors + except ImportError: + pytest.skip("weak_ref_tensors not available") + + tensor = torch.randn(4, 8, device="cuda") + result = weak_ref_tensors(tensor) + # Result should be either 
the tensor or a weak reference + assert result is not None + + +class TestGraphCaptureFlow: + """Test the complete graph capture flow.""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_cuda_graph_basic_capture(self): + """Test basic CUDA graph capture and replay.""" + # Simple test without vllm_fl dependencies + device = torch.device("cuda") + + # Create a simple computation + def computation(x): + return x * 2 + 1 + + # Create input tensor + x = torch.randn(4, 8, device=device) + + # Warmup + y = computation(x) + + # Capture graph + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + y = computation(x) + + # Replay graph + g.replay() + + # Verify output + expected = x * 2 + 1 + assert torch.allclose(y, expected) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_cuda_graph_with_different_inputs(self): + """Test CUDA graph with different input values.""" + device = torch.device("cuda") + + # Static input buffer + static_input = torch.randn(4, 8, device=device) + static_output = torch.empty(4, 8, device=device) + + def computation(x, out): + out.copy_(x * 2) + + # Warmup + computation(static_input, static_output) + + # Capture graph + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + computation(static_input, static_output) + + # Test with new input values (copy to static buffer) + new_input = torch.ones(4, 8, device=device) + static_input.copy_(new_input) + + # Replay + g.replay() + + expected = new_input * 2 + assert torch.allclose(static_output, expected) + + +class TestGraphCacheManagement: + """Test graph cache management functionality.""" + + def test_batch_descriptor_hashing(self): + """Test that batch descriptors can be used as dict keys.""" + @dataclass(frozen=True) + class MockBatchDescriptor: + num_tokens: int + max_num_reqs: int + + desc1 = MockBatchDescriptor(num_tokens=16, max_num_reqs=4) + desc2 = MockBatchDescriptor(num_tokens=16, max_num_reqs=4) + desc3 = MockBatchDescriptor(num_tokens=32, max_num_reqs=8) + + cache = {} + cache[desc1] = "graph1" + cache[desc3] = "graph3" + + # Same values should hash to same key + assert cache[desc2] == "graph1" + assert cache[desc3] == "graph3" + + def test_graph_entry_storage(self): + """Test storing graph entries in cache.""" + try: + from vllm_fl.compilation.graph import GraphEntry + except ImportError: + pytest.skip("GraphEntry not available") + + @dataclass(frozen=True) + class MockBatchDescriptor: + num_tokens: int + + cache = {} + desc = MockBatchDescriptor(num_tokens=16) + + entry = GraphEntry(batch_descriptor=desc) + cache[desc] = entry + + assert cache[desc].batch_descriptor.num_tokens == 16 diff --git a/tests/functional_tests/conftest.py b/tests/functional_tests/conftest.py new file mode 100644 index 0000000..e765f35 --- /dev/null +++ b/tests/functional_tests/conftest.py @@ -0,0 +1,96 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Functional test fixtures and configuration. 
+""" + +import os +import pytest +import torch + + +def pytest_configure(config): + """Register custom markers.""" + config.addinivalue_line("markers", "gpu: marks tests as requiring GPU") + config.addinivalue_line("markers", "multi_gpu: marks tests as requiring multiple GPUs") + config.addinivalue_line("markers", "flaggems: marks tests as requiring flag_gems library") + + +@pytest.fixture(scope="session") +def has_gpu(): + """Check if GPU is available.""" + return torch.cuda.is_available() + + +@pytest.fixture(scope="session") +def device(has_gpu): + """Get the test device.""" + if has_gpu: + return torch.device("cuda:0") + return torch.device("cpu") + + +@pytest.fixture(scope="session") +def gpu_count(): + """Get the number of available GPUs.""" + if torch.cuda.is_available(): + return torch.cuda.device_count() + return 0 + + +@pytest.fixture +def reset_dispatch_manager(): + """Reset dispatch manager before and after test.""" + from vllm_fl.dispatch import reset_default_manager, reset_global_policy + + reset_default_manager() + reset_global_policy() + yield + reset_default_manager() + reset_global_policy() + + +@pytest.fixture +def clean_env(): + """Clean dispatch-related environment variables.""" + env_vars = [ + "VLLM_FL_PREFER", + "VLLM_FL_STRICT", + "VLLM_FL_CONFIG", + "VLLM_FL_DENY_VENDORS", + "VLLM_FL_ALLOW_VENDORS", + "VLLM_FL_PER_OP", + "VLLM_FL_DISPATCH_DEBUG", + ] + + # Save original values + original = {k: os.environ.get(k) for k in env_vars} + + # Clear env vars + for k in env_vars: + os.environ.pop(k, None) + + yield + + # Restore original values + for k, v in original.items(): + if v is not None: + os.environ[k] = v + else: + os.environ.pop(k, None) + + +def skip_if_no_gpu(fn): + """Decorator to skip test if no GPU is available.""" + return pytest.mark.skipif( + not torch.cuda.is_available(), + reason="GPU not available" + )(fn) + + +def skip_if_no_multi_gpu(fn): + """Decorator to skip test if less than 2 GPUs available.""" + return pytest.mark.skipif( + not torch.cuda.is_available() or torch.cuda.device_count() < 2, + reason="Multiple GPUs not available" + )(fn) diff --git a/tests/functional_tests/dispatch/__init__.py b/tests/functional_tests/dispatch/__init__.py new file mode 100644 index 0000000..fb91e8d --- /dev/null +++ b/tests/functional_tests/dispatch/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 BAAI. All rights reserved. +"""Dispatch functional tests.""" diff --git a/tests/functional_tests/dispatch/test_dispatch_flow.py b/tests/functional_tests/dispatch/test_dispatch_flow.py new file mode 100644 index 0000000..2ed6fd0 --- /dev/null +++ b/tests/functional_tests/dispatch/test_dispatch_flow.py @@ -0,0 +1,357 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Functional tests for dispatch flow. +Tests the complete operator dispatch mechanism including +registration, resolution, and policy-based selection. 
+""" + +import os +import pytest +import tempfile + +from vllm_fl.dispatch import ( + OpRegistry, + OpManager, + OpImpl, + BackendImplKind, + BackendPriority, + SelectionPolicy, + get_default_manager, + reset_default_manager, + call_op, + resolve_op, + get_policy, + set_global_policy, + reset_global_policy, + policy_context, + with_preference, + PREFER_DEFAULT, + PREFER_VENDOR, + PREFER_REFERENCE, +) + + +class TestDispatchManagerInitialization: + """Test OpManager initialization and registration.""" + + @pytest.fixture(autouse=True) + def setup(self, reset_dispatch_manager, clean_env): + """Reset manager before each test.""" + pass + + def test_manager_initializes_lazily(self): + """Test that manager initializes on first use.""" + manager = get_default_manager() + assert manager is not None + + # Should be initialized after first call + manager.ensure_initialized() + assert manager._state.initialized is True + + def test_manager_registers_builtin_ops(self): + """Test that built-in operators are registered.""" + manager = get_default_manager() + manager.ensure_initialized() + + snap = manager.registry.snapshot() + + # Should have some operators registered + assert len(snap.impls_by_op) > 0 + + def test_manager_singleton(self): + """Test that get_default_manager returns singleton.""" + manager1 = get_default_manager() + manager2 = get_default_manager() + assert manager1 is manager2 + + def test_reset_manager(self): + """Test that reset_default_manager creates new instance.""" + manager1 = get_default_manager() + reset_default_manager() + manager2 = get_default_manager() + assert manager1 is not manager2 + + +class TestOperatorResolution: + """Test operator resolution logic.""" + + @pytest.fixture + def custom_manager(self): + """Create a custom manager with test implementations.""" + registry = OpRegistry() + + # Register test implementations + registry.register_impl(OpImpl( + op_name="test_op", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x * 2, + priority=BackendPriority.DEFAULT, + )) + + registry.register_impl(OpImpl( + op_name="test_op", + impl_id="reference.pytorch", + kind=BackendImplKind.REFERENCE, + fn=lambda x: x * 2 + 1, + priority=BackendPriority.REFERENCE, + )) + + registry.register_impl(OpImpl( + op_name="test_op", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=lambda x: x * 3, + vendor="CUDA", + priority=BackendPriority.VENDOR, + )) + + return OpManager(registry) + + @pytest.fixture(autouse=True) + def setup(self, reset_dispatch_manager, clean_env): + pass + + def test_resolve_selects_by_default_order(self, custom_manager): + """Test that resolve selects by default order (flagos first).""" + fn = custom_manager.resolve("test_op") + result = fn(10) + assert result == 20 # flagos: x * 2 + + def test_resolve_with_vendor_preference(self, custom_manager): + """Test resolution with vendor preference.""" + vendor_policy = SelectionPolicy(prefer=PREFER_VENDOR) + with policy_context(vendor_policy): + fn = custom_manager.resolve("test_op") + result = fn(10) + assert result == 30 # vendor: x * 3 + + def test_resolve_with_reference_preference(self, custom_manager): + """Test resolution with reference preference.""" + ref_policy = SelectionPolicy(prefer=PREFER_REFERENCE) + with policy_context(ref_policy): + fn = custom_manager.resolve("test_op") + result = fn(10) + assert result == 21 # reference: x * 2 + 1 + + def test_resolve_caches_result(self, custom_manager): + """Test that resolution is cached.""" + fn1 = custom_manager.resolve("test_op") + 
fn2 = custom_manager.resolve("test_op") + assert fn1 is fn2 + + def test_resolve_nonexistent_raises(self, custom_manager): + """Test that resolving non-existent op raises.""" + with pytest.raises(RuntimeError, match="No available implementation"): + custom_manager.resolve("nonexistent_op") + + +class TestPolicyBasedSelection: + """Test policy-based operator selection.""" + + @pytest.fixture + def manager_with_vendors(self): + """Create manager with multiple vendor implementations.""" + registry = OpRegistry() + + registry.register_impl(OpImpl( + op_name="multi_vendor_op", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=lambda: "cuda", + vendor="CUDA", + priority=100, + )) + + registry.register_impl(OpImpl( + op_name="multi_vendor_op", + impl_id="vendor.ascend", + kind=BackendImplKind.VENDOR, + fn=lambda: "ascend", + vendor="ASCEND", + priority=100, + )) + + registry.register_impl(OpImpl( + op_name="multi_vendor_op", + impl_id="reference.pytorch", + kind=BackendImplKind.REFERENCE, + fn=lambda: "reference", + priority=50, + )) + + return OpManager(registry) + + @pytest.fixture(autouse=True) + def setup(self, reset_dispatch_manager, clean_env): + pass + + def test_deny_vendor_excludes_implementation(self, manager_with_vendors): + """Test that denied vendors are excluded.""" + policy = SelectionPolicy.from_dict( + prefer=PREFER_VENDOR, + deny_vendors={"CUDA"}, + ) + with policy_context(policy): + fn = manager_with_vendors.resolve("multi_vendor_op") + result = fn() + assert result == "ascend" + + def test_allow_vendor_limits_selection(self, manager_with_vendors): + """Test that allow list limits vendor selection.""" + policy = SelectionPolicy.from_dict( + prefer=PREFER_VENDOR, + allow_vendors={"CUDA"}, + ) + with policy_context(policy): + fn = manager_with_vendors.resolve("multi_vendor_op") + result = fn() + assert result == "cuda" + + def test_per_op_order_overrides_default(self, manager_with_vendors): + """Test that per-op order overrides default.""" + policy = SelectionPolicy.from_dict( + prefer=PREFER_VENDOR, + per_op_order={"multi_vendor_op": ["reference"]}, + ) + with policy_context(policy): + fn = manager_with_vendors.resolve("multi_vendor_op") + result = fn() + assert result == "reference" + + +class TestFallbackMechanism: + """Test fallback when primary implementation fails.""" + + @pytest.fixture + def manager_with_failing_impl(self): + """Create manager with a failing primary implementation.""" + registry = OpRegistry() + + def failing_fn(): + raise RuntimeError("Primary failed!") + + registry.register_impl(OpImpl( + op_name="fallback_op", + impl_id="default.flagos", + kind=BackendImplKind.DEFAULT, + fn=failing_fn, + priority=BackendPriority.DEFAULT, + )) + + registry.register_impl(OpImpl( + op_name="fallback_op", + impl_id="reference.pytorch", + kind=BackendImplKind.REFERENCE, + fn=lambda: "fallback_success", + priority=BackendPriority.REFERENCE, + )) + + return OpManager(registry) + + @pytest.fixture(autouse=True) + def setup(self, reset_dispatch_manager, clean_env): + pass + + def test_fallback_on_primary_failure(self, manager_with_failing_impl): + """Test fallback to next implementation when primary fails.""" + # Enable fallback (VLLM_FL_STRICT != "0") + os.environ["VLLM_FL_STRICT"] = "1" + + result = manager_with_failing_impl.call("fallback_op") + assert result == "fallback_success" + + def test_failed_impl_tracked(self, manager_with_failing_impl): + """Test that failed implementations are tracked.""" + os.environ["VLLM_FL_STRICT"] = "1" + + 
manager_with_failing_impl.call("fallback_op") + + failed = manager_with_failing_impl.get_failed_impls("fallback_op") + assert "default.flagos" in failed.get("fallback_op", set()) + + def test_clear_failed_impls(self, manager_with_failing_impl): + """Test clearing failed implementations cache.""" + os.environ["VLLM_FL_STRICT"] = "1" + + manager_with_failing_impl.call("fallback_op") + manager_with_failing_impl.clear_failed_impls("fallback_op") + + failed = manager_with_failing_impl.get_failed_impls("fallback_op") + assert len(failed) == 0 + + +class TestConfigFileLoading: + """Test loading configuration from YAML file.""" + + @pytest.fixture(autouse=True) + def setup(self, reset_dispatch_manager, clean_env): + pass + + def test_load_config_from_yaml(self): + """Test loading policy from YAML config file.""" + config_content = """ +prefer: vendor +strict: true +allow_vendors: + - CUDA +deny_vendors: + - AMD +op_backends: + test_op: + - vendor + - reference +""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False + ) as f: + f.write(config_content) + config_path = f.name + + try: + os.environ["VLLM_FL_CONFIG"] = config_path + reset_global_policy() + + policy = get_policy() + + assert policy.prefer == "vendor" + assert policy.strict is True + assert "CUDA" in policy.allow_vendors + assert "AMD" in policy.deny_vendors + assert policy.get_per_op_order("test_op") == ["vendor", "reference"] + finally: + os.unlink(config_path) + os.environ.pop("VLLM_FL_CONFIG", None) + + +class TestContextManagers: + """Test policy context managers.""" + + @pytest.fixture(autouse=True) + def setup(self, reset_dispatch_manager, clean_env): + pass + + def test_with_preference_context(self): + """Test with_preference context manager.""" + original = get_policy() + assert original.prefer == PREFER_DEFAULT + + with with_preference(PREFER_VENDOR): + inside = get_policy() + assert inside.prefer == PREFER_VENDOR + + after = get_policy() + assert after.prefer == PREFER_DEFAULT + + def test_nested_contexts(self): + """Test nested policy contexts.""" + with with_preference(PREFER_VENDOR): + assert get_policy().prefer == PREFER_VENDOR + + with with_preference(PREFER_REFERENCE): + assert get_policy().prefer == PREFER_REFERENCE + + assert get_policy().prefer == PREFER_VENDOR + + assert get_policy().prefer == PREFER_DEFAULT diff --git a/tests/functional_tests/distributed/__init__.py b/tests/functional_tests/distributed/__init__.py new file mode 100644 index 0000000..7a95e8f --- /dev/null +++ b/tests/functional_tests/distributed/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 BAAI. All rights reserved. +"""Distributed functional tests.""" diff --git a/tests/functional_tests/distributed/test_collective_ops.py b/tests/functional_tests/distributed/test_collective_ops.py new file mode 100644 index 0000000..bb8e330 --- /dev/null +++ b/tests/functional_tests/distributed/test_collective_ops.py @@ -0,0 +1,209 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Functional tests for distributed collective operations. +Tests correctness of collective operations like all_reduce, reduce_scatter, etc. + +NOTE: These tests require multiple GPUs and a distributed environment. +They are designed to be run with pytest-mpi or similar multi-process test runners. 
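+
+For example (hypothetical invocation; the exact launcher depends on the
+environment)::
+
+    mpirun -np 2 pytest tests/functional_tests/distributed -m multi_gpu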
+""" + +import pytest +import torch +from typing import List +from unittest.mock import MagicMock, patch + + +# Mark all tests as requiring multiple GPUs +pytestmark = [pytest.mark.multi_gpu, pytest.mark.gpu] + + +class TestCollectiveOpsBasic: + """Basic tests for collective operations that can run without actual distributed setup.""" + + def test_communicator_fl_import(self): + """Test that CommunicatorFL can be imported.""" + try: + from vllm_fl.distributed.communicator import CommunicatorFL + assert CommunicatorFL is not None + except ImportError as e: + pytest.skip(f"CommunicatorFL not available: {e}") + + def test_pyflagcx_import(self): + """Test that PyFlagcxCommunicator can be imported.""" + try: + from vllm_fl.distributed.device_communicators.flagcx import PyFlagcxCommunicator + assert PyFlagcxCommunicator is not None + except ImportError as e: + pytest.skip(f"PyFlagcxCommunicator not available: {e}") + + +class TestAllReduceCorrectness: + """Test all_reduce operation correctness.""" + + @staticmethod + def reference_all_reduce(tensors: List[torch.Tensor]) -> torch.Tensor: + """Reference implementation of all_reduce (sum).""" + return sum(tensors) + + @pytest.mark.skipif( + not torch.cuda.is_available() or torch.cuda.device_count() < 2, + reason="Multiple GPUs not available" + ) + def test_all_reduce_sum_correctness(self): + """Test all_reduce sum produces correct results.""" + # This test would need actual distributed setup + # For now, test the reference implementation + tensors = [ + torch.tensor([1.0, 2.0, 3.0]), + torch.tensor([4.0, 5.0, 6.0]), + ] + expected = torch.tensor([5.0, 7.0, 9.0]) + result = self.reference_all_reduce(tensors) + assert torch.allclose(result, expected) + + +class TestReduceScatterCorrectness: + """Test reduce_scatter operation correctness.""" + + @staticmethod + def reference_reduce_scatter( + input_tensor: torch.Tensor, + world_size: int + ) -> List[torch.Tensor]: + """Reference implementation of reduce_scatter.""" + # Split input into chunks + chunks = input_tensor.chunk(world_size, dim=0) + # Each rank gets the reduced chunk at its position + return list(chunks) + + def test_reduce_scatter_reference(self): + """Test reference reduce_scatter implementation.""" + input_tensor = torch.tensor([ + [1.0, 2.0], + [3.0, 4.0], + [5.0, 6.0], + [7.0, 8.0], + ]) + world_size = 2 + + result = self.reference_reduce_scatter(input_tensor, world_size) + + assert len(result) == world_size + assert torch.allclose(result[0], torch.tensor([[1.0, 2.0], [3.0, 4.0]])) + assert torch.allclose(result[1], torch.tensor([[5.0, 6.0], [7.0, 8.0]])) + + +class TestAllGatherCorrectness: + """Test all_gather operation correctness.""" + + @staticmethod + def reference_all_gather(tensors: List[torch.Tensor]) -> torch.Tensor: + """Reference implementation of all_gather.""" + return torch.cat(tensors, dim=0) + + def test_all_gather_reference(self): + """Test reference all_gather implementation.""" + tensors = [ + torch.tensor([[1.0, 2.0]]), + torch.tensor([[3.0, 4.0]]), + ] + expected = torch.tensor([ + [1.0, 2.0], + [3.0, 4.0], + ]) + + result = self.reference_all_gather(tensors) + assert torch.allclose(result, expected) + + +class TestSendRecvCorrectness: + """Test point-to-point send/recv operations.""" + + @pytest.mark.skipif( + not torch.cuda.is_available() or torch.cuda.device_count() < 2, + reason="Multiple GPUs not available" + ) + def test_send_recv_mock(self): + """Test send/recv with mocked communicator.""" + # Create mock communicator + mock_comm = MagicMock() + 
mock_comm.disabled = False + + tensor = torch.randn(4, 8) + + # Simulate send + mock_comm.send(tensor, dst=1) + mock_comm.send.assert_called_once() + + # Simulate recv + mock_comm.recv.return_value = tensor.clone() + received = mock_comm.recv(tensor.shape, tensor.dtype, src=0) + assert received.shape == tensor.shape + + +class TestCommunicatorDisabled: + """Test communicator behavior when disabled.""" + + def test_disabled_all_reduce_returns_none(self): + """Test that disabled communicator all_reduce returns None.""" + mock_comm = MagicMock() + mock_comm.disabled = True + + def mock_all_reduce(tensor, out=None, op=None, stream=None): + if mock_comm.disabled: + return None + return torch.empty_like(tensor) + + mock_comm.all_reduce = mock_all_reduce + + result = mock_comm.all_reduce(torch.randn(4, 8)) + assert result is None + + def test_disabled_send_returns_early(self): + """Test that disabled communicator send returns early.""" + mock_comm = MagicMock() + mock_comm.disabled = True + + call_count = [0] + + def mock_send(tensor, dst, stream=None): + if mock_comm.disabled: + return + call_count[0] += 1 + # Would do actual send here + + mock_comm.send = mock_send + mock_comm.send(torch.randn(4, 8), dst=1) + + assert call_count[0] == 0 + + +class TestDistributedUtils: + """Test distributed utility functions.""" + + def test_world_size_calculation(self): + """Test world size calculation logic.""" + # Single GPU case + world_size_1 = 1 + assert world_size_1 == 1 + + # Multi-GPU case + world_size_4 = 4 + tp_size = 2 + pp_size = 2 + assert world_size_4 == tp_size * pp_size + + def test_rank_calculation(self): + """Test rank calculation in tensor/pipeline parallel.""" + world_size = 4 + tp_size = 2 + pp_size = 2 + + # Calculate expected ranks + for global_rank in range(world_size): + tp_rank = global_rank % tp_size + pp_rank = global_rank // tp_size + + assert 0 <= tp_rank < tp_size + assert 0 <= pp_rank < pp_size diff --git a/tests/flaggems/test_flaggems_get_ops.py b/tests/functional_tests/flaggems/test_flaggems_get_ops.py similarity index 100% rename from tests/flaggems/test_flaggems_get_ops.py rename to tests/functional_tests/flaggems/test_flaggems_get_ops.py diff --git a/tests/flaggems/test_gems_whitelist.py b/tests/functional_tests/flaggems/test_gems_whitelist.py similarity index 100% rename from tests/flaggems/test_gems_whitelist.py rename to tests/functional_tests/flaggems/test_gems_whitelist.py diff --git a/tests/functional_tests/ops/__init__.py b/tests/functional_tests/ops/__init__.py new file mode 100644 index 0000000..8ef0672 --- /dev/null +++ b/tests/functional_tests/ops/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2025 BAAI. All rights reserved. +"""Ops functional tests.""" diff --git a/tests/functional_tests/ops/test_ops_correctness.py b/tests/functional_tests/ops/test_ops_correctness.py new file mode 100644 index 0000000..4df2e39 --- /dev/null +++ b/tests/functional_tests/ops/test_ops_correctness.py @@ -0,0 +1,311 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Functional tests for ops correctness. +Tests numerical correctness of operator implementations +by comparing against reference PyTorch implementations. 
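+
+Every test follows the same compare-against-reference pattern, sketched
+here with the silu_and_mul operator exercised below::
+
+    ref = reference_silu_and_mul(x)       # pure-PyTorch reference
+    out = call_op("silu_and_mul", x)      # dispatched implementation
+    assert torch.allclose(out, ref, rtol=1e-3, atol=1e-3)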
+""" + +import pytest +import torch +import torch.nn.functional as F +from typing import Tuple + + +# Skip all tests in this module if GPU not available +pytestmark = pytest.mark.gpu + + +def allclose(a: torch.Tensor, b: torch.Tensor, rtol: float = 1e-3, atol: float = 1e-3) -> bool: + """Check if two tensors are close within tolerance.""" + return torch.allclose(a, b, rtol=rtol, atol=atol) + + +class TestSiluAndMulCorrectness: + """Test SiluAndMul operator correctness.""" + + @pytest.fixture + def test_shapes(self): + """Common test shapes for SiluAndMul.""" + return [ + (1, 64), + (4, 128), + (16, 256), + (32, 512), + (64, 1024), + ] + + @staticmethod + def reference_silu_and_mul(x: torch.Tensor) -> torch.Tensor: + """Reference implementation of SiluAndMul.""" + half = x.shape[-1] // 2 + return F.silu(x[..., :half]) * x[..., half:] + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_silu_and_mul_forward(self, test_shapes, device): + """Test SiluAndMul forward pass correctness.""" + try: + from vllm_fl.dispatch import call_op + except ImportError: + pytest.skip("vllm_fl.dispatch not available") + + for batch_size, hidden_size in test_shapes: + # Input must have even hidden size for SiluAndMul + x = torch.randn(batch_size, hidden_size * 2, device=device, dtype=torch.float32) + + # Get reference result + ref_result = self.reference_silu_and_mul(x) + + # Get FL result + try: + fl_result = call_op("silu_and_mul", x) + + # Check correctness + assert fl_result.shape == ref_result.shape, ( + f"Shape mismatch: {fl_result.shape} vs {ref_result.shape}" + ) + assert allclose(fl_result, ref_result), ( + f"Value mismatch for shape ({batch_size}, {hidden_size * 2})" + ) + except RuntimeError as e: + if "No available implementation" in str(e): + pytest.skip(f"silu_and_mul not registered: {e}") + raise + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_silu_and_mul_dtypes(self, device): + """Test SiluAndMul with different dtypes.""" + try: + from vllm_fl.dispatch import call_op + except ImportError: + pytest.skip("vllm_fl.dispatch not available") + + dtypes = [torch.float32, torch.float16, torch.bfloat16] + x_fp32 = torch.randn(4, 128, device=device, dtype=torch.float32) + + for dtype in dtypes: + x = x_fp32.to(dtype) + ref_result = self.reference_silu_and_mul(x) + + try: + fl_result = call_op("silu_and_mul", x) + # Use looser tolerance for half precision + tol = 1e-2 if dtype in [torch.float16, torch.bfloat16] else 1e-3 + assert allclose(fl_result, ref_result, rtol=tol, atol=tol), ( + f"Value mismatch for dtype {dtype}" + ) + except RuntimeError as e: + if "No available implementation" in str(e): + pytest.skip(f"silu_and_mul not registered for {dtype}: {e}") + raise + + +class TestRMSNormCorrectness: + """Test RMSNorm operator correctness.""" + + @pytest.fixture + def test_shapes(self): + """Common test shapes for RMSNorm.""" + return [ + (1, 64, 128), # (batch, seq, hidden) + (4, 32, 256), + (8, 16, 512), + ] + + @staticmethod + def reference_rms_norm( + x: torch.Tensor, + weight: torch.Tensor, + eps: float = 1e-6 + ) -> torch.Tensor: + """Reference implementation of RMSNorm.""" + variance = x.pow(2).mean(-1, keepdim=True) + x_normed = x * torch.rsqrt(variance + eps) + return x_normed * weight + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_rms_norm_forward(self, test_shapes, device): + """Test RMSNorm forward pass correctness.""" + try: + from vllm_fl.dispatch import 
call_op + except ImportError: + pytest.skip("vllm_fl.dispatch not available") + + eps = 1e-6 + for batch_size, seq_len, hidden_size in test_shapes: + x = torch.randn(batch_size, seq_len, hidden_size, device=device, dtype=torch.float32) + weight = torch.ones(hidden_size, device=device, dtype=torch.float32) + + # Get reference result + ref_result = self.reference_rms_norm(x, weight, eps) + + # Get FL result + try: + fl_result = call_op("rms_norm", x, None, weight, eps) + + # Handle tuple return (output, residual) + if isinstance(fl_result, tuple): + fl_result = fl_result[0] + + assert fl_result.shape == ref_result.shape, ( + f"Shape mismatch: {fl_result.shape} vs {ref_result.shape}" + ) + assert allclose(fl_result, ref_result), ( + f"Value mismatch for shape ({batch_size}, {seq_len}, {hidden_size})" + ) + except RuntimeError as e: + if "No available implementation" in str(e): + pytest.skip(f"rms_norm not registered: {e}") + raise + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_rms_norm_with_residual(self, device): + """Test RMSNorm with residual connection.""" + try: + from vllm_fl.dispatch import call_op + except ImportError: + pytest.skip("vllm_fl.dispatch not available") + + batch_size, seq_len, hidden_size = 4, 32, 256 + eps = 1e-6 + + x = torch.randn(batch_size, seq_len, hidden_size, device=device, dtype=torch.float32) + residual = torch.randn(batch_size, seq_len, hidden_size, device=device, dtype=torch.float32) + weight = torch.ones(hidden_size, device=device, dtype=torch.float32) + + try: + result = call_op("rms_norm", x, residual, weight, eps) + + # Should return tuple (normalized, updated_residual) when residual is provided + if isinstance(result, tuple): + normalized, updated_residual = result + assert normalized.shape == x.shape + assert updated_residual.shape == x.shape + except RuntimeError as e: + if "No available implementation" in str(e): + pytest.skip(f"rms_norm not registered: {e}") + raise + + +class TestRotaryEmbeddingCorrectness: + """Test RotaryEmbedding operator correctness.""" + + @staticmethod + def reference_rotary_embedding( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + positions: torch.Tensor, + rotary_interleaved: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation of rotary embedding.""" + + def rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotate half the hidden dims.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + # Gather cos/sin by positions + cos_pos = cos[positions] + sin_pos = sin[positions] + + # Add head dimension + while cos_pos.dim() < q.dim(): + cos_pos = cos_pos.unsqueeze(1) + sin_pos = sin_pos.unsqueeze(1) + + # Apply rotary embedding + q_embed = (q * cos_pos) + (rotate_half(q) * sin_pos) + k_embed = (k * cos_pos) + (rotate_half(k) * sin_pos) + + return q_embed, k_embed + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_rotary_embedding_basic(self, device): + """Test basic rotary embedding functionality.""" + try: + from vllm_fl.dispatch import call_op + except ImportError: + pytest.skip("vllm_fl.dispatch not available") + + num_tokens = 16 + num_heads = 8 + head_size = 64 + rotary_dim = head_size + max_position = 2048 + + # Create test inputs + q = torch.randn(num_tokens, num_heads, head_size, device=device, dtype=torch.float32) + k = torch.randn(num_tokens, num_heads, head_size, device=device, dtype=torch.float32) + 
positions = torch.arange(num_tokens, device=device) + + # Create cos/sin cache + inv_freq = 1.0 / (10000.0 ** (torch.arange(0, rotary_dim, 2, device=device).float() / rotary_dim)) + t = torch.arange(max_position, device=device).float() + freqs = torch.outer(t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + + try: + q_out, k_out = call_op( + "rotary_embedding", + q[..., :rotary_dim], + k[..., :rotary_dim], + cos, + sin, + positions, + False, # rotary_interleaved + False, # inplace + ) + + assert q_out.shape == q[..., :rotary_dim].shape + assert k_out.shape == k[..., :rotary_dim].shape + except RuntimeError as e: + if "No available implementation" in str(e): + pytest.skip(f"rotary_embedding not registered: {e}") + raise + + +class TestOpsEdgeCases: + """Test edge cases for operators.""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_empty_tensor_handling(self, device): + """Test handling of empty tensors.""" + try: + from vllm_fl.dispatch import call_op + except ImportError: + pytest.skip("vllm_fl.dispatch not available") + + # Create empty tensor + x = torch.empty(0, 64, device=device, dtype=torch.float32) + + # Some ops may handle empty tensors, others may raise + # This test documents the behavior + try: + result = call_op("silu_and_mul", x) + assert result.shape[0] == 0 + except (RuntimeError, ValueError): + # Empty tensor handling is implementation-dependent + pass + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + def test_large_batch_handling(self, device): + """Test handling of large batch sizes.""" + try: + from vllm_fl.dispatch import call_op + except ImportError: + pytest.skip("vllm_fl.dispatch not available") + + # Large batch + x = torch.randn(1024, 256, device=device, dtype=torch.float32) + + try: + result = call_op("silu_and_mul", x) + assert result.shape == (1024, 128) + except RuntimeError as e: + if "No available implementation" in str(e): + pytest.skip(f"silu_and_mul not registered: {e}") + raise From a1f5f7dde7d5e15cc287bcb8bf3d885e4f2d685b Mon Sep 17 00:00:00 2001 From: xmhubj Date: Thu, 5 Feb 2026 20:25:38 +0800 Subject: [PATCH 04/14] ignore build dir --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c6473d8..dd3b07b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.egg-info __pycache__/ +build/ From 23e849a788b56bec50db98eed8796b33fb1fa28a Mon Sep 17 00:00:00 2001 From: xmhubj Date: Thu, 5 Feb 2026 20:53:08 +0800 Subject: [PATCH 05/14] implement module-level lazy import --- vllm_fl/__init__.py | 7 +++++++ vllm_fl/distributed/__init__.py | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/vllm_fl/__init__.py b/vllm_fl/__init__.py index 4c5d40f..4bcc90b 100644 --- a/vllm_fl/__init__.py +++ b/vllm_fl/__init__.py @@ -9,6 +9,13 @@ logger = logging.getLogger(__name__) +def __getattr__(name): + if name == "distributed": + from vllm_fl import distributed + return distributed + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + def register(): """Register the FL platform.""" diff --git a/vllm_fl/distributed/__init__.py b/vllm_fl/distributed/__init__.py index e69de29..2d080b2 100644 --- a/vllm_fl/distributed/__init__.py +++ b/vllm_fl/distributed/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2025 BAAI. All rights reserved. 
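+# Submodules are resolved lazily via a module-level __getattr__ (PEP 562),
+# so importing this package does not import the communicator until it is
+# first accessed.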
+
+__all__ = ["communicator"]
+
+
+def __getattr__(name):
+    if name == "communicator":
+        from vllm_fl.distributed import communicator
+        return communicator
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

From a6b48a85fc0af30a0e5f00d5de4d8478c59800d0 Mon Sep 17 00:00:00 2001
From: xmhubj
Date: Fri, 6 Feb 2026 14:50:13 +0800
Subject: [PATCH 06/14] Require vllm 0.13.0 or above

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 6ae7876..d089b30 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+vllm>=0.13.0
 decorator
 pyyaml
 scipy

From 536df29730d65fd5b8c5166607568f68acca314c Mon Sep 17 00:00:00 2001
From: xmhubj
Date: Fri, 6 Feb 2026 15:10:18 +0800
Subject: [PATCH 07/14] Fix recursive import

---
 vllm_fl/__init__.py             | 6 ++++--
 vllm_fl/distributed/__init__.py | 9 ++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/vllm_fl/__init__.py b/vllm_fl/__init__.py
index 4bcc90b..8592294 100644
--- a/vllm_fl/__init__.py
+++ b/vllm_fl/__init__.py
@@ -11,8 +11,10 @@
 
 def __getattr__(name):
     if name == "distributed":
-        from vllm_fl import distributed
-        return distributed
+        import importlib
+        module = importlib.import_module(f".{name}", __name__)
+        globals()[name] = module
+        return module
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
diff --git a/vllm_fl/distributed/__init__.py b/vllm_fl/distributed/__init__.py
index 2d080b2..e215455 100644
--- a/vllm_fl/distributed/__init__.py
+++ b/vllm_fl/distributed/__init__.py
@@ -1,10 +1,13 @@
 # Copyright (c) 2025 BAAI. All rights reserved.
 
+import importlib
+
 __all__ = ["communicator"]
 
 
 def __getattr__(name):
-    if name == "communicator":
-        from vllm_fl.distributed import communicator
-        return communicator
+    if name in __all__:
+        module = importlib.import_module(f".{name}", __name__)
+        globals()[name] = module
+        return module
     raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

From 9292b31737403695f0d8d658bb0a3ac3d898cb40 Mon Sep 17 00:00:00 2001
From: xmhubj
Date: Fri, 6 Feb 2026 17:16:46 +0800
Subject: [PATCH 08/14] Optimize the unit tests

---
 requirements.txt                              |   2 +-
 tests/unit_tests/conftest.py                  | 123 ++++++++++
 tests/unit_tests/dispatch/test_discovery.py   |   8 +-
 .../distributed/test_communicator.py          | 100 +++++----
 tests/unit_tests/distributed/test_flagcx.py   | 180 ++++++++--------
 tests/unit_tests/ops/test_activation.py       |   7 +-
 tests/unit_tests/ops/test_rotary_embedding.py |  77 ++++++--
 tests/unit_tests/worker/test_model_runner.py  |  45 ++++-
 tests/unit_tests/worker/test_worker.py        |  33 +++-
 9 files changed, 400 insertions(+), 175 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index d089b30..705aa39 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-vllm>=0.13.0
+vllm==0.13.0
 decorator
 pyyaml
 scipy

diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py
index 8c29aac..8a0a386 100644
--- a/tests/unit_tests/conftest.py
+++ b/tests/unit_tests/conftest.py
@@ -2,11 +2,46 @@
 
 """
 Unit test fixtures and configuration.
+
+This module provides shared fixtures for all unit tests.
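+
+Fixtures are injected by name; a hypothetical consumer looks like::
+
+    def test_example(device, dtype_tensors):
+        x = dtype_tensors['float32'].to(device)
+        assert x.dtype == torch.float32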
""" +import os import pytest import torch +from unittest.mock import MagicMock, NonCallableMagicMock + + +# ============================================================================= +# Environment Detection Helpers +# ============================================================================= + +def has_cuda(): + """Check if CUDA is available.""" + return torch.cuda.is_available() + + +def has_flagcx(): + """Check if flagcx is available.""" + flagcx_path = os.getenv('FLAGCX_PATH') + if not flagcx_path: + return False + lib_path = os.path.join(flagcx_path, "build/lib/libflagcx.so") + return os.path.exists(lib_path) + + +def has_vllm_profiler(): + """Check if vllm profiler is available.""" + try: + from vllm.profiler.wrapper import TorchProfilerWrapper + return True + except ImportError: + return False + +# ============================================================================= +# Basic Fixtures +# ============================================================================= @pytest.fixture def mock_tensor(): @@ -20,3 +55,91 @@ def device(): if torch.cuda.is_available(): return torch.device("cuda") return torch.device("cpu") + + +@pytest.fixture +def cpu_device(): + """Always return CPU device.""" + return torch.device("cpu") + + +# ============================================================================= +# Mock Factory Fixtures +# ============================================================================= + +@pytest.fixture +def mock_module(): + """ + Create a mock that behaves like a Python module. + + Use this when you need to mock a module object that: + - Is not callable + - May have specific attributes + """ + return NonCallableMagicMock(spec=['__name__', '__file__']) + + +@pytest.fixture +def mock_module_with_register(): + """ + Create a mock module with a register function. + + Useful for testing plugin discovery. + """ + module = NonCallableMagicMock(spec=['register']) + module.register = MagicMock() + return module + + +@pytest.fixture +def mock_process_group(): + """ + Create a mock torch distributed ProcessGroup. + + Useful for testing distributed communication. 
+ """ + group = MagicMock() + group.rank.return_value = 0 + group.size.return_value = 1 + return group + + +# ============================================================================= +# Tensor Fixtures +# ============================================================================= + +@pytest.fixture +def batch_tensors(): + """Create a batch of tensors for testing.""" + return { + 'small': torch.randn(2, 8), + 'medium': torch.randn(4, 16, 32), + 'large': torch.randn(8, 32, 64, 128), + } + + +@pytest.fixture +def dtype_tensors(): + """Create tensors with different dtypes.""" + return { + 'float32': torch.randn(2, 4, dtype=torch.float32), + 'float16': torch.randn(2, 4, dtype=torch.float16), + 'bfloat16': torch.randn(2, 4, dtype=torch.bfloat16), + } + + +# ============================================================================= +# Pytest Markers +# ============================================================================= + +def pytest_configure(config): + """Register custom markers.""" + config.addinivalue_line( + "markers", "gpu: marks tests as requiring GPU" + ) + config.addinivalue_line( + "markers", "flagcx: marks tests as requiring flagcx" + ) + config.addinivalue_line( + "markers", "slow: marks tests as slow" + ) diff --git a/tests/unit_tests/dispatch/test_discovery.py b/tests/unit_tests/dispatch/test_discovery.py index 67f1773..c6633cd 100644 --- a/tests/unit_tests/dispatch/test_discovery.py +++ b/tests/unit_tests/dispatch/test_discovery.py @@ -6,7 +6,7 @@ import os import pytest -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, NonCallableMagicMock from vllm_fl.dispatch.discovery import ( discover_plugins, @@ -30,7 +30,7 @@ def test_direct_callable(self): def test_module_with_register_function(self): registry = MagicMock() - module = MagicMock() + module = NonCallableMagicMock(spec=["register"]) # Only has register attr module.register = MagicMock() result = _call_register_function(module, registry, "test") @@ -40,7 +40,7 @@ def test_module_with_register_function(self): def test_module_with_vllm_fl_register(self): registry = MagicMock() - module = MagicMock(spec=["vllm_fl_register"]) + module = NonCallableMagicMock(spec=["vllm_fl_register"]) # Only has vllm_fl_register attr module.vllm_fl_register = MagicMock() result = _call_register_function(module, registry, "test") @@ -58,7 +58,7 @@ def test_callable_raises_exception(self): def test_no_register_function(self): registry = MagicMock() - module = MagicMock(spec=[]) # No register function + module = NonCallableMagicMock(spec=[]) # Not callable, no register function result = _call_register_function(module, registry, "test") diff --git a/tests/unit_tests/distributed/test_communicator.py b/tests/unit_tests/distributed/test_communicator.py index 14bd380..79021eb 100644 --- a/tests/unit_tests/distributed/test_communicator.py +++ b/tests/unit_tests/distributed/test_communicator.py @@ -4,69 +4,73 @@ Tests for distributed communicator module. 
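+
+These tests are gated on the FLAGCX_PATH environment variable (see the
+has_flagcx helper below); for example (hypothetical install path)::
+
+    export FLAGCX_PATH=/opt/flagcx
+    pytest tests/unit_tests/distributed/test_communicator.py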
""" +import os import pytest import torch -from unittest.mock import patch, MagicMock, PropertyMock +from unittest.mock import MagicMock + + +def has_flagcx(): + """Check if flagcx is available.""" + flagcx_path = os.getenv('FLAGCX_PATH') + if not flagcx_path: + return False + lib_path = os.path.join(flagcx_path, "build/lib/libflagcx.so") + return os.path.exists(lib_path) + + +# Skip all tests if flagcx is not available (communicator depends on it) +pytestmark = pytest.mark.skipif( + not has_flagcx(), + reason="FLAGCX_PATH not set or flagcx library not found" +) class TestCommunicatorFL: - @pytest.fixture - def mock_pyflagcx(self): - with patch("vllm_fl.distributed.communicator.PyFlagcxCommunicator") as mock: - yield mock - - @pytest.fixture - def mock_base_communicator(self): - with patch( - "vllm_fl.distributed.communicator.DeviceCommunicatorBase.__init__" - ) as mock: - mock.return_value = None - yield mock + """Test CommunicatorFL class.""" def test_import(self): + """Test that CommunicatorFL can be imported.""" from vllm_fl.distributed.communicator import CommunicatorFL assert CommunicatorFL is not None - def test_init_single_worker_no_pyflagcx(self, mock_base_communicator, mock_pyflagcx): + def test_class_inherits_from_base(self): + """Test that CommunicatorFL inherits from DeviceCommunicatorBase.""" from vllm_fl.distributed.communicator import CommunicatorFL + from vllm.distributed.device_communicators.base_device_communicator import ( + DeviceCommunicatorBase + ) + assert issubclass(CommunicatorFL, DeviceCommunicatorBase) - # Mock the base class attributes - with patch.object(CommunicatorFL, "world_size", new_callable=PropertyMock) as mock_ws: - mock_ws.return_value = 1 + def test_class_has_required_methods(self): + """Test that CommunicatorFL has all required methods.""" + from vllm_fl.distributed.communicator import CommunicatorFL - with patch.object(CommunicatorFL, "use_all2all", new_callable=PropertyMock) as mock_a2a: - mock_a2a.return_value = False + required_methods = [ + 'all_reduce', + 'reduce_scatter', + 'send', + 'recv', + 'destroy', + ] - cpu_group = MagicMock() - device = torch.device("cpu") + for method in required_methods: + assert hasattr(CommunicatorFL, method), f"Missing method: {method}" - comm = CommunicatorFL.__new__(CommunicatorFL) - comm.world_size = 1 - comm.use_all2all = False - comm.cpu_group = cpu_group - comm.device = device - comm.pyflagcx_comm = None + def test_instance_attributes_single_worker(self): + """Test instance attributes for single worker scenario.""" + from vllm_fl.distributed.communicator import CommunicatorFL - # Single worker should not create pyflagcx communicator - assert comm.pyflagcx_comm is None + # Create instance without calling __init__ to test attribute access + comm = CommunicatorFL.__new__(CommunicatorFL) + # Manually set attributes that would be set by parent class + comm.world_size = 1 + comm.use_all2all = False + comm.cpu_group = MagicMock() + comm.device = torch.device("cpu") + comm.pyflagcx_comm = None -class TestPyFlagcxCommunicator: - def test_import(self): - # Test that the module can be imported (may fail if flagcx not available) - try: - from vllm_fl.distributed.device_communicators.flagcx import PyFlagcxCommunicator - assert PyFlagcxCommunicator is not None - except ImportError: - pytest.skip("flagcx not available") - - def test_disabled_communicator_returns_none(self): - """Test that disabled communicator methods return None/early exit.""" - # Create a mock disabled communicator - mock_comm = MagicMock() - 
mock_comm.disabled = True - mock_comm.all_reduce.return_value = None - - # When disabled, all_reduce should return None - result = mock_comm.all_reduce(torch.randn(2, 4)) - assert result is None + # Verify attributes + assert comm.world_size == 1 + assert comm.pyflagcx_comm is None diff --git a/tests/unit_tests/distributed/test_flagcx.py b/tests/unit_tests/distributed/test_flagcx.py index 4b85f16..0c659c3 100644 --- a/tests/unit_tests/distributed/test_flagcx.py +++ b/tests/unit_tests/distributed/test_flagcx.py @@ -2,123 +2,97 @@ """ Tests for flagcx communicator module. + +Note: Most tests require FLAGCX_PATH environment variable to be set. +Tests are skipped if flagcx is not available. """ +import os import pytest import torch -from unittest.mock import patch, MagicMock - - -class TestPyFlagcxCommunicator: - """Test PyFlagcxCommunicator class.""" - - @pytest.fixture - def mock_flagcx_library(self): - """Mock the FLAGCXLibrary.""" - with patch( - "vllm_fl.distributed.device_communicators.flagcx.FLAGCXLibrary" - ) as mock: - yield mock - - def test_world_size_one_disabled(self): - """Test that communicator is disabled for world_size=1.""" - # When world_size is 1, the communicator should be disabled - mock_group = MagicMock() - mock_group.rank = 0 - mock_group.world_size = 1 - - # Create a mock communicator with world_size=1 - comm = MagicMock() - comm.world_size = 1 - comm.available = False - comm.disabled = True - - assert comm.disabled is True - assert comm.available is False - - def test_all_reduce_disabled_returns_none(self): - """Test that all_reduce returns None when disabled.""" - comm = MagicMock() - comm.disabled = True - # Simulate the actual behavior - def mock_all_reduce(tensor, out_tensor=None, op=None, stream=None): - if comm.disabled: - return None - return torch.empty_like(tensor) - comm.all_reduce = mock_all_reduce - result = comm.all_reduce(torch.randn(2, 4)) - assert result is None +def has_flagcx(): + """Check if flagcx is available.""" + flagcx_path = os.getenv('FLAGCX_PATH') + if not flagcx_path: + return False + lib_path = os.path.join(flagcx_path, "build/lib/libflagcx.so") + return os.path.exists(lib_path) - def test_send_disabled_early_return(self): - """Test that send returns early when disabled.""" - comm = MagicMock() - comm.disabled = True - call_count = [0] +# Mark all tests in this module as requiring flagcx +pytestmark = pytest.mark.skipif( + not has_flagcx(), + reason="FLAGCX_PATH not set or flagcx library not found" +) - def mock_send(tensor, dst, stream=None): - if comm.disabled: - return - call_count[0] += 1 - comm.send = mock_send - comm.send(torch.randn(2, 4), dst=1) - - # Should return early without doing anything - assert call_count[0] == 0 - - def test_recv_disabled_early_return(self): - """Test that recv returns early when disabled.""" - comm = MagicMock() - comm.disabled = True - - call_count = [0] - - def mock_recv(tensor, src, stream=None): - if comm.disabled: - return - call_count[0] += 1 - - comm.recv = mock_recv - comm.recv(torch.randn(2, 4), src=0) - - # Should return early without doing anything - assert call_count[0] == 0 - - def test_reduce_scatter_disabled_early_return(self): - """Test that reduce_scatter returns early when disabled.""" - comm = MagicMock() - comm.disabled = True +class TestPyFlagcxCommunicator: + """Test PyFlagcxCommunicator class.""" - call_count = [0] + def test_import(self): + """Test that the module can be imported when flagcx is available.""" + from vllm_fl.distributed.device_communicators.flagcx import 
PyFlagcxCommunicator + assert PyFlagcxCommunicator is not None - def mock_reduce_scatter(output_tensor, input_tensor, op=None, stream=None): - if comm.disabled: - return - call_count[0] += 1 + def test_class_has_required_methods(self): + """Test that PyFlagcxCommunicator has all required methods.""" + from vllm_fl.distributed.device_communicators.flagcx import PyFlagcxCommunicator - comm.reduce_scatter = mock_reduce_scatter - comm.reduce_scatter(torch.randn(2, 4), torch.randn(4, 4)) + required_methods = [ + 'all_reduce', + 'all_gather', + 'reduce_scatter', + 'send', + 'recv', + 'broadcast', + 'group_start', + 'group_end', + ] - assert call_count[0] == 0 + for method in required_methods: + assert hasattr(PyFlagcxCommunicator, method), f"Missing method: {method}" class TestFlagcxDataTypes: - """Test flagcx data type mappings.""" - - def test_torch_dtype_mapping_concept(self): - """Test the concept of torch dtype to flagcx dtype mapping.""" - dtype_map = { - torch.float32: "FLOAT", - torch.float16: "HALF", - torch.bfloat16: "BFLOAT16", - torch.int32: "INT32", - torch.int64: "INT64", - } - - # Verify common dtypes are mappable - for torch_dtype, flagcx_name in dtype_map.items(): - assert torch_dtype is not None - assert isinstance(flagcx_name, str) + """Test flagcx data type related functionality.""" + + def test_flagcx_dtype_enum_import(self): + """Test that flagcxDataTypeEnum can be imported.""" + from plugin.interservice.flagcx_wrapper import flagcxDataTypeEnum + assert flagcxDataTypeEnum is not None + + def test_flagcx_dtype_from_torch(self): + """Test torch dtype to flagcx dtype conversion.""" + from plugin.interservice.flagcx_wrapper import flagcxDataTypeEnum + + # Test common dtypes + test_dtypes = [ + torch.float32, + torch.float16, + torch.bfloat16, + ] + + for dtype in test_dtypes: + # Should not raise + result = flagcxDataTypeEnum.from_torch(dtype) + assert result is not None + + +class TestFlagcxReduceOps: + """Test flagcx reduce operation types.""" + + def test_flagcx_reduce_op_enum_import(self): + """Test that flagcxRedOpTypeEnum can be imported.""" + from plugin.interservice.flagcx_wrapper import flagcxRedOpTypeEnum + assert flagcxRedOpTypeEnum is not None + + def test_flagcx_reduce_op_from_torch(self): + """Test torch ReduceOp to flagcx reduce op conversion.""" + from plugin.interservice.flagcx_wrapper import flagcxRedOpTypeEnum + from torch.distributed import ReduceOp + + # Test SUM operation + result = flagcxRedOpTypeEnum.from_torch(ReduceOp.SUM) + assert result is not None diff --git a/tests/unit_tests/ops/test_activation.py b/tests/unit_tests/ops/test_activation.py index fe830f3..5b697b0 100644 --- a/tests/unit_tests/ops/test_activation.py +++ b/tests/unit_tests/ops/test_activation.py @@ -15,11 +15,16 @@ def mock_call_op(self): with patch("vllm_fl.ops.activation.call_op") as mock: yield mock + @pytest.fixture + def mock_parent_init(self): + with patch("vllm_fl.ops.activation.SiluAndMul.__init__", return_value=None): + yield + def test_import(self): from vllm_fl.ops.activation import SiluAndMulFL assert SiluAndMulFL is not None - def test_forward_oot_calls_dispatch(self, mock_call_op): + def test_forward_oot_calls_dispatch(self, mock_parent_init, mock_call_op): from vllm_fl.ops.activation import SiluAndMulFL mock_call_op.return_value = torch.randn(2, 4) diff --git a/tests/unit_tests/ops/test_rotary_embedding.py b/tests/unit_tests/ops/test_rotary_embedding.py index 02f0568..eddeba3 100644 --- a/tests/unit_tests/ops/test_rotary_embedding.py +++ 
b/tests/unit_tests/ops/test_rotary_embedding.py @@ -10,37 +10,88 @@ class TestRotaryEmbeddingFL: + """Test RotaryEmbeddingFL class.""" + @pytest.fixture def mock_call_op(self): + """Mock the call_op function.""" with patch("vllm_fl.ops.rotary_embedding.call_op") as mock: yield mock + @pytest.fixture + def mock_parent_init(self): + """Mock the parent class __init__ to avoid vllm C++ dependencies.""" + with patch("vllm_fl.ops.rotary_embedding.RotaryEmbedding.__init__", return_value=None): + yield + def test_import(self): + """Test that RotaryEmbeddingFL can be imported.""" from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL assert RotaryEmbeddingFL is not None - def test_init_params(self): + def test_class_exists(self): + """Test that RotaryEmbeddingFL is a class.""" + from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL + assert isinstance(RotaryEmbeddingFL, type) + + def test_has_forward_oot_method(self): + """Test that RotaryEmbeddingFL has forward_oot method.""" from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL + assert hasattr(RotaryEmbeddingFL, 'forward_oot') + assert callable(getattr(RotaryEmbeddingFL, 'forward_oot')) + + def test_init_calls_parent(self, mock_parent_init): + """Test that __init__ calls parent class.""" + from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL + + layer = RotaryEmbeddingFL( + head_size=64, + rotary_dim=32, + max_position_embeddings=2048, + base=10000.0, + is_neox_style=True, + dtype=torch.float32, + ) - head_size = 64 - rotary_dim = 32 - max_position_embeddings = 2048 - base = 10000.0 + # Instance should be created + assert layer is not None + + def test_forward_oot_calls_dispatch(self, mock_parent_init, mock_call_op): + """Test that forward_oot calls the dispatch call_op.""" + from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL + # Create layer with mocked parent layer = RotaryEmbeddingFL( - head_size=head_size, - rotary_dim=rotary_dim, - max_position_embeddings=max_position_embeddings, - base=base, + head_size=64, + rotary_dim=32, + max_position_embeddings=2048, + base=10000.0, is_neox_style=True, dtype=torch.float32, ) - assert layer.head_size == head_size - assert layer.rotary_dim == rotary_dim - assert layer.max_position_embeddings == max_position_embeddings - assert layer.is_neox_style is True + # Manually set attributes that parent __init__ would set + layer.head_size = 64 + layer.rotary_dim = 32 + layer.is_neox_style = True + layer.cos_sin_cache = torch.randn(2048, 64) + + # Setup mock return value + mock_call_op.return_value = (torch.randn(4, 8, 32), torch.randn(4, 8, 32)) + + # Call forward_oot + positions = torch.tensor([0, 1, 2, 3]) + query = torch.randn(4, 8, 64) + key = torch.randn(4, 8, 64) + + result = layer.forward_oot(positions, query, key) + + # Verify call_op was called with "rotary_embedding" + mock_call_op.assert_called_once() + call_args = mock_call_op.call_args + assert call_args[0][0] == "rotary_embedding" def test_all_exports(self): + """Test that __all__ contains RotaryEmbeddingFL.""" from vllm_fl.ops.rotary_embedding import __all__ assert "RotaryEmbeddingFL" in __all__ diff --git a/tests/unit_tests/worker/test_model_runner.py b/tests/unit_tests/worker/test_model_runner.py index 3ede95d..2d48257 100644 --- a/tests/unit_tests/worker/test_model_runner.py +++ b/tests/unit_tests/worker/test_model_runner.py @@ -2,22 +2,59 @@ """ Tests for model runner module. + +Note: These tests require vllm >= 0.13.0 with full installation. 
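+
+The structural checks below reduce to assertions of the form (sketch)::
+
+    from vllm_fl.worker.model_runner import ModelRunnerFL
+    assert hasattr(ModelRunnerFL, "execute_model")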
""" import pytest from unittest.mock import patch, MagicMock +def has_vllm_model_runner(): + """Check if vllm model runner dependencies are available.""" + try: + from vllm_fl.worker.model_runner import ModelRunnerFL + return True + except ImportError: + return False + + +# Skip all tests if vllm model runner is not available +pytestmark = pytest.mark.skipif( + not has_vllm_model_runner(), + reason="vllm_fl.worker.model_runner not available" +) + + class TestModelRunnerFL: """Test ModelRunnerFL class.""" def test_import(self): + """Test that ModelRunnerFL can be imported.""" from vllm_fl.worker.model_runner import ModelRunnerFL assert ModelRunnerFL is not None - def test_class_inheritance(self): - """Test that ModelRunnerFL inherits from expected base classes.""" + def test_is_class(self): + """Test that ModelRunnerFL is a class.""" from vllm_fl.worker.model_runner import ModelRunnerFL - - # Check that it's a class assert isinstance(ModelRunnerFL, type) + + def test_inherits_from_mixins(self): + """Test that ModelRunnerFL inherits from expected mixins.""" + from vllm_fl.worker.model_runner import ModelRunnerFL + from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin + + # ModelRunnerFL uses mixins, not GPUModelRunner + assert issubclass(ModelRunnerFL, LoRAModelRunnerMixin) + + def test_has_load_model_method(self): + """Test that ModelRunnerFL has load_model method.""" + from vllm_fl.worker.model_runner import ModelRunnerFL + assert hasattr(ModelRunnerFL, 'load_model') + assert callable(getattr(ModelRunnerFL, 'load_model')) + + def test_has_execute_model_method(self): + """Test that ModelRunnerFL has execute_model method.""" + from vllm_fl.worker.model_runner import ModelRunnerFL + assert hasattr(ModelRunnerFL, 'execute_model') + assert callable(getattr(ModelRunnerFL, 'execute_model')) diff --git a/tests/unit_tests/worker/test_worker.py b/tests/unit_tests/worker/test_worker.py index baccd0e..461b664 100644 --- a/tests/unit_tests/worker/test_worker.py +++ b/tests/unit_tests/worker/test_worker.py @@ -2,24 +2,45 @@ """ Tests for worker module. + +Note: These tests require vllm >= 0.13.0 with profiler support. 
""" import pytest from unittest.mock import patch, MagicMock +def has_vllm_profiler(): + """Check if vllm profiler is available.""" + try: + from vllm.profiler.wrapper import TorchProfilerWrapper + return True + except ImportError: + return False + + +# Skip all tests if vllm profiler is not available +pytestmark = pytest.mark.skipif( + not has_vllm_profiler(), + reason="vllm.profiler.wrapper not available (requires vllm >= 0.13.0)" +) + + class TestWorkerFL: """Test WorkerFL class.""" def test_import(self): + """Test that WorkerFL can be imported.""" from vllm_fl.worker.worker import WorkerFL assert WorkerFL is not None def test_memory_snapshot_import(self): + """Test that MemorySnapshot can be imported.""" from vllm_fl.worker.worker import MemorySnapshot assert MemorySnapshot is not None def test_memory_profiling_result_import(self): + """Test that MemoryProfilingResult can be imported.""" from vllm_fl.worker.worker import MemoryProfilingResult assert MemoryProfilingResult is not None @@ -28,6 +49,7 @@ class TestMemorySnapshot: """Test MemorySnapshot dataclass.""" def test_default_values(self): + """Test that MemorySnapshot has correct default values.""" from vllm_fl.worker.worker import MemorySnapshot # Create with auto_measure=False to avoid actual GPU calls @@ -41,6 +63,7 @@ def test_default_values(self): assert snapshot.non_torch_memory == 0 def test_subtraction(self): + """Test MemorySnapshot subtraction operator.""" from vllm_fl.worker.worker import MemorySnapshot snapshot1 = MemorySnapshot(auto_measure=False) @@ -74,6 +97,7 @@ class TestMemoryProfilingResult: """Test MemoryProfilingResult dataclass.""" def test_default_values(self): + """Test that MemoryProfilingResult has correct default values.""" from vllm_fl.worker.worker import MemoryProfilingResult result = MemoryProfilingResult() @@ -85,7 +109,8 @@ def test_default_values(self): assert result.profile_time == 0.0 def test_before_profile_defaults(self): - from vllm_fl.worker.worker import MemoryProfilingResult, MemorySnapshot + """Test that MemoryProfilingResult creates default snapshots.""" + from vllm_fl.worker.worker import MemoryProfilingResult result = MemoryProfilingResult() @@ -98,5 +123,11 @@ class TestInitWorkerDistributedEnvironment: """Test init_worker_distributed_environment function.""" def test_import(self): + """Test that init_worker_distributed_environment can be imported.""" from vllm_fl.worker.worker import init_worker_distributed_environment assert init_worker_distributed_environment is not None + + def test_is_callable(self): + """Test that init_worker_distributed_environment is callable.""" + from vllm_fl.worker.worker import init_worker_distributed_environment + assert callable(init_worker_distributed_environment) From 68b14730d729c597b365f16ea3d2ae97390ee34c Mon Sep 17 00:00:00 2001 From: xmhubj Date: Fri, 6 Feb 2026 18:00:38 +0800 Subject: [PATCH 09/14] add numerical correctness tests --- tests/unit_tests/dispatch/test_call_op.py | 398 +++++++++++ tests/unit_tests/dispatch/test_manager.py | 795 ++++++++++++++++++++++ tests/unit_tests/ops/test_numerical.py | 588 ++++++++++++++++ 3 files changed, 1781 insertions(+) create mode 100644 tests/unit_tests/dispatch/test_call_op.py create mode 100644 tests/unit_tests/dispatch/test_manager.py create mode 100644 tests/unit_tests/ops/test_numerical.py diff --git a/tests/unit_tests/dispatch/test_call_op.py b/tests/unit_tests/dispatch/test_call_op.py new file mode 100644 index 0000000..0e76084 --- /dev/null +++ b/tests/unit_tests/dispatch/test_call_op.py @@ -0,0 
+1,398 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for dispatch call_op and resolve_op convenience functions. + +This module tests the high-level dispatch API exposed through the +dispatch module's __init__.py, ensuring the full dispatch pipeline +works correctly from call_op -> manager -> registry -> implementation. +""" + +import os +import pytest +from unittest.mock import patch, MagicMock + +from vllm_fl.dispatch import ( + call_op, + resolve_op, + get_default_manager, + reset_default_manager, + OpRegistry, + OpImpl, + BackendImplKind, + BackendPriority, + SelectionPolicy, + set_global_policy, + reset_global_policy, + policy_context, + with_preference, + PREFER_DEFAULT, + PREFER_VENDOR, + PREFER_REFERENCE, +) + + +class TestCallOp: + """Test call_op convenience function.""" + + @pytest.fixture(autouse=True) + def reset_all(self): + """Reset global state before and after each test.""" + reset_default_manager() + reset_global_policy() + yield + reset_default_manager() + reset_global_policy() + + @pytest.fixture + def setup_test_op(self): + """Setup a test operator in the registry.""" + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + def impl_fn(x, multiplier=2): + return x * multiplier + + impl = OpImpl( + op_name="test_call_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=impl_fn, + ) + manager.registry.register_impl(impl) + return impl_fn + + def test_call_op_basic(self, setup_test_op): + result = call_op("test_call_op", 5) + assert result == 10 + + def test_call_op_with_kwargs(self, setup_test_op): + result = call_op("test_call_op", 5, multiplier=3) + assert result == 15 + + def test_call_op_nonexistent_raises(self): + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + with pytest.raises(RuntimeError, match="No available implementation"): + call_op("nonexistent_op", 1) + + def test_call_op_uses_default_manager(self): + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + call_tracker = {"called": False} + + def tracking_fn(x): + call_tracker["called"] = True + return x + + manager.registry.register_impl(OpImpl( + op_name="track_op", + impl_id="default.track", + kind=BackendImplKind.DEFAULT, + fn=tracking_fn, + )) + + call_op("track_op", 1) + assert call_tracker["called"] is True + + +class TestResolveOp: + """Test resolve_op convenience function.""" + + @pytest.fixture(autouse=True) + def reset_all(self): + reset_default_manager() + reset_global_policy() + yield + reset_default_manager() + reset_global_policy() + + @pytest.fixture + def setup_test_op(self): + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + def impl_fn(x): + return x * 2 + + impl = OpImpl( + op_name="test_resolve_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=impl_fn, + ) + manager.registry.register_impl(impl) + return impl_fn + + def test_resolve_op_returns_function(self, setup_test_op): + fn = resolve_op("test_resolve_op") + assert callable(fn) + assert fn is setup_test_op + + def test_resolve_op_can_be_called(self, setup_test_op): + fn = resolve_op("test_resolve_op") + result = fn(5) + assert result == 10 + + def test_resolve_op_nonexistent_raises(self): + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + with pytest.raises(RuntimeError, match="No 
available implementation"): + resolve_op("nonexistent_op") + + +class TestCallOpWithPolicy: + """Test call_op behavior with different policies.""" + + @pytest.fixture(autouse=True) + def reset_all(self): + reset_default_manager() + reset_global_policy() + yield + reset_default_manager() + reset_global_policy() + + @pytest.fixture + def setup_multi_impl_op(self): + """Setup an operator with multiple implementations.""" + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + results = {"default": 0, "vendor": 0, "reference": 0} + + def default_fn(x): + results["default"] += 1 + return x * 2 + + def vendor_fn(x): + results["vendor"] += 1 + return x * 3 + + def reference_fn(x): + results["reference"] += 1 + return x * 4 + + manager.registry.register_impl(OpImpl( + op_name="policy_op", + impl_id="default.impl", + kind=BackendImplKind.DEFAULT, + fn=default_fn, + priority=BackendPriority.DEFAULT, + )) + manager.registry.register_impl(OpImpl( + op_name="policy_op", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=vendor_fn, + priority=BackendPriority.VENDOR, + vendor="CUDA", + )) + manager.registry.register_impl(OpImpl( + op_name="policy_op", + impl_id="reference.pytorch", + kind=BackendImplKind.REFERENCE, + fn=reference_fn, + priority=BackendPriority.REFERENCE, + )) + + return results + + def test_call_op_default_policy_uses_default(self, setup_multi_impl_op): + results = setup_multi_impl_op + + result = call_op("policy_op", 5) + + assert result == 10 # default_fn: x * 2 + assert results["default"] == 1 + assert results["vendor"] == 0 + assert results["reference"] == 0 + + def test_call_op_vendor_policy(self, setup_multi_impl_op): + results = setup_multi_impl_op + + set_global_policy(SelectionPolicy(prefer=PREFER_VENDOR)) + # Clear cache after policy change + get_default_manager().bump_policy_epoch() + + result = call_op("policy_op", 5) + + assert result == 15 # vendor_fn: x * 3 + assert results["vendor"] == 1 + + def test_call_op_reference_policy(self, setup_multi_impl_op): + results = setup_multi_impl_op + + set_global_policy(SelectionPolicy(prefer=PREFER_REFERENCE)) + get_default_manager().bump_policy_epoch() + + result = call_op("policy_op", 5) + + assert result == 20 # reference_fn: x * 4 + assert results["reference"] == 1 + + def test_call_op_with_policy_context(self, setup_multi_impl_op): + results = setup_multi_impl_op + + # Default call + result1 = call_op("policy_op", 5) + assert result1 == 10 + assert results["default"] == 1 + + # With vendor preference context + with with_preference("vendor"): + get_default_manager().bump_policy_epoch() + result2 = call_op("policy_op", 5) + assert result2 == 15 + assert results["vendor"] == 1 + + # Back to default after context + get_default_manager().bump_policy_epoch() + result3 = call_op("policy_op", 5) + assert result3 == 10 + assert results["default"] == 2 + + +class TestCallOpIntegration: + """Integration tests for the full dispatch pipeline.""" + + @pytest.fixture(autouse=True) + def reset_all(self): + reset_default_manager() + reset_global_policy() + yield + reset_default_manager() + reset_global_policy() + + def test_full_pipeline_with_multiple_ops(self): + """Test calling multiple different operators.""" + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + # Register multiple operators + manager.registry.register_impl(OpImpl( + op_name="add_op", + impl_id="default.add", + kind=BackendImplKind.DEFAULT, + fn=lambda 
x, y: x + y, + )) + manager.registry.register_impl(OpImpl( + op_name="mul_op", + impl_id="default.mul", + kind=BackendImplKind.DEFAULT, + fn=lambda x, y: x * y, + )) + manager.registry.register_impl(OpImpl( + op_name="sub_op", + impl_id="default.sub", + kind=BackendImplKind.DEFAULT, + fn=lambda x, y: x - y, + )) + + # Call each operator + assert call_op("add_op", 2, 3) == 5 + assert call_op("mul_op", 2, 3) == 6 + assert call_op("sub_op", 5, 3) == 2 + + def test_resolve_and_call_consistency(self): + """Test that resolve_op and call_op give consistent results.""" + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + manager.registry.register_impl(OpImpl( + op_name="consistent_op", + impl_id="default.impl", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x * 10, + )) + + # Both should give same result + fn = resolve_op("consistent_op") + result1 = fn(5) + result2 = call_op("consistent_op", 5) + + assert result1 == result2 == 50 + + @patch.dict(os.environ, {"VLLM_FL_STRICT": "1"}) + def test_fallback_chain(self): + """Test fallback from failed impl to successful one.""" + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + call_sequence = [] + + def failing_impl(x): + call_sequence.append("failing") + raise RuntimeError("Intentional failure") + + def success_impl(x): + call_sequence.append("success") + return x * 2 + + manager.registry.register_impl(OpImpl( + op_name="fallback_chain_op", + impl_id="default.failing", + kind=BackendImplKind.DEFAULT, + fn=failing_impl, + priority=200, + )) + manager.registry.register_impl(OpImpl( + op_name="fallback_chain_op", + impl_id="reference.success", + kind=BackendImplKind.REFERENCE, + fn=success_impl, + priority=100, + )) + + result = call_op("fallback_chain_op", 5) + + assert result == 10 + assert call_sequence == ["failing", "success"] + + def test_vendor_filtering(self): + """Test that vendor filtering works through call_op.""" + manager = get_default_manager() + manager._state.initialized = True + manager._state.init_pid = os.getpid() + + manager.registry.register_impl(OpImpl( + op_name="vendor_filter_op", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=lambda x: x * 2, + vendor="CUDA", + )) + manager.registry.register_impl(OpImpl( + op_name="vendor_filter_op", + impl_id="vendor.amd", + kind=BackendImplKind.VENDOR, + fn=lambda x: x * 3, + vendor="AMD", + )) + manager.registry.register_impl(OpImpl( + op_name="vendor_filter_op", + impl_id="reference.pytorch", + kind=BackendImplKind.REFERENCE, + fn=lambda x: x * 4, + )) + + # Deny AMD, prefer vendor -> should use CUDA + set_global_policy(SelectionPolicy( + prefer=PREFER_VENDOR, + deny_vendors=frozenset({"AMD"}) + )) + manager.bump_policy_epoch() + + result = call_op("vendor_filter_op", 5) + assert result == 10 # CUDA: x * 2 diff --git a/tests/unit_tests/dispatch/test_manager.py b/tests/unit_tests/dispatch/test_manager.py new file mode 100644 index 0000000..e5a5261 --- /dev/null +++ b/tests/unit_tests/dispatch/test_manager.py @@ -0,0 +1,795 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Tests for dispatch manager module. 
+ +This module tests the core OpManager class which handles: +- Operator resolution and selection +- Dispatch caching with policy epoch invalidation +- Fallback mechanisms for failed implementations +- Multi-process safety (fork handling) +""" + +import os +import pytest +import threading +from unittest.mock import patch, MagicMock + +from vllm_fl.dispatch.manager import ( + OpManager, + get_default_manager, + reset_default_manager, + _OpManagerState, +) +from vllm_fl.dispatch.registry import OpRegistry +from vllm_fl.dispatch.types import OpImpl, BackendImplKind, BackendPriority +from vllm_fl.dispatch.policy import ( + SelectionPolicy, + set_global_policy, + reset_global_policy, + PREFER_DEFAULT, + PREFER_VENDOR, + PREFER_REFERENCE, +) + + +class TestOpManagerState: + """Test _OpManagerState dataclass.""" + + def test_default_values(self): + state = _OpManagerState() + assert state.init_pid == -1 + assert state.initialized is False + assert state.policy_epoch == 0 + + +class TestOpManagerBasic: + """Test basic OpManager functionality.""" + + @pytest.fixture + def registry(self): + return OpRegistry() + + @pytest.fixture + def manager(self, registry): + return OpManager(registry=registry) + + @pytest.fixture + def sample_impl(self): + return OpImpl( + op_name="test_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x * 2, + priority=BackendPriority.DEFAULT, + ) + + @pytest.fixture + def reference_impl(self): + return OpImpl( + op_name="test_op", + impl_id="reference.test", + kind=BackendImplKind.REFERENCE, + fn=lambda x: x * 2, + priority=BackendPriority.REFERENCE, + ) + + @pytest.fixture + def vendor_impl(self): + return OpImpl( + op_name="test_op", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=lambda x: x * 3, + priority=BackendPriority.VENDOR, + vendor="CUDA", + ) + + def test_init_with_custom_registry(self, registry): + manager = OpManager(registry=registry) + assert manager.registry is registry + + def test_init_creates_default_registry(self): + manager = OpManager() + assert manager.registry is not None + assert isinstance(manager.registry, OpRegistry) + + def test_registry_property(self, manager, registry): + assert manager.registry is registry + + +class TestOpManagerPolicyEpoch: + """Test policy epoch and cache invalidation.""" + + @pytest.fixture(autouse=True) + def reset_policy(self): + reset_global_policy() + yield + reset_global_policy() + + @pytest.fixture + def manager(self): + return OpManager() + + def test_bump_policy_epoch(self, manager): + initial_epoch = manager._state.policy_epoch + manager.bump_policy_epoch() + assert manager._state.policy_epoch == initial_epoch + 1 + + def test_bump_policy_epoch_clears_cache(self, manager): + # Add something to cache + manager._dispatch_cache[("test", "fp", 0)] = lambda x: x + assert len(manager._dispatch_cache) == 1 + + manager.bump_policy_epoch() + assert len(manager._dispatch_cache) == 0 + + def test_bump_policy_epoch_clears_failed_impls(self, manager): + manager._failed_impls["test_op"] = {"impl1", "impl2"} + manager.bump_policy_epoch() + assert len(manager._failed_impls) == 0 + + +class TestOpManagerFailedImpls: + """Test failed implementation tracking.""" + + @pytest.fixture + def manager(self): + return OpManager() + + def test_clear_failed_impls_all(self, manager): + manager._failed_impls["op1"] = {"impl1"} + manager._failed_impls["op2"] = {"impl2"} + + manager.clear_failed_impls() + assert manager._failed_impls == {} + + def test_clear_failed_impls_specific_op(self, manager): + 
manager._failed_impls["op1"] = {"impl1"} + manager._failed_impls["op2"] = {"impl2"} + + manager.clear_failed_impls("op1") + assert "op1" not in manager._failed_impls + assert "op2" in manager._failed_impls + + def test_clear_failed_impls_nonexistent_op(self, manager): + manager._failed_impls["op1"] = {"impl1"} + manager.clear_failed_impls("nonexistent") + assert "op1" in manager._failed_impls + + def test_get_failed_impls_all(self, manager): + manager._failed_impls["op1"] = {"impl1", "impl2"} + manager._failed_impls["op2"] = {"impl3"} + + result = manager.get_failed_impls() + assert result == {"op1": {"impl1", "impl2"}, "op2": {"impl3"}} + + def test_get_failed_impls_specific_op(self, manager): + manager._failed_impls["op1"] = {"impl1"} + manager._failed_impls["op2"] = {"impl2"} + + result = manager.get_failed_impls("op1") + assert result == {"op1": {"impl1"}} + + def test_get_failed_impls_returns_copy(self, manager): + manager._failed_impls["op1"] = {"impl1"} + result = manager.get_failed_impls() + + # Modify the result + result["op1"].add("impl2") + + # Original should not be modified + assert manager._failed_impls["op1"] == {"impl1"} + + +class TestOpManagerResolve: + """Test operator resolution logic.""" + + @pytest.fixture(autouse=True) + def reset_policy(self): + reset_global_policy() + yield + reset_global_policy() + + @pytest.fixture + def registry(self): + return OpRegistry() + + @pytest.fixture + def manager(self, registry): + mgr = OpManager(registry=registry) + # Mark as initialized to skip builtin registration + mgr._state.initialized = True + mgr._state.init_pid = os.getpid() + return mgr + + def test_resolve_single_impl(self, manager, registry): + impl = OpImpl( + op_name="single_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x * 2, + ) + registry.register_impl(impl) + + fn = manager.resolve("single_op") + assert fn is impl.fn + + def test_resolve_prefers_default_by_policy(self, manager, registry): + default_fn = lambda x: x * 2 + reference_fn = lambda x: x * 3 + + registry.register_impl(OpImpl( + op_name="multi_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=default_fn, + priority=BackendPriority.DEFAULT, + )) + registry.register_impl(OpImpl( + op_name="multi_op", + impl_id="reference.test", + kind=BackendImplKind.REFERENCE, + fn=reference_fn, + priority=BackendPriority.REFERENCE, + )) + + # Default policy prefers "flagos" (DEFAULT) + fn = manager.resolve("multi_op") + assert fn is default_fn + + def test_resolve_prefers_vendor_with_policy(self, manager, registry): + default_fn = lambda x: x * 2 + vendor_fn = lambda x: x * 3 + + registry.register_impl(OpImpl( + op_name="vendor_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=default_fn, + )) + registry.register_impl(OpImpl( + op_name="vendor_op", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=vendor_fn, + vendor="CUDA", + )) + + # Set policy to prefer vendor + set_global_policy(SelectionPolicy(prefer=PREFER_VENDOR)) + + fn = manager.resolve("vendor_op") + assert fn is vendor_fn + + def test_resolve_prefers_reference_with_policy(self, manager, registry): + default_fn = lambda x: x * 2 + reference_fn = lambda x: x * 3 + + registry.register_impl(OpImpl( + op_name="ref_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=default_fn, + )) + registry.register_impl(OpImpl( + op_name="ref_op", + impl_id="reference.test", + kind=BackendImplKind.REFERENCE, + fn=reference_fn, + )) + + # Set policy to prefer reference + 
set_global_policy(SelectionPolicy(prefer=PREFER_REFERENCE)) + + fn = manager.resolve("ref_op") + assert fn is reference_fn + + def test_resolve_filters_denied_vendors(self, manager, registry): + default_fn = lambda x: x * 2 + vendor_fn = lambda x: x * 3 + + registry.register_impl(OpImpl( + op_name="deny_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=default_fn, + )) + registry.register_impl(OpImpl( + op_name="deny_op", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=vendor_fn, + vendor="CUDA", + )) + + # Deny CUDA vendor, prefer vendor + set_global_policy(SelectionPolicy( + prefer=PREFER_VENDOR, + deny_vendors=frozenset({"CUDA"}) + )) + + # Should fall back to default since CUDA is denied + fn = manager.resolve("deny_op") + assert fn is default_fn + + def test_resolve_filters_by_allow_vendors(self, manager, registry): + vendor_cuda_fn = lambda x: x * 2 + vendor_amd_fn = lambda x: x * 3 + + registry.register_impl(OpImpl( + op_name="allow_op", + impl_id="vendor.cuda", + kind=BackendImplKind.VENDOR, + fn=vendor_cuda_fn, + vendor="CUDA", + )) + registry.register_impl(OpImpl( + op_name="allow_op", + impl_id="vendor.amd", + kind=BackendImplKind.VENDOR, + fn=vendor_amd_fn, + vendor="AMD", + )) + + # Only allow CUDA vendor + set_global_policy(SelectionPolicy( + prefer=PREFER_VENDOR, + allow_vendors=frozenset({"CUDA"}) + )) + + fn = manager.resolve("allow_op") + assert fn is vendor_cuda_fn + + def test_resolve_no_impl_raises(self, manager): + with pytest.raises(RuntimeError, match="No available implementation"): + manager.resolve("nonexistent_op") + + def test_resolve_caches_result(self, manager, registry): + impl = OpImpl( + op_name="cache_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x, + ) + registry.register_impl(impl) + + # First call + fn1 = manager.resolve("cache_op") + + # Second call should return cached result + fn2 = manager.resolve("cache_op") + + assert fn1 is fn2 + assert len(manager._dispatch_cache) == 1 + + def test_resolve_cache_invalidated_by_policy_change(self, manager, registry): + impl = OpImpl( + op_name="epoch_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x, + ) + registry.register_impl(impl) + + # First resolve + manager.resolve("epoch_op") + assert len(manager._dispatch_cache) == 1 + + # Bump epoch (simulates policy change) + manager.bump_policy_epoch() + assert len(manager._dispatch_cache) == 0 + + def test_resolve_filters_unavailable_impls(self, manager, registry): + available_fn = lambda x: x * 2 + unavailable_fn = lambda x: x * 3 + + # Create an unavailable implementation + def check_available(): + return False + unavailable_fn._is_available = check_available + + registry.register_impl(OpImpl( + op_name="avail_op", + impl_id="default.unavailable", + kind=BackendImplKind.DEFAULT, + fn=unavailable_fn, + priority=200, # Higher priority but unavailable + )) + registry.register_impl(OpImpl( + op_name="avail_op", + impl_id="default.available", + kind=BackendImplKind.DEFAULT, + fn=available_fn, + priority=100, + )) + + fn = manager.resolve("avail_op") + assert fn is available_fn + + +class TestOpManagerCall: + """Test operator call with fallback support.""" + + @pytest.fixture(autouse=True) + def reset_policy(self): + reset_global_policy() + yield + reset_global_policy() + + @pytest.fixture + def registry(self): + return OpRegistry() + + @pytest.fixture + def manager(self, registry): + mgr = OpManager(registry=registry) + mgr._state.initialized = True + mgr._state.init_pid = 
os.getpid() + return mgr + + def test_call_invokes_implementation(self, manager, registry): + result_value = [0] + + def impl_fn(x): + result_value[0] = x * 2 + return result_value[0] + + registry.register_impl(OpImpl( + op_name="call_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=impl_fn, + )) + + result = manager.call("call_op", 5) + assert result == 10 + assert result_value[0] == 10 + + def test_call_passes_args_and_kwargs(self, manager, registry): + def impl_fn(a, b, c=10): + return a + b + c + + registry.register_impl(OpImpl( + op_name="args_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=impl_fn, + )) + + result = manager.call("args_op", 1, 2, c=3) + assert result == 6 + + @patch.dict(os.environ, {"VLLM_FL_STRICT": "1"}) + def test_call_fallback_on_failure(self, manager, registry): + call_order = [] + + def failing_fn(x): + call_order.append("failing") + raise RuntimeError("Primary failed") + + def fallback_fn(x): + call_order.append("fallback") + return x * 2 + + registry.register_impl(OpImpl( + op_name="fallback_op", + impl_id="default.primary", + kind=BackendImplKind.DEFAULT, + fn=failing_fn, + priority=200, + )) + registry.register_impl(OpImpl( + op_name="fallback_op", + impl_id="reference.fallback", + kind=BackendImplKind.REFERENCE, + fn=fallback_fn, + priority=100, + )) + + result = manager.call("fallback_op", 5) + assert result == 10 + assert call_order == ["failing", "fallback"] + + @patch.dict(os.environ, {"VLLM_FL_STRICT": "1"}) + def test_call_tracks_failed_impls(self, manager, registry): + def failing_fn(x): + raise RuntimeError("Failed") + + def success_fn(x): + return x + + registry.register_impl(OpImpl( + op_name="track_op", + impl_id="default.failing", + kind=BackendImplKind.DEFAULT, + fn=failing_fn, + priority=200, + )) + registry.register_impl(OpImpl( + op_name="track_op", + impl_id="reference.success", + kind=BackendImplKind.REFERENCE, + fn=success_fn, + priority=100, + )) + + manager.call("track_op", 1) + + # Check that failed impl is tracked + failed = manager.get_failed_impls("track_op") + assert "default.failing" in failed["track_op"] + + @patch.dict(os.environ, {"VLLM_FL_STRICT": "1"}) + def test_call_all_impls_fail_raises(self, manager, registry): + def failing_fn1(x): + raise RuntimeError("Failed 1") + + def failing_fn2(x): + raise RuntimeError("Failed 2") + + registry.register_impl(OpImpl( + op_name="allfail_op", + impl_id="default.fail1", + kind=BackendImplKind.DEFAULT, + fn=failing_fn1, + )) + registry.register_impl(OpImpl( + op_name="allfail_op", + impl_id="reference.fail2", + kind=BackendImplKind.REFERENCE, + fn=failing_fn2, + )) + + with pytest.raises(RuntimeError, match="implementation.*failed"): + manager.call("allfail_op", 1) + + @patch.dict(os.environ, {"VLLM_FL_STRICT": "0"}) + def test_call_no_fallback_when_disabled(self, manager, registry): + def failing_fn(x): + raise RuntimeError("Primary failed") + + def fallback_fn(x): + return x * 2 + + registry.register_impl(OpImpl( + op_name="nofallback_op", + impl_id="default.primary", + kind=BackendImplKind.DEFAULT, + fn=failing_fn, + priority=200, + )) + registry.register_impl(OpImpl( + op_name="nofallback_op", + impl_id="reference.fallback", + kind=BackendImplKind.REFERENCE, + fn=fallback_fn, + priority=100, + )) + + # Should raise immediately without trying fallback + with pytest.raises(RuntimeError, match="Primary failed"): + manager.call("nofallback_op", 5) + + +class TestOpManagerResolveCandidates: + """Test resolve_candidates method.""" + + 
@pytest.fixture(autouse=True) + def reset_policy(self): + reset_global_policy() + yield + reset_global_policy() + + @pytest.fixture + def registry(self): + return OpRegistry() + + @pytest.fixture + def manager(self, registry): + mgr = OpManager(registry=registry) + mgr._state.initialized = True + mgr._state.init_pid = os.getpid() + return mgr + + def test_resolve_candidates_returns_sorted_list(self, manager, registry): + fn1 = lambda x: x + fn2 = lambda x: x + fn3 = lambda x: x + + registry.register_impl(OpImpl( + op_name="multi_op", + impl_id="default.impl", + kind=BackendImplKind.DEFAULT, + fn=fn1, + priority=BackendPriority.DEFAULT, + )) + registry.register_impl(OpImpl( + op_name="multi_op", + impl_id="vendor.impl", + kind=BackendImplKind.VENDOR, + fn=fn2, + priority=BackendPriority.VENDOR, + vendor="CUDA", + )) + registry.register_impl(OpImpl( + op_name="multi_op", + impl_id="reference.impl", + kind=BackendImplKind.REFERENCE, + fn=fn3, + priority=BackendPriority.REFERENCE, + )) + + candidates = manager.resolve_candidates("multi_op") + + # Default policy: flagos > vendor > reference + assert len(candidates) == 3 + assert candidates[0].impl_id == "default.impl" + assert candidates[1].impl_id == "vendor.impl" + assert candidates[2].impl_id == "reference.impl" + + def test_resolve_candidates_respects_policy_order(self, manager, registry): + fn1 = lambda x: x + fn2 = lambda x: x + + registry.register_impl(OpImpl( + op_name="order_op", + impl_id="default.impl", + kind=BackendImplKind.DEFAULT, + fn=fn1, + )) + registry.register_impl(OpImpl( + op_name="order_op", + impl_id="reference.impl", + kind=BackendImplKind.REFERENCE, + fn=fn2, + )) + + # Set policy to prefer reference + set_global_policy(SelectionPolicy(prefer=PREFER_REFERENCE)) + + candidates = manager.resolve_candidates("order_op") + + # Reference should come first + assert candidates[0].impl_id == "reference.impl" + assert candidates[1].impl_id == "default.impl" + + def test_resolve_candidates_no_impl_raises(self, manager): + with pytest.raises(RuntimeError, match="No available implementation"): + manager.resolve_candidates("nonexistent") + + +class TestOpManagerGetSelectedImplId: + """Test get_selected_impl_id method.""" + + @pytest.fixture(autouse=True) + def reset_policy(self): + reset_global_policy() + yield + reset_global_policy() + + @pytest.fixture + def registry(self): + return OpRegistry() + + @pytest.fixture + def manager(self, registry): + mgr = OpManager(registry=registry) + mgr._state.initialized = True + mgr._state.init_pid = os.getpid() + return mgr + + def test_get_selected_impl_id(self, manager, registry): + fn = lambda x: x + + registry.register_impl(OpImpl( + op_name="id_op", + impl_id="default.test", + kind=BackendImplKind.DEFAULT, + fn=fn, + )) + + impl_id = manager.get_selected_impl_id("id_op") + assert impl_id == "default.test" + + +class TestOpManagerThreadSafety: + """Test thread safety of OpManager.""" + + @pytest.fixture + def registry(self): + return OpRegistry() + + @pytest.fixture + def manager(self, registry): + mgr = OpManager(registry=registry) + mgr._state.initialized = True + mgr._state.init_pid = os.getpid() + return mgr + + def test_concurrent_resolve(self, manager, registry): + # Register multiple implementations + for i in range(5): + registry.register_impl(OpImpl( + op_name=f"thread_op_{i}", + impl_id=f"default.impl_{i}", + kind=BackendImplKind.DEFAULT, + fn=lambda x, i=i: x + i, + )) + + errors = [] + results = [] + + def resolve_op(op_idx): + try: + fn = manager.resolve(f"thread_op_{op_idx}") 
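+                # exercise the resolved fn so a stale or broken cache entry fails loudly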
+ results.append((op_idx, fn(10))) + except Exception as e: + errors.append(e) + + threads = [ + threading.Thread(target=resolve_op, args=(i % 5,)) + for i in range(20) + ] + + for t in threads: + t.start() + for t in threads: + t.join() + + assert len(errors) == 0 + assert len(results) == 20 + + def test_concurrent_bump_policy_epoch(self, manager, registry): + registry.register_impl(OpImpl( + op_name="epoch_test", + impl_id="default.impl", + kind=BackendImplKind.DEFAULT, + fn=lambda x: x, + )) + + errors = [] + + def bump_and_resolve(): + try: + manager.bump_policy_epoch() + manager.resolve("epoch_test") + except Exception as e: + errors.append(e) + + threads = [ + threading.Thread(target=bump_and_resolve) + for _ in range(10) + ] + + for t in threads: + t.start() + for t in threads: + t.join() + + assert len(errors) == 0 + + +class TestGlobalDefaultManager: + """Test global default manager functions.""" + + @pytest.fixture(autouse=True) + def reset_manager(self): + reset_default_manager() + yield + reset_default_manager() + + def test_get_default_manager_singleton(self): + manager1 = get_default_manager() + manager2 = get_default_manager() + assert manager1 is manager2 + + def test_reset_default_manager(self): + manager1 = get_default_manager() + reset_default_manager() + manager2 = get_default_manager() + assert manager1 is not manager2 + + def test_get_default_manager_creates_instance(self): + manager = get_default_manager() + assert isinstance(manager, OpManager) diff --git a/tests/unit_tests/ops/test_numerical.py b/tests/unit_tests/ops/test_numerical.py new file mode 100644 index 0000000..83511cf --- /dev/null +++ b/tests/unit_tests/ops/test_numerical.py @@ -0,0 +1,588 @@ +# Copyright (c) 2025 BAAI. All rights reserved. + +""" +Numerical correctness tests for operator implementations. + +This module tests that operator implementations produce numerically correct +results by comparing against reference (PyTorch) implementations. + +These tests verify: +- Output shape correctness +- Numerical accuracy within tolerance +- Handling of edge cases (zeros, large values, etc.) +- Different dtypes (float32, float16, bfloat16) +""" + +import pytest +import torch +import torch.nn.functional as F + + +# ============================================================================= +# Reference Implementations (PyTorch baseline) +# ============================================================================= + +def reference_silu_and_mul(x: torch.Tensor) -> torch.Tensor: + """Reference SiLU and multiply implementation.""" + d = x.shape[-1] // 2 + x1, x2 = x[..., :d], x[..., d:] + return F.silu(x1) * x2 + + +def reference_rms_norm( + x: torch.Tensor, + weight: torch.Tensor, + epsilon: float, + residual: torch.Tensor = None, +): + """Reference RMS normalization implementation.""" + if residual is not None: + x = x + residual + new_residual = x.clone() + + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + epsilon) + output = weight * x + + if residual is not None: + return output, new_residual + return output + + +def reference_rotary_embedding( + query: torch.Tensor, + key: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool = True, +): + """ + Reference rotary position embedding implementation. + + Applies rotary position embedding to query and key tensors. 
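+
+    Note: this baseline assumes ``cos``/``sin`` are already gathered per
+    position and broadcastable against ``query``/``key`` along the last
+    dimension (i.e. expanded to the full rotated width).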
+ """ + def apply_rotary(x, cos, sin, is_neox_style): + if is_neox_style: + # GPT-NeoX style: split at half + d = x.shape[-1] // 2 + x1, x2 = x[..., :d], x[..., d:] + rotated = torch.cat([-x2, x1], dim=-1) + else: + # GPT-J style: interleaved + x1 = x[..., ::2] + x2 = x[..., 1::2] + rotated = torch.stack([-x2, x1], dim=-1).flatten(-2) + + return x * cos + rotated * sin + + q_embed = apply_rotary(query, cos, sin, is_neox_style) + k_embed = apply_rotary(key, cos, sin, is_neox_style) + + return q_embed, k_embed + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +@pytest.fixture(params=[torch.float32, torch.float16]) +def dtype(request): + """Test with different floating point dtypes.""" + return request.param + + +@pytest.fixture(params=[(2, 128), (4, 256), (8, 512)]) +def hidden_size_config(request): + """Different batch and hidden size configurations.""" + return request.param + + +@pytest.fixture +def device(): + """Get available device (CPU for unit tests).""" + return torch.device("cpu") + + +# ============================================================================= +# SiLU and Multiply Tests +# ============================================================================= + +class TestSiluAndMulNumerical: + """Numerical correctness tests for SiLU and multiply operation.""" + + def test_basic_correctness(self, device): + """Test basic numerical correctness.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + x = torch.randn(2, 16, device=device, dtype=torch.float32) + + result = silu_and_mul_torch(x) + expected = reference_silu_and_mul(x) + + torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-5) + + def test_output_shape(self, hidden_size_config, device): + """Test that output shape is correct (half of input).""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + batch, hidden = hidden_size_config + x = torch.randn(batch, hidden * 2, device=device) + + result = silu_and_mul_torch(x) + + assert result.shape == (batch, hidden) + + def test_dtype_preservation(self, dtype, device): + """Test that dtype is preserved.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + x = torch.randn(2, 16, device=device, dtype=dtype) + + result = silu_and_mul_torch(x) + + assert result.dtype == dtype + + def test_zero_input(self, device): + """Test with zero input tensor.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + x = torch.zeros(2, 16, device=device) + + result = silu_and_mul_torch(x) + expected = reference_silu_and_mul(x) + + torch.testing.assert_close(result, expected) + + def test_large_values(self, device): + """Test with large input values.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + x = torch.randn(2, 16, device=device) * 100 + + result = silu_and_mul_torch(x) + expected = reference_silu_and_mul(x) + + torch.testing.assert_close(result, expected, rtol=1e-4, atol=1e-4) + + def test_negative_values(self, device): + """Test with negative input values.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + x = -torch.abs(torch.randn(2, 16, device=device)) + + result = silu_and_mul_torch(x) + expected = reference_silu_and_mul(x) + + torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-5) + + def 
test_3d_input(self, device): + """Test with 3D input tensor.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + x = torch.randn(2, 4, 16, device=device) + + result = silu_and_mul_torch(x) + expected = reference_silu_and_mul(x) + + assert result.shape == (2, 4, 8) + torch.testing.assert_close(result, expected) + + +# ============================================================================= +# RMS Normalization Tests +# ============================================================================= + +class TestRMSNormNumerical: + """Numerical correctness tests for RMS normalization.""" + + def test_basic_correctness(self, device): + """Test basic numerical correctness.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + hidden_size = 128 + x = torch.randn(2, hidden_size, device=device, dtype=torch.float32) + weight = torch.ones(hidden_size, device=device, dtype=torch.float32) + epsilon = 1e-5 + + result = rms_norm_torch(x, None, weight, epsilon) + expected = reference_rms_norm(x, weight, epsilon) + + torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-5) + + def test_with_residual(self, device): + """Test RMS norm with residual connection.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + hidden_size = 128 + x = torch.randn(2, hidden_size, device=device, dtype=torch.float32) + residual = torch.randn(2, hidden_size, device=device, dtype=torch.float32) + weight = torch.ones(hidden_size, device=device, dtype=torch.float32) + epsilon = 1e-5 + + result_out, result_res = rms_norm_torch(x, residual, weight, epsilon) + expected_out, expected_res = reference_rms_norm(x, weight, epsilon, residual) + + torch.testing.assert_close(result_out, expected_out, rtol=1e-5, atol=1e-5) + torch.testing.assert_close(result_res, expected_res, rtol=1e-5, atol=1e-5) + + def test_output_shape(self, hidden_size_config, device): + """Test that output shape matches input shape.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + batch, hidden = hidden_size_config + x = torch.randn(batch, hidden, device=device) + weight = torch.ones(hidden, device=device) + epsilon = 1e-5 + + result = rms_norm_torch(x, None, weight, epsilon) + + assert result.shape == x.shape + + def test_dtype_preservation(self, dtype, device): + """Test that dtype is preserved.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + hidden_size = 128 + x = torch.randn(2, hidden_size, device=device, dtype=dtype) + weight = torch.ones(hidden_size, device=device, dtype=dtype) + epsilon = 1e-5 + + result = rms_norm_torch(x, None, weight, epsilon) + + assert result.dtype == dtype + + def test_normalization_effect(self, device): + """Test that normalization actually normalizes the tensor.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + hidden_size = 128 + # Create input with known variance + x = torch.randn(2, hidden_size, device=device) * 10 + weight = torch.ones(hidden_size, device=device) + epsilon = 1e-5 + + result = rms_norm_torch(x, None, weight, epsilon) + + # After RMS norm, the RMS should be approximately 1 + rms = result.pow(2).mean(-1).sqrt() + torch.testing.assert_close( + rms, + torch.ones_like(rms), + rtol=0.1, + atol=0.1 + ) + + def test_epsilon_effect(self, device): + """Test that epsilon prevents division by zero.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import 
rms_norm_torch + + hidden_size = 128 + x = torch.zeros(2, hidden_size, device=device) + weight = torch.ones(hidden_size, device=device) + epsilon = 1e-5 + + # Should not raise or produce NaN/Inf + result = rms_norm_torch(x, None, weight, epsilon) + + assert not torch.isnan(result).any() + assert not torch.isinf(result).any() + + def test_weight_scaling(self, device): + """Test that weight properly scales the output.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + hidden_size = 128 + x = torch.randn(2, hidden_size, device=device) + weight1 = torch.ones(hidden_size, device=device) + weight2 = torch.ones(hidden_size, device=device) * 2 + epsilon = 1e-5 + + result1 = rms_norm_torch(x, None, weight1, epsilon) + result2 = rms_norm_torch(x, None, weight2, epsilon) + + # Result with weight=2 should be twice result with weight=1 + torch.testing.assert_close(result2, result1 * 2, rtol=1e-5, atol=1e-5) + + def test_3d_input(self, device): + """Test with 3D input tensor (batch, seq, hidden).""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + x = torch.randn(2, 4, 128, device=device) + weight = torch.ones(128, device=device) + epsilon = 1e-5 + + result = rms_norm_torch(x, None, weight, epsilon) + expected = reference_rms_norm(x, weight, epsilon) + + assert result.shape == x.shape + torch.testing.assert_close(result, expected) + + +# ============================================================================= +# Rotary Embedding Tests +# ============================================================================= + +class TestRotaryEmbeddingNumerical: + """Numerical correctness tests for rotary position embedding.""" + + def test_basic_correctness_4d(self, device): + """Test basic numerical correctness with 4D tensors.""" + from vllm_fl.dispatch.backends.reference.impl.rotary import rotary_embedding_torch + + batch, num_heads, seq_len, head_dim = 2, 4, 8, 64 + max_seq_len = 16 + rotary_dim = head_dim // 2 + + # [batch, num_heads, seq_len, head_dim] + query = torch.randn(batch, num_heads, seq_len, head_dim, device=device) + key = torch.randn(batch, num_heads, seq_len, head_dim, device=device) + + # Create cos/sin cache [max_seq_len, rotary_dim] + freqs = 1.0 / (10000 ** (torch.arange(0, rotary_dim, device=device).float() / rotary_dim)) + angles = torch.arange(max_seq_len, device=device).unsqueeze(1) * freqs.unsqueeze(0) + cos = torch.cos(angles) # [max_seq_len, rotary_dim] + sin = torch.sin(angles) # [max_seq_len, rotary_dim] + + # For 4D query, position_ids should be 2D [batch, seq_len] + positions = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch, -1) + + result_q, result_k = rotary_embedding_torch( + query, key, cos, sin, + position_ids=positions, + rotary_interleaved=False, + inplace=False, + ) + + # Should have same shape as input + assert result_q.shape == query.shape + assert result_k.shape == key.shape + assert not torch.isnan(result_q).any() + assert not torch.isnan(result_k).any() + + def test_output_shape_4d(self, device): + """Test that output shapes match input shapes for 4D tensors.""" + from vllm_fl.dispatch.backends.reference.impl.rotary import rotary_embedding_torch + + batch, num_heads, seq_len, head_dim = 4, 8, 16, 32 + max_seq_len = 32 + rotary_dim = head_dim // 2 + + # [batch, num_heads, seq_len, head_dim] + query = torch.randn(batch, num_heads, seq_len, head_dim, device=device) + key = torch.randn(batch, num_heads, seq_len, head_dim, device=device) + + # For 4D query, use 2D 
position_ids [batch, seq_len] + positions = torch.arange(seq_len, device=device).unsqueeze(0).expand(batch, -1) + cos = torch.randn(max_seq_len, rotary_dim, device=device) + sin = torch.randn(max_seq_len, rotary_dim, device=device) + + result_q, result_k = rotary_embedding_torch( + query, key, cos, sin, + position_ids=positions, + rotary_interleaved=False, + inplace=False, + ) + + assert result_q.shape == query.shape + assert result_k.shape == key.shape + + def test_output_shape_3d(self, device): + """Test that output shapes match input shapes for 3D tensors.""" + from vllm_fl.dispatch.backends.reference.impl.rotary import rotary_embedding_torch + + seq_len, num_heads, head_dim = 16, 8, 32 + max_seq_len = 32 + rotary_dim = head_dim // 2 + + # [seq_len, num_heads, head_dim] + query = torch.randn(seq_len, num_heads, head_dim, device=device) + key = torch.randn(seq_len, num_heads, head_dim, device=device) + + # For 3D query, use 1D position_ids [seq_len] + positions = torch.arange(seq_len, device=device) + cos = torch.randn(max_seq_len, rotary_dim, device=device) + sin = torch.randn(max_seq_len, rotary_dim, device=device) + + result_q, result_k = rotary_embedding_torch( + query, key, cos, sin, + position_ids=positions, + rotary_interleaved=False, + inplace=False, + ) + + assert result_q.shape == query.shape + assert result_k.shape == key.shape + + def test_dtype_preservation_3d(self, dtype, device): + """Test that dtype is preserved with 3D tensors.""" + from vllm_fl.dispatch.backends.reference.impl.rotary import rotary_embedding_torch + + seq_len, num_heads, head_dim = 8, 4, 32 + max_seq_len = 16 + rotary_dim = head_dim // 2 + + # Use 3D tensors for simpler testing + query = torch.randn(seq_len, num_heads, head_dim, device=device, dtype=dtype) + key = torch.randn(seq_len, num_heads, head_dim, device=device, dtype=dtype) + + positions = torch.arange(seq_len, device=device) + cos = torch.randn(max_seq_len, rotary_dim, device=device, dtype=dtype) + sin = torch.randn(max_seq_len, rotary_dim, device=device, dtype=dtype) + + result_q, result_k = rotary_embedding_torch( + query, key, cos, sin, + position_ids=positions, + rotary_interleaved=False, + inplace=False, + ) + + assert result_q.dtype == dtype + assert result_k.dtype == dtype + + def test_interleaved_vs_neox_style(self, device): + """Test both interleaved and neox rotary styles.""" + from vllm_fl.dispatch.backends.reference.impl.rotary import rotary_embedding_torch + + seq_len, num_heads, head_dim = 8, 4, 64 + max_seq_len = 16 + rotary_dim = head_dim // 2 + + # Use 3D tensors + query = torch.randn(seq_len, num_heads, head_dim, device=device) + key = torch.randn(seq_len, num_heads, head_dim, device=device) + + positions = torch.arange(seq_len, device=device) + cos = torch.randn(max_seq_len, rotary_dim, device=device) + sin = torch.randn(max_seq_len, rotary_dim, device=device) + + # Test neox style (default) + result_q_neox, result_k_neox = rotary_embedding_torch( + query, key, cos, sin, + position_ids=positions, + rotary_interleaved=False, + inplace=False, + ) + + # Test interleaved style + result_q_interleaved, result_k_interleaved = rotary_embedding_torch( + query, key, cos, sin, + position_ids=positions, + rotary_interleaved=True, + inplace=False, + ) + + # Results should be different between styles + assert not torch.allclose(result_q_neox, result_q_interleaved) + assert not torch.allclose(result_k_neox, result_k_interleaved) + + # But both should have valid outputs (no NaN/Inf) + assert not torch.isnan(result_q_neox).any() + assert not 
torch.isnan(result_q_interleaved).any() + + def test_rotary_embedding_mathematically(self, device): + """Test that rotary embedding produces expected rotation behavior.""" + from vllm_fl.dispatch.backends.reference.impl.rotary import rotary_embedding_torch + + seq_len, num_heads, head_dim = 4, 2, 8 + max_seq_len = 8 + rotary_dim = head_dim // 2 + + # Create simple test tensors + query = torch.ones(seq_len, num_heads, head_dim, device=device) + key = torch.ones(seq_len, num_heads, head_dim, device=device) + + # Create cos/sin with known values + positions = torch.arange(seq_len, device=device) + cos = torch.ones(max_seq_len, rotary_dim, device=device) + sin = torch.zeros(max_seq_len, rotary_dim, device=device) + + # With cos=1 and sin=0, output should equal input (no rotation) + result_q, result_k = rotary_embedding_torch( + query, key, cos, sin, + position_ids=positions, + rotary_interleaved=False, + inplace=False, + ) + + torch.testing.assert_close(result_q, query, rtol=1e-5, atol=1e-5) + torch.testing.assert_close(result_k, key, rtol=1e-5, atol=1e-5) + + +# ============================================================================= +# Cross-Implementation Consistency Tests +# ============================================================================= + +class TestCrossImplementationConsistency: + """Test consistency between different backend implementations.""" + + def test_silu_reference_matches_pytorch(self, device): + """Test that reference implementation matches PyTorch exactly.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + x = torch.randn(4, 32, device=device) + + result = silu_and_mul_torch(x) + expected = reference_silu_and_mul(x) + + # Should be exactly equal (same implementation) + torch.testing.assert_close(result, expected, rtol=0, atol=0) + + def test_rms_norm_reference_matches_pytorch(self, device): + """Test that reference RMS norm matches our baseline.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + x = torch.randn(4, 64, device=device) + weight = torch.randn(64, device=device) + epsilon = 1e-6 + + result = rms_norm_torch(x, None, weight, epsilon) + expected = reference_rms_norm(x, weight, epsilon) + + torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-5) + + +# ============================================================================= +# Edge Case Tests +# ============================================================================= + +class TestEdgeCases: + """Test edge cases and boundary conditions.""" + + def test_silu_single_element(self, device): + """Test SiLU with single element batch.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + + x = torch.randn(1, 4, device=device) + result = silu_and_mul_torch(x) + + assert result.shape == (1, 2) + assert not torch.isnan(result).any() + + def test_rms_norm_single_element(self, device): + """Test RMS norm with single element batch.""" + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + x = torch.randn(1, 64, device=device) + weight = torch.ones(64, device=device) + + result = rms_norm_torch(x, None, weight, 1e-5) + + assert result.shape == (1, 64) + assert not torch.isnan(result).any() + + def test_very_small_values(self, device): + """Test with very small input values.""" + from vllm_fl.dispatch.backends.reference.impl.activation import silu_and_mul_torch + from vllm_fl.dispatch.backends.reference.impl.normalization import rms_norm_torch + + 
x_silu = torch.randn(2, 8, device=device) * 1e-7 + x_norm = torch.randn(2, 32, device=device) * 1e-7 + weight = torch.ones(32, device=device) + + result_silu = silu_and_mul_torch(x_silu) + result_norm = rms_norm_torch(x_norm, None, weight, 1e-5) + + assert not torch.isnan(result_silu).any() + assert not torch.isnan(result_norm).any() + assert not torch.isinf(result_silu).any() + assert not torch.isinf(result_norm).any() From f55db9a4a96acf2c7eaccc632e44b74237c08634 Mon Sep 17 00:00:00 2001 From: xmhubj Date: Fri, 6 Feb 2026 20:35:50 +0800 Subject: [PATCH 10/14] Remove invalid test cases in unit tests --- .gitignore | 4 + tests/unit_tests/compilation/test_graph.py | 28 +- .../distributed/test_communicator.py | 56 +--- tests/unit_tests/distributed/test_flagcx.py | 89 +----- tests/unit_tests/ops/test_activation.py | 15 +- tests/unit_tests/ops/test_layernorm.py | 24 +- tests/unit_tests/ops/test_rotary_embedding.py | 51 +--- tests/unit_tests/worker/test_model_runner.py | 287 ++++++++++++++++-- tests/unit_tests/worker/test_worker.py | 54 +--- 9 files changed, 318 insertions(+), 290 deletions(-) diff --git a/.gitignore b/.gitignore index dd3b07b..b03a82e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,7 @@ __pycache__/ build/ +# Coverage +.coverage +.coverage.* +htmlcov/ diff --git a/tests/unit_tests/compilation/test_graph.py b/tests/unit_tests/compilation/test_graph.py index 38f1a54..7ad08a0 100644 --- a/tests/unit_tests/compilation/test_graph.py +++ b/tests/unit_tests/compilation/test_graph.py @@ -5,24 +5,7 @@ """ import pytest -from unittest.mock import patch, MagicMock -from dataclasses import dataclass - - -class TestGraphClasses: - """Test graph-related classes.""" - - def test_graph_entry_import(self): - from vllm_fl.compilation.graph import GraphEntry - assert GraphEntry is not None - - def test_graph_options_import(self): - from vllm_fl.compilation.graph import GraphOptions - assert GraphOptions is not None - - def test_graph_wrapper_import(self): - from vllm_fl.compilation.graph import GraphWrapper - assert GraphWrapper is not None +from unittest.mock import MagicMock class TestGraphOptions: @@ -57,7 +40,6 @@ class TestGraphEntry: def test_default_values(self): from vllm_fl.compilation.graph import GraphEntry - # Create a mock BatchDescriptor mock_batch_desc = MagicMock() entry = GraphEntry(batch_descriptor=mock_batch_desc) @@ -66,11 +48,3 @@ def test_default_values(self): assert entry.graph is None assert entry.output is None assert entry.input_addresses is None - - -class TestWeakRefTensors: - """Test weak_ref_tensors function.""" - - def test_import(self): - from vllm_fl.compilation.graph import weak_ref_tensors - assert weak_ref_tensors is not None diff --git a/tests/unit_tests/distributed/test_communicator.py b/tests/unit_tests/distributed/test_communicator.py index 79021eb..412d2b5 100644 --- a/tests/unit_tests/distributed/test_communicator.py +++ b/tests/unit_tests/distributed/test_communicator.py @@ -2,12 +2,13 @@ """ Tests for distributed communicator module. + +Note: Tests require FLAGCX_PATH environment variable to be set. +Tests are skipped if flagcx is not available. 
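+Actual collective operations need a multi-process GPU setup; that coverage
+belongs in functional_tests/.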
""" import os import pytest -import torch -from unittest.mock import MagicMock def has_flagcx(): @@ -26,51 +27,6 @@ def has_flagcx(): ) -class TestCommunicatorFL: - """Test CommunicatorFL class.""" - - def test_import(self): - """Test that CommunicatorFL can be imported.""" - from vllm_fl.distributed.communicator import CommunicatorFL - assert CommunicatorFL is not None - - def test_class_inherits_from_base(self): - """Test that CommunicatorFL inherits from DeviceCommunicatorBase.""" - from vllm_fl.distributed.communicator import CommunicatorFL - from vllm.distributed.device_communicators.base_device_communicator import ( - DeviceCommunicatorBase - ) - assert issubclass(CommunicatorFL, DeviceCommunicatorBase) - - def test_class_has_required_methods(self): - """Test that CommunicatorFL has all required methods.""" - from vllm_fl.distributed.communicator import CommunicatorFL - - required_methods = [ - 'all_reduce', - 'reduce_scatter', - 'send', - 'recv', - 'destroy', - ] - - for method in required_methods: - assert hasattr(CommunicatorFL, method), f"Missing method: {method}" - - def test_instance_attributes_single_worker(self): - """Test instance attributes for single worker scenario.""" - from vllm_fl.distributed.communicator import CommunicatorFL - - # Create instance without calling __init__ to test attribute access - comm = CommunicatorFL.__new__(CommunicatorFL) - - # Manually set attributes that would be set by parent class - comm.world_size = 1 - comm.use_all2all = False - comm.cpu_group = MagicMock() - comm.device = torch.device("cpu") - comm.pyflagcx_comm = None - - # Verify attributes - assert comm.world_size == 1 - assert comm.pyflagcx_comm is None +# Note: CommunicatorFL requires multi-process GPU environment for meaningful tests. +# Integration tests should be in functional_tests/. +# Unit tests here are minimal as the class requires distributed infrastructure. diff --git a/tests/unit_tests/distributed/test_flagcx.py b/tests/unit_tests/distributed/test_flagcx.py index 0c659c3..21ab7a4 100644 --- a/tests/unit_tests/distributed/test_flagcx.py +++ b/tests/unit_tests/distributed/test_flagcx.py @@ -3,96 +3,39 @@ """ Tests for flagcx communicator module. -Note: Most tests require FLAGCX_PATH environment variable to be set. +Note: Tests require FLAGCX_PATH environment variable and the flagcx Python bindings. Tests are skipped if flagcx is not available. + +Integration tests for actual distributed operations should be in functional_tests/. 
""" import os import pytest -import torch def has_flagcx(): - """Check if flagcx is available.""" + """Check if flagcx is available (both library and Python bindings).""" flagcx_path = os.getenv('FLAGCX_PATH') if not flagcx_path: return False lib_path = os.path.join(flagcx_path, "build/lib/libflagcx.so") - return os.path.exists(lib_path) + if not os.path.exists(lib_path): + return False + # Also check Python bindings + try: + from plugin.interservice.flagcx_wrapper import flagcxDataTypeEnum + return True + except ImportError: + return False # Mark all tests in this module as requiring flagcx pytestmark = pytest.mark.skipif( not has_flagcx(), - reason="FLAGCX_PATH not set or flagcx library not found" + reason="FLAGCX_PATH not set, flagcx library not found, or Python bindings unavailable" ) -class TestPyFlagcxCommunicator: - """Test PyFlagcxCommunicator class.""" - - def test_import(self): - """Test that the module can be imported when flagcx is available.""" - from vllm_fl.distributed.device_communicators.flagcx import PyFlagcxCommunicator - assert PyFlagcxCommunicator is not None - - def test_class_has_required_methods(self): - """Test that PyFlagcxCommunicator has all required methods.""" - from vllm_fl.distributed.device_communicators.flagcx import PyFlagcxCommunicator - - required_methods = [ - 'all_reduce', - 'all_gather', - 'reduce_scatter', - 'send', - 'recv', - 'broadcast', - 'group_start', - 'group_end', - ] - - for method in required_methods: - assert hasattr(PyFlagcxCommunicator, method), f"Missing method: {method}" - - -class TestFlagcxDataTypes: - """Test flagcx data type related functionality.""" - - def test_flagcx_dtype_enum_import(self): - """Test that flagcxDataTypeEnum can be imported.""" - from plugin.interservice.flagcx_wrapper import flagcxDataTypeEnum - assert flagcxDataTypeEnum is not None - - def test_flagcx_dtype_from_torch(self): - """Test torch dtype to flagcx dtype conversion.""" - from plugin.interservice.flagcx_wrapper import flagcxDataTypeEnum - - # Test common dtypes - test_dtypes = [ - torch.float32, - torch.float16, - torch.bfloat16, - ] - - for dtype in test_dtypes: - # Should not raise - result = flagcxDataTypeEnum.from_torch(dtype) - assert result is not None - - -class TestFlagcxReduceOps: - """Test flagcx reduce operation types.""" - - def test_flagcx_reduce_op_enum_import(self): - """Test that flagcxRedOpTypeEnum can be imported.""" - from plugin.interservice.flagcx_wrapper import flagcxRedOpTypeEnum - assert flagcxRedOpTypeEnum is not None - - def test_flagcx_reduce_op_from_torch(self): - """Test torch ReduceOp to flagcx reduce op conversion.""" - from plugin.interservice.flagcx_wrapper import flagcxRedOpTypeEnum - from torch.distributed import ReduceOp - - # Test SUM operation - result = flagcxRedOpTypeEnum.from_torch(ReduceOp.SUM) - assert result is not None +# Note: PyFlagcxCommunicator requires multi-GPU distributed environment for meaningful tests. +# Unit tests for dtype/op conversions are moved here but require the plugin module. +# Integration tests should be in functional_tests/. 
diff --git a/tests/unit_tests/ops/test_activation.py b/tests/unit_tests/ops/test_activation.py index 5b697b0..c2063c3 100644 --- a/tests/unit_tests/ops/test_activation.py +++ b/tests/unit_tests/ops/test_activation.py @@ -6,10 +6,12 @@ import pytest import torch -from unittest.mock import patch, MagicMock +from unittest.mock import patch class TestSiluAndMulFL: + """Test SiluAndMulFL class behavior.""" + @pytest.fixture def mock_call_op(self): with patch("vllm_fl.ops.activation.call_op") as mock: @@ -20,11 +22,8 @@ def mock_parent_init(self): with patch("vllm_fl.ops.activation.SiluAndMul.__init__", return_value=None): yield - def test_import(self): - from vllm_fl.ops.activation import SiluAndMulFL - assert SiluAndMulFL is not None - - def test_forward_oot_calls_dispatch(self, mock_parent_init, mock_call_op): + def test_forward_oot_dispatches_correctly(self, mock_parent_init, mock_call_op): + """Test forward_oot calls dispatch system with correct op name and input.""" from vllm_fl.ops.activation import SiluAndMulFL mock_call_op.return_value = torch.randn(2, 4) @@ -35,7 +34,3 @@ def test_forward_oot_calls_dispatch(self, mock_parent_init, mock_call_op): mock_call_op.assert_called_once_with("silu_and_mul", x) assert result.shape == (2, 4) - - def test_all_exports(self): - from vllm_fl.ops.activation import __all__ - assert "SiluAndMulFL" in __all__ diff --git a/tests/unit_tests/ops/test_layernorm.py b/tests/unit_tests/ops/test_layernorm.py index 9f1f332..ba8aa76 100644 --- a/tests/unit_tests/ops/test_layernorm.py +++ b/tests/unit_tests/ops/test_layernorm.py @@ -6,20 +6,19 @@ import pytest import torch -from unittest.mock import patch, MagicMock +from unittest.mock import patch class TestRMSNormFL: + """Test RMSNormFL class behavior.""" + @pytest.fixture def mock_call_op(self): with patch("vllm_fl.ops.layernorm.call_op") as mock: yield mock - def test_import(self): - from vllm_fl.ops.layernorm import RMSNormFL - assert RMSNormFL is not None - - def test_init_params(self): + def test_init_creates_weight_parameter(self): + """Test that initialization creates weight parameter with correct shape.""" from vllm_fl.ops.layernorm import RMSNormFL hidden_size = 128 @@ -29,7 +28,8 @@ def test_init_params(self): assert layer.variance_epsilon == eps assert layer.weight.shape == (hidden_size,) - def test_forward_oot_without_residual(self, mock_call_op): + def test_forward_oot_dispatches_without_residual(self, mock_call_op): + """Test forward_oot calls dispatch system correctly without residual.""" from vllm_fl.ops.layernorm import RMSNormFL hidden_size = 128 @@ -44,10 +44,10 @@ def test_forward_oot_without_residual(self, mock_call_op): call_args = mock_call_op.call_args assert call_args[0][0] == "rms_norm" assert torch.equal(call_args[0][1], x) - assert call_args[0][2] is None # residual - assert torch.equal(call_args[0][3], layer.weight) + assert call_args[0][2] is None # residual should be None - def test_forward_oot_with_residual(self, mock_call_op): + def test_forward_oot_dispatches_with_residual(self, mock_call_op): + """Test forward_oot passes residual to dispatch system.""" from vllm_fl.ops.layernorm import RMSNormFL hidden_size = 128 @@ -63,7 +63,3 @@ def test_forward_oot_with_residual(self, mock_call_op): call_args = mock_call_op.call_args assert call_args[0][0] == "rms_norm" assert torch.equal(call_args[0][2], residual) - - def test_all_exports(self): - from vllm_fl.ops.layernorm import __all__ - assert "RMSNormFL" in __all__ diff --git a/tests/unit_tests/ops/test_rotary_embedding.py 
b/tests/unit_tests/ops/test_rotary_embedding.py index eddeba3..fddcb41 100644 --- a/tests/unit_tests/ops/test_rotary_embedding.py +++ b/tests/unit_tests/ops/test_rotary_embedding.py @@ -6,42 +6,24 @@ import pytest import torch -from unittest.mock import patch, MagicMock +from unittest.mock import patch class TestRotaryEmbeddingFL: - """Test RotaryEmbeddingFL class.""" + """Test RotaryEmbeddingFL class behavior.""" @pytest.fixture def mock_call_op(self): - """Mock the call_op function.""" with patch("vllm_fl.ops.rotary_embedding.call_op") as mock: yield mock @pytest.fixture def mock_parent_init(self): - """Mock the parent class __init__ to avoid vllm C++ dependencies.""" with patch("vllm_fl.ops.rotary_embedding.RotaryEmbedding.__init__", return_value=None): yield - def test_import(self): - """Test that RotaryEmbeddingFL can be imported.""" - from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL - assert RotaryEmbeddingFL is not None - - def test_class_exists(self): - """Test that RotaryEmbeddingFL is a class.""" - from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL - assert isinstance(RotaryEmbeddingFL, type) - - def test_has_forward_oot_method(self): - """Test that RotaryEmbeddingFL has forward_oot method.""" - from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL - assert hasattr(RotaryEmbeddingFL, 'forward_oot') - assert callable(getattr(RotaryEmbeddingFL, 'forward_oot')) - - def test_init_calls_parent(self, mock_parent_init): - """Test that __init__ calls parent class.""" + def test_forward_oot_dispatches_correctly(self, mock_parent_init, mock_call_op): + """Test forward_oot calls dispatch system with correct arguments.""" from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL layer = RotaryEmbeddingFL( @@ -53,45 +35,20 @@ def test_init_calls_parent(self, mock_parent_init): dtype=torch.float32, ) - # Instance should be created - assert layer is not None - - def test_forward_oot_calls_dispatch(self, mock_parent_init, mock_call_op): - """Test that forward_oot calls the dispatch call_op.""" - from vllm_fl.ops.rotary_embedding import RotaryEmbeddingFL - - # Create layer with mocked parent - layer = RotaryEmbeddingFL( - head_size=64, - rotary_dim=32, - max_position_embeddings=2048, - base=10000.0, - is_neox_style=True, - dtype=torch.float32, - ) - # Manually set attributes that parent __init__ would set layer.head_size = 64 layer.rotary_dim = 32 layer.is_neox_style = True layer.cos_sin_cache = torch.randn(2048, 64) - # Setup mock return value mock_call_op.return_value = (torch.randn(4, 8, 32), torch.randn(4, 8, 32)) - # Call forward_oot positions = torch.tensor([0, 1, 2, 3]) query = torch.randn(4, 8, 64) key = torch.randn(4, 8, 64) result = layer.forward_oot(positions, query, key) - # Verify call_op was called with "rotary_embedding" mock_call_op.assert_called_once() call_args = mock_call_op.call_args assert call_args[0][0] == "rotary_embedding" - - def test_all_exports(self): - """Test that __all__ contains RotaryEmbeddingFL.""" - from vllm_fl.ops.rotary_embedding import __all__ - assert "RotaryEmbeddingFL" in __all__ diff --git a/tests/unit_tests/worker/test_model_runner.py b/tests/unit_tests/worker/test_model_runner.py index 2d48257..f7ee06d 100644 --- a/tests/unit_tests/worker/test_model_runner.py +++ b/tests/unit_tests/worker/test_model_runner.py @@ -3,12 +3,23 @@ """ Tests for model runner module. 
+This module follows a layered testing strategy: +- Layer 1: Pure functions and data classes (no external dependencies) +- Layer 2: Methods with mocked dependencies +- Layer 3: Integration tests (in functional_tests/, requires GPU) + Note: These tests require vllm >= 0.13.0 with full installation. """ import pytest -from unittest.mock import patch, MagicMock +import numpy as np +import torch +from unittest.mock import MagicMock + +# ============================================================================= +# Test Utilities - Check availability before importing +# ============================================================================= def has_vllm_model_runner(): """Check if vllm model runner dependencies are available.""" @@ -26,35 +37,263 @@ def has_vllm_model_runner(): ) -class TestModelRunnerFL: - """Test ModelRunnerFL class.""" +# ============================================================================= +# Layer 1: ExecuteModelState Data Structure Tests +# ============================================================================= - def test_import(self): - """Test that ModelRunnerFL can be imported.""" - from vllm_fl.worker.model_runner import ModelRunnerFL - assert ModelRunnerFL is not None +class TestExecuteModelState: + """Test ExecuteModelState NamedTuple behavior and contract.""" - def test_is_class(self): - """Test that ModelRunnerFL is a class.""" - from vllm_fl.worker.model_runner import ModelRunnerFL - assert isinstance(ModelRunnerFL, type) + def test_fields_match_expected_contract(self): + """Verify ExecuteModelState has exact fields required by execute_model pipeline.""" + from vllm_fl.worker.model_runner import ExecuteModelState + + expected_fields = ( + 'scheduler_output', 'logits', 'spec_decode_metadata', + 'spec_decode_common_attn_metadata', 'hidden_states', + 'sample_hidden_states', 'aux_hidden_states', + 'ec_connector_output', 'cudagraph_stats' + ) + assert ExecuteModelState._fields == expected_fields, ( + "ExecuteModelState fields changed - this may break execute_model consumers" + ) + + def test_immutability_prevents_accidental_mutation(self): + """Ensure state cannot be mutated after creation (important for pipeline safety).""" + from vllm_fl.worker.model_runner import ExecuteModelState + + state = ExecuteModelState( + scheduler_output=MagicMock(), + logits=torch.randn(4, 1000), + spec_decode_metadata=None, + spec_decode_common_attn_metadata=None, + hidden_states=torch.randn(4, 512), + sample_hidden_states=torch.randn(4, 512), + aux_hidden_states=None, + ec_connector_output=None, + cudagraph_stats=None, + ) + + with pytest.raises(AttributeError): + state.logits = torch.randn(4, 1000) + + def test_unpacking_for_downstream_processing(self): + """Test that state can be unpacked correctly for downstream use.""" + from vllm_fl.worker.model_runner import ExecuteModelState + + mock_scheduler = MagicMock() + mock_logits = torch.randn(4, 1000) + + state = ExecuteModelState( + scheduler_output=mock_scheduler, + logits=mock_logits, + spec_decode_metadata=None, + spec_decode_common_attn_metadata=None, + hidden_states=None, + sample_hidden_states=None, + aux_hidden_states=None, + ec_connector_output=None, + cudagraph_stats=None, + ) + + # Simulate downstream unpacking + scheduler, logits, *rest = state + assert scheduler is mock_scheduler + assert torch.equal(logits, mock_logits) + + +# ============================================================================= +# Layer 2: _get_cumsum_and_arange Algorithm Tests +# 
============================================================================= + +class TestGetCumsumAndArange: + """Test _get_cumsum_and_arange method - critical for batch processing.""" - def test_inherits_from_mixins(self): - """Test that ModelRunnerFL inherits from expected mixins.""" + @pytest.fixture + def mock_model_runner(self): + """Create a minimal mock of ModelRunnerFL for testing.""" from vllm_fl.worker.model_runner import ModelRunnerFL - from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin - # ModelRunnerFL uses mixins, not GPUModelRunner - assert issubclass(ModelRunnerFL, LoRAModelRunnerMixin) + mock_runner = MagicMock(spec=ModelRunnerFL) + mock_runner.arange_np = np.arange(10000) + mock_runner._get_cumsum_and_arange = ModelRunnerFL._get_cumsum_and_arange.__get__( + mock_runner, ModelRunnerFL + ) + return mock_runner + + def test_multi_sequence_batch(self, mock_model_runner): + """Test cumsum and per-sequence arange for typical multi-sequence batch.""" + num_tokens = np.array([2, 5, 3]) + + cu_num_tokens, arange = mock_model_runner._get_cumsum_and_arange(num_tokens) + + # Cumsum: [2, 7, 10] - used for indexing into flattened batch + np.testing.assert_array_equal(cu_num_tokens, np.array([2, 7, 10])) + + # Arange: per-sequence position indices [0,1 | 0,1,2,3,4 | 0,1,2] + expected_arange = np.array([0, 1, 0, 1, 2, 3, 4, 0, 1, 2]) + np.testing.assert_array_equal(arange, expected_arange) + + def test_single_sequence(self, mock_model_runner): + """Test with single sequence (common in generation phase).""" + num_tokens = np.array([5]) + + cu_num_tokens, arange = mock_model_runner._get_cumsum_and_arange(num_tokens) + + np.testing.assert_array_equal(cu_num_tokens, np.array([5])) + np.testing.assert_array_equal(arange, np.array([0, 1, 2, 3, 4])) + + def test_all_single_token_sequences(self, mock_model_runner): + """Test batch where each sequence has 1 token (decode phase).""" + num_tokens = np.array([1, 1, 1, 1]) + + cu_num_tokens, arange = mock_model_runner._get_cumsum_and_arange(num_tokens) + + np.testing.assert_array_equal(cu_num_tokens, np.array([1, 2, 3, 4])) + np.testing.assert_array_equal(arange, np.array([0, 0, 0, 0])) + + def test_large_sequences(self, mock_model_runner): + """Test with larger sequences to verify correct boundary handling.""" + num_tokens = np.array([10, 20, 30]) + + cu_num_tokens, arange = mock_model_runner._get_cumsum_and_arange(num_tokens) + + assert cu_num_tokens[-1] == 60 + assert len(arange) == 60 + # Verify boundaries: first seq 0-9, second seq 0-19, third seq 0-29 + np.testing.assert_array_equal(arange[:10], np.arange(10)) + np.testing.assert_array_equal(arange[10:30], np.arange(20)) + np.testing.assert_array_equal(arange[30:60], np.arange(30)) + + def test_dtype_preservation(self, mock_model_runner): + """Test that dtype is correctly applied to cumsum output.""" + num_tokens = np.array([2, 3]) + + cu_num_tokens, _ = mock_model_runner._get_cumsum_and_arange( + num_tokens, cumsum_dtype=np.int64 + ) - def test_has_load_model_method(self): - """Test that ModelRunnerFL has load_model method.""" + assert cu_num_tokens.dtype == np.int64 + + +# ============================================================================= +# Layer 2: _pad_for_sequence_parallelism Logic Tests +# ============================================================================= + +class TestPadForSequenceParallelism: + """Test sequence parallelism padding logic.""" + + @pytest.fixture + def mock_model_runner(self): + """Create mock model runner for padding tests.""" 
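+        # NOTE: the real _pad_for_sequence_parallelism is bound onto the mock via
+        # __get__ below, so the padding logic under test runs unmodified while all
+        # other ModelRunnerFL dependencies stay mocked.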
from vllm_fl.worker.model_runner import ModelRunnerFL - assert hasattr(ModelRunnerFL, 'load_model') - assert callable(getattr(ModelRunnerFL, 'load_model')) - def test_has_execute_model_method(self): - """Test that ModelRunnerFL has execute_model method.""" + mock_runner = MagicMock(spec=ModelRunnerFL) + mock_runner.vllm_config = MagicMock() + mock_runner.vllm_config.parallel_config = MagicMock() + mock_runner.compilation_config = MagicMock() + mock_runner.compilation_config.pass_config = MagicMock() + mock_runner._pad_for_sequence_parallelism = ( + ModelRunnerFL._pad_for_sequence_parallelism.__get__( + mock_runner, ModelRunnerFL + ) + ) + return mock_runner + + def test_no_padding_when_sp_disabled(self, mock_model_runner): + """SP disabled should return original token count.""" + mock_model_runner.vllm_config.parallel_config.tensor_parallel_size = 4 + mock_model_runner.compilation_config.pass_config.enable_sp = False + + assert mock_model_runner._pad_for_sequence_parallelism(10) == 10 + + def test_no_padding_when_tp_size_1(self, mock_model_runner): + """TP size 1 means no parallelism, no padding needed.""" + mock_model_runner.vllm_config.parallel_config.tensor_parallel_size = 1 + mock_model_runner.compilation_config.pass_config.enable_sp = True + + assert mock_model_runner._pad_for_sequence_parallelism(10) == 10 + + @pytest.mark.parametrize("num_tokens,tp_size,expected", [ + (10, 4, 12), # 10 -> ceil to multiple of 4 + (8, 4, 8), # 8 already multiple of 4 + (10, 8, 16), # 10 -> ceil to multiple of 8 + (1, 4, 4), # 1 -> ceil to multiple of 4 + (15, 8, 16), # 15 -> ceil to multiple of 8 + ]) + def test_padding_calculation(self, mock_model_runner, num_tokens, tp_size, expected): + """Verify padding rounds up to next multiple of tp_size.""" + mock_model_runner.vllm_config.parallel_config.tensor_parallel_size = tp_size + mock_model_runner.compilation_config.pass_config.enable_sp = True + + result = mock_model_runner._pad_for_sequence_parallelism(num_tokens) + + assert result == expected + assert result % tp_size == 0 # Must be divisible + + +# ============================================================================= +# Layer 2: _get_positions Routing Tests +# ============================================================================= + +class TestGetPositions: + """Test position retrieval for different position encoding schemes.""" + + @pytest.fixture + def mock_model_runner(self): + """Create mock model runner for position tests.""" from vllm_fl.worker.model_runner import ModelRunnerFL - assert hasattr(ModelRunnerFL, 'execute_model') - assert callable(getattr(ModelRunnerFL, 'execute_model')) + + mock_runner = MagicMock(spec=ModelRunnerFL) + + # Standard positions buffer + mock_runner.positions = MagicMock() + mock_runner.positions.gpu = torch.arange(100) + + # MRoPE positions (3D for temporal, height, width) + mock_runner.mrope_positions = MagicMock() + mock_runner.mrope_positions.gpu = torch.arange(300).reshape(3, 100) + + # XDRoPE positions (2D) + mock_runner.xdrope_positions = MagicMock() + mock_runner.xdrope_positions.gpu = torch.arange(200).reshape(2, 100) + + mock_runner.uses_mrope = False + mock_runner.uses_xdrope_dim = 0 + + mock_runner._get_positions = ModelRunnerFL._get_positions.__get__( + mock_runner, ModelRunnerFL + ) + return mock_runner + + def test_standard_positions_with_int(self, mock_model_runner): + """Standard RoPE: integer returns first N positions.""" + result = mock_model_runner._get_positions(10) + torch.testing.assert_close(result, torch.arange(10)) + + def 
test_standard_positions_with_indices(self, mock_model_runner): + """Standard RoPE: tensor indices for selective position lookup.""" + indices = torch.tensor([0, 5, 10, 15]) + result = mock_model_runner._get_positions(indices) + expected = mock_model_runner.positions.gpu[indices] + torch.testing.assert_close(result, expected) + + def test_mrope_returns_3d_positions(self, mock_model_runner): + """MRoPE (Qwen2-VL): returns [3, num_tokens] positions.""" + mock_model_runner.uses_mrope = True + + result = mock_model_runner._get_positions(10) + + expected = mock_model_runner.mrope_positions.gpu[:, :10] + assert result.shape == (3, 10) + torch.testing.assert_close(result, expected) + + def test_xdrope_returns_2d_positions(self, mock_model_runner): + """XDRoPE: returns [2, num_tokens] positions.""" + mock_model_runner.uses_xdrope_dim = 64 + + result = mock_model_runner._get_positions(10) + + expected = mock_model_runner.xdrope_positions.gpu[:, :10] + assert result.shape == (2, 10) + torch.testing.assert_close(result, expected) diff --git a/tests/unit_tests/worker/test_worker.py b/tests/unit_tests/worker/test_worker.py index 461b664..be12b06 100644 --- a/tests/unit_tests/worker/test_worker.py +++ b/tests/unit_tests/worker/test_worker.py @@ -7,7 +7,6 @@ """ import pytest -from unittest.mock import patch, MagicMock def has_vllm_profiler(): @@ -26,33 +25,13 @@ def has_vllm_profiler(): ) -class TestWorkerFL: - """Test WorkerFL class.""" - - def test_import(self): - """Test that WorkerFL can be imported.""" - from vllm_fl.worker.worker import WorkerFL - assert WorkerFL is not None - - def test_memory_snapshot_import(self): - """Test that MemorySnapshot can be imported.""" - from vllm_fl.worker.worker import MemorySnapshot - assert MemorySnapshot is not None - - def test_memory_profiling_result_import(self): - """Test that MemoryProfilingResult can be imported.""" - from vllm_fl.worker.worker import MemoryProfilingResult - assert MemoryProfilingResult is not None - - class TestMemorySnapshot: - """Test MemorySnapshot dataclass.""" + """Test MemorySnapshot dataclass behavior.""" - def test_default_values(self): - """Test that MemorySnapshot has correct default values.""" + def test_default_values_without_auto_measure(self): + """Test MemorySnapshot initializes with correct default values.""" from vllm_fl.worker.worker import MemorySnapshot - # Create with auto_measure=False to avoid actual GPU calls snapshot = MemorySnapshot(auto_measure=False) assert snapshot.torch_peak == 0 @@ -62,8 +41,8 @@ def test_default_values(self): assert snapshot.torch_memory == 0 assert snapshot.non_torch_memory == 0 - def test_subtraction(self): - """Test MemorySnapshot subtraction operator.""" + def test_subtraction_computes_difference(self): + """Test MemorySnapshot subtraction operator computes correct differences.""" from vllm_fl.worker.worker import MemorySnapshot snapshot1 = MemorySnapshot(auto_measure=False) @@ -94,10 +73,10 @@ def test_subtraction(self): class TestMemoryProfilingResult: - """Test MemoryProfilingResult dataclass.""" + """Test MemoryProfilingResult dataclass behavior.""" def test_default_values(self): - """Test that MemoryProfilingResult has correct default values.""" + """Test MemoryProfilingResult initializes with correct default values.""" from vllm_fl.worker.worker import MemoryProfilingResult result = MemoryProfilingResult() @@ -108,26 +87,11 @@ def test_default_values(self): assert result.non_kv_cache_memory == 0 assert result.profile_time == 0.0 - def test_before_profile_defaults(self): - """Test 
that MemoryProfilingResult creates default snapshots."""
+    def test_creates_default_snapshots(self):
+        """Test MemoryProfilingResult creates default snapshot objects."""
         from vllm_fl.worker.worker import MemoryProfilingResult
 
         result = MemoryProfilingResult()
 
-        # Should create default MemorySnapshot objects
         assert result.before_profile is not None
         assert result.after_profile is not None
-
-
-class TestInitWorkerDistributedEnvironment:
-    """Test init_worker_distributed_environment function."""
-
-    def test_import(self):
-        """Test that init_worker_distributed_environment can be imported."""
-        from vllm_fl.worker.worker import init_worker_distributed_environment
-        assert init_worker_distributed_environment is not None
-
-    def test_is_callable(self):
-        """Test that init_worker_distributed_environment is callable."""
-        from vllm_fl.worker.worker import init_worker_distributed_environment
-        assert callable(init_worker_distributed_environment)

From 3140e99ee92b075c06d2d5b085c8cbd74184b0d1 Mon Sep 17 00:00:00 2001
From: xmhubj
Date: Mon, 9 Feb 2026 10:20:58 +0800
Subject: [PATCH 11/14] don't fail if config not found

---
 .github/labeler.yml           | 4 ++--
 .github/workflows/labeler.yml | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 4d58834..ffdf97d 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,5 +1,5 @@
-# PR Labeler 配置文件
-# 根据修改的文件路径自动添加标签
+# PR Labeler configuration file
+# Automatically add labels based on modified file paths
 
 docs:
   - changed-files:
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index b8220f4..3f0e93a 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -1,7 +1,8 @@
 name: "Pull Request Labeler"
 
 on:
-  pull_request:
+  pull_request_target:
+    types: [opened, synchronize, reopened]
 
 permissions:
   contents: read
@@ -12,8 +13,12 @@ jobs:
   label:
     runs-on: ubuntu-latest
 
     steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
       - name: Apply labels
         uses: actions/labeler@v5
+        continue-on-error: true  # Don't fail if config not yet on main branch
         with:
           repo-token: "${{ secrets.GITHUB_TOKEN }}"
           sync-labels: true
\ No newline at end of file

From 84d3d1d609bc01f2af31b071c55e20a972888ae2 Mon Sep 17 00:00:00 2001
From: xmhubj
Date: Tue, 10 Feb 2026 09:07:01 +0800
Subject: [PATCH 12/14] move e2e tests to e2e_tests folder

---
 tests/{ => e2e_tests}/test_offline_minicmp.py          | 0
 tests/{ => e2e_tests}/test_offline_qwen3_next.py       | 0
 tests/{ => e2e_tests}/test_vllm_serve_minicmp.py       | 0
 tests/{ => e2e_tests}/test_vllm_serve_qwen3_next.py    | 0
 tests/functional_tests/flaggems/test_gems_whitelist.py | 8 ++++----
 5 files changed, 4 insertions(+), 4 deletions(-)
 rename tests/{ => e2e_tests}/test_offline_minicmp.py (100%)
 rename tests/{ => e2e_tests}/test_offline_qwen3_next.py (100%)
 rename tests/{ => e2e_tests}/test_vllm_serve_minicmp.py (100%)
 rename tests/{ => e2e_tests}/test_vllm_serve_qwen3_next.py (100%)

diff --git a/tests/test_offline_minicmp.py b/tests/e2e_tests/test_offline_minicmp.py
similarity index 100%
rename from tests/test_offline_minicmp.py
rename to tests/e2e_tests/test_offline_minicmp.py
diff --git a/tests/test_offline_qwen3_next.py b/tests/e2e_tests/test_offline_qwen3_next.py
similarity index 100%
rename from tests/test_offline_qwen3_next.py
rename to tests/e2e_tests/test_offline_qwen3_next.py
diff --git a/tests/test_vllm_serve_minicmp.py b/tests/e2e_tests/test_vllm_serve_minicmp.py
similarity index 100%
rename from tests/test_vllm_serve_minicmp.py
rename
to tests/e2e_tests/test_vllm_serve_minicmp.py diff --git a/tests/test_vllm_serve_qwen3_next.py b/tests/e2e_tests/test_vllm_serve_qwen3_next.py similarity index 100% rename from tests/test_vllm_serve_qwen3_next.py rename to tests/e2e_tests/test_vllm_serve_qwen3_next.py diff --git a/tests/functional_tests/flaggems/test_gems_whitelist.py b/tests/functional_tests/flaggems/test_gems_whitelist.py index 36eaef7..b7e37e4 100644 --- a/tests/functional_tests/flaggems/test_gems_whitelist.py +++ b/tests/functional_tests/flaggems/test_gems_whitelist.py @@ -70,15 +70,15 @@ def test_use_flaggems_op_blacklist_only_listed_disallowed(monkeypatch): assert use_flaggems_op("other_op") is True -def test_use_flaggems_op_whitelist_and_blacklist_same_op_raises(monkeypatch): - """When same op is in both whitelist and blacklist, ValueError is raised.""" +def test_use_flaggems_op_whitelist_and_blacklist_both_set_raises(monkeypatch): + """When both whitelist and blacklist are set, ValueError is raised.""" _env_for_flaggems_enabled(monkeypatch) monkeypatch.setenv("VLLM_FL_FLAGOS_WHITELIST", "silu_and_mul,rms_norm") monkeypatch.setenv("VLLM_FL_FLAGOS_BLACKLIST", "rms_norm,other") with pytest.raises(ValueError) as exc_info: - use_flaggems_op("rms_norm") - assert "rms_norm" in str(exc_info.value) + use_flaggems_op("silu_and_mul") + # Implementation disallows setting both whitelist and blacklist simultaneously assert "VLLM_FL_FLAGOS_WHITELIST" in str(exc_info.value) assert "VLLM_FL_FLAGOS_BLACKLIST" in str(exc_info.value) From 0640f569a4002bb7f48e906af1b5bb95de7d0571 Mon Sep 17 00:00:00 2001 From: xmhubj Date: Tue, 10 Feb 2026 09:39:38 +0800 Subject: [PATCH 13/14] fix error in functional tests --- .../functional_tests/compilation/test_graph_capture.py | 10 ++++++++++ vllm_fl/utils.py | 8 ++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/functional_tests/compilation/test_graph_capture.py b/tests/functional_tests/compilation/test_graph_capture.py index e3e34da..a137a0e 100644 --- a/tests/functional_tests/compilation/test_graph_capture.py +++ b/tests/functional_tests/compilation/test_graph_capture.py @@ -15,6 +15,14 @@ pytestmark = pytest.mark.gpu +def has_weak_ref_tensor_op(): + """Check if vllm C++ extension weak_ref_tensor is available.""" + try: + return hasattr(torch.ops._C, 'weak_ref_tensor') + except Exception: + return False + + class TestGraphClasses: """Test graph-related class functionality.""" @@ -120,6 +128,8 @@ def test_weak_ref_tensors_function(self): pytest.skip("weak_ref_tensors not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") + @pytest.mark.skipif(not has_weak_ref_tensor_op(), + reason="vllm C++ extension weak_ref_tensor not available") def test_weak_ref_tensors_with_cuda_tensor(self): """Test weak_ref_tensors with CUDA tensor.""" try: diff --git a/vllm_fl/utils.py b/vllm_fl/utils.py index dc87a19..d28c70f 100644 --- a/vllm_fl/utils.py +++ b/vllm_fl/utils.py @@ -180,12 +180,12 @@ def get_flaggems_all_ops() -> list[str]: Get all FlagGems operator names from flag_gems._FULL_CONFIG. """ try: - pass + # _FULL_CONFIG is a tuple of (op_name, function, ...) 
tuples + # Some entries have 2 elements, some have 3 + ops = [entry[0] for entry in flag_gems._FULL_CONFIG] + return ops except Exception: return [] - ops = flag_gems.all_registered_ops() - - return ops # OOT operator names as registered in custom_ops.py (op_name lowercase) From b8115589766eb39845f9f710e280a6bb53a5f696 Mon Sep 17 00:00:00 2001 From: xmhubj Date: Tue, 10 Feb 2026 12:45:46 +0800 Subject: [PATCH 14/14] Optimize functional tests --- .../compilation/test_graph_capture.py | 108 +----- .../dispatch/test_dispatch_flow.py | 357 ------------------ .../flaggems}/__init__.py | 1 - .../flaggems/test_flaggems_get_ops.py | 5 +- .../flaggems/test_gems_whitelist.py | 2 +- vllm_fl/platform.py | 6 + 6 files changed, 15 insertions(+), 464 deletions(-) delete mode 100644 tests/functional_tests/dispatch/test_dispatch_flow.py rename tests/{functional_tests/dispatch => unit_tests/flaggems}/__init__.py (59%) rename tests/{functional_tests => unit_tests}/flaggems/test_flaggems_get_ops.py (58%) rename tests/{functional_tests => unit_tests}/flaggems/test_gems_whitelist.py (99%) diff --git a/tests/functional_tests/compilation/test_graph_capture.py b/tests/functional_tests/compilation/test_graph_capture.py index a137a0e..6bd9b63 100644 --- a/tests/functional_tests/compilation/test_graph_capture.py +++ b/tests/functional_tests/compilation/test_graph_capture.py @@ -3,11 +3,14 @@ """ Functional tests for graph capture and replay. Tests CUDA/NPU graph functionality for model optimization. + +Note: Unit tests for GraphOptions, GraphEntry, and GraphWrapper are in +unit_tests/compilation/test_graph.py. This file only contains functional +tests that require actual GPU execution. """ import pytest import torch -from unittest.mock import MagicMock, patch from dataclasses import dataclass @@ -15,107 +18,6 @@ pytestmark = pytest.mark.gpu -def has_weak_ref_tensor_op(): - """Check if vllm C++ extension weak_ref_tensor is available.""" - try: - return hasattr(torch.ops._C, 'weak_ref_tensor') - except Exception: - return False - - -class TestGraphClasses: - """Test graph-related class functionality.""" - - def test_graph_options_defaults(self): - """Test GraphOptions default values.""" - try: - from vllm_fl.compilation.graph import GraphOptions - except ImportError: - pytest.skip("GraphOptions not available") - - options = GraphOptions() - assert options.debug_log_enable is True - assert options.gc_disable is False - assert options.weak_ref_output is True - - def test_graph_options_custom(self): - """Test GraphOptions with custom values.""" - try: - from vllm_fl.compilation.graph import GraphOptions - except ImportError: - pytest.skip("GraphOptions not available") - - options = GraphOptions( - debug_log_enable=False, - gc_disable=True, - weak_ref_output=False, - ) - assert options.debug_log_enable is False - assert options.gc_disable is True - assert options.weak_ref_output is False - - def test_graph_entry_creation(self): - """Test GraphEntry dataclass.""" - try: - from vllm_fl.compilation.graph import GraphEntry - except ImportError: - pytest.skip("GraphEntry not available") - - mock_batch_desc = MagicMock() - entry = GraphEntry(batch_descriptor=mock_batch_desc) - - assert entry.batch_descriptor is mock_batch_desc - assert entry.graph is None - assert entry.output is None - assert entry.input_addresses is None - - -class TestGraphWrapper: - """Test GraphWrapper functionality.""" - - @pytest.fixture - def mock_vllm_config(self): - """Create mock VllmConfig.""" - config = MagicMock() - config.compilation_config = 
MagicMock() - config.compilation_config.cudagraph_capture_sizes = [1, 2, 4, 8] - return config - - def test_graph_wrapper_import(self): - """Test GraphWrapper can be imported.""" - try: - from vllm_fl.compilation.graph import GraphWrapper - assert GraphWrapper is not None - except ImportError: - pytest.skip("GraphWrapper not available") - - def test_graph_wrapper_unwrap(self): - """Test GraphWrapper.unwrap returns original runnable.""" - try: - from vllm_fl.compilation.graph import GraphWrapper, GraphOptions - from vllm.config import CUDAGraphMode - except ImportError: - pytest.skip("Required imports not available") - - def simple_runnable(x): - return x * 2 - - mock_config = MagicMock() - mock_config.compilation_config = MagicMock() - - # Mock the platform - with patch("vllm_fl.compilation.graph.current_platform") as mock_platform: - mock_platform.get_global_graph_pool.return_value = MagicMock() - - wrapper = GraphWrapper( - runnable=simple_runnable, - vllm_config=mock_config, - runtime_mode=CUDAGraphMode.FULL, - ) - - assert wrapper.unwrap() is simple_runnable - - class TestWeakRefTensors: """Test weak reference tensor functionality.""" @@ -128,8 +30,6 @@ def test_weak_ref_tensors_function(self): pytest.skip("weak_ref_tensors not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="GPU not available") - @pytest.mark.skipif(not has_weak_ref_tensor_op(), - reason="vllm C++ extension weak_ref_tensor not available") def test_weak_ref_tensors_with_cuda_tensor(self): """Test weak_ref_tensors with CUDA tensor.""" try: diff --git a/tests/functional_tests/dispatch/test_dispatch_flow.py b/tests/functional_tests/dispatch/test_dispatch_flow.py deleted file mode 100644 index 2ed6fd0..0000000 --- a/tests/functional_tests/dispatch/test_dispatch_flow.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2025 BAAI. All rights reserved. - -""" -Functional tests for dispatch flow. -Tests the complete operator dispatch mechanism including -registration, resolution, and policy-based selection. 
-""" - -import os -import pytest -import tempfile - -from vllm_fl.dispatch import ( - OpRegistry, - OpManager, - OpImpl, - BackendImplKind, - BackendPriority, - SelectionPolicy, - get_default_manager, - reset_default_manager, - call_op, - resolve_op, - get_policy, - set_global_policy, - reset_global_policy, - policy_context, - with_preference, - PREFER_DEFAULT, - PREFER_VENDOR, - PREFER_REFERENCE, -) - - -class TestDispatchManagerInitialization: - """Test OpManager initialization and registration.""" - - @pytest.fixture(autouse=True) - def setup(self, reset_dispatch_manager, clean_env): - """Reset manager before each test.""" - pass - - def test_manager_initializes_lazily(self): - """Test that manager initializes on first use.""" - manager = get_default_manager() - assert manager is not None - - # Should be initialized after first call - manager.ensure_initialized() - assert manager._state.initialized is True - - def test_manager_registers_builtin_ops(self): - """Test that built-in operators are registered.""" - manager = get_default_manager() - manager.ensure_initialized() - - snap = manager.registry.snapshot() - - # Should have some operators registered - assert len(snap.impls_by_op) > 0 - - def test_manager_singleton(self): - """Test that get_default_manager returns singleton.""" - manager1 = get_default_manager() - manager2 = get_default_manager() - assert manager1 is manager2 - - def test_reset_manager(self): - """Test that reset_default_manager creates new instance.""" - manager1 = get_default_manager() - reset_default_manager() - manager2 = get_default_manager() - assert manager1 is not manager2 - - -class TestOperatorResolution: - """Test operator resolution logic.""" - - @pytest.fixture - def custom_manager(self): - """Create a custom manager with test implementations.""" - registry = OpRegistry() - - # Register test implementations - registry.register_impl(OpImpl( - op_name="test_op", - impl_id="default.flagos", - kind=BackendImplKind.DEFAULT, - fn=lambda x: x * 2, - priority=BackendPriority.DEFAULT, - )) - - registry.register_impl(OpImpl( - op_name="test_op", - impl_id="reference.pytorch", - kind=BackendImplKind.REFERENCE, - fn=lambda x: x * 2 + 1, - priority=BackendPriority.REFERENCE, - )) - - registry.register_impl(OpImpl( - op_name="test_op", - impl_id="vendor.cuda", - kind=BackendImplKind.VENDOR, - fn=lambda x: x * 3, - vendor="CUDA", - priority=BackendPriority.VENDOR, - )) - - return OpManager(registry) - - @pytest.fixture(autouse=True) - def setup(self, reset_dispatch_manager, clean_env): - pass - - def test_resolve_selects_by_default_order(self, custom_manager): - """Test that resolve selects by default order (flagos first).""" - fn = custom_manager.resolve("test_op") - result = fn(10) - assert result == 20 # flagos: x * 2 - - def test_resolve_with_vendor_preference(self, custom_manager): - """Test resolution with vendor preference.""" - vendor_policy = SelectionPolicy(prefer=PREFER_VENDOR) - with policy_context(vendor_policy): - fn = custom_manager.resolve("test_op") - result = fn(10) - assert result == 30 # vendor: x * 3 - - def test_resolve_with_reference_preference(self, custom_manager): - """Test resolution with reference preference.""" - ref_policy = SelectionPolicy(prefer=PREFER_REFERENCE) - with policy_context(ref_policy): - fn = custom_manager.resolve("test_op") - result = fn(10) - assert result == 21 # reference: x * 2 + 1 - - def test_resolve_caches_result(self, custom_manager): - """Test that resolution is cached.""" - fn1 = custom_manager.resolve("test_op") - 
fn2 = custom_manager.resolve("test_op") - assert fn1 is fn2 - - def test_resolve_nonexistent_raises(self, custom_manager): - """Test that resolving non-existent op raises.""" - with pytest.raises(RuntimeError, match="No available implementation"): - custom_manager.resolve("nonexistent_op") - - -class TestPolicyBasedSelection: - """Test policy-based operator selection.""" - - @pytest.fixture - def manager_with_vendors(self): - """Create manager with multiple vendor implementations.""" - registry = OpRegistry() - - registry.register_impl(OpImpl( - op_name="multi_vendor_op", - impl_id="vendor.cuda", - kind=BackendImplKind.VENDOR, - fn=lambda: "cuda", - vendor="CUDA", - priority=100, - )) - - registry.register_impl(OpImpl( - op_name="multi_vendor_op", - impl_id="vendor.ascend", - kind=BackendImplKind.VENDOR, - fn=lambda: "ascend", - vendor="ASCEND", - priority=100, - )) - - registry.register_impl(OpImpl( - op_name="multi_vendor_op", - impl_id="reference.pytorch", - kind=BackendImplKind.REFERENCE, - fn=lambda: "reference", - priority=50, - )) - - return OpManager(registry) - - @pytest.fixture(autouse=True) - def setup(self, reset_dispatch_manager, clean_env): - pass - - def test_deny_vendor_excludes_implementation(self, manager_with_vendors): - """Test that denied vendors are excluded.""" - policy = SelectionPolicy.from_dict( - prefer=PREFER_VENDOR, - deny_vendors={"CUDA"}, - ) - with policy_context(policy): - fn = manager_with_vendors.resolve("multi_vendor_op") - result = fn() - assert result == "ascend" - - def test_allow_vendor_limits_selection(self, manager_with_vendors): - """Test that allow list limits vendor selection.""" - policy = SelectionPolicy.from_dict( - prefer=PREFER_VENDOR, - allow_vendors={"CUDA"}, - ) - with policy_context(policy): - fn = manager_with_vendors.resolve("multi_vendor_op") - result = fn() - assert result == "cuda" - - def test_per_op_order_overrides_default(self, manager_with_vendors): - """Test that per-op order overrides default.""" - policy = SelectionPolicy.from_dict( - prefer=PREFER_VENDOR, - per_op_order={"multi_vendor_op": ["reference"]}, - ) - with policy_context(policy): - fn = manager_with_vendors.resolve("multi_vendor_op") - result = fn() - assert result == "reference" - - -class TestFallbackMechanism: - """Test fallback when primary implementation fails.""" - - @pytest.fixture - def manager_with_failing_impl(self): - """Create manager with a failing primary implementation.""" - registry = OpRegistry() - - def failing_fn(): - raise RuntimeError("Primary failed!") - - registry.register_impl(OpImpl( - op_name="fallback_op", - impl_id="default.flagos", - kind=BackendImplKind.DEFAULT, - fn=failing_fn, - priority=BackendPriority.DEFAULT, - )) - - registry.register_impl(OpImpl( - op_name="fallback_op", - impl_id="reference.pytorch", - kind=BackendImplKind.REFERENCE, - fn=lambda: "fallback_success", - priority=BackendPriority.REFERENCE, - )) - - return OpManager(registry) - - @pytest.fixture(autouse=True) - def setup(self, reset_dispatch_manager, clean_env): - pass - - def test_fallback_on_primary_failure(self, manager_with_failing_impl): - """Test fallback to next implementation when primary fails.""" - # Enable fallback (VLLM_FL_STRICT != "0") - os.environ["VLLM_FL_STRICT"] = "1" - - result = manager_with_failing_impl.call("fallback_op") - assert result == "fallback_success" - - def test_failed_impl_tracked(self, manager_with_failing_impl): - """Test that failed implementations are tracked.""" - os.environ["VLLM_FL_STRICT"] = "1" - - 
manager_with_failing_impl.call("fallback_op") - - failed = manager_with_failing_impl.get_failed_impls("fallback_op") - assert "default.flagos" in failed.get("fallback_op", set()) - - def test_clear_failed_impls(self, manager_with_failing_impl): - """Test clearing failed implementations cache.""" - os.environ["VLLM_FL_STRICT"] = "1" - - manager_with_failing_impl.call("fallback_op") - manager_with_failing_impl.clear_failed_impls("fallback_op") - - failed = manager_with_failing_impl.get_failed_impls("fallback_op") - assert len(failed) == 0 - - -class TestConfigFileLoading: - """Test loading configuration from YAML file.""" - - @pytest.fixture(autouse=True) - def setup(self, reset_dispatch_manager, clean_env): - pass - - def test_load_config_from_yaml(self): - """Test loading policy from YAML config file.""" - config_content = """ -prefer: vendor -strict: true -allow_vendors: - - CUDA -deny_vendors: - - AMD -op_backends: - test_op: - - vendor - - reference -""" - with tempfile.NamedTemporaryFile( - mode="w", suffix=".yaml", delete=False - ) as f: - f.write(config_content) - config_path = f.name - - try: - os.environ["VLLM_FL_CONFIG"] = config_path - reset_global_policy() - - policy = get_policy() - - assert policy.prefer == "vendor" - assert policy.strict is True - assert "CUDA" in policy.allow_vendors - assert "AMD" in policy.deny_vendors - assert policy.get_per_op_order("test_op") == ["vendor", "reference"] - finally: - os.unlink(config_path) - os.environ.pop("VLLM_FL_CONFIG", None) - - -class TestContextManagers: - """Test policy context managers.""" - - @pytest.fixture(autouse=True) - def setup(self, reset_dispatch_manager, clean_env): - pass - - def test_with_preference_context(self): - """Test with_preference context manager.""" - original = get_policy() - assert original.prefer == PREFER_DEFAULT - - with with_preference(PREFER_VENDOR): - inside = get_policy() - assert inside.prefer == PREFER_VENDOR - - after = get_policy() - assert after.prefer == PREFER_DEFAULT - - def test_nested_contexts(self): - """Test nested policy contexts.""" - with with_preference(PREFER_VENDOR): - assert get_policy().prefer == PREFER_VENDOR - - with with_preference(PREFER_REFERENCE): - assert get_policy().prefer == PREFER_REFERENCE - - assert get_policy().prefer == PREFER_VENDOR - - assert get_policy().prefer == PREFER_DEFAULT diff --git a/tests/functional_tests/dispatch/__init__.py b/tests/unit_tests/flaggems/__init__.py similarity index 59% rename from tests/functional_tests/dispatch/__init__.py rename to tests/unit_tests/flaggems/__init__.py index fb91e8d..8c90136 100644 --- a/tests/functional_tests/dispatch/__init__.py +++ b/tests/unit_tests/flaggems/__init__.py @@ -1,2 +1 @@ # Copyright (c) 2025 BAAI. All rights reserved. -"""Dispatch functional tests.""" diff --git a/tests/functional_tests/flaggems/test_flaggems_get_ops.py b/tests/unit_tests/flaggems/test_flaggems_get_ops.py similarity index 58% rename from tests/functional_tests/flaggems/test_flaggems_get_ops.py rename to tests/unit_tests/flaggems/test_flaggems_get_ops.py index 453b5c4..1764d3a 100644 --- a/tests/functional_tests/flaggems/test_flaggems_get_ops.py +++ b/tests/unit_tests/flaggems/test_flaggems_get_ops.py @@ -1,5 +1,8 @@ -# Copyright (c) 2026 BAAI. All rights reserved. +# Copyright (c) 2025 BAAI. All rights reserved. +""" +Unit tests for FlagGems ops discovery functionality. 
+""" from vllm_fl.utils import get_flaggems_all_ops diff --git a/tests/functional_tests/flaggems/test_gems_whitelist.py b/tests/unit_tests/flaggems/test_gems_whitelist.py similarity index 99% rename from tests/functional_tests/flaggems/test_gems_whitelist.py rename to tests/unit_tests/flaggems/test_gems_whitelist.py index b7e37e4..46c21d9 100644 --- a/tests/functional_tests/flaggems/test_gems_whitelist.py +++ b/tests/unit_tests/flaggems/test_gems_whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2026 BAAI. All rights reserved. +# Copyright (c) 2025 BAAI. All rights reserved. """ Unit tests for FlagGems operator whitelist/blacklist functionality. diff --git a/vllm_fl/platform.py b/vllm_fl/platform.py index 8a7c269..231ada6 100644 --- a/vllm_fl/platform.py +++ b/vllm_fl/platform.py @@ -10,6 +10,12 @@ import torch +# import custom ops, trigger op registration (CUDA only) +try: + import vllm._C # noqa +except ImportError: + pass # NPU or other platforms may not have vllm._C + from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger