From 84fd202b40949dc1436ab6ada4229964d4eb9148 Mon Sep 17 00:00:00 2001 From: Yanzhao Wang Date: Thu, 22 Jan 2026 16:22:41 +0800 Subject: [PATCH 1/2] reduce gpu utilization to 6GB --- test/common/offline_inference_utils.py | 9 ++++++-- test/conftest.py | 22 ++++++++++++++----- test/suites/E2E/test_offline_inference.py | 2 +- .../E2E/test_offline_inference_sparse.py | 4 ++-- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/test/common/offline_inference_utils.py b/test/common/offline_inference_utils.py index ae3687b74..f55473e00 100644 --- a/test/common/offline_inference_utils.py +++ b/test/common/offline_inference_utils.py @@ -222,7 +222,6 @@ def build_llm_with_uc( "model": model_path, "kv_transfer_config": ktc, "max_model_len": 12000, - "gpu_memory_utilization": 0.3, # Reduced to prevent OOM after Phase 1 "max_num_batched_tokens": max_num_batched_tokens, "block_size": 128, "enforce_eager": llm_kwargs.get("enforce_eager", True), @@ -276,11 +275,17 @@ def run_offline_inference( """ sampling_params = from_dict_for_serialization(sampling_params_dict) + gpu_memory_utilization = float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1")) + logger.info( + "run offline inference with gpu memory utilization: %.4f", + gpu_memory_utilization, + ) + with build_llm_with_uc( model_path=model_path, ucm_config=ucm_config, enable_prefix_caching=enable_prefix_caching, - gpu_memory_utilization=0.3, + gpu_memory_utilization=gpu_memory_utilization, max_num_batched_tokens=max_num_batched_tokens, enforce_eager=enforce_eager, ) as llm: diff --git a/test/conftest.py b/test/conftest.py index 2189094e9..784a29135 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -162,6 +162,7 @@ def pytest_runtest_logreport(report): def get_free_gpu(required_memory_mb): + mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM pynvml.nvmlInit() device_count = pynvml.nvmlDeviceGetCount() device_indices = list(range(device_count)) @@ -170,9 +171,12 @@ def 
get_free_gpu(required_memory_mb): handle = pynvml.nvmlDeviceGetHandleByIndex(i) info = pynvml.nvmlDeviceGetMemoryInfo(handle) free_in_mb = info.free / 1024**2 - if free_in_mb >= required_memory_mb: - return i, free_in_mb - return None, 0 + if free_in_mb >= mem_needed_with_buffer: + utilization = ( + required_memory_mb * (1024**2) / info.total if info.total else 0 + ) + return i, free_in_mb, utilization + return None, 0, 0 @pytest.fixture(autouse=True) @@ -180,9 +184,15 @@ def setup_gpu_resource(request): marker = request.node.get_closest_marker("gpu_mem") if marker: mem_needed = marker.args[0] - gpu_id, free_in_mb = get_free_gpu(mem_needed) + gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed) if gpu_id is not None: - print(f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory") + print( + f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}" + ) os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + if gpu_utilization: + os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization) else: - pytest.fail(f"No GPU with {mem_needed}MB free memory available") + pytest.fail( + f"No GPU with {mem_needed}MB(+30% buffer) free memory available" + ) diff --git a/test/suites/E2E/test_offline_inference.py b/test/suites/E2E/test_offline_inference.py index 345c759e5..ced06ff04 100644 --- a/test/suites/E2E/test_offline_inference.py +++ b/test/suites/E2E/test_offline_inference.py @@ -25,7 +25,7 @@ class TestBasicOfflineInference: @pytest.mark.stage(1) @pytest.mark.feature("offline_inference") - @pytest.mark.gpu_mem(30000) + @pytest.mark.gpu_mem(6000) @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) @pytest.mark.parametrize("max_tokens", [200]) @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half diff --git a/test/suites/E2E/test_offline_inference_sparse.py b/test/suites/E2E/test_offline_inference_sparse.py index 49ead5d3c..fda2532cb 100644 --- 
a/test/suites/E2E/test_offline_inference_sparse.py +++ b/test/suites/E2E/test_offline_inference_sparse.py @@ -25,7 +25,7 @@ class TestBasicOfflineInferenceSparse: @pytest.mark.stage(1) @pytest.mark.feature("offline_inference_sparse") - @pytest.mark.gpu_mem(30000) + @pytest.mark.gpu_mem(6000) @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) @pytest.mark.parametrize("max_tokens", [200]) @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half @@ -229,7 +229,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool: @pytest.mark.stage(1) @pytest.mark.feature("offline_inference_sparse") - @pytest.mark.gpu_mem(30000) + @pytest.mark.gpu_mem(6000) @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) @pytest.mark.parametrize("max_tokens", [200]) @pytest.mark.parametrize("enforce_eager", [False]) From a4928719f7292a25269ae30390dc87dae8e0a476 Mon Sep 17 00:00:00 2001 From: Yanzhao Wang Date: Fri, 23 Jan 2026 15:21:22 +0800 Subject: [PATCH 2/2] build docker for test, use gsa for sparse attention test --- .github/workflows/pull-request.yml | 63 ++++++++++--- docker/Dockerfile-onlyPC | 16 ++++ test/conftest.py | 33 ++++--- .../E2E/test_offline_inference_sparse.py | 92 ++++++++----------- 4 files changed, 120 insertions(+), 84 deletions(-) create mode 100644 docker/Dockerfile-onlyPC diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index f1b395765..f509bb99f 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -71,17 +71,30 @@ jobs: rm -rf .[!.]* fi - uses: actions/checkout@v4 + - name: Generate Docker Image Version + id: version + run: | + DATE=$(date +%Y%m%d) + SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7) + VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}" + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "Docker image version: ${VERSION}" - name: Build run: | cd ${{github.workspace}} - export PLATFORM=cuda - 
pip install -v -e . --no-build-isolation - - name: Test E2E + sudo docker build -t ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} -f ./docker/Dockerfile-onlyPC ./ + - name: Test E2E in Docker run: | - cd ${{github.workspace}} - cd test - pip install pytest pytest-cov pynvml pandas - python3 -m pytest --stage=1 --feature=offline_inference --junitxml=offline-inference.xml + sudo docker run --rm \ + -i \ + --gpus all \ + --network=host \ + --ipc=host \ + --cap-add IPC_LOCK \ + -v /home/models:/home/models \ + -v ${{github.workspace}}:/workspace \ + ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} \ + -c "cd /workspace/test && pip install pytest pytest-cov nvidia-ml-py pandas && python3 -m pytest --stage=1 --feature=offline_inference --junitxml=offline-inference.xml" - name: Upload pytest results uses: EnricoMi/publish-unit-test-result-action/linux@v2 if: (!cancelled()) @@ -89,6 +102,10 @@ jobs: files: | ${{github.workspace}}/test/offline-inference.xml check_name: Prefix cache test results + - name: Cleanup Docker Image + if: always() + run: | + sudo docker rmi ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} || true test-e2e-sparse-gpu: runs-on: gpu @@ -107,18 +124,30 @@ jobs: rm -rf .[!.]* fi - uses: actions/checkout@v4 + - name: Generate Docker Image Version + id: version + run: | + DATE=$(date +%Y%m%d) + SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7) + VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}" + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "Docker image version: ${VERSION}" - name: Build run: | cd ${{github.workspace}} - export PLATFORM=cuda - export ENABLE_SPARSE=TRUE - pip install -v -e .
--no-build-isolation - - name: Test E2E + sudo docker build -t ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} -f ./docker/Dockerfile ./ + - name: Test E2E in Docker run: | - cd ${{github.workspace}} - cd test - pip install pytest pytest-cov pynvml pandas - python3 -m pytest --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml + sudo docker run --rm \ + -i \ + --gpus all \ + --network=host \ + --ipc=host \ + --cap-add IPC_LOCK \ + -v /home/models:/home/models \ + -v ${{github.workspace}}:/workspace \ + ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} \ + -c "cd /workspace/test && pip install pytest pytest-cov nvidia-ml-py pandas && python3 -m pytest --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml" - name: Upload pytest results uses: EnricoMi/publish-unit-test-result-action/linux@v2 if: (!cancelled()) @@ -126,4 +155,8 @@ jobs: files: | ${{github.workspace}}/test/offline-inference-sparse.xml check_name: Sparse attention test results + - name: Cleanup Docker Image + if: always() + run: | + sudo docker rmi ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} || true diff --git a/docker/Dockerfile-onlyPC b/docker/Dockerfile-onlyPC new file mode 100644 index 000000000..0ad355476 --- /dev/null +++ b/docker/Dockerfile-onlyPC @@ -0,0 +1,16 @@ +# Set to other image if needed +FROM vllm/vllm-openai:v0.9.2 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" + +WORKDIR /workspace + +# Install unified-cache-management +COPY . 
/workspace/unified-cache-management + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +RUN export PLATFORM="cuda" && \ + pip install -v -e /workspace/unified-cache-management --no-build-isolation + +ENTRYPOINT ["/bin/bash"] \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index 784a29135..178fe0450 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -162,20 +162,25 @@ def pytest_runtest_logreport(report): def get_free_gpu(required_memory_mb): - mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM - pynvml.nvmlInit() - device_count = pynvml.nvmlDeviceGetCount() - device_indices = list(range(device_count)) - random.shuffle(device_indices) - for i in device_indices: # random order to reduce collisions - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - info = pynvml.nvmlDeviceGetMemoryInfo(handle) - free_in_mb = info.free / 1024**2 - if free_in_mb >= mem_needed_with_buffer: - utilization = ( - required_memory_mb * (1024**2) / info.total if info.total else 0 - ) - return i, free_in_mb, utilization + try: + mem_needed_with_buffer = int( + required_memory_mb * 1.3 + ) # add buffer to avoid OOM + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + device_indices = list(range(device_count)) + random.shuffle(device_indices) + for i in device_indices: # random order to reduce collisions + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + free_in_mb = info.free / 1024**2 + if free_in_mb >= mem_needed_with_buffer: + utilization = ( + required_memory_mb * (1024**2) / info.total if info.total else 0 + ) + return i, free_in_mb, utilization + finally: + pynvml.nvmlShutdown() return None, 0, 0 diff --git a/test/suites/E2E/test_offline_inference_sparse.py b/test/suites/E2E/test_offline_inference_sparse.py index fda2532cb..a818dc005 100644 --- a/test/suites/E2E/test_offline_inference_sparse.py +++ b/test/suites/E2E/test_offline_inference_sparse.py @@ -15,10 
+15,6 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams -from ucm.logger import init_logger - -logger = init_logger(__name__) - class TestBasicOfflineInferenceSparse: """Test basic offline inference functionality.""" @@ -67,11 +63,9 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse( test_prompt, standard_answers = load_prompt_from_file( Path(__file__).parent / "prompts" / "test_offline_inference.json" ) - logger.info( - f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)" - ) + print(f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)") if standard_answers: - logger.info(f"Standard answers: {standard_answers}") + print(f"Standard answers: {standard_answers}") else: pytest.fail(f"No standard answers found in prompt.json") except Exception as e: @@ -120,19 +114,19 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse( ignore_eos=False, ) - logger.info(f"\n===== HBM + SSD Mixed Accuracy Test =====") - logger.info(f"Model: {model_path}") - logger.info(f"Full prompt length: {len(test_prompt)} chars") - logger.info(f"Max tokens: {max_tokens}") - logger.info(f"Temperature: 0.0 (deterministic)") - logger.info(f"UCM storage: {ucm_storage_dir}") - logger.info(f"Prompt split ratio: {prompt_split_ratio}") - logger.info(f"Enforce eager: {enforce_eager}") - logger.info(f"Max num batched tokens: {max_num_batched_tokens}") + print(f"\n===== HBM + SSD Mixed Accuracy Test =====") + print(f"Model: {model_path}") + print(f"Full prompt length: {len(test_prompt)} chars") + print(f"Max tokens: {max_tokens}") + print(f"Temperature: 0.0 (deterministic)") + print(f"UCM storage: {ucm_storage_dir}") + print(f"Prompt split ratio: {prompt_split_ratio}") + print(f"Enforce eager: {enforce_eager}") + print(f"Max num batched tokens: {max_num_batched_tokens}") # ===== Phase 1: Disable HBM PC, save KV cache to SSD and load (baseline) ===== # Run Phase 1 in a separate subprocess to ensure GPU memory is fully released - logger.info(f"\n===== 
Phase 1: Save KV Cache to SSD And Load (Baseline) =====") + print(f"\n===== Phase 1: Save KV Cache to SSD And Load (Baseline) =====") # Convert SamplingParams to dict for serialization, as non-picklable objects cannot be passed to subprocess sampling_params_dict = to_dict_for_serialization(sampling_params) @@ -151,13 +145,13 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse( ) phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD load - logger.info(f"Phase 1 completed in subprocess") - logger.info(f'Phase 1.1 output: "{phase1_1_output}"') - logger.info(f'Phase 1.2 output: "{phase1_2_output}"') + print(f"Phase 1 completed in subprocess") + print(f'Phase 1.1 output: "{phase1_1_output}"') + print(f'Phase 1.2 output: "{phase1_2_output}"') # ===== Phase 2: Enable HBM PC, test HBM + SSD mixed hit ===== # Run Phase 2 in a separate subprocess to ensure GPU memory is fully released - logger.info(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====") + print(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====") phase2_outputs = run_in_spawn_subprocess( run_offline_inference, @@ -173,11 +167,11 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse( ) phase2_partial_output = phase2_outputs[0] phase2_full_output = phase2_outputs[1] - logger.info(f"Phase 2 completed in subprocess") - logger.info(f"[INFO] Phase 2.1 output: {phase2_partial_output}") - logger.info(f"[INFO] Phase 2.2 output: {phase2_full_output}") + print(f"Phase 2 completed in subprocess") + print(f"[INFO] Phase 2.1 output: {phase2_partial_output}") + print(f"[INFO] Phase 2.2 output: {phase2_full_output}") - logger.info(f"\n[INFO] ===== Accuracy Test Results =====") + print(f"\n[INFO] ===== Accuracy Test Results =====") # Note: Small numerical precision differences in KV cache loading can cause # punctuation token selection differences (e.g., full-width vs half-width comma) @@ -203,29 +197,25 @@ def match_any_answer(output: str, answers: list[str]) -> bool: 
phase1_1_output, standard_answers ) and match_any_answer(phase1_2_output, standard_answers) if not phase1_correct: - logger.warning( - f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) =====" - ) - logger.warning( + print(f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) =====") + print( f"Incorrect answer in Phase 1.1 (SSD save) or Phase 1.2 (SSD load) output!" ) - logger.warning(f"Phase 1.1 output:\n{phase1_1_output}") - logger.warning(f"Phase 1.2 output:\n{phase1_2_output}") - logger.warning(f"Standard answers:\n{standard_answers}") + print(f"Phase 1.1 output:\n{phase1_1_output}") + print(f"Phase 1.2 output:\n{phase1_2_output}") + print(f"Standard answers:\n{standard_answers}") pytest.fail("SSD Load Accuracy Test Failed!") # Phase 2.1 should be skipped from accuracy check since it's only partial prompt phase2_correct = match_any_answer(phase2_full_output, standard_answers) if not phase2_correct: - logger.warning( - f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) =====" - ) - logger.warning(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!") - logger.warning(f"Phase 2.2 output:\n{phase2_full_output}") - logger.warning(f"Standard answers:\n{standard_answers}") + print(f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) =====") + print(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!") + print(f"Phase 2.2 output:\n{phase2_full_output}") + print(f"Standard answers:\n{standard_answers}") pytest.fail("HBM + SSD Mixed Accuracy Test Failed!") - """Test ESA sparse attention.""" + """Test GSA sparse attention.""" @pytest.mark.stage(1) @pytest.mark.feature("offline_inference_sparse") @@ -234,7 +224,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool: @pytest.mark.parametrize("max_tokens", [200]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("max_num_batched_tokens", [2047]) - def test_offline_esa( + def test_offline_gsa( self, model_name: str, max_tokens: int, @@ -265,7 
+255,7 @@ def test_offline_esa( except Exception as e: pytest.fail(f"Failed to load prompt from prompt.json: {e}") - logger.info(f"Standard answers: {standard_answers}") + print(f"Standard answers: {standard_answers}") tokenizer = AutoTokenizer.from_pretrained(model_path, use_chat_template=True) @@ -296,15 +286,7 @@ def test_offline_esa( }, } ], - "ucm_sparse_config": { - "ESA": { - "init_window_sz": 1, - "local_window_sz": 2, - "min_blocks": 4, - "sparse_ratio": 0.3, - "retrieval_stride": 5, - } - }, + "ucm_sparse_config": {"GSAOnDevice": {}}, } sampling_params = SamplingParams( @@ -325,12 +307,12 @@ def test_offline_esa( sampling_params_dict, False, # enable_prefix_caching=False enforce_eager, - "ESA", + "GSA", max_num_batched_tokens, timeout=180, ) phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD load - logger.info(f"ESA inference completed in subprocess") - logger.info(f'Phase 1.1 output: "{phase1_1_output}"') - logger.info(f'Phase 1.2 output: "{phase1_2_output}"') + print(f"GSA inference completed in subprocess") + print(f'Phase 1.1 output: "{phase1_1_output}"') + print(f'Phase 1.2 output: "{phase1_2_output}"')