ModelEngine-Group · dante159753 · Jan 22, 2026 · Jan 23, 2026 · Jan 23, 2026
@@ -71,24 +71,41 @@ jobs:
           rm -rf .[!.]*
         fi
     - uses: actions/checkout@v4
+    - name: Generate Docker Image Version
+      id: version
+      # Exposes `steps.version.outputs.version`: <ref>-<YYYYMMDD>-<run number>-<short sha>.
+      run: |
+        # Use the runner's default GITHUB_* environment variables rather than
+        # workflow expressions pasted into the script, so a crafted ref name
+        # cannot inject shell code.
+        DATE=$(date +%Y%m%d)
+        SHORT_SHA=$(printf '%s' "${GITHUB_SHA}" | cut -c1-7)
+        # Docker tags only allow [A-Za-z0-9_.-]; refs such as "feature/x" or
+        # "123/merge" contain "/", so every other character becomes "-".
+        SAFE_REF=$(printf '%s' "${GITHUB_REF_NAME}" | tr -c 'A-Za-z0-9_.-' '-')
+        VERSION="${SAFE_REF}-${DATE}-${GITHUB_RUN_NUMBER}-${SHORT_SHA}"
+        echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
+        echo "Docker image version: ${VERSION}"
     - name: Build
+      # Builds the test image from docker/Dockerfile-onlyPC, tagged with the `version` step's output.
       run: |
         cd ${{github.workspace}}
-        export PLATFORM=cuda
-        pip install -v -e . --no-build-isolation
-    - name: Test E2E
+        sudo docker build -t ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} -f ./docker/Dockerfile-onlyPC ./
+    - name: Test E2E in Docker
+      # The container runs in the foreground (no -d/-t) so this step waits for
+      # pytest and fails when pytest fails; a detached container would return
+      # immediately and the JUnit XML would not exist when it is uploaded.
+      # --entrypoint makes the trailing `-c "..."` independent of the image's ENTRYPOINT.
+      # Only test/ is bind-mounted: mounting the whole checkout over /workspace
+      # would hide /workspace/unified-cache-management, where the image's
+      # editable install of the package lives.
       run: |
-        cd ${{github.workspace}}
-        cd test
-        pip install pytest pytest-cov pynvml pandas
-        python3 -m pytest --stage=1 --feature=offline_inference --junitxml=offline-inference.xml
+        sudo docker run --rm \
+          --entrypoint /bin/bash \
+          --gpus all \
+          --network=host \
+          --ipc=host \
+          --cap-add IPC_LOCK \
+          -v /home/models:/home/models \
+          -v ${{github.workspace}}/test:/workspace/test \
+          ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} \
+          -c "cd /workspace/test && pip install pytest pytest-cov nvidia-ml-py pandas && python3 -m pytest --stage=1 --feature=offline_inference --junitxml=offline-inference.xml"
     - name: Upload pytest results
       uses: EnricoMi/publish-unit-test-result-action/linux@v2
       if: (!cancelled())
       with:
         files: |
           ${{github.workspace}}/test/offline-inference.xml
         check_name: Prefix cache test results
+    - name: Cleanup Docker Image
+      # always() runs this step even when earlier steps failed; `|| true` keeps a
+      # failed image removal from failing the job.
+      if: always()
+      run: |
+        sudo docker rmi ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} || true
 
   test-e2e-sparse-gpu:
     runs-on: gpu
@@ -107,23 +124,39 @@ jobs:
           rm -rf .[!.]*
         fi
     - uses: actions/checkout@v4
+    - name: Generate Docker Image Version
+      id: version
+      # Exposes `steps.version.outputs.version`: <ref>-<YYYYMMDD>-<run number>-<short sha>.
+      run: |
+        # Use the runner's default GITHUB_* environment variables rather than
+        # workflow expressions pasted into the script, so a crafted ref name
+        # cannot inject shell code.
+        DATE=$(date +%Y%m%d)
+        SHORT_SHA=$(printf '%s' "${GITHUB_SHA}" | cut -c1-7)
+        # Docker tags only allow [A-Za-z0-9_.-]; refs such as "feature/x" or
+        # "123/merge" contain "/", so every other character becomes "-".
+        SAFE_REF=$(printf '%s' "${GITHUB_REF_NAME}" | tr -c 'A-Za-z0-9_.-' '-')
+        VERSION="${SAFE_REF}-${DATE}-${GITHUB_RUN_NUMBER}-${SHORT_SHA}"
+        echo "version=${VERSION}" >> "$GITHUB_OUTPUT"
+        echo "Docker image version: ${VERSION}"
     - name: Build
+      # Builds the test image from docker/Dockerfile, tagged with the `version` step's output.
       run: |
         cd ${{github.workspace}}
-        export PLATFORM=cuda
-        export ENABLE_SPARSE=TRUE
-        pip install -v -e . --no-build-isolation
-    - name: Test E2E
+        sudo docker build -t ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} -f ./docker/Dockerfile ./
+    - name: Test E2E in Docker
+      # The container runs in the foreground (no -d/-t) so this step waits for
+      # pytest and fails when pytest fails; a detached container would return
+      # immediately and the JUnit XML would not exist when it is uploaded.
+      # --entrypoint makes the trailing `-c "..."` independent of the image's ENTRYPOINT.
+      # Only test/ is bind-mounted so the mount cannot hide anything the image
+      # installed under /workspace.
+      # NOTE(review): assumes docker/Dockerfile installs the package inside the
+      # image rather than relying on the mounted checkout - confirm.
       run: |
-        cd ${{github.workspace}}
-        cd test
-        pip install pytest pytest-cov pynvml pandas
-        python3 -m pytest --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml
+        sudo docker run --rm \
+          --entrypoint /bin/bash \
+          --gpus all \
+          --network=host \
+          --ipc=host \
+          --cap-add IPC_LOCK \
+          -v /home/models:/home/models \
+          -v ${{github.workspace}}/test:/workspace/test \
+          ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} \
+          -c "cd /workspace/test && pip install pytest pytest-cov nvidia-ml-py pandas && python3 -m pytest --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml"
     - name: Upload pytest results
       uses: EnricoMi/publish-unit-test-result-action/linux@v2
       if: (!cancelled())
       with:
         files: |
           ${{github.workspace}}/test/offline-inference-sparse.xml
         check_name: Sparse attention test results
+    - name: Cleanup Docker Image
+      # always() runs this step even when earlier steps failed; `|| true` keeps a
+      # failed image removal from failing the job.
+      if: always()
+      run: |
+        sudo docker rmi ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} || true
 
@@ -0,0 +1,16 @@
+# Base image; point this at a different image if needed
+FROM vllm/vllm-openai:v0.9.2
+
+# Package index used by pip inside the image; override with --build-arg
+ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+
+WORKDIR /workspace
+
+# Copy the build context into the image so it can be installed in editable mode
+COPY . /workspace/unified-cache-management
+
+RUN pip config set global.index-url ${PIP_INDEX_URL}
+
+# PLATFORM is set for this RUN step only; it is not persisted in the image
+RUN PLATFORM="cuda" \
+     pip install -v -e /workspace/unified-cache-management --no-build-isolation
+
+ENTRYPOINT ["/bin/bash"]
@@ -162,20 +162,25 @@ def pytest_runtest_logreport(report):
 
 
 def get_free_gpu(required_memory_mb):
-    mem_needed_with_buffer = int(required_memory_mb * 1.3)  # add buffer to avoid OOM
-    pynvml.nvmlInit()
-    device_count = pynvml.nvmlDeviceGetCount()
-    device_indices = list(range(device_count))
-    random.shuffle(device_indices)
-    for i in device_indices:  # random order to reduce collisions
-        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
-        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
-        free_in_mb = info.free / 1024**2
-        if free_in_mb >= mem_needed_with_buffer:
-            utilization = (
-                required_memory_mb * (1024**2) / info.total if info.total else 0
-            )
-            return i, free_in_mb, utilization
+    try:
+        mem_needed_with_buffer = int(
+            required_memory_mb * 1.3
+        )  # add buffer to avoid OOM
+        pynvml.nvmlInit()
+        device_count = pynvml.nvmlDeviceGetCount()
+        device_indices = list(range(device_count))
+        random.shuffle(device_indices)
+        for i in device_indices:  # random order to reduce collisions
+            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+            free_in_mb = info.free / 1024**2
+            if free_in_mb >= mem_needed_with_buffer:
+                utilization = (
+                    required_memory_mb * (1024**2) / info.total if info.total else 0
+                )
+                return i, free_in_mb, utilization
+    finally:
+        pynvml.nvmlShutdown()
     return None, 0, 0
 
 

@@ -15,10 +15,6 @@
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 
-from ucm.logger import init_logger
-
-logger = init_logger(__name__)
-
 
 class TestBasicOfflineInferenceSparse:
     """Test basic offline inference functionality."""
@@ -67,11 +63,9 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse(
             test_prompt, standard_answers = load_prompt_from_file(
                 Path(__file__).parent / "prompts" / "test_offline_inference.json"
             )
-            logger.info(
-                f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)"
-            )
+            print(f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)")
             if standard_answers:
-                logger.info(f"Standard answers: {standard_answers}")
+                print(f"Standard answers: {standard_answers}")
             else:
                 pytest.fail(f"No standard answers found in prompt.json")
         except Exception as e:
@@ -120,19 +114,19 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse(
             ignore_eos=False,
         )
 
-        logger.info(f"\n===== HBM + SSD Mixed Accuracy Test =====")
-        logger.info(f"Model: {model_path}")
-        logger.info(f"Full prompt length: {len(test_prompt)} chars")
-        logger.info(f"Max tokens: {max_tokens}")
-        logger.info(f"Temperature: 0.0 (deterministic)")
-        logger.info(f"UCM storage: {ucm_storage_dir}")
-        logger.info(f"Prompt split ratio: {prompt_split_ratio}")
-        logger.info(f"Enforce eager: {enforce_eager}")
-        logger.info(f"Max num batched tokens: {max_num_batched_tokens}")
+        print(f"\n===== HBM + SSD Mixed Accuracy Test =====")
+        print(f"Model: {model_path}")
+        print(f"Full prompt length: {len(test_prompt)} chars")
+        print(f"Max tokens: {max_tokens}")
+        print(f"Temperature: 0.0 (deterministic)")
+        print(f"UCM storage: {ucm_storage_dir}")
+        print(f"Prompt split ratio: {prompt_split_ratio}")
+        print(f"Enforce eager: {enforce_eager}")
+        print(f"Max num batched tokens: {max_num_batched_tokens}")
 
         # ===== Phase 1: Disable HBM PC, save KV cache to SSD and load (baseline) =====
         # Run Phase 1 in a separate subprocess to ensure GPU memory is fully released
-        logger.info(f"\n===== Phase 1: Save KV Cache to SSD And Load (Baseline) =====")
+        print(f"\n===== Phase 1: Save KV Cache to SSD And Load (Baseline) =====")
 
         # Convert SamplingParams to dict for serialization, as non-picklable objects cannot be passed to subprocess
         sampling_params_dict = to_dict_for_serialization(sampling_params)
@@ -151,13 +145,13 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse(
         )
         phase1_1_output = phase1_outputs[0]  # Phase 1.1: SSD save
         phase1_2_output = phase1_outputs[1]  # Phase 1.2: SSD load
-        logger.info(f"Phase 1 completed in subprocess")
-        logger.info(f'Phase 1.1 output: "{phase1_1_output}"')
-        logger.info(f'Phase 1.2 output: "{phase1_2_output}"')
+        print(f"Phase 1 completed in subprocess")
+        print(f'Phase 1.1 output: "{phase1_1_output}"')
+        print(f'Phase 1.2 output: "{phase1_2_output}"')
 
         # ===== Phase 2: Enable HBM PC, test HBM + SSD mixed hit =====
         # Run Phase 2 in a separate subprocess to ensure GPU memory is fully released
-        logger.info(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====")
+        print(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====")
 
         phase2_outputs = run_in_spawn_subprocess(
             run_offline_inference,
@@ -173,11 +167,11 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse(
         )
         phase2_partial_output = phase2_outputs[0]
         phase2_full_output = phase2_outputs[1]
-        logger.info(f"Phase 2 completed in subprocess")
-        logger.info(f"[INFO] Phase 2.1 output: {phase2_partial_output}")
-        logger.info(f"[INFO] Phase 2.2 output: {phase2_full_output}")
+        print(f"Phase 2 completed in subprocess")
+        print(f"[INFO] Phase 2.1 output: {phase2_partial_output}")
+        print(f"[INFO] Phase 2.2 output: {phase2_full_output}")
 
-        logger.info(f"\n[INFO] ===== Accuracy Test Results =====")
+        print(f"\n[INFO] ===== Accuracy Test Results =====")
 
         # Note: Small numerical precision differences in KV cache loading can cause
         # punctuation token selection differences (e.g., full-width vs half-width comma)
@@ -203,29 +197,25 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
             phase1_1_output, standard_answers
         ) and match_any_answer(phase1_2_output, standard_answers)
         if not phase1_correct:
-            logger.warning(
-                f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) ====="
-            )
-            logger.warning(
+            print(f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) =====")
+            print(
                 f"Incorrect answer in Phase 1.1 (SSD save) or Phase 1.2 (SSD load) output!"
             )
-            logger.warning(f"Phase 1.1 output:\n{phase1_1_output}")
-            logger.warning(f"Phase 1.2 output:\n{phase1_2_output}")
-            logger.warning(f"Standard answers:\n{standard_answers}")
+            print(f"Phase 1.1 output:\n{phase1_1_output}")
+            print(f"Phase 1.2 output:\n{phase1_2_output}")
+            print(f"Standard answers:\n{standard_answers}")
             pytest.fail("SSD Load Accuracy Test Failed!")
 
         # Phase 2.1 should be skipped from accuracy check since it's only partial prompt
         phase2_correct = match_any_answer(phase2_full_output, standard_answers)
         if not phase2_correct:
-            logger.warning(
-                f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) ====="
-            )
-            logger.warning(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!")
-            logger.warning(f"Phase 2.2 output:\n{phase2_full_output}")
-            logger.warning(f"Standard answers:\n{standard_answers}")
+            print(f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) =====")
+            print(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!")
+            print(f"Phase 2.2 output:\n{phase2_full_output}")
+            print(f"Standard answers:\n{standard_answers}")
             pytest.fail("HBM + SSD Mixed Accuracy Test Failed!")
 
-    """Test ESA sparse attention."""
+    """Test GSA sparse attention."""
 
     @pytest.mark.stage(1)
     @pytest.mark.feature("offline_inference_sparse")
@@ -234,7 +224,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
     @pytest.mark.parametrize("max_tokens", [200])
     @pytest.mark.parametrize("enforce_eager", [False])
     @pytest.mark.parametrize("max_num_batched_tokens", [2047])
-    def test_offline_esa(
+    def test_offline_gsa(
         self,
         model_name: str,
         max_tokens: int,
@@ -265,7 +255,7 @@ def test_offline_esa(
         except Exception as e:
             pytest.fail(f"Failed to load prompt from prompt.json: {e}")
 
-        logger.info(f"Standard answers: {standard_answers}")
+        print(f"Standard answers: {standard_answers}")
 
         tokenizer = AutoTokenizer.from_pretrained(model_path, use_chat_template=True)
 
@@ -296,15 +286,7 @@ def test_offline_esa(
                     },
                 }
             ],
-            "ucm_sparse_config": {
-                "ESA": {
-                    "init_window_sz": 1,
-                    "local_window_sz": 2,
-                    "min_blocks": 4,
-                    "sparse_ratio": 0.3,
-                    "retrieval_stride": 5,
-                }
-            },
+            "ucm_sparse_config": {"GSAOnDevice": {}},
         }
 
         sampling_params = SamplingParams(
@@ -325,12 +307,12 @@ def test_offline_esa(
             sampling_params_dict,
             False,  # enable_prefix_caching=False
             enforce_eager,
-            "ESA",
+            "GSA",
             max_num_batched_tokens,
             timeout=180,
         )
         phase1_1_output = phase1_outputs[0]  # Phase 1.1: SSD save
         phase1_2_output = phase1_outputs[1]  # Phase 1.2: SSD load
-        logger.info(f"ESA inference completed in subprocess")
-        logger.info(f'Phase 1.1 output: "{phase1_1_output}"')
-        logger.info(f'Phase 1.2 output: "{phase1_2_output}"')
+        print(f"GSA inference completed in subprocess")
+        print(f'Phase 1.1 output: "{phase1_1_output}"')
+        print(f'Phase 1.2 output: "{phase1_2_output}"')