From 84fd202b40949dc1436ab6ada4229964d4eb9148 Mon Sep 17 00:00:00 2001 From: Yanzhao Wang Date: Thu, 22 Jan 2026 16:22:41 +0800 Subject: [PATCH 1/2] reduce gpu utilization to 6GB --- test/common/offline_inference_utils.py | 9 ++++++-- test/conftest.py | 22 ++++++++++++++----- test/suites/E2E/test_offline_inference.py | 2 +- .../E2E/test_offline_inference_sparse.py | 4 ++-- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/test/common/offline_inference_utils.py b/test/common/offline_inference_utils.py index ae3687b74..f55473e00 100644 --- a/test/common/offline_inference_utils.py +++ b/test/common/offline_inference_utils.py @@ -222,7 +222,6 @@ def build_llm_with_uc( "model": model_path, "kv_transfer_config": ktc, "max_model_len": 12000, - "gpu_memory_utilization": 0.3, # Reduced to prevent OOM after Phase 1 "max_num_batched_tokens": max_num_batched_tokens, "block_size": 128, "enforce_eager": llm_kwargs.get("enforce_eager", True), @@ -276,11 +275,17 @@ def run_offline_inference( """ sampling_params = from_dict_for_serialization(sampling_params_dict) + gpu_memory_utilization = float(os.getenv("E2E_TEST_GPU_MEMORY_UTILIZATION", "0.1")) + logger.info( + "run offline inference with gpu memory utilization: %.4f", + gpu_memory_utilization, + ) + with build_llm_with_uc( model_path=model_path, ucm_config=ucm_config, enable_prefix_caching=enable_prefix_caching, - gpu_memory_utilization=0.3, + gpu_memory_utilization=gpu_memory_utilization, max_num_batched_tokens=max_num_batched_tokens, enforce_eager=enforce_eager, ) as llm: diff --git a/test/conftest.py b/test/conftest.py index 2189094e9..784a29135 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -162,6 +162,7 @@ def pytest_runtest_logreport(report): def get_free_gpu(required_memory_mb): + mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM pynvml.nvmlInit() device_count = pynvml.nvmlDeviceGetCount() device_indices = list(range(device_count)) @@ -170,9 +171,12 @@ def 
get_free_gpu(required_memory_mb): handle = pynvml.nvmlDeviceGetHandleByIndex(i) info = pynvml.nvmlDeviceGetMemoryInfo(handle) free_in_mb = info.free / 1024**2 - if free_in_mb >= required_memory_mb: - return i, free_in_mb - return None, 0 + if free_in_mb >= mem_needed_with_buffer: + utilization = ( + required_memory_mb * (1024**2) / info.total if info.total else 0 + ) + return i, free_in_mb, utilization + return None, 0, 0 @pytest.fixture(autouse=True) @@ -180,9 +184,15 @@ def setup_gpu_resource(request): marker = request.node.get_closest_marker("gpu_mem") if marker: mem_needed = marker.args[0] - gpu_id, free_in_mb = get_free_gpu(mem_needed) + gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed) if gpu_id is not None: - print(f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory") + print( + f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}" + ) os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + if gpu_utilization: + os.environ["E2E_TEST_GPU_MEMORY_UTILIZATION"] = str(gpu_utilization) else: - pytest.fail(f"No GPU with {mem_needed}MB free memory available") + pytest.fail( + f"No GPU with {mem_needed}MB(+30% buffer) free memory available" + ) diff --git a/test/suites/E2E/test_offline_inference.py b/test/suites/E2E/test_offline_inference.py index 345c759e5..ced06ff04 100644 --- a/test/suites/E2E/test_offline_inference.py +++ b/test/suites/E2E/test_offline_inference.py @@ -25,7 +25,7 @@ class TestBasicOfflineInference: @pytest.mark.stage(1) @pytest.mark.feature("offline_inference") - @pytest.mark.gpu_mem(30000) + @pytest.mark.gpu_mem(6000) @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) @pytest.mark.parametrize("max_tokens", [200]) @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half diff --git a/test/suites/E2E/test_offline_inference_sparse.py b/test/suites/E2E/test_offline_inference_sparse.py index 49ead5d3c..fda2532cb 100644 --- 
a/test/suites/E2E/test_offline_inference_sparse.py +++ b/test/suites/E2E/test_offline_inference_sparse.py @@ -25,7 +25,7 @@ class TestBasicOfflineInferenceSparse: @pytest.mark.stage(1) @pytest.mark.feature("offline_inference_sparse") - @pytest.mark.gpu_mem(30000) + @pytest.mark.gpu_mem(6000) @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) @pytest.mark.parametrize("max_tokens", [200]) @pytest.mark.parametrize("prompt_split_ratio", [0.5]) # Split prompt in half @@ -229,7 +229,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool: @pytest.mark.stage(1) @pytest.mark.feature("offline_inference_sparse") - @pytest.mark.gpu_mem(30000) + @pytest.mark.gpu_mem(6000) @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) @pytest.mark.parametrize("max_tokens", [200]) @pytest.mark.parametrize("enforce_eager", [False]) From a4928719f7292a25269ae30390dc87dae8e0a476 Mon Sep 17 00:00:00 2001 From: Yanzhao Wang Date: Fri, 23 Jan 2026 15:21:22 +0800 Subject: [PATCH 2/2] build docker for test, use gsa for sparse attention test --- .github/workflows/pull-request.yml | 63 ++++++++++--- docker/Dockerfile-onlyPC | 16 ++++ test/conftest.py | 33 ++++--- .../E2E/test_offline_inference_sparse.py | 92 ++++++++----------- 4 files changed, 120 insertions(+), 84 deletions(-) create mode 100644 docker/Dockerfile-onlyPC diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index f1b395765..f509bb99f 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -71,17 +71,30 @@ jobs: rm -rf .[!.]* fi - uses: actions/checkout@v4 + - name: Generate Docker Image Version + id: version + run: | + DATE=$(date +%Y%m%d) + SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7) + VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}" + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "Docker image version: ${VERSION}" - name: Build run: | cd ${{github.workspace}} - export PLATFORM=cuda - 
pip install -v -e . --no-build-isolation - - name: Test E2E + sudo docker build -t ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} -f ./docker/Dockerfile-onlyPC ./ + - name: Test E2E in Docker run: | - cd ${{github.workspace}} - cd test - pip install pytest pytest-cov pynvml pandas - python3 -m pytest --stage=1 --feature=offline_inference --junitxml=offline-inference.xml + sudo docker run --rm \ + -i \ + --gpus all \ + --network=host \ + --ipc=host \ + --cap-add IPC_LOCK \ + -v /home/models:/home/models \ + -v ${{github.workspace}}:/workspace \ + ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} \ + -c "cd /workspace/test && pip install pytest pytest-cov nvidia-ml-py pandas && python3 -m pytest --stage=1 --feature=offline_inference --junitxml=offline-inference.xml" - name: Upload pytest results uses: EnricoMi/publish-unit-test-result-action/linux@v2 if: (!cancelled()) @@ -89,6 +102,10 @@ jobs: files: | ${{github.workspace}}/test/offline-inference.xml check_name: Prefix cache test results + - name: Cleanup Docker Image + if: always() + run: | + sudo docker rmi ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} || true test-e2e-sparse-gpu: runs-on: gpu @@ -107,18 +124,30 @@ jobs: rm -rf .[!.]* fi - uses: actions/checkout@v4 + - name: Generate Docker Image Version + id: version + run: | + DATE=$(date +%Y%m%d) + SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7) + VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}" + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "Docker image version: ${VERSION}" - name: Build run: | cd ${{github.workspace}} - export PLATFORM=cuda - export ENABLE_SPARSE=TRUE - pip install -v -e .
--no-build-isolation - - name: Test E2E + sudo docker build -t ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} -f ./docker/Dockerfile ./ + - name: Test E2E in Docker run: | - cd ${{github.workspace}} - cd test - pip install pytest pytest-cov pynvml pandas - python3 -m pytest --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml + sudo docker run --rm \ + -i \ + --gpus all \ + --network=host \ + --ipc=host \ + --cap-add IPC_LOCK \ + -v /home/models:/home/models \ + -v ${{github.workspace}}:/workspace \ + ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} \ + -c "cd /workspace/test && pip install pytest pytest-cov nvidia-ml-py pandas && python3 -m pytest --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml" - name: Upload pytest results uses: EnricoMi/publish-unit-test-result-action/linux@v2 if: (!cancelled()) @@ -126,4 +155,8 @@ jobs: files: | ${{github.workspace}}/test/offline-inference-sparse.xml check_name: Sparse attention test results + - name: Cleanup Docker Image + if: always() + run: | + sudo docker rmi ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} || true diff --git a/docker/Dockerfile-onlyPC b/docker/Dockerfile-onlyPC new file mode 100644 index 000000000..0ad355476 --- /dev/null +++ b/docker/Dockerfile-onlyPC @@ -0,0 +1,16 @@ +# Set to other image if needed +FROM vllm/vllm-openai:v0.9.2 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" + +WORKDIR /workspace + +# Install unified-cache-management +COPY . 
/workspace/unified-cache-management + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +RUN export PLATFORM="cuda" && \ + pip install -v -e /workspace/unified-cache-management --no-build-isolation + +ENTRYPOINT ["/bin/bash"] \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index 784a29135..178fe0450 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -162,20 +162,25 @@ def pytest_runtest_logreport(report): def get_free_gpu(required_memory_mb): - mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM - pynvml.nvmlInit() - device_count = pynvml.nvmlDeviceGetCount() - device_indices = list(range(device_count)) - random.shuffle(device_indices) - for i in device_indices: # random order to reduce collisions - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - info = pynvml.nvmlDeviceGetMemoryInfo(handle) - free_in_mb = info.free / 1024**2 - if free_in_mb >= mem_needed_with_buffer: - utilization = ( - required_memory_mb * (1024**2) / info.total if info.total else 0 - ) - return i, free_in_mb, utilization + try: + mem_needed_with_buffer = int( + required_memory_mb * 1.3 + ) # add buffer to avoid OOM + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + device_indices = list(range(device_count)) + random.shuffle(device_indices) + for i in device_indices: # random order to reduce collisions + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + free_in_mb = info.free / 1024**2 + if free_in_mb >= mem_needed_with_buffer: + utilization = ( + required_memory_mb * (1024**2) / info.total if info.total else 0 + ) + return i, free_in_mb, utilization + finally: + pynvml.nvmlShutdown() return None, 0, 0 diff --git a/test/suites/E2E/test_offline_inference_sparse.py b/test/suites/E2E/test_offline_inference_sparse.py index fda2532cb..a818dc005 100644 --- a/test/suites/E2E/test_offline_inference_sparse.py +++ b/test/suites/E2E/test_offline_inference_sparse.py @@ -15,10 
+15,6 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams -from ucm.logger import init_logger - -logger = init_logger(__name__) - class TestBasicOfflineInferenceSparse: """Test basic offline inference functionality.""" @@ -67,11 +63,9 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse( test_prompt, standard_answers = load_prompt_from_file( Path(__file__).parent / "prompts" / "test_offline_inference.json" ) - logger.info( - f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)" - ) + print(f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)") if standard_answers: - logger.info(f"Standard answers: {standard_answers}") + print(f"Standard answers: {standard_answers}") else: pytest.fail(f"No standard answers found in prompt.json") except Exception as e: @@ -120,19 +114,19 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse( ignore_eos=False, ) - logger.info(f"\n===== HBM + SSD Mixed Accuracy Test =====") - logger.info(f"Model: {model_path}") - logger.info(f"Full prompt length: {len(test_prompt)} chars") - logger.info(f"Max tokens: {max_tokens}") - logger.info(f"Temperature: 0.0 (deterministic)") - logger.info(f"UCM storage: {ucm_storage_dir}") - logger.info(f"Prompt split ratio: {prompt_split_ratio}") - logger.info(f"Enforce eager: {enforce_eager}") - logger.info(f"Max num batched tokens: {max_num_batched_tokens}") + print(f"\n===== HBM + SSD Mixed Accuracy Test =====") + print(f"Model: {model_path}") + print(f"Full prompt length: {len(test_prompt)} chars") + print(f"Max tokens: {max_tokens}") + print(f"Temperature: 0.0 (deterministic)") + print(f"UCM storage: {ucm_storage_dir}") + print(f"Prompt split ratio: {prompt_split_ratio}") + print(f"Enforce eager: {enforce_eager}") + print(f"Max num batched tokens: {max_num_batched_tokens}") # ===== Phase 1: Disable HBM PC, save KV cache to SSD and load (baseline) ===== # Run Phase 1 in a separate subprocess to ensure GPU memory is fully released - logger.info(f"\n===== 
Phase 1: Save KV Cache to SSD And Load (Baseline) =====") + print(f"\n===== Phase 1: Save KV Cache to SSD And Load (Baseline) =====") # Convert SamplingParams to dict for serialization, as non-picklable objects cannot be passed to subprocess sampling_params_dict = to_dict_for_serialization(sampling_params) @@ -151,13 +145,13 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse( ) phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD load - logger.info(f"Phase 1 completed in subprocess") - logger.info(f'Phase 1.1 output: "{phase1_1_output}"') - logger.info(f'Phase 1.2 output: "{phase1_2_output}"') + print(f"Phase 1 completed in subprocess") + print(f'Phase 1.1 output: "{phase1_1_output}"') + print(f'Phase 1.2 output: "{phase1_2_output}"') # ===== Phase 2: Enable HBM PC, test HBM + SSD mixed hit ===== # Run Phase 2 in a separate subprocess to ensure GPU memory is fully released - logger.info(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====") + print(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====") phase2_outputs = run_in_spawn_subprocess( run_offline_inference, @@ -173,11 +167,11 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse( ) phase2_partial_output = phase2_outputs[0] phase2_full_output = phase2_outputs[1] - logger.info(f"Phase 2 completed in subprocess") - logger.info(f"[INFO] Phase 2.1 output: {phase2_partial_output}") - logger.info(f"[INFO] Phase 2.2 output: {phase2_full_output}") + print(f"Phase 2 completed in subprocess") + print(f"[INFO] Phase 2.1 output: {phase2_partial_output}") + print(f"[INFO] Phase 2.2 output: {phase2_full_output}") - logger.info(f"\n[INFO] ===== Accuracy Test Results =====") + print(f"\n[INFO] ===== Accuracy Test Results =====") # Note: Small numerical precision differences in KV cache loading can cause # punctuation token selection differences (e.g., full-width vs half-width comma) @@ -203,29 +197,25 @@ def match_any_answer(output: str, answers: list[str]) -> bool: 
phase1_1_output, standard_answers ) and match_any_answer(phase1_2_output, standard_answers) if not phase1_correct: - logger.warning( - f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) =====" - ) - logger.warning( + print(f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) =====") + print( f"Incorrect answer in Phase 1.1 (SSD save) or Phase 1.2 (SSD load) output!" ) - logger.warning(f"Phase 1.1 output:\n{phase1_1_output}") - logger.warning(f"Phase 1.2 output:\n{phase1_2_output}") - logger.warning(f"Standard answers:\n{standard_answers}") + print(f"Phase 1.1 output:\n{phase1_1_output}") + print(f"Phase 1.2 output:\n{phase1_2_output}") + print(f"Standard answers:\n{standard_answers}") pytest.fail("SSD Load Accuracy Test Failed!") # Phase 2.1 should be skipped from accuracy check since it's only partial prompt phase2_correct = match_any_answer(phase2_full_output, standard_answers) if not phase2_correct: - logger.warning( - f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) =====" - ) - logger.warning(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!") - logger.warning(f"Phase 2.2 output:\n{phase2_full_output}") - logger.warning(f"Standard answers:\n{standard_answers}") + print(f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) =====") + print(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!") + print(f"Phase 2.2 output:\n{phase2_full_output}") + print(f"Standard answers:\n{standard_answers}") pytest.fail("HBM + SSD Mixed Accuracy Test Failed!") - """Test ESA sparse attention.""" + """Test GSA sparse attention.""" @pytest.mark.stage(1) @pytest.mark.feature("offline_inference_sparse") @@ -234,7 +224,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool: @pytest.mark.parametrize("max_tokens", [200]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("max_num_batched_tokens", [2047]) - def test_offline_esa( + def test_offline_gsa( self, model_name: str, max_tokens: int, @@ -265,7 
+255,7 @@ def test_offline_esa( except Exception as e: pytest.fail(f"Failed to load prompt from prompt.json: {e}") - logger.info(f"Standard answers: {standard_answers}") + print(f"Standard answers: {standard_answers}") tokenizer = AutoTokenizer.from_pretrained(model_path, use_chat_template=True) @@ -296,15 +286,7 @@ def test_offline_esa( }, } ], - "ucm_sparse_config": { - "ESA": { - "init_window_sz": 1, - "local_window_sz": 2, - "min_blocks": 4, - "sparse_ratio": 0.3, - "retrieval_stride": 5, - } - }, + "ucm_sparse_config": {"GSAOnDevice": {}}, } sampling_params = SamplingParams( @@ -325,12 +307,12 @@ def test_offline_esa( sampling_params_dict, False, # enable_prefix_caching=False enforce_eager, - "ESA", + "GSA", max_num_batched_tokens, timeout=180, ) phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD load - logger.info(f"ESA inference completed in subprocess") - logger.info(f'Phase 1.1 output: "{phase1_1_output}"') - logger.info(f'Phase 1.2 output: "{phase1_2_output}"') + print(f"GSA inference completed in subprocess") + print(f'Phase 1.1 output: "{phase1_1_output}"') + print(f'Phase 1.2 output: "{phase1_2_output}"')