Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 48 additions & 15 deletions .github/workflows/pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,24 +71,41 @@ jobs:
rm -rf .[!.]*
fi
- uses: actions/checkout@v4
- name: Generate Docker Image Version
id: version
run: |
DATE=$(date +%Y%m%d)
SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7)
VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}"
echo "version=${VERSION}" >> $GITHUB_OUTPUT
echo "Docker image version: ${VERSION}"
- name: Build
run: |
cd ${{github.workspace}}
export PLATFORM=cuda
pip install -v -e . --no-build-isolation
- name: Test E2E
sudo docker build -t ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} -f ./docker/Dockerfile-onlyPC ./
- name: Test E2E in Docker
  # The container is run in the foreground on purpose: with -d (detached)
  # `docker run` returns immediately with status 0, so pytest failures would
  # never fail this step and the following upload/cleanup steps would start
  # while the tests are still running. Without -d the step blocks until
  # pytest exits and inherits its exit code. -i/-t are dropped as well since
  # nothing interactive is attached.
  run: |
    cd ${{github.workspace}}
    sudo docker run --rm \
      --gpus all \
      --network=host \
      --ipc=host \
      --cap-add IPC_LOCK \
      -v /home/models:/home/models \
      -v ${{github.workspace}}:/workspace \
      ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} \
      -c "cd /workspace/test && pip install pytest pytest-cov nvidia-ml-py pandas && python3 -m pytest --stage=1 --feature=offline_inference --junitxml=offline-inference.xml"
- name: Upload pytest results
uses: EnricoMi/publish-unit-test-result-action/linux@v2
if: (!cancelled())
with:
files: |
${{github.workspace}}/test/offline-inference.xml
check_name: Prefix cache test results
- name: Cleanup Docker Image
if: always()
run: |
sudo docker rmi ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} || true

test-e2e-sparse-gpu:
runs-on: gpu
Expand All @@ -107,23 +124,39 @@ jobs:
rm -rf .[!.]*
fi
- uses: actions/checkout@v4
- name: Generate Docker Image Version
id: version
run: |
DATE=$(date +%Y%m%d)
SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7)
VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}"
echo "version=${VERSION}" >> $GITHUB_OUTPUT
echo "Docker image version: ${VERSION}"
- name: Build
run: |
cd ${{github.workspace}}
export PLATFORM=cuda
export ENABLE_SPARSE=TRUE
pip install -v -e . --no-build-isolation
- name: Test E2E
sudo docker build -t ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} -f ./docker/Dockerfile ./
- name: Test E2E in Docker
  # The container is run in the foreground on purpose: with -d (detached)
  # `docker run` returns immediately with status 0, so pytest failures would
  # never fail this step and the following upload/cleanup steps would start
  # while the tests are still running. Without -d the step blocks until
  # pytest exits and inherits its exit code. -i/-t are dropped as well since
  # nothing interactive is attached.
  run: |
    cd ${{github.workspace}}
    sudo docker run --rm \
      --gpus all \
      --network=host \
      --ipc=host \
      --cap-add IPC_LOCK \
      -v /home/models:/home/models \
      -v ${{github.workspace}}:/workspace \
      ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} \
      -c "cd /workspace/test && pip install pytest pytest-cov nvidia-ml-py pandas && python3 -m pytest --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml"
- name: Upload pytest results
uses: EnricoMi/publish-unit-test-result-action/linux@v2
if: (!cancelled())
with:
files: |
${{github.workspace}}/test/offline-inference-sparse.xml
check_name: Sparse attention test results
- name: Cleanup Docker Image
if: always()
run: |
sudo docker rmi ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} || true

16 changes: 16 additions & 0 deletions docker/Dockerfile-onlyPC
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Builds an image with unified-cache-management installed for PLATFORM=cuda
# on top of the vLLM OpenAI image (presumably the prefix-cache-only variant,
# going by the "onlyPC" file name -- confirm).

# Set to other image if needed
FROM vllm/vllm-openai:v0.9.2

# PyPI index used by every later pip call in this image; override with
#   docker build --build-arg PIP_INDEX_URL=<url> ...
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"

WORKDIR /workspace

# Install unified-cache-management
# NOTE(review): the package is installed in editable mode from
# /workspace/unified-cache-management. A bind mount over /workspace at run
# time (e.g. `-v <checkout>:/workspace`) hides this directory, so the
# editable install would point at a path that no longer exists inside the
# container -- confirm that callers mount elsewhere or that this is intended.
COPY . /workspace/unified-cache-management

# Quoted so the URL reaches pip as a single argument (no word splitting or
# glob expansion of the build-arg value).
RUN pip config set global.index-url "${PIP_INDEX_URL}"

# PLATFORM is exported only for this RUN; it is not persisted in the image.
RUN export PLATFORM="cuda" && \
    pip install -v -e /workspace/unified-cache-management --no-build-isolation

# Containers start bash; `docker run <image> -c "<cmd>"` executes <cmd>.
ENTRYPOINT ["/bin/bash"]
33 changes: 19 additions & 14 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,20 +162,25 @@ def pytest_runtest_logreport(report):


def get_free_gpu(required_memory_mb):
    """Pick a GPU that currently has enough free memory.

    Devices are probed in random order and the first one whose free memory
    is at least 1.3x ``required_memory_mb`` is chosen.

    Args:
        required_memory_mb: Memory the caller needs, in MiB.

    Returns:
        A tuple ``(index, free_in_mb, utilization)`` where ``utilization`` is
        ``required_memory_mb`` expressed as a fraction of the device's total
        memory (0 if the reported total is 0), or ``(None, 0, 0)`` when no
        device has enough free memory.
    """
    mem_needed_with_buffer = int(required_memory_mb * 1.3)  # add buffer to avoid OOM

    # Initialise before entering the try block: if nvmlInit() itself fails
    # there is nothing to shut down, and calling nvmlShutdown() from the
    # finally clause would raise a second error that replaces the real cause.
    pynvml.nvmlInit()
    try:
        device_indices = list(range(pynvml.nvmlDeviceGetCount()))
        random.shuffle(device_indices)  # random order to reduce collisions
        for i in device_indices:
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            free_in_mb = info.free / 1024**2
            if free_in_mb >= mem_needed_with_buffer:
                utilization = (
                    required_memory_mb * (1024**2) / info.total if info.total else 0
                )
                return i, free_in_mb, utilization
        return None, 0, 0
    finally:
        # Always pair the successful nvmlInit() above with a shutdown.
        pynvml.nvmlShutdown()


Expand Down
92 changes: 37 additions & 55 deletions test/suites/E2E/test_offline_inference_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

from ucm.logger import init_logger

logger = init_logger(__name__)


class TestBasicOfflineInferenceSparse:
"""Test basic offline inference functionality."""
Expand Down Expand Up @@ -67,11 +63,9 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse(
test_prompt, standard_answers = load_prompt_from_file(
Path(__file__).parent / "prompts" / "test_offline_inference.json"
)
logger.info(
f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)"
)
print(f"Loaded prompt from prompt.json (length: {len(test_prompt)} chars)")
if standard_answers:
logger.info(f"Standard answers: {standard_answers}")
print(f"Standard answers: {standard_answers}")
else:
pytest.fail(f"No standard answers found in prompt.json")
except Exception as e:
Expand Down Expand Up @@ -120,19 +114,19 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse(
ignore_eos=False,
)

logger.info(f"\n===== HBM + SSD Mixed Accuracy Test =====")
logger.info(f"Model: {model_path}")
logger.info(f"Full prompt length: {len(test_prompt)} chars")
logger.info(f"Max tokens: {max_tokens}")
logger.info(f"Temperature: 0.0 (deterministic)")
logger.info(f"UCM storage: {ucm_storage_dir}")
logger.info(f"Prompt split ratio: {prompt_split_ratio}")
logger.info(f"Enforce eager: {enforce_eager}")
logger.info(f"Max num batched tokens: {max_num_batched_tokens}")
print(f"\n===== HBM + SSD Mixed Accuracy Test =====")
print(f"Model: {model_path}")
print(f"Full prompt length: {len(test_prompt)} chars")
print(f"Max tokens: {max_tokens}")
print(f"Temperature: 0.0 (deterministic)")
print(f"UCM storage: {ucm_storage_dir}")
print(f"Prompt split ratio: {prompt_split_ratio}")
print(f"Enforce eager: {enforce_eager}")
print(f"Max num batched tokens: {max_num_batched_tokens}")

# ===== Phase 1: Disable HBM PC, save KV cache to SSD and load (baseline) =====
# Run Phase 1 in a separate subprocess to ensure GPU memory is fully released
logger.info(f"\n===== Phase 1: Save KV Cache to SSD And Load (Baseline) =====")
print(f"\n===== Phase 1: Save KV Cache to SSD And Load (Baseline) =====")

# Convert SamplingParams to dict for serialization, as non-picklable objects cannot be passed to subprocess
sampling_params_dict = to_dict_for_serialization(sampling_params)
Expand All @@ -151,13 +145,13 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse(
)
phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save
phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD load
logger.info(f"Phase 1 completed in subprocess")
logger.info(f'Phase 1.1 output: "{phase1_1_output}"')
logger.info(f'Phase 1.2 output: "{phase1_2_output}"')
print(f"Phase 1 completed in subprocess")
print(f'Phase 1.1 output: "{phase1_1_output}"')
print(f'Phase 1.2 output: "{phase1_2_output}"')

# ===== Phase 2: Enable HBM PC, test HBM + SSD mixed hit =====
# Run Phase 2 in a separate subprocess to ensure GPU memory is fully released
logger.info(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====")
print(f"\n===== Phase 2: HBM + SSD Mixed Hit Test =====")

phase2_outputs = run_in_spawn_subprocess(
run_offline_inference,
Expand All @@ -173,11 +167,11 @@ def test_offline_accuracy_hbm_ssd_mixed_nosparse(
)
phase2_partial_output = phase2_outputs[0]
phase2_full_output = phase2_outputs[1]
logger.info(f"Phase 2 completed in subprocess")
logger.info(f"[INFO] Phase 2.1 output: {phase2_partial_output}")
logger.info(f"[INFO] Phase 2.2 output: {phase2_full_output}")
print(f"Phase 2 completed in subprocess")
print(f"[INFO] Phase 2.1 output: {phase2_partial_output}")
print(f"[INFO] Phase 2.2 output: {phase2_full_output}")

logger.info(f"\n[INFO] ===== Accuracy Test Results =====")
print(f"\n[INFO] ===== Accuracy Test Results =====")

# Note: Small numerical precision differences in KV cache loading can cause
# punctuation token selection differences (e.g., full-width vs half-width comma)
Expand All @@ -203,29 +197,25 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
phase1_1_output, standard_answers
) and match_any_answer(phase1_2_output, standard_answers)
if not phase1_correct:
logger.warning(
f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) ====="
)
logger.warning(
print(f"\n===== Phase 1: SSD Load Accuracy Test (Exact Match) =====")
print(
f"Incorrect answer in Phase 1.1 (SSD save) or Phase 1.2 (SSD load) output!"
)
logger.warning(f"Phase 1.1 output:\n{phase1_1_output}")
logger.warning(f"Phase 1.2 output:\n{phase1_2_output}")
logger.warning(f"Standard answers:\n{standard_answers}")
print(f"Phase 1.1 output:\n{phase1_1_output}")
print(f"Phase 1.2 output:\n{phase1_2_output}")
print(f"Standard answers:\n{standard_answers}")
pytest.fail("SSD Load Accuracy Test Failed!")

# Phase 2.1 should be skipped from accuracy check since it's only partial prompt
phase2_correct = match_any_answer(phase2_full_output, standard_answers)
if not phase2_correct:
logger.warning(
f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) ====="
)
logger.warning(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!")
logger.warning(f"Phase 2.2 output:\n{phase2_full_output}")
logger.warning(f"Standard answers:\n{standard_answers}")
print(f"\n===== Phase 2: HBM + SSD Mixed Accuracy Test (Exact Match) =====")
print(f"Incorrect answer in Phase 2.2 (HBM + SSD mixed) output!")
print(f"Phase 2.2 output:\n{phase2_full_output}")
print(f"Standard answers:\n{standard_answers}")
pytest.fail("HBM + SSD Mixed Accuracy Test Failed!")

"""Test ESA sparse attention."""
"""Test GSA sparse attention."""

@pytest.mark.stage(1)
@pytest.mark.feature("offline_inference_sparse")
Expand All @@ -234,7 +224,7 @@ def match_any_answer(output: str, answers: list[str]) -> bool:
@pytest.mark.parametrize("max_tokens", [200])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("max_num_batched_tokens", [2047])
def test_offline_esa(
def test_offline_gsa(
self,
model_name: str,
max_tokens: int,
Expand Down Expand Up @@ -265,7 +255,7 @@ def test_offline_esa(
except Exception as e:
pytest.fail(f"Failed to load prompt from prompt.json: {e}")

logger.info(f"Standard answers: {standard_answers}")
print(f"Standard answers: {standard_answers}")

tokenizer = AutoTokenizer.from_pretrained(model_path, use_chat_template=True)

Expand Down Expand Up @@ -296,15 +286,7 @@ def test_offline_esa(
},
}
],
"ucm_sparse_config": {
"ESA": {
"init_window_sz": 1,
"local_window_sz": 2,
"min_blocks": 4,
"sparse_ratio": 0.3,
"retrieval_stride": 5,
}
},
"ucm_sparse_config": {"GSAOnDevice": {}},
}

sampling_params = SamplingParams(
Expand All @@ -325,12 +307,12 @@ def test_offline_esa(
sampling_params_dict,
False, # enable_prefix_caching=False
enforce_eager,
"ESA",
"GSA",
max_num_batched_tokens,
timeout=180,
)
phase1_1_output = phase1_outputs[0] # Phase 1.1: SSD save
phase1_2_output = phase1_outputs[1] # Phase 1.2: SSD load
logger.info(f"ESA inference completed in subprocess")
logger.info(f'Phase 1.1 output: "{phase1_1_output}"')
logger.info(f'Phase 1.2 output: "{phase1_2_output}"')
print(f"ESA inference completed in subprocess")
print(f'Phase 1.1 output: "{phase1_1_output}"')
print(f'Phase 1.2 output: "{phase1_2_output}"')
Loading