diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 3043a7d75..26d615d1f 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -76,8 +76,6 @@ jobs: test-e2e-pc-gpu: runs-on: gpu needs: lint-and-unit-tests - env: - BUILD_TYPE: Release permissions: checks: write pull-requests: write @@ -90,17 +88,39 @@ jobs: rm -rf .[!.]* fi - uses: actions/checkout@v4 + - name: Install Docker CLI + run: | + if ! command -v docker &> /dev/null; then + echo "Docker CLI not found, installing..." + sudo apt-get update + sudo apt-get install -y docker.io + else + echo "Docker CLI already installed" + fi + - name: Generate Docker Image Version + id: version + run: | + DATE=$(date +%Y%m%d) + SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7) + VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}" + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "Docker image version: ${VERSION}" - name: Build run: | cd ${{github.workspace}} - export PLATFORM=cuda - pip install -v -e . 
--no-build-isolation - - name: Test E2E + sudo docker build -t ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} -f ./docker/Dockerfile-onlyPC ./ + - name: Test E2E in Docker run: | - cd ${{github.workspace}} - cd test - pip install -r requirements.txt - python3 -m pytest -x --stage=1 --feature=offline_inference --junitxml=offline-inference.xml + sudo docker run --rm \ + -i \ + --gpus all \ + --network=host \ + --ipc=host \ + --cap-add IPC_LOCK \ + -v /home/models:/home/models \ + -v ${{github.workspace}}:/workspace \ + ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} \ + -c "cd /workspace/test && pip install -r requirements.txt && python3 -m pytest -x --stage=1 --feature=offline_inference --junitxml=offline-inference.xml" - name: Upload pytest results uses: EnricoMi/publish-unit-test-result-action/linux@v2 if: (!cancelled()) @@ -108,12 +128,14 @@ jobs: files: | ${{github.workspace}}/test/offline-inference.xml check_name: Prefix cache test results + - name: Cleanup Docker Image + if: always() + run: | + sudo docker rmi ucm-e2etest-gpu-pc:${{ steps.version.outputs.version }} || true test-e2e-sparse-gpu: runs-on: gpu needs: lint-and-unit-tests - env: - BUILD_TYPE: Release permissions: checks: write pull-requests: write @@ -126,18 +148,39 @@ jobs: rm -rf .[!.]* fi - uses: actions/checkout@v4 + - name: Install Docker CLI + run: | + if ! command -v docker &> /dev/null; then + echo "Docker CLI not found, installing..." + sudo apt-get update + sudo apt-get install -y docker.io + else + echo "Docker CLI already installed" + fi + - name: Generate Docker Image Version + id: version + run: | + DATE=$(date +%Y%m%d) + SHORT_SHA=$(echo '${{ github.sha }}' | cut -c1-7) + VERSION="${{ github.ref_name }}-${DATE}-${{ github.run_number }}-${SHORT_SHA}" + echo "version=${VERSION}" >> $GITHUB_OUTPUT + echo "Docker image version: ${VERSION}" - name: Build run: | cd ${{github.workspace}} - export PLATFORM=cuda - export ENABLE_SPARSE=TRUE - pip install -v -e . 
--no-build-isolation - - name: Test E2E + sudo docker build -t ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} -f ./docker/Dockerfile ./ + - name: Test E2E in Docker run: | - cd ${{github.workspace}} - cd test - pip install -r requirements.txt - python3 -m pytest -x --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml + sudo docker run --rm \ + -i \ + --gpus all \ + --network=host \ + --ipc=host \ + --cap-add IPC_LOCK \ + -v /home/models:/home/models \ + -v ${{github.workspace}}:/workspace \ + ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} \ + -c "cd /workspace/test && pip install -r requirements.txt && python3 -m pytest -x --stage=1 --feature=offline_inference_sparse --junitxml=offline-inference-sparse.xml" - name: Upload pytest results uses: EnricoMi/publish-unit-test-result-action/linux@v2 if: (!cancelled()) @@ -145,4 +188,8 @@ jobs: files: | ${{github.workspace}}/test/offline-inference-sparse.xml check_name: Sparse attention test results + - name: Cleanup Docker Image + if: always() + run: | + sudo docker rmi ucm-e2etest-gpu-sparse:${{ steps.version.outputs.version }} || true diff --git a/docker/Dockerfile-onlyPC b/docker/Dockerfile-onlyPC new file mode 100644 index 000000000..0ad355476 --- /dev/null +++ b/docker/Dockerfile-onlyPC @@ -0,0 +1,16 @@ +# Set to other image if needed +FROM vllm/vllm-openai:v0.9.2 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" + +WORKDIR /workspace + +# Install unified-cache-management +COPY . 
/workspace/unified-cache-management + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +RUN export PLATFORM="cuda" && \ + pip install -v -e /workspace/unified-cache-management --no-build-isolation + +ENTRYPOINT ["/bin/bash"] \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index ae45bb244..b8735e220 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -164,20 +164,25 @@ def pytest_runtest_logreport(report): def get_free_gpu(required_memory_mb): - mem_needed_with_buffer = int(required_memory_mb * 1.3) # add buffer to avoid OOM - pynvml.nvmlInit() - device_count = pynvml.nvmlDeviceGetCount() - device_indices = list(range(device_count)) - random.shuffle(device_indices) - for i in device_indices: # random order to reduce collisions - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - info = pynvml.nvmlDeviceGetMemoryInfo(handle) - free_in_mb = info.free / 1024**2 - if free_in_mb >= mem_needed_with_buffer: - utilization = ( - required_memory_mb * (1024**2) / info.total if info.total else 0 - ) - return i, free_in_mb, utilization + try: + mem_needed_with_buffer = int( + required_memory_mb * 1.3 + ) # add buffer to avoid OOM + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + device_indices = list(range(device_count)) + random.shuffle(device_indices) + for i in device_indices: # random order to reduce collisions + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + free_in_mb = info.free / 1024**2 + if free_in_mb >= mem_needed_with_buffer: + utilization = ( + required_memory_mb * (1024**2) / info.total if info.total else 0 + ) + return i, free_in_mb, utilization + finally: + pynvml.nvmlShutdown() return None, 0, 0 @@ -189,7 +194,7 @@ def setup_gpu_resource(request): gpu_id, free_in_mb, gpu_utilization = get_free_gpu(mem_needed) if gpu_id is not None: print( - f"Allocating GPU {gpu_id} with {free_in_mb}MB free memory, gpu utilization {gpu_utilization:.4%}" + f"Allocating GPU 
{gpu_id} with {free_in_mb}MB free memory, gpu utilization for test {gpu_utilization:.4%}" ) os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) if gpu_utilization: diff --git a/test/suites/E2E/test_offline_inference_sparse.py b/test/suites/E2E/test_offline_inference_sparse.py index 15b7ae886..79553523e 100644 --- a/test/suites/E2E/test_offline_inference_sparse.py +++ b/test/suites/E2E/test_offline_inference_sparse.py @@ -215,16 +215,16 @@ def match_any_answer(output: str, answers: list[str]) -> bool: print(f"Standard answers:\n{standard_answers}") pytest.fail("HBM + SSD Mixed Accuracy Test Failed!") - """Test ESA sparse attention.""" + """Test GSA sparse attention.""" @pytest.mark.stage(1) @pytest.mark.feature("offline_inference_sparse") @pytest.mark.gpu_mem(6000) @pytest.mark.parametrize("model_name", ["Qwen2.5-1.5B-Instruct"]) @pytest.mark.parametrize("max_tokens", [200]) - @pytest.mark.parametrize("enforce_eager", [False]) - @pytest.mark.parametrize("max_num_batched_tokens", [2047]) - def test_offline_esa( + @pytest.mark.parametrize("enforce_eager", [True]) + @pytest.mark.parametrize("max_num_batched_tokens", [30000]) + def test_offline_gsa( self, model_name: str, max_tokens: int, @@ -286,15 +286,7 @@ def test_offline_esa( }, } ], - "ucm_sparse_config": { - "ESA": { - "init_window_sz": 1, - "local_window_sz": 2, - "min_blocks": 4, - "sparse_ratio": 0.3, - "retrieval_stride": 5, - } - }, + "ucm_sparse_config": {"GSAOnDevice": {}}, } sampling_params = SamplingParams( @@ -315,7 +307,7 @@ def test_offline_esa( sampling_params_dict, False, # enable_prefix_caching=False enforce_eager, - "ESA", + "GSA", max_num_batched_tokens, timeout=180, )