
Commit ff169c3

Merge branch 'main' into dev/iirzynsk/test_prefix_caching
2 parents: 79a1ece + 8c08770

File tree

2 files changed: +68 -13 lines changed


docs/dev_guide/ci-failures.md

Lines changed: 51 additions & 1 deletion
@@ -1,3 +1,53 @@
 # CI Failures
 
-WIP
+## CI
+
+For all PRs created in the vllm-gaudi repository, all CI checks are required:
+- pre-commit & DCO
+- HPU tests
+- HPU Gaudi tests
+
+### Pre-commit & DCO
+To install, run:
+
+```pre-commit install```
+
+This way all of your commits should be correctly formatted and signed off. If you need to sign off your commits manually, remember to use ```git commit -s``` to pass the DCO check.
+
+### HPU tests
+HPU tests consist of several test suites:
+- pre-merge tests
+- unit tests
+- perf test
+- feature tests
+- e2e tests
+
+All of the above tests are mandatory. They run in fail-fast mode, meaning that if one test fails, the remaining tests won't be triggered.
+
+### HPU Gaudi tests
+Additional Gaudi tests are expected to pass, but aren't mandatory. These tests run on an internal Jenkins system, so their results are internal only. They can be triggered by CODEOWNERs and TESTOWNERs only.
+
+## Docs Pull Requests
+PRs that do not touch code, such as docstring changes or README updates, can be merged without the HPU tests and Gaudi tests. Passing the pre-commit check is still required.
+
+## Hourly Checks and Tests
+In the vllm-gaudi repository, the hourly tests can be found in ```Hourly Commit Check and Tests``` under the ```Actions``` tab. This tab also allows developers to manually trigger the hourly tests on a selected branch.
+
+If the last hourly test is failing, it means that the vllm-gaudi main branch doesn't work with the newest upstream main commit. To find the last good commit, check [last good commit](https://github.com/vllm-project/vllm-gaudi/blob/vllm/last-good-commit-for-vllm-gaudi/VLLM_STABLE_COMMIT).
+
+Failing hourly checks will be fixed by developers as soon as possible.
+
+## Troubleshooting
+### Unrelated failures
+Sometimes failures are unrelated to your specific code changes and are often caused by connection problems. In this case the failed checks should be rerun. Such errors include:
+- ```Error response from daemon: No such container```
+- ```ValueError: Unsupported device: the device type is 7.```
+- ```[Device not found] Device acquire failed.```
+
+### Accuracy and functionality issues
+Accuracy issues can be tracked in the HPU Gaudi tests with gsm8k runs. If any check fails on accuracy (accuracy too low compared to the measured baseline) or on functionality, the **PR can't be merged** until the issue is solved.
+
+### Pre-commit failures
+To run the pre-commit check manually, run:
+
+```pre-commit run --show-diff-on-failure --color=always --all-files --hook-stage manual```
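
The "Hourly Checks and Tests" section above points at the VLLM_STABLE_COMMIT file for the last known-good upstream commit. As a minimal illustrative sketch (not part of this commit), that file can also be read programmatically, assuming the standard raw.githubusercontent.com layout and the vllm/last-good-commit-for-vllm-gaudi branch shown in the link:

```python
from urllib.request import urlopen

# Assumed raw URL for the VLLM_STABLE_COMMIT pointer referenced in the doc above.
RAW_URL = ("https://raw.githubusercontent.com/vllm-project/vllm-gaudi/"
           "vllm/last-good-commit-for-vllm-gaudi/VLLM_STABLE_COMMIT")

with urlopen(RAW_URL) as resp:
    # The file is expected to contain a single commit hash.
    stable_commit = resp.read().decode().strip()

print(f"Last known-good upstream vLLM commit: {stable_commit}")
```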

vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 17 additions & 12 deletions
@@ -3007,19 +3007,24 @@ def execute_model(
 
             self.event_start = self.profiler.get_timestamp_us()
             self.profiler.start("internal", "prefill")
-            # Align behavior of incomplete prompt with gpu_model_runner
-            # If logits_indices is smaller than req_id,
-            # add the last token position
+            # NOTE(tianmu-li): Align behavior of incomplete prompt with gpu_model_runner
+            # If logits_indices is smaller than req_id, the last request is a chunked prompt request that
+            # hasn't finished in this step. We add the last token position to logits_indices to ensure
+            # the last token of the chunk is sampled. This sampled token will be discarded later
             if logits_indices.shape[0] < len(req_id):
-                if structured_output:
-                    logits_append = torch.tensor([torch.sum(prompt_len) - 1],
-                                                 device=token_ids.device,
-                                                 dtype=torch.int32)
-                    logits_indices = torch.cat([logits_indices, logits_append])
-                elif self.use_async_scheduling:
-                    # Discard partial prefill logits for async scheduling
+                if structured_output or self.use_async_scheduling:
+                    # When there are multiple requests in the batch (e.g. self.use_merged_prefill=True),
+                    # the last token position is the sum of all prompt lengths - 1
+                    # This logic also holds when there is only one request in the batch
+                    logits_indices_append = torch.tensor([torch.sum(prompt_len) - 1],
+                                                         device=token_ids.device,
+                                                         dtype=torch.int32)
+                    logits_indices = torch.cat([logits_indices, logits_indices_append])
+                if self.use_async_scheduling:
+                    # Discard partial prefill logit for async scheduling
                     # Depends on 1 decode token/batch
-                    invalid_req_indices.append(num_decodes + idx)
+                    prefill_start_idx = num_decodes
+                    invalid_req_indices.append(prefill_start_idx + idx)
             htorch.core.mark_step()
             non_flattened_hidden_states, aux_hidden_states, \
                 sample_hidden_states, logits_device = \
@@ -3321,7 +3326,7 @@ def execute_model(
             return AsyncHPUModelRunnerOutput(
                 model_runner_output=model_runner_output,
                 sampled_token_ids=sampled_token_ids,
-                invalid_req_indices=[],
+                invalid_req_indices=invalid_req_indices,
                 async_output_copy_stream=self.async_output_copy_stream,
             )
        model_runner_output = ModelRunnerOutput(
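
The added comments in the first hunk describe the logits-index padding: when the last request in the batch is an unfinished chunked prompt, the position of the chunk's last token (the sum of all prompt lengths in the, possibly merged, prefill batch minus one) is appended to logits_indices so that token still gets sampled, and under async scheduling the corresponding output row is recorded in invalid_req_indices so the extra token is discarded later. Below is a minimal standalone sketch of that behavior with hypothetical values; the real prompt_len, req_id, num_decodes, and idx come from the runner's state, not from this snippet.

```python
import torch

# Hypothetical batch: 2 decode requests followed by 3 merged prefill requests,
# the last of which is a chunked prompt that hasn't finished in this step.
prompt_len = torch.tensor([5, 7, 4])
req_id = ["req-0", "req-1", "req-2"]
num_decodes = 2
use_async_scheduling = True
structured_output = False
# Only two logits indices were produced, i.e. fewer than len(req_id).
logits_indices = torch.tensor([4, 11], dtype=torch.int32)
invalid_req_indices: list[int] = []

if logits_indices.shape[0] < len(req_id):
    if structured_output or use_async_scheduling:
        # Last token position of the merged prefill batch: sum of prompt lengths - 1.
        logits_indices_append = torch.tensor([torch.sum(prompt_len) - 1],
                                             dtype=torch.int32)
        logits_indices = torch.cat([logits_indices, logits_indices_append])
    if use_async_scheduling:
        # Mark the extra sampled token as invalid so it is discarded later;
        # its output row sits after the decode rows, hence the num_decodes offset.
        idx = len(req_id) - 1  # index of the chunked prefill request (hypothetical)
        prefill_start_idx = num_decodes
        invalid_req_indices.append(prefill_start_idx + idx)

print(logits_indices.tolist())  # [4, 11, 15]
print(invalid_req_indices)      # [4]
```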
