From 2b1227b228bfcf448a937c220344a12aedd55a69 Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Mon, 13 Oct 2025 16:09:55 +0300 Subject: [PATCH 1/5] Use query in linear flags Signed-off-by: Agata Dobrzyniewicz --- docs/configuration/env_vars.md | 8 +++---- vllm_gaudi/extension/bucketing/linear.py | 28 ++++++++++++++++++++---- vllm_gaudi/extension/features.py | 7 +++--- 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index 8451c7a6..dc69eaef 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -50,9 +50,9 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `32` - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `max_num_prefill_seqs` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - query length min (`VLLM_PROMPT_QUERY_BUCKET_MIN`): `block_size` + - query length step (`VLLM_PROMPT_QUERY_BUCKET_STEP`): `block_size` + - query length max (`VLLM_PROMPT_QUERY_BUCKET_MAX`): `max_model_len` - sequence ctx min (`VLLM_PROMPT_CTX_BUCKET_MIN`): `0` - sequence ctx step (`VLLM_PROMPT_CTX_BUCKET_STEP`): `1` - sequence ctx max (`VLLM_PROMPT_CTX_BUCKET_MAX`): `(max_model_len - block_size) // block_size` @@ -60,6 +60,6 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `32` - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `1` - block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max_blocks` diff --git a/vllm_gaudi/extension/bucketing/linear.py b/vllm_gaudi/extension/bucketing/linear.py index 4e677a29..53ad6e6a 100644 --- a/vllm_gaudi/extension/bucketing/linear.py +++ b/vllm_gaudi/extension/bucketing/linear.py @@ -17,7 +17,7 @@ def get_prompt_cfgs(self, max_num_prefill_seqs, block_size, max_num_batched_toke prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=max_num_prefill_seqs) prompt_query_bucket_cfg = read_bucket_settings('prompt', - 'seq', + 'query', min=block_size, step=block_size, max=max_model_len) @@ -92,9 +92,29 @@ def read_bucket_settings(phase: str, dim: str, **defaults): params = ['min', 'step', 'max'] env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for p in params] default_values = [defaults[p] for p in params] - values = [int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values)] - for e, v, d in zip(env_vars, values, default_values): - logger().info(f'{e}={v} (default:{d})') + values = [] + + used_dim = dim # Track which dim was actually used + + for p, e, d in zip(params, env_vars, default_values): + val = os.environ.get(e) + + if val is None and dim == 'query': + # Check if fallback 'seq' flag is set + fallback_env = f'VLLM_{phase}_SEQ_BUCKET_{p}'.upper() + fallback_val = os.environ.get(fallback_env) + + if fallback_val is not None: + val = fallback_val + used_dim = 'seq' # Treat as if query used seq values + logger().warning( + f"{e} not set, using {fallback_env}={fallback_val} instead. " + "This fallback behavior is deprecated and will be removed in future versions." + ) + resolved_val = int(val) if val is not None else d + logger().info(f'{e}={resolved_val} (default:{d})') + values.append(resolved_val) + return values diff --git a/vllm_gaudi/extension/features.py b/vllm_gaudi/extension/features.py index a2002a82..a34bdc45 100644 --- a/vllm_gaudi/extension/features.py +++ b/vllm_gaudi/extension/features.py @@ -18,22 +18,21 @@ def get_user_flags(): Env('VLLM_PROMPT_BS_BUCKET_MIN', int), Env('VLLM_PROMPT_BS_BUCKET_STEP', int), Env('VLLM_PROMPT_BS_BUCKET_MAX', int), - Env('VLLM_PROMPT_BS_BUCKET_LIMIT', int), + Env('VLLM_PROMPT_QUERY_BUCKET_MIN', int), + Env('VLLM_PROMPT_QUERY_BUCKET_STEP', int), + Env('VLLM_PROMPT_QUERY_BUCKET_MAX', int), Env('VLLM_PROMPT_SEQ_BUCKET_MIN', int), Env('VLLM_PROMPT_SEQ_BUCKET_STEP', int), Env('VLLM_PROMPT_SEQ_BUCKET_MAX', int), - Env('VLLM_PROMPT_SEQ_BUCKET_LIMIT', int), Env('VLLM_PROMPT_CTX_BUCKET_MIN', int), Env('VLLM_PROMPT_CTX_BUCKET_STEP', int), Env('VLLM_PROMPT_CTX_BUCKET_MAX', int), Env('VLLM_DECODE_BS_BUCKET_MIN', int), Env('VLLM_DECODE_BS_BUCKET_STEP', int), Env('VLLM_DECODE_BS_BUCKET_MAX', int), - Env('VLLM_DECODE_BS_BUCKET_LIMIT', int), Env('VLLM_DECODE_BLOCK_BUCKET_MIN', int), Env('VLLM_DECODE_BLOCK_BUCKET_STEP', int), Env('VLLM_DECODE_BLOCK_BUCKET_MAX', int), - Env('VLLM_DECODE_BLOCK_BUCKET_LIMIT', int), # Non-vllm flags that are also important to print Env('EXPERIMENTAL_WEIGHT_SHARING', str), From b7f8d59eab099196884587eb281f7a290ea7f263 Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Mon, 13 Oct 2025 16:30:25 +0300 Subject: [PATCH 2/5] Add version Signed-off-by: Agata Dobrzyniewicz --- vllm_gaudi/extension/bucketing/linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/extension/bucketing/linear.py b/vllm_gaudi/extension/bucketing/linear.py index 53ad6e6a..57ff41c1 100644 --- a/vllm_gaudi/extension/bucketing/linear.py +++ b/vllm_gaudi/extension/bucketing/linear.py @@ -109,7 +109,7 @@ def read_bucket_settings(phase: str, dim: str, **defaults): used_dim = 'seq' # Treat as if query used seq values logger().warning( f"{e} not set, using {fallback_env}={fallback_val} instead. " - "This fallback behavior is deprecated and will be removed in future versions." + "This fallback behavior is deprecated and will be removed in v0.12.0." ) resolved_val = int(val) if val is not None else d logger().info(f'{e}={resolved_val} (default:{d})') From b81d2df742d03b6636fda59dd9830d39b4dcad4e Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Tue, 14 Oct 2025 11:33:21 +0300 Subject: [PATCH 3/5] After review Signed-off-by: Agata Dobrzyniewicz --- vllm_gaudi/extension/bucketing/linear.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm_gaudi/extension/bucketing/linear.py b/vllm_gaudi/extension/bucketing/linear.py index 57ff41c1..ce25c250 100644 --- a/vllm_gaudi/extension/bucketing/linear.py +++ b/vllm_gaudi/extension/bucketing/linear.py @@ -85,7 +85,7 @@ def read_bucket_settings(phase: str, dim: str, **defaults): """Read bucketing configuration from env variables. phase is either 'prompt' or 'decode' - dim is either 'bs', 'seq' or 'block' + dim is either 'bs', 'query' or 'block' param is either 'min', 'step' or 'max' example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 """ @@ -94,8 +94,6 @@ def read_bucket_settings(phase: str, dim: str, **defaults): default_values = [defaults[p] for p in params] values = [] - used_dim = dim # Track which dim was actually used - for p, e, d in zip(params, env_vars, default_values): val = os.environ.get(e) @@ -106,9 +104,8 @@ def read_bucket_settings(phase: str, dim: str, **defaults): if fallback_val is not None: val = fallback_val - used_dim = 'seq' # Treat as if query used seq values logger().warning( - f"{e} not set, using {fallback_env}={fallback_val} instead. " + f"{e} not set, using {fallback_env} value ({fallback_val}) instead. " "This fallback behavior is deprecated and will be removed in v0.12.0." ) resolved_val = int(val) if val is not None else d From 7481c5fda0e66c6816ecbecbe004e4f52459d4be Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Wed, 29 Oct 2025 09:07:04 +0200 Subject: [PATCH 4/5] Precommit Signed-off-by: Agata Dobrzyniewicz --- vllm_gaudi/extension/bucketing/linear.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm_gaudi/extension/bucketing/linear.py b/vllm_gaudi/extension/bucketing/linear.py index 5457f849..708111e0 100644 --- a/vllm_gaudi/extension/bucketing/linear.py +++ b/vllm_gaudi/extension/bucketing/linear.py @@ -101,13 +101,11 @@ def read_bucket_settings(phase: str, dim: str, **defaults): # Check if fallback 'seq' flag is set fallback_env = f'VLLM_{phase}_SEQ_BUCKET_{p}'.upper() fallback_val = os.environ.get(fallback_env) - + if fallback_val is not None: val = fallback_val - logger().warning( - f"{e} not set, using {fallback_env} value ({fallback_val}) instead. " - "This fallback behavior is deprecated and will be removed in v0.12.0." - ) + logger().warning(f"{e} not set, using {fallback_env} value ({fallback_val}) instead. " + "This fallback behavior is deprecated and will be removed in v0.12.0.") resolved_val = int(val) if val is not None else d logger().info(f'{e}={resolved_val} (default:{d})') values.append(resolved_val) From 07a67a68e48f28b53a6d2e5a34c42330e8f327fc Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Thu, 30 Oct 2025 09:34:20 +0200 Subject: [PATCH 5/5] Remove limit Signed-off-by: Agata Dobrzyniewicz --- vllm_gaudi/extension/bucketing/exponential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/extension/bucketing/exponential.py b/vllm_gaudi/extension/bucketing/exponential.py index 610baca5..a1779a43 100644 --- a/vllm_gaudi/extension/bucketing/exponential.py +++ b/vllm_gaudi/extension/bucketing/exponential.py @@ -14,7 +14,7 @@ class ExponentialBucketingStrategy(): def check_for_user_flags(self, phase): dim = ['bs', 'seq'] if phase == 'prompt' else ['bs', 'block'] - params = ['min', 'step', 'max', 'limit'] + params = ['min', 'step', 'max'] env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for dim in dim for p in params] user_flags = [] for e in env_vars: