Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions docs/configuration/env_vars.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,16 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM
- batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
- batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `1`
- batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `max_num_prefill_seqs`
- query length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
- query length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
- query length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_num_batched_tokens`
- sequence ctx min (`VLLM_PROMPT_CTX_BUCKET_MIN`): `0`
- sequence ctx step (`VLLM_PROMPT_CTX_BUCKET_STEP`): `1`
- sequence ctx max (`VLLM_PROMPT_CTX_BUCKET_MAX`): `(max_model_len - block_size) // block_size`
- query length min (`VLLM_PROMPT_QUERY_BUCKET_MIN`): `block_size`
- query length step (`VLLM_PROMPT_QUERY_BUCKET_STEP`): `block_size`
- query length max (`VLLM_PROMPT_QUERY_BUCKET_MAX`): `max_num_batched_tokens`
- ctx min for whole sequence (`VLLM_PROMPT_CTX_BUCKET_MIN`): `0`
- ctx step for whole sequence (`VLLM_PROMPT_CTX_BUCKET_STEP`): `1`
- ctx max for whole sequence (`VLLM_PROMPT_CTX_BUCKET_MAX`): `(max_model_len - block_size) // block_size`
- Decode:
- batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
- batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `32`
- batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
- block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
- block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `1`
- block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
- block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max_model_len * max_num_seqs // block_size` by default or `max_blocks` for CONTIGUOUS PA
2 changes: 1 addition & 1 deletion vllm_gaudi/extension/bucketing/exponential.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class ExponentialBucketingStrategy():

def check_for_user_flags(self, phase):
dim = ['bs', 'seq'] if phase == 'prompt' else ['bs', 'block']
params = ['min', 'step', 'max', 'limit']
params = ['min', 'step', 'max']
env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for dim in dim for p in params]
user_flags = []
for e in env_vars:
Expand Down
25 changes: 20 additions & 5 deletions vllm_gaudi/extension/bucketing/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_prompt_cfgs(self, max_num_prefill_seqs, block_size, max_num_batched_toke

prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=1, max=max_num_prefill_seqs)
prompt_query_bucket_cfg = read_bucket_settings('prompt',
'seq',
'query',
min=block_size,
step=block_size,
max=max_num_batched_tokens)
Expand Down Expand Up @@ -85,16 +85,31 @@ def read_bucket_settings(phase: str, dim: str, **defaults):
"""Read bucketing configuration from env variables.
phase is either 'prompt' or 'decode'
dim is either 'bs', 'seq' or 'block'
dim is either 'bs', 'query' or 'block'
param is either 'min', 'step' or 'max'
example env variable: VLLM_DECODE_BS_BUCKET_STEP=128
"""
params = ['min', 'step', 'max']
env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for p in params]
default_values = [defaults[p] for p in params]
values = [int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values)]
for e, v, d in zip(env_vars, values, default_values):
logger().info(f'{e}={v} (default:{d})')
values = []

for p, e, d in zip(params, env_vars, default_values):
val = os.environ.get(e)

if val is None and dim == 'query':

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I know, we don't have VLLM_DECODE_SEQ_BUCKET_{p} - is there a need for this code to handle such a case? This code looks like it would need to

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, we don't set this dim - neither query nor seq - for decode.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, so shouldn't we also add `and phase == 'prompt'` to the condition, and then set
`fallback_env = f'VLLM_PROMPT_SEQ_BUCKET_{p}'.upper()` on line 102?

# Check if fallback 'seq' flag is set
fallback_env = f'VLLM_{phase}_SEQ_BUCKET_{p}'.upper()
fallback_val = os.environ.get(fallback_env)

if fallback_val is not None:
val = fallback_val
logger().warning(f"{e} not set, using {fallback_env} value ({fallback_val}) instead. "
"This fallback behavior is deprecated and will be removed in v0.12.0.")
resolved_val = int(val) if val is not None else d
logger().info(f'{e}={resolved_val} (default:{d})')
values.append(resolved_val)

return values


Expand Down
6 changes: 3 additions & 3 deletions vllm_gaudi/extension/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,18 @@ def get_user_flags():
Env('VLLM_PROMPT_BS_BUCKET_MIN', int),
Env('VLLM_PROMPT_BS_BUCKET_STEP', int),
Env('VLLM_PROMPT_BS_BUCKET_MAX', int),
Env('VLLM_PROMPT_BS_BUCKET_LIMIT', int),
Env('VLLM_PROMPT_QUERY_BUCKET_MIN', int),
Env('VLLM_PROMPT_QUERY_BUCKET_STEP', int),
Env('VLLM_PROMPT_QUERY_BUCKET_MAX', int),
Env('VLLM_PROMPT_SEQ_BUCKET_MIN', int),
Env('VLLM_PROMPT_SEQ_BUCKET_STEP', int),
Env('VLLM_PROMPT_SEQ_BUCKET_MAX', int),
Env('VLLM_PROMPT_SEQ_BUCKET_LIMIT', int),
Env('VLLM_PROMPT_CTX_BUCKET_MIN', int),
Env('VLLM_PROMPT_CTX_BUCKET_STEP', int),
Env('VLLM_PROMPT_CTX_BUCKET_MAX', int),
Env('VLLM_DECODE_BS_BUCKET_MIN', int),
Env('VLLM_DECODE_BS_BUCKET_STEP', int),
Env('VLLM_DECODE_BS_BUCKET_MAX', int),
Env('VLLM_DECODE_BS_BUCKET_LIMIT', int),
Env('VLLM_DECODE_BLOCK_BUCKET_MIN', int),
Env('VLLM_DECODE_BLOCK_BUCKET_STEP', int),
Env('VLLM_DECODE_BLOCK_BUCKET_MAX', int),
Expand Down