Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions docs/configuration/env_vars.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,16 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM
- batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1`
- batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `32`
- batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `max_num_prefill_seqs`
- sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
- sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
- sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len`
- query length min (`VLLM_PROMPT_QUERY_BUCKET_MIN`): `block_size`
- query length step (`VLLM_PROMPT_QUERY_BUCKET_STEP`): `block_size`
- query length max (`VLLM_PROMPT_QUERY_BUCKET_MAX`): `max_model_len`
- sequence ctx min (`VLLM_PROMPT_CTX_BUCKET_MIN`): `0`
- sequence ctx step (`VLLM_PROMPT_CTX_BUCKET_STEP`): `1`
- sequence ctx max (`VLLM_PROMPT_CTX_BUCKET_MAX`): `(max_model_len - block_size) // block_size`
- Decode:
- batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
- batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `32`
- batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs`
- block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size`
- block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `1`
- block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size`
- block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max_blocks`
27 changes: 22 additions & 5 deletions vllm_gaudi/extension/bucketing/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_prompt_cfgs(self, max_num_prefill_seqs, block_size, max_num_batched_toke

prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=max_num_prefill_seqs)
prompt_query_bucket_cfg = read_bucket_settings('prompt',
'seq',
'query',
min=block_size,
step=block_size,
max=max_model_len)
Expand Down Expand Up @@ -81,16 +81,33 @@ def read_bucket_settings(phase: str, dim: str, **defaults):
"""Read bucketing configuration from env variables.

phase is either 'prompt' or 'decode'
dim is either 'bs', 'seq' or 'block'
dim is either 'bs', 'query' or 'block'
param is either 'min', 'step' or 'max'
example env variable: VLLM_DECODE_BS_BUCKET_STEP=128
"""
params = ['min', 'step', 'max']
env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for p in params]
default_values = [defaults[p] for p in params]
values = [int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values)]
for e, v, d in zip(env_vars, values, default_values):
logger().info(f'{e}={v} (default:{d})')
values = []

for p, e, d in zip(params, env_vars, default_values):
val = os.environ.get(e)

if val is None and dim == 'query':

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I know, we don't have `VLLM_DECODE_SEQ_BUCKET_{p}` - is there a need for this code to handle such a case? This code looks like it would need to…

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, we don't set this dim - neither query nor seq - for decode.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, so shouldn't we also add `and phase == 'prompt'` to the condition, and then set
`fallback_env = f'VLLM_PROMPT_SEQ_BUCKET_{p}'.upper()` on line 102?

# Check if fallback 'seq' flag is set
fallback_env = f'VLLM_{phase}_SEQ_BUCKET_{p}'.upper()
fallback_val = os.environ.get(fallback_env)

if fallback_val is not None:
val = fallback_val
logger().warning(
f"{e} not set, using {fallback_env} value ({fallback_val}) instead. "
"This fallback behavior is deprecated and will be removed in v0.12.0."
)
resolved_val = int(val) if val is not None else d
logger().info(f'{e}={resolved_val} (default:{d})')
values.append(resolved_val)

return values


Expand Down
7 changes: 3 additions & 4 deletions vllm_gaudi/extension/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,21 @@ def get_user_flags():
Env('VLLM_PROMPT_BS_BUCKET_MIN', int),
Env('VLLM_PROMPT_BS_BUCKET_STEP', int),
Env('VLLM_PROMPT_BS_BUCKET_MAX', int),
Env('VLLM_PROMPT_BS_BUCKET_LIMIT', int),
Env('VLLM_PROMPT_QUERY_BUCKET_MIN', int),
Env('VLLM_PROMPT_QUERY_BUCKET_STEP', int),
Env('VLLM_PROMPT_QUERY_BUCKET_MAX', int),
Env('VLLM_PROMPT_SEQ_BUCKET_MIN', int),
Env('VLLM_PROMPT_SEQ_BUCKET_STEP', int),
Env('VLLM_PROMPT_SEQ_BUCKET_MAX', int),
Env('VLLM_PROMPT_SEQ_BUCKET_LIMIT', int),
Env('VLLM_PROMPT_CTX_BUCKET_MIN', int),
Env('VLLM_PROMPT_CTX_BUCKET_STEP', int),
Env('VLLM_PROMPT_CTX_BUCKET_MAX', int),
Env('VLLM_DECODE_BS_BUCKET_MIN', int),
Env('VLLM_DECODE_BS_BUCKET_STEP', int),
Env('VLLM_DECODE_BS_BUCKET_MAX', int),
Env('VLLM_DECODE_BS_BUCKET_LIMIT', int),
Env('VLLM_DECODE_BLOCK_BUCKET_MIN', int),
Env('VLLM_DECODE_BLOCK_BUCKET_STEP', int),
Env('VLLM_DECODE_BLOCK_BUCKET_MAX', int),
Env('VLLM_DECODE_BLOCK_BUCKET_LIMIT', int),

# Non-vllm flags that are also important to print
Env('EXPERIMENTAL_WEIGHT_SHARING', str),
Expand Down
Loading