
[Usage]: Running the model with use_layerwise set to true fails with an error #887

@abjackjay

Description


Your current environment

With use_layerwise set to true, starting the model fails. Launch command:
vllm serve /home/model/Qwen2.5-14B-Instruct \
  --gpu_memory_utilization 0.9 \
  --port 8000 \
  --max-model-len 32768 \
  --block_size 128 \
  --served-model-name qwen \
  --no-enable-prefix-caching \
  --kv-transfer-config \
  '{
    "kv_connector": "UCMConnector",
    "kv_role": "kv_both",
    "kv_connector_module_path": "ucm.integration.vllm.ucm_connector",
    "kv_connector_extra_config": {"UCM_CONFIG_FILE": "/home/config/nfs.yaml", "use_layerwise": "true"}
  }'
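
For reference (not part of the original report), here is a minimal offline-inference sketch that builds the same connector configuration through vLLM's Python API instead of the serve CLI. It assumes KVTransferConfig accepts the same keys as the JSON passed to --kv-transfer-config above; the model path and the UCM config file are the reporter's.

```python
# Minimal sketch, assuming KVTransferConfig takes the same keys as the CLI JSON above.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

ktc = KVTransferConfig(
    kv_connector="UCMConnector",
    kv_role="kv_both",
    kv_connector_module_path="ucm.integration.vllm.ucm_connector",
    kv_connector_extra_config={
        "UCM_CONFIG_FILE": "/home/config/nfs.yaml",
        "use_layerwise": "true",  # kept as the string "true", exactly as in the failing command
    },
)

llm = LLM(
    model="/home/model/Qwen2.5-14B-Instruct",
    gpu_memory_utilization=0.9,
    max_model_len=32768,
    block_size=128,
    enable_prefix_caching=False,
    kv_transfer_config=ktc,
)

print(llm.generate(["Hello"], SamplingParams(max_tokens=8))[0].outputs[0].text)
```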

Error message:
[2026-04-01 00:51:07.884486][UC][E] EngineCore failed to start.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 699, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 498, in __init__
    super().__init__(vllm_config, executor_class, log_stats,
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 92, in __init__
    self._initialize_kv_caches(vllm_config)
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 207, in _initialize_kv_caches
    self.model_executor.initialize_from_config(kv_cache_configs)
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 75, in initialize_from_config
    self.collective_rpc("compile_or_warm_up_model")
  File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc
    return [run_method(self.driver_worker, method, args, kwargs)]
  File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 3122, in run_method
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 346, in compile_or_warm_up_model
    cuda_graph_memory_bytes = self.model_runner.capture_model()
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3530, in capture_model
    self._capture_cudagraphs(
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3590, in _capture_cudagraphs
    self._dummy_run(num_tokens,
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3219, in _dummy_run
    outputs = self.model(
  File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/cuda_graph.py", line 121, in __call__
    return self.runnable(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 529, in forward
    hidden_states = self.model(input_ids, positions, intermediate_tensors,
  File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 317, in __call__
    model_output = self.forward(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 356, in forward
    def forward(
  File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 375, in __call__
    return super().__call__(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 929, in _fn
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 848, in call_wrapped
    return self._wrapped_call(self, *args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 424, in __call__
    raise e
  File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 411, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "<eval_with_key>.98", line 352, in forward
    submod_1 = self.submod_1(getitem, s72, getitem_1, getitem_2, getitem_3); getitem = getitem_1 = getitem_2 = submod_1 = None
  File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 848, in call_wrapped
    return self._wrapped_call(self, *args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 424, in __call__
    raise e
  File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 411, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "<eval_with_key>.2", line 5, in forward
    unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_2, key_2, value, output_1, 'model.layers.0.self_attn.attn'); query_2 = key_2 = value = output_1 = unified_attention_with_output = None
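
The traceback shows the failure occurring inside GPUModelRunner.capture_model() during CUDA-graph warm-up, before any request is served. As a hedged diagnostic (not something tried in the original report), the same configuration can be launched in eager mode, which skips graph capture entirely and narrows down whether the layerwise path only breaks while CUDA graphs are being captured; enforce_eager=True below corresponds to --enforce-eager on the vllm serve command line.

```python
# Hedged diagnostic sketch, not from the original report: run the same UCM layerwise
# configuration in eager mode so CUDA-graph capture is skipped.
from vllm import LLM
from vllm.config import KVTransferConfig

llm = LLM(
    model="/home/model/Qwen2.5-14B-Instruct",
    enforce_eager=True,  # equivalent to --enforce-eager; disables CUDA-graph capture
    max_model_len=32768,
    block_size=128,
    enable_prefix_caching=False,
    kv_transfer_config=KVTransferConfig(
        kv_connector="UCMConnector",
        kv_role="kv_both",
        kv_connector_module_path="ucm.integration.vllm.ucm_connector",
        kv_connector_extra_config={
            "UCM_CONFIG_FILE": "/home/config/nfs.yaml",
            "use_layerwise": "true",
        },
    ),
)

print(llm.generate(["ping"])[0].outputs[0].text)
```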

How would you like to use ucm.

I want to run inference of a [specific model](put link here). I don't know how to integrate it with unified.
