-
Notifications
You must be signed in to change notification settings - Fork 71
[Usage]: 指定use_layerwise为true运行模型报错了 #887
Description
Your current environment
将use_layerwise配置为true,运行模型报错了
vllm serve /home/model/Qwen2.5-14B-Instruct
--gpu_memory_utilization 0.9
--port 8000
--max-model-len 32768
--block_size 128
--served-model-name qwen
--no-enable-prefix-caching
--kv-transfer-config
'{
"kv_connector": "UCMConnector",
"kv_role": "kv_both",
"kv_connector_module_path": "ucm.integration.vllm.ucm_connector",
"kv_connector_extra_config": {"UCM_CONFIG_FILE": "/home/config/nfs.yaml", "use_layerwise": "true"}
}'
报错信息:
[2026-04-01 00:51:07.884486][UC][E] EngineCore failed to start. Traceback (most recent call last): File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 699, in run_engine_core engine_core = EngineCoreProc(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 498, in __init__ super().__init__(vllm_config, executor_class, log_stats, File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 92, in __init__ self._initialize_kv_caches(vllm_config) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 207, in _initialize_kv_caches self.model_executor.initialize_from_config(kv_cache_configs) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 75, in initialize_from_config self.collective_rpc("compile_or_warm_up_model") File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 83, in collective_rpc return [run_method(self.driver_worker, method, args, kwargs)] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 3122, in run_method return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 346, in compile_or_warm_up_model cuda_graph_memory_bytes = self.model_runner.capture_model() ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3530, in capture_model self._capture_cudagraphs( File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3590, in _capture_cudagraphs self._dummy_run(num_tokens, File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 120, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3219, 
in _dummy_run outputs = self.model( ^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/cuda_graph.py", line 121, in __call__ return self.runnable(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 529, in forward hidden_states = self.model(input_ids, positions, intermediate_tensors, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 317, in __call__ model_output = self.forward(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 356, in forward def forward( File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 375, in __call__ return super().__call__(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 929, in _fn return fn(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 848, in call_wrapped return self._wrapped_call(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 424, in __call__ raise e File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 411, in __call__ return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "<eval_with_key>.98", line 352, in forward submod_1 = self.submod_1(getitem, s72, getitem_1, getitem_2, getitem_3); getitem = getitem_1 = getitem_2 = submod_1 = None ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 848, in call_wrapped return self._wrapped_call(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 424, in __call__ raise e File "/usr/local/lib/python3.12/dist-packages/torch/fx/graph_module.py", line 411, in __call__ return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "<eval_with_key>.2", line 5, in forward unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_2, key_2, value, output_1, 'model.layers.0.self_attn.attn'); query_2 = key_2 = value = output_1 = 
unified_attention_with_output = None
How would you like to use ucm?
I want to run inference on a [specific model](put link here). I don't know how to integrate it with unified.