-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_block_release_debug.py
More file actions
108 lines (84 loc) · 4.03 KB
/
test_block_release_debug.py
File metadata and controls
108 lines (84 loc) · 4.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python3
"""
Debug the release of memory blocks after KV-cache compression.
"""
from fastcache_paths import ensure_sys_paths, CKPT_DIR, DATASETS_DIR, RESULTS_DIR
ensure_sys_paths()
import os
import sys
import torch
import gc
def test_block_release_debug():
    """Trace KV-cache block usage while one long prompt is compressed.

    Builds a ``LlavaLLM`` engine with synchronous compression enabled,
    submits a single long text prompt, and steps the engine until it reports
    it is finished. Along the way it prints:

    * the block size, the engine's compression factor and compressor state;
    * the number of blocks the prompt is expected to occupy before and after
      compression (a back-of-the-envelope estimate, not a measured value);
    * after the first step, the per-sequence block table and the compressed
      length/block count recorded by the model runner;
    * the free-block count for the first three steps and at the end.

    Nothing is asserted and nothing is returned; this is a diagnostic script
    whose output is meant to be read by a human.
    """
    # Imported lazily, as in the original script, so that the module-level
    # ``ensure_sys_paths()`` call has already run by the time these resolve.
    from nanovllm.sampling_params import SamplingParams
    from nanovllm.engine.llava_engine import LlavaLLM

    def ceil_div(numerator, denominator):
        """Return ``numerator / denominator`` rounded up, using integers."""
        return (numerator + denominator - 1) // denominator

    print("=" * 70)
    print(" 调试KV-cache压缩后的Block释放")
    print("=" * 70)

    model_path = "/data/huggingface/llava-1.5-7b-hf"
    compressor_path = str(CKPT_DIR / "llava_mlp.pth")

    # Single source of truth for the compression ratio: it is passed to the
    # engine AND used for the expected-block estimate below, so the two can
    # no longer drift apart.
    compression_factor = 5

    gc.collect()
    torch.cuda.empty_cache()

    # Use synchronous compression.
    llm = LlavaLLM(
        model_path,
        compressor_path=compressor_path,  # path to the compressor checkpoint
        enable_compression=True,
        async_compression=False,
        compression_factor=compression_factor,
        enforce_eager=True,
        max_model_len=2048,
    )

    block_manager = llm.scheduler.block_manager
    block_size = block_manager.block_size
    print(f"\nBlock size: {block_size}")
    print(f"压缩因子: {llm.compression_factor}")
    print(f"压缩器状态: {llm.model_runner.compressor is not None}")
    print(f"批量压缩器状态: {getattr(llm.model_runner, 'use_batched_compressor', False)}")

    initial_free = len(block_manager.free_block_ids)
    print(f"初始空闲blocks: {initial_free}")

    # One long prompt: it must be long enough to span several blocks
    # (>256 tokens). The original author's target was 5-6 blocks.
    base = "USER: Please provide a very detailed and comprehensive explanation of the history of artificial intelligence. "
    detail = "Include information about all major milestones, key researchers, breakthrough algorithms, and significant applications. Discuss the evolution from symbolic AI to connectionist approaches, and explain how deep learning revolutionized the field. "
    # Repeat the detail sentence to reach the intended length (1200+ tokens
    # per the original author's note).
    prompt = base + detail * 30 + "ASSISTANT:"

    sampling_params = SamplingParams(max_tokens=64)

    prompt_len = len(llm.tokenizer.encode(prompt))
    print(f"\nPrompt长度: {prompt_len} tokens")

    # Rough estimate only: assumes the compressor shrinks the prompt by
    # exactly ``compression_factor`` (floor), then rounds up to whole blocks.
    expected_blocks = ceil_div(prompt_len, block_size)
    expected_blocks_after = ceil_div(prompt_len // compression_factor, block_size)
    print(f"预期使用blocks: {expected_blocks}")
    print(f"压缩后预期blocks: {expected_blocks_after} (假设{compression_factor}x压缩)")
    print(f"预期释放blocks: {expected_blocks - expected_blocks_after}")

    # Submit the request.
    llm.add_request(prompt, sampling_params)

    print("\n开始推理...")
    step_count = 0
    while not llm.is_finished():
        # The second element of the step result is not needed here.
        outputs, _ = llm.step(apply_compression=True)
        step_count += 1

        # Inspect compression state right after the first step (prefill).
        if step_count == 1:
            print(f"\n[Step {step_count}] Prefill完成")
            print(f" _compressed_lens: {llm.model_runner._compressed_lens}")

            # Dump block bookkeeping for every running sequence.
            for seq in llm.scheduler.running:
                seq_id = seq.seq_id
                block_count = len(seq.block_table)
                compressed_block_count = llm.model_runner.get_compressed_block_count(seq_id)
                # -1 marks a sequence with no recorded compressed length.
                compressed_len = llm.model_runner._compressed_lens.get(seq_id, -1)
                print(f" 序列{seq_id}:")
                print(f" 当前block数: {block_count}")
                print(f" 压缩后block数: {compressed_block_count}")
                print(f" 压缩后长度: {compressed_len}")
                print(f" block_table: {seq.block_table[:5]}...")

        # Only the first few steps are interesting for free-block tracking.
        if step_count <= 3:
            free_blocks = len(block_manager.free_block_ids)
            print(f"[Step {step_count}] 空闲blocks: {free_blocks}")

        if outputs:
            for seq_id, tokens in outputs:
                print(f"\n完成序列{seq_id}: {len(tokens)} tokens")

    final_free = len(block_manager.free_block_ids)
    print(f"\n最终空闲blocks: {final_free}")
    print(f"变化: {final_free - initial_free}")
# Script entry point: run the debug scenario only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    test_block_release_debug()