Fix a bug in convert_from_qwen_hf; update the README for GPUs with lower memory
Tlntin committed Oct 1, 2023
1 parent 19b0604 commit 8bd018c
Showing 5 changed files with 12 additions and 8 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -75,15 +75,15 @@
cd tensorrt_llm_july-release-v1/examples/qwen/
```

-7. Convert the Huggingface-format weights into the data format required by FT (FastTransformer). This is optional: building directly without converting also works, both paths are supported, and building directly saves more space, but it does not support smooth quant.
+7. Convert the Huggingface-format weights into the data format required by FT (FastTransformer). This is optional: building directly without converting also works, both paths are supported, and building directly saves more space, but it does not support smooth quant. By default this script loads the CUDA Huggingface model before converting, so GPUs with less than 24 GB of VRAM are advised to skip this step.

```bash
python3 hf_qwen_convert.py
```
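
Since the conversion first loads the full CUDA Huggingface model, it can help to check free VRAM before running it. A minimal pre-flight sketch (not part of this repo; the 24 GB threshold mirrors the note above):

```python
import torch

# Rough pre-flight check before running hf_qwen_convert.py: the script
# loads the full CUDA Huggingface model, so ~24 GB of VRAM is recommended
# (threshold taken from the README note above).
total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
if total_gib < 24:
    print(f"Only {total_gib:.1f} GiB of VRAM; consider skipping the FT "
          "conversion and building directly from the Huggingface weights.")
else:
    print(f"{total_gib:.1f} GiB of VRAM; the conversion should fit.")
```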

8. Modify the build parameters (optional)

-- The default build parameters, including batch_size, max_input_len, and max_new_tokens, are stored in `default_config.py`
+- The default build parameters, including batch_size, max_input_len, max_new_tokens, and seq_length, are stored in `default_config.py`
- For users with 24 GB of VRAM, build directly; the default data type is fp16 with max_batch_size=2
- For users with less VRAM, lower max_batch_size to 1, or further reduce max_input_len and max_new_tokens, as in the sketch after this list
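
For reference, these knobs are plain class attributes, so lowering them is a one-file edit. A hypothetical low-memory variant of `default_config.py` (field names mirror the list above; the exact values are only illustrative):

```python
# Hypothetical low-memory settings for default_config.py; the field
# names mirror those listed above, the values are only illustrative.
class DefaultConfig:
    max_batch_size = 1    # down from the default of 2
    max_input_len = 1024  # reduce further if builds still run out of memory
    max_new_tokens = 1024
    seq_length = 2048     # 2048 for Qwen-7B-Chat V1.0; 8192 for V1.1 / 14B


default_config = DefaultConfig()
```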

2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -8,7 +8,7 @@ onnx==1.12.0
mpi4py
tensorrt>=8.6.0
numpy
-cuda-python==12.1.0
+cuda-python==12.2.0
mypy
pytest-cov
pytest-xdist
6 changes: 3 additions & 3 deletions tensorrt_llm_july-release-v1/examples/qwen/build.py
@@ -323,7 +323,7 @@ def build_rank_engine(builder: Builder,
num_layers=args.n_layer,
num_heads=args.n_head,
hidden_size=args.n_embd,
-seq_length=2048,
+seq_length=default_config.seq_length,
vocab_size=args.vocab_size,
hidden_act=args.hidden_act,
max_position_embeddings=args.n_positions,
@@ -360,7 +360,8 @@ def build_rank_engine(builder: Builder,
QuantMode.use_weight_only(use_int4_weights=True)
)

-if args.hf_model_dir is not None and args.ft_dir_path is None:
+if args.hf_model_dir is not None and \
+        (args.ft_dir_path is None or not os.path.exists(args.ft_dir_path)):
logger.info(f'Loading HF QWen ... from {args.hf_model_dir}')
tik = time.time()
hf_qwen = AutoModelForCausalLM.from_pretrained(
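
The effect of this change: the Huggingface checkpoint is now also loaded when args.ft_dir_path is set but the directory does not exist on disk, not only when it was never given. A standalone sketch of the selection logic (the function name is hypothetical; the argument names follow the diff):

```python
import os

# Standalone sketch of the updated guard: fall back to the Huggingface
# checkpoint whenever the FT directory is unset OR missing on disk,
# instead of only when it is unset.
def pick_weight_source(hf_model_dir, ft_dir_path):
    if hf_model_dir is not None and \
            (ft_dir_path is None or not os.path.exists(ft_dir_path)):
        return "huggingface"
    return "ft"

assert pick_weight_source("Qwen-7B-Chat", None) == "huggingface"
assert pick_weight_source("Qwen-7B-Chat", "no/such/dir") == "huggingface"
```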
@@ -381,7 +382,6 @@ def build_rank_engine(builder: Builder,
rank,
args.world_size,
max_position_embeddings=args.n_positions,
-seq_length=args.max_input_len,
kv_channels=args.kv_channels,
rotary_emb_base=args.rotary_emb_base,
dtype=args.dtype,
5 changes: 5 additions & 0 deletions tensorrt_llm_july-release-v1/examples/qwen/default_config.py
@@ -24,6 +24,11 @@ class DefaultConfig:
# Maximum number of generate new tokens.
max_new_tokens = 2048

+# Maximum sequence length.
+# for Qwen-7B-Chat V1.0, the seq_length is 2048
+# for Qwen-7B-Chat V1.1, the seq_length is 8192
+# for Qwen-14B-Chat, the seq_length is 8192
+seq_length = 2048

# Top p for sampling.
top_p = 0.5
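
build.py now reads this value (`seq_length=default_config.seq_length` in the hunk above) instead of hard-coding 2048, so switching model variants becomes a one-line config change. A small sketch of the variant-to-length mapping (the dict and helper are hypothetical; the values come from the comments above):

```python
# Standalone illustration of the per-model seq_length values documented
# in the new default_config.py comments; the dict and helper are hypothetical.
SEQ_LENGTHS = {
    "Qwen-7B-Chat-V1.0": 2048,
    "Qwen-7B-Chat-V1.1": 8192,
    "Qwen-14B-Chat": 8192,
}

def seq_length_for(model_name: str) -> int:
    return SEQ_LENGTHS[model_name]

assert seq_length_for("Qwen-7B-Chat-V1.0") == 2048
assert seq_length_for("Qwen-14B-Chat") == 8192
```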
3 changes: 1 addition & 2 deletions tensorrt_llm_july-release-v1/examples/qwen/weight.py
@@ -391,7 +391,6 @@ def load_from_hf_qwen(tensorrt_llm_qwen: QWenForCausalLM,
hf_qwen,
rank=0,
tensor_parallel=1,
-seq_length=2048,
max_position_embeddings=8192,
rotary_emb_base=10000,
kv_channels=128,
@@ -463,7 +462,7 @@ def load_from_hf_qwen(tensorrt_llm_qwen: QWenForCausalLM,
if layer_idx is None:
continue
idx = int(layer_idx)
-if idx >= tensorrt_llm_qwen.num_layers:
+if idx >= tensorrt_llm_qwen._num_layers:
continue
if 'ln_1.weight' in k:
tensorrt_llm_qwen.layers[idx].ln_1.weight.value = v
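
This hunk fixes the attribute used to cap layer indices (`num_layers` to `_num_layers` on the TensorRT-LLM model object). For context, a simplified standalone sketch of the surrounding key-to-layer mapping (the key format is assumed from Qwen's Huggingface checkpoints; the helper is hypothetical):

```python
import re

# Simplified sketch of the layer filtering shown above: extract the layer
# index from a Huggingface key such as "transformer.h.3.ln_1.weight" and
# skip layers beyond the built engine's count (stored as _num_layers in
# this release, hence the attribute fix).
NUM_LAYERS = 32  # illustrative; the real value comes from the engine config

def layer_index(key: str):
    m = re.search(r"\bh\.(\d+)\.", key)
    if m is None:
        return None              # not a per-layer weight
    idx = int(m.group(1))
    return idx if idx < NUM_LAYERS else None

assert layer_index("transformer.h.3.ln_1.weight") == 3
assert layer_index("transformer.h.99.ln_1.weight") is None
```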
