Commit

support Qwen-VL-Chat
wangzhaode committed Jan 3, 2024
1 parent 312ce0c commit f2add57
Showing 4 changed files with 214 additions and 24 deletions.
9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -2,14 +2,18 @@ cmake_minimum_required(VERSION 3.0)
project(mnn-llm)

option(BUILD_FOR_ANDROID "Build for android with mini memory mode." OFF)
option(USING_VISUAL_MODEL "Using visual model will need deps: MNNOpenCV and httplib." OFF)
option(USING_DISK_EMBED "Using disk embedding to save memory." OFF)
option(DUMP_PROFILE_INFO "Dump profile info when chat." OFF)
option(WITH_CUDA "Enable CUDA support" OFF)

if (USING_DISK_EMBED)
add_definitions(-DUSING_DISK_EMBED)
endif()

if (USING_VISUAL_MODEL)
add_definitions(-DUSING_VISUAL_MODEL)
endif()

if (DUMP_PROFILE_INFO)
add_definitions(-DDUMP_PROFILE_INFO)
endif()
@@ -35,6 +39,9 @@ else()
set_target_properties(llm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)

target_link_libraries(llm MNN MNN_Express)
if (USING_VISUAL_MODEL)
target_link_libraries(llm MNNOpenCV)
endif()
endif()

if (BUILD_FOR_ANDROID)
13 changes: 9 additions & 4 deletions README.md
@@ -107,8 +107,6 @@ To export llm models to `onnx` and `mnn`, please use [llm-export](https://github.com/wang
| windows | PC | Intel(R) Core(TM) i7-13700K | 32GB |




### Download the int4 model
```
# <model> like `chatglm-6b`
@@ -154,10 +152,16 @@ To export llm models to `onnx` and `mnn`, please use [llm-export](https://github.com/wang
./script/android_build.sh
```

The `CPU` backend is used by default; to use another backend, add the corresponding `MNN` compile macros in the script
Some compile macros:
- `BUILD_FOR_ANDROID`: build for Android devices;
- `USING_VISUAL_MODEL`: support models with multimodal capability; requires the `libMNNOpenCV` dependency;
- `USING_DISK_EMBED`: implement the embedding by loading it from disk, to save memory;
- `DUMP_PROFILE_INFO`: dump profiling data to the command line after each chat;

The `CPU` backend is used by default and multimodal capability is not enabled; to use another backend or capability, add the corresponding `MNN` compile macros to the script that builds MNN
- cuda: `-DMNN_CUDA=ON`
- opencl: `-DMNN_OPENCL=ON`

- opencv: `-DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON`

### 4. Run

@@ -184,6 +188,7 @@ adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo qwen-1.8
- [codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)
- [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)
- [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)
- [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
- [Qwen-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)
- [Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)
- [internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary)
46 changes: 37 additions & 9 deletions include/llm.hpp
@@ -54,11 +54,7 @@ class Llm {
}
virtual ~Llm() = default;
static Llm* createLLM(const std::string& path, std::string model_type = "auto");
VARP disk_embedding(const std::vector<int>& input_ids);
void load(const std::string& model_dir);
int forward(const std::vector<int>& input_ids);
std::vector<int> tokenizer_encode(const std::string& input_str);
std::string decode(int id);
void chat();
void warmup();
std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
@@ -75,14 +71,16 @@ class Llm {
// time
int64_t prefill_us_ = 0;
int64_t decode_us_ = 0;
private:
virtual std::vector<int> tokenizer(const std::string& query) = 0;
virtual VARP gen_attention_mask(int seq_len) = 0;
virtual VARP gen_position_ids(int seq_len) = 0;
virtual bool is_stop(int token_id) = 0;
protected:
VARP embedding(const std::vector<int>& input_ids);
VARP txt_embedding(const std::vector<int>& input_ids);
int forward(const std::vector<int>& input_ids);
std::vector<int> tokenizer_encode(const std::string& input_str);
std::string decode(int id);
protected:
// model configs
bool is_single_ = false;
bool is_visual_ = false;
int layer_nums_ = 0;
int hidden_size_ = 4096;
std::vector<int> key_value_shape_ = {};
@@ -91,6 +89,13 @@ class Llm {
float load_progress_ = 0.f;
// tokenizer
std::unique_ptr<Tokenizer> tokenizer_;
std::shared_ptr<Module> visual_module_;
private:
virtual VARP visual_embedding(const std::vector<int>& input_ids) { return nullptr; }
virtual std::vector<int> tokenizer(const std::string& query) = 0;
virtual VARP gen_attention_mask(int seq_len) = 0;
virtual VARP gen_position_ids(int seq_len) = 0;
virtual bool is_stop(int token_id) = 0;
private:
// MNN Modules
std::shared_ptr<Executor::RuntimeManager> runtime_manager_;
@@ -160,6 +165,29 @@ class Qwen_7b : public Llm {
virtual bool is_stop(int token_id) override;
};

class Qwen_vl : public Qwen_7b {
public:
Qwen_vl() {
model_name_ = "Qwen_vl";
is_visual_ = true;
layer_nums_ = 32;
key_value_shape_ = {2, 1, 0, 32, 128};
hidden_size_ = 4096;
tokenizer_.reset(new Tiktoken);
}
private:
const int img_size_ = 448;
const int imgpad_len_ = 256;
const int img_start_ = 151857;
const int img_end_ = 151858;
const int img_pad_ = 151859;
private:
std::vector<int> url_encode(const std::string& url);
virtual VARP visual_embedding(const std::vector<int>& input_ids) override;
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual VARP gen_attention_mask(int seq_len) override;
};

class Qwen_1_8b : public Qwen_7b {
public:
Qwen_1_8b() {
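The new `Qwen_vl` class above plugs the visual path into the existing `Llm` interface, so callers keep using `createLLM`/`load`/`response`. A minimal usage sketch (illustrative only: the model directory name, the image URL, and the reliance on type detection from the path are assumptions, not part of this commit):

```cpp
#include <memory>
#include "llm.hpp"

int main() {
    // Assumed layout: an exported qwen-vl-chat directory with the *.mnn parts
    // plus visual.mnn; the "vl" substring in the name is expected to select Qwen_vl.
    std::unique_ptr<Llm> llm(Llm::createLLM("./qwen-vl-chat"));
    llm->load("./qwen-vl-chat");
    // Qwen_vl::tokenizer rewrites <img>...</img> into img_start_/img_pad_/img_end_
    // placeholder ids; visual_embedding() then splices image features into that span.
    llm->response("<img>https://example.com/demo.jpg</img>Describe this image.");
    return 0;
}
```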
170 changes: 160 additions & 10 deletions src/llm.cpp
@@ -7,13 +7,18 @@
// #define MNN_OPEN_TIME_TRACE 1

#include <iostream>
#include <fstream>
#include <regex>

#include "llm.hpp"
#include "tokenizer.hpp"
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/AutoTime.hpp>
#include "llm.hpp"
#include "tokenizer.hpp"

#include <fstream>
#ifdef USING_VISUAL_MODEL
#include "httplib.h"
#include <cv/cv.hpp>
#endif

Llm* Llm::createLLM(const std::string& path, std::string model_type) {
auto size = path.size();
@@ -43,6 +48,8 @@ Llm* Llm::createLLM(const std::string& path, std::string model_type) {
} else if (model_type.find("qwen") != std::string::npos) {
if (model_type.find("1.8") != std::string::npos) {
llm = new Qwen_1_8b;
} else if (model_type.find("vl") != std::string::npos) {
llm = new Qwen_vl;
} else {
llm = new Qwen_7b;
}
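// Note: model_type is assumed to be derived from the model directory name, so a
// path like "qwen-vl-chat" hits the "qwen" branch and then the "vl" check,
// resolving to the new Qwen_vl class.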
@@ -223,6 +230,14 @@ void Llm::load(const std::string& model_dir) {
MNN_PRINT("Done!\n");
load_progress_ += step;
#endif
if (is_visual_) {
std::string visual_model_path = model_dir + "/visual.mnn";
MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, visual_model_path.c_str());fflush(stdout);
module_config.rearrange = false;
visual_module_.reset(Module::load({}, {}, visual_model_path.c_str(), runtime_manager_, &module_config));
MNN_PRINT("Done!\n");
module_config.rearrange = true;
}
// load glm_block models
for (int i = 0; i < layer_nums_; i++) {
load_progress_ += step;
@@ -269,11 +284,7 @@ int Llm::forward(const std::vector<int>& input_ids) {
past_key_values_[0] = outputs[1];
} else {
// split block models
#ifdef USING_DISK_EMBED
auto hidden_states = disk_embedding(input_ids);
#else
auto hidden_states = modules_[layer_nums_ + 1]->onForward({inputs_ids_})[0];
#endif
auto hidden_states = embedding(input_ids);
for (int i = 0; i < layer_nums_; i++) {
AUTOTIME;
auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]});
@@ -292,9 +303,15 @@ return id;
return id;
}

VARP Llm::disk_embedding(const std::vector<int>& input_ids) {
VARP Llm::txt_embedding(const std::vector<int>& input_ids) {
#ifndef USING_DISK_EMBED
// using model forward
auto inputs_ids_ = _Const(input_ids.data(), {static_cast<int>(input_ids.size())}, NCHW, halide_type_of<int>());
auto hidden_states = modules_[layer_nums_ + 1]->onForward({inputs_ids_})[0];
return hidden_states;
#endif
AUTOTIME;
// disk embedding save memory
// disk embedding to save memory
size_t seq_len = input_ids.size();
auto embedding = _Input({static_cast<int>(seq_len), 1, hidden_size_}, NCHW);
size_t size = hidden_size_ * sizeof(int16_t);
@@ -314,6 +331,13 @@ VARP Llm::disk_embedding(const std::vector<int>& input_ids) {
return embedding;
}

VARP Llm::embedding(const std::vector<int>& input_ids) {
if (is_visual_ && !gen_seq_len_) {
return visual_embedding(input_ids);
}
return txt_embedding(input_ids);
}
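// Visual features are only injected during prefill (gen_seq_len_ == 0); every later
// decode step of the same response goes through the plain text embedding path.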

std::vector<int> Llm::tokenizer_encode(const std::string& input_str) {
auto ids = tokenizer_->encode(input_str);
return ids;
@@ -463,6 +487,132 @@ bool Qwen_7b::is_stop(int token_id) {
return token_id >= 151645;
}

// Qwen_vl
std::vector<int> Qwen_vl::url_encode(const std::string& url) {
std::vector<int> ascii_values(imgpad_len_, img_pad_);
ascii_values[0] = img_start_;
ascii_values[imgpad_len_ - 1] = img_end_;
for (int i = 0; i < url.size(); i++) {
ascii_values[i + 1] = static_cast<int>(url[i]);
}
return ascii_values;
}
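// Illustrative (assuming the path fits within imgpad_len_ - 2 characters):
// url_encode("./demo.jpg") returns imgpad_len_ (256) ids laid out as
//   [img_start_, '.', '/', 'd', 'e', 'm', 'o', '.', 'j', 'p', 'g', img_pad_, ..., img_pad_, img_end_]
// so every image reference occupies a fixed-size span in the prompt.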

VARP Qwen_vl::visual_embedding(const std::vector<int>& input_ids) {
#ifdef USING_VISUAL_MODEL
int start_pos = 0, pad_pos = 0, end_pos = 0;
for (int i = 0; i < input_ids.size(); i++) {
int id = input_ids[i];
if (id == img_start_ && !start_pos) {
start_pos = i;
}
if (id == img_pad_ && !pad_pos) {
pad_pos = i;
}
if (id == img_end_ && !end_pos) {
end_pos = i;
}
}
if (!start_pos) {
return txt_embedding(input_ids);
}
std::vector<int> prefix(input_ids.begin(), input_ids.begin() + start_pos);
std::vector<int> img_ascii(input_ids.begin() + start_pos + 1, input_ids.begin() + pad_pos);
std::vector<int> suffix(input_ids.begin() + end_pos + 1, input_ids.end());
std::string img_path;
for (auto ascii_val : img_ascii) {
img_path += static_cast<char>(ascii_val);
}
VARP image = nullptr;
if (img_path.substr(0, 4) == "http") {
std::regex url_regex(R"(^https?://([^/]+)(/.*))");
std::smatch url_match_result;
std::string host, path;
if (std::regex_search(img_path, url_match_result, url_regex) && url_match_result.size() == 3) {
host = url_match_result[1].str();
path = url_match_result[2].str();
}
std::cout << host << "#" << path << std::endl;
httplib::Client cli(host);
auto res = cli.Get(path);
std::string img_file = "downloaded_image.jpg";
if (res && res->status == 200) {
std::ofstream file(img_file, std::ios::binary);
if (file.is_open()) {
file.write(res->body.c_str(), res->body.size());
std::cout << "Image has been downloaded successfully." << std::endl;
file.close();
} else {
std::cerr << "Unable to open file to write image." << std::endl;
exit(0);
}
} else {
std::cerr << "Failed to download image. Status code: " << (res ? res->status : 0) << std::endl;
exit(0);
}
image = MNN::CV::imread(img_file);
} else {
image = MNN::CV::imread(img_path);
}
image = MNN::CV::resize(image, {img_size_, img_size_}, 0, 0, MNN::CV::INTER_LINEAR, MNN::CV::COLOR_BGR2RGB,
{123.25239296, 117.20384, 104.50194688}, {0.0145414 , 0.01494914, 0.01416452});
image = MNN::Express::_Unsqueeze(image, {0});
image = MNN::Express::_Convert(image, NC4HW4);
auto image_embedding = visual_module_->forward(image);
image_embedding = MNN::Express::_Permute(image_embedding, {1, 0, 2});
auto prefix_embedding = txt_embedding(prefix);
auto suffix_embedding = txt_embedding(suffix);
auto embeddings = MNN::Express::_Concat({prefix_embedding, image_embedding, suffix_embedding}, 0);
#else
auto embeddings = txt_embedding(input_ids);
#endif
return embeddings;
}

std::vector<int> Qwen_vl::tokenizer(const std::string& query) {
// split query
std::regex img_regex("<img>(.*?)</img>");
std::string::const_iterator searchStart(query.cbegin());
std::smatch match;
std::vector<std::string> img_info, txt_info;
std::vector<int> ids {};
while (std::regex_search(searchStart, query.cend(), match, img_regex)) {
auto txt_ids = tokenizer_encode(match.prefix().str());
ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
auto img_ids = url_encode(match[1].str());
ids.insert(ids.end(), img_ids.begin(), img_ids.end());
searchStart = match.suffix().first;
}
if (searchStart != query.cend()) {
auto txt_ids = tokenizer_encode(std::string(searchStart, query.cend()));
ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
}
// auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n";
ids.insert(ids.begin(), {198, 151644, 872, 198});
ids.insert(ids.end(), {151645, 198, 151644, 77091, 198});
return ids;
}
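// Illustrative: "<img>./demo.jpg</img>Describe this image" becomes
//   {198, 151644, 872, 198}               // "\n<|im_start|>user\n"
//   + url_encode("./demo.jpg")            // fixed imgpad_len_-id image span
//   + tokenizer_encode("Describe this image")
//   + {151645, 198, 151644, 77091, 198}   // "<|im_end|>\n<|im_start|>assistant\n"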

VARP Qwen_vl::gen_attention_mask(int seq_len) {
if (seq_len == 1) {
auto attention_mask = _Input({1, 1, 1, all_seq_len_ + 1}, NCHW, halide_type_of<float>());
auto ptr = attention_mask->writeMap<float>();
for (int i = 0; i < all_seq_len_ + 1; i++) {
ptr[i] = 0;
}
return attention_mask;
} else {
auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of<float>());
auto ptr = attention_mask->writeMap<float>();
for (int i = 0; i < seq_len; i++) {
for (int j = 0; j < seq_len; j++) {
ptr[seq_len * i + j] = (j > i) * std::numeric_limits<float>::lowest();
}
}
return attention_mask;
}
}
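// Illustrative: during prefill with seq_len == 3 the mask is
//   [ 0, lowest, lowest ]
//   [ 0,      0, lowest ]
//   [ 0,      0,      0 ]
// where lowest is std::numeric_limits<float>::lowest(); the single-token decode
// path instead returns an all-zero mask over all_seq_len_ + 1 positions.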

// Llama2_7b
std::vector<int> Llama2_7b::tokenizer(const std::string& query) {
auto ids = tokenizer_encode(query);
