Commit

support Qwen-VL-Chat
wangzhaode committed Jan 3, 2024
1 parent 312ce0c commit f2add57
Showing 4 changed files with 214 additions and 24 deletions.
9 changes: 8 additions & 1 deletion CMakeLists.txt
@@ -2,14 +2,18 @@ cmake_minimum_required(VERSION 3.0)
project(mnn-llm)

option(BUILD_FOR_ANDROID "Build for android with mini memory mode." OFF)
option(USING_VISUAL_MODEL "Using visual model will need deps: MNNOpenCV and httplib." OFF)
option(USING_DISK_EMBED "Using disk embedding to save memory." OFF)
option(DUMP_PROFILE_INFO "Dump profile info when chat." OFF)
option(WITH_CUDA "Enable CUDA support" OFF)

if (USING_DISK_EMBED)
add_definitions(-DUSING_DISK_EMBED)
endif()

if (USING_VISUAL_MODEL)
add_definitions(-DUSING_VISUAL_MODEL)
endif()

if (DUMP_PROFILE_INFO)
add_definitions(-DDUMP_PROFILE_INFO)
endif()
@@ -35,6 +39,9 @@ else()
set_target_properties(llm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)

target_link_libraries(llm MNN MNN_Express)
if (USING_VISUAL_MODEL)
target_link_libraries(llm MNNOpenCV)
endif()
endif()

if (BUILD_FOR_ANDROID)
13 changes: 9 additions & 4 deletions README.md
@@ -107,8 +107,6 @@ To export llm models to `onnx` and `mnn`, please use [llm-export](https://github.com/wang
| windows | PC | Intel(R) Core(TM) i7-13700K | 32GB |




### Download the int4 model
```
# <model> like `chatglm-6b`
@@ -154,10 +152,16 @@ To export llm models to `onnx` and `mnn`, please use [llm-export](https://github.com/wang
./script/android_build.sh
```

The `CPU` backend is used by default; to use another backend, add the corresponding `MNN` compile macros in the script
Some compile macros:
- `BUILD_FOR_ANDROID`: build for Android devices;
- `USING_VISUAL_MODEL`: support models with multimodal capability; requires the `libMNNOpenCV` dependency;
- `USING_DISK_EMBED`: implement the embedding by loading it from disk, to save memory;
- `DUMP_PROFILE_INFO`: dump profiling data to the command line after each chat;

The `CPU` backend is used by default and multimodal capability is not enabled; to use another backend or capability, add the corresponding `MNN` compile macros to the script that builds MNN
- cuda: `-DMNN_CUDA=ON`
- opencl: `-DMNN_OPENCL=ON`

- opencv: `-DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON`

### 4. Run

@@ -184,6 +188,7 @@ adb shell "cd /data/local/tmp && export LD_LIBRARY_PATH=. && ./cli_demo qwen-1.8
- [codegeex2-6b](https://modelscope.cn/models/ZhipuAI/codegeex2-6b/summary)
- [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary)
- [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary)
- [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)
- [Qwen-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary)
- [Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary)
- [internlm-chat-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary)
46 changes: 37 additions & 9 deletions include/llm.hpp
@@ -54,11 +54,7 @@ class Llm {
}
virtual ~Llm() = default;
static Llm* createLLM(const std::string& path, std::string model_type = "auto");
VARP disk_embedding(const std::vector<int>& input_ids);
void load(const std::string& model_dir);
int forward(const std::vector<int>& input_ids);
std::vector<int> tokenizer_encode(const std::string& input_str);
std::string decode(int id);
void chat();
void warmup();
std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
@@ -75,14 +71,16 @@ class Llm {
// time
int64_t prefill_us_ = 0;
int64_t decode_us_ = 0;
private:
virtual std::vector<int> tokenizer(const std::string& query) = 0;
virtual VARP gen_attention_mask(int seq_len) = 0;
virtual VARP gen_position_ids(int seq_len) = 0;
virtual bool is_stop(int token_id) = 0;
protected:
VARP embedding(const std::vector<int>& input_ids);
VARP txt_embedding(const std::vector<int>& input_ids);
int forward(const std::vector<int>& input_ids);
std::vector<int> tokenizer_encode(const std::string& input_str);
std::string decode(int id);
protected:
// model configs
bool is_single_ = false;
bool is_visual_ = false;
int layer_nums_ = 0;
int hidden_size_ = 4096;
std::vector<int> key_value_shape_ = {};
@@ -91,6 +89,13 @@ class Llm {
float load_progress_ = 0.f;
// tokenizer
std::unique_ptr<Tokenizer> tokenizer_;
std::shared_ptr<Module> visual_module_;
private:
virtual VARP visual_embedding(const std::vector<int>& input_ids) { return nullptr; }
virtual std::vector<int> tokenizer(const std::string& query) = 0;
virtual VARP gen_attention_mask(int seq_len) = 0;
virtual VARP gen_position_ids(int seq_len) = 0;
virtual bool is_stop(int token_id) = 0;
private:
// MNN Modules
std::shared_ptr<Executor::RuntimeManager> runtime_manager_;
@@ -160,6 +165,29 @@ class Qwen_7b : public Llm {
virtual bool is_stop(int token_id) override;
};

class Qwen_vl : public Qwen_7b {
public:
Qwen_vl() {
model_name_ = "Qwen_vl";
is_visual_ = true;
layer_nums_ = 32;
key_value_shape_ = {2, 1, 0, 32, 128};
hidden_size_ = 4096;
tokenizer_.reset(new Tiktoken);
}
private:
const int img_size_ = 448;
const int imgpad_len_ = 256;
const int img_start_ = 151857;
const int img_end_ = 151858;
const int img_pad_ = 151859;
private:
std::vector<int> url_encode(const std::string& url);
virtual VARP visual_embedding(const std::vector<int>& input_ids) override;
virtual std::vector<int> tokenizer(const std::string& query) override;
virtual VARP gen_attention_mask(int seq_len) override;
};

class Qwen_1_8b : public Qwen_7b {
public:
Qwen_1_8b() {
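The new `Qwen_vl` class above plugs the visual path into the existing `Llm` interface, so callers keep using `createLLM`/`load`/`response`. A minimal usage sketch (illustrative only: the model directory name, the image URL, and the reliance on type detection from the path are assumptions, not part of this commit):

```cpp
#include <memory>
#include "llm.hpp"

int main() {
    // Assumed layout: an exported qwen-vl-chat directory with the *.mnn parts
    // plus visual.mnn; the "vl" substring in the name is expected to select Qwen_vl.
    std::unique_ptr<Llm> llm(Llm::createLLM("./qwen-vl-chat"));
    llm->load("./qwen-vl-chat");
    // Qwen_vl::tokenizer rewrites <img>...</img> into img_start_/img_pad_/img_end_
    // placeholder ids; visual_embedding() then splices image features into that span.
    llm->response("<img>https://example.com/demo.jpg</img>Describe this image.");
    return 0;
}
```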
170 changes: 160 additions & 10 deletions src/llm.cpp
@@ -7,13 +7,18 @@
// #define MNN_OPEN_TIME_TRACE 1

#include <iostream>
#include <fstream>
#include <regex>

#include "llm.hpp"
#include "tokenizer.hpp"
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/AutoTime.hpp>
#include "llm.hpp"
#include "tokenizer.hpp"

#include <fstream>
#ifdef USING_VISUAL_MODEL
#include "httplib.h"
#include <cv/cv.hpp>
#endif

Llm* Llm::createLLM(const std::string& path, std::string model_type) {
auto size = path.size();
@@ -43,6 +48,8 @@ Llm* Llm::createLLM(const std::string& path, std::string model_type) {
} else if (model_type.find("qwen") != std::string::npos) {
if (model_type.find("1.8") != std::string::npos) {
llm = new Qwen_1_8b;
} else if (model_type.find("vl") != std::string::npos) {
llm = new Qwen_vl;
} else {
llm = new Qwen_7b;
}
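// Note: model_type is assumed to be derived from the model directory name, so a
// path like "qwen-vl-chat" hits the "qwen" branch and then the "vl" check,
// resolving to the new Qwen_vl class.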
@@ -223,6 +230,14 @@ void Llm::load(const std::string& model_dir) {
MNN_PRINT("Done!\n");
load_progress_ += step;
#endif
if (is_visual_) {
std::string visual_model_path = model_dir + "/visual.mnn";
MNN_PRINT("[%3.0f%% ] load %s model ... ", load_progress_, visual_model_path.c_str());fflush(stdout);
module_config.rearrange = false;
visual_module_.reset(Module::load({}, {}, visual_model_path.c_str(), runtime_manager_, &module_config));
MNN_PRINT("Done!\n");
module_config.rearrange = true;
}
// load glm_block models
for (int i = 0; i < layer_nums_; i++) {
load_progress_ += step;
@@ -269,11 +284,7 @@ int Llm::forward(const std::vector<int>& input_ids) {
past_key_values_[0] = outputs[1];
} else {
// split block models
#ifdef USING_DISK_EMBED
auto hidden_states = disk_embedding(input_ids);
#else
auto hidden_states = modules_[layer_nums_ + 1]->onForward({inputs_ids_})[0];
#endif
auto hidden_states = embedding(input_ids);
for (int i = 0; i < layer_nums_; i++) {
AUTOTIME;
auto outputs = modules_[i]->onForward({hidden_states, attention_mask, position_ids, past_key_values_[i]});
@@ -292,9 +303,15 @@ return id;
return id;
}

VARP Llm::disk_embedding(const std::vector<int>& input_ids) {
VARP Llm::txt_embedding(const std::vector<int>& input_ids) {
#ifndef USING_DISK_EMBED
// using model forward
auto inputs_ids_ = _Const(input_ids.data(), {static_cast<int>(input_ids.size())}, NCHW, halide_type_of<int>());
auto hidden_states = modules_[layer_nums_ + 1]->onForward({inputs_ids_})[0];
return hidden_states;
#endif
AUTOTIME;
// disk embedding save memory
// disk embedding to save memory
size_t seq_len = input_ids.size();
auto embedding = _Input({static_cast<int>(seq_len), 1, hidden_size_}, NCHW);
size_t size = hidden_size_ * sizeof(int16_t);
@@ -314,6 +331,13 @@ VARP Llm::disk_embedding(const std::vector<int>& input_ids) {
return embedding;
}

VARP Llm::embedding(const std::vector<int>& input_ids) {
if (is_visual_ && !gen_seq_len_) {
return visual_embedding(input_ids);
}
return txt_embedding(input_ids);
}
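// Visual features are only injected during prefill (gen_seq_len_ == 0); every later
// decode step of the same response goes through the plain text embedding path.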

std::vector<int> Llm::tokenizer_encode(const std::string& input_str) {
auto ids = tokenizer_->encode(input_str);
return ids;
@@ -463,6 +487,132 @@ bool Qwen_7b::is_stop(int token_id) {
return token_id >= 151645;
}

// Qwen_vl
std::vector<int> Qwen_vl::url_encode(const std::string& url) {
std::vector<int> ascii_values(imgpad_len_, img_pad_);
ascii_values[0] = img_start_;
ascii_values[imgpad_len_ - 1] = img_end_;
for (int i = 0; i < url.size(); i++) {
ascii_values[i + 1] = static_cast<int>(url[i]);
}
return ascii_values;
}
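// Illustrative (assuming the path fits within imgpad_len_ - 2 characters):
// url_encode("./demo.jpg") returns imgpad_len_ (256) ids laid out as
//   [img_start_, '.', '/', 'd', 'e', 'm', 'o', '.', 'j', 'p', 'g', img_pad_, ..., img_pad_, img_end_]
// so every image reference occupies a fixed-size span in the prompt.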

VARP Qwen_vl::visual_embedding(const std::vector<int>& input_ids) {
#ifdef USING_VISUAL_MODEL
int start_pos = 0, pad_pos = 0, end_pos = 0;
for (int i = 0; i < input_ids.size(); i++) {
int id = input_ids[i];
if (id == img_start_ && !start_pos) {
start_pos = i;
}
if (id == img_pad_ && !pad_pos) {
pad_pos = i;
}
if (id == img_end_ && !end_pos) {
end_pos = i;
}
}
if (!start_pos) {
return txt_embedding(input_ids);
}
std::vector<int> prefix(input_ids.begin(), input_ids.begin() + start_pos);
std::vector<int> img_ascii(input_ids.begin() + start_pos + 1, input_ids.begin() + pad_pos);
std::vector<int> suffix(input_ids.begin() + end_pos + 1, input_ids.end());
std::string img_path;
for (auto ascii_val : img_ascii) {
img_path += static_cast<char>(ascii_val);
}
VARP image = nullptr;
if (img_path.substr(0, 4) == "http") {
std::regex url_regex(R"(^https?://([^/]+)(/.*))");
std::smatch url_match_result;
std::string host, path;
if (std::regex_search(img_path, url_match_result, url_regex) && url_match_result.size() == 3) {
host = url_match_result[1].str();
path = url_match_result[2].str();
}
std::cout << host << "#" << path << std::endl;
httplib::Client cli(host);
auto res = cli.Get(path);
std::string img_file = "downloaded_image.jpg";
if (res && res->status == 200) {
std::ofstream file(img_file, std::ios::binary);
if (file.is_open()) {
file.write(res->body.c_str(), res->body.size());
std::cout << "Image has been downloaded successfully." << std::endl;
file.close();
} else {
std::cerr << "Unable to open file to write image." << std::endl;
exit(0);
}
} else {
std::cerr << "Failed to download image. Status code: " << (res ? res->status : 0) << std::endl;
exit(0);
}
image = MNN::CV::imread(img_file);
} else {
image = MNN::CV::imread(img_path);
}
image = MNN::CV::resize(image, {img_size_, img_size_}, 0, 0, MNN::CV::INTER_LINEAR, MNN::CV::COLOR_BGR2RGB,
{123.25239296, 117.20384, 104.50194688}, {0.0145414 , 0.01494914, 0.01416452});
image = MNN::Express::_Unsqueeze(image, {0});
image = MNN::Express::_Convert(image, NC4HW4);
auto image_embedding = visual_module_->forward(image);
image_embedding = MNN::Express::_Permute(image_embedding, {1, 0, 2});
auto prefix_embedding = txt_embedding(prefix);
auto suffix_embedding = txt_embedding(suffix);
auto embeddings = MNN::Express::_Concat({prefix_embedding, image_embedding, suffix_embedding}, 0);
#else
auto embeddings = txt_embedding(input_ids);
#endif
return embeddings;
}

std::vector<int> Qwen_vl::tokenizer(const std::string& query) {
// split query
std::regex img_regex("<img>(.*?)</img>");
std::string::const_iterator searchStart(query.cbegin());
std::smatch match;
std::vector<std::string> img_info, txt_info;
std::vector<int> ids {};
while (std::regex_search(searchStart, query.cend(), match, img_regex)) {
auto txt_ids = tokenizer_encode(match.prefix().str());
ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
auto img_ids = url_encode(match[1].str());
ids.insert(ids.end(), img_ids.begin(), img_ids.end());
searchStart = match.suffix().first;
}
if (searchStart != query.cend()) {
auto txt_ids = tokenizer_encode(std::string(searchStart, query.cend()));
ids.insert(ids.end(), txt_ids.begin(), txt_ids.end());
}
// auto prompt = "\n<|im_start|>user\n" + query + "<|im_end|>\n<|im_start|>assistant\n";
ids.insert(ids.begin(), {198, 151644, 872, 198});
ids.insert(ids.end(), {151645, 198, 151644, 77091, 198});
return ids;
}
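// Illustrative: "<img>./demo.jpg</img>Describe this image" becomes
//   {198, 151644, 872, 198}               // "\n<|im_start|>user\n"
//   + url_encode("./demo.jpg")            // fixed imgpad_len_-id image span
//   + tokenizer_encode("Describe this image")
//   + {151645, 198, 151644, 77091, 198}   // "<|im_end|>\n<|im_start|>assistant\n"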

VARP Qwen_vl::gen_attention_mask(int seq_len) {
if (seq_len == 1) {
auto attention_mask = _Input({1, 1, 1, all_seq_len_ + 1}, NCHW, halide_type_of<float>());
auto ptr = attention_mask->writeMap<float>();
for (int i = 0; i < all_seq_len_ + 1; i++) {
ptr[i] = 0;
}
return attention_mask;
} else {
auto attention_mask = _Input({1, 1, seq_len, seq_len}, NCHW, halide_type_of<float>());
auto ptr = attention_mask->writeMap<float>();
for (int i = 0; i < seq_len; i++) {
for (int j = 0; j < seq_len; j++) {
ptr[seq_len * i + j] = (j > i) * std::numeric_limits<float>::lowest();
}
}
return attention_mask;
}
}
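// Illustrative: during prefill with seq_len == 3 the mask is
//   [ 0, lowest, lowest ]
//   [ 0,      0, lowest ]
//   [ 0,      0,      0 ]
// where lowest is std::numeric_limits<float>::lowest(); the single-token decode
// path instead returns an all-zero mask over all_seq_len_ + 1 positions.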

// Llama2_7b
std::vector<int> Llama2_7b::tokenizer(const std::string& query) {
auto ids = tokenizer_encode(query);
