Commit

merge

XieWeikai committed Oct 27, 2024
2 parents b8e230e + 64dd7b6 commit 4f7b03d
Showing 73 changed files with 53,300 additions and 1,870 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -34,8 +34,6 @@ mllm.egg-info/

examples/demo_deepseek.cpp
src/models/deepseek/*
examples/demo_phonellm.cpp
src/models/phonellm/*
examples/demo_minicpm3.cpp
src/models/minicpm3/*
examples/demo.cpp
10 changes: 8 additions & 2 deletions CMakeLists.txt
@@ -68,6 +68,13 @@ endif()

# backend options
option(QNN "Enable QNN" OFF)
option(QNN_OLD_FRONTEND "Enable Old QNN" OFF)
if(QNN)
add_definitions(-DUSE_QNN) # the USE_QNN should come before cpu subdirectory
endif()
if(QNN_OLD_FRONTEND)
add_definitions(-DOLD_QNN)
endif()

if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
@@ -116,8 +123,7 @@ include_directories(${PROJECT_SOURCE_DIR}/third_party/pybind11/include)

add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/cpu)

if(QNN)
add_definitions(-DUSE_QNN)
if(QNN) # QNN lib
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn)
endif()

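With this change, QNN support is toggled entirely at configure time. A minimal configure sketch under assumed defaults (build directory name and generator are not part of this commit, and the Android toolchain arguments used by scripts/build_qnn_android.sh are omitted):

mkdir -p build && cd build
# -DQNN=ON defines USE_QNN before the cpu/qnn subdirectories are added;
# -DQNN_OLD_FRONTEND=ON additionally defines OLD_QNN for the old QNN frontend.
cmake .. -DQNN=ON -DQNN_OLD_FRONTEND=ON
make -j4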
5 changes: 3 additions & 2 deletions examples/CMakeLists.txt
@@ -54,7 +54,6 @@ endmacro()


## new demos

func_llm_add_executable(benchmark)
func_llm_add_executable(demo_llama)
func_llm_add_executable(demo_tinyllama)
@@ -72,7 +71,8 @@ func_llm_add_executable(demo_smollm)
func_llm_add_executable(demo_openelm)
func_llm_add_executable(demo_dclm)
func_llm_add_executable(demo_bert)
# func_llm_add_executable(demo_phonellm)
func_llm_add_executable(demo_phonelm)


func_vlm_add_executable(demo_llava)
func_vlm_add_executable(demo_fuyu)
@@ -84,6 +84,7 @@ func_vlm_add_executable(demo_imagebind_1mod)

# QNN demo
if(QNN)
func_llm_add_executable(demo_qnn)
func_llm_add_executable(main_qwen_npu)
endif()

60 changes: 60 additions & 0 deletions examples/demo_phonelm.cpp
@@ -0,0 +1,60 @@
#include <iostream>
#include <vector>
#include "Types.hpp"
#include "cmdline.h"
#include "models/phonelm/modeling_phonelm.hpp"
#include "models/smollm/tokenization_smollm.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'o', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-fp32.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 40);
cmdParser.parse_check(argc, argv);

string merge_path = cmdParser.get<string>("merge");
string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);

string system_prompt_start;
string system_prompt_end;

PhoneLMConfig config(tokens_limit, "1.5B");
auto model = PhoneLMForCausalLM(config);
model.load(model_path);

vector<string> in_strs = {
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto input_tensor = tokenizer.tokenize(input_str);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
LlmTextGeneratorOpts opt{
.max_new_tokens = 100,
.do_sample = false,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});
model.clear_kvcache();
std::cout << "\n";
}
return 0;
}
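Assuming the demo is registered through func_llm_add_executable(demo_phonelm) above, a typical run might look like the sketch below; the binary location and the vocab, merge, and model paths simply echo the option defaults in this demo and are not verified against this commit:

./demo_phonelm -v ../vocab/phonelm_vocab.mllm \
               -e ../vocab/phonelm_merges.txt \
               -o ../models/phonelm-1.5b-instruct-fp32.mllm \
               -l 400 -t 4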
33 changes: 28 additions & 5 deletions examples/demo_qnn.cpp
@@ -1,6 +1,7 @@
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/modeling_qwen_npu.hpp"
#include "models/qwen/modeling_qwen.hpp"
#include "models/qwen/tokenization_qwen.hpp"

using namespace mllm;
@@ -9,7 +10,7 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-chat-int8.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B]", false, "1.8B");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
@@ -24,25 +25,29 @@ int main(int argc, char **argv) {

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
auto model = QWenForCausalLM(config);
auto model = QWenForCausalLM_NPU(config);
model.load(model_path);
// auto decoding_model = QWenForCausalLM(config);
// decoding_model.load("../models/qwen-1.5-1.8b-chat-q4k.mllm");

vector<string> in_strs = {
" Give me a short introduction to large language model.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto input_tensor = tokenizer.tokenize(input_str);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 100,
.do_sample = true,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
@@ -51,6 +56,24 @@ int main(int argc, char **argv) {
std::cout << output_string << std::flush;
return true;
});
std::cout << "FINISH\n";

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = false,
};
// decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// auto out_string = tokenizer.detokenize({out_token});
// auto [isOk, print_string] = processOutput(out_string);
// if (isOk) {
// std::cout << print_string << std::flush;
// } else {
// return false;
// }
// return true;
// });
}
}
1 change: 1 addition & 0 deletions examples/demo_tinyllama.cpp
@@ -46,6 +46,7 @@ int main(int argc, char **argv) {
auto [out_string, out_token] = tokenizer.detokenize(result[0]);
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { break; }
std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
13 changes: 4 additions & 9 deletions examples/main_qwen_npu.cpp
@@ -138,17 +138,12 @@ int main(int argc, char **argv) {
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;

// cpuExe.run(&cpuNet, {input});
// auto result = cpuExe.result();
// auto token_idx = postProcessing(result[0], input);

// auto out_token = tokenizer.detokenize({token_idx});
// std::cout << out_token << std::flush;
// exit(0);

do {
// 1: Prefill stage using NPU chunk execute
npuExe.run(npu_ctx, &npuNet, {input});
if (chunk == 1)
npuExe.run(npu_ctx, &npuNet, {input});
else
npuExe.runExp(npu_ctx, &npuNet, {input});
auto result = npuExe.result();

// inter model for prefill-decode
56 changes: 11 additions & 45 deletions examples/main_qwen_npu.hpp
@@ -10,11 +10,12 @@ namespace modeling {
NetTensor *Qwen_FFN_NPU(Context *c, NetTensor *i, int hidden_dim, int ffn_hidden_dim, string name) {
auto *x = _LinearINT8({i}, hidden_dim, ffn_hidden_dim, false, name + ".gate_proj");
auto *y = _LinearINT8({i}, hidden_dim, ffn_hidden_dim, false, name + ".up_proj");
x = _Dequantize({x}, true, (string)name + ".gate_proj.dequantize", true);
y = _Dequantize({y}, true, (string)name + ".up_proj.dequantize", true);
x = _SiLU({x}, name + ".silu");
x = *x * y;
x = _Quantize({x}, true, (string)name + ".down_proj.quantize");
x = _SuperSiLU({x,y}, name + ".supersilu");
// x = _Dequantize({x}, true, (string)name + ".gate_proj.dequantize", false);
// y = _Dequantize({y}, true, (string)name + ".up_proj.dequantize", false);
// x = _SiLU({x}, name + ".silu");
// x = *x * y;
// x = _Quantize({x}, true, (string)name + ".down_proj.quantize");
x = _LinearINT8({x}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj");
x = _Dequantize({x}, true, (string)name + ".down_proj.dequantize");
return x;
@@ -29,9 +30,9 @@ std::vector<NetTensor *> Qwen_CPUNPUAttention(Context *c, NetTensor *x, NetTenso
k = k->view(1, head_size, seq / chunk, hidden_size);
v = v->view(1, head_size, seq / chunk, hidden_size);

q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize");
k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize");
v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize");
q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize", true);
k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize", false);
v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize", false);

v = _Transpose({v}, {0, 2, 3, 1}, (string)name + ".v_proj.transpose");

@@ -153,41 +154,6 @@ NetTensor *Qwen_FFN_CPU_q4k(Context *c, NetTensor *i, int hidden_dim, int ffn_hi
return x;
}

void qwen_cpu(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hidden_dim = 11008, int mutil_head_size = 32, int cache_max = 200, int seq = 256, int chunk = 2) {
auto *i = _Input(c);
i = _Embedding({i}, vocab_size, hidden_dim, (string) "model.embed_tokens");

for (int layer = 0; layer < 24; ++layer) {
auto res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".input_layernorm");

i = *Qwen_CPUAttention(c, res, hidden_dim, hidden_dim / mutil_head_size, mutil_head_size, cache_max, (string) "model.layers." + std::to_string(layer) + ".self_attn", seq, chunk) + i;

res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm");

if (layer != 6 && layer != 1 && layer != 2) {
i = *Qwen_FFN_CPU(c, res, hidden_dim, ffn_hidden_dim, (string) "model.layers." + std::to_string(layer) + ".mlp") + i;
} else {
auto name = (string) "model.layers." + std::to_string(layer) + ".mlp";

auto *x = _LinearINT8({res}, hidden_dim, ffn_hidden_dim, false, name + ".gate_proj");
x = _SiLU({x}, name + ".silu");
auto *y = _LinearINT8({res}, hidden_dim, ffn_hidden_dim, false, name + ".up_proj");
x = *x * y; // x = _Mul( {x, y}, name+".dot");

auto *i1 = x;
x = _LinearINT8({x}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj");

auto *i2 = x;

i = *x + i;

i = _LinearINT8Shadow({i1, i2, i}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj.shadow");
}
}
i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.norm");
i = _Linear({i}, hidden_dim, vocab_size, false, "lm_head");
}

void qwen_cpu_q4k(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hidden_dim = 11008, int mutil_head_size = 32, int cache_max = 200, int seq = 256, int chunk = 2) {
auto *i = _Input(c);
i = _Embedding({i}, vocab_size, hidden_dim, (string) "model.embed_tokens");
@@ -242,9 +208,9 @@ void qwen_npu(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn

res = i;

i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm");
i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm", false);

i = _Quantize({i}, true, (string) "model.layers." + std::to_string(layer) + ".mlp.up_proj.quantize");
// i = _Quantize({i}, true, (string) "model.layers." + std::to_string(layer) + ".mlp.up_proj.quantize");

i = i->view(1, static_cast<int>(seq / chunk / 32), static_cast<int>(32), hidden_dim);

3 changes: 3 additions & 0 deletions include/OpDefined.hpp
@@ -61,6 +61,7 @@ enum OpType {
MERGEOUTPUT,
SPLITINPUT,
IROPE,
SUPERSILU,
OP_NUM
};

@@ -107,6 +108,7 @@ static const vector<string> OpNames = {
"Range",
"Where",
"Replace",
"Predictor",
"SparseLinear",
"SparseIdLinear",
"ElasticLinear",
Expand All @@ -117,6 +119,7 @@ static const vector<string> OpNames = {
"MergeOutput",
"SplitInput",
"IRoPE",
"SuperSiLU",
"OP_NUM"};

enum TensorFuncType {
4 changes: 2 additions & 2 deletions include/Types.hpp
@@ -93,9 +93,9 @@ inline std::map<std::vector<int>, ChlType> Chls2Type = {
{{0, 3, 4, 1, 2}, BWCTH}};

enum TensorType {
INPUT_TENSOR = 0,
INPUT_TENSOR = 0, // used for input of the model
NORMAL_TENSOR,
OUTPUT_TENSOR,
GRAPH_OUTPUT, // used for output of a graph
};

enum Chl {
3 changes: 2 additions & 1 deletion scripts/build_qnn_android.sh
@@ -12,6 +12,7 @@ cmake .. \
-DQNN=ON \
-DDEBUG=OFF \
-DTEST=OFF \
-DQUANT=OFF
-DQUANT=OFF \
-DQNN_OLD_FRONTEND=ON

make -j4
4 changes: 4 additions & 0 deletions src/Backend.cpp
@@ -3,6 +3,7 @@
#include <memory>
#include <unordered_map>
#include <mutex>
#include "Layer.hpp"

namespace mllm {
extern void registerCPUBackendCreator();
@@ -29,6 +30,9 @@ static std::unordered_map<BackendType, std::shared_ptr<BackendCreator>> &GetBack
}

const std::shared_ptr<BackendCreator> GetBackendCreator(BackendType type) {
if (type == MLLM_QNN) {
Layer::use_layername_2_tensorname = false;
}
registerBackend();

auto &gExtraCreator = GetBackendCreatorMap();
5 changes: 5 additions & 0 deletions src/Backend.hpp
@@ -13,6 +13,11 @@ class Op;
class Tensor;
class Backend;

// KVCache map for QNN-CPU KVCache sharing
#ifdef USE_QNN
static std::unordered_map<string, Op *> kv_cache_map;
#endif

class TensorFunction {
public:
virtual void setup(vector<Tensor *> outputs, vector<Tensor *> inputs, vector<float> args) = 0;