UbiquitousLearning · yirongjie · Aug 2, 2024 · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -481,6 +481,8 @@ endif ()
 add_executable(demo_qwen ${PROJECT_SOURCE_DIR}/examples/demo_qwen.cpp ${DIR_SRC_CPU} ${DIR_SRC_MEM_MANAGER} ${DIR_SRC_EXP} ${DIR_SRC}
         src/tokenizers/Tokenizer.cpp
         src/tokenizers/BPE/Bpe.cpp
+        src/tokenizers/Unicode.cpp
+        src/tokenizers/UnicodeData.cpp
         src/processor/PreProcess.cpp
 )
 if (ARM AND NOT APK)
@@ -546,6 +548,8 @@ if (APK)
             src/tokenizers/Unigram/trie.hpp
             src/tokenizers/BPE/Bpe.cpp
             src/tokenizers/BPE/Bpe.hpp
+            src/tokenizers/Unicode.cpp
+            src/tokenizers/UnicodeData.cpp
             src/processor/PreProcess.hpp
             src/processor/FuyuPreProcess.hpp
             src/processor/FuyuPreProcess.cpp

diff --git a/examples/demo_qwen.cpp b/examples/demo_qwen.cpp
@@ -11,11 +11,12 @@
 #include "models/qwen/configuration_qwen.hpp"
 #include "models/qwen/modeling_qwen.hpp"
 #include "models/qwen/tokenization_qwen.hpp"
-#include "processor/PostProcess.hpp"
 
 using namespace mllm;
 
 int main(int argc, char **argv) {
+    // std::iostream::sync_with_stdio(false);
+
     cmdline::parser cmdParser;
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
     cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
@@ -70,6 +71,6 @@ int main(int argc, char **argv) {
             }
             return true;
         });
-        printf("\n");
+        std::cout << "\n";
     }
 }
diff --git a/src/backends/cpu/compute/GEMM_AArch64.hpp b/src/backends/cpu/compute/GEMM_AArch64.hpp
@@ -1,34 +1,37 @@
 #ifndef MLLM_GEMM_AARCH64_HPP
 #define MLLM_GEMM_AARCH64_HPP
 
-
 #include "VecDot.hpp"
 using namespace mllm;
 
-
-
 // Quantization
-void quantize_q8_0_4x4(const float * __restrict x, void * __restrict y, int64_t k);
-void quantize_q8_0_4x8(const float * __restrict x, void * __restrict y, int64_t k);
+void quantize_q8_0_4x4(const float *__restrict x, void *__restrict y, int64_t k);
+void quantize_q8_0_4x8(const float *__restrict x, void *__restrict y, int64_t k);
 
-void quantize_mat_q8_0(const float * __restrict x, void * __restrict y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
+void quantize_mat_q8_0(const float *__restrict x, void *__restrict y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
 
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-size_t quantize_q4_0_4x4(const float * __restrict src, void * __restrict dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0_4x8(const float * __restrict src, void * __restrict dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
-size_t quantize_q4_0_8x8(const float * __restrict src, void * __restrict dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_4x4(const float *__restrict src, void *__restrict dst, int64_t nrows, int64_t n_per_row, const float *imatrix);
+size_t quantize_q4_0_4x8(const float *__restrict src, void *__restrict dst, int64_t nrows, int64_t n_per_row, const float *imatrix);
+size_t quantize_q4_0_8x8(const float *__restrict src, void *__restrict dst, int64_t nrows, int64_t n_per_row, const float *imatrix);
 
 // GEMV
-void mllm_gemv_q4_0_4x4_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
-void mllm_gemv_q4_0_4x8_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
-void mllm_gemv_q4_0_8x8_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
+void mllm_gemv_q4_0_4x4_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+void mllm_gemv_q4_0_4x8_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+void mllm_gemv_q4_0_8x8_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+// void mllm_gemv_q4_0_4x4_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+// void mllm_gemv_q4_0_4x8_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+// void mllm_gemv_q4_0_8x8_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
 
 // GEMM
-void mllm_gemm_q4_0_4x4_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
-void mllm_gemm_q4_0_4x8_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
-void mllm_gemm_q4_0_8x8_q8_0(int n, float * __restrict s, size_t bs, const void * __restrict vx, const void * __restrict vy, int nr, int nc);
-
-void quantize_row_q4_0_4x4(const float * __restrict x, void * __restrict y, int k);
-void quantize_row_q4_0_4x4(const float * __restrict x, void * __restrict y, int k, int raw);
+void mllm_gemm_q4_0_4x4_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+void mllm_gemm_q4_0_4x8_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+void mllm_gemm_q4_0_8x8_q8_0(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc);
+// void mllm_gemm_q4_0_4x4_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+// void mllm_gemm_q4_0_4x8_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+// void mllm_gemm_q4_0_8x8_q8_0_bias(int n, float *__restrict s, size_t bs, const void *__restrict vx, const void *__restrict vy, int nr, int nc, const void *__restrict bias);
+
+void quantize_row_q4_0_4x4(const float *__restrict x, void *__restrict y, int k);
+void quantize_row_q4_0_4x4(const float *__restrict x, void *__restrict y, int k, int raw);
 
 #endif // MLLM_GEMM_HPP
diff --git a/src/models/qwen/tokenization_qwen.hpp b/src/models/qwen/tokenization_qwen.hpp
@@ -12,6 +12,7 @@
 
 #include "tokenizers/BPE/Bpe.hpp"
 #include "tokenizers/Tokenizer.hpp"
+#include "tokenizers/Unicode.hpp"
 #include <algorithm>
 #include <unordered_map>
 
@@ -45,10 +46,15 @@ static std::vector<int> __ord(std::string v) {
 }
 
 static const std::string PAT_STR = R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?:$|[^\S])|\s+)";
+static const std::string SPLIT_PAT_STR = R"(<\|im_start\|>|<\|im_end\|>|<\|endoftext\|>)"; // regex alternation of the three special-token markers, with each literal '|' escaped
+static const std::vector<std::string> FIXED_PAT_STRS = { // pattern list that QWenTokenizer::tokenize() hands to unicode_regex_split()
+    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", // PAT_STR with the (?i:...) group written as explicit upper/lower character classes and a (?!\S) lookahead in place of (?:$|[^\S])
+};
 
 class QWenTokenizer final {
 public:
-    explicit QWenTokenizer(const std::string &vocab_file, const std::string &merge_file) {
+    explicit QWenTokenizer(const std::string &vocab_file, const std::string &merge_file, bool split_special_tokens = false) :
+        split_special_tokens_(split_special_tokens) {
         Module::initBackend(MLLM_CPU);
         tokenizer = new BPETokenizer(vocab_file);
 
@@ -114,28 +120,95 @@ class QWenTokenizer final {
         return elems;
     }
 
+    std::vector<std::string> _splitWithDelimiters(const std::string &str, const std::vector<std::string> &delimiters) { // splits str into ordered pieces, keeping every matched delimiter as its own element
+        std::string s = str; // working copy; the consumed prefix is cut off after each delimiter
+        std::vector<std::string> result; // pieces in input order; empty pieces are never added
+        size_t pos = 0; // scan offset into the current s
+        auto isDelimiter = [&](size_t currentPos) { // true when any delimiter occurs in s exactly at currentPos
+            for (const auto &delimiter : delimiters) {
+                if (currentPos + delimiter.length() <= s.length() && s.substr(currentPos, delimiter.length()) == delimiter) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        while (pos < s.length()) {
+            if (isDelimiter(pos)) {
+                if (pos != 0) {
+                    result.push_back(s.substr(0, pos)); // plain text that precedes the delimiter
+                }
+                size_t delimiterLength = delimiters.front().length(); // placeholder; the loop below overwrites it because isDelimiter(pos) was true
+                for (const auto &delimiter : delimiters) { // first match in list order wins, not the longest one
+                    if (s.substr(pos, delimiter.length()) == delimiter) {
+                        delimiterLength = delimiter.length();
+                        result.push_back(delimiter); // the delimiter itself becomes an output element
+                        break;
+                    }
+                }
+                pos += delimiterLength; // NOTE(review): an empty delimiter string would add 0 here, so the loop would never advance
+                s = s.substr(pos); // drop everything up to and including the delimiter
+                pos = 0; // restart the scan at the front of the remainder
+            } else {
+                ++pos;
+            }
+        }
+
+        if (!s.empty()) { // text after the last delimiter, or the whole input when nothing matched
+            result.push_back(s);
+        }
+
+        return result;
+    }
+
     Tensor tokenize(std::string &text, int str_i = 0) {
         std::vector<token_id_t> ret;
 
-        auto splited = stringSplit(text, ' ');
-        if (text[0] == ' ') splited[0] = " " + splited[0];
-        for (auto piece : splited) {
-            // look up table
-            std::string token;
-            for (auto b : UTF8(piece)) token += byte_encoder_[b];
+        if (split_special_tokens_) { // tokenize(): collects BPE ids for text in ret; this branch regex-splits the whole text without isolating special-token markers
+            const auto word_collection = unicode_regex_split(text, FIXED_PAT_STRS); // presumably the pre-token pieces selected by the patterns; defined in tokenizers/Unicode.hpp, TODO confirm
+            for (auto &piece : word_collection) {
+                // look up table (disabled: piece is passed to BPE without the byte_encoder_ mapping)
+                // std::string token;
+                // for (auto b : UTF8(piece)) token += byte_encoder_[b];

-            // using bpe
-            std::vector<token_id_t> tmp;
-            tokenizer->tokenize(token, tmp, /*bos*/ false, /*byte fallback*/ true, "");
+                // using bpe
+                std::vector<token_id_t> tmp;
+                tokenizer->tokenize(piece, tmp, false, true, ""); // bos = false, byte fallback = true
+                ret.insert(ret.end(), tmp.begin(), tmp.end() - 1); // appends tmp minus its last element (presumably a trailing marker added by BPETokenizer, TODO confirm); NOTE(review): tmp is not asserted non-empty here, unlike the non-special path below
+            }
+        } else { // special-token markers are cut out first and tokenized on their own path
+            auto parts = _splitWithDelimiters(text, special_tokens); // ordered pieces; each marker is a separate element
+            // for (auto p : parts) {
+            //     std::cout << "\"" << p << "\"" << std::endl;
+            // }
+            for (auto &p : parts) {
+                if (std::find(special_tokens.begin(), special_tokens.end(), p) != special_tokens.end()) { // p is exactly one of the special-token markers
+                    std::string token;
+                    for (auto b : UTF8(p)) token += byte_encoder_[b]; // concatenates the byte_encoder_ entry of every element UTF8(p) yields

-            ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
+                    std::vector<token_id_t> tmp;
+                    tokenizer->tokenize(token, tmp, false, special_tokens, true); // overload that also receives special_tokens; its behaviour lives in BPETokenizer (not visible here)
+                    ret.insert(ret.end(), tmp.begin(), tmp.end() - 1); // last element of tmp is dropped, as in the other paths
+                } else {
+                    const auto word_collection = unicode_regex_split(p, FIXED_PAT_STRS); // ordinary text between markers gets the same regex split as the branch above
+                    for (auto &piece : word_collection) {
+                        // look up table (disabled: piece is passed to BPE without the byte_encoder_ mapping)
+                        // std::string token;
+                        // for (auto b : UTF8(piece)) token += byte_encoder_[b];
+
+                        // using bpe
+                        std::vector<token_id_t> tmp;
+                        tokenizer->tokenize(piece, tmp, false, true, ""); // bos = false, byte fallback = true
+                        assert(tmp.size() != 0); // guards the tmp.end() - 1 on the next line
+                        ret.insert(ret.end(), tmp.begin(), tmp.end() - 1);
+                    }
+                }
+            }
         }
-        // FIXME if we need bos or not?
-        ret.insert(ret.begin(), bos_id_);
+        // NOTE: bos_id_ is not inserted into ret; the ids are converted as collected
         return Tokenizer::tokens2Input(ret);
     }
 
-    // FIXME std::string += std::string has performance issues when string is large.
     std::string _byte_decode_(const std::string &text) {
         std::string ret;
         auto _ = ORD(text);
@@ -168,11 +241,17 @@ class QWenTokenizer final {
     }
 
 public:
+    bool split_special_tokens_ = false; // true: tokenize() regex-splits the whole text; false: entries of special_tokens are isolated first and tokenized on a separate path
     BPETokenizer *tokenizer;
     std::unordered_map<int, std::string> byte_encoder_;
     std::unordered_map<std::string, int> byte_decoder_;
     std::unordered_map<std::string, unsigned int> bpe_ranks_;
     token_id_t eos_id_ = 151645, bos_id_ = 151643;
+    std::vector<std::string> special_tokens = { // markers that tokenize() cuts out of the input when split_special_tokens_ is false
+        "<|endoftext|>",
+        "<|im_start|>",
+        "<|im_end|>",
+    };
 };
 
 #undef UTF8