feat: add simple WordPiece tokenizer; add modeling_bert (unfinished)
XieWeikai committed Oct 27, 2024
1 parent 9bda904 commit b8e230e
Showing 9 changed files with 527 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
@@ -24,6 +24,7 @@ macro(func_llm_add_executable target)
${DIR_SRC}
${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/WordPiece/WordPiece.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
@@ -70,6 +71,7 @@ func_llm_add_executable(demo_minicpm)
func_llm_add_executable(demo_smollm)
func_llm_add_executable(demo_openelm)
func_llm_add_executable(demo_dclm)
func_llm_add_executable(demo_bert)
# func_llm_add_executable(demo_phonellm)

func_vlm_add_executable(demo_llava)
23 changes: 23 additions & 0 deletions examples/demo_bert.cpp
@@ -0,0 +1,23 @@
//
// Created by xwk on 24-10-23.
//
#include "models/bert/configuration_bert.hpp"
#include "models/bert/modeling_bert.hpp"
#include "models/bert/tokenization_bert.hpp"

string vocab_file = "vocab/all-MiniLM-L6-v2.mllm";
string model_file = "models/gte.mllm";

int main(int argc, char *argv[]){
BertTokenizer tokenizer(vocab_file, false);
string text = "Hello, my dog is cute.";
auto [token_ids, type_ids, position_ids] = tokenizer.process(text);
token_ids.printData<float>();

auto config = BertConfig();
auto model = BertModel(config);
model.load(model_file);

auto res = model({token_ids, type_ids, position_ids});
res[0].printData<float>();
}
1 change: 1 addition & 0 deletions examples/demo_qwen.cpp
@@ -46,6 +46,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto input_tensor = tokenizer.tokenize(input_str);
input_tensor.printData<float>();
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

1 change: 1 addition & 0 deletions src/models/bert/README.md
@@ -0,0 +1 @@
# Tongyi Qianwen (Qwen)
94 changes: 94 additions & 0 deletions src/models/bert/configuration_bert.hpp
@@ -0,0 +1,94 @@
#ifndef CONFIG_BERT_HPP
#define CONFIG_BERT_HPP
#include "Types.hpp"
#include "models/transformer/configuration_transformer.hpp"
#include <cctype>
#include <iterator>

using namespace mllm;

class BertNameConfig : public TransformerNameConfig {
public:
    /**
     * @brief BERT tensor names following the Hugging Face naming scheme
     */
void init() {
embedding_base_name = "embeddings.";

blk_name = "model.layers.";
_attn_base_name = "self.";
_ffn_base_name = "mlp.";
_q_proj_name = "query";
_k_proj_name = "key";
_v_proj_name = "value";
_o_proj_name = "o_proj";
_gate_proj_name = "gate_proj";
_up_proj_name = "up_proj";
_down_proj_name = "down_proj";
_attn_norm_name = "input_layernorm";
_ffn_norm_name = "post_attention_layernorm";
token_embd_name = "model.embed_tokens";
post_norm_name = "model.norm";
lm_head_name = "lm_head";
}
std::string embedding_base_name;

std::string blk_name;
std::string token_embd_name;
std::string post_norm_name;
std::string lm_head_name;
std::string _gate_proj_name;
};

struct BertConfig : public TransformerConfig {
explicit BertConfig(){
attention_dropout = 0.0;
bos_token_id = 151643;
eos_token_id = 151645;
hidden_act = "gelu";
hidden_size = 384;
initializer_range = 0.02;
intermediate_size = 1536;
max_position_embeddings = 512;
max_window_layers = 21;
model_type = "bert";
num_attention_heads = 12;
num_hidden_layers = 12;
num_key_value_heads = 16;
rms_norm_eps = 1e-6;
rope_theta = 1000000.0;
sliding_window = 32768;
vocab_size = 30522;
tie_embedding_words = true;

names_config.init();
};

int type_vocab_size = 2;
float layer_norm_eps = 1e-12;

float attention_dropout = 0.0;
int bos_token_id = 151643;
int eos_token_id = 151643;
std::string hidden_act = "silu";
int hidden_size = 1024;
float initializer_range = 0.02;
int intermediate_size = 2816;
int max_position_embeddings = 32768;
int max_window_layers = 21;
std::string model_type = "bert";
int num_attention_heads = 12;
int num_hidden_layers = 12;
int num_key_value_heads = 12;
double rms_norm_eps = 1e-6;
float rope_theta = 1000000.0;
int sliding_window = 32768;
int vocab_size = 151936;
bool tie_embedding_words = false;

BertNameConfig names_config;
};

#endif //! CONFIG_BERT_HPP
126 changes: 126 additions & 0 deletions src/models/bert/modeling_bert.hpp
@@ -0,0 +1,126 @@
#ifndef MODELING_BERT_HPP
#define MODELING_BERT_HPP

#include "Backend.hpp"
#include "Layer.hpp"
#include "Module.hpp"
#include "Tensor.hpp"
#include "configuration_bert.hpp"
#include <cmath>
using namespace mllm;

class BertEmbeddings : public Module {
public:
BertEmbeddings() = default;
    BertEmbeddings(int vocab_size, int hidden_size, int type_size, int max_position_embeddings, float eps, BertNameConfig &config) {
        word_embeddings = Embedding(vocab_size, hidden_size, config.embedding_base_name+"word_embeddings");
token_type_embeddings = Embedding(type_size, hidden_size, config.embedding_base_name+"token_type_embeddings");
position_embeddings = Embedding(max_position_embeddings, hidden_size, config.embedding_base_name+"position_embeddings");
layer_norm = LayerNorm(hidden_size, true, eps, config.embedding_base_name+"LayerNorm");
}

std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto inputs_embeds = word_embeddings(inputs[0]);
// if (Tensor::tensor_status == TENSOR_STATIC_READY)
// inputs_embeds.printData<float>();
auto type_embeds = token_type_embeddings(inputs[1]);
auto position_embeds = position_embeddings(inputs[2]);
auto embeddings = inputs_embeds + type_embeds + position_embeds;
return {layer_norm(embeddings)};
}

private:
Layer word_embeddings;
Layer token_type_embeddings;
Layer position_embeddings;
Layer layer_norm;
};
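
For reference, the embedding block follows the standard BERT recipe: word, token-type and position embeddings are summed element-wise and the result is layer-normalized over the hidden dimension. The snippet below is a minimal, illustrative sketch of the per-token normalization the LayerNorm layer is assumed to compute; the helper name layer_norm_ref and the plain-vector interface are hypothetical, not mllm API, and the eps default mirrors layer_norm_eps from BertConfig (1e-12 in this commit).

// Reference-only sketch (not part of mllm): per-token LayerNorm over the hidden dimension.
#include <cmath>
#include <vector>

static void layer_norm_ref(std::vector<float> &x,           // one token's hidden vector
                           const std::vector<float> &gamma, // learned scale
                           const std::vector<float> &beta,  // learned bias
                           float eps = 1e-12f) {
    float mean = 0.f, var = 0.f;
    for (float v : x) mean += v;
    mean /= x.size();
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    const float inv_std = 1.f / std::sqrt(var + eps);
    for (size_t i = 0; i < x.size(); ++i)
        x[i] = (x[i] - mean) * inv_std * gamma[i] + beta[i];
}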

class BertSelfAttention : public Module {
public:
BertSelfAttention() = default;
BertSelfAttention(BertConfig &config, const string &base_name) {
num_attention_heads = config.num_attention_heads;
attention_head_size = config.hidden_size / num_attention_heads;
all_head_size = num_attention_heads * attention_head_size;

query = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._q_proj_name);
key = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._k_proj_name);
value = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._v_proj_name);

softmax = Softmax(DIMENSION, base_name + "softmax");
}

std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
if (Tensor::tensor_status == TENSOR_STATIC_READY) {
std::cout << "emb type: " << inputs[0].ctype() << std::endl;
inputs[0].printData<float>();
}

auto key_states = key(inputs[0]);
auto query_states = query(inputs[1]);
auto value_states = value(inputs[2]);

// auto key_len = key_states.sequence();

query_states = query_states.view(-1, num_attention_heads, -1, attention_head_size);
key_states = key_states.view(-1, num_attention_heads, -1, attention_head_size);
value_states = value_states.view(-1, num_attention_heads, -1, attention_head_size);

auto attn_weight =
Tensor::mm(query_states, key_states.transpose(Chl::SEQUENCE, Chl::DIMENSION)) / std::sqrt(attention_head_size);
auto attn_score = softmax(attn_weight);
auto attn_output = Tensor::mm(attn_score, value_states);
attn_output = attn_output.view(-1,1, -1, num_attention_heads * attention_head_size);
return {attn_output};
}

private:
int num_attention_heads;
int attention_head_size;
int all_head_size;

Layer query;
Layer key;
Layer value;

Layer softmax;
};
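
For reference, the Forward above computes standard scaled dot-product attention: scores = Q·K^T / sqrt(d_head), a softmax over the key dimension, then a weighted sum of the values. The single-head sketch below on plain arrays is illustrative only (the function single_head_attention is hypothetical, not mllm API); it spells out the arithmetic that Tensor::mm and Softmax are expected to perform per head.

// Illustrative only: scaled dot-product attention for one head on plain arrays.
#include <algorithm>
#include <cmath>
#include <vector>

using Mat = std::vector<std::vector<float>>; // [seq_len][d_head]

static Mat single_head_attention(const Mat &Q, const Mat &K, const Mat &V) {
    const size_t n = Q.size(), d = Q[0].size();
    Mat out(n, std::vector<float>(d, 0.f));
    for (size_t i = 0; i < n; ++i) {
        // raw scores for query i against every key, scaled by sqrt(d_head)
        std::vector<float> scores(n);
        for (size_t j = 0; j < n; ++j) {
            float dot = 0.f;
            for (size_t k = 0; k < d; ++k) dot += Q[i][k] * K[j][k];
            scores[j] = dot / std::sqrt(static_cast<float>(d));
        }
        // softmax over the key dimension
        float mx = *std::max_element(scores.begin(), scores.end()), sum = 0.f;
        for (float &s : scores) { s = std::exp(s - mx); sum += s; }
        for (float &s : scores) s /= sum;
        // weighted sum of the value vectors
        for (size_t j = 0; j < n; ++j)
            for (size_t k = 0; k < d; ++k) out[i][k] += scores[j] * V[j][k];
    }
    return out;
}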

// Placeholder: the full attention block (output projection, residual connection
// and LayerNorm around BertSelfAttention) is not implemented yet in this commit.
class BertAttention : public Module {
public:
    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        return {};
    }

private:
};

class BertModel : public Module {
public:
BertModel(BertConfig &config){
embeddings = BertEmbeddings(config.vocab_size, config.hidden_size, config.type_vocab_size, config.max_position_embeddings,
config.layer_norm_eps, config.names_config);
self_attention = BertSelfAttention(config, "encoder.layer.0.attention.self.");
}


std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto emb = embeddings(inputs, args)[0];
// if (Tensor::tensor_status == TENSOR_STATIC_READY) {
// std::cout << "emb type: " << emb.ctype() << std::endl;
// emb.printData<float>();
// }
auto attn = self_attention({emb, emb, emb});
return {attn[0]};
}

private:
BertEmbeddings embeddings;
BertSelfAttention self_attention;
};

#endif //! MODELING_BERT_HPP
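
Since the commit message marks modeling_bert as unfinished, the sketch below illustrates, under the standard Hugging Face BERT layout, what a complete encoder layer would add around BertSelfAttention: an attention output projection with residual connection and LayerNorm, followed by the intermediate/output feed-forward block. The class name BertLayerSketch, the sub-layer names, and the omitted activation are assumptions for illustration only, not this repository's final design; only constructors already shown in this commit (Linear, LayerNorm) are reused.

// Hedged sketch of a full encoder layer in the Hugging Face BERT layout.
class BertLayerSketch : public Module {
public:
    BertLayerSketch() = default;
    BertLayerSketch(BertConfig &config, const std::string &base_name) {
        self_attention = BertSelfAttention(config, base_name + "attention.self.");
        attn_out = Linear(config.hidden_size, config.hidden_size, true, base_name + "attention.output.dense");
        attn_norm = LayerNorm(config.hidden_size, true, config.layer_norm_eps, base_name + "attention.output.LayerNorm");
        ffn_up = Linear(config.hidden_size, config.intermediate_size, true, base_name + "intermediate.dense");
        ffn_down = Linear(config.intermediate_size, config.hidden_size, true, base_name + "output.dense");
        ffn_norm = LayerNorm(config.hidden_size, true, config.layer_norm_eps, base_name + "output.LayerNorm");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto hidden = inputs[0];
        auto attn = self_attention({hidden, hidden, hidden})[0];
        hidden = attn_norm(attn_out(attn) + hidden);              // residual + LayerNorm
        auto ffn = ffn_down(/* GELU( */ ffn_up(hidden) /* ) */);  // activation layer omitted: its mllm API is not shown in this commit
        hidden = ffn_norm(ffn + hidden);                          // residual + LayerNorm
        return {hidden};
    }

private:
    BertSelfAttention self_attention;
    Layer attn_out, attn_norm, ffn_up, ffn_down, ffn_norm;
};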
39 changes: 39 additions & 0 deletions src/models/bert/tokenization_bert.hpp
@@ -0,0 +1,39 @@
#ifndef TOKENIZATION_BERT_HPP
#define TOKENIZATION_BERT_HPP

#include "tokenizers/BPE/Bpe.hpp"
#include "tokenizers/Tokenizer.hpp"
#include "tokenizers/Unicode.hpp"
#include "tokenizers/WordPiece/WordPiece.hpp"
#include <algorithm>
#include <unordered_map>

// unicode
#include <codecvt>

using namespace mllm;


class BertTokenizer final : public WordPieceTokenizer {
public:
explicit BertTokenizer(const std::string &vocab_file, bool bos = true) :
WordPieceTokenizer(vocab_file) {
Module::initBackend(MLLM_CPU);
}
std::tuple<Tensor, Tensor, Tensor> process(std::string &text){
auto tokens_id = vector<token_id_t>();
WordPieceTokenizer::tokenize(text, tokens_id, false);
auto tokens_type = vector<token_id_t>(tokens_id.size(), 0);
auto position_ids = vector<token_id_t>(tokens_id.size());
for (size_t i = 0; i < tokens_id.size(); i++) {
position_ids[i] = i;
}
return {
tokens2Input(tokens_id, "input_tokens"),
tokens2Input(tokens_type, "input_tokens_type"),
tokens2Input(position_ids, "input_position_ids")
};
}
};

#endif //! TOKENIZATION_BERT_HPP
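
BertTokenizer delegates the actual tokenization to the newly added WordPieceTokenizer (the WordPiece sources are among the changed files not expanded above). As a reference for how such a tokenizer typically splits a single whitespace-separated word, here is an illustrative greedy longest-match-first lookup; the helper wordpiece_word is hypothetical and not this repository's implementation, and the ## continuation prefix and [UNK] fallback follow the original BERT convention.

// Illustrative only: greedy longest-match-first WordPiece lookup for one word.
#include <string>
#include <unordered_map>
#include <vector>

static std::vector<std::string> wordpiece_word(const std::string &word,
                                               const std::unordered_map<std::string, int> &vocab) {
    std::vector<std::string> pieces;
    size_t start = 0;
    while (start < word.size()) {
        size_t end = word.size();
        std::string cur;
        bool found = false;
        while (start < end) {                 // shrink the span until it is in the vocab
            cur = word.substr(start, end - start);
            if (start > 0) cur = "##" + cur;  // continuation pieces carry the ## prefix
            if (vocab.count(cur)) { found = true; break; }
            --end;
        }
        if (!found) return {"[UNK]"};         // whole word maps to [UNK] if any span fails
        pieces.push_back(cur);
        start = end;
    }
    return pieces;
}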
