feat: add simple WordPiece tokenizer; add modeling_bert (unfinished)
XieWeikai committed Oct 27, 2024
1 parent 9bda904 commit b8e230e
Showing 9 changed files with 527 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/CMakeLists.txt
@@ -24,6 +24,7 @@ macro(func_llm_add_executable target)
${DIR_SRC}
${PROJECT_SOURCE_DIR}/src/tokenizers/Tokenizer.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/BPE/Bpe.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/WordPiece/WordPiece.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/Unicode.cpp
${PROJECT_SOURCE_DIR}/src/tokenizers/UnicodeData.cpp
${PROJECT_SOURCE_DIR}/src/processor/PreProcess.cpp
@@ -70,6 +71,7 @@ func_llm_add_executable(demo_minicpm)
func_llm_add_executable(demo_smollm)
func_llm_add_executable(demo_openelm)
func_llm_add_executable(demo_dclm)
func_llm_add_executable(demo_bert)
# func_llm_add_executable(demo_phonellm)

func_vlm_add_executable(demo_llava)
23 changes: 23 additions & 0 deletions examples/demo_bert.cpp
@@ -0,0 +1,23 @@
//
// Created by xwk on 24-10-23.
//
#include "models/bert/configuration_bert.hpp"
#include "models/bert/modeling_bert.hpp"
#include "models/bert/tokenization_bert.hpp"

string vocab_file = "vocab/all-MiniLM-L6-v2.mllm";
string model_file = "models/gte.mllm";

int main(int argc, char *argv[]){
BertTokenizer tokenizer(vocab_file, false);
string text = "Hello, my dog is cute.";
auto [token_ids, type_ids, position_ids] = tokenizer.process(text);
token_ids.printData<float>();

auto config = BertConfig();
auto model = BertModel(config);
model.load(model_file);

auto res = model({token_ids, type_ids, position_ids});
res[0].printData<float>();
}
1 change: 1 addition & 0 deletions examples/demo_qwen.cpp
@@ -46,6 +46,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto input_tensor = tokenizer.tokenize(input_str);
input_tensor.printData<float>();
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

1 change: 1 addition & 0 deletions src/models/bert/README.md
@@ -0,0 +1 @@
# Tongyi Qianwen (Qwen)
94 changes: 94 additions & 0 deletions src/models/bert/configuration_bert.hpp
@@ -0,0 +1,94 @@
#ifndef CONFIG_BERT_HPP
#define CONFIG_BERT_HPP
#include "Types.hpp"
#include "models/transformer/configuration_transformer.hpp"
#include <cctype>
#include <iterator>

using namespace mllm;

class BertNameConfig : public TransformerNameConfig {
public:
    /**
     * @brief BERT tensor names following the Hugging Face naming scheme
     */
void init() {
embedding_base_name = "embeddings.";

blk_name = "model.layers.";
_attn_base_name = "self.";
_ffn_base_name = "mlp.";
_q_proj_name = "query";
_k_proj_name = "key";
_v_proj_name = "value";
_o_proj_name = "o_proj";
_gate_proj_name = "gate_proj";
_up_proj_name = "up_proj";
_down_proj_name = "down_proj";
_attn_norm_name = "input_layernorm";
_ffn_norm_name = "post_attention_layernorm";
token_embd_name = "model.embed_tokens";
post_norm_name = "model.norm";
lm_head_name = "lm_head";
}
std::string embedding_base_name;

std::string blk_name;
std::string token_embd_name;
std::string post_norm_name;
std::string lm_head_name;
std::string _gate_proj_name;
};

struct BertConfig : public TransformerConfig {
explicit BertConfig(){
attention_dropout = 0.0;
bos_token_id = 151643;
eos_token_id = 151645;
hidden_act = "gelu";
hidden_size = 384;
initializer_range = 0.02;
intermediate_size = 1536;
max_position_embeddings = 512;
max_window_layers = 21;
model_type = "bert";
num_attention_heads = 12;
num_hidden_layers = 12;
num_key_value_heads = 16;
rms_norm_eps = 1e-6;
rope_theta = 1000000.0;
sliding_window = 32768;
vocab_size = 30522;
tie_embedding_words = true;

names_config.init();
};

int type_vocab_size = 2;
float layer_norm_eps = 1e-12;

float attention_dropout = 0.0;
int bos_token_id = 151643;
int eos_token_id = 151643;
std::string hidden_act = "silu";
int hidden_size = 1024;
float initializer_range = 0.02;
int intermediate_size = 2816;
int max_position_embeddings = 32768;
int max_window_layers = 21;
std::string model_type = "bert";
int num_attention_heads = 12;
int num_hidden_layers = 12;
int num_key_value_heads = 12;
double rms_norm_eps = 1e-6;
float rope_theta = 1000000.0;
int sliding_window = 32768;
int vocab_size = 151936;
bool tie_embedding_words = false;

BertNameConfig names_config;
};

#endif //! CONFIG_BERT_HPP
126 changes: 126 additions & 0 deletions src/models/bert/modeling_bert.hpp
@@ -0,0 +1,126 @@
#ifndef MODELING_BERT_HPP
#define MODELING_BERT_HPP

#include "Backend.hpp"
#include "Layer.hpp"
#include "Module.hpp"
#include "Tensor.hpp"
#include "configuration_bert.hpp"
#include <cmath>
using namespace mllm;

class BertEmbeddings : public Module {
public:
BertEmbeddings() = default;
    BertEmbeddings(int vocab_size, int hidden_size, int type_size, int max_position_embeddings, float eps, BertNameConfig &config) {
        word_embeddings = Embedding(vocab_size, hidden_size, config.embedding_base_name+"word_embeddings");
token_type_embeddings = Embedding(type_size, hidden_size, config.embedding_base_name+"token_type_embeddings");
position_embeddings = Embedding(max_position_embeddings, hidden_size, config.embedding_base_name+"position_embeddings");
layer_norm = LayerNorm(hidden_size, true, eps, config.embedding_base_name+"LayerNorm");
}

std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto inputs_embeds = word_embeddings(inputs[0]);
// if (Tensor::tensor_status == TENSOR_STATIC_READY)
// inputs_embeds.printData<float>();
auto type_embeds = token_type_embeddings(inputs[1]);
auto position_embeds = position_embeddings(inputs[2]);
auto embeddings = inputs_embeds + type_embeds + position_embeds;
return {layer_norm(embeddings)};
}

private:
Layer word_embeddings;
Layer token_type_embeddings;
Layer position_embeddings;
Layer layer_norm;
};
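
For reference, the embedding block follows the standard BERT recipe: word, token-type and position embeddings are summed element-wise and the result is layer-normalized over the hidden dimension. The snippet below is a minimal, illustrative sketch of the per-token normalization the LayerNorm layer is assumed to compute; the helper name layer_norm_ref and the plain-vector interface are hypothetical, not mllm API, and the eps default mirrors layer_norm_eps from BertConfig (1e-12 in this commit).

// Reference-only sketch (not part of mllm): per-token LayerNorm over the hidden dimension.
#include <cmath>
#include <vector>

static void layer_norm_ref(std::vector<float> &x,           // one token's hidden vector
                           const std::vector<float> &gamma, // learned scale
                           const std::vector<float> &beta,  // learned bias
                           float eps = 1e-12f) {
    float mean = 0.f, var = 0.f;
    for (float v : x) mean += v;
    mean /= x.size();
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    const float inv_std = 1.f / std::sqrt(var + eps);
    for (size_t i = 0; i < x.size(); ++i)
        x[i] = (x[i] - mean) * inv_std * gamma[i] + beta[i];
}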

class BertSelfAttention : public Module {
public:
BertSelfAttention() = default;
BertSelfAttention(BertConfig &config, const string &base_name) {
num_attention_heads = config.num_attention_heads;
attention_head_size = config.hidden_size / num_attention_heads;
all_head_size = num_attention_heads * attention_head_size;

query = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._q_proj_name);
key = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._k_proj_name);
value = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._v_proj_name);

softmax = Softmax(DIMENSION, base_name + "softmax");
}

std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
if (Tensor::tensor_status == TENSOR_STATIC_READY) {
std::cout << "emb type: " << inputs[0].ctype() << std::endl;
inputs[0].printData<float>();
}

auto key_states = key(inputs[0]);
auto query_states = query(inputs[1]);
auto value_states = value(inputs[2]);

// auto key_len = key_states.sequence();

query_states = query_states.view(-1, num_attention_heads, -1, attention_head_size);
key_states = key_states.view(-1, num_attention_heads, -1, attention_head_size);
value_states = value_states.view(-1, num_attention_heads, -1, attention_head_size);

auto attn_weight =
Tensor::mm(query_states, key_states.transpose(Chl::SEQUENCE, Chl::DIMENSION)) / std::sqrt(attention_head_size);
auto attn_score = softmax(attn_weight);
auto attn_output = Tensor::mm(attn_score, value_states);
attn_output = attn_output.view(-1,1, -1, num_attention_heads * attention_head_size);
return {attn_output};
}

private:
int num_attention_heads;
int attention_head_size;
int all_head_size;

Layer query;
Layer key;
Layer value;

Layer softmax;
};
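
For reference, the Forward above computes standard scaled dot-product attention: scores = Q·K^T / sqrt(d_head), a softmax over the key dimension, then a weighted sum of the values. The single-head sketch below on plain arrays is illustrative only (the function single_head_attention is hypothetical, not mllm API); it spells out the arithmetic that Tensor::mm and Softmax are expected to perform per head.

// Illustrative only: scaled dot-product attention for one head on plain arrays.
#include <algorithm>
#include <cmath>
#include <vector>

using Mat = std::vector<std::vector<float>>; // [seq_len][d_head]

static Mat single_head_attention(const Mat &Q, const Mat &K, const Mat &V) {
    const size_t n = Q.size(), d = Q[0].size();
    Mat out(n, std::vector<float>(d, 0.f));
    for (size_t i = 0; i < n; ++i) {
        // raw scores for query i against every key, scaled by sqrt(d_head)
        std::vector<float> scores(n);
        for (size_t j = 0; j < n; ++j) {
            float dot = 0.f;
            for (size_t k = 0; k < d; ++k) dot += Q[i][k] * K[j][k];
            scores[j] = dot / std::sqrt(static_cast<float>(d));
        }
        // softmax over the key dimension
        float mx = *std::max_element(scores.begin(), scores.end()), sum = 0.f;
        for (float &s : scores) { s = std::exp(s - mx); sum += s; }
        for (float &s : scores) s /= sum;
        // weighted sum of the value vectors
        for (size_t j = 0; j < n; ++j)
            for (size_t k = 0; k < d; ++k) out[i][k] += scores[j] * V[j][k];
    }
    return out;
}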

// Placeholder: the full attention block (output projection, residual connection
// and LayerNorm around BertSelfAttention) is not implemented yet in this commit.
class BertAttention : public Module {
public:
    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        return {};
    }

private:
};

class BertModel : public Module {
public:
BertModel(BertConfig &config){
embeddings = BertEmbeddings(config.vocab_size, config.hidden_size, config.type_vocab_size, config.max_position_embeddings,
config.layer_norm_eps, config.names_config);
self_attention = BertSelfAttention(config, "encoder.layer.0.attention.self.");
}


std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
auto emb = embeddings(inputs, args)[0];
// if (Tensor::tensor_status == TENSOR_STATIC_READY) {
// std::cout << "emb type: " << emb.ctype() << std::endl;
// emb.printData<float>();
// }
auto attn = self_attention({emb, emb, emb});
return {attn[0]};
}

private:
BertEmbeddings embeddings;
BertSelfAttention self_attention;
};

#endif //! MODELING_BERT_HPP
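
Since the commit message marks modeling_bert as unfinished, the sketch below illustrates, under the standard Hugging Face BERT layout, what a complete encoder layer would add around BertSelfAttention: an attention output projection with residual connection and LayerNorm, followed by the intermediate/output feed-forward block. The class name BertLayerSketch, the sub-layer names, and the omitted activation are assumptions for illustration only, not this repository's final design; only constructors already shown in this commit (Linear, LayerNorm) are reused.

// Hedged sketch of a full encoder layer in the Hugging Face BERT layout.
class BertLayerSketch : public Module {
public:
    BertLayerSketch() = default;
    BertLayerSketch(BertConfig &config, const std::string &base_name) {
        self_attention = BertSelfAttention(config, base_name + "attention.self.");
        attn_out = Linear(config.hidden_size, config.hidden_size, true, base_name + "attention.output.dense");
        attn_norm = LayerNorm(config.hidden_size, true, config.layer_norm_eps, base_name + "attention.output.LayerNorm");
        ffn_up = Linear(config.hidden_size, config.intermediate_size, true, base_name + "intermediate.dense");
        ffn_down = Linear(config.intermediate_size, config.hidden_size, true, base_name + "output.dense");
        ffn_norm = LayerNorm(config.hidden_size, true, config.layer_norm_eps, base_name + "output.LayerNorm");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto hidden = inputs[0];
        auto attn = self_attention({hidden, hidden, hidden})[0];
        hidden = attn_norm(attn_out(attn) + hidden);              // residual + LayerNorm
        auto ffn = ffn_down(/* GELU( */ ffn_up(hidden) /* ) */);  // activation layer omitted: its mllm API is not shown in this commit
        hidden = ffn_norm(ffn + hidden);                          // residual + LayerNorm
        return {hidden};
    }

private:
    BertSelfAttention self_attention;
    Layer attn_out, attn_norm, ffn_up, ffn_down, ffn_norm;
};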
39 changes: 39 additions & 0 deletions src/models/bert/tokenization_bert.hpp
@@ -0,0 +1,39 @@
#ifndef TOKENIZATION_BERT_HPP
#define TOKENIZATION_BERT_HPP

#include "tokenizers/BPE/Bpe.hpp"
#include "tokenizers/Tokenizer.hpp"
#include "tokenizers/Unicode.hpp"
#include "tokenizers/WordPiece/WordPiece.hpp"
#include <algorithm>
#include <unordered_map>

// unicode
#include <codecvt>

using namespace mllm;


class BertTokenizer final : public WordPieceTokenizer {
public:
explicit BertTokenizer(const std::string &vocab_file, bool bos = true) :
WordPieceTokenizer(vocab_file) {
Module::initBackend(MLLM_CPU);
}
std::tuple<Tensor, Tensor, Tensor> process(std::string &text){
auto tokens_id = vector<token_id_t>();
WordPieceTokenizer::tokenize(text, tokens_id, false);
auto tokens_type = vector<token_id_t>(tokens_id.size(), 0);
auto position_ids = vector<token_id_t>(tokens_id.size());
for (size_t i = 0; i < tokens_id.size(); i++) {
position_ids[i] = i;
}
return {
tokens2Input(tokens_id, "input_tokens"),
tokens2Input(tokens_type, "input_tokens_type"),
tokens2Input(position_ids, "input_position_ids")
};
}
};

#endif //! TOKENIZATION_BERT_HPP
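
BertTokenizer delegates the actual tokenization to the newly added WordPieceTokenizer (the WordPiece sources are among the changed files not expanded above). As a reference for how such a tokenizer typically splits a single whitespace-separated word, here is an illustrative greedy longest-match-first lookup; the helper wordpiece_word is hypothetical and not this repository's implementation, and the ## continuation prefix and [UNK] fallback follow the original BERT convention.

// Illustrative only: greedy longest-match-first WordPiece lookup for one word.
#include <string>
#include <unordered_map>
#include <vector>

static std::vector<std::string> wordpiece_word(const std::string &word,
                                               const std::unordered_map<std::string, int> &vocab) {
    std::vector<std::string> pieces;
    size_t start = 0;
    while (start < word.size()) {
        size_t end = word.size();
        std::string cur;
        bool found = false;
        while (start < end) {                 // shrink the span until it is in the vocab
            cur = word.substr(start, end - start);
            if (start > 0) cur = "##" + cur;  // continuation pieces carry the ## prefix
            if (vocab.count(cur)) { found = true; break; }
            --end;
        }
        if (!found) return {"[UNK]"};         // whole word maps to [UNK] if any span fails
        pieces.push_back(cur);
        start = end;
    }
    return pieces;
}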
