feat: add simple WordPiece tokenizer; add modeling_bert (unfinished)
Showing 9 changed files with 527 additions and 0 deletions.
@@ -0,0 +1,23 @@

```cpp
//
// Created by xwk on 24-10-23.
//
#include "models/bert/configuration_bert.hpp"
#include "models/bert/modeling_bert.hpp"
#include "models/bert/tokenization_bert.hpp"

string vocab_file = "vocab/all-MiniLM-L6-v2.mllm";
string model_file = "models/gte.mllm";

int main(int argc, char *argv[]) {
    // tokenize the input text into token ids, segment ids, and positions
    BertTokenizer tokenizer(vocab_file, false);
    string text = "Hello, my dog is cute.";
    auto [token_ids, type_ids, position_ids] = tokenizer.process(text);
    token_ids.printData<float>();

    // build the (still partial) BERT model and load the converted weights
    auto config = BertConfig();
    auto model = BertModel(config);
    model.load(model_file);

    // run the forward pass and dump the first output tensor
    auto res = model({token_ids, type_ids, position_ids});
    res[0].printData<float>();
}
```
@@ -0,0 +1 @@

```markdown
# Tongyi Qianwen (Qwen)
```
@@ -0,0 +1,94 @@

```cpp
#ifndef CONFIG_BERT_HPP
#define CONFIG_BERT_HPP
#include "Types.hpp"
#include "models/transformer/configuration_transformer.hpp"
#include <cctype>
#include <iterator>

using namespace mllm;

class BertNameConfig : public TransformerNameConfig {
public:
    /**
     * @brief BERT weight names, following the Hugging Face naming convention
     */
    void init() {
        embedding_base_name = "embeddings.";

        blk_name = "model.layers.";
        _attn_base_name = "self.";
        _ffn_base_name = "mlp.";
        _q_proj_name = "query";
        _k_proj_name = "key";
        _v_proj_name = "value";
        _o_proj_name = "o_proj";
        _gate_proj_name = "gate_proj";
        _up_proj_name = "up_proj";
        _down_proj_name = "down_proj";
        _attn_norm_name = "input_layernorm";
        _ffn_norm_name = "post_attention_layernorm";
        token_embd_name = "model.embed_tokens";
        post_norm_name = "model.norm";
        lm_head_name = "lm_head";
    }
    std::string embedding_base_name;

    std::string blk_name;
    std::string token_embd_name;
    std::string post_norm_name;
    std::string lm_head_name;
    std::string _gate_proj_name;
};

struct BertConfig : public TransformerConfig {
    explicit BertConfig() {
        attention_dropout = 0.0;
        bos_token_id = 151643;
        eos_token_id = 151645;
        hidden_act = "gelu"; // was a local `std::string` declaration that shadowed the member, leaving it "silu"
        hidden_size = 384;
        initializer_range = 0.02;
        intermediate_size = 1536;
        max_position_embeddings = 512;
        max_window_layers = 21;
        model_type = "bert";
        num_attention_heads = 12;
        num_hidden_layers = 12;
        num_key_value_heads = 16;
        rms_norm_eps = 1e-6;
        rope_theta = 1000000.0;
        sliding_window = 32768;
        vocab_size = 30522;
        tie_embedding_words = true;

        names_config.init();
    }

    int type_vocab_size = 2;
    float layer_norm_eps = 1e-12;

    // The defaults below appear to be carried over from the Qwen2 config;
    // the constructor overrides the BERT-relevant ones.
    float attention_dropout = 0.0;
    int bos_token_id = 151643;
    int eos_token_id = 151643;
    std::string hidden_act = "silu";
    int hidden_size = 1024;
    float initializer_range = 0.02;
    int intermediate_size = 2816;
    int max_position_embeddings = 32768;
    int max_window_layers = 21;
    std::string model_type = "bert";
    int num_attention_heads = 12;
    int num_hidden_layers = 12;
    int num_key_value_heads = 12;
    double rms_norm_eps = 1e-6;
    float rope_theta = 1000000.0;
    int sliding_window = 32768;
    int vocab_size = 151936;
    bool tie_embedding_words = false;

    BertNameConfig names_config;
};

#endif //! CONFIG_BERT_HPP
```
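The constructor's values (hidden_size 384, intermediate_size 1536, vocab_size 30522, 512 positions) line up with the all-MiniLM-L6-v2 checkpoint the demo loads, except that num_hidden_layers stays at 12 while the L6 model has 6 layers. Since the fields are public, a hypothetical caller could patch them before constructing the model; a minimal sketch:

```cpp
#include "models/bert/configuration_bert.hpp"

// Sketch only: adjust the public fields before building BertModel.
// The 6-layer count is an assumption about all-MiniLM-L6-v2, not
// something this commit sets; the constructor leaves it at 12.
BertConfig makeMiniLmConfig() {
    BertConfig config;
    config.num_hidden_layers = 6; // all-MiniLM-L6-v2 is a 6-layer encoder
    return config;
}
```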
@@ -0,0 +1,126 @@

```cpp
#ifndef MODELING_BERT_HPP
#define MODELING_BERT_HPP

#include "Backend.hpp"
#include "Layer.hpp"
#include "Module.hpp"
#include "Tensor.hpp"
#include "configuration_bert.hpp"
#include <cmath>
using namespace mllm;

class BertEmbeddings : public Module {
public:
    BertEmbeddings() = default;
    BertEmbeddings(int vocab_size, int hidden_size, int type_size, int max_position_embeddings, float eps, BertNameConfig &config) {
        word_embeddings = Embedding(vocab_size, hidden_size, config.embedding_base_name + "word_embeddings");
        token_type_embeddings = Embedding(type_size, hidden_size, config.embedding_base_name + "token_type_embeddings");
        position_embeddings = Embedding(max_position_embeddings, hidden_size, config.embedding_base_name + "position_embeddings");
        layer_norm = LayerNorm(hidden_size, true, eps, config.embedding_base_name + "LayerNorm");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        // inputs: {token_ids, token_type_ids, position_ids}
        auto inputs_embeds = word_embeddings(inputs[0]);
        auto type_embeds = token_type_embeddings(inputs[1]);
        auto position_embeds = position_embeddings(inputs[2]);
        auto embeddings = inputs_embeds + type_embeds + position_embeds;
        return {layer_norm(embeddings)};
    }

private:
    Layer word_embeddings;
    Layer token_type_embeddings;
    Layer position_embeddings;
    Layer layer_norm;
};
```
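BertEmbeddings reproduces the standard BERT input encoding: three lookup tables summed per position, then LayerNorm. For position $i$ with token id $t_i$ and segment id $s_i$:

$$e_i = \mathrm{LayerNorm}\big(W_{\text{word}}[t_i] + W_{\text{type}}[s_i] + W_{\text{pos}}[i]\big)$$

The Hugging Face version also applies dropout after the LayerNorm, which only matters for training.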
```cpp
class BertSelfAttention : public Module {
public:
    BertSelfAttention() = default;
    BertSelfAttention(BertConfig &config, const string &base_name) {
        num_attention_heads = config.num_attention_heads;
        attention_head_size = config.hidden_size / num_attention_heads;
        all_head_size = num_attention_heads * attention_head_size;

        query = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._q_proj_name);
        key = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._k_proj_name);
        value = Linear(config.hidden_size, all_head_size, true, base_name + config.names_config._v_proj_name);

        softmax = Softmax(DIMENSION, base_name + "softmax");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto query_states = query(inputs[0]);
        auto key_states = key(inputs[1]);
        auto value_states = value(inputs[2]);

        // split the hidden dimension into attention heads
        query_states = query_states.view(-1, num_attention_heads, -1, attention_head_size);
        key_states = key_states.view(-1, num_attention_heads, -1, attention_head_size);
        value_states = value_states.view(-1, num_attention_heads, -1, attention_head_size);

        // scaled dot-product attention
        auto attn_weight =
            Tensor::mm(query_states, key_states.transpose(Chl::SEQUENCE, Chl::DIMENSION)) / std::sqrt(attention_head_size);
        auto attn_score = softmax(attn_weight);
        auto attn_output = Tensor::mm(attn_score, value_states);
        // merge the heads back into a single hidden dimension
        attn_output = attn_output.view(-1, 1, -1, num_attention_heads * attention_head_size);
        return {attn_output};
    }

private:
    int num_attention_heads;
    int attention_head_size;
    int all_head_size;

    Layer query;
    Layer key;
    Layer value;

    Layer softmax;
};
```
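Forward is plain multi-head scaled dot-product attention: the view calls split the 384-wide hidden state into 12 heads of size 32 (with this commit's config), and each head computes

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V, \qquad d_k = \texttt{attention\_head\_size} = 32.$$

Note that no attention mask is applied yet, so a padded batch would attend to its padding tokens.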
```cpp
// TODO: unfinished stub (per the commit message); it should wrap
// BertSelfAttention with the output projection, residual add, and LayerNorm.
class BertAttention : public Module {
public:
    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        return {};
    }

private:
};
```
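For orientation, a hedged sketch of what the finished class might look like if it follows Hugging Face's BertAttention layout (self-attention, then a dense output projection with a residual connection and LayerNorm). The `output.dense` / `output.LayerNorm` weight names are assumptions borrowed from the Hugging Face checkpoint format, not names this commit loads:

```cpp
// Hypothetical completion, not part of this commit: mirrors Hugging Face's
// BertAttention (self-attention, then dense + residual + LayerNorm).
class BertAttentionSketch : public Module {
public:
    BertAttentionSketch() = default;
    BertAttentionSketch(BertConfig &config, const string &base_name) {
        self_attention = BertSelfAttention(config, base_name + "self.");
        // "output.dense" / "output.LayerNorm" are assumed weight names
        dense = Linear(config.hidden_size, config.hidden_size, true, base_name + "output.dense");
        layer_norm = LayerNorm(config.hidden_size, true, config.layer_norm_eps, base_name + "output.LayerNorm");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto hidden_states = inputs[0];
        auto attn = self_attention({hidden_states, hidden_states, hidden_states})[0];
        // residual connection around the attention block, then LayerNorm
        return {layer_norm(dense(attn) + hidden_states)};
    }

private:
    BertSelfAttention self_attention;
    Layer dense;
    Layer layer_norm;
};
```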
```cpp
class BertModel : public Module {
public:
    BertModel(BertConfig &config) {
        embeddings = BertEmbeddings(config.vocab_size, config.hidden_size, config.type_vocab_size,
                                    config.max_position_embeddings, config.layer_norm_eps, config.names_config);
        // only the first encoder layer's self-attention is wired up so far
        self_attention = BertSelfAttention(config, "encoder.layer.0.attention.self.");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto emb = embeddings(inputs, args)[0];
        auto attn = self_attention({emb, emb, emb});
        return {attn[0]};
    }

private:
    BertEmbeddings embeddings;
    BertSelfAttention self_attention;
};

#endif //! MODELING_BERT_HPP
```
@@ -0,0 +1,39 @@

```cpp
#ifndef TOKENIZATION_BERT_HPP
#define TOKENIZATION_BERT_HPP

#include "tokenizers/BPE/Bpe.hpp"
#include "tokenizers/Tokenizer.hpp"
#include "tokenizers/Unicode.hpp"
#include "tokenizers/WordPiece/WordPiece.hpp"
#include <algorithm>
#include <unordered_map>

// unicode
#include <codecvt>

using namespace mllm;

class BertTokenizer final : public WordPieceTokenizer {
public:
    explicit BertTokenizer(const std::string &vocab_file, bool bos = true) :
        WordPieceTokenizer(vocab_file) {
        // `bos` is accepted for API symmetry but not used yet
        Module::initBackend(MLLM_CPU);
    }
    // Returns {token_ids, token_type_ids, position_ids} as model-ready tensors.
    std::tuple<Tensor, Tensor, Tensor> process(std::string &text) {
        auto tokens_id = vector<token_id_t>();
        WordPieceTokenizer::tokenize(text, tokens_id, false);
        // single-segment input: every token gets type id 0
        auto tokens_type = vector<token_id_t>(tokens_id.size(), 0);
        auto position_ids = vector<token_id_t>(tokens_id.size());
        for (size_t i = 0; i < tokens_id.size(); i++) {
            position_ids[i] = i;
        }
        return {
            tokens2Input(tokens_id, "input_tokens"),
            tokens2Input(tokens_type, "input_tokens_type"),
            tokens2Input(position_ids, "input_position_ids")
        };
    }
};

#endif //! TOKENIZATION_BERT_HPP
```
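The WordPieceTokenizer base class used above implements greedy longest-match-first subword splitting. As a self-contained illustration of the algorithm, independent of the mllm classes and using a toy vocabulary:

```cpp
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

// Greedy longest-match-first WordPiece split of a single word.
// Continuation pieces carry the conventional "##" prefix.
std::vector<std::string> wordpiece(const std::string &word,
                                   const std::unordered_set<std::string> &vocab) {
    std::vector<std::string> pieces;
    size_t start = 0;
    while (start < word.size()) {
        size_t end = word.size();
        std::string piece;
        bool found = false;
        // shrink the candidate from the right until it is in the vocabulary
        while (start < end) {
            std::string sub = word.substr(start, end - start);
            if (start > 0) sub = "##" + sub;
            if (vocab.count(sub)) { piece = sub; found = true; break; }
            --end;
        }
        if (!found) return {"[UNK]"}; // no piece matches: the whole word is unknown
        pieces.push_back(piece);
        start = end;
    }
    return pieces;
}

int main() {
    std::unordered_set<std::string> vocab = {"un", "##aff", "##able", "##ed"};
    for (const auto &p : wordpiece("unaffable", vocab)) std::cout << p << ' ';
    // prints: un ##aff ##able
}
```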