Commit

merge

XieWeikai committed Oct 27, 2024
2 parents b8e230e + 64dd7b6 commit 4f7b03d
Showing 73 changed files with 53,300 additions and 1,870 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -34,8 +34,6 @@ mllm.egg-info/

examples/demo_deepseek.cpp
src/models/deepseek/*
examples/demo_phonellm.cpp
src/models/phonellm/*
examples/demo_minicpm3.cpp
src/models/minicpm3/*
examples/demo.cpp
10 changes: 8 additions & 2 deletions CMakeLists.txt
@@ -68,6 +68,13 @@ endif()

# backend options
option(QNN "Enable QNN" OFF)
option(QNN_OLD_FRONTEND "Enable Old QNN" OFF)
if(QNN)
add_definitions(-DUSE_QNN) # the USE_QNN should come before cpu subdirectory
endif()
if(QNN_OLD_FRONTEND)
add_definitions(-DOLD_QNN)
endif()

if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
@@ -116,8 +123,7 @@ include_directories(${PROJECT_SOURCE_DIR}/third_party/pybind11/include)

add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/cpu)

if(QNN)
add_definitions(-DUSE_QNN)
if(QNN) # QNN lib
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn)
endif()

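With this change, QNN support is toggled entirely at configure time. A minimal configure sketch under assumed defaults (build directory name and generator are not part of this commit, and the Android toolchain arguments used by scripts/build_qnn_android.sh are omitted):

mkdir -p build && cd build
# -DQNN=ON defines USE_QNN before the cpu/qnn subdirectories are added;
# -DQNN_OLD_FRONTEND=ON additionally defines OLD_QNN for the old QNN frontend.
cmake .. -DQNN=ON -DQNN_OLD_FRONTEND=ON
make -j4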
5 changes: 3 additions & 2 deletions examples/CMakeLists.txt
@@ -54,7 +54,6 @@ endmacro()


## new demos

func_llm_add_executable(benchmark)
func_llm_add_executable(demo_llama)
func_llm_add_executable(demo_tinyllama)
@@ -72,7 +71,8 @@ func_llm_add_executable(demo_smollm)
func_llm_add_executable(demo_openelm)
func_llm_add_executable(demo_dclm)
func_llm_add_executable(demo_bert)
# func_llm_add_executable(demo_phonellm)
func_llm_add_executable(demo_phonelm)


func_vlm_add_executable(demo_llava)
func_vlm_add_executable(demo_fuyu)
@@ -84,6 +84,7 @@ func_vlm_add_executable(demo_imagebind_1mod)

# QNN demo
if(QNN)
func_llm_add_executable(demo_qnn)
func_llm_add_executable(main_qwen_npu)
endif()

60 changes: 60 additions & 0 deletions examples/demo_phonelm.cpp
@@ -0,0 +1,60 @@
#include <iostream>
#include <vector>
#include "Types.hpp"
#include "cmdline.h"
#include "models/phonelm/modeling_phonelm.hpp"
#include "models/smollm/tokenization_smollm.hpp"

using namespace mllm;

int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/phonelm_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/phonelm_merges.txt");
cmdParser.add<string>("model", 'o', "specify mllm model path", false, "../models/phonelm-1.5b-instruct-fp32.mllm");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 40);
cmdParser.parse_check(argc, argv);

string merge_path = cmdParser.get<string>("merge");
string vocab_path = cmdParser.get<string>("vocab");
string model_path = cmdParser.get<string>("model");
int tokens_limit = cmdParser.get<int>("limits");
CPUBackend::cpu_threads = cmdParser.get<int>("thread");

auto tokenizer = SmolLMTokenizer(vocab_path, merge_path);

string system_prompt_start;
string system_prompt_end;

PhoneLMConfig config(tokens_limit, "1.5B");
auto model = PhoneLMForCausalLM(config);
model.load(model_path);

vector<string> in_strs = {
"Hello, who are you?",
"What can you do?",
"Please introduce Beijing University of Posts and Telecommunications.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto input_tensor = tokenizer.tokenize(input_str);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;
LlmTextGeneratorOpts opt{
.max_new_tokens = 100,
.do_sample = false,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { return false; }
std::cout << output_string << std::flush;
return true;
});
model.clear_kvcache();
std::cout << "\n";
}
return 0;
}
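Assuming the demo is registered through func_llm_add_executable(demo_phonelm) above, a typical run might look like the sketch below; the binary location and the vocab, merge, and model paths simply echo the option defaults in this demo and are not verified against this commit:

./demo_phonelm -v ../vocab/phonelm_vocab.mllm \
               -e ../vocab/phonelm_merges.txt \
               -o ../models/phonelm-1.5b-instruct-fp32.mllm \
               -l 400 -t 4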
33 changes: 28 additions & 5 deletions examples/demo_qnn.cpp
@@ -1,6 +1,7 @@
#include "cmdline.h"
#include "models/qwen/configuration_qwen.hpp"
#include "models/qwen/modeling_qwen_npu.hpp"
#include "models/qwen/modeling_qwen.hpp"
#include "models/qwen/tokenization_qwen.hpp"

using namespace mllm;
@@ -9,7 +10,7 @@ int main(int argc, char **argv) {
cmdline::parser cmdParser;
cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm");
cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-chat-int8.mllm");
cmdParser.add<string>("billion", 'b', "[0.5B | 1.8B]", false, "1.8B");
cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
cmdParser.add<int>("thread", 't', "num of threads", false, 4);
@@ -24,25 +25,29 @@ int main(int argc, char **argv) {

auto tokenizer = QWenTokenizer(vocab_path, merge_path);
QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE);
auto model = QWenForCausalLM(config);
auto model = QWenForCausalLM_NPU(config);
model.load(model_path);
// auto decoding_model = QWenForCausalLM(config);
// decoding_model.load("../models/qwen-1.5-1.8b-chat-q4k.mllm");

vector<string> in_strs = {
" Give me a short introduction to large language model.",
};

for (int i = 0; i < in_strs.size(); ++i) {
auto input_str = tokenizer.apply_chat_template(in_strs[i]);
auto input_tensor = tokenizer.tokenize(input_str);
auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size);
std::cout << "[Q] " << in_strs[i] << std::endl;
std::cout << "[A] " << std::flush;

LlmTextGeneratorOpts opt{
.max_new_tokens = 100,
.do_sample = true,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = true,
.seq_before_padding = real_seq_length,
};
model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool {
auto out_string = tokenizer.detokenize({out_token});
@@ -51,6 +56,24 @@ int main(int argc, char **argv) {
std::cout << output_string << std::flush;
return true;
});
std::cout << "FINISH\n";

LlmTextGeneratorOpts decoding_opt{
.max_new_tokens = 100,
.do_sample = false,
.temperature = 0.3f,
.top_k = 50,
.top_p = 0.f,
.is_padding = false,
};
// decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool {
// auto out_string = tokenizer.detokenize({out_token});
// auto [isOk, print_string] = processOutput(out_string);
// if (isOk) {
// std::cout << print_string << std::flush;
// } else {
// return false;
// }
// return true;
// });
}
}
1 change: 1 addition & 0 deletions examples/demo_tinyllama.cpp
@@ -46,6 +46,7 @@ int main(int argc, char **argv) {
auto [out_string, out_token] = tokenizer.detokenize(result[0]);
auto [not_end, output_string] = tokenizer.postprocess(out_string);
if (!not_end) { break; }
std::cout << output_string << std::flush;
chatPostProcessing(out_token, input_tensor, {});
}
printf("\n");
13 changes: 4 additions & 9 deletions examples/main_qwen_npu.cpp
@@ -138,17 +138,12 @@ int main(int argc, char **argv) {
std::cout << "[Q] " << in_str << std::endl;
std::cout << "[A] " << std::flush;

// cpuExe.run(&cpuNet, {input});
// auto result = cpuExe.result();
// auto token_idx = postProcessing(result[0], input);

// auto out_token = tokenizer.detokenize({token_idx});
// std::cout << out_token << std::flush;
// exit(0);

do {
// 1: Prefill stage using NPU chunk execute
npuExe.run(npu_ctx, &npuNet, {input});
if (chunk == 1)
npuExe.run(npu_ctx, &npuNet, {input});
else
npuExe.runExp(npu_ctx, &npuNet, {input});
auto result = npuExe.result();

// inter model for prefill-decode
56 changes: 11 additions & 45 deletions examples/main_qwen_npu.hpp
@@ -10,11 +10,12 @@ namespace modeling {
NetTensor *Qwen_FFN_NPU(Context *c, NetTensor *i, int hidden_dim, int ffn_hidden_dim, string name) {
auto *x = _LinearINT8({i}, hidden_dim, ffn_hidden_dim, false, name + ".gate_proj");
auto *y = _LinearINT8({i}, hidden_dim, ffn_hidden_dim, false, name + ".up_proj");
x = _Dequantize({x}, true, (string)name + ".gate_proj.dequantize", true);
y = _Dequantize({y}, true, (string)name + ".up_proj.dequantize", true);
x = _SiLU({x}, name + ".silu");
x = *x * y;
x = _Quantize({x}, true, (string)name + ".down_proj.quantize");
x = _SuperSiLU({x,y}, name + ".supersilu");
// x = _Dequantize({x}, true, (string)name + ".gate_proj.dequantize", false);
// y = _Dequantize({y}, true, (string)name + ".up_proj.dequantize", false);
// x = _SiLU({x}, name + ".silu");
// x = *x * y;
// x = _Quantize({x}, true, (string)name + ".down_proj.quantize");
x = _LinearINT8({x}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj");
x = _Dequantize({x}, true, (string)name + ".down_proj.dequantize");
return x;
@@ -29,9 +30,9 @@ std::vector<NetTensor *> Qwen_CPUNPUAttention(Context *c, NetTensor *x, NetTenso
k = k->view(1, head_size, seq / chunk, hidden_size);
v = v->view(1, head_size, seq / chunk, hidden_size);

q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize");
k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize");
v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize");
q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize", true);
k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize", false);
v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize", false);

v = _Transpose({v}, {0, 2, 3, 1}, (string)name + ".v_proj.transpose");

@@ -153,41 +154,6 @@ NetTensor *Qwen_FFN_CPU_q4k(Context *c, NetTensor *i, int hidden_dim, int ffn_hi
return x;
}

void qwen_cpu(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hidden_dim = 11008, int mutil_head_size = 32, int cache_max = 200, int seq = 256, int chunk = 2) {
auto *i = _Input(c);
i = _Embedding({i}, vocab_size, hidden_dim, (string) "model.embed_tokens");

for (int layer = 0; layer < 24; ++layer) {
auto res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".input_layernorm");

i = *Qwen_CPUAttention(c, res, hidden_dim, hidden_dim / mutil_head_size, mutil_head_size, cache_max, (string) "model.layers." + std::to_string(layer) + ".self_attn", seq, chunk) + i;

res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm");

if (layer != 6 && layer != 1 && layer != 2) {
i = *Qwen_FFN_CPU(c, res, hidden_dim, ffn_hidden_dim, (string) "model.layers." + std::to_string(layer) + ".mlp") + i;
} else {
auto name = (string) "model.layers." + std::to_string(layer) + ".mlp";

auto *x = _LinearINT8({res}, hidden_dim, ffn_hidden_dim, false, name + ".gate_proj");
x = _SiLU({x}, name + ".silu");
auto *y = _LinearINT8({res}, hidden_dim, ffn_hidden_dim, false, name + ".up_proj");
x = *x * y; // x = _Mul( {x, y}, name+".dot");

auto *i1 = x;
x = _LinearINT8({x}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj");

auto *i2 = x;

i = *x + i;

i = _LinearINT8Shadow({i1, i2, i}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj.shadow");
}
}
i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.norm");
i = _Linear({i}, hidden_dim, vocab_size, false, "lm_head");
}

void qwen_cpu_q4k(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hidden_dim = 11008, int mutil_head_size = 32, int cache_max = 200, int seq = 256, int chunk = 2) {
auto *i = _Input(c);
i = _Embedding({i}, vocab_size, hidden_dim, (string) "model.embed_tokens");
@@ -242,9 +208,9 @@ void qwen_npu(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn

res = i;

i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm");
i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm", false);

i = _Quantize({i}, true, (string) "model.layers." + std::to_string(layer) + ".mlp.up_proj.quantize");
// i = _Quantize({i}, true, (string) "model.layers." + std::to_string(layer) + ".mlp.up_proj.quantize");

i = i->view(1, static_cast<int>(seq / chunk / 32), static_cast<int>(32), hidden_dim);

3 changes: 3 additions & 0 deletions include/OpDefined.hpp
@@ -61,6 +61,7 @@ enum OpType {
MERGEOUTPUT,
SPLITINPUT,
IROPE,
SUPERSILU,
OP_NUM
};

@@ -107,6 +108,7 @@ static const vector<string> OpNames = {
"Range",
"Where",
"Replace",
"Predictor",
"SparseLinear",
"SparseIdLinear",
"ElasticLinear",
Expand All @@ -117,6 +119,7 @@ static const vector<string> OpNames = {
"MergeOutput",
"SplitInput",
"IRoPE",
"SuperSiLU",
"OP_NUM"};

enum TensorFuncType {
4 changes: 2 additions & 2 deletions include/Types.hpp
@@ -93,9 +93,9 @@ inline std::map<std::vector<int>, ChlType> Chls2Type = {
{{0, 3, 4, 1, 2}, BWCTH}};

enum TensorType {
INPUT_TENSOR = 0,
INPUT_TENSOR = 0, // used for input of the model
NORMAL_TENSOR,
OUTPUT_TENSOR,
GRAPH_OUTPUT, // used for output of a graph
};

enum Chl {
3 changes: 2 additions & 1 deletion scripts/build_qnn_android.sh
@@ -12,6 +12,7 @@ cmake .. \
-DQNN=ON \
-DDEBUG=OFF \
-DTEST=OFF \
-DQUANT=OFF
-DQUANT=OFF \
-DQNN_OLD_FRONTEND=ON

make -j4
4 changes: 4 additions & 0 deletions src/Backend.cpp
@@ -3,6 +3,7 @@
#include <memory>
#include <unordered_map>
#include <mutex>
#include "Layer.hpp"

namespace mllm {
extern void registerCPUBackendCreator();
@@ -29,6 +30,9 @@ static std::unordered_map<BackendType, std::shared_ptr<BackendCreator>> &GetBack
}

const std::shared_ptr<BackendCreator> GetBackendCreator(BackendType type) {
if (type == MLLM_QNN) {
Layer::use_layername_2_tensorname = false;
}
registerBackend();

auto &gExtraCreator = GetBackendCreatorMap();
5 changes: 5 additions & 0 deletions src/Backend.hpp
@@ -13,6 +13,11 @@ class Op;
class Tensor;
class Backend;

// KVCache map for QNN-CPU KVCache sharing
#ifdef USE_QNN
static std::unordered_map<string, Op *> kv_cache_map;
#endif

class TensorFunction {
public:
virtual void setup(vector<Tensor *> outputs, vector<Tensor *> inputs, vector<float> args) = 0;