diff --git a/CMakeLists.txt b/CMakeLists.txt index f78c2426..82f46ea8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,13 @@ endif() # backend options option(QNN "Enable QNN" OFF) +option(QNN_OLD_FRONTEND "Enable Old QNN" OFF) +if(QNN) + add_definitions(-DUSE_QNN) # the USE_QNN should come before cpu subdirectory +endif() +if(QNN_OLD_FRONTEND) + add_definitions(-DOLD_QNN) +endif() if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") cmake_policy(SET CMP0135 NEW) @@ -116,8 +123,7 @@ include_directories(${PROJECT_SOURCE_DIR}/third_party/pybind11/include) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/cpu) -if(QNN) - add_definitions(-DUSE_QNN) +if(QNN) # QNN lib add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/backends/qnn) endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 882ba56b..a3e946d7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -53,7 +53,6 @@ endmacro() ## new demos - func_llm_add_executable(benchmark) func_llm_add_executable(demo_llama) func_llm_add_executable(demo_tinyllama) @@ -82,6 +81,7 @@ func_vlm_add_executable(demo_imagebind_1mod) # QNN demo if(QNN) + func_llm_add_executable(demo_qnn) func_llm_add_executable(main_qwen_npu) endif() diff --git a/examples/demo_qnn.cpp b/examples/demo_qnn.cpp index c42a202f..e8c9b98f 100644 --- a/examples/demo_qnn.cpp +++ b/examples/demo_qnn.cpp @@ -1,6 +1,7 @@ #include "cmdline.h" #include "models/qwen/configuration_qwen.hpp" #include "models/qwen/modeling_qwen_npu.hpp" +#include "models/qwen/modeling_qwen.hpp" #include "models/qwen/tokenization_qwen.hpp" using namespace mllm; @@ -9,7 +10,7 @@ int main(int argc, char **argv) { cmdline::parser cmdParser; cmdParser.add("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm"); cmdParser.add("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt"); - cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm"); + cmdParser.add("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-chat-int8.mllm"); cmdParser.add("billion", 'b', "[0.5B | 1.8B]", false, "1.8B"); cmdParser.add("limits", 'l', "max KV cache size", false, 400); cmdParser.add("thread", 't', "num of threads", false, 4); @@ -24,8 +25,10 @@ int main(int argc, char **argv) { auto tokenizer = QWenTokenizer(vocab_path, merge_path); QWenConfig config(tokens_limit, model_billion, RoPEType::HFHUBROPE); - auto model = QWenForCausalLM(config); + auto model = QWenForCausalLM_NPU(config); model.load(model_path); + // auto decoding_model = QWenForCausalLM(config); + // decoding_model.load("../models/qwen-1.5-1.8b-chat-q4k.mllm"); vector in_strs = { " Give me a short introduction to large language model.", @@ -33,16 +36,18 @@ int main(int argc, char **argv) { for (int i = 0; i < in_strs.size(); ++i) { auto input_str = tokenizer.apply_chat_template(in_strs[i]); - auto input_tensor = tokenizer.tokenize(input_str); + auto [real_seq_length, input_tensor] = tokenizer.tokenizeWithPadding(input_str, 64, config.vocab_size); std::cout << "[Q] " << in_strs[i] << std::endl; std::cout << "[A] " << std::flush; LlmTextGeneratorOpts opt{ .max_new_tokens = 100, - .do_sample = true, + .do_sample = false, .temperature = 0.3f, .top_k = 50, .top_p = 0.f, + .is_padding = true, + .seq_before_padding = real_seq_length, }; model.generate(input_tensor, opt, [&](unsigned int out_token) -> bool { auto out_string = tokenizer.detokenize({out_token}); @@ -51,6 +56,24 @@ int main(int argc, char **argv) { 
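// Note on the padded prefill path above: tokenizeWithPadding() pads the prompt to the
// 64-token chunk and also returns the real (unpadded) sequence length, and setting
// is_padding / seq_before_padding in LlmTextGeneratorOpts makes the generator read the
// logits at position seq_before_padding - 1 instead of the last padded position
// (see the Generate.hpp change further down in this patch).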
std::cout << output_string << std::flush; return true; }); - std::cout << "FINISH\n"; + + LlmTextGeneratorOpts decoding_opt{ + .max_new_tokens = 100, + .do_sample = false, + .temperature = 0.3f, + .top_k = 50, + .top_p = 0.f, + .is_padding = false, + }; + // decoding_model.generate(input_tensor, decoding_opt, [&](unsigned int out_token) -> bool { + // auto out_string = tokenizer.detokenize({out_token}); + // auto [isOk, print_string] = processOutput(out_string); + // if (isOk) { + // std::cout << print_string << std::flush; + // } else { + // return false; + // } + // return true; + // }); } } \ No newline at end of file diff --git a/examples/main_qwen_npu.cpp b/examples/main_qwen_npu.cpp index e482665c..b523f806 100644 --- a/examples/main_qwen_npu.cpp +++ b/examples/main_qwen_npu.cpp @@ -138,17 +138,12 @@ int main(int argc, char **argv) { std::cout << "[Q] " << in_str << std::endl; std::cout << "[A] " << std::flush; - // cpuExe.run(&cpuNet, {input}); - // auto result = cpuExe.result(); - // auto token_idx = postProcessing(result[0], input); - - // auto out_token = tokenizer.detokenize({token_idx}); - // std::cout << out_token << std::flush; - // exit(0); - do { // 1: Prefill stage using NPU chunk execute - npuExe.run(npu_ctx, &npuNet, {input}); + if (chunk == 1) + npuExe.run(npu_ctx, &npuNet, {input}); + else + npuExe.runExp(npu_ctx, &npuNet, {input}); auto result = npuExe.result(); // inter model for prefill-decode diff --git a/examples/main_qwen_npu.hpp b/examples/main_qwen_npu.hpp index 1a8f42ac..01874b1d 100644 --- a/examples/main_qwen_npu.hpp +++ b/examples/main_qwen_npu.hpp @@ -10,11 +10,12 @@ namespace modeling { NetTensor *Qwen_FFN_NPU(Context *c, NetTensor *i, int hidden_dim, int ffn_hidden_dim, string name) { auto *x = _LinearINT8({i}, hidden_dim, ffn_hidden_dim, false, name + ".gate_proj"); auto *y = _LinearINT8({i}, hidden_dim, ffn_hidden_dim, false, name + ".up_proj"); - x = _Dequantize({x}, true, (string)name + ".gate_proj.dequantize", true); - y = _Dequantize({y}, true, (string)name + ".up_proj.dequantize", true); - x = _SiLU({x}, name + ".silu"); - x = *x * y; - x = _Quantize({x}, true, (string)name + ".down_proj.quantize"); + x = _SuperSiLU({x,y}, name + ".supersilu"); + // x = _Dequantize({x}, true, (string)name + ".gate_proj.dequantize", false); + // y = _Dequantize({y}, true, (string)name + ".up_proj.dequantize", false); + // x = _SiLU({x}, name + ".silu"); + // x = *x * y; + // x = _Quantize({x}, true, (string)name + ".down_proj.quantize"); x = _LinearINT8({x}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj"); x = _Dequantize({x}, true, (string)name + ".down_proj.dequantize"); return x; @@ -29,9 +30,9 @@ std::vector Qwen_CPUNPUAttention(Context *c, NetTensor *x, NetTenso k = k->view(1, head_size, seq / chunk, hidden_size); v = v->view(1, head_size, seq / chunk, hidden_size); - q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize"); - k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize"); - v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize"); + q = _Dequantize({q}, true, (string)name + ".q_proj.dequantize", true); + k = _Dequantize({k}, true, (string)name + ".k_proj.dequantize", false); + v = _Dequantize({v}, true, (string)name + ".v_proj.dequantize", false); v = _Transpose({v}, {0, 2, 3, 1}, (string)name + ".v_proj.transpose"); @@ -153,41 +154,6 @@ NetTensor *Qwen_FFN_CPU_q4k(Context *c, NetTensor *i, int hidden_dim, int ffn_hi return x; } -void qwen_cpu(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hidden_dim 
= 11008, int mutil_head_size = 32, int cache_max = 200, int seq = 256, int chunk = 2) { - auto *i = _Input(c); - i = _Embedding({i}, vocab_size, hidden_dim, (string) "model.embed_tokens"); - - for (int layer = 0; layer < 24; ++layer) { - auto res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".input_layernorm"); - - i = *Qwen_CPUAttention(c, res, hidden_dim, hidden_dim / mutil_head_size, mutil_head_size, cache_max, (string) "model.layers." + std::to_string(layer) + ".self_attn", seq, chunk) + i; - - res = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm"); - - if (layer != 6 && layer != 1 && layer != 2) { - i = *Qwen_FFN_CPU(c, res, hidden_dim, ffn_hidden_dim, (string) "model.layers." + std::to_string(layer) + ".mlp") + i; - } else { - auto name = (string) "model.layers." + std::to_string(layer) + ".mlp"; - - auto *x = _LinearINT8({res}, hidden_dim, ffn_hidden_dim, false, name + ".gate_proj"); - x = _SiLU({x}, name + ".silu"); - auto *y = _LinearINT8({res}, hidden_dim, ffn_hidden_dim, false, name + ".up_proj"); - x = *x * y; // x = _Mul( {x, y}, name+".dot"); - - auto *i1 = x; - x = _LinearINT8({x}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj"); - - auto *i2 = x; - - i = *x + i; - - i = _LinearINT8Shadow({i1, i2, i}, ffn_hidden_dim, hidden_dim, false, name + ".down_proj.shadow"); - } - } - i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.norm"); - i = _Linear({i}, hidden_dim, vocab_size, false, "lm_head"); -} - void qwen_cpu_q4k(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn_hidden_dim = 11008, int mutil_head_size = 32, int cache_max = 200, int seq = 256, int chunk = 2) { auto *i = _Input(c); i = _Embedding({i}, vocab_size, hidden_dim, (string) "model.embed_tokens"); @@ -242,9 +208,9 @@ void qwen_npu(Context *c, int vocab_size = 32000, int hidden_dim = 4096, int ffn res = i; - i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm"); + i = _RMSNorm({i}, hidden_dim, 1e-6, (string) "model.layers." + std::to_string(layer) + ".post_attention_layernorm", false); - i = _Quantize({i}, true, (string) "model.layers." + std::to_string(layer) + ".mlp.up_proj.quantize"); + // i = _Quantize({i}, true, (string) "model.layers." 
+ std::to_string(layer) + ".mlp.up_proj.quantize"); i = i->view(1, static_cast(seq / chunk / 32), static_cast(32), hidden_dim); diff --git a/include/OpDefined.hpp b/include/OpDefined.hpp index 1b6730ca..e38b8163 100644 --- a/include/OpDefined.hpp +++ b/include/OpDefined.hpp @@ -61,6 +61,7 @@ enum OpType { MERGEOUTPUT, SPLITINPUT, IROPE, + SUPERSILU, OP_NUM }; @@ -107,6 +108,7 @@ static const vector OpNames = { "Range", "Where", "Replace", + "Predictor", "SparseLinear", "SparseIdLinear", "ElasticLinear", @@ -117,6 +119,7 @@ static const vector OpNames = { "MergeOutput", "SplitInput", "IRoPE", + "SuperSiLU", "OP_NUM"}; enum TensorFuncType { diff --git a/include/Types.hpp b/include/Types.hpp index c0a4cebb..55b545c5 100644 --- a/include/Types.hpp +++ b/include/Types.hpp @@ -93,9 +93,9 @@ inline std::map, ChlType> Chls2Type = { {{0, 3, 4, 1, 2}, BWCTH}}; enum TensorType { - INPUT_TENSOR = 0, + INPUT_TENSOR = 0, // used for input of the model NORMAL_TENSOR, - OUTPUT_TENSOR, + GRAPH_OUTPUT, // used for output of a graph }; enum Chl { diff --git a/scripts/build_qnn_android.sh b/scripts/build_qnn_android.sh index e509cfc6..3af1b2ad 100755 --- a/scripts/build_qnn_android.sh +++ b/scripts/build_qnn_android.sh @@ -12,6 +12,7 @@ cmake .. \ -DQNN=ON \ -DDEBUG=OFF \ -DTEST=OFF \ --DQUANT=OFF +-DQUANT=OFF \ +-DQNN_OLD_FRONTEND=ON make -j4 diff --git a/src/Backend.cpp b/src/Backend.cpp index bcaeb503..eeb236ce 100644 --- a/src/Backend.cpp +++ b/src/Backend.cpp @@ -3,6 +3,7 @@ #include #include #include +#include "Layer.hpp" namespace mllm { extern void registerCPUBackendCreator(); @@ -29,6 +30,9 @@ static std::unordered_map> &GetBack } const std::shared_ptr GetBackendCreator(BackendType type) { + if (type == MLLM_QNN) { + Layer::use_layername_2_tensorname = false; + } registerBackend(); auto &gExtraCreator = GetBackendCreatorMap(); diff --git a/src/Backend.hpp b/src/Backend.hpp index 52eca0a8..bf94274e 100644 --- a/src/Backend.hpp +++ b/src/Backend.hpp @@ -13,6 +13,11 @@ class Op; class Tensor; class Backend; +// KVCache map for QNN-CPU KVCache sharing +#ifdef USE_QNN +static std::unordered_map kv_cache_map; +#endif + class TensorFunction { public: virtual void setup(vector outputs, vector inputs, vector args) = 0; diff --git a/src/Generate.hpp b/src/Generate.hpp index 41f58b1b..7c5b11b9 100644 --- a/src/Generate.hpp +++ b/src/Generate.hpp @@ -27,6 +27,8 @@ struct LlmTextGeneratorOpts { float temperature = 0.7; int top_k = 5; float top_p = 0.92; + bool is_padding = false; + int seq_before_padding = 0; }; template @@ -47,14 +49,24 @@ enum class LLmTextGeneratorType : int32_t { }; class _LlmTextGenerateMethod { + bool is_padding = false; + int seq_before_padding = 0; public: virtual ~_LlmTextGenerateMethod() = default; virtual unsigned int generate(Tensor &t) = 0; + inline void setPadding(bool is_padding, int seq_before_padding) { + this->is_padding = is_padding; + this->seq_before_padding = seq_before_padding; + } inline void _tensor_to_vec(Tensor &t, std::vector &scores) { assert(t.batch() == 1 && "Batch size of result is not 1. Which is not supported for now."); assert(t.head() == 1 && "The 3rd dim of result should be one. 
e.g.:[1, 1, seq, hidden]"); int _dims = t.dimension(); int _seq = t.sequence() - 1; + // padding prefill for QNN + if (is_padding) { + _seq = seq_before_padding - 1; + } for (int i = 0; i < _dims; ++i) { auto value = t.dataAt(0, 0, _seq, i); scores.push_back(value); @@ -144,6 +156,11 @@ class LlmTextGenerator { assert(false && "NIY"); break; } + + // padding prefill for QNN + if (opt.is_padding) { + m_method_class->setPadding(opt.is_padding, opt.seq_before_padding); + } } inline unsigned int generate(Tensor &t) { diff --git a/src/Graph.cpp b/src/Graph.cpp index 45d7d560..11b1ab42 100644 --- a/src/Graph.cpp +++ b/src/Graph.cpp @@ -18,9 +18,6 @@ std::string intToStringWithLeadingZero(int num) { namespace mllm { -#ifdef USE_QNN -static unordered_map kv_cache_map; -#endif Graph::Graph(const NetParameter ¶m, Backend *bn, unordered_map> &external_tensors, @@ -149,7 +146,7 @@ void Graph::setUpTensors() { // set graph out tensor TensorType auto &graph_out_tensors = ops_output_tensors_[op_names_[op_names_.size() - 1]]; for (auto &t : graph_out_tensors) { - t->setTtype(OUTPUT_TENSOR); + t->setTtype(GRAPH_OUTPUT); } this->backend_->onSetUpStart(graph_in_tensors, graph_out_tensors); diff --git a/src/Layer.hpp b/src/Layer.hpp index 93fd01ea..6cae49f2 100644 --- a/src/Layer.hpp +++ b/src/Layer.hpp @@ -12,6 +12,7 @@ #include #include +#include "OpDefined.hpp" #include "Tensor.hpp" #include "Op.hpp" #include "ParamLoader.hpp" @@ -54,6 +55,11 @@ class Layer { return ts[0].get(); } + Tensor &operator()(Tensor &input0, Tensor &input1, Tensor &input2) { + auto ts = run({input0, input1, input2}, 1); + return ts[0].get(); + } + private: std::string name_num_to_X(const std::string &input_string) { std::regex pattern(R"(\.\d{1,3}\.)"); // Matches any number between 1 and 100 between two dots @@ -116,7 +122,26 @@ class Layer { if (module->doLoad || !inited_loaded) { do_init = !inited_loaded; if (op_ == nullptr) { +#ifdef USE_QNN + if (param_["type"] == KVCACHE || param_["type"] == KVCACHENPU) { + if (kv_cache_map.find(name_) == kv_cache_map.end()) { + // for the prefill part, we need to create a new op + param_["type"] = KVCACHENPU; + op_ = backend_->opCreate(param_, name_); + kv_cache_map[name_] = op_; + } else { +#ifdef DEBUGPRINT + std::cout << name_ << " is shared used" << std::endl; +#endif + // for the decoding part, we need to get created op from global container + op_ = kv_cache_map[name_]; + } + } else { + op_ = backend_->opCreate(param_, name_); + } +#else op_ = backend_->opCreate(param_, name_); +#endif } if (module->doLoad) { op_->load(*module->loader); @@ -305,6 +330,21 @@ class ElasticLinear final : public Layer { } }; +class ShadowLinear final : public Layer { +public: + ShadowLinear() = default; + explicit ShadowLinear(int in_features, int out_features, bool bias, std::string name) { + param_["in_features"] = in_features; + param_["out_features"] = out_features; + param_["bias"] = (float)bias; + init(std::move(name), OpType::LINEARINT8SHADOW); + } + Tensor &operator()(Tensor &input0, Tensor &input1, Tensor &input2) { + auto ts = run({input0, input1, input2}, 1); + return ts[0].get(); + } +}; + class SiLU final : public Layer { public: SiLU() = default; @@ -502,6 +542,15 @@ class KVCache final : public Layer { param_["cache_max"] = cache_max; init(std::move(name), OpType::KVCACHE); } + explicit KVCache(int n_rep, int cache_max, std::string name, bool npuEnbaled) { + param_["n_rep"] = n_rep; + param_["cache_max"] = cache_max; + if (npuEnbaled) { + init(std::move(name), OpType::KVCACHENPU); + } else { 
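// When NPU prefill is not enabled this falls back to the ordinary CPU KVCache op,
// presumably so the same layer definition can also serve the CPU-only decoding model
// that shares the cache op through kv_cache_map.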
+ init(std::move(name), OpType::KVCACHE); + } + } Tensor &operator()(Tensor &input) { auto ts = run({input}, 1); return ts[0].get(); @@ -543,6 +592,14 @@ class RMSNorm final : public Layer { init(std::move(name), OpType::RMSNORM); } + // int8 output rmsnorm for qnn + explicit RMSNorm(int norm_size, float epsilon, std::string name, bool isFP32) { + param_["norm_size"] = norm_size; + param_["epsilon"] = epsilon; + param_["isFP32"] = (float)isFP32; + init(std::move(name), OpType::RMSNORM); + } + Tensor &operator()(Tensor &input) { auto ts = run({input}, 1); return ts[0].get(); @@ -667,6 +724,8 @@ class Position final : public Layer { } }; +// Only for QNN START + class Quantize final : public Layer { public: explicit Quantize(bool isNSHD, std::string name) { @@ -679,6 +738,111 @@ class Quantize final : public Layer { } }; +class Dequantize final : public Layer { +public: + explicit Dequantize(bool isNSHD, std::string name, bool isFP32 = true) { + param_["isNSHD"] = (float)isNSHD; + param_["isFP32"] = (float)isFP32; + init(std::move(name), OpType::DEQUANTIZE); + } + Tensor &operator()(Tensor &input) { + auto ts = run({input}, 1); + return ts[0].get(); + } +}; + +class Add final : public Layer { +public: + explicit Add(std::string name) { + init(std::move(name), OpType::ADD); + } + Tensor &operator()(Tensor &input0, Tensor &input1) { + auto ts = run({input0, input1}, 1); + return ts[0].get(); + } +}; + +class Mul final : public Layer { +public: + explicit Mul(std::string name) { + init(std::move(name), OpType::MUL); + } + Tensor &operator()(Tensor &input0, Tensor &input1) { + auto ts = run({input0, input1}, 1); + return ts[0].get(); + } +}; + +class View final : public Layer { +public: + explicit View(int batch, int head, int seq, int dim, std::string name) { + vector dims; + vector data_dims; + if (batch == -1 & seq == -1 & head != -1 & dim != -1) { // keep b&s change h&d + if (head != 1) { + dims = {batch, head, seq, -1}; + data_dims = {BATCH, DIMENSION, SEQUENCE, DIMENSION}; + } else { + dims = {batch, -1, seq, -1}; + data_dims = {BATCH, -1, SEQUENCE, HEAD + DIMENSION}; + } + } else if (batch == -1 & dim == -1 & head != -1 & seq != -1) { // keep b&d change h&s + if (head != 1) { + dims = {batch, head, -1, dim}; + data_dims = {BATCH, SEQUENCE, SEQUENCE, DIMENSION}; + } else { + dims = {batch, -1, -1, dim}; + data_dims = {BATCH, -1, HEAD + SEQUENCE, DIMENSION}; + } + } else if (head == -1 & dim == -1 & batch != -1 & seq != -1) { // keep h&d change b&s + if (seq != 1) { + dims = {-1, head, seq, dim}; + data_dims = {BATCH, HEAD, BATCH, DIMENSION}; + } else { + dims = {-1, head, -1, dim}; + data_dims = {BATCH + SEQUENCE, HEAD, -1, DIMENSION}; + } + } else if (batch != -1 & dim != -1 & head != -1 & seq != -1) { // change all dimension. 
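// Note: the branch conditions above use bitwise '&' between boolean comparisons; this
// works because each comparison yields 0 or 1, but the conventional '&&' would make the
// intent clearer.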
+ + dims = {batch, head, seq, dim}; + data_dims = {BATCH, HEAD, SEQUENCE, DIMENSION}; + + } else { + std::cout << "ERROR: " << name << " view [" << batch << ", " << head << ", " << seq << ", " << dim << "]" << std::endl; + } + param_["dim0"] = dims[0]; + param_["dim1"] = dims[1]; + param_["dim2"] = dims[2]; + param_["dim3"] = dims[3]; + param_["data_dim0"] = data_dims[0]; + param_["data_dim1"] = data_dims[1]; + param_["data_dim2"] = data_dims[2]; + param_["data_dim3"] = data_dims[3]; + init(std::move(name), OpType::VIEW); + } + Tensor &operator()(Tensor &input) { + auto ts = run({input}, 1); + return ts[0].get(); + } +}; + +class Transpose final : public Layer { +public: + explicit Transpose(std::vector perm, std::string name) { + param_["perm0"] = perm[0]; + param_["perm1"] = perm[1]; + param_["perm2"] = perm[2]; + param_["perm3"] = perm[3]; + init(std::move(name), OpType::TRANSPOSE); + } + Tensor &operator()(Tensor &input) { + auto ts = run({input}, 1); + return ts[0].get(); + } +}; + +// Only for QNN END + } // namespace mllm #endif // OPERATION_H \ No newline at end of file diff --git a/src/Module.cpp b/src/Module.cpp index 0c7a18bd..942e9941 100644 --- a/src/Module.cpp +++ b/src/Module.cpp @@ -10,12 +10,12 @@ namespace mllm { // AbstructLoader *Module::loader; // TensorStatus Tensor::tensor_status; // bool Module::doLoad = false; +// The llm_model_ptr is a pointer to the outmost module Module *Module::llm_model_ptr; int Module::listIdx; int Module::runlistIdx; - -bool Module::doToDevice = false; +// TensorStatus Tensor::tensor_status; BackendType Module::tmp_device = MLLM_CPU; std::unordered_map> Module::tensor_func_ops; diff --git a/src/Module.hpp b/src/Module.hpp index 3e982a47..e96b2254 100644 --- a/src/Module.hpp +++ b/src/Module.hpp @@ -33,7 +33,7 @@ class Module { BackendType device_ = BackendType::MLLM_CPU; public: - map> activation_tensors = {}; + map> activation_tensors; AbstructLoader *loader; bool doLoad = false; @@ -95,8 +95,9 @@ class Module { } void load(string path) { - ParamLoader param_loader(std::move(path)); - load(param_loader); + // create global loader and save to llm_model_ptr.loader as QNNBackend needs to load weights in runtime + loader = new ParamLoader(std::move(path)); + load(*loader); } void load(AbstructLoader ¶m_loader) { Tensor::tensor_status = TENSOR_STATIC_INIT; @@ -146,10 +147,22 @@ class Module { vector operator()(vector inputs, Args... 
args) { vector anyArgs = convertArgsToAnyVector(args...); // set static tmp_device to device_ to init layers' op + auto previoud_device = tmp_device; Module::tmp_device = device_; + // Module Loading if (llm_model_ptr && llm_model_ptr->doLoad) { - return Forward(inputs, anyArgs); + auto outputs = Forward(inputs, anyArgs); + // for inner module, set output tensors to GRAPH_OUTPUT + if (inputs[0].ttype() != TensorType::INPUT_TENSOR) { // XPUs' module should not be the outermost input tensor + for (auto &output : outputs) { + inputs[0].module()->activation_tensors[output.name()]->setTtype(GRAPH_OUTPUT); + } + } + // set Module::tmp_device to previous device + Module::tmp_device = previoud_device; + return outputs; } + // Module setUp & execute if (inputs[0].ttype() == TensorType::INPUT_TENSOR) { if (prefilling_token_size_ == 0) { // first time init prefilling_token_size_ = inputs[0].sequence(); @@ -192,7 +205,42 @@ class Module { } return output; - } else { + } else { // inner Modules + // offload according to the backends' info inited during loading + if (Tensor::tensor_status == TENSOR_STATIC_INIT && device_ != MLLM_CPU) { // backend specific module reshape & setup + auto inputs_vec = vector>(); + auto outputs_vec = vector>(); + for (auto &i : inputs) { + inputs_vec.push_back(inputs[0].module()->activation_tensors[i.name()]); + } + auto getUinqueName = [this]() -> string { + std::ostringstream oss; + oss << "Module@" << this; + return oss.str(); + }; + Backend::global_backends[device_]->onSetUpStart(inputs_vec, outputs_vec, getUinqueName()); + auto outputs = Forward(inputs, anyArgs); + for (auto &output : outputs) { + outputs_vec.push_back(inputs[0].module()->activation_tensors[output.name()]); + } + Backend::global_backends[device_]->onSetUpEnd(inputs_vec, outputs_vec, getUinqueName()); + return outputs; + } else if (Tensor::tensor_status == TENSOR_STATIC_READY && device_ != MLLM_CPU) { // backend specific module execute + auto inputs_vec = vector>(); + auto outputs_vec = vector>(); + for (auto &i : inputs) { + inputs_vec.push_back(inputs[0].module()->activation_tensors[i.name()]); + } + auto getUinqueName = [this]() -> string { + std::ostringstream oss; + oss << "Module@" << this; + return oss.str(); + }; + Backend::global_backends[device_]->onExecuteStart(inputs_vec, outputs_vec, getUinqueName()); + auto outputs = Forward(inputs, anyArgs); + Backend::global_backends[device_]->onExecuteEnd(); + return outputs; + } return Forward(inputs, anyArgs); } } diff --git a/src/Tensor.cpp b/src/Tensor.cpp index 77bbc565..c7feeea3 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -347,4 +347,33 @@ vector> Tensor::split(Tensor &input, std::vector< {module->activation_tensors[input.name()].get()}); } +Tensor &Tensor::to(BackendType backend_type) { + // TODO: check if the data is shared between devices + // if so, return the origin tensor + // if not, return the new tensor + // TODO: if need copy, should implement copyDataCrossBn and do copy when Tensor::TENSOR_STATIC_READY + + /** + * Currently, there are following cases: + * CPU -> QNN, QNN -> CPU + * if it is CPU -> QNN, the buffer should be realloced + * (NOTE: not handling data copy as the tensor.to() shoudld be called before the data is set and tensor.device() should be checked in frontend) + * if it is QNN -> CPU, the data is sharable between CPU and QNN, no need to copy or realloc + */ + if (device() == backend_type) { + return *this; + } + if (backend_type == MLLM_CPU && device() == MLLM_QNN) { + // data is sharable between CPU and QNN + 
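// The buffer backing a QNN tensor stays accessible from the CPU, so switching the
// logical device back to CPU needs no copy or reallocation here, unlike the
// CPU -> QNN direction below, which frees the CPU buffer and re-allocates it on the
// QNN backend.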
return *this; + } + // realloc the tensor + if (backend_type == MLLM_QNN && device() == MLLM_CPU) { + this->free(); + } + module()->activation_tensors[name()]->setBackend(Backend::global_backends[backend_type]); + this->alloc(); + return *this; +}; + } // namespace mllm \ No newline at end of file diff --git a/src/Tensor.hpp b/src/Tensor.hpp index 75344e29..9edef3c5 100644 --- a/src/Tensor.hpp +++ b/src/Tensor.hpp @@ -1084,8 +1084,9 @@ class Tensor { Tensor &to(BackendType backend_type); static vector toDevice(vector inputs, BackendType backend_type) { - // TODO: implement - std::cout << "tensor should be transfered across backend" << std::endl; + for (auto &input : inputs) { + input.to(backend_type); + } return inputs; }; static vector toCPU(vector inputs) { diff --git a/src/backends/cpu/CPUEmbedding.cpp b/src/backends/cpu/CPUEmbedding.cpp index 66286dac..51fe1a40 100644 --- a/src/backends/cpu/CPUEmbedding.cpp +++ b/src/backends/cpu/CPUEmbedding.cpp @@ -11,7 +11,6 @@ CPUEmbedding::CPUEmbedding(Backend *bn, string opName, int hiddenSize, int voca weight_.setBackend(bn); } ErrorCode CPUEmbedding::reshape(vector> inputs, vector> outputs) { - assert(inputs.size() == 1); assert(outputs.size() == 1); auto input = inputs[0]; diff --git a/src/backends/cpu/CPUKVCacheNPU.cpp b/src/backends/cpu/CPUKVCacheNPU.cpp index 385b88c0..582a52e8 100644 --- a/src/backends/cpu/CPUKVCacheNPU.cpp +++ b/src/backends/cpu/CPUKVCacheNPU.cpp @@ -12,7 +12,7 @@ CPUKVCacheNPU::CPUKVCacheNPU(Backend *bn, string opName, int n_rep, int cache_ma cache_.setBackend(bn); // TODO: Chaning it to FP16 - cache_.setDtype(MLLM_TYPE_F32); + cache_.setDtype(MLLM_TYPE_F16); cache_limit_ = cache_max; } @@ -25,19 +25,20 @@ ErrorCode CPUKVCacheNPU::reshape(vector> inputs, vector(backend_); -#ifdef USE_QNN - if (cpuBackend->isStageSwitching()) { - cache_seq_len_ = cpuBackend->getSequenceLength(); - } -#endif - // the input is from QNN linear, the V is not transposed, so we need to transpose it here - if (name().find("v_cache") != std::string::npos) { + // when using the old frontend, the V will be transposed here; while in the module API, the V will be transposed in the QNNTranspose + if (name().find("v_cache") != std::string::npos && inputs[0]->ctype() != BHDS) { inputs[0]->transShape(SEQUENCE, DIMENSION); } } +#ifdef USE_QNN + // when the execution is switched from pref to dec, the sequence length should be set to the no padding length + auto cpuBackend = dynamic_cast(backend_); + if (cpuBackend->isStageSwitching()) { + cache_seq_len_ = cpuBackend->getSequenceLength(); + isDecoding = true; + } +#endif outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() + cache_seq_len_, inputs[0]->dimension()); @@ -56,6 +57,13 @@ ErrorCode CPUKVCacheNPU::load(AbstructLoader &loader) { } ErrorCode CPUKVCacheNPU::execute(vector> inputs, vector> outputs) { + // when decoding, the input will deepCopy from cache, no need to execute + if (isDecoding) { + int cache_seq_len_old = cache_seq_len_; + cache_seq_len_ += inputs[0]->sequence(); + return MLLM_NO_ERROR; + } + if (cache_.ctype() == BSHD && inputs[0]->ctype() == BSHD) { // 'K' #pragma omp parallel for collapse(3) num_threads(thread_count) for (int b = 0; b < cache_.batch(); ++b) { @@ -131,6 +139,20 @@ ErrorCode CPUKVCacheNPU::setUp(vector> inputs, vectorsetDtype(cache_.dtype()); + outputs[0]->deepCopyFrom(cache_, false, {0, 0, cache_seq_len_ / cache_limit_, 0}); + if (inputs[0]->sequence() + cache_seq_len_ > cache_limit_) { + outputs[0]->deepCopyFrom(cache_, false, {0, 0, 
cache_seq_len_ % cache_limit_ + 1, 0}); + } + if (inputs[0]->masterTensor() == nullptr) { + inputs[0]->free(); + } + inputs[0]->deepCopyFrom(cache_, false, {0, 0, cache_seq_len_ % cache_limit_, 0}); + return MLLM_NO_ERROR; + } + // output setup outputs[0]->setDtype(cache_.dtype()); outputs[0]->deepCopyFrom(cache_, false, {0, 0, cache_seq_len_ / cache_limit_, 0}); diff --git a/src/backends/cpu/CPUKVCacheNPU.hpp b/src/backends/cpu/CPUKVCacheNPU.hpp index 54b96e23..014d32b5 100644 --- a/src/backends/cpu/CPUKVCacheNPU.hpp +++ b/src/backends/cpu/CPUKVCacheNPU.hpp @@ -26,11 +26,19 @@ class CPUKVCacheNPU final : public Op { Tensor cache_; + int getCacheSeqLen() override { + return cache_seq_len_; + } + void clearCache() override { + cache_seq_len_ = 0; + } + private: int thread_count = 4; int cache_seq_len_ = -999; int n_rep_ = 1; + bool isDecoding = false; int cache_limit_; }; diff --git a/src/backends/cpu/CPULinearINT8Shadow.cpp b/src/backends/cpu/CPULinearINT8Shadow.cpp index e61375f0..6eb25795 100755 --- a/src/backends/cpu/CPULinearINT8Shadow.cpp +++ b/src/backends/cpu/CPULinearINT8Shadow.cpp @@ -79,13 +79,6 @@ ErrorCode CPULinearINT8Shadow::load(AbstructLoader &loader) { memcpy(shadowWeight_.hostPtr(), weight_.hostPtr(), in_features_ * out_features_); - // shadowTransposeWeight_.setName(opName + ".shadow.transpose.weight"); - // shadowTransposeWeight_.reshape(1, 1, out_features_, in_features_); - // shadowTransposeWeight_.setDtype(MLLM_TYPE_I8); - // shadowTransposeWeight_.alloc(); - - // memcpy(shadowTransposeWeight_.hostPtr(), weight_.hostPtr(), in_features_*out_features_); - weight_.free(); return Op::load(loader); @@ -114,7 +107,7 @@ ErrorCode CPULinearINT8Shadow::execute(vector> inputs, vector output_scale = roundf(output_scale * 100000) / 100000; - memcpy(outputs[0]->hostPtr(), inputs[2]->hostPtr(), inputs[2]->batch() * inputs[2]->head() * inputs[2]->sequence() * inputs[2]->dimension() * sizeof(float)); + memcpy(outputs[0]->hostPtr(), inputs[2]->hostPtr(), inputs[2]->cntSize()); // input outliers if (!input_clip) { @@ -124,7 +117,6 @@ ErrorCode CPULinearINT8Shadow::execute(vector> inputs, vector #pragma omp parallel for num_threads(thread_count) for (int k = 0; k < inputs[0]->dimension(); k++) { if (roundf(inputs[0]->dataAt(i, h, j, k) / input_scale) > 127.0 || roundf(inputs[0]->dataAt(i, h, j, k) / input_scale) < -128.0) { -#pragma omp parallel for num_threads(thread_count) for (int w = 0; w < shadowWeight_.dimension(); w++) { // if (!(inputs[1]->dataAt(i, h, j, k) <= -128 || inputs[1]->dataAt(i, h, j, k) >= 127)) { @@ -148,19 +140,16 @@ ErrorCode CPULinearINT8Shadow::execute(vector> inputs, vector for (int i = 0; i < inputs[1]->batch(); i++) { for (int h = 0; h < inputs[1]->head(); h++) { for (int j = 0; j < inputs[1]->sequence(); j++) { +#pragma omp parallel for num_threads(thread_count) for (int k = 0; k < inputs[1]->dimension(); k++) { - if (inputs[1]->dataAt(i, h, j, k) <= (output_scale * -127.9) || inputs[1]->dataAt(i, h, j, k) >= (output_scale * -126.9)) { + if (inputs[1]->dataAt(i, h, j, k) <= -128 || inputs[1]->dataAt(i, h, j, k) >= 127) { float sum = 0.0f; for (int w = 0; w < shadowWeight_.sequence(); w++) { sum += roundf(inputs[0]->dataAt(i, h, j, w) / input_scale) * input_scale * (shadowWeight_.dataAt(0, 0, w, k) * weight_scale); } - // sum = sum - (inputs[1]->dataAt(i, h, j, k) * output_scale); - - // outputs[0]->setDataAt(i,h,j,k, roundf(sum/output_scale) * output_scale); - - outputs[0]->setDataAt(i, h, j, k, inputs[2]->dataAt(i, h, j, k) - inputs[1]->dataAt(i, h, j, k) 
+ roundf(sum / output_scale) * output_scale); + outputs[0]->setDataAt(i, h, j, k, inputs[2]->dataAt(i, h, j, k) - (inputs[1]->dataAt(i, h, j, k) * output_scale) + roundf(sum / output_scale) * output_scale); } } } diff --git a/src/backends/cpu/CPUQuantize.cpp b/src/backends/cpu/CPUQuantize.cpp index 64ab81f0..05ab2b5d 100644 --- a/src/backends/cpu/CPUQuantize.cpp +++ b/src/backends/cpu/CPUQuantize.cpp @@ -3,20 +3,19 @@ // #include "CPUQuantize.hpp" -#include "compute/Matmul.hpp" +#include "backends/cpu/quantize/QuantizeQ8.hpp" #include namespace mllm { CPUQuantize::CPUQuantize(Backend *bn, string opName, int threadCount):thread_count(threadCount), Op(bn, std::move(opName)) { - + activation_dtype_ = MLLM_TYPE_I8; scale_.setBackend(bn); } ErrorCode CPUQuantize::reshape(vector> inputs, vector> outputs) { assert(inputs.size() == 1); assert(outputs.size() == 1); - activation_dtype_ = MLLM_TYPE_I8; outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), inputs[0]->dimension()); return Op::reshape(inputs, outputs); } diff --git a/src/backends/cpu/CPURoPE.cpp b/src/backends/cpu/CPURoPE.cpp index 5de956ae..9f304380 100644 --- a/src/backends/cpu/CPURoPE.cpp +++ b/src/backends/cpu/CPURoPE.cpp @@ -148,43 +148,68 @@ void CPURoPE::rope_hf(shared_ptr input, shared_ptr output){ int partial_dimension = (input->dimension()) * partial_rotary_factor_; int half = (int)(partial_dimension / 2); assert(partial_dimension%2==0); - if(output->ctype() == BSHD){ - if (out_dtype == MLLM_TYPE_F32){ + if(output->ctype() == BSHD){ + if (input->dtype() == MLLM_TYPE_F16) { + #pragma omp parallel for collapse(4) num_threads(thread_count) for (int n = 0; n < input->batch(); ++n) { for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance for (int d = 0; d < partial_dimension/2; ++d) { - auto v = input->ptrAt(n, h, s, d); - auto o = output->ptrAt(n, h, s, d); - float in_value = v[0]; - float in_value_2 = v[half]; + auto v = input->ptrAt(n, h, s, d); + auto o = output->ptrAt(n, h, s, d); + float in_value = static_cast(v[0]); + float in_value_2 = static_cast(v[half]); float sin_value = sin_[s + h_cnt_][d]; float cos_value = cos_[s + h_cnt_][d]; auto value = in_value * cos_value - in_value_2 * sin_value; auto value2 = in_value * sin_value + in_value_2 * cos_value; - o[0] = value; - o[half] = value2; + o[0] = MLLM_FP32_TO_FP16(value); + o[half] = MLLM_FP32_TO_FP16(value2); } } } } - }else if(out_dtype == MLLM_TYPE_F16){ + + } else { + + if (out_dtype == MLLM_TYPE_F32){ #pragma omp parallel for collapse(4) num_threads(thread_count) - for (int n = 0; n < input->batch(); ++n) { - for (int h = 0; h < input->head(); ++h) { - for (int s = 0; s < input->sequence(); ++s) { // sequance - for (int d = 0; d < partial_dimension/2; ++d) { - auto v = input->ptrAt(n, h, s, d); - auto o = output->ptrAt(n, h, s, d); - float in_value = v[0]; - float in_value_2 = v[half]; - float sin_value = sin_[s + h_cnt_][d]; - float cos_value = cos_[s + h_cnt_][d]; - auto value = in_value * cos_value - in_value_2 * sin_value; - auto value2 = in_value * sin_value + in_value_2 * cos_value; - o[0] = MLLM_FP32_TO_FP16(value); - o[half] = MLLM_FP32_TO_FP16(value2); + for (int n = 0; n < input->batch(); ++n) { + for (int h = 0; h < input->head(); ++h) { + for (int s = 0; s < input->sequence(); ++s) { // sequance + for (int d = 0; d < partial_dimension/2; ++d) { + auto v = input->ptrAt(n, h, s, d); + auto o = output->ptrAt(n, h, s, d); + float in_value = v[0]; + float in_value_2 = v[half]; + float 
sin_value = sin_[s + h_cnt_][d]; + float cos_value = cos_[s + h_cnt_][d]; + auto value = in_value * cos_value - in_value_2 * sin_value; + auto value2 = in_value * sin_value + in_value_2 * cos_value; + o[0] = value; + o[half] = value2; + } + } + } + } + }else if(out_dtype == MLLM_TYPE_F16){ +#pragma omp parallel for collapse(4) num_threads(thread_count) + for (int n = 0; n < input->batch(); ++n) { + for (int h = 0; h < input->head(); ++h) { + for (int s = 0; s < input->sequence(); ++s) { // sequance + for (int d = 0; d < partial_dimension/2; ++d) { + auto v = input->ptrAt(n, h, s, d); + auto o = output->ptrAt(n, h, s, d); + float in_value = v[0]; + float in_value_2 = v[half]; + float sin_value = sin_[s + h_cnt_][d]; + float cos_value = cos_[s + h_cnt_][d]; + auto value = in_value * cos_value - in_value_2 * sin_value; + auto value2 = in_value * sin_value + in_value_2 * cos_value; + o[0] = MLLM_FP32_TO_FP16(value); + o[half] = MLLM_FP32_TO_FP16(value2); + } } } } @@ -197,19 +222,39 @@ void CPURoPE::rope_hf(shared_ptr input, shared_ptr output){ for (int h = 0; h < input->head(); ++h) { for (int s = 0; s < input->sequence(); ++s) { // sequance for (int d = 0; d < partial_dimension/2; ++d) { - float in_value = input->dataAt(n, h, s, d); - float in_value_2 = input->dataAt(n, h, s, d + partial_dimension / 2); - float sin_value = sin_[s + h_cnt_][d]; - float cos_value = cos_[s + h_cnt_][d]; - auto value = in_value * cos_value - in_value_2 * sin_value; - auto value2 = in_value * sin_value + in_value_2 * cos_value; - if (out_dtype == MLLM_TYPE_F32) { - output->setDataAt(n, h, s, d, value); - output->setDataAt(n, h, s, d+ partial_dimension / 2, value2); - } else if (out_dtype == MLLM_TYPE_F16) { - output->setDataAt(n, h, s, d, MLLM_FP32_TO_FP16(value)); - output->setDataAt(n, h, s, d+ partial_dimension / 2, MLLM_FP32_TO_FP16(value2)); + + if (input->dtype()== MLLM_TYPE_F16) { + + float in_value = static_cast(input->dataAt(n, h, s, d)); + float in_value_2 = static_cast(input->dataAt(n, h, s, d + partial_dimension / 2)); + float sin_value = sin_[s + h_cnt_][d]; + float cos_value = cos_[s + h_cnt_][d]; + auto value = in_value * cos_value - in_value_2 * sin_value; + auto value2 = in_value * sin_value + in_value_2 * cos_value; + if (out_dtype == MLLM_TYPE_F32) { + output->setDataAt(n, h, s, d, value); + output->setDataAt(n, h, s, d+ partial_dimension / 2, value2); + } else if (out_dtype == MLLM_TYPE_F16) { + output->setDataAt(n, h, s, d, MLLM_FP32_TO_FP16(value)); + output->setDataAt(n, h, s, d+ partial_dimension / 2, MLLM_FP32_TO_FP16(value2)); + } + + } else { + float in_value = input->dataAt(n, h, s, d); + float in_value_2 = input->dataAt(n, h, s, d + partial_dimension / 2); + float sin_value = sin_[s + h_cnt_][d]; + float cos_value = cos_[s + h_cnt_][d]; + auto value = in_value * cos_value - in_value_2 * sin_value; + auto value2 = in_value * sin_value + in_value_2 * cos_value; + if (out_dtype == MLLM_TYPE_F32) { + output->setDataAt(n, h, s, d, value); + output->setDataAt(n, h, s, d+ partial_dimension / 2, value2); + } else if (out_dtype == MLLM_TYPE_F16) { + output->setDataAt(n, h, s, d, MLLM_FP32_TO_FP16(value)); + output->setDataAt(n, h, s, d+ partial_dimension / 2, MLLM_FP32_TO_FP16(value2)); + } } + } } } diff --git a/src/backends/cpu/CPUSplitInput.cpp b/src/backends/cpu/CPUSplitInput.cpp index 8d613d03..16959d17 100644 --- a/src/backends/cpu/CPUSplitInput.cpp +++ b/src/backends/cpu/CPUSplitInput.cpp @@ -15,13 +15,13 @@ ErrorCode CPUSplitInput::reshape(vector> inputs, 
vectorreshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() / 4 / 4, inputs[0]->dimension()); - outputs[1]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() / 4 / 4, inputs[0]->dimension()); - outputs[2]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() / 4 / 4, inputs[0]->dimension()); + outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() / 4, inputs[0]->dimension()); + outputs[1]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() / 4, inputs[0]->dimension()); + outputs[2]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() / 4, inputs[0]->dimension()); if (outputs.size() == 4) // do not * 4 since type is FP32 - outputs[3]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() / 4 / 4, inputs[0]->dimension()); + outputs[3]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() / 4, inputs[0]->dimension()); } else { outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), 1, inputs[0]->dimension()); @@ -37,8 +37,16 @@ ErrorCode CPUSplitInput::setUp(vector> inputs, vectordtype(); // return Op::setUp(inputs, outputs); + // for ( int i = 0; isetDtype(MLLM_TYPE_F32); + // outputs[i]->alloc(); + // } + for ( int i = 0; isetDtype(MLLM_TYPE_F32); + if(i == 0 || i==3) + outputs[i]->setDtype(MLLM_TYPE_F32); + else + outputs[i]->setDtype(MLLM_TYPE_F16); outputs[i]->alloc(); } return MLLM_NO_ERROR; diff --git a/src/backends/cpu/quantize/QuantizeQ8.cpp b/src/backends/cpu/quantize/QuantizeQ8.cpp index 3b65b626..ab2f89f5 100644 --- a/src/backends/cpu/quantize/QuantizeQ8.cpp +++ b/src/backends/cpu/quantize/QuantizeQ8.cpp @@ -382,6 +382,50 @@ void quantize_row_i8(const float *__restrict x, void *__restrict vy, int k, floa quantize_row_i8_reference(x, y, k, scale); #endif } +#if defined(__ARM_NEON) + +void dequantize_row_i8(const void *__restrict vx, float *__restrict y, int k, float scale) { + const int8_t *__restrict x = (int8_t *)vx; + + // Load scale into a NEON register + float32x4_t scale_vec = vdupq_n_f32(scale); + + int i; + for (i = 0; i <= k - 16; i += 16) { + // Load 16 int8_t values + int8x16_t x_vec = vld1q_s8(&x[i]); + + // De-interleave into lower and upper halves + int16x8_t x_low = vmovl_s8(vget_low_s8(x_vec)); + int16x8_t x_high = vmovl_s8(vget_high_s8(x_vec)); + + // Convert to float + float32x4_t x_f32_low1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(x_low))); + float32x4_t x_f32_low2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(x_low))); + float32x4_t x_f32_high1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(x_high))); + float32x4_t x_f32_high2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(x_high))); + + // Multiply by scale + x_f32_low1 = vmulq_f32(x_f32_low1, scale_vec); + x_f32_low2 = vmulq_f32(x_f32_low2, scale_vec); + x_f32_high1 = vmulq_f32(x_f32_high1, scale_vec); + x_f32_high2 = vmulq_f32(x_f32_high2, scale_vec); + + // Store the result + vst1q_f32(&y[i], x_f32_low1); + vst1q_f32(&y[i + 4], x_f32_low2); + vst1q_f32(&y[i + 8], x_f32_high1); + vst1q_f32(&y[i + 12], x_f32_high2); + } + + // Handle remaining elements + for (; i < k; i++) { + y[i] = x[i] * scale; + } +} + +#else + void dequantize_row_i8(const void *__restrict vx, float *__restrict y, int k, float scale) { const int8_t *__restrict x = (int8_t *)vx; @@ -389,3 +433,46 @@ void dequantize_row_i8(const void *__restrict vx, float *__restrict y, int k, fl y[i] = x[i] * scale; } } + +#endif + +// #if defined(__ARM_NEON) + +// void quantize_round_dequantize_row_i8(const float *__restrict vx, float 
*__restrict y, int k, float scale) { +// const float32x4_t v_scale = vdupq_n_f32(scale); // Load the scale value into a NEON register +// const float32x4_t v_inv_scale = vdupq_n_f32(1.0f / scale); // Calculate the inverse scale + +// int i = 0; +// for (; i <= k - 4; i += 4) { +// // Load four floats from the input array +// float32x4_t v_x = vld1q_f32(&vx[i]); + +// // Scale and round +// float32x4_t v_scaled = vmulq_f32(v_x, v_inv_scale); +// int32x4_t v_quantized = vcvtq_s32_f32(v_scaled); + +// // Dequantize +// float32x4_t v_dequantized = vcvtq_f32_s32(v_quantized); +// float32x4_t v_y = vmulq_f32(v_dequantized, v_scale); + +// // Store the result back to the output array +// vst1q_f32(&y[i], v_y); +// } + +// // Handle any remaining elements that don't fill a full NEON register +// for (; i < k; i++) { +// y[i] = roundf(vx[i] / scale) * scale; +// } +// } + +// #else + +void quantize_round_dequantize_row_i8(const float *__restrict vx, float *__restrict y, int k, float scale) { + const float *__restrict x = (float *)vx; + + for (int i = 0; i < k; i++) { + y[i] = roundf(x[i] / scale)*scale; + } +} + +// #endif \ No newline at end of file diff --git a/src/backends/cpu/quantize/QuantizeQ8.hpp b/src/backends/cpu/quantize/QuantizeQ8.hpp index acb113c5..7e727fa4 100644 --- a/src/backends/cpu/quantize/QuantizeQ8.hpp +++ b/src/backends/cpu/quantize/QuantizeQ8.hpp @@ -38,5 +38,6 @@ void dequantize_row_q8_K(const block_q8_K *__restrict x, float *__restrict y, in // for per-tensor int8 quantize void quantize_row_i8(const float *__restrict x, void *__restrict y, int k, float scale = 1.f); void dequantize_row_i8(const void *__restrict vx, float *__restrict y, int k, float scale = 1.f); +void quantize_round_dequantize_row_i8(const float *__restrict vx, float *__restrict y, int k, float scale = 1.f); #endif // MLLM_QUANTIZEQ8_HPP diff --git a/src/backends/qnn/CMakeLists.txt b/src/backends/qnn/CMakeLists.txt index de96f668..18547ac2 100644 --- a/src/backends/qnn/CMakeLists.txt +++ b/src/backends/qnn/CMakeLists.txt @@ -38,6 +38,17 @@ file(GLOB MLLM_QNN_SRC ) +if (MLLM_OPENMP) +find_package(OpenMP REQUIRED) +if(OpenMP_FOUND) + message(STATUS "found openmp") + set(CMAKE_C_FLAGS ${CMAKE_C_FLAGS} ${OPENMP_C_FLAGS}) + set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} ${OPENMP_CXX_FLAGS}) +else() + message(FATAL_ERROR "openmp not found!") +endif() +endif() + # import android ndk cmake toolchain if (ARM) include(${ANDROID_NDK}/build/cmake/android.toolchain.cmake) @@ -51,3 +62,16 @@ add_library( OBJECT ${MLLM_QNN_SRC} ) + +if(OpenMP_FOUND) + message(STATUS "found openmp") + if(ARM AND NOT APK) + message(STATUS "[ARM] found openmp") + target_compile_options(MLLM_QNN PRIVATE -fopenmp) + target_link_libraries(MLLM_QNN PUBLIC -fopenmp -static-openmp) + else() + target_link_libraries(MLLM_QNN + PUBLIC + OpenMP::OpenMP_CXX ) + endif() +endif() \ No newline at end of file diff --git a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/config/LLaMAOpPackageHtp.xml b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/config/LLaMAOpPackageHtp.xml index b3498280..3ec42bdf 100755 --- a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/config/LLaMAOpPackageHtp.xml +++ b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/config/LLaMAOpPackageHtp.xml @@ -11,6 +11,92 @@ Confidential and Proprietary - Qualcomm Technologies, Inc. 
> + + LLaMASuperSiLU + + + fused SiLU function + + + + + + + in[0] + + input activation + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + in[1] + + input activation + + true + BACKEND_SPECIFIC + + 4D + NHWC + [N, C, H , W] + + + + + out[0] + + output activation + + true + BACKEND_SPECIFIC + + 4D + [N, C, H , W] + + + + + a_scale + true + QNN_DATATYPE_FLOAT_32 + + SCALAR + + N-1 + + + + b_scale + true + QNN_DATATYPE_FLOAT_32 + + SCALAR + + N-1 + + + + o_scale + true + QNN_DATATYPE_FLOAT_32 + + SCALAR + + N-1 + + + + HTP + + SiLU @@ -1106,6 +1192,7 @@ Confidential and Proprietary - Qualcomm Technologies, Inc. + LLaMASuperSiLU SiLU Attention RMSNorm @@ -1136,6 +1223,27 @@ Confidential and Proprietary - Qualcomm Technologies, Inc. + + + LLaMASuperSiLU + + + in[0] + QNN_DATATYPE_SFIXED_POINT_8 + + + in[1] + QNN_DATATYPE_SFIXED_POINT_8 + + + + + + out[0] + QNN_DATATYPE_SFIXED_POINT_8 + + + LLaMAReLU @@ -1261,6 +1369,7 @@ Confidential and Proprietary - Qualcomm Technologies, Inc. out[0] QNN_DATATYPE_FLOAT_16 QNN_DATATYPE_FLOAT_32 + QNN_DATATYPE_SFIXED_POINT_8 diff --git a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/LLaMAPackageInterface.cpp b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/LLaMAPackageInterface.cpp index 7d4d1c4c..36a91ddd 100755 --- a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/LLaMAPackageInterface.cpp +++ b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/LLaMAPackageInterface.cpp @@ -19,30 +19,31 @@ BEGIN_PKG_OPS_OPTS_LIST() * registered to the HTP Core. * Append the latest OpName at the bottom */ -DECLARE_PKG_OPS_OPTS_LIST(PKG_SplitInput) -DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAAdd) -DECLARE_PKG_OPS_OPTS_LIST(PKG_Attention) -DECLARE_PKG_OPS_OPTS_LIST(PKG_QLayerNorm) -DECLARE_PKG_OPS_OPTS_LIST(PKG_CausalMask) -DECLARE_PKG_OPS_OPTS_LIST(PKG_RMSNorm) DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMALinear) -DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMADequantize) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMASuperSiLU) +DECLARE_PKG_OPS_OPTS_LIST(PKG_KVCache) +DECLARE_PKG_OPS_OPTS_LIST(PKG_QLayerNorm) DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAQuantize) -DECLARE_PKG_OPS_OPTS_LIST(PKG_RoPE) +DECLARE_PKG_OPS_OPTS_LIST(PKG_WNop) DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAMul) -DECLARE_PKG_OPS_OPTS_LIST(PKG_KVCache) +DECLARE_PKG_OPS_OPTS_LIST(PKG_Attention) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMADequantize) +DECLARE_PKG_OPS_OPTS_LIST(PKG_CausalMask) DECLARE_PKG_OPS_OPTS_LIST(PKG_HeadMatmul) +DECLARE_PKG_OPS_OPTS_LIST(PKG_SplitInput) DECLARE_PKG_OPS_OPTS_LIST(PKG_SiLU) -DECLARE_PKG_OPS_OPTS_LIST(PKG_WNop) DECLARE_PKG_OPS_OPTS_LIST(PKG_MergeOutput) DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAReLU) +DECLARE_PKG_OPS_OPTS_LIST(PKG_RMSNorm) +DECLARE_PKG_OPS_OPTS_LIST(PKG_RoPE) +DECLARE_PKG_OPS_OPTS_LIST(PKG_LLaMAAdd) END_PKG_OPS_OPTS_LIST() // op package info static constexpr auto sg_packageName = THIS_PKG_NAME_STR; // package name passed in as compile flag -static std::array sg_opNames{{"SplitInput", "LLaMAAdd", "Attention", "QLayerNorm", "CausalMask", "RMSNorm", "LLaMALinear", "LLaMADequantize", "LLaMAQuantize", "RoPE", "LLaMAMul", "KVCache", "HeadMatmul", "SiLU", "WNop", "MergeOutput", "LLaMAReLU"}}; +static std::array sg_opNames{{"LLaMALinear", "LLaMASuperSiLU", "KVCache", "QLayerNorm", "LLaMAQuantize", "WNop", "LLaMAMul", "Attention", "LLaMADequantize", "CausalMask", "HeadMatmul", "SplitInput", "SiLU", "MergeOutput", "LLaMAReLU", "RMSNorm", "RoPE", "LLaMAAdd"}}; static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT; static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; @@ 
-226,18 +227,18 @@ Qnn_ErrorHandle_t LLaMAPackageValidateOpConfig (Qnn_OpConfig_t opConfig){ * Check if op config type matches any registered ops * If a match is found, check number of inputs, outputs and params */ - if (std::string(opConfig.v1.typeName) == "SplitInput"){ - if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 2){ + if (std::string(opConfig.v1.typeName) == "LLaMALinear"){ + if (opConfig.v1.numOfParams != 4 || opConfig.v1.numOfInputs != 3 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "LLaMAAdd"){ - if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "LLaMASuperSiLU"){ + if (opConfig.v1.numOfParams != 3 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "Attention"){ - if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 5 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "KVCache"){ + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } @@ -246,18 +247,23 @@ Qnn_ErrorHandle_t LLaMAPackageValidateOpConfig (Qnn_OpConfig_t opConfig){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "CausalMask"){ - if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "LLaMAQuantize"){ + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 1 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "RMSNorm"){ + else if (std::string(opConfig.v1.typeName) == "WNop"){ + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 2){ + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } + else if (std::string(opConfig.v1.typeName) == "LLaMAMul"){ if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "LLaMALinear"){ - if (opConfig.v1.numOfParams != 4 || opConfig.v1.numOfInputs != 3 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "Attention"){ + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 5 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } @@ -266,48 +272,48 @@ Qnn_ErrorHandle_t LLaMAPackageValidateOpConfig (Qnn_OpConfig_t opConfig){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "LLaMAQuantize"){ - if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 1 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "CausalMask"){ + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "RoPE"){ - if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 3 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "HeadMatmul"){ + if (opConfig.v1.numOfParams != 2 || 
opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "LLaMAMul"){ - if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "SplitInput"){ + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 2){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "KVCache"){ - if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "SiLU"){ + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "HeadMatmul"){ - if (opConfig.v1.numOfParams != 2 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "MergeOutput"){ + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 4 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "SiLU"){ + else if (std::string(opConfig.v1.typeName) == "LLaMAReLU"){ if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "WNop"){ - if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 2){ + else if (std::string(opConfig.v1.typeName) == "RMSNorm"){ + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "MergeOutput"){ - if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 4 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "RoPE"){ + if (opConfig.v1.numOfParams != 1 || opConfig.v1.numOfInputs != 3 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } - else if (std::string(opConfig.v1.typeName) == "LLaMAReLU"){ - if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || opConfig.v1.numOfOutputs != 1){ + else if (std::string(opConfig.v1.typeName) == "LLaMAAdd"){ + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 2 || opConfig.v1.numOfOutputs != 1){ return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; } } diff --git a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAQuantize.cpp b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAQuantize.cpp index 0d48dc9a..23b357b5 100755 --- a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAQuantize.cpp +++ b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMAQuantize.cpp @@ -648,6 +648,8 @@ int32_t qhmath_hvx_quantize_ahf_int8( #define FLOAT_MANTISA_MASK 0x007fffff #define FLOAT_SIGN 31 #define FLOAT_NEG_1 0xBF800000 +#define ROUND_2_SCALE 22 +#define ROUND_SCALSE ((1 << ROUND_2_SCALE) * 1.0f) int32_t qhmath_hvx_quantize_af( float *restrict input, @@ -671,7 +673,7 @@ int32_t qhmath_hvx_quantize_af( HVX_Vector sline4p, sline4c, sline4; HVX_Vector sout1, sout2, sout3, sout4; - HVX_Vector low_level_vec, high_level_vec, scale_vec, es_vec; + HVX_Vector low_level_vec, high_level_vec, scale_vec, es_vec, 
round_scale_vec; int32_t block, l2fetch_block; // int32_t leftover = size & 31; int32_t vectors_in_rounddown = size / 32; @@ -687,6 +689,7 @@ int32_t qhmath_hvx_quantize_af( high_level_vec = Q6_V_vsplat_R(float_to_bits(high_level)); scale_vec = Q6_V_vsplat_R(float_to_bits(scale)); es_vec = Q6_V_vsplat_R(float_to_bits(es)); + round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); HVX_Vector zero_v_sf = Q6_V_vzero(); es_vec = Q6_Vqf32_vadd_VsfVsf(es_vec, zero_v_sf); @@ -694,13 +697,13 @@ int32_t qhmath_hvx_quantize_af( HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); - HVX_Vector expmask = Q6_V_vsplat_R(FLOAT_EXPONENT_MASK); - HVX_Vector expbias = Q6_V_vsplat_R(FLOAT_EXPONENT_BIAS); - HVX_Vector manmask = Q6_V_vsplat_R(FLOAT_MANTISA_MASK); - HVX_Vector exp23 = Q6_V_vsplat_R(23 - 1); - HVX_Vector exp0 = Q6_V_vsplat_R(0 - 1); - HVX_Vector negone = Q6_V_vsplat_R(FLOAT_NEG_1); - HVX_Vector zero = Q6_V_vzero(); + // HVX_Vector expmask = Q6_V_vsplat_R(FLOAT_EXPONENT_MASK); + // HVX_Vector expbias = Q6_V_vsplat_R(FLOAT_EXPONENT_BIAS); + // HVX_Vector manmask = Q6_V_vsplat_R(FLOAT_MANTISA_MASK); + // HVX_Vector exp23 = Q6_V_vsplat_R(23 - 1); + // HVX_Vector exp0 = Q6_V_vsplat_R(0 - 1); + // HVX_Vector negone = Q6_V_vsplat_R(FLOAT_NEG_1); + // HVX_Vector zero = Q6_V_vzero(); for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) { @@ -722,42 +725,45 @@ int32_t qhmath_hvx_quantize_af( sout1 = Q6_Vsf_equals_Vqf32(sout1); sout1 = Q6_Vsf_vmin_VsfVsf(sout1, high_level_vec); sout1 = Q6_Vsf_vmax_VsfVsf(sout1, low_level_vec); + sout1 = Q6_Vqf32_vmpy_VsfVsf(sout1, round_scale_vec); + sout1 = Q6_Vsf_equals_Vqf32(sout1); - { - HVX_Vector exp = Q6_Vw_vasr_VwR(sout1, FLOAT_MANTISA); - exp = Q6_V_vand_VV(exp, expmask); - exp = Q6_Vw_vsub_VwVw(exp, expbias); + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout1, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); - HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); - HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout1, man); - HVX_Vector sign = Q6_Vw_vasr_VwR(sout1, FLOAT_SIGN); - HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout1, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); - HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); - HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); - HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); - HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout1, man); - man = Q6_V_vnot_V(man); - HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); - exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); - HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout1, 1); - HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout1, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout1, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); - // exp >= 0 - HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); - tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, 
exppos_signneg); + // tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); - // exp < 0 (-1, 1) - HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); - tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); - tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); - sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); - } + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + // } sout1 = Q6_Vw_equals_Vsf(sout1); + sout1 = Q6_Vw_vasr_VwR(sout1, ROUND_2_SCALE); // sout1 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout1, Q6_V_vzero()), 0); sline2c = *iptr++; @@ -768,42 +774,45 @@ int32_t qhmath_hvx_quantize_af( sout2 = Q6_Vsf_equals_Vqf32(sout2); sout2 = Q6_Vsf_vmin_VsfVsf(sout2, high_level_vec); sout2 = Q6_Vsf_vmax_VsfVsf(sout2, low_level_vec); + sout2 = Q6_Vqf32_vmpy_VsfVsf(sout2, round_scale_vec); + sout2 = Q6_Vsf_equals_Vqf32(sout2); - { - HVX_Vector exp = Q6_Vw_vasr_VwR(sout2, FLOAT_MANTISA); - exp = Q6_V_vand_VV(exp, expmask); - exp = Q6_Vw_vsub_VwVw(exp, expbias); + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout2, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); - HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); - HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout2, man); - HVX_Vector sign = Q6_Vw_vasr_VwR(sout2, FLOAT_SIGN); - HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout2, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); - HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); - HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); - HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); - HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout2, man); - man = Q6_V_vnot_V(man); - HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); - exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); - HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout2, 1); - HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout2, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout2, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); - // exp >= 0 - HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); - tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); - // exp < 0 (-1, 1) - HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); - tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); - tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); - sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); - } + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + // } sout2 = Q6_Vw_equals_Vsf(sout2); + sout2 = 
Q6_Vw_vasr_VwR(sout2, ROUND_2_SCALE); // sout2 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout2, Q6_V_vzero()), 0); sline3c = *iptr++; @@ -814,43 +823,46 @@ int32_t qhmath_hvx_quantize_af( sout3 = Q6_Vsf_equals_Vqf32(sout3); sout3 = Q6_Vsf_vmin_VsfVsf(sout3, high_level_vec); sout3 = Q6_Vsf_vmax_VsfVsf(sout3, low_level_vec); + sout3 = Q6_Vqf32_vmpy_VsfVsf(sout3, round_scale_vec); + sout3 = Q6_Vsf_equals_Vqf32(sout3); - { - HVX_Vector exp = Q6_Vw_vasr_VwR(sout3, FLOAT_MANTISA); - exp = Q6_V_vand_VV(exp, expmask); - exp = Q6_Vw_vsub_VwVw(exp, expbias); + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout3, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); - HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); - HVX_Vector manzero = Q6_V_vand_VV(sout3, man); + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout3, man); - HVX_Vector sign = Q6_Vw_vasr_VwR(sout3, FLOAT_SIGN); - HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout3, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); - HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); - HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); - HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); - HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout3, man); - man = Q6_V_vnot_V(man); - HVX_Vector exppos_signpos = Q6_V_vand_VV(sout3, man); - exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); - HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout3, 1); - HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout3, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout3, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout3, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); - // exp >= 0 - HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); - tsout1 = Q6_V_vmux_QVV(maneqzero, sout3, tsout1); + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // tsout1 = Q6_V_vmux_QVV(maneqzero, sout3, tsout1); - // exp < 0 (-1, 1) - HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout3, negone); - tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout3, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); - tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); - sout3 = Q6_V_vmux_QVV(expgte23, sout3, tsout1); - } + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout3 = Q6_V_vmux_QVV(expgte23, sout3, tsout1); + // } sout3 = Q6_Vw_equals_Vsf(sout3); + sout3 = Q6_Vw_vasr_VwR(sout3, ROUND_2_SCALE); // sout3 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout3, Q6_V_vzero()), 0); sline4c = *iptr++; @@ -861,42 +873,46 @@ int32_t qhmath_hvx_quantize_af( sout4 = Q6_Vsf_equals_Vqf32(sout4); sout4 = Q6_Vsf_vmin_VsfVsf(sout4, high_level_vec); sout4 = Q6_Vsf_vmax_VsfVsf(sout4, low_level_vec); + sout4 = Q6_Vqf32_vmpy_VsfVsf(sout4, round_scale_vec); + sout4 = Q6_Vsf_equals_Vqf32(sout4); + - { - HVX_Vector exp = Q6_Vw_vasr_VwR(sout4, FLOAT_MANTISA); - exp = Q6_V_vand_VV(exp, expmask); - exp = Q6_Vw_vsub_VwVw(exp, expbias); + // { + // HVX_Vector exp = 
Q6_Vw_vasr_VwR(sout4, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); - HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); - HVX_Vector manzero = Q6_V_vand_VV(sout4, man); + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout4, man); - HVX_Vector sign = Q6_Vw_vasr_VwR(sout4, FLOAT_SIGN); - HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout4, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); - HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); - HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); - HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); - HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout4, man); - man = Q6_V_vnot_V(man); - HVX_Vector exppos_signpos = Q6_V_vand_VV(sout4, man); - exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); - HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout4, 1); - HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout4, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout4, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout4, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); - // exp >= 0 - HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); - tsout1 = Q6_V_vmux_QVV(maneqzero, sout4, tsout1); + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // tsout1 = Q6_V_vmux_QVV(maneqzero, sout4, tsout1); - // exp < 0 (-1, 1) - HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout4, negone); - tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout4, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); - tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); - sout4 = Q6_V_vmux_QVV(expgte23, sout4, tsout1); - } + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout4 = Q6_V_vmux_QVV(expgte23, sout4, tsout1); + // } sout4 = Q6_Vw_equals_Vsf(sout4); + sout4 = Q6_Vw_vasr_VwR(sout4, ROUND_2_SCALE); // sout4 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout4, Q6_V_vzero()), 0); diff --git a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp new file mode 100755 index 00000000..0a849ca1 --- /dev/null +++ b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/LLaMASuperSiLU.cpp @@ -0,0 +1,1171 @@ +//============================================================================== +// Auto Generated Code for LLaMAPackage +//============================================================================== + +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "QnnOpPackage.h" +#include "HTP/core/simple_reg.h" + + +BEGIN_PKG_OP_DEFINITION(PKG_LLaMASuperSiLU); + + +// op execute function declarations +template +GraphStatus llamasupersiluImpl(TensorType& out_0, + const TensorType& in_0, + const TensorType& in_1, + const PlainFloatTensor& a_scale, + const PlainFloatTensor& b_scale, + const PlainFloatTensor& o_scale); + +// forward 
declaration of sample cost function +static float llamasupersiluCostFunc(const Op *op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default flag (Flags::RESOURCE_HVX) + * syntax: DEF_PACKAGE_OP(F,OP) + * e.g. DEF_PACKAGE_OP((llamasupersiluImpl), "LLaMASuperSiLU") + */ +DEF_PACKAGE_OP((llamasupersiluImpl), "LLaMASuperSiLU") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, FAST, FREE) + * and provided flags + * syntax: DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) + * can use zero or more flags, FLAG options are IS_CONST, INHIBIT_CONST_PROP, + * RESOURCE_HVX, RESOURCE_HMX(not supported in external op packages) + * e.g. DEF_PACKAGE_OP_AND_COST_AND_FLAGS((llamasupersiluImpl), "LLaMASuperSiLU", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((llamasupersiluImpl), + * "LLaMASuperSiLU", llamasupersiluCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op execution functions + * if an op does not have a parameter order definition, parameter order passed into Qnn_addNode + * will be passed into op execution functions + * if an op has a parameter order definition, any parameter passed into Qnn_addNode with unlisted + * name will be abandoned + * if two or more op packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at Qnn_addNode + * DEFAULT is used when MANDATORY is false + * if provided as Qnn_Param_t*, + * DEFAULT will be used for graph construction when this parameter is not provided at + * Qnn_addNode + * if provided as nullptr, + * graph construction will skip this parameter when this parameter is not provided at + * Qnn_addNode + */ +DEF_PACKAGE_PARAM_ORDER("LLaMASuperSiLU", + "a_scale", + true, + nullptr, + "b_scale", + true, + nullptr, + "o_scale", + true, + nullptr) + + +/* execute functions for ops */ + +#ifndef REFERENCE_OP + +#include "qhmath_hvx.h" +#include "hvx_internal.h" +#include +#include + +#define BLOCK_SIZE (8*1024/VLEN) /* vector chunks */ +#define L2FETCH_AHEAD (BLOCK_SIZE) + +#define FP16_MANTISA 10 +#define FP16_EXPONENT_MASK 0x1f +#define FP16_EXPONENT_BIAS 0xf +#define FP16_MANTISA_MASK 0x000003ff +#define FP16_SIGN 15 +#define FP16_NEG_1 0xbc00 +#define ROUND_2_SCALE 22 +#define ROUND_SCALSE ((1 << ROUND_2_SCALE) * 1.0f) + +static inline int32_t float_to_fp16s(float input) +{ + union { + int32_t i; + __fp16 f[2]; + } fp32 = {.f = {(__fp16)input, (__fp16)input}}; + return fp32.i; +} + +static HVX_INLINE_ALWAYS uint32_t float_to_bits(float x) +{ + union { float f; uint32_t i; } fp32 = { .f = x }; + return fp32.i; +} + + +static const float fp16_c0_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +0.13239719960243818,0.2216255210749415,0.3447664743728659,0.48137452032585476,0.5716299228719798,0.5547323231605259,0.5046287748870234,0.4999985574626892, +0.5000036514755082,0.49475652448004626,0.4441393352532763,0.428500379952032,0.5173297285470642,0.6541461039833616,0.7783931007462818,0.8678015179911097, +}; +static const float fp16_c1_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +0.05928005756790343,0.11063222460270064,0.1932879057003057,0.30302440212086995,0.3922924462181049,0.36546332659415875,0.2644148210990377,0.24989020912329707, +0.2498532691910313,0.2661055781198988,0.36728015359480604,0.39215270010450015,0.3041825601732039,0.1940762094668647,0.11061794856987572,0.059174800917353595, +}; +static const float fp16_c2_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +0.010145494303219278,0.02123968384425681,0.04207468332514667,0.07519946712591977,0.10840620196267145,0.09270738184406795,0.015322371881818012,-0.0009948273994921822, +0.0011544907060402412,-0.017040517565094934,-0.09379878876657094,-0.10835043868732394,-0.07558705272699548,-0.04228875316413285,-0.021235740718738055,-0.010124599879590107, +}; +static const float fp16_c3_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, +0.0007841223015974933,0.001850453397354219,0.004187899308371771,0.008640952434084206,0.01414741414964877,0.010117749275618,-0.01654848996354919,-0.02395108399453624, +-0.024199111971064446,-0.015783556879607072,0.010407672131558174,0.014137608186323335,0.008698510795258909,0.004213708431213342,0.0018499827774393985,0.0007822799742289481, +}; +static const float fp16_c4_coeffs[32] __attribute__((aligned(VLEN))) = +{ + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, +2.3031641204975905e-05,6.150442488966733e-05,0.00015997783736818624,0.00038491646239693526,0.0007283649599237781,0.00034439150914392054,-0.003142246198646662,-0.004120389580321761, +0.004246050162553198,0.0030162727520777893,-0.00037312974308425725,-0.0007277242855014247,-0.00038811687679772674,-0.0001611434776868886,-6.14837984586862e-05,-2.297076123375133e-05, +}; + +int32_t hvx_supersilu_ahf( + uint8_t *restrict input, + uint8_t *restrict input2, + uint8_t *restrict output, + float a_scale, + float b_scale, + float o_scale, + uint32_t size) +{ + if ((input == NULL) || (output == NULL) || (size == 0)) + { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)input2; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + + int32_t block, l2fetch_block; + int32_t leftover = size & 128; + int32_t vectors_in_rounddown = size / 128; + // int32_t leftover_size = leftover * sizeof(__fp16); + + sline1p = *iptr++; + sline2p = *iptr2++; + + + // dequantize + uint32_t convert = 0x00800080; + HVX_Vector convert_vector = Q6_V_vsplat_R(convert); + + + HVX_Vector a_scale_vec = Q6_V_vsplat_R(float_to_fp16s(a_scale)); + HVX_Vector b_scale_vec = Q6_V_vsplat_R(float_to_fp16s(b_scale)); + HVX_Vector zero_v_sf = Q6_V_vzero(); + + + //silu + HVX_Vector input_min_v_hf; + HVX_Vector input_shifted_v_hf; + HVX_Vector input_scaled_v; + HVX_VectorPair input_vp_qf32; + // HVX_Vector input_v_qf16; + HVX_Vector mask_idx1_v, mask_idx2_v; + HVX_Vector const16_0_v_hf; + HVX_Vector zero_v_hf, one_v_hf; + HVX_Vector tmp_v; + HVX_Vector idx1_v, idx2_v; + HVX_Vector scale_v; + HVX_DV output_dv; + HVX_DV c0_coeff_dv; + HVX_VectorPair c0_coeff_vp; + HVX_Vector c0_coeff_v; + HVX_DV c1_coeff_dv; + HVX_VectorPair c1_coeff_vp; + HVX_Vector c1_coeff_v; + HVX_DV c2_coeff_dv; + HVX_VectorPair c2_coeff_vp; + HVX_Vector c2_coeff_v; + HVX_DV c3_coeff_dv; + HVX_VectorPair c3_coeff_vp; + HVX_Vector c3_coeff_v; + HVX_DV c4_coeff_dv; + HVX_VectorPair c4_coeff_vp; + HVX_Vector c4_coeff_v; + + scale_v = Q6_Vh_vsplat_R(0x3bfe); + + /* Vector of ones used as mpy neutral element in conversions from hf vector to qf32 vector pair */ + one_v_hf = Q6_Vh_vsplat_R(0x3c00); + + /* + * Vector of zeroes used as neutral element in hf to qf16 conversions. + * NOTE: Some of conversions (i.e conversion of scale factor and coefficients) + * can be avoided in real-time, but this is not done in order to don't + * sacrify code readibility in expense of insignificant performance improvement. + */ + zero_v_hf = Q6_V_vzero(); + + /* Mask for extracting only 4 bits of mantissa */ + mask_idx1_v = Q6_Vh_vsplat_R(0x000F); + + mask_idx2_v = Q6_V_vsplat_R(0x00001010); + + /* 16.0 in IEEE 16-bit floating-point representation */ + const16_0_v_hf = Q6_Vh_vsplat_R(0x4c00); + + /* + * Prepare vector of input_min values, that is used later in shifting input range. + * input_min is low boundary of specified input range. 
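+     * (For reference: 0xc800 below is -8.0 in IEEE fp16, and the 0x3bfe scale above is
+     * roughly 0.999, so the shifted range [0, 16] appears to map onto the 16 width-1
+     * segments of the degree-4 sigmoid polynomial; the lower 16 entries of fp16_c0..c4
+     * look like padding so that the VLUT16 index, once OR-ed with 16, always lands in
+     * the populated upper half of each table.)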
+ */ + input_min_v_hf = Q6_Vh_vsplat_R(0xc800); + + /* Convert scale factor from hf to q16. Use the same vector for both formats */ + scale_v = Q6_Vqf16_vadd_VhfVhf(scale_v, zero_v_hf); + + /* Load coefficients */ + c0_coeff_v = *((HVX_Vector *)(fp16_c0_coeffs)); + c1_coeff_v = *((HVX_Vector *)(fp16_c1_coeffs)); + c2_coeff_v = *((HVX_Vector *)(fp16_c2_coeffs)); + c3_coeff_v = *((HVX_Vector *)(fp16_c3_coeffs)); + c4_coeff_v = *((HVX_Vector *)(fp16_c4_coeffs)); + + /* Convert coefficients from hf to qf32 format. Use the same vector for both representations */ + c0_coeff_v = Q6_Vqf32_vadd_VsfVsf(c0_coeff_v, zero_v_hf); + c1_coeff_v = Q6_Vqf32_vadd_VsfVsf(c1_coeff_v, zero_v_hf); + c2_coeff_v = Q6_Vqf32_vadd_VsfVsf(c2_coeff_v, zero_v_hf); + c3_coeff_v = Q6_Vqf32_vadd_VsfVsf(c3_coeff_v, zero_v_hf); + c4_coeff_v = Q6_Vqf32_vadd_VsfVsf(c4_coeff_v, zero_v_hf); + + /* Split 32-bit coefficients to lower and upper part in order to obtain them later with VLUT16. */ + c0_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c0_coeff_v); + c1_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c1_coeff_v); + c2_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c2_coeff_v); + c3_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c3_coeff_v); + c4_coeff_dv.VV = Q6_Wuw_vzxt_Vuh(c4_coeff_v); + + + // quantize + HVX_Vector low_level_vec, high_level_vec, o_scale_vec, es_vec, round_scale_vec; + HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); + HVX_Vector vmb = Q6_V_vsplat_R(0x40004000); + + + float post_scale_flt = a_scale * b_scale * o_scale; + int scexp = flt_getexp( post_scale_flt); + int rsh = min_i32( -scexp,7); // e.g. 0.11 -> 0.88, rsh = 3 + float rsh_fac = flt_power2(rsh); + + int adj_bias = roundf_i32(128 * rsh_fac); + adj_bias = Q6_R_combine_RlRl( adj_bias, adj_bias); + + HVX_Vector vadj = Q6_V_vsplat_R(adj_bias); + + float es = 0.5; + low_level_vec = Q6_V_vsplat_R(float_to_fp16s(-128.0f)); + high_level_vec = Q6_V_vsplat_R(float_to_fp16s(127.0f)); + o_scale_vec = Q6_V_vsplat_R(float_to_fp16s(post_scale_flt * rsh_fac * (1<<15))); + // one_vec = Q6_V_vsplat_R(float_to_fp16s(1.0f)); + // o_scale_vec = Q6_Vqf16_vadd_VhfVhf(o_scale_vec, zero_v_hf); + es_vec = Q6_V_vsplat_R(float_to_fp16s(es)); + round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + es_vec = Q6_Vqf16_vadd_VhfVhf(es_vec, zero_v_sf); + round_scale_vec = Q6_Vqf32_vadd_VsfVsf(round_scale_vec, zero_v_sf); + + HVX_Vector expmask = Q6_Vh_vsplat_R(FP16_EXPONENT_MASK); + HVX_Vector expbias = Q6_Vh_vsplat_R(FP16_EXPONENT_BIAS); + HVX_Vector manmask = Q6_Vh_vsplat_R(FP16_MANTISA_MASK); + HVX_Vector exp23 = Q6_Vh_vsplat_R(23 - 1); + HVX_Vector exp0 = Q6_Vh_vsplat_R(0 - 1); + HVX_Vector negone = Q6_Vh_vsplat_R(FP16_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) + { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) + { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) + { + sline1c = *iptr++; + sline2c = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + + HVX_Vector sline1_high; + HVX_Vector sline1_low; + // HVX_Vector sline2_high; + // HVX_Vector sline2_low; + + { + // dequantize sline1 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline1, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), 
convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline1_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), a_scale_vec); + sline1_low = Q6_Vhf_equals_Vqf16(sline1_low); + sline1_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), a_scale_vec); + sline1_high = Q6_Vhf_equals_Vqf16(sline1_high); + } + + + // { + // // dequantize sline2 qf16 + // HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline2, zero_v_sf); + + // temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + // HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + // HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + // sline2_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), b_scale_vec); + // sline2_low = Q6_Vhf_equals_Vqf16(sline2_low); + // sline2_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), b_scale_vec); + // sline2_high = Q6_Vhf_equals_Vqf16(sline2_high); + // } + + { + // silu sline1_low + tmp_v = Q6_Vh_vdeal_Vh(sline1_low); + + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. + * Use the same input_scaled_v vector for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_low, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, 
Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + // output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + // output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_low = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + + { + // silu sline1_high + tmp_v = Q6_Vh_vdeal_Vh(sline1_high); + + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
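+             * For example, a raw input of 1.0 is shifted to 9.0, scaled to about 8.99,
+             * and lands near 24.99 after adding 16; the top 4 mantissa bits of that fp16
+             * value select segment 8, whose coefficients evaluate (via Horner on the
+             * original input) to roughly 0.731, i.e. sigmoid(1.0).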
+ * Use the same input_scaled_v vector for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_high, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + // output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + // output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_high = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + + HVX_Vector sline_high; + HVX_Vector sline_low; + + // { + // // mul + // sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, sline2_high); + // sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, sline2_low); + + 
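+            // Illustrative scalar sketch (the helper name below is made up, not part of
+            // the op package) of the zero-point identity the "uint8 mul" block below
+            // relies on:
+            //   (a - 128) * (b - 128) = a*b - 128*(a + b) + 128*128
+            // prod1 accumulates a*b on top of vmb (0x4000 = 128*128 per 16-bit lane) and
+            // prod2 is 128*(a + b), so their difference is the signed product:
+            //
+            //   static inline int16_t signed_mul_u8(uint8_t a, uint8_t b) {
+            //       int32_t prod1 = (int32_t)a * b + 128 * 128; // Q6_Wuh_vmpyacc_WuhVubVub
+            //       int32_t prod2 = 128 * ((int32_t)a + b);     // Q6_Wh_vmpa_WubRub, 0x80
+            //       return (int16_t)(prod1 - prod2);            // == (a - 128) * (b - 128)
+            //   }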
// sline_high = Q6_Vhf_equals_Vqf16(sline_high); + // sline_low = Q6_Vhf_equals_Vqf16(sline_low); + // } + + HVX_VectorPair mul_output; + { + // uint8 mul + // (a-128)*(b-128) = a*b - 128 (a+b) + 128*128 + HVX_VectorPair prod1 = Q6_Wuh_vmpyacc_WuhVubVub(Q6_W_vcombine_VV(vmb,vmb), sline1, sline2); + HVX_VectorPair prod2 = Q6_Wh_vmpa_WubRub( Q6_W_vcombine_VV(sline2, sline1), 0x80808080); + mul_output = Q6_Wh_vsub_WhWh(prod1, prod2); + + mul_output = Q6_W_vshuff_VVR(Q6_V_hi_W(mul_output), Q6_V_lo_W(mul_output), -2); + + // sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, Q6_Vhf_equals_Vh(Q6_V_lo_W(mul_output))); + // sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, Q6_Vhf_equals_Vh(Q6_V_hi_W(mul_output))); + + } + + { + // scaling quantize + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, o_scale_vec); + sline_low = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_low)); + sline_low = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vmpy_VhVh_s1_rnd_sat(Q6_V_lo_W(mul_output), sline_low), vadj); + + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, o_scale_vec); + sline_high = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_high)); + sline_high = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vmpy_VhVh_s1_rnd_sat(sline_high, Q6_V_hi_W(mul_output)), vadj); + + HVX_Vector sout = Q6_Vub_vasr_VhVhR_rnd_sat( sline_high, sline_low, rsh); + sout = Q6_Vb_vdeal_Vb(sout); + *optr++ = sout; + } + + // { + // // quantize + // HVX_Vector sout1 = Q6_Vqf16_vmpy_Vqf16Vhf(sline_low, o_scale_vec); + // sout1 = Q6_Vqf16_vadd_Vqf16Vqf16(sout1, es_vec); + // sout1 = Q6_Vhf_equals_Vqf16(sout1); + // sout1 = Q6_Vhf_vmin_VhfVhf(sout1, high_level_vec); + // sout1 = Q6_Vhf_vmax_VhfVhf(sout1, low_level_vec); + // HVX_VectorPair sout1_pair = Q6_Wqf32_vmpy_VhfVhf(sout1, one_vec); + // HVX_Vector sout1_low = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(sout1_pair), round_scale_vec)); + // HVX_Vector sout1_high = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(sout1_pair), round_scale_vec)); + + // sout1_pair = Q6_W_vshuff_VVR(sout1_high, sout1_low, -4); + // sout1_low = Q6_V_lo_W(sout1_pair); + // sout1_high = Q6_V_hi_W(sout1_pair); + + + // // { + // // HVX_Vector exp = Q6_Vh_vasr_VhR(sout1, FP16_MANTISA); + // // exp = Q6_V_vand_VV(exp, expmask); + // // exp = Q6_Vh_vsub_VhVh(exp, expbias); + + // // HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + // // HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + // // HVX_Vector sign = Q6_Vh_vasr_VhR(sout1, FP16_SIGN); + // // HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + // // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + // // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + // // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + // // HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout1, man); + // // man = Q6_V_vnot_V(man); + // // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + // // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // // HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout1, 1); + // // HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // // // exp >= 0 + // // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // // tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // // // exp < 0 (-1, 1) + // // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + // // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // // sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + // // } + + // sout1_low = Q6_Vw_equals_Vsf(sout1_low); + // sout1_low = Q6_Vw_vasr_VwR(sout1_low, 
ROUND_2_SCALE); + // sout1_high = Q6_Vw_equals_Vsf(sout1_high); + // sout1_high = Q6_Vw_vasr_VwR(sout1_high, ROUND_2_SCALE); + + + // HVX_Vector sout2 = Q6_Vqf16_vmpy_Vqf16Vhf(sline_high, o_scale_vec); + // sout2 = Q6_Vqf16_vadd_Vqf16Vqf16(sout2, es_vec); + // sout2 = Q6_Vhf_equals_Vqf16(sout2); + // sout2 = Q6_Vhf_vmin_VhfVhf(sout2, high_level_vec); + // sout2 = Q6_Vhf_vmax_VhfVhf(sout2, low_level_vec); + // HVX_VectorPair sout2_pair = Q6_Wqf32_vmpy_VhfVhf(sout2, one_vec); + // HVX_Vector sout2_low = Q6_Vsf_equals_Vqf32( Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(sout2_pair), round_scale_vec)); + // HVX_Vector sout2_high = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(sout2_pair), round_scale_vec)); + + // sout2_pair = Q6_W_vshuff_VVR(sout2_high, sout2_low, -4); + // sout2_low = Q6_V_lo_W(sout2_pair); + // sout2_high = Q6_V_hi_W(sout2_pair); + + // // { + // // HVX_Vector exp = Q6_Vh_vasr_VhR(sout2, FP16_MANTISA); + // // exp = Q6_V_vand_VV(exp, expmask); + // // exp = Q6_Vh_vsub_VhVh(exp, expbias); + + // // HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + // // HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + // // HVX_Vector sign = Q6_Vh_vasr_VhR(sout2, FP16_SIGN); + // // HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + // // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + // // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + // // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + // // HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout2, man); + // // man = Q6_V_vnot_V(man); + // // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + // // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // // HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout2, 1); + // // HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // // // exp >= 0 + // // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // // tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // // // exp < 0 (-1, 1) + // // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); + // // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // // sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + // // } + + // sout2_low = Q6_Vw_equals_Vsf(sout2_low); + // sout2_low = Q6_Vw_vasr_VwR(sout2_low, ROUND_2_SCALE); + // sout2_high = Q6_Vw_equals_Vsf(sout2_high); + // sout2_high = Q6_Vw_vasr_VwR(sout2_high, ROUND_2_SCALE); + + // HVX_Vector reql_h = Q6_Vh_vpack_VwVw_sat(sout1_high, sout1_low); + // HVX_Vector reqh_h = Q6_Vh_vpack_VwVw_sat(sout2_high, sout2_low); + // HVX_Vector req_b = Q6_Vb_vpack_VhVh_sat(reqh_h, reql_h); + + // *optr++ = Q6_Vb_vadd_VbVb(req_b, uintconvert); + // } + + + + + + sline1p = sline1c; + sline2p = sline2c; + } + } + + if (vectors_in_rounddown > 0) { + + o_scale_vec = Q6_V_vsplat_R(float_to_fp16s(o_scale)); + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t) input); + + sline2c = is_aligned(iptr2, VLEN) && leftover == 0 ? 
sline2p : *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t) input2); + + + HVX_Vector sline1_high; + HVX_Vector sline1_low; + HVX_Vector sline2_high; + HVX_Vector sline2_low; + + { + // dequantize sline1 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline1, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline1_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), a_scale_vec); + sline1_low = Q6_Vhf_equals_Vqf16(sline1_low); + sline1_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), a_scale_vec); + sline1_high = Q6_Vhf_equals_Vqf16(sline1_high); + } + + + { + // dequantize sline2 qf16 + HVX_VectorPair temp = Q6_Wh_vadd_VubVub(sline2, zero_v_sf); + + temp = Q6_W_vshuff_VVR(Q6_V_hi_W(temp), Q6_V_lo_W(temp), -2); + HVX_Vector sout1 = Q6_Vh_vsub_VhVh(Q6_V_lo_W(temp), convert_vector); + HVX_Vector sout2 = Q6_Vh_vsub_VhVh(Q6_V_hi_W(temp), convert_vector); + + sline2_low = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout1), b_scale_vec); + sline2_low = Q6_Vhf_equals_Vqf16(sline2_low); + sline2_high = Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(sout2), b_scale_vec); + sline2_high = Q6_Vhf_equals_Vqf16(sline2_high); + } + + { + // silu sline1_low + tmp_v = Q6_Vh_vdeal_Vh(sline1_low); + + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. 
+ * Use the same input_scaled_v vector for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_low, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_low = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + + { + // silu sline1_high + tmp_v = Q6_Vh_vdeal_Vh(sline1_high); + + /* Shift input range from [input_min, input_max] to [0, input_max - input_min] */ + input_shifted_v_hf = Q6_Vqf16_vsub_VhfVhf(tmp_v, 
input_min_v_hf); + + /* + * Scale shifted input range from [0, input_max - input_min] to [0,16.0) + * in order to get corresponding coefficient indexes + */ + input_scaled_v = Q6_Vqf16_vmpy_Vqf16Vqf16(input_shifted_v_hf, scale_v); + + /* + * VLUT 16 requires integer indexes. Shift scaled input range from [0,16.0) + * to [16.0,32.0) in order to convert float indexes to integer values. + * Float values, represented in IEEE 754, in range [16.0,32.0] have the + * same exponent, which means 4 MSB of mantissa carry information about + * integer index. + * Use the same input_scaled_v vector for hf and qf16 representation + */ + input_scaled_v = Q6_Vqf16_vadd_Vqf16Vhf(input_scaled_v, const16_0_v_hf); + + /* Convert back from qf16 to hf in order to extract integer index */ + tmp_v = Q6_Vhf_equals_Vqf16(input_scaled_v); + + /* Only 4 MSB bits of mantissa represent segment index */ + idx1_v = Q6_Vuh_vlsr_VuhR(tmp_v, 6); + + /* Ensure only 4 MSB bits of mantissa are used as indexes */ + idx1_v = Q6_V_vand_VV(idx1_v, mask_idx1_v); + + idx1_v = Q6_Vb_vshuff_Vb(idx1_v); + idx1_v = Q6_V_vor_VV(idx1_v, mask_idx2_v); + idx2_v = Q6_Vw_vasl_VwR(idx1_v, 16); + + /* Obtain the polynomial coefficients from lookup table */ + c0_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c0_coeff_dv.VV), 1); + c0_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c0_coeff_vp, idx2_v, Q6_V_hi_W(c0_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c1_coeff_dv.VV), 1); + c1_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c1_coeff_vp, idx2_v, Q6_V_hi_W(c1_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c2_coeff_dv.VV), 1); + c2_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c2_coeff_vp, idx2_v, Q6_V_hi_W(c2_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c3_coeff_dv.VV), 1); + c3_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c3_coeff_vp, idx2_v, Q6_V_hi_W(c3_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16_VbVhR(idx1_v, Q6_V_lo_W(c4_coeff_dv.VV), 1); + c4_coeff_vp = Q6_Wh_vlut16or_WhVbVhR(c4_coeff_vp, idx2_v, Q6_V_hi_W(c4_coeff_dv.VV), 1); + + /* Convert input from hf vector to qf32 vector pair for Horner's method*/ + input_vp_qf32 = Q6_Wqf32_vmpy_VhfVhf(sline1_high, one_v_hf); + + /* Perform evaluation of polynomial using Horner's method */ + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(c4_coeff_vp), Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c3_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c2_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c1_coeff_vp)); + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(input_vp_qf32)); + output_dv.V.lo = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.lo, Q6_V_lo_W(c0_coeff_vp)); + + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(c4_coeff_vp), Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c3_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c2_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c1_coeff_vp)); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(output_dv.V.hi, 
Q6_V_hi_W(input_vp_qf32)); + output_dv.V.hi = Q6_Vqf32_vadd_Vqf32Vqf32(output_dv.V.hi, Q6_V_hi_W(c0_coeff_vp)); + + // x * sigmod + output_dv.V.lo = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_lo_W(input_vp_qf32), output_dv.V.lo); + output_dv.V.hi = Q6_Vqf32_vmpy_Vqf32Vqf32(Q6_V_hi_W(input_vp_qf32), output_dv.V.hi); + + sline1_high = Q6_Vhf_equals_Wqf32(output_dv.VV); + } + + + HVX_Vector sline_high; + HVX_Vector sline_low; + + { + // mul + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline1_high, sline2_high); + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline1_low, sline2_low); + + sline_high = Q6_Vhf_equals_Vqf16(sline_high); + sline_low = Q6_Vhf_equals_Vqf16(sline_low); + } + + + { + // quantize + HVX_Vector sout1 = Q6_Vqf16_vmpy_VhfVhf(sline_low, o_scale_vec); + sout1 = Q6_Vqf16_vadd_Vqf16Vqf16(sout1, es_vec); + sout1 = Q6_Vhf_equals_Vqf16(sout1); + sout1 = Q6_Vhf_vmin_VhfVhf(sout1, high_level_vec); + sout1 = Q6_Vhf_vmax_VhfVhf(sout1, low_level_vec); + + { + HVX_Vector exp = Q6_Vh_vasr_VhR(sout1, FP16_MANTISA); + exp = Q6_V_vand_VV(exp, expmask); + exp = Q6_Vh_vsub_VhVh(exp, expbias); + + HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + HVX_Vector sign = Q6_Vh_vasr_VhR(sout1, FP16_SIGN); + HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout1, man); + man = Q6_V_vnot_V(man); + HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout1, 1); + HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // exp >= 0 + HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // exp < 0 (-1, 1) + HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + } + + sout1 = Q6_Vh_equals_Vhf(sout1); + + + HVX_Vector sout2 = Q6_Vqf16_vmpy_VhfVhf(sline_high, o_scale_vec); + sout2 = Q6_Vqf16_vadd_Vqf16Vqf16(sout2, es_vec); + sout2 = Q6_Vhf_equals_Vqf16(sout2); + sout2 = Q6_Vhf_vmin_VhfVhf(sout2, high_level_vec); + sout2 = Q6_Vhf_vmax_VhfVhf(sout2, low_level_vec); + + { + HVX_Vector exp = Q6_Vh_vasr_VhR(sout2, FP16_MANTISA); + exp = Q6_V_vand_VV(exp, expmask); + exp = Q6_Vh_vsub_VhVh(exp, expbias); + + HVX_Vector man = Q6_Vh_vasr_VhVh(manmask, exp); + HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + HVX_Vector sign = Q6_Vh_vasr_VhR(sout2, FP16_SIGN); + HVX_Vector issignpos = Q6_Q_vcmp_eq_VhVh(sign, zero); + + HVX_Vector expgte23 = Q6_Q_vcmp_gt_VhVh(exp, exp23); + HVX_Vector expgte0 = Q6_Q_vcmp_gt_VhVh(exp, exp0); + HVX_Vector maneqzero = Q6_Q_vcmp_eq_VhVh(manzero, zero); + + HVX_Vector exppos_signneg = Q6_Vh_vadd_VhVh(sout2, man); + man = Q6_V_vnot_V(man); + HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + HVX_Vector shift1 = Q6_Vh_vasl_VhR(sout2, 1); + HVX_Vector iszero = Q6_Q_vcmp_eq_VhVh(shift1, zero); + + // exp >= 0 + HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // exp < 0 (-1, 1) + HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); + tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + 
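+            // (Like the identical block for sout1 above, this mux cascade appears to
+            //  emulate floorf() directly on the fp16 bit pattern: already-integral values
+            //  pass through, in-range values get their fractional mantissa bits cleared
+            //  (with an add first so negatives step toward -inf), and |x| < 1 collapses
+            //  to 0 or -1 by sign. Together with the es = 0.5 added above this rounds to
+            //  nearest. The reworked vector kernels elsewhere in this patch replace the
+            //  same cascade with the ROUND_2_SCALE multiply / convert / shift sequence.)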
tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + } + + sout2 = Q6_Vh_equals_Vhf(sout2); + + HVX_Vector reql_h = Q6_Vb_vpack_VhVh_sat(sout2, sout1); + *optr++ = Q6_Vb_vadd_VbVb(reql_h, uintconvert); + + } + + } + + // // Handle leftover elements. + // if (leftover_size > 0) { + // sline1c = (is_in_one_chunk(iptr, leftover_size, VLEN) + // ? sline1p + // : *iptr++); + // sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + + // sline2c = (is_in_one_chunk(iptr2, leftover_size, VLEN) + // ? sline2p + // : *iptr2++); + // sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input2); + + // vstu_variable(optr, leftover_size, Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(sline1, sline2))); + // } + + return 0; +} + + +template +GraphStatus llamasupersiluImpl(TensorType& out_0, + const TensorType& in_0, + const TensorType& in_1, + const PlainFloatTensor& a_scale, + const PlainFloatTensor& b_scale, + const PlainFloatTensor& o_scale) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. + */ + out_0.set_dims(in_0); + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + size_t size = b_in*h_in*w_in*d_in; + + + float a_scale_ = a_scale(0,0,0,0); + float b_scale_ = b_scale(0,0,0,0); + float o_scale_ = o_scale(0,0,0,0); + + auto in_ptr = (uint8_t*)in_0.raw_data_const(); + auto in_ptr2 = (uint8_t*)in_1.raw_data_const(); + + auto out_ptr = (uint8_t*)out_0.raw_data(); + + + DType dtype = in_0.get_dtype(); + + if (dtype == DType::QUInt8 && out_0.get_dtype() == DType::QUInt8) { + hvx_supersilu_ahf(in_ptr, in_ptr2, out_ptr, a_scale_, b_scale_, 1.0f/o_scale_, size); + } + + return GraphStatus::Success; +} + +#else + +template +GraphStatus llamasupersiluImpl(TensorType& out_0, + const TensorType& in_0, + const TensorType& in_1, + const PlainFloatTensor& a_scale, + const PlainFloatTensor& b_scale, + const PlainFloatTensor& o_scale) + +{ + /* + * add code here + * */ + /* + * To have good performance and stability, it is required to avoid heap memory + * allocation in this function. The heap memory allocation includes but not + * limited to calling malloc, operator new, constructing STL container objects + * like std::vector with default allocator, and adding items like calling + * std::vector::push_back to STL container objects with default allocator. + * + * Please check in SDK documentation for more information. 
+ */ + + out_0.set_dims(in_0); + + float a_scale_ = a_scale(0,0,0,0); + float b_scale_ = b_scale(0,0,0,0); + float o_scale_ = o_scale(0,0,0,0); + + auto in_ptr = (uint8_t*)in_0.raw_data_const(); + auto in_ptr2 = (uint8_t*)in_1.raw_data_const(); + + auto out_ptr = (uint8_t*)out_0.raw_data(); + + + auto [b_in, h_in, w_in, d_in] = in_0.dims(); + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // mul + for (Idx d = 0; d < d_in; d++) { + + + int32_t a_inval = static_cast(*in_ptr++); + float a_inval_fp16 = (a_inval-128) * a_scale_; + + + int32_t b_inval = static_cast(*in_ptr2++); + float b_inval_fp16 = (b_inval-128) * b_scale_; + + + a_inval_fp16 = a_inval_fp16 * (1 / (1 + expf(-a_inval_fp16))); + + float inval = a_inval_fp16 * b_inval_fp16; + + long v = lroundf(inval / o_scale_); + + if (v > 127) + v = 127; + + if (v < -128) + v = -128; + + v += 128; + + *out_ptr++ = static_cast(v); + + } + } + } + } + + + return GraphStatus::Success; +} + +#endif + +__attribute__((unused)) static float llamasupersiluCostFunc(const Op *op) +{ + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + + + + + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_LLaMASuperSiLU); \ No newline at end of file diff --git a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/RMSNorm.cpp b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/RMSNorm.cpp index ddf701ff..bd079a2c 100755 --- a/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/RMSNorm.cpp +++ b/src/backends/qnn/LLaMAOpPackageHtp/LLaMAPackage/src/ops/RMSNorm.cpp @@ -222,6 +222,654 @@ int32_t hvx_rmsnorm_af( return 0; } +static HVX_INLINE_ALWAYS uint32_t float_to_bits(float x) +{ + union { float f; uint32_t i; } fp32 = { .f = x }; + return fp32.i; +} + +static inline int32_t float_to_fp16s(float input) +{ + union { + int32_t i; + __fp16 f[2]; + } fp32 = {.f = {(__fp16)input, (__fp16)input}}; + return fp32.i; +} + + +#define FLOAT_MANTISA 23 +#define FLOAT_EXPONENT_MASK 0xff +#define FLOAT_EXPONENT_BIAS 0x7f +#define FLOAT_MANTISA_MASK 0x007fffff +#define FLOAT_SIGN 31 +#define FLOAT_NEG_1 0xBF800000 +#define ROUND_2_SCALE 22 +#define ROUND_SCALSE ((1 << ROUND_2_SCALE) * 1.0f) + +int32_t hvx_rmsnorm_auint8( + float *restrict input, + float *restrict weights, + uint8_t *restrict output, + uint32_t size, + float scale) +{ + if ((input == NULL) || (output == NULL) || (size == 0)) + { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + HVX_Vector sline3p, sline3c, sline3; + HVX_Vector sline4p, sline4c, sline4; + HVX_Vector slinewp, slinewc, slinew; + + HVX_Vector sout1, sout2, sout3, sout4; + HVX_Vector low_level_vec, high_level_vec, scale_vec, es_vec, round_scale_vec; + + float low_level = -128.0f; + float high_level = 127.0f; + + float es = 0.5f; + low_level_vec = Q6_V_vsplat_R(float_to_bits(low_level)); + high_level_vec = Q6_V_vsplat_R(float_to_bits(high_level)); + scale_vec = Q6_V_vsplat_R(float_to_bits(scale)); + es_vec = Q6_V_vsplat_R(float_to_bits(es)); + round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + HVX_Vector zero_v_sf = Q6_V_vzero(); + scale_vec = Q6_Vqf32_vadd_VsfVsf(scale_vec, zero_v_sf); + es_vec = Q6_Vqf32_vadd_VsfVsf(es_vec, zero_v_sf); + + HVX_Vector uintconvert = 
Q6_V_vsplat_R(0x80808080); + + + // HVX_Vector expmask = Q6_V_vsplat_R(FLOAT_EXPONENT_MASK); + // HVX_Vector expbias = Q6_V_vsplat_R(FLOAT_EXPONENT_BIAS); + // HVX_Vector manmask = Q6_V_vsplat_R(FLOAT_MANTISA_MASK); + // HVX_Vector exp23 = Q6_V_vsplat_R(23 - 1); + // HVX_Vector exp0 = Q6_V_vsplat_R(0 - 1); + // HVX_Vector negone = Q6_V_vsplat_R(FLOAT_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + // int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + + + // ^2 sum + HVX_Vector sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) + { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) + { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) + { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t) input); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + } + + float epsilon_ = 1e-6; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) + { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vlalign_VVR(sum, zero, i)); + } + + sum = Q6_Vsf_equals_Vqf32(sum); + sum_value.f = 1.0f / sqrtf(*((float*)&sum + 31) / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + + sline1p = *iptr++; + sline2p = *iptr++; + sline3p = *iptr++; + sline4p = *iptr++; + + slinewp = *iptr2++; + + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) + { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) + { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; j+=4) + { + + { + sline1c = *iptr++; + slinewc = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, slinew); + sline1 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + sout1 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline1,scale_vec); + sout1 = Q6_Vqf32_vadd_Vqf32Vqf32(sout1, es_vec); + sout1 = Q6_Vsf_equals_Vqf32(sout1); + sout1 = Q6_Vsf_vmin_VsfVsf(sout1, high_level_vec); + sout1 = Q6_Vsf_vmax_VsfVsf(sout1, low_level_vec); + sout1 = Q6_Vqf32_vmpy_VsfVsf(sout1, round_scale_vec); + sout1 = Q6_Vsf_equals_Vqf32(sout1); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout1, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout1, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout1, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = 
Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout1, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout1, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout1, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // tsout1 = Q6_V_vmux_QVV(maneqzero, sout1, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout1, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout1 = Q6_V_vmux_QVV(expgte23, sout1, tsout1); + // } + + sout1 = Q6_Vw_equals_Vsf(sout1); + sout1 = Q6_Vw_vasr_VwR(sout1, ROUND_2_SCALE); + // sout1 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout1, Q6_V_vzero()), 0); + + { + sline2c = *iptr++; + slinewc = *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline2, slinew); + sline2 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + + sout2 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline2,scale_vec); + sout2 = Q6_Vqf32_vadd_Vqf32Vqf32(sout2, es_vec); + sout2 = Q6_Vsf_equals_Vqf32(sout2); + sout2 = Q6_Vsf_vmin_VsfVsf(sout2, high_level_vec); + sout2 = Q6_Vsf_vmax_VsfVsf(sout2, low_level_vec); + sout2 = Q6_Vqf32_vmpy_VsfVsf(sout2, round_scale_vec); + sout2 = Q6_Vsf_equals_Vqf32(sout2); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout2, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout2, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout2, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout2, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout2, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout2, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // tsout1 = Q6_V_vmux_QVV(maneqzero, sout2, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout2, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout2 = Q6_V_vmux_QVV(expgte23, sout2, tsout1); + // } + + sout2 = Q6_Vw_equals_Vsf(sout2); + sout2 = Q6_Vw_vasr_VwR(sout2, ROUND_2_SCALE); + // sout2 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout2, Q6_V_vzero()), 0); + + { + sline3c = *iptr++; + slinewc = *iptr2++; + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t) input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline3, slinew); + sline3 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = 
slinewc; + } + + + sout3 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline3,scale_vec); + sout3 = Q6_Vqf32_vadd_Vqf32Vqf32(sout3, es_vec); + sout3 = Q6_Vsf_equals_Vqf32(sout3); + sout3 = Q6_Vsf_vmin_VsfVsf(sout3, high_level_vec); + sout3 = Q6_Vsf_vmax_VsfVsf(sout3, low_level_vec); + sout3 = Q6_Vqf32_vmpy_VsfVsf(sout3, round_scale_vec); + sout3 = Q6_Vsf_equals_Vqf32(sout3); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout3, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout3, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout3, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout3, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout3, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout3, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // tsout1 = Q6_V_vmux_QVV(maneqzero, sout3, tsout1); + + // // exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout3, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout3 = Q6_V_vmux_QVV(expgte23, sout3, tsout1); + // } + + + sout3 = Q6_Vw_equals_Vsf(sout3); + sout3 = Q6_Vw_vasr_VwR(sout3, ROUND_2_SCALE); + // sout3 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout3, Q6_V_vzero()), 0); + + { + sline4c = *iptr++; + slinewc = *iptr2++; + sline4 = Q6_V_valign_VVR(sline4c, sline4p, (size_t) input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline4, slinew); + sline4 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + + sout4 = Q6_Vqf32_vmpy_Vqf32Vqf32(sline4,scale_vec); + sout4 = Q6_Vqf32_vadd_Vqf32Vqf32(sout4, es_vec); + sout4 = Q6_Vsf_equals_Vqf32(sout4); + sout4 = Q6_Vsf_vmin_VsfVsf(sout4, high_level_vec); + sout4 = Q6_Vsf_vmax_VsfVsf(sout4, low_level_vec); + sout4 = Q6_Vqf32_vmpy_VsfVsf(sout4, round_scale_vec); + sout4 = Q6_Vsf_equals_Vqf32(sout4); + + // { + // HVX_Vector exp = Q6_Vw_vasr_VwR(sout4, FLOAT_MANTISA); + // exp = Q6_V_vand_VV(exp, expmask); + // exp = Q6_Vw_vsub_VwVw(exp, expbias); + + // HVX_Vector man = Q6_Vw_vasr_VwVw(manmask, exp); + // HVX_Vector manzero = Q6_V_vand_VV(sout4, man); + + // HVX_Vector sign = Q6_Vw_vasr_VwR(sout4, FLOAT_SIGN); + // HVX_Vector issignpos = Q6_Q_vcmp_eq_VwVw(sign, zero); + + // HVX_Vector expgte23 = Q6_Q_vcmp_gt_VwVw(exp, exp23); + // HVX_Vector expgte0 = Q6_Q_vcmp_gt_VwVw(exp, exp0); + // HVX_Vector maneqzero = Q6_Q_vcmp_eq_VwVw(manzero, zero); + + // HVX_Vector exppos_signneg = Q6_Vw_vadd_VwVw(sout4, man); + // man = Q6_V_vnot_V(man); + // HVX_Vector exppos_signpos = Q6_V_vand_VV(sout4, man); + // exppos_signneg = Q6_V_vand_VV(exppos_signneg, man); + // HVX_Vector shift1 = Q6_Vw_vasl_VwR(sout4, 1); + // HVX_Vector iszero = Q6_Q_vcmp_eq_VwVw(shift1, zero); + + // // exp >= 0 + // HVX_Vector tsout1 = Q6_V_vmux_QVV(issignpos, exppos_signpos, exppos_signneg); + // tsout1 = Q6_V_vmux_QVV(maneqzero, sout4, tsout1); + + // 
// exp < 0 (-1, 1) + // HVX_Vector tsout2 = Q6_V_vmux_QVV(iszero, sout4, negone); + // tsout2 = Q6_V_vmux_QVV(issignpos, zero, tsout2); + + // tsout1 = Q6_V_vmux_QVV(expgte0, tsout1, tsout2); + // sout4 = Q6_V_vmux_QVV(expgte23, sout4, tsout1); + // } + + sout4 = Q6_Vw_equals_Vsf(sout4); + sout4 = Q6_Vw_vasr_VwR(sout4, ROUND_2_SCALE); + // sout4 = qhmath_hvx_vw_convert_vqf32_rmode(Q6_Vqf32_vadd_VsfVsf(sout4, Q6_V_vzero()), 0); + + + HVX_Vector reql_h = Q6_Vh_vpack_VwVw_sat(sout2, sout1); + HVX_Vector reqh_h = Q6_Vh_vpack_VwVw_sat(sout4, sout3); + HVX_Vector req_b = Q6_Vb_vpack_VhVh_sat(reqh_h, reql_h); + + *optr++ = Q6_Vb_vadd_VbVb(req_b, uintconvert); + + sline1p = sline1c; + sline2p = sline2c; + sline3p = sline3c; + sline4p = sline4c; + + + slinewp = slinewc; + + } + } + + return 0; +} + +int32_t hvx_rmsnorm_auint8_opt( + float *restrict input, + float *restrict weights, + uint8_t *restrict output, + uint32_t size, + float scale) +{ + if ((input == NULL) || (output == NULL) || (size == 0)) + { + return -1; + } + + HVX_Vector *iptr = (HVX_Vector *)input; + HVX_Vector *iptr2 = (HVX_Vector *)weights; + HVX_UVector *optr = (HVX_UVector *)output; + HVX_Vector sline1p, sline1c, sline1; + HVX_Vector sline2p, sline2c, sline2; + HVX_Vector sline3p, sline3c, sline3; + HVX_Vector sline4p, sline4c, sline4; + HVX_Vector slinewp, slinewc, slinew; + + // HVX_Vector sout1, sout2, sout3, sout4; + // HVX_Vector low_level_vec, high_level_vec, scale_vec, es_vec, round_scale_vec; + + // float low_level = -128.0f; + // float high_level = 127.0f; + + // float es = 0.5f; + // low_level_vec = Q6_V_vsplat_R(float_to_bits(low_level)); + // high_level_vec = Q6_V_vsplat_R(float_to_bits(high_level)); + // scale_vec = Q6_V_vsplat_R(float_to_bits(scale)); + // es_vec = Q6_V_vsplat_R(float_to_bits(es)); + // round_scale_vec = Q6_V_vsplat_R(float_to_bits(ROUND_SCALSE)); + + // HVX_Vector zero_v_sf = Q6_V_vzero(); + // scale_vec = Q6_Vqf32_vadd_VsfVsf(scale_vec, zero_v_sf); + // es_vec = Q6_Vqf32_vadd_VsfVsf(es_vec, zero_v_sf); + + // HVX_Vector uintconvert = Q6_V_vsplat_R(0x80808080); + + + // HVX_Vector expmask = Q6_V_vsplat_R(FLOAT_EXPONENT_MASK); + // HVX_Vector expbias = Q6_V_vsplat_R(FLOAT_EXPONENT_BIAS); + // HVX_Vector manmask = Q6_V_vsplat_R(FLOAT_MANTISA_MASK); + // HVX_Vector exp23 = Q6_V_vsplat_R(23 - 1); + // HVX_Vector exp0 = Q6_V_vsplat_R(0 - 1); + // HVX_Vector negone = Q6_V_vsplat_R(FLOAT_NEG_1); + HVX_Vector zero = Q6_V_vzero(); + + int32_t block, l2fetch_block; + int32_t leftover = size & 31; + int32_t vectors_in_rounddown = size / 32; + // int32_t leftover_size = leftover * sizeof(float); + + sline1p = *iptr++; + + + // ^2 sum + HVX_Vector sum = Q6_Vqf32_vadd_VsfVsf(Q6_V_vzero(), Q6_V_vzero()); + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) + { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) + { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; ++j) + { + sline1c = *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + + sline1p = sline1c; + } + } + + if (vectors_in_rounddown > 0) { + + sline1c = is_aligned(iptr, VLEN) && leftover == 0 ? 
sline1p : *iptr++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t) input); + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_Vqf32_vmpy_VsfVsf(sline1, sline1)); + + } + + float epsilon_ = 1e-6; + union { + float f; + uint32_t ui; + } sum_value; + sum_value.f = 0.0f; + + for (int32_t i = 64; i >= 4; i >>= 1) + { + sum = Q6_Vqf32_vadd_Vqf32Vqf32(sum, Q6_V_vlalign_VVR(sum, zero, i)); + } + + sum = Q6_Vsf_equals_Vqf32(sum); + sum_value.f = 1.0f / sqrtf(*((float*)&sum + 31) / size + epsilon_); + + // x * 1/rsqrt(sum) + iptr = (HVX_Vector *)input; + + sline1p = *iptr++; + sline2p = *iptr++; + sline3p = *iptr++; + sline4p = *iptr++; + + slinewp = *iptr2++; + + + HVX_Vector irsqrt_vsf = Q6_V_vsplat_R(sum_value.ui); + HVX_Vector irsqrt_vqf32 = Q6_Vqf32_vadd_VsfVsf(irsqrt_vsf, Q6_V_vzero()); + + + float post_scale_flt = scale / 64.0f; + int scexp = flt_getexp( post_scale_flt); + int rsh = min_i32( -scexp,7); // e.g. 0.11 -> 0.88, rsh = 3 + float rsh_fac = flt_power2(rsh); + + int adj_bias = roundf_i32(128 * rsh_fac); + adj_bias = Q6_R_combine_RlRl( adj_bias, adj_bias); + + + HVX_Vector zero_v_sf = Q6_V_vzero(); + float es = 0.5f; + HVX_Vector es_vec = Q6_V_vsplat_R(float_to_fp16s(es)); + es_vec = Q6_Vqf16_vadd_VhfVhf(es_vec, zero_v_sf); + + HVX_Vector vadj = Q6_V_vsplat_R(adj_bias); + HVX_Vector o_scale_vec = Q6_V_vsplat_R(float_to_fp16s(post_scale_flt * rsh_fac * (1<<15))); + + for (int32_t i = vectors_in_rounddown - 1; i > 0; i -= BLOCK_SIZE) + { + block = Q6_R_min_RR(i, BLOCK_SIZE); + l2fetch_block = Q6_R_min_RR(i - L2FETCH_AHEAD, BLOCK_SIZE); + + if (l2fetch_block > 0) + { + l2fetch(iptr + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + l2fetch(iptr2 + L2FETCH_AHEAD, VLEN, VLEN, l2fetch_block, 0); + } + + for (int32_t j = 0; j < block; j+=4) + { + + { + sline1c = *iptr++; + slinewc = *iptr2++; + sline1 = Q6_V_valign_VVR(sline1c, sline1p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline1, slinew); + sline1 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + { + sline2c = *iptr++; + slinewc = *iptr2++; + sline2 = Q6_V_valign_VVR(sline2c, sline2p, (size_t)input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline2, slinew); + sline2 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + + HVX_Vector sline_low = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(sline2, sline1)); + sline_low = Q6_Vqf16_vadd_Vqf16Vqf16(sline_low, es_vec); + sline_low = Q6_Vqf16_vmpy_VhfVhf(sline_low, o_scale_vec); + sline_low = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_low)); + sline_low = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vmpy_VhRh_s1_rnd_sat(sline_low, 0x00400040), vadj); + + sline_low = Q6_Vh_vdeal_Vh(sline_low); + + { + sline3c = *iptr++; + slinewc = *iptr2++; + sline3 = Q6_V_valign_VVR(sline3c, sline3p, (size_t) input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline3, slinew); + sline3 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + { + sline4c = *iptr++; + slinewc = *iptr2++; + sline4 = Q6_V_valign_VVR(sline4c, sline4p, (size_t) input); + slinew = Q6_V_valign_VVR(slinewc, slinewp, (size_t)weights); + + HVX_Vector middle_value_qf32 = Q6_Vqf32_vmpy_VsfVsf(sline4, slinew); + sline4 = Q6_Vqf32_vmpy_Vqf32Vqf32(middle_value_qf32, irsqrt_vqf32); + + slinewp = slinewc; + } + + + 
HVX_Vector sline_high = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(sline4, sline3)); + sline_high = Q6_Vqf16_vadd_Vqf16Vqf16(sline_high, es_vec); + sline_high = Q6_Vqf16_vmpy_VhfVhf(sline_high, o_scale_vec); + sline_high = Q6_Vh_equals_Vhf(Q6_Vhf_equals_Vqf16(sline_high)); + sline_high = Q6_Vh_vadd_VhVh_sat(Q6_Vh_vmpy_VhRh_s1_rnd_sat(sline_high, 0x00400040), vadj); + + sline_high = Q6_Vh_vdeal_Vh(sline_high); + + HVX_Vector sout = Q6_Vub_vasr_VhVhR_rnd_sat( sline_high, sline_low, rsh); + sout = Q6_Vb_vdeal_Vb(sout); + *optr++ = sout; + + sline1p = sline1c; + sline2p = sline2c; + sline3p = sline3c; + sline4p = sline4c; + + + slinewp = slinewc; + + } + } + + return 0; +} + template GraphStatus rmsnormImpl(TensorType& out_0, const TensorType& in_0, @@ -233,21 +881,49 @@ GraphStatus rmsnormImpl(TensorType& out_0, // NHWC auto in_ptr = (float*)in_0.raw_data_const(); - auto out_ptr = (float*)out_0.raw_data(); auto weights_ptr = (float*)weights.raw_data_const(); auto [b_in, h_in, w_in, d_in] = in_0.dims(); - for (Idx b = 0; b < b_in; b++) { - for (Idx h = 0; h < h_in; h++) { - for (Idx w = 0; w < w_in; w++) { - // RMS - hvx_rmsnorm_af(in_ptr, weights_ptr, out_ptr, d_in); - - in_ptr += d_in; - out_ptr += d_in; + + + DType dtype = out_0.get_dtype(); + + if (dtype == DType::Float32) { + + auto out_ptr = (float*)out_0.raw_data(); + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + hvx_rmsnorm_af(in_ptr, weights_ptr, out_ptr, d_in); + + in_ptr += d_in; + out_ptr += d_in; + } } } + + } else if (dtype == DType::QUInt8) { + + auto out_ptr = (uint8_t*)out_0.raw_data(); + float scale_ = out_0.get_interface_scale(); + + scale_ = 1.0f/scale_; + + for (Idx b = 0; b < b_in; b++) { + for (Idx h = 0; h < h_in; h++) { + for (Idx w = 0; w < w_in; w++) { + // RMS + hvx_rmsnorm_auint8(in_ptr, weights_ptr, out_ptr, d_in, scale_); + + in_ptr += d_in; + out_ptr += d_in; + } + } + } + } return GraphStatus::Success; diff --git a/src/backends/qnn/Model/QnnModel.cpp b/src/backends/qnn/Model/QnnModel.cpp index 405fa738..32adae18 100644 --- a/src/backends/qnn/Model/QnnModel.cpp +++ b/src/backends/qnn/Model/QnnModel.cpp @@ -570,6 +570,47 @@ ModelError_t getGraphInfoFromModels(QnnModel *models, return err; } +ModelError_t getSingleGraphInfoFromModel(QnnModel &model, GraphInfoPtr_t* graphInfoPtr) { + ModelError_t err = MODEL_NO_ERROR; + + *graphInfoPtr = (GraphInfo_t *)malloc(sizeof(GraphInfo_t)); + auto graphInfo = *graphInfoPtr; + if (graphInfo == nullptr) { + PRINT_ERROR("getGraphInfoFromModels() graphsInfo malloc returned nullptr.\n"); + return MODEL_GRAPH_ERROR; + } + + graphInfo->graph = model.getQnnGraph(); + graphInfo->graphName = + strnDup(model.getQnnGraphName().c_str(), model.getQnnGraphName().size()); + if (graphInfo->graphName == nullptr) { + PRINT_ERROR("getGraphInfoFromModels() failed to construct graphName. Received nullptr.\n"); + return MODEL_GRAPH_ERROR; + } + + // allocate and add graph input/output TensorsWrapper. Note: no need to make deep copies of + // the tensor's pointer members as they are already allocated on heap in the addTensor + // function call. 
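+    // The descriptors below are shallow-copied into flat arrays referenced by graphInfo;
+    // the stored counts let the backend walk these arrays later when it binds shared buffers.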
+    std::vector<Qnn_Tensor_t> graphInputTensors = model.getGraphInputTensors();
+    size_t numInputTensors = graphInputTensors.size();
+    size_t inputTensorsSize = numInputTensors * sizeof(Qnn_Tensor_t);
+    graphInfo->inputTensors = (Qnn_Tensor_t *)malloc(inputTensorsSize);
+    memscpy(graphInfo->inputTensors, inputTensorsSize, graphInputTensors.data(), inputTensorsSize);
+    graphInfo->numInputTensors = (uint32_t)numInputTensors;
+    // allocate and add graph outputTensors
+    std::vector<Qnn_Tensor_t> graphOutputTensors = model.getGraphOutputTensors();
+    size_t numOutputTensors = graphOutputTensors.size();
+    size_t outputTensorsSize = numOutputTensors * sizeof(Qnn_Tensor_t);
+    graphInfo->outputTensors = (Qnn_Tensor_t *)malloc(outputTensorsSize);
+    memscpy(
+        graphInfo->outputTensors, outputTensorsSize, graphOutputTensors.data(), outputTensorsSize);
+    graphInfo->numOutputTensors = (uint32_t)numOutputTensors;
+
+    // graph composition is complete by this stage, free if any cached tensors remaining
+    VALIDATE(model.freeCachedTensors(), err);
+    return err;
+}
+
 ModelError_t freeGraphsInfo(GraphInfoPtr_t **graphsInfo, uint32_t numGraphs) {
     if (graphsInfo == nullptr || *graphsInfo == nullptr) {
         PRINT_ERROR("freeGraphsInfo() invalid graphsInfo.");
diff --git a/src/backends/qnn/Model/QnnModel.hpp b/src/backends/qnn/Model/QnnModel.hpp
index 78ec9e14..1ffe0a61 100644
--- a/src/backends/qnn/Model/QnnModel.hpp
+++ b/src/backends/qnn/Model/QnnModel.hpp
@@ -256,6 +256,7 @@ class QnnModel {
 ModelError_t getGraphInfoFromModels(QnnModel* models, uint32_t numModels, GraphInfoPtr_t** graphsInfo);
+ModelError_t getSingleGraphInfoFromModel(QnnModel &model, GraphInfoPtr_t* graphInfoPtr);

 /**
  * @brief A helper function to free memory malloced for communicating the Graph for a model(s)
diff --git a/src/backends/qnn/QNNBackend.cpp b/src/backends/qnn/QNNBackend.cpp
index 069b82c4..0a5eeb72 100755
--- a/src/backends/qnn/QNNBackend.cpp
+++ b/src/backends/qnn/QNNBackend.cpp
@@ -4,6 +4,7 @@
 #include
 #include
+#include "Module.hpp"
 #include "OpDefined.hpp"
 #include "QNNBackend.hpp"
 #include "QnnModel.hpp"
@@ -21,7 +22,6 @@
 #include "op/QNNAdd.hpp"
 #include "op/QNNCausalMask.hpp"
 #include "op/QNNGELU.hpp"
-#include "op/QNNLinear.hpp"
 #include "op/QNNLinearINT8.hpp"
 #include "op/QNNMatmul.hpp"
 #include "op/QNNMul.hpp"
@@ -39,6 +39,7 @@
 #include "op/QNNSplitInput.hpp"
 #include "op/QNNTranspose.hpp"
 #include "op/QNNLinearINT8Shadow.hpp"
+#include "op/QNNSuperSiLU.hpp"

 #include "memory/MemInspect.hpp"
@@ -79,10 +80,12 @@ void QNNBackend::registerOps() {
     addCreator(SPLITINPUT, (QNNBackend::Creator *)(new QNNSplitInputCreator()));
     addCreator(TRANSPOSE, (QNNBackend::Creator *)(new QNNTransposeCreator()));
     addCreator(LINEARINT8SHADOW, (QNNBackend::Creator *)(new QNNLinearINT8ShadowCreator()));
+    addCreator(SUPERSILU, (QNNBackend::Creator *)(new QNNSuperSiLUCreator()));
 }

 QNNBackend::QNNBackend(shared_ptr<MemoryManager> mm) : Backend(mm) {
+    type_ = BackendType::MLLM_QNN; // used in Tensor.device()
     if (!log::initializeLogging()) {
         std::cerr << "ERROR: Unable to initialize logging!\n";
         return;
     }
@@ -116,7 +119,7 @@ QNNBackend::QNNBackend(shared_ptr<MemoryManager> mm) :
                                  &m_qnnFunctionPointers,
                                  &m_backendLibraryHandle,
                                  false,
-                                 &m_modelHandle);
+                                 nullptr);
     if (dynamicloadutil::StatusCode::SUCCESS != statusCode) {
         if (dynamicloadutil::StatusCode::FAIL_LOAD_BACKEND == statusCode) {
             exitWithMessage(
@@ -246,36 +249,77 @@ void QNNBackend::onSetUpStart(vector<shared_ptr<Tensor>> &inputs, vector<shared_ptr<Tensor>> &outputs, string graphName) {
-    if (inputs[0]->sequence() % 5 != 0) {
-        auto data_type = QNN_DATATYPE_SFIXED_POINT_8;
-        uint32_t dimensionsInput[4] = {
-
static_cast(inputs[0]->batch()), - static_cast(inputs[0]->sequence()), - static_cast(inputs[0]->head()), - static_cast(inputs[0]->dimension()), - }; - - qnnModels_[qnnModelIndex_].addTensor(inputs[0]->name().c_str(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = inputs[0]->name().c_str(), - .type = QNN_TENSOR_TYPE_APP_WRITE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = data_type, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsInput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}); + for (auto &input : inputs) { + if (input->sequence() % 5 != 0) { + Qnn_DataType_t data_type; + auto quantizeDefined = QNN_DEFINITION_UNDEFINED; + auto quantizeType = QNN_QUANTIZATION_ENCODING_UNDEFINED; + float scale = 0.0f; + auto loader = Module::llm_model_ptr->loader; + Tensor scaleTensor(this); + scaleTensor.reshape(1, 1, 1, 1); + scaleTensor.setDtype(MLLM_TYPE_F32); + scaleTensor.alloc(); + + switch (input->dtype()) { + case MLLM_TYPE_F32: + data_type = QNN_DATATYPE_FLOAT_32; + break; + case MLLM_TYPE_I8: { + data_type = QNN_DATATYPE_SFIXED_POINT_8; + quantizeDefined = QNN_DEFINITION_DEFINED; + quantizeType = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; + + std::string prefix = "out-", suffix = ".quantize", scaleName; + if (input->name().find(prefix) != std::string::npos) { + scaleName = input->name().substr(prefix.length()); + } + if (scaleName.find(suffix) != std::string::npos) { + scaleName = scaleName.substr(0, scaleName.length() - suffix.length()); + } + scaleName += ".input_scale"; + + scaleTensor.setName(scaleName); + loader->load(&scaleTensor); + scale = roundf(scaleTensor.hostPtr()[0] / 127.0 * 100000) / 100000; + scaleTensor.free(); + break; + } + default: + std::cerr << "[ERROR] QNNBackend not support dtype: " << input->dtype() << std::endl; + data_type = QNN_DATATYPE_FLOAT_32; + } + + uint32_t dimensionsInput[4] = { + static_cast(input->batch()), + static_cast(input->sequence()), + static_cast(input->head()), + static_cast(input->dimension()), + }; + + qnnModels_[qnnModelIndex_].addTensor(input->name().c_str(), + (Qnn_Tensor_t){ + .version = QNN_TENSOR_VERSION_1, + .v1 = { + .id = 0, + .name = input->name().c_str(), + .type = QNN_TENSOR_TYPE_APP_WRITE, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = data_type, + .quantizeParams = {quantizeDefined, + quantizeType, + {.scaleOffsetEncoding = {.scale = scale, .offset = 0}}}, + .rank = 4, + .dimensions = dimensionsInput, + .memType = QNN_TENSORMEMTYPE_RAW, + .clientBuf = {.data = nullptr, + .dataSize = 0}}}); + } } // create a new inputBuffer and outputBuffer for the graph inputBufferMap.insert(std::make_pair(graphName, std::vector(inputs.size()))); - outputBufferMap.insert(std::make_pair(graphName, std::vector(0))); + outputBufferMap.insert(std::make_pair(graphName, std::vector())); currentInputBuffers = &inputBufferMap[graphName]; currentOutputBuffers = &outputBufferMap[graphName]; @@ -299,48 +343,51 @@ void QNNBackend::onSetUpEnd(vector> &inputs, vectorsize(); - if (iotensor::StatusCode::SUCCESS != m_ioTensor.populateInputTensors(graphIdx, *currentInputBuffers, inputs_, graphInfo, m_inputDataType)) { - returnStatus = StatusCode::FAILURE; - } + // Todo only one graph now + size_t totalCount = currentInputBuffers->size(); + if (iotensor::StatusCode::SUCCESS != 
m_ioTensor.populateInputTensors(qnnModelIndex_, *currentInputBuffers, qnnInputs, *graphInfo, m_inputDataType)) { + returnStatus = StatusCode::FAILURE; + } - auto qnnMM = std::static_pointer_cast(mem_manager_); + auto qnnMM = std::static_pointer_cast(mem_manager_); - // register input and output tensor to qnn shared buffers - // TODO: currently must insure the inputs and outputs of mllm graph are the same as the qnn graph - // op created io tensors (kvcache, wnop...) should be solved + // register input and output tensor to qnn shared buffers + // TODO: currently must insure the inputs and outputs of mllm graph are the same as the qnn graph + // op created io tensors (kvcache, wnop...) should be solved #ifdef DEBUGPRINT - std::cout << "input tensors num:" << (*m_graphsInfo)[graphIdx].numInputTensors << std::endl; - std::cout << "output tensors num:" << (*m_graphsInfo)[graphIdx].numOutputTensors << std::endl; + std::cout << "input tensors num:" << graphInfo->numInputTensors << std::endl; + std::cout << "output tensors num:" << graphInfo->numOutputTensors << std::endl; #endif - for (int i = 0; i < (*m_graphsInfo)[graphIdx].numInputTensors; i++) { - qnnMM->registerQnnTensor((*currentInputBuffers)[i], inputs_[i]); - QNN_DEBUG("inputBuffers: %p ", (*currentInputBuffers)[i]); - } - for (int i = 0; i < (*m_graphsInfo)[graphIdx].numOutputTensors; i++) { - qnnMM->registerQnnTensor((*currentOutputBuffers)[i], outputs_[i]); - QNN_DEBUG("outputBuffers: %p ", (*currentOutputBuffers)[i]); - } + for (int i = 0; i < graphInfo->numInputTensors; i++) { + qnnMM->registerQnnTensor((*currentInputBuffers)[i], qnnInputs[i]); +#ifdef DEBUGPRINT + std::cout << "registered input tensor: " << inputs[i]->hostPtr() << " backend staged ptr: " << (void *)(*currentInputBuffers)[i] << std::endl; + std::cout << "qnn input tensor name: " << qnnInputs[i].v1.name << std::endl; + std::cout << "qnn input tensor scale: " << qnnInputs[i].v1.quantizeParams.scaleOffsetEncoding.scale << std::endl; +#endif + } + for (int i = 0; i < graphInfo->numOutputTensors; i++) { + qnnMM->registerQnnTensor((*currentOutputBuffers)[i], qnnOutputs[i]); +#ifdef DEBUGPRINT + std::cout << "registered output tensor: " << outputs[i]->hostPtr() << " backend staged ptr: " << (void *)(*currentOutputBuffers)[i] << std::endl; + std::cout << "qnn output tensor name: " << qnnOutputs[i].v1.name << std::endl; + std::cout << "qnn output tensor scale: " << qnnOutputs[i].v1.quantizeParams.scaleOffsetEncoding.scale << std::endl; +#endif } - inputsMap_[qnnModelIndex_] = inputs_; - outputsMap_[qnnModelIndex_] = outputs_; + inputsMap_[qnnModelIndex_] = qnnInputs; + outputsMap_[qnnModelIndex_] = qnnOutputs; } void QNNBackend::onExecuteStart(vector> &inputs, vector> &outputs, string graphName) { @@ -348,49 +395,38 @@ void QNNBackend::onExecuteStart(vector> &inputs, vectorgraph, + inputs_, + graphInfo->numInputTensors, + outputs_, + graphInfo->numOutputTensors, + m_profileBackendHandle, + nullptr); #ifdef DEBUGPRINT - uint64_t t_end = mllm_time_us(); - std::cout << "QNN execution time " << (t_end - t_start) / 1000.0F << " ms" << std::endl; + uint64_t t_end = mllm_time_us(); + std::cout << "QNN execution time " << (t_end - t_start) / 1000.0F << " ms" << std::endl; #endif - if (QNN_GRAPH_NO_ERROR != executeStatus) { - returnStatus = StatusCode::FAILURE; - } + if (QNN_GRAPH_NO_ERROR != executeStatus) { + std::cerr << "Error in executing graph: " << graphName << std::endl; + } - if (ProfilingLevel::OFF != m_profilingLevel) { - 
extractBackendProfilingInfo(m_profileBackendHandle); - } + if (ProfilingLevel::OFF != m_profilingLevel) { + extractBackendProfilingInfo(m_profileBackendHandle); } } void QNNBackend::onExecuteEnd() { -#ifdef QNN_ARM - executeGraphsShared(); -#else - executeGraphs(inputBufferMap, outputBufferMap); -#endif } void QNNBackend::freeGraphDataStructure(string graphName) { @@ -418,7 +454,7 @@ void QNNBackend::afterAllGraphsExecute() { inputBufferMap.clear(); outputBufferMap.clear(); - m_graphsInfoMap_.clear(); + graphInfoMap_.clear(); inputsMap_.clear(); outputsMap_.clear(); } @@ -459,22 +495,21 @@ qnn_wrapper_api::ModelError_t QNNBackend::graphAddNode(string name, } qnn_wrapper_api::ModelError_t QNNBackend::graphFinilize() { - // Add all models to array to get graphsInfo - qnn_wrapper_api::QnnModel *models[] = {&qnnModels_[qnnModelIndex_]}; - m_graphsCount = 1; // Populate the constructed graphs in provided output variables qnn_wrapper_api::ModelError_t err = qnn_wrapper_api::MODEL_NO_ERROR; - qnn_wrapper_api::GraphInfo_t **m_graphsInfo = nullptr; + qnn_wrapper_api::GraphInfo_t *graphInfo = nullptr; + + VALIDATE(getSingleGraphInfoFromModel(qnnModels_[qnnModelIndex_], &graphInfo), err); - VALIDATE(getGraphInfoFromModels(*models, m_graphsCount, &m_graphsInfo), err); // Graph finalize - if (QNN_GRAPH_NO_ERROR != m_qnnFunctionPointers.qnnInterface.graphFinalize((*m_graphsInfo)[0].graph, m_profileBackendHandle, nullptr)) { + if (QNN_GRAPH_NO_ERROR != m_qnnFunctionPointers.qnnInterface.graphFinalize(graphInfo->graph, m_profileBackendHandle, nullptr)) { return qnn_wrapper_api::ModelError_t::MODEL_GRAPH_ERROR; } if (ProfilingLevel::OFF != m_profilingLevel) { extractBackendProfilingInfo(m_profileBackendHandle); } - m_graphsInfoMap_[qnnModelIndex_] = m_graphsInfo; + + graphInfoMap_[qnnModelIndex_] = graphInfo; return qnn_wrapper_api::ModelError_t::MODEL_NO_ERROR; } @@ -683,107 +718,4 @@ StatusCode QNNBackend::freeDevice() { return StatusCode::SUCCESS; } -// executeGraphs() that load input/output buffers from CPU context -// inputBufferMap and outputBufferMap: graph_name -> graph input/output CPU buffers. 
-StatusCode QNNBackend::executeGraphs(std::map> inputBufferMap, std::map> outputBufferMap) { - qnn_wrapper_api::GraphInfo_t **m_graphsInfo = m_graphsInfoMap_[qnnModelIndex_]; - - auto returnStatus = StatusCode::SUCCESS; - for (size_t graphIdx = 0; graphIdx < m_graphsCount; graphIdx++) { - QNN_DEBUG("Starting execution for graphIdx: %d", graphIdx); - if (graphIdx >= inputBufferMap.size()) { - QNN_ERROR("No Inputs available for: %d", graphIdx); - returnStatus = StatusCode::FAILURE; - break; - } - - Qnn_Tensor_t *inputs_ = inputsMap_[qnnModelIndex_]; - Qnn_Tensor_t *outputs_ = outputsMap_[qnnModelIndex_]; - - auto graphInfo = (*m_graphsInfo)[graphIdx]; - if (!inputBufferMap.empty()) { - size_t startIdx = 0; - - if (StatusCode::SUCCESS == returnStatus) { - QNN_DEBUG("Successfully populated input tensors for graphIdx: %d", graphIdx); - Qnn_ErrorHandle_t executeStatus = QNN_GRAPH_NO_ERROR; -#ifdef DEBUGPRINT - uint64_t t_start = mllm_time_us(); -#endif - - executeStatus = - m_qnnFunctionPointers.qnnInterface.graphExecute(graphInfo.graph, - inputs_, - graphInfo.numInputTensors, - outputs_, - graphInfo.numOutputTensors, - m_profileBackendHandle, - nullptr); -#ifdef DEBUGPRINT - uint64_t t_end = mllm_time_us(); - std::cout << "QNN execution time " << (t_end - t_start) / 1000.0F << " ms" << std::endl; -#endif - - if (QNN_GRAPH_NO_ERROR != executeStatus) { - returnStatus = StatusCode::FAILURE; - } - } - if (StatusCode::SUCCESS != returnStatus) { - QNN_ERROR("Execution of Graph: %d failed!", graphIdx); - break; - } - if (ProfilingLevel::OFF != m_profilingLevel) { - extractBackendProfilingInfo(m_profileBackendHandle); - } - } - - m_ioTensor.tearDownInputAndOutputTensors( - inputs_, outputs_, graphInfo.numInputTensors, graphInfo.numOutputTensors); - inputs_ = nullptr; - outputs_ = nullptr; - if (StatusCode::SUCCESS != returnStatus) { - break; - } - } - - qnn_wrapper_api::freeGraphsInfo(&m_graphsInfo, m_graphsCount); - m_graphsInfo = nullptr; - return returnStatus; -} - -StatusCode QNNBackend::executeGraphsShared() { - qnn_wrapper_api::GraphInfo_t **m_graphsInfo = m_graphsInfoMap_[qnnModelIndex_]; - - auto returnStatus = StatusCode::SUCCESS; - - for (size_t graphIdx = 0; graphIdx < 1; graphIdx++) { - auto graphInfo = (*m_graphsInfo)[graphIdx]; - - Qnn_Tensor_t *inputs_ = inputsMap_[qnnModelIndex_]; - Qnn_Tensor_t *outputs_ = outputsMap_[qnnModelIndex_]; - - Qnn_ErrorHandle_t executeStatus = QNN_GRAPH_NO_ERROR; -#ifdef DEBUGPRINT - uint64_t t_start = mllm_time_us(); -#endif - executeStatus = - m_qnnFunctionPointers.qnnInterface.graphExecute(graphInfo.graph, - inputs_, - graphInfo.numInputTensors, - outputs_, - graphInfo.numOutputTensors, - m_profileBackendHandle, - nullptr); -#ifdef DEBUGPRINT - uint64_t t_end = mllm_time_us(); - std::cout << "QNN execution time " << (t_end - t_start) / 1000.0F << " ms" << std::endl; -#endif - - if (QNN_GRAPH_NO_ERROR != executeStatus) { - returnStatus = StatusCode::FAILURE; - } - } - return returnStatus; -} - } // namespace mllm \ No newline at end of file diff --git a/src/backends/qnn/QNNBackend.hpp b/src/backends/qnn/QNNBackend.hpp index 293b641e..27ac0676 100644 --- a/src/backends/qnn/QNNBackend.hpp +++ b/src/backends/qnn/QNNBackend.hpp @@ -50,9 +50,6 @@ class QNNBackend : public Backend { if (m_backendLibraryHandle) { pal::dynamicloading::dlClose(m_backendLibraryHandle); } - if (m_modelHandle) { - pal::dynamicloading::dlClose(m_modelHandle); - } QNN_INFO("Free handle"); } @@ -129,9 +126,6 @@ class QNNBackend : public Backend { StatusCode createContext(); - StatusCode 
executeGraphs(std::map> inputBufferMap, std::map> outputBufferMap); - StatusCode executeGraphsShared(); - StatusCode registerOpPackages(); StatusCode freeContext(); @@ -185,15 +179,12 @@ class QNNBackend : public Backend { iotensor::InputDataType m_inputDataType; sample_app::ProfilingLevel m_profilingLevel; - std::map m_graphsInfoMap_; - // for mllm single graph execute - qnn_wrapper_api::GraphInfo_t graphInfo; + std::map graphInfoMap_; const QnnGraph_Config_t **graphConfigs = nullptr; - uint32_t m_graphsCount; // these two pointers is .so library handle void *m_backendLibraryHandle = nullptr; - void *m_modelHandle = nullptr; // m_modelHandle is always nullptr cause we build graph in runtime + iotensor::IOTensor m_ioTensor; bool m_isBackendInitialized; bool m_isContextCreated; diff --git a/src/backends/qnn/QNNExecutor.cpp b/src/backends/qnn/QNNExecutor.cpp index 0e9cd37b..32656934 100644 --- a/src/backends/qnn/QNNExecutor.cpp +++ b/src/backends/qnn/QNNExecutor.cpp @@ -408,7 +408,7 @@ void QNNPipelineExecutor::runExp(Context *ctx, Net *net, vectorsequence() + chunk_size - 1) / chunk_size; // create a new tensor for each chunk vector>> chunked_tensors_list(chunk_num, vector>(input_tensors.size())); @@ -519,7 +519,7 @@ void QNNPipelineExecutor::runExp(Context *ctx, Net *net, vectorsubGraph()[name]; if (chunk_id != 0) { @@ -536,11 +536,11 @@ void QNNPipelineExecutor::runExp(Context *ctx, Net *net, vectorsubGraph()[name]; auto *qnn_graph = dynamic_cast(g.get()); - qnn_graph->forward(name); + // qnn_graph->forward(name); // only get the result at the last graph if (i == net->subGraph().size() - 1) { @@ -590,10 +590,11 @@ void QNNPipelineExecutor::runExp(Context *ctx, Net *net, vectorsubGraph().size(); ++i) { -#pragma omp parallel for +#pragma omp parallel for num_threads(chunk_num) for (int chunk_id = 0; chunk_id < chunk_num; ++chunk_id) { executeFunc(chunk_id, i); } +#pragma omp barrier } // the last graph of chunk 1 { diff --git a/src/backends/qnn/QNNGraph.cpp b/src/backends/qnn/QNNGraph.cpp index eb5eee90..4586c14a 100644 --- a/src/backends/qnn/QNNGraph.cpp +++ b/src/backends/qnn/QNNGraph.cpp @@ -22,7 +22,7 @@ void QNNGraph::setUpTensors(std::string name) { // set graph out tensor TensorType auto &graph_out_tensors = ops_output_tensors_[op_names_[op_names_.size() - 1]]; for (auto &t : graph_out_tensors) { - t->setTtype(OUTPUT_TENSOR); + t->setTtype(GRAPH_OUTPUT); t->alloc(); } for (auto &t : graph_in_tensors) { t->alloc(); } @@ -79,13 +79,14 @@ const vector> &QNNGraph::forward(std::string graphName) { } this->backend_->onExecuteStart(ops_input_tensors_[op_names_[0]], ops_output_tensors_[op_names_[op_names_.size() - 1]], graphName); - + if (ops_[op_names_[op_names_.size() - 1]]->type() == MERGEOUTPUT) { auto inputs = ops_input_tensors_[op_names_[op_names_.size() - 1]]; auto outputs = ops_output_tensors_[op_names_[op_names_.size() - 1]]; - memcpy(outputs[0]->hostPtr() + (inputs[0]->cntSize()*0), inputs[0]->hostPtr(), inputs[0]->cntSize()); - memcpy(outputs[0]->hostPtr() + (inputs[0]->cntSize()*1), inputs[1]->hostPtr(), inputs[1]->cntSize()); - memcpy(outputs[0]->hostPtr() + (inputs[0]->cntSize()*2), inputs[2]->hostPtr(), inputs[2]->cntSize()); +#pragma omp parallel for collapse(1) num_threads(4) + for(int t=0; t<3; t++) { + memcpy(outputs[0]->hostPtr() + (inputs[0]->cntSize()*t), inputs[t]->hostPtr(), inputs[t]->cntSize()); + } } if (ops_[op_names_[op_names_.size() - 1]]->type() == LINEARINT8SHADOW) { diff --git a/src/backends/qnn/QNNMemoryManager.cpp b/src/backends/qnn/QNNMemoryManager.cpp 
index 7d6a62e6..204cbe05 100644 --- a/src/backends/qnn/QNNMemoryManager.cpp +++ b/src/backends/qnn/QNNMemoryManager.cpp @@ -112,7 +112,7 @@ void QNNMemoryManager::alloc(void **ptr, size_t size, size_t alignment) { void QNNMemoryManager::registerQnnTensor(void *ptr, Qnn_Tensor_t &qnnTensor) { auto it = qnnMemPtrMap_.find(ptr); if (it == qnnMemPtrMap_.end()) { - std::cerr << "getMemHandle failed" << std::endl; + std::cerr << "getMemHandle failed " << ptr << std::endl; return; } diff --git a/src/backends/qnn/op/QNNCommonOp.cpp b/src/backends/qnn/op/QNNCommonOp.cpp index 1082a05f..2f0e9885 100644 --- a/src/backends/qnn/op/QNNCommonOp.cpp +++ b/src/backends/qnn/op/QNNCommonOp.cpp @@ -32,9 +32,6 @@ ErrorCode QNNCommonOp::graphAddNode(string name, string nodeType, vectordtype() == MLLM_TYPE_I8) { -#ifdef DEBUGPRINT - std::cout << name << "is QNN INT8 op " << std::endl; -#endif data_type = QNN_DATATYPE_SFIXED_POINT_8; } @@ -63,8 +60,8 @@ ErrorCode QNNCommonOp::graphAddNode(string name, string nodeType, vectorgraphAddNode(name, nodeType, inputTensorNames, outputTensors, params, packageName)) { @@ -83,7 +80,11 @@ ErrorCode QNNCommonOp::graphAddNode(string name, string nodeType, vector } Qnn_TensorType_t QNNCommonOp::getOutputTensorType(shared_ptr tensor) const { - if (tensor->ttype() == OUTPUT_TENSOR) { + if (tensor->ttype() == GRAPH_OUTPUT) { + // in Module API, the outputs of a graph is not allocated before setUp, alloc here + if(tensor->allocted() == 0) { + tensor->alloc(); + } qnnBackend_->pushOutputBuffers(tensor->hostPtr()); return QNN_TENSOR_TYPE_APP_READ; } else { diff --git a/src/backends/qnn/op/QNNDequantize.cpp b/src/backends/qnn/op/QNNDequantize.cpp index 3b09e96a..13eb49dd 100644 --- a/src/backends/qnn/op/QNNDequantize.cpp +++ b/src/backends/qnn/op/QNNDequantize.cpp @@ -20,6 +20,7 @@ ErrorCode QNNDequantize::reshape(vector> inputs, vector> inputs, vector> outputs) { + #ifdef OLD_QNN if (getOutputTensorType(outputs[0]) == QNN_TENSOR_TYPE_APP_READ) { outputs[0]->setBackend(qnnBackend_); outputs[0]->setDtype(MLLM_TYPE_F32); @@ -27,6 +28,7 @@ ErrorCode QNNDequantize::setUp(vector> inputs, vectorpushOutputBuffers(outputs[0]->hostPtr()); } + #endif auto outName = outputs[0]->name(); uint32_t dimensionsOutput[4]; @@ -91,6 +93,7 @@ ErrorCode QNNDequantize::setUp(vector> inputs, vectorname()}, outputTensor, paramsDeQuantize, "LLaMAPackage"); } else { + outputs[0]->setDtype(MLLM_TYPE_F16); uint32_t paramsDeQuantizeDimension[1] = {1}; auto paramsDeQuantizeName = name() + "dequantize_params"; vector paramsDeQuantize = { diff --git a/src/backends/qnn/op/QNNLinear.cpp b/src/backends/qnn/op/QNNLinear.cpp deleted file mode 100755 index b6629ffb..00000000 --- a/src/backends/qnn/op/QNNLinear.cpp +++ /dev/null @@ -1,329 +0,0 @@ - -#include "QNNLinear.hpp" -#include "QnnTypes.h" -#include "Types.hpp" -#include "QNNCommonOp.hpp" -#include -#include - -namespace mllm { -QNNLinear::QNNLinear(Backend *bn, string opName, int in_features, int out_features, bool bias) : - QNNCommonOp(bn, opName), in_features_(in_features), out_features_(out_features), support_bias_(bias) { - weight_.setBackend(bn); - bias_.setBackend(bn); -} - -ErrorCode QNNLinear::reshape(vector> inputs, vector> outputs) { - assert(inputs.size() == 1); - assert(outputs.size() == 1); - // N | C | H | W - // ----------------------------------------------- - // 1 |out_channel | in_channel | 1 - // |out_features| in_features | - // ----------------------------------------------- - // batch |in_channel | seq_len | 1 - // |in_features | 
inputs[0]->sequence() | - // ----------------------------------------------- - // batch |out_channel | seq_len | 1 - // |out_features| inputs[0]->sequence() | - assert(inputs[0]->head() == 1); - assert(in_features_ == inputs[0]->dimension()); - outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_features_); - return Op::reshape(inputs, outputs); -} - -ErrorCode QNNLinear::setUp(vector> inputs, vector> outputs) { - // add matmul param to qnn - vector paramsMatmul = { - {.paramType = QNN_PARAMTYPE_SCALAR, - .name = "transpose_in0", - .scalarParam = (Qnn_Scalar_t){QNN_DATATYPE_BOOL_8, {.bool8Value = 0}}}, - {.paramType = QNN_PARAMTYPE_SCALAR, - .name = "transpose_in1", - .scalarParam = (Qnn_Scalar_t){QNN_DATATYPE_BOOL_8, {.bool8Value = 1}}}}; - // add quantized input tensor to qnn - auto inputQuantizeName = name() + inputs[0]->name() + ".quantize"; - uint32_t dimensionsInput[4] = {static_cast(inputs[0]->batch()), - static_cast(inputs[0]->sequence()), - static_cast(inputs[0]->head()), - static_cast(inputs[0]->dimension())}; - - uint32_t dimensions_InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_dilation[] = {2}; - uint32_t InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_dilation[] = {1, 1}; - uint32_t dimensions_InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_pad_amount[] = {2, 2}; - uint32_t InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_pad_amount[] = {0, 0, 0, 0}; - uint32_t dimensions_InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_stride[] = {2}; - uint32_t InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_stride[] = {1, 1}; - - vector params_InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D = { - {.paramType = QNN_PARAMTYPE_TENSOR, - .name = "stride", - .tensorParam = - (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = {.id = 0, - .name = "InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_stride", - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UINT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = 1, - .dimensions = dimensions_InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_stride, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = - {.data = (uint8_t *)InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_stride, - .dataSize = 8}}}}, - {.paramType = QNN_PARAMTYPE_TENSOR, - .name = "pad_amount", - .tensorParam = - (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = {.id = 0, - .name = "InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_pad_amount", - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UINT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, - .offset = 0}}}, - .rank = 2, - .dimensions = - dimensions_InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_pad_amount, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = - {.data = (uint8_t *) - InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D_pad_amount, - .dataSize = 16}}}}, - - }; - - // TODO: split into another function - // if weight is float32, use float matmul - if (weight_.dtype() == MLLM_TYPE_F32) { - std::cout << " test fp linear " << name() << std::endl; - - uint32_t dimensionsWeight[4] = {1, 1, static_cast(weight_.sequence()), static_cast(weight_.dimension())}; - qnnBackend_->modelAddTensor(weight_.name(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = 
weight_.name().c_str(), - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsWeight, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = weight_.hostPtr(), - .dataSize = (uint32_t)weight_.cntSize()}}}); - // final output - uint32_t dimensionsOutput[4] = {static_cast(outputs[0]->batch()), - static_cast(outputs[0]->sequence()), - static_cast(outputs[0]->head()), - static_cast(outputs[0]->dimension())}; - auto outString = outputs[0]->name(); - vector - matmulOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outString.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - - return graphAddNode(name() + ".matmul", "Conv2d", {inputs[0]->name(), weight_.name()}, matmulOut, params_InceptionV3_InceptionV3_Conv2d_1a_3x3_Conv2D); - } // TODO: split into another function - - vector quantizedInput = { - (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = inputQuantizeName.c_str(), - .type = QNN_TENSOR_TYPE_NATIVE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UFIXED_POINT_8, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsInput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}; - graphAddNode(name() + ".quantize", "Quantize", {inputs[0]->name()}, quantizedInput); - // add weight tensor to qnn - uint32_t dimensionsWeight[4] = {1, 1, static_cast(weight_.sequence()), static_cast(weight_.dimension())}; - qnnBackend_->modelAddTensor(weight_.name(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = weight_.name().c_str(), - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UFIXED_POINT_8, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsWeight, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = weight_.hostPtr(), - .dataSize = (uint32_t)weight_.cntSize()}}}); - // free weight host memory - // weight_.free(); - - // dimensions of matmul output and bias - uint32_t dimensionsOutput[4] = {static_cast(outputs[0]->batch()), - static_cast(outputs[0]->sequence()), - static_cast(outputs[0]->head()), - static_cast(outputs[0]->dimension())}; - - auto outName = outputs[0]->name(); - auto outQuantizedName = name() + outputs[0]->name() + ".quantized"; - auto outDeqnName = name() + outputs[0]->name() + ".dequantized"; - vector matmulOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outQuantizedName.c_str(), - .type = QNN_TENSOR_TYPE_NATIVE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UFIXED_POINT_8, - .quantizeParams = 
{QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - graphAddNode(name() + ".matmul", "MatMul", {inputQuantizeName, weight_.name()}, matmulOut, paramsMatmul); - - // if don't support bias, just dequantize and write to tensor with name of outputs[0] - if (!support_bias_) { - // output of dequantized result of matmul - vector deqnOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outName.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - return graphAddNode(name() + ".dequantize", "Dequantize", {outQuantizedName}, deqnOut); - } - - // dequantize to tensor with name of outputs[0] + ".dequantize" - // output of dequantized result of matmul - vector deqnOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outDeqnName.c_str(), - .type = QNN_TENSOR_TYPE_NATIVE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - graphAddNode(name() + ".dequantize", "Dequantize", {outQuantizedName}, deqnOut); - // add bias tensor to qnn - uint32_t dimensionsBias[4] = {1, 1, 1, (uint32_t)out_features_}; - qnnBackend_->modelAddTensor(bias_.name(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = bias_.name().c_str(), - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsBias, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = bias_.hostPtr(), - .dataSize = (uint32_t)bias_.cntSize()}}}); - // free bias host memory - bias_.free(); - - // final output - vector biasOutput = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outName.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - return graphAddNode(name() + ".add", "ElementWiseAdd", {outDeqnName, bias_.name()}, biasOutput); -} - -ErrorCode QNNLinear::load(AbstructLoader &loader) { - weight_.setName(name() + ".weight"); - weight_.reshape(1, 1, in_features_, out_features_); - if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) { - weight_.setDtype(loader.getDataType(weight_.name())); - weight_.alloc(); - 
loader.load(&weight_); - } else { - weight_.setDtype(MLLM_TYPE_F32); - weight_.alloc(); - } - if (support_bias_) { - bias_.setName(name() + ".bias"); - bias_.reshape(1, 1, 1, out_features_); - if (loader.getDataType(bias_.name()) != MLLM_TYPE_COUNT) { - bias_.setDtype(loader.getDataType(bias_.name())); - bias_.alloc(); - loader.load(&bias_); - } else { - bias_.setDtype(MLLM_TYPE_F32); - bias_.alloc(); - } - } - return Op::load(loader); -} - -ErrorCode QNNLinear::free(vector> inputs, vector> outputs) { - // weight_.free(); - // if (support_bias_) { - // bias_.free(); - // } - return Op::free(inputs, outputs); -} -} // namespace mllm diff --git a/src/backends/qnn/op/QNNLinear.hpp b/src/backends/qnn/op/QNNLinear.hpp deleted file mode 100644 index b4e627c4..00000000 --- a/src/backends/qnn/op/QNNLinear.hpp +++ /dev/null @@ -1,36 +0,0 @@ - -#ifndef MLLM_QNNLINEAR_H -#define MLLM_QNNLINEAR_H - -#include "QNNCommonOp.hpp" -namespace mllm { -class QNNLinear : public QNNCommonOp { -public: - QNNLinear(Backend *bn, string opName, int in_features, int out_features, bool bias); - virtual ~QNNLinear() = default; - virtual ErrorCode reshape(vector> inputs, vector> outputs) override; - virtual ErrorCode setUp(vector> inputs, vector> outputs) override; - virtual ErrorCode load(AbstructLoader &loader) override; - virtual ErrorCode free(vector> inputs, vector> outputs) override; - -private: - int in_features_; - int out_features_; - bool support_bias_; - Tensor weight_; - Tensor bias_; -}; - -class QNNLinearCreator : public QNNBackend::Creator { -public: - virtual Op *create(OpParam op_param, Backend *bn, string name) const { - int in_features = op_param["in_features"]; - int out_features = op_param["out_features"]; - int bias = op_param["bias"]; - return new QNNLinear(bn, name, in_features, out_features, (bool)bias); - } -}; - -} // namespace mllm - -#endif diff --git a/src/backends/qnn/op/QNNLinear3D.cpp b/src/backends/qnn/op/QNNLinear3D.cpp deleted file mode 100755 index 336891b1..00000000 --- a/src/backends/qnn/op/QNNLinear3D.cpp +++ /dev/null @@ -1,276 +0,0 @@ - -#include "QNNLinear3D.hpp" -#include "QnnTypes.h" -#include "Types.hpp" -#include "QNNCommonOp.hpp" -#include -#include - -namespace mllm { -QNNLinear3D::QNNLinear3D(Backend *bn, string opName, int in_features, int out_features, bool bias) : - QNNCommonOp(bn, opName), in_features_(in_features), out_features_(out_features), support_bias_(bias) { - weight_.setBackend(bn); - bias_.setBackend(bn); -} - -ErrorCode QNNLinear3D::reshape(vector> inputs, vector> outputs) { - assert(inputs.size() == 1); - assert(outputs.size() == 1); - // N | C | H | W - // ----------------------------------------------- - // 1 |out_channel | in_channel | 1 - // |out_features| in_features | - // ----------------------------------------------- - // batch |in_channel | seq_len | 1 - // |in_features | inputs[0]->sequence() | - // ----------------------------------------------- - // batch |out_channel | seq_len | 1 - // |out_features| inputs[0]->sequence() | - assert(inputs[0]->head() == 1); - assert(in_features_ == inputs[0]->dimension()); - outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_features_); - return Op::reshape(inputs, outputs); -} - -ErrorCode QNNLinear3D::setUp(vector> inputs, vector> outputs) { - // add matmul param to qnn - vector paramsMatmul = { - {.paramType = QNN_PARAMTYPE_SCALAR, - .name = "transpose_in0", - .scalarParam = (Qnn_Scalar_t){QNN_DATATYPE_BOOL_8, {.bool8Value = 0}}}, - {.paramType = QNN_PARAMTYPE_SCALAR, - 
.name = "transpose_in1", - .scalarParam = (Qnn_Scalar_t){QNN_DATATYPE_BOOL_8, {.bool8Value = 1}}}}; - // add quantized input tensor to qnn - auto inputQuantizeName = name() + inputs[0]->name() + ".quantize"; - uint32_t dimensionsInput[4] = {static_cast(inputs[0]->batch()), - static_cast(inputs[0]->head()), - static_cast(inputs[0]->sequence()), - static_cast(inputs[0]->dimension())}; - - // TODO: split into another function - // if weight is float32, use float matmul - if (weight_.dtype() == MLLM_TYPE_F32) { - std::cout << " test fp linear " << name() << std::endl; - - uint32_t dimensionsWeight[4] = {1, 32, static_cast(weight_.sequence()), static_cast(weight_.dimension())}; - qnnBackend_->modelAddTensor(weight_.name(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = weight_.name().c_str(), - .type = QNN_TENSOR_TYPE_APP_WRITE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsWeight, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}); - - qnnBackend_->pushInputBuffers(weight_.hostPtr()); - - // final output - uint32_t dimensionsOutput[4] = {static_cast(outputs[0]->batch()), - static_cast(outputs[0]->head()), - static_cast(outputs[0]->sequence()), - static_cast(outputs[0]->dimension())}; - auto outString = outputs[0]->name(); - vector - matmulOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outString.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - return graphAddNode(name() + ".matmul", "MatMul", {inputs[0]->name(), weight_.name()}, matmulOut, paramsMatmul); - } // TODO: split into another function - - vector quantizedInput = { - (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = inputQuantizeName.c_str(), - .type = QNN_TENSOR_TYPE_NATIVE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UFIXED_POINT_8, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsInput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}; - graphAddNode(name() + ".quantize", "Quantize", {inputs[0]->name()}, quantizedInput); - // add weight tensor to qnn - uint32_t dimensionsWeight[4] = {1, 1, static_cast(weight_.sequence()), static_cast(weight_.dimension())}; - qnnBackend_->modelAddTensor(weight_.name(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = weight_.name().c_str(), - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UFIXED_POINT_8, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsWeight, - .memType = QNN_TENSORMEMTYPE_RAW, - 
.clientBuf = {.data = weight_.hostPtr(), - .dataSize = (uint32_t)weight_.cntSize()}}}); - - // dimensions of matmul output and bias - uint32_t dimensionsOutput[4] = {static_cast(outputs[0]->batch()), - static_cast(outputs[0]->head()), - static_cast(outputs[0]->sequence()), - static_cast(outputs[0]->dimension())}; - - auto outName = outputs[0]->name(); - auto outQuantizedName = name() + outputs[0]->name() + ".quantized"; - auto outDeqnName = name() + outputs[0]->name() + ".dequantized"; - vector matmulOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outQuantizedName.c_str(), - .type = QNN_TENSOR_TYPE_NATIVE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UFIXED_POINT_8, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - graphAddNode(name() + ".matmul", "MatMul", {inputQuantizeName, weight_.name()}, matmulOut, paramsMatmul); - - // if don't support bias, just dequantize and write to tensor with name of outputs[0] - if (!support_bias_) { - // output of dequantized result of matmul - vector deqnOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outName.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - return graphAddNode(name() + ".dequantize", "Dequantize", {outQuantizedName}, deqnOut); - } - - // dequantize to tensor with name of outputs[0] + ".dequantize" - // output of dequantized result of matmul - vector deqnOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outDeqnName.c_str(), - .type = QNN_TENSOR_TYPE_NATIVE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - graphAddNode(name() + ".dequantize", "Dequantize", {outQuantizedName}, deqnOut); - // add bias tensor to qnn - uint32_t dimensionsBias[4] = {1, 1, 1, (uint32_t)out_features_}; - qnnBackend_->modelAddTensor(bias_.name(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = bias_.name().c_str(), - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsBias, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = bias_.hostPtr(), - .dataSize = (uint32_t)bias_.cntSize()}}}); - // free bias host memory - bias_.free(); - - // final output - vector biasOutput = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outName.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = 
QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}}; - return graphAddNode(name() + ".add", "ElementWiseAdd", {outDeqnName, bias_.name()}, biasOutput); -} - -ErrorCode QNNLinear3D::load(AbstructLoader &loader) { - weight_.setName(name() + ".weight"); - weight_.reshape(1, 1, out_features_, in_features_); - if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) { - weight_.setDtype(loader.getDataType(weight_.name())); - weight_.alloc(); - loader.load(&weight_); - } else { - weight_.setDtype(MLLM_TYPE_F32); - weight_.alloc(); - } - if (support_bias_) { - bias_.setName(name() + ".bias"); - bias_.reshape(1, 1, 1, out_features_); - if (loader.getDataType(bias_.name()) != MLLM_TYPE_COUNT) { - bias_.setDtype(loader.getDataType(bias_.name())); - bias_.alloc(); - loader.load(&bias_); - } else { - bias_.setDtype(MLLM_TYPE_F32); - bias_.alloc(); - } - } - return Op::load(loader); -} - -ErrorCode QNNLinear3D::free(vector> inputs, vector> outputs) { - // weight_.free(); - // if (support_bias_) { - // bias_.free(); - // } - return Op::free(inputs, outputs); -} -} // namespace mllm diff --git a/src/backends/qnn/op/QNNLinear3D.hpp b/src/backends/qnn/op/QNNLinear3D.hpp deleted file mode 100644 index 7738bf80..00000000 --- a/src/backends/qnn/op/QNNLinear3D.hpp +++ /dev/null @@ -1,36 +0,0 @@ - -#ifndef MLLM_QNNLINEAR3D_H -#define MLLM_QNNLINEAR3D_H - -#include "QNNCommonOp.hpp" -namespace mllm { -class QNNLinear3D : public QNNCommonOp { -public: - QNNLinear3D(Backend *bn, string opName, int in_features, int out_features, bool bias); - virtual ~QNNLinear3D() = default; - virtual ErrorCode reshape(vector> inputs, vector> outputs) override; - virtual ErrorCode setUp(vector> inputs, vector> outputs) override; - virtual ErrorCode load(AbstructLoader &loader) override; - virtual ErrorCode free(vector> inputs, vector> outputs) override; - -private: - int in_features_; - int out_features_; - bool support_bias_; - Tensor weight_; - Tensor bias_; -}; - -class QNNLinear3DCreator : public QNNBackend::Creator { -public: - virtual Op *create(OpParam op_param, Backend *bn, string name) const { - int in_features = op_param["in_features"]; - int out_features = op_param["out_features"]; - int bias = op_param["bias"]; - return new QNNLinear3D(bn, name, in_features, out_features, (bool)bias); - } -}; - -} // namespace mllm - -#endif diff --git a/src/backends/qnn/op/QNNLinearFP.cpp b/src/backends/qnn/op/QNNLinearFP.cpp deleted file mode 100755 index 3de20372..00000000 --- a/src/backends/qnn/op/QNNLinearFP.cpp +++ /dev/null @@ -1,205 +0,0 @@ - -#include "QNNLinearFP.hpp" -#include "QnnTypes.h" -#include "Types.hpp" -#include "QNNCommonOp.hpp" -#include -#include - -namespace mllm { -QNNLinearFP::QNNLinearFP(Backend *bn, string opName, int in_features, int out_features, bool bias) : - QNNCommonOp(bn, opName), in_features_(in_features), out_features_(out_features), support_bias_(bias) { - weight_.setBackend(bn); - bias_.setBackend(bn); -} - -ErrorCode QNNLinearFP::reshape(vector> inputs, vector> outputs) { - assert(inputs.size() == 1); - assert(outputs.size() == 1); - // N | C | H | W - // ----------------------------------------------- - // 1 |out_channel | in_channel | 1 - // |out_features| in_features | - // 
----------------------------------------------- - // batch |in_channel | seq_len | 1 - // |in_features | inputs[0]->sequence() | - // ----------------------------------------------- - // batch |out_channel | seq_len | 1 - // |out_features| inputs[0]->sequence() | - assert(inputs[0]->head() == 1); - assert(in_features_ == inputs[0]->dimension()); - outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), out_features_); - return Op::reshape(inputs, outputs); -} - -ErrorCode QNNLinearFP::setUp(vector> inputs, vector> outputs) { - // add matmul param to qnn - vector paramsMatmul = { - {.paramType = QNN_PARAMTYPE_SCALAR, - .name = "transpose_in0", - {.scalarParam = (Qnn_Scalar_t){QNN_DATATYPE_BOOL_8, {.bool8Value = 0}}}}, - {.paramType = QNN_PARAMTYPE_SCALAR, - .name = "transpose_in1", - {.scalarParam = (Qnn_Scalar_t){QNN_DATATYPE_BOOL_8, {.bool8Value = 1}}}}}; - - // add weight tensor to qnn - uint32_t dimensionsWeight[4]; - for (int i = 0; i < 4; i++) { - dimensionsWeight[i] = weight_.shape()[i]; - } - auto weightName = weight_.name(); - auto weightDeQuantName = weightName + ".dequantized"; - qnnBackend_->modelAddTensor(weight_.name(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = weightName.c_str(), - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_UFIXED_POINT_8, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsWeight, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = weight_.hostPtr(), - .dataSize = (uint32_t)weight_.cntSize()}}}}}); - // free weight - weight_.free(); - // output of dequantized result of weight - vector weightQuantOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = weightDeQuantName.c_str(), - .type = QNN_TENSOR_TYPE_NATIVE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsWeight, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}}}}; - // dequantize weight to float and matmul - graphAddNode(name() + ".dequantize", "Dequantize", {weightName}, weightQuantOut); - // dimensions of matmul output and bias - uint32_t dimensionsOutput[4]; - for (int i = 0; i < 4; i++) { - dimensionsOutput[i] = outputs[0]->shape()[i]; - } - auto outName = outputs[0]->name(); - - // if don't support bias, just execute matmul - if (!support_bias_) { - vector matmulOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outName.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}}}}; - return graphAddNode(name() + ".matmul", "MatMul", {inputs[0]->name(), weightDeQuantName}, matmulOut, paramsMatmul); - } - - string matmulOutName = name() + ".matmul"; - vector matmulOut = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = matmulOutName.c_str(), - .type = 
QNN_TENSOR_TYPE_NATIVE, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}}}}; - graphAddNode(name() + ".matmul", "MatMul", {inputs[0]->name(), weightDeQuantName}, matmulOut, paramsMatmul); - // add bias tensor to qnn - uint32_t dimensionsBias[4] = {1, 1, 1, (uint32_t)out_features_}; - qnnBackend_->modelAddTensor(bias_.name(), (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = bias_.name().c_str(), - .type = QNN_TENSOR_TYPE_STATIC, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsBias, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = bias_.hostPtr(), - .dataSize = (uint32_t)bias_.cntSize()}}}}}); - // free bias - bias_.free(); - - // final output - vector biasOutput = {{QNN_TENSOR_VERSION_1, - {.v1 = { - .id = 0, - .name = outName.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimensionsOutput, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}}}}; - return graphAddNode(name() + ".add", "ElementWiseAdd", {matmulOutName, bias_.name()}, biasOutput); -} - -ErrorCode QNNLinearFP::load(AbstructLoader &loader) { - weight_.setName(name() + ".weight"); - weight_.reshape(1, 1, out_features_, in_features_); - if (loader.getDataType(weight_.name()) != MLLM_TYPE_COUNT) { - weight_.setDtype(loader.getDataType(weight_.name())); - weight_.alloc(); - loader.load(&weight_); - } else { - weight_.setDtype(MLLM_TYPE_F32); - weight_.alloc(); - } - if (support_bias_) { - bias_.setName(name() + ".bias"); - bias_.reshape(1, 1, 1, out_features_); - if (loader.getDataType(bias_.name()) != MLLM_TYPE_COUNT) { - bias_.setDtype(loader.getDataType(bias_.name())); - bias_.alloc(); - loader.load(&bias_); - } else { - bias_.setDtype(MLLM_TYPE_F32); - bias_.alloc(); - } - } - return Op::load(loader); -} - -ErrorCode QNNLinearFP::free(vector> inputs, vector> outputs) { - // weight_.free(); - // if (support_bias_) { - // bias_.free(); - // } - return Op::free(inputs, outputs); -} -} // namespace mllm diff --git a/src/backends/qnn/op/QNNLinearFP.hpp b/src/backends/qnn/op/QNNLinearFP.hpp deleted file mode 100644 index 7b96dc07..00000000 --- a/src/backends/qnn/op/QNNLinearFP.hpp +++ /dev/null @@ -1,37 +0,0 @@ - -#ifndef MLLM_QNNLINEARFP_H -#define MLLM_QNNLINEARFP_H - -#include "QNNCommonOp.hpp" - -namespace mllm { -class QNNLinearFP : public QNNCommonOp { -public: - QNNLinearFP(Backend *bn, string opName, int in_features, int out_features, bool bias); - virtual ~QNNLinearFP() = default; - virtual ErrorCode reshape(vector> inputs, vector> outputs) override; - virtual ErrorCode setUp(vector> inputs, vector> outputs) override; - virtual ErrorCode load(AbstructLoader &loader) override; - virtual ErrorCode 
free(vector> inputs, vector> outputs) override; - -private: - int in_features_; - int out_features_; - bool support_bias_; - Tensor weight_; - Tensor bias_; -}; - -class QNNLinearFPCreator : public QNNBackend::Creator { -public: - virtual Op *create(OpParam op_param, Backend *bn, string name) const { - int in_features = op_param["in_features"]; - int out_features = op_param["out_features"]; - int bias = op_param["bias"]; - return new QNNLinearFP(bn, name, in_features, out_features, (bool)bias); - } -}; - -} // namespace mllm - -#endif diff --git a/src/backends/qnn/op/QNNLinearINT8.cpp b/src/backends/qnn/op/QNNLinearINT8.cpp index 25f3efbc..4fc57547 100755 --- a/src/backends/qnn/op/QNNLinearINT8.cpp +++ b/src/backends/qnn/op/QNNLinearINT8.cpp @@ -38,6 +38,7 @@ ErrorCode QNNLinearINT8::reshape(vector> inputs, vector> inputs, vector> outputs) { + outputs[0]->setDtype(MLLM_TYPE_I8); // add matmul param to qnn vector paramsMatmul = { {.paramType = QNN_PARAMTYPE_SCALAR, diff --git a/src/backends/qnn/op/QNNLinearINT8Shadow.cpp b/src/backends/qnn/op/QNNLinearINT8Shadow.cpp index d158211c..7444eac3 100755 --- a/src/backends/qnn/op/QNNLinearINT8Shadow.cpp +++ b/src/backends/qnn/op/QNNLinearINT8Shadow.cpp @@ -5,6 +5,7 @@ #include "QNNCommonOp.hpp" #include #include +#include "backends/cpu/compute/Matmul.hpp" namespace mllm { QNNLinearINT8Shadow::QNNLinearINT8Shadow(Backend *bn, string opName, int in_features, int out_features, bool bias) : @@ -19,6 +20,9 @@ QNNLinearINT8Shadow::QNNLinearINT8Shadow(Backend *bn, string opName, int in_feat inputClip_.setBackend(bn); outputClip_.setBackend(bn); + + weight_f32_buffer_.setBackend(bn); + input_f32_buffer_.setBackend(bn); } ErrorCode QNNLinearINT8Shadow::reshape(vector> inputs, vector> outputs) { @@ -100,9 +104,32 @@ ErrorCode QNNLinearINT8Shadow::load(AbstructLoader &loader) { shadowWeight_.reshape(1, 1, in_features_, out_features_); shadowWeight_.setDtype(MLLM_TYPE_I8); shadowWeight_.alloc(); + + shadowTransposeWeight_.setName(opName + ".shadow.transpose_weight"); + shadowTransposeWeight_.reshape(1, 1, out_features_, in_features_); + shadowTransposeWeight_.setDtype(MLLM_TYPE_I8); + shadowTransposeWeight_.alloc(); memcpy(shadowWeight_.hostPtr(), weight_.hostPtr(), in_features_ * out_features_); + for (int i=0; i(0,0, i,j, shadowWeight_.dataAt(0,0,j,i)); + + } + } + + weight_f32_buffer_.setName(opName + ".shadow.weight_f32_buffer"); + weight_f32_buffer_.reshape(1, 1, 1, in_features_); + weight_f32_buffer_.setDtype(MLLM_TYPE_F32); + weight_f32_buffer_.alloc(); + + input_f32_buffer_.setName(opName + ".shadow.input_f32_buffer"); + input_f32_buffer_.reshape(1, 1, 1, in_features_); + input_f32_buffer_.setDtype(MLLM_TYPE_F32); + input_f32_buffer_.alloc(); + weight_.free(); return Op::load(loader); @@ -131,7 +158,7 @@ ErrorCode QNNLinearINT8Shadow::execute(vector> inputs, vector output_scale = roundf(output_scale * 100000) / 100000; - memcpy(outputs[0]->hostPtr(), inputs[2]->hostPtr(), inputs[2]->batch() * inputs[2]->head() * inputs[2]->sequence() * inputs[2]->dimension() * sizeof(float)); + memcpy(outputs[0]->hostPtr(), inputs[2]->hostPtr(), inputs[2]->cntSize()); // input outliers if (!input_clip) { @@ -139,18 +166,64 @@ ErrorCode QNNLinearINT8Shadow::execute(vector> inputs, vector for (int h = 0; h < inputs[0]->head(); h++) { for (int j = 0; j < inputs[0]->sequence(); j++) { for (int k = 0; k < inputs[0]->dimension(); k++) { - if (roundf(inputs[0]->dataAt(i, h, j, k) / input_scale) > 127.0 || roundf(inputs[0]->dataAt(i, h, j, k) / input_scale) < -128.0) { + 
float round_value = roundf(inputs[0]->dataAt(i, h, j, k) / input_scale); + if (round_value > (127.0 * 1.5) || round_value < (-128.0 * 1.5)) { + +#if defined(__ARM_NEON) + float origin_value = round_value * input_scale * weight_scale; + float clip_value = std::fmax(std::fmin(round_value, 127), -128) * input_scale * weight_scale; + + int w_max = shadowWeight_.dimension(); + int vector_size = 4; + +#pragma omp parallel for num_threads(4) + for (int w = 0; w <= w_max - vector_size; w += vector_size) { + // Load shadow weights into a NEON vector + int8x8_t weight_vec_int8 = vld1_s8(shadowWeight_.ptrAt(0, 0, k, w)); + int16x8_t weight_vec_int16 = vmovl_s8(weight_vec_int8); + + // Convert to float + float32x4_t weight_vec = vcvtq_f32_s32(vmovl_s16(vget_low_s16(weight_vec_int16))); + + // Compute origin and clip vectors with NEON + float32x4_t origin_vec = vmulq_n_f32(weight_vec, origin_value); + float32x4_t clip_vec = vmulq_n_f32(weight_vec, clip_value); + + // Load previous output values + float32x4_t output_vec = vld1q_f32(outputs[0]->ptrAt(i, h, j, w)); + + // Calculate and store the result + float32x4_t result_vec = vsubq_f32(origin_vec, clip_vec); + result_vec = vaddq_f32(result_vec, output_vec); + + vst1q_f32(outputs[0]->ptrAt(i, h, j, w), result_vec); + } + + // Handle remaining elements, if any + for (int w = (w_max / vector_size) * vector_size; w < w_max; ++w) { + float origin = origin_value * shadowWeight_.dataAt(0, 0, k, w); + float clip = clip_value * shadowWeight_.dataAt(0, 0, k, w); + + outputs[0]->setDataAt(i, h, j, w, origin - clip + outputs[0]->dataAt(i, h, j, w)); + } + +#else + float origin_value = round_value * input_scale * weight_scale; + float clip_value = std::fmax(std::fmin(round_value, 127), -128) * input_scale * weight_scale; + +#pragma omp parallel for collapse(1) num_threads(4) for (int w = 0; w < shadowWeight_.dimension(); w++) { // if (!(inputs[1]->dataAt(i, h, j, k) <= -128 || inputs[1]->dataAt(i, h, j, k) >= 127)) { - float origin = roundf(inputs[0]->dataAt(i, h, j, k) / input_scale) * input_scale * (shadowWeight_.dataAt(0, 0, k, w) * weight_scale); + float origin = origin_value * shadowWeight_.dataAt(0, 0, k, w); - float clip = std::fmax(std::fmin(roundf(inputs[0]->dataAt(i, h, j, k) / input_scale), 127), -128) * input_scale * (shadowWeight_.dataAt(0, 0, k, w) * weight_scale); + float clip = clip_value * shadowWeight_.dataAt(0, 0, k, w); outputs[0]->setDataAt(i, h, j, w, origin - clip + outputs[0]->dataAt(i, h, j, w)); // } } +#endif } } } @@ -163,14 +236,20 @@ ErrorCode QNNLinearINT8Shadow::execute(vector> inputs, vector for (int i = 0; i < inputs[1]->batch(); i++) { for (int h = 0; h < inputs[1]->head(); h++) { for (int j = 0; j < inputs[1]->sequence(); j++) { +#pragma omp parallel for collapse(1) num_threads(4) for (int k = 0; k < inputs[1]->dimension(); k++) { if (inputs[1]->dataAt(i, h, j, k) <= -128 || inputs[1]->dataAt(i, h, j, k) >= 127) { float sum = 0.0f; - for (int w = 0; w < shadowWeight_.sequence(); w++) { - sum += roundf(inputs[0]->dataAt(i, h, j, w) / input_scale) * input_scale * (shadowWeight_.dataAt(0, 0, w, k) * weight_scale); - } +#if defined(__ARM_NEON) + shadow_vec_dot_fp32_arm(&sum, inputs[0]->ptrAt(i, h, j, 0), shadowTransposeWeight_.ptrAt(0, 0, k, 0), shadowTransposeWeight_.dimension(), input_scale, weight_scale); +#else + + for (int w = 0; w < shadowTransposeWeight_.dimension(); w++) { + sum += roundf(inputs[0]->dataAt(i, h, j, w) / input_scale) * input_scale * (shadowTransposeWeight_.dataAt(0, 0, k, w) * weight_scale); + } +#endif 
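// Shadow correction for clipped outputs (as read from the surrounding code): when the int8 result in
// inputs[1] saturated at -128/127, the statement below rebuilds that output element from the fp32
// activations and the transposed int8 shadow weights (sum), re-quantizes it with output_scale, and
// substitutes it for the clipped contribution (inputs[1] * output_scale) in the fp32 output that was
// copied from inputs[2].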
outputs[0]->setDataAt(i, h, j, k, inputs[2]->dataAt(i, h, j, k) - (inputs[1]->dataAt(i, h, j, k) * output_scale) + roundf(sum / output_scale) * output_scale); } } @@ -182,4 +261,11 @@ ErrorCode QNNLinearINT8Shadow::execute(vector> inputs, vector return MLLM_NO_ERROR; } +void QNNLinearINT8Shadow::shadow_vec_dot_fp32_arm(float* s, float* x, int8_t* y, int n, float input_scale, float weight_scale) { + + quantize_round_dequantize_row_i8(x, input_f32_buffer_.hostPtr(), n , input_scale); + dequantize_row_i8(y, weight_f32_buffer_.hostPtr(), n, weight_scale); + vec_dot_fp32(n, s, input_f32_buffer_.hostPtr(), weight_f32_buffer_.hostPtr()); +} + } // namespace mllm diff --git a/src/backends/qnn/op/QNNLinearINT8Shadow.hpp b/src/backends/qnn/op/QNNLinearINT8Shadow.hpp index 9d9f4de0..7ee479de 100644 --- a/src/backends/qnn/op/QNNLinearINT8Shadow.hpp +++ b/src/backends/qnn/op/QNNLinearINT8Shadow.hpp @@ -14,6 +14,8 @@ class QNNLinearINT8Shadow : public QNNCommonOp { virtual ErrorCode load(AbstructLoader &loader) override; virtual ErrorCode free(vector> inputs, vector> outputs) override; + void shadow_vec_dot_fp32_arm(float* s, float* x, int8_t* y, int n, float input_scale, float weight_scale); + private: int in_features_; int out_features_; @@ -29,6 +31,11 @@ class QNNLinearINT8Shadow : public QNNCommonOp { Tensor inputClip_; Tensor outputClip_; + // i16 for accuracy + Tensor weight_f32_buffer_; + Tensor input_f32_buffer_; + + }; class QNNLinearINT8ShadowCreator : public QNNBackend::Creator { diff --git a/src/backends/qnn/op/QNNMatmul.cpp b/src/backends/qnn/op/QNNMatmul.cpp index c6a4f71e..afd42e5a 100644 --- a/src/backends/qnn/op/QNNMatmul.cpp +++ b/src/backends/qnn/op/QNNMatmul.cpp @@ -57,7 +57,7 @@ ErrorCode QNNMatmul::reshape(vector> inputs, vectortransShape(SEQUENCE, DIMENSION); outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->dimension(), inputs[1]->dimension()); } - // outputs[0]->setDtype(activationDtype()); + return Op::reshape(inputs, outputs); } diff --git a/src/backends/qnn/op/QNNMergeOutput.cpp b/src/backends/qnn/op/QNNMergeOutput.cpp index de7ddac1..a9f1d379 100755 --- a/src/backends/qnn/op/QNNMergeOutput.cpp +++ b/src/backends/qnn/op/QNNMergeOutput.cpp @@ -20,7 +20,7 @@ ErrorCode QNNMergeOutput::reshape(vector> inputs, vectorreshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence() + inputs[1]->sequence() + inputs[2]->sequence(), inputs[0]->dimension()); else - outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), (inputs[0]->sequence() * 3 + inputs[3]->sequence()) * 4, inputs[0]->dimension()); + outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), (inputs[0]->sequence() * 3 + inputs[3]->sequence()), inputs[0]->dimension()); return Op::reshape(inputs, outputs); } diff --git a/src/backends/qnn/op/QNNMul.cpp b/src/backends/qnn/op/QNNMul.cpp index 9bc37818..fd38796c 100644 --- a/src/backends/qnn/op/QNNMul.cpp +++ b/src/backends/qnn/op/QNNMul.cpp @@ -20,6 +20,7 @@ ErrorCode QNNMul::reshape(vector> inputs, vector> inputs, vector> outputs) { + outputs[0]->setDtype(MLLM_TYPE_F32); auto outName = outputs[0]->name(); uint32_t dimensionsOutput[4]; @@ -29,13 +30,20 @@ ErrorCode QNNMul::setUp(vector> inputs, vector(outputs[0]->head()); dimensionsOutput[3] = static_cast(outputs[0]->dimension()); + auto type = QNN_DATATYPE_FLOAT_32; + + if (inputs[0]->dtype() == MLLM_TYPE_F16) { + type = QNN_DATATYPE_FLOAT_16; + outputs[0]->setDtype(MLLM_TYPE_F16); + } + vector outputTensor = {{QNN_TENSOR_VERSION_1, {.v1 = { .id = 0, .name = outName.c_str(), .type = 
getOutputTensorType(outputs[0]), .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, + .dataType = type, .quantizeParams = {QNN_DEFINITION_UNDEFINED, QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, diff --git a/src/backends/qnn/op/QNNQuantize.cpp b/src/backends/qnn/op/QNNQuantize.cpp index 0d5dc0a1..cbf4937e 100644 --- a/src/backends/qnn/op/QNNQuantize.cpp +++ b/src/backends/qnn/op/QNNQuantize.cpp @@ -20,6 +20,7 @@ ErrorCode QNNQuantize::reshape(vector> inputs, vector> inputs, vector> outputs) { + outputs[0]->setDtype(MLLM_TYPE_I8); auto outName = outputs[0]->name(); uint32_t dimensionsOutput[4]; diff --git a/src/backends/qnn/op/QNNRMSNorm.cpp b/src/backends/qnn/op/QNNRMSNorm.cpp index be98bfa7..708cc2bb 100644 --- a/src/backends/qnn/op/QNNRMSNorm.cpp +++ b/src/backends/qnn/op/QNNRMSNorm.cpp @@ -5,9 +5,10 @@ #include namespace mllm { -QNNRMSNorm::QNNRMSNorm(Backend *bn, string opName, int normSize, float epsilon) : - QNNCommonOp(bn, opName), normSize_(normSize), epsilon_(epsilon) { +QNNRMSNorm::QNNRMSNorm(Backend *bn, string opName, int normSize, float epsilon, bool isFP32) : + QNNCommonOp(bn, opName), normSize_(normSize), epsilon_(epsilon), isFP32_(isFP32) { weight_.setBackend(bn); + scale_.setBackend(bn); } ErrorCode QNNRMSNorm::reshape(vector> inputs, vector> outputs) { @@ -17,6 +18,10 @@ ErrorCode QNNRMSNorm::reshape(vector> inputs, vector> inputs, vector> outputs) { + float quantScale = 0; + quantScale = scale_.hostPtr()[0] / 127.0; + quantScale = roundf(quantScale * 100000) / 100000; + uint32_t dimWeight[4] = {(uint32_t)normSize_}; qnnBackend_->modelAddTensor(weight_.name(), (Qnn_Tensor_t){ .version = QNN_TENSOR_VERSION_1, @@ -39,25 +44,51 @@ ErrorCode QNNRMSNorm::setUp(vector> inputs, vectorbatch(), (uint32_t)outputs[0]->sequence(), (uint32_t)outputs[0]->head(), (uint32_t)outputs[0]->dimension()}; auto outName = outputs[0]->name(); - vector - out = { - (Qnn_Tensor_t){ - .version = QNN_TENSOR_VERSION_1, - .v1 = { - .id = 0, - .name = outName.c_str(), - .type = getOutputTensorType(outputs[0]), - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = 4, - .dimensions = dimOut, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, - .dataSize = 0}}}}; - return graphAddNode(name(), "RMSNorm", {inputs[0]->name(), weight_.name()}, out, {}, "LLaMAPackage"); + + if (isFP32_) { + outputs[0]->setDtype(MLLM_TYPE_F32); + vector + out = { + (Qnn_Tensor_t){ + .version = QNN_TENSOR_VERSION_1, + .v1 = { + .id = 0, + .name = outName.c_str(), + .type = getOutputTensorType(outputs[0]), + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = QNN_DATATYPE_FLOAT_32, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = 4, + .dimensions = dimOut, + .memType = QNN_TENSORMEMTYPE_RAW, + .clientBuf = {.data = nullptr, + .dataSize = 0}}}}; + return graphAddNode(name(), "RMSNorm", {inputs[0]->name(), weight_.name()}, out, {}, "LLaMAPackage"); + + } else { + outputs[0]->setDtype(MLLM_TYPE_I8); + vector + out = { + (Qnn_Tensor_t){ + .version = QNN_TENSOR_VERSION_1, + .v1 = { + .id = 0, + .name = outName.c_str(), + .type = getOutputTensorType(outputs[0]), + .dataFormat = 
QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = QNN_DATATYPE_SFIXED_POINT_8, + .quantizeParams = {QNN_DEFINITION_DEFINED, + QNN_QUANTIZATION_ENCODING_SCALE_OFFSET, + {.scaleOffsetEncoding = {.scale = quantScale, .offset = 0}}}, + .rank = 4, + .dimensions = dimOut, + .memType = QNN_TENSORMEMTYPE_RAW, + .clientBuf = {.data = nullptr, + .dataSize = 0}}}}; + return graphAddNode(name(), "RMSNorm", {inputs[0]->name(), weight_.name()}, out, {}, "LLaMAPackage"); + } } ErrorCode QNNRMSNorm::load(AbstructLoader &loader) { @@ -72,6 +103,21 @@ ErrorCode QNNRMSNorm::load(AbstructLoader &loader) { weight_.setDtype(MLLM_TYPE_F32); weight_.alloc(); } + + string scaleName = name(); + + std::string wordToRemove = "post_attention_layernorm"; + int pos = scaleName.find(wordToRemove); + if (pos != -1) { + scaleName.erase(pos, wordToRemove.length()); + } + + scale_.setName(scaleName + "mlp.up_proj.input_scale"); + scale_.reshape(1, 1, 1, 1); + scale_.setDtype(MLLM_TYPE_F32); + scale_.alloc(); + loader.load(&scale_); + return Op::load(loader); } } // namespace mllm diff --git a/src/backends/qnn/op/QNNRMSNorm.hpp b/src/backends/qnn/op/QNNRMSNorm.hpp index 29217184..b739ff70 100644 --- a/src/backends/qnn/op/QNNRMSNorm.hpp +++ b/src/backends/qnn/op/QNNRMSNorm.hpp @@ -6,7 +6,7 @@ namespace mllm { class QNNRMSNorm : public QNNCommonOp { public: - QNNRMSNorm(Backend *bn, string opName, int normSize, float epsilon = 1e-6); + QNNRMSNorm(Backend *bn, string opName, int normSize, float epsilon = 1e-6, bool isFP32 = true); virtual ~QNNRMSNorm() = default; virtual ErrorCode reshape(vector> inputs, vector> outputs) override; virtual ErrorCode setUp(vector> inputs, vector> outputs) override; @@ -17,6 +17,9 @@ class QNNRMSNorm : public QNNCommonOp { int axis_ = 1; Tensor weight_; int normSize_; + bool isFP32_; + + Tensor scale_; }; class QNNRMSNormCreator : public QNNBackend::Creator { @@ -24,7 +27,8 @@ class QNNRMSNormCreator : public QNNBackend::Creator { virtual Op *create(OpParam op_param, Backend *bn, string name) const override { int normSize = (int)op_param["norm_size"]; float epsilon = (float)op_param["epsilon"]; - return new QNNRMSNorm(bn, name, normSize, epsilon); + bool isFP32 = (float)op_param["isFP32"]; + return new QNNRMSNorm(bn, name, normSize, epsilon, isFP32); } }; diff --git a/src/backends/qnn/op/QNNSiLU.cpp b/src/backends/qnn/op/QNNSiLU.cpp index b6874ee9..7af12189 100644 --- a/src/backends/qnn/op/QNNSiLU.cpp +++ b/src/backends/qnn/op/QNNSiLU.cpp @@ -23,13 +23,23 @@ ErrorCode QNNSiLU::setUp(vector> inputs, vector(outputs[0]->head()); dimensionsOutput[3] = static_cast(outputs[0]->dimension()); + + auto type = QNN_DATATYPE_FLOAT_32; + outputs[0]->setDtype(MLLM_TYPE_F32); + + if (inputs[0]->dtype() == MLLM_TYPE_F16) { + type = QNN_DATATYPE_FLOAT_16; + outputs[0]->setDtype(MLLM_TYPE_F16); + } + + vector outputTensor = {{QNN_TENSOR_VERSION_1, {.v1 = { .id = 0, .name = outName.c_str(), .type = getOutputTensorType(outputs[0]), .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = QNN_DATATYPE_FLOAT_32, + .dataType = type, .quantizeParams = {QNN_DEFINITION_UNDEFINED, QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, diff --git a/src/backends/qnn/op/QNNSplitInput.cpp b/src/backends/qnn/op/QNNSplitInput.cpp index d66c8bcc..fdfcf91e 100755 --- a/src/backends/qnn/op/QNNSplitInput.cpp +++ b/src/backends/qnn/op/QNNSplitInput.cpp @@ -141,7 +141,7 @@ ErrorCode QNNSplitInput::setUp(vector> inputs, vector()[0] / 127.0; quantScale1 = roundf(quantScale1 * 100000) / 100000; - 
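// Note: quantScale1 above follows the per-tensor symmetric convention used across these QNN ops
// (see also QNNRMSNorm and QNNSuperSiLU): the stored scale tensor value divided by 127, then
// rounded to five decimal places.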
qnnBackend_->modelAddTensor(inputs[0]->name(), (Qnn_Tensor_t){ + qnnBackend_->modelAddTensor(name(), (Qnn_Tensor_t){ .version = QNN_TENSOR_VERSION_1, .v1 = { .id = 0, @@ -167,7 +167,7 @@ ErrorCode QNNSplitInput::setUp(vector> inputs, vector(outputs[1]->head()), static_cast(outputs[1]->dimension())}; - qnnBackend_->modelAddTensor(inputs[0]->name(), (Qnn_Tensor_t){ + qnnBackend_->modelAddTensor(name(), (Qnn_Tensor_t){ .version = QNN_TENSOR_VERSION_1, .v1 = { .id = 0, diff --git a/src/backends/qnn/op/QNNSuperSiLU.cpp b/src/backends/qnn/op/QNNSuperSiLU.cpp new file mode 100644 index 00000000..2e0dcc05 --- /dev/null +++ b/src/backends/qnn/op/QNNSuperSiLU.cpp @@ -0,0 +1,161 @@ + +#include "QNNSuperSiLU.hpp" +#include "Types.hpp" +#include "QNNCommonOp.hpp" + +namespace mllm { +QNNSuperSiLU::QNNSuperSiLU(Backend *bn, string opName) : + QNNCommonOp(bn, opName) { + + a_scale_.setBackend(bn); + b_scale_.setBackend(bn); + o_scale_.setBackend(bn); +} + +ErrorCode QNNSuperSiLU::reshape(vector> inputs, vector> outputs) { + outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), inputs[0]->dimension()); + return Op::reshape(inputs, outputs); +} + +ErrorCode QNNSuperSiLU::setUp(vector> inputs, vector> outputs) { + + auto outName = outputs[0]->name(); + + uint32_t dimensionsOutput[4]; + + + dimensionsOutput[0] = static_cast(outputs[0]->batch()); + dimensionsOutput[1] = static_cast(outputs[0]->sequence()); + dimensionsOutput[2] = static_cast(outputs[0]->head()); + dimensionsOutput[3] = static_cast(outputs[0]->dimension()); + + float aScale = 0; + aScale = a_scale_.hostPtr()[0] / 127.0; + aScale = roundf(aScale * 100000) / 100000; + + float bScale = 0; + bScale = b_scale_.hostPtr()[0] / 127.0; + bScale = roundf(bScale * 100000) / 100000; + + float oScale = 0; + oScale = o_scale_.hostPtr()[0] / 127.0; + oScale = roundf(oScale * 100000) / 100000; + + auto paramsSuperSiLuNameA = name() + ".supersilu_params.a_scale"; + auto paramsSuperSiLuNameB = name() + ".supersilu_params.b_scale"; + auto paramsSuperSiLuNameO = name() + ".supersilu_params.o_scale"; + + uint32_t paramsSuperSiLuDimension[1] = {1}; + + vector paramsSuperSiLu = { + {.paramType = QNN_PARAMTYPE_TENSOR, + .name = "a_scale", + {.tensorParam = + (Qnn_Tensor_t){.version = QNN_TENSOR_VERSION_1, + {.v1 = { + .id = 0, + .name = paramsSuperSiLuNameA.c_str(), + .type = QNN_TENSOR_TYPE_STATIC, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = QNN_DATATYPE_FLOAT_32, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = 1, + .dimensions = paramsSuperSiLuDimension, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = (uint8_t *)&aScale, + .dataSize = sizeof(float)}}}}}}}, + {.paramType = QNN_PARAMTYPE_TENSOR, + .name = "b_scale", + {.tensorParam = + (Qnn_Tensor_t){.version = QNN_TENSOR_VERSION_1, + {.v1 = { + .id = 0, + .name = paramsSuperSiLuNameB.c_str(), + .type = QNN_TENSOR_TYPE_STATIC, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = QNN_DATATYPE_FLOAT_32, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = 1, + .dimensions = paramsSuperSiLuDimension, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = (uint8_t *)&bScale, + .dataSize = sizeof(float)}}}}}}}, + {.paramType = QNN_PARAMTYPE_TENSOR, + .name = "o_scale", + {.tensorParam = + 
(Qnn_Tensor_t){.version = QNN_TENSOR_VERSION_1, + {.v1 = { + .id = 0, + .name = paramsSuperSiLuNameO.c_str(), + .type = QNN_TENSOR_TYPE_STATIC, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = QNN_DATATYPE_FLOAT_32, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, + .offset = 0}}}, + .rank = 1, + .dimensions = paramsSuperSiLuDimension, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = (uint8_t *)&oScale, + .dataSize = sizeof(float)}}}}}}}, + }; + + + vector outputTensor = {{QNN_TENSOR_VERSION_1, + {.v1 = { + .id = 0, + .name = outName.c_str(), + .type = getOutputTensorType(outputs[0]), + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = QNN_DATATYPE_SFIXED_POINT_8, + .quantizeParams = {QNN_DEFINITION_DEFINED, + QNN_QUANTIZATION_ENCODING_SCALE_OFFSET, + {.scaleOffsetEncoding = {.scale = oScale, + .offset = 0}}}, + .rank = 4, + .dimensions = dimensionsOutput, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, + .dataSize = 0}}}}}}; + return graphAddNode(name(), "LLaMASuperSiLU", {inputs[0]->name(), inputs[1]->name()}, outputTensor, paramsSuperSiLu, "LLaMAPackage"); +} + +ErrorCode QNNSuperSiLU::load(AbstructLoader &loader) { + string opName = name(); + std::string wordToRemove = ".supersilu"; + + int pos = opName.find(wordToRemove); + if (pos != -1) { + opName.erase(pos, wordToRemove.length()); + } + + a_scale_.setName(opName + ".gate_proj.output_scale"); + a_scale_.reshape(1, 1, 1, 1); + a_scale_.setDtype(MLLM_TYPE_F32); + a_scale_.alloc(); + loader.load(&a_scale_); + + b_scale_.setName(opName + ".up_proj.output_scale"); + b_scale_.reshape(1, 1, 1, 1); + b_scale_.setDtype(MLLM_TYPE_F32); + b_scale_.alloc(); + loader.load(&b_scale_); + + o_scale_.setName(opName + ".down_proj.input_scale"); + o_scale_.reshape(1, 1, 1, 1); + o_scale_.setDtype(MLLM_TYPE_F32); + o_scale_.alloc(); + loader.load(&o_scale_); + + return Op::load(loader); +} + +} // namespace mllm + diff --git a/src/backends/qnn/op/QNNSuperSiLU.hpp b/src/backends/qnn/op/QNNSuperSiLU.hpp new file mode 100644 index 00000000..26b4165d --- /dev/null +++ b/src/backends/qnn/op/QNNSuperSiLU.hpp @@ -0,0 +1,30 @@ + +#ifndef MLLM_QNNSUPERSILU_H +#define MLLM_QNNSUPERSILU_H + +#include "QNNCommonOp.hpp" +namespace mllm { +class QNNSuperSiLU : public QNNCommonOp { +public: + QNNSuperSiLU(Backend *bn, string opName); + virtual ~QNNSuperSiLU() = default; + virtual ErrorCode reshape(vector> inputs, vector> outputs) override; + virtual ErrorCode setUp(vector> inputs, vector> outputs) override; + virtual ErrorCode load(AbstructLoader &loader) override; + + Tensor a_scale_; + Tensor b_scale_; + Tensor o_scale_; +}; + + +class QNNSuperSiLUCreator : public QNNBackend::Creator { + + virtual Op *create(OpParam op_param, Backend *bn, string name) const { + return new QNNSuperSiLU(bn, name); + } +}; + +} // namespace mllm + +#endif diff --git a/src/backends/qnn/op/QNNTranspose.cpp b/src/backends/qnn/op/QNNTranspose.cpp index a20c0e19..94b6c277 100644 --- a/src/backends/qnn/op/QNNTranspose.cpp +++ b/src/backends/qnn/op/QNNTranspose.cpp @@ -13,13 +13,16 @@ QNNTranspose::QNNTranspose(Backend *bn, int perm0, int perm1, int perm2, int per } ErrorCode QNNTranspose::reshape(vector> inputs, vector> outputs) { - if (perm[0] == 0 && perm[1] == 2 && perm[2] == 3 && perm[3] == 1) - outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->dimension(), inputs[0]->sequence()); + // if (perm[0] == 0 && 
perm[1] == 2 && perm[2] == 3 && perm[3] == 1) + // outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->dimension(), inputs[0]->sequence()); + outputs[0]->reshape(inputs[0]->batch(), inputs[0]->head(), inputs[0]->sequence(), inputs[0]->dimension()); + outputs[0]->transShape(SEQUENCE, DIMENSION); return Op::reshape(inputs, outputs); } ErrorCode QNNTranspose::setUp(vector> inputs, vector> outputs) { + #ifdef OLD_QNN if (getOutputTensorType(outputs[0]) == QNN_TENSOR_TYPE_APP_READ) { outputs[0]->setBackend(qnnBackend_); outputs[0]->setDtype(MLLM_TYPE_F32); @@ -27,6 +30,7 @@ ErrorCode QNNTranspose::setUp(vector> inputs, vectorpushOutputBuffers(outputs[0]->hostPtr()); } + #endif uint32_t transposeParamsDimension[4] = {4}; @@ -55,9 +59,15 @@ ErrorCode QNNTranspose::setUp(vector> inputs, vectorbatch(); dimVTranspose[1] = outputs[0]->head(); - dimVTranspose[2] = outputs[0]->sequence(); - dimVTranspose[3] = outputs[0]->dimension(); + dimVTranspose[2] = outputs[0]->dimension(); + dimVTranspose[3] = outputs[0]->sequence(); + auto type = QNN_DATATYPE_FLOAT_32; + + if (inputs[0]->dtype() == MLLM_TYPE_F16) { + type = QNN_DATATYPE_FLOAT_16; + } + auto outVTransposeName = outputs[0]->name(); vector outVTranspose = { (Qnn_Tensor_t){ @@ -67,7 +77,7 @@ ErrorCode QNNTranspose::setUp(vector> inputs, vector> inputs, vectorreshape(dim0, dim1, dim2, dim3); @@ -80,6 +80,8 @@ ErrorCode QNNView::reshape(vector> inputs, vector> inputs, vector> outputs) { + outputs[0]->setDtype(inputs[0]->dtype()); + #ifdef OLD_QNN if (getOutputTensorType(outputs[0]) == QNN_TENSOR_TYPE_APP_READ) { outputs[0]->setBackend(qnnBackend_); outputs[0]->setDtype(MLLM_TYPE_I8); @@ -87,6 +89,7 @@ ErrorCode QNNView::setUp(vector> inputs, vectorpushOutputBuffers(outputs[0]->hostPtr()); } + #endif if (outputs[0]->dtype() == MLLM_TYPE_I8) return graphAddNode(name(), "Reshape", inputs, outputs, {}, "qti.aisw", true, &scale_); @@ -190,6 +193,13 @@ ErrorCode QNNView::load(AbstructLoader &loader) { scale_type_name = ".input_scale"; } + wordToRemove = ".post_attention_layernorm"; + pos = scaleName.find(wordToRemove); + if (pos != -1) { + scaleName.erase(pos, wordToRemove.length()); + scale_type_name = ".mlp.up_proj.input_scale"; + } + scale_.setName(scaleName + scale_type_name); scale_.reshape(1, 1, 1, 1); scale_.setDtype(MLLM_TYPE_F32); diff --git a/src/express/Express.cpp b/src/express/Express.cpp index 5461a345..41905747 100644 --- a/src/express/Express.cpp +++ b/src/express/Express.cpp @@ -248,6 +248,23 @@ NetTensor *_SiLU(std::vector inputs, string name) { out_tensor->ctx = ctx; return out_tensor; } + +NetTensor *_SuperSiLU(std::vector inputs, string name) { + Context *ctx = inputs[0]->ctx; + NetTensor *out_tensor = new NetTensor(); + if (name.empty()) { + name = "Silu" + std::to_string(ctx->idx); + } + out_tensor->name = "outtensor-" + name + "-00"; + out_tensor->type = inputs[0]->type; + ctx->idx++; + _STORE_OUT_TENSOR + _NEW_OP(mllm::SUPERSILU) + _UPDATE_INPUT_TENSORS + out_tensor->in = net_op_; + out_tensor->ctx = ctx; + return out_tensor; +} NetTensor *_Quantize(std::vector inputs, bool isNSHD, string name) { Context *ctx = inputs[0]->ctx; NetTensor *out_tensor = new NetTensor(); @@ -333,19 +350,23 @@ NetTensor *_Matmul(std::vector inputs, bool transpose0, bool transp * \param norm_size The size of the normed dimension. * \param epsilon Default is 1e-6. 
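 * \param isFP32 Default is true. When false, the norm output is marked MLLM_TYPE_I8 and the QNN op
 *               emits an int8 tensor quantized with the downstream mlp.up_proj input scale.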
*/ -NetTensor *_RMSNorm(std::vector inputs, int norm_size, float epsilon, string name) { +NetTensor *_RMSNorm(std::vector inputs, int norm_size, float epsilon, string name, bool isFP32) { Context *ctx = inputs[0]->ctx; NetTensor *out_tensor = new NetTensor(); if (name.empty()) { name = "RMSNorm" + std::to_string(ctx->idx); } out_tensor->name = "outtensor-" + name + "-00"; - out_tensor->type = inputs[0]->type; + if (isFP32) + out_tensor->type = inputs[0]->type; + else + out_tensor->type = MLLM_TYPE_I8; ctx->idx++; _STORE_OUT_TENSOR _NEW_OP(mllm::RMSNORM) net_op_->param["norm_size"] = (float) norm_size; net_op_->param["epsilon"] = (float) epsilon; + net_op_->param["isFP32"] = (float) isFP32; _UPDATE_INPUT_TENSORS out_tensor->in = net_op_; out_tensor->ctx = ctx; diff --git a/src/express/Express.hpp b/src/express/Express.hpp index 7d062227..cf582470 100644 --- a/src/express/Express.hpp +++ b/src/express/Express.hpp @@ -16,11 +16,12 @@ NetTensor *_Range(Context *ctx, std::vector inputs, int start, int NetTensor *_Add(std::vector inputs, string name = ""); NetTensor *_Causalmask(std::vector inputs, string name = ""); NetTensor *_SiLU(std::vector inputs, string name = ""); +NetTensor *_SuperSiLU(std::vector inputs, string name = ""); NetTensor *_Quantize(std::vector inputs, bool isNSHD = true, string name = ""); NetTensor *_Dequantize(std::vector inputs, bool isNSHD = true, string name = "", bool isFP32 = true); NetTensor *_Softmax(std::vector inputs, int axis, int do_causal_mask, string name = ""); NetTensor *_Matmul(std::vector inputs, bool transpose0, bool transpose1, string name = ""); -NetTensor *_RMSNorm(std::vector inputs, int norm_size, float epsilon= 1e-6, string name = ""); +NetTensor *_RMSNorm(std::vector inputs, int norm_size, float epsilon= 1e-6, string name = "", bool isFP32 = true); NetTensor *_RoPE(std::vector inputs, int pose_type, string name = "", int rope_theta = 10000, int max_position_embeddings = 16384); NetTensor *_PositionalEmbedding(std::vector inputs, int max_num, int hidden_dim, string name = ""); NetTensor *_Scale(std::vector inputs, float scale, float bias, bool bias_after_scale, string name); diff --git a/src/models/qwen/modeling_qwen_npu.hpp b/src/models/qwen/modeling_qwen_npu.hpp index 221a1786..18d294f8 100644 --- a/src/models/qwen/modeling_qwen_npu.hpp +++ b/src/models/qwen/modeling_qwen_npu.hpp @@ -1,5 +1,5 @@ -#ifndef MODELING_QWEN_HPP -#define MODELING_QWEN_HPP +#ifndef MODELING_QWENNPU_HPP +#define MODELING_QWENNPU_HPP #include "Backend.hpp" #include "Layer.hpp" @@ -7,8 +7,7 @@ #include "Tensor.hpp" #include "Types.hpp" #include "configuration_qwen.hpp" -#include -#include + using namespace mllm; // NPU QKV part @@ -18,10 +17,22 @@ class QwenDecoderNPUPart1 final : public Module { int head_dim; int num_key_value_heads; int num_key_value_groups; + + // it is for speed up the QNN linear implemented by conv, TODO: should integrate into QNNLinear + Layer pre_attn_view; + Layer q_proj; Layer k_proj; Layer v_proj; - Layer o_proj; + + Layer q_view; + Layer k_view; + Layer v_view; + + Layer q_dequant; + Layer k_dequant; + Layer v_dequant; + Layer v_transpose; public: QwenDecoderNPUPart1() = default; @@ -32,18 +43,39 @@ class QwenDecoderNPUPart1 final : public Module { num_key_value_heads = config.num_key_value_heads; num_key_value_groups = num_heads / num_key_value_heads; - q_proj = Linear(hidden_size, num_heads * head_dim, true, base_name + names._attn_base_name + names._q_proj_name); - k_proj = Linear(hidden_size, num_key_value_heads * head_dim, true, base_name + 
names._attn_base_name + names._k_proj_name); - v_proj = Linear(hidden_size, num_key_value_heads * head_dim, true, base_name + names._attn_base_name + names._v_proj_name); - o_proj = Linear(num_heads * head_dim, hidden_size, false, base_name + names._attn_base_name + names._o_proj_name); + pre_attn_view = View(-1, 1, -1, num_heads * head_dim, base_name + "ires_split-00_view_"); + + q_proj = Linear(hidden_size, num_heads * head_dim, true, base_name + names._q_proj_name); + k_proj = Linear(hidden_size, num_key_value_heads * head_dim, true, base_name + names._k_proj_name); + v_proj = Linear(hidden_size, num_key_value_heads * head_dim, true, base_name + names._v_proj_name); + + q_view = View(-1, num_heads, -1, head_dim, base_name + names._q_proj_name + "-00_view_"); + k_view = View(-1, num_heads, -1, head_dim, base_name + names._k_proj_name + "-00_view_"); + v_view = View(-1, num_heads, -1, head_dim, base_name + names._v_proj_name + "-00_view_"); + + q_dequant = Dequantize(true, base_name + names._q_proj_name + ".dequantize"); + k_dequant = Dequantize(true, base_name + names._k_proj_name + ".dequantize", false); + v_dequant = Dequantize(true, base_name + names._v_proj_name + ".dequantize", false); + + v_transpose = Transpose({0, 2, 3, 1}, base_name + names._v_proj_name + ".transpose"); } vector Forward(vector inputs, vector args) override { - auto query_states = q_proj(inputs[0]); - auto key_states = k_proj(inputs[0]); - auto value_states = v_proj(inputs[0]); + auto x = pre_attn_view(inputs[0]); + + auto query_states = q_proj(x); + auto key_states = k_proj(x); + auto value_states = v_proj(x); + + query_states = q_view(query_states); + key_states = k_view(key_states); + value_states = v_view(value_states); + + query_states = q_dequant(query_states); + key_states = k_dequant(key_states); + value_states = v_dequant(value_states); - value_states = value_states.transpose(SEQUENCE, DIMENSION); + value_states = v_transpose(value_states); return {query_states, key_states, value_states}; } }; @@ -55,6 +87,9 @@ class QwenQKVmm final : public Module { Layer k_rope; Layer k_cache; Layer v_cache; + Layer qk_mm; + Layer qkv_mm; + Layer o_quantize; int hidden_size; int num_heads; @@ -71,10 +106,15 @@ class QwenQKVmm final : public Module { q_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "q_rope"); k_rope = RoPE(config.RoPE_type, config.rope_theta, config.max_position_embeddings, base_name + "k_rope"); - k_cache = KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + names._attn_base_name + "k_cache"); - v_cache = KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + names._attn_base_name + "v_cache"); + k_cache = KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "k_cache", true); + v_cache = KVCache(config.num_attention_heads / config.num_key_value_heads, config.cache_limit, base_name + "v_cache", true); + + qk_mm = Matmul(false, true, base_name + "qk"); + qkv_mm = Matmul(false, false, base_name + "qkv"); softmax = Softmax(DIMENSION, true, base_name + "softmax"); + + o_quantize = Quantize(true, base_name + names._o_proj_name + ".quantize"); } vector Forward(vector inputs, vector args) override { @@ -82,24 +122,20 @@ class QwenQKVmm final : public Module { auto k = inputs[1]; auto v = inputs[2]; - q = q.view(-1, num_heads, -1, head_dim); - k = k.view(-1, num_heads, -1, head_dim); - v = v.view(-1, num_heads, -1, head_dim); - q = 
q_rope(q); k = k_rope(k); - if (k_cache.ready() && v_cache.ready()) { - k = k_cache(k); - v = v_cache(v); - } + k = k_cache(k); + v = v_cache(v); - k = k.transpose(SEQUENCE, DIMENSION); - auto qk = Tensor::mm(q, k); - qk = qk / std::sqrt(hidden_size); + // auto qk = qk_mm(q, k); + auto qk = Tensor::mm(q, k.transpose(Chl::SEQUENCE, Chl::DIMENSION)); qk = softmax(qk); + // auto o = qkv_mm(qk, v); auto o = Tensor::mm(qk, v); + o = o_quantize(o); + return {o}; } }; @@ -113,12 +149,30 @@ class QwenDecoderNPUPart2 final : public Module { int num_key_value_groups; int intermediate_size; + // NPU part2 of attention + Layer pre_oproj_view; Layer out_proj; + Layer post_oproj_view; + Layer post_oproj_dequantize; + + // NPU mlp + Layer pre_mlp_quantize; + Layer pre_mlp_view; Layer gate_proj; Layer up_proj; - Layer down_proj; + Layer post_up_proj_dequantize; + Layer post_gate_proj_dequantize; Layer silu; - Layer post_attention_layernorm; + Layer post_attn_layernorm; + + Layer down_proj; + Layer pre_down_proj_quantize; + Layer post_down_proj_dequantize; + Layer post_mlp_view; + + Layer post_atten_res_add; + Layer post_mlp_res_add; + Layer mlp_mul; public: QwenDecoderNPUPart2() = default; @@ -130,31 +184,181 @@ class QwenDecoderNPUPart2 final : public Module { num_key_value_heads = config.num_key_value_heads; num_key_value_groups = num_heads / num_key_value_heads; - out_proj = Linear(hidden_size, hidden_size, false, base_name + names._o_proj_name); - gate_proj = Linear(hidden_size, intermediate_size, false, base_name + names._gate_proj_name); - silu = SiLU(base_name + "act"); - up_proj = Linear(hidden_size, intermediate_size, false, base_name + names._up_proj_name); - down_proj = Linear(intermediate_size, hidden_size, false, base_name + names._down_proj_name); - post_attention_layernorm = - RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._ffn_norm_name); + // for QNN linear speed up + pre_oproj_view = View(1, 2, 32, head_dim * num_heads, base_name + names._attn_base_name + "or_split-00_view_"); + out_proj = Linear(hidden_size, hidden_size, false, base_name + names._attn_base_name + names._o_proj_name); + post_oproj_dequantize = Dequantize(true, base_name + names._attn_base_name + names._o_proj_name + ".dequantize"); + post_oproj_view = View(1, 1, 64, hidden_size, base_name + names._attn_base_name + names._o_proj_name + ".dequantize-00_view_"); + post_atten_res_add = Add(base_name + names._attn_base_name + "post_atten_add"); + + post_attn_layernorm = + RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._ffn_norm_name, false); + + auto mlp_base_name = base_name + names._ffn_base_name; + pre_mlp_quantize = Quantize(true, mlp_base_name + names._up_proj_name + ".quantize"); + pre_mlp_view = View(1, 2, 32, hidden_size, mlp_base_name + names._up_proj_name + ".quantize-00_view_"); + gate_proj = Linear(hidden_size, intermediate_size, false, mlp_base_name + names._gate_proj_name); + silu = SiLU(mlp_base_name + "act"); + up_proj = Linear(hidden_size, intermediate_size, false, mlp_base_name + names._up_proj_name); + post_up_proj_dequantize = Dequantize(true, mlp_base_name + names._up_proj_name + ".dequantize", false); + post_gate_proj_dequantize = Dequantize(true, mlp_base_name + names._gate_proj_name + ".dequantize", false); + + down_proj = Linear(intermediate_size, hidden_size, false, mlp_base_name + names._down_proj_name); + pre_down_proj_quantize = Quantize(true, mlp_base_name + names._down_proj_name + ".quantize"); + post_down_proj_dequantize = Dequantize(true, mlp_base_name + 
names._down_proj_name + ".dequantize"); + post_mlp_view = View(1, 1, 64, hidden_size, mlp_base_name + names._down_proj_name + ".dequantize-00_view_"); + + mlp_mul = Mul(mlp_base_name + "mul"); + post_mlp_res_add = Add(mlp_base_name + "res_add"); } std::vector Forward(std::vector inputs, std::vector args) override { auto atten_output = inputs[0]; auto res = inputs[1]; - atten_output = atten_output.view(-1, 1, -1, head_dim * num_heads); + atten_output = pre_oproj_view(atten_output); atten_output = out_proj(atten_output); + atten_output = post_oproj_dequantize(atten_output); + atten_output = post_oproj_view(atten_output); - auto tmp = atten_output + res; - auto x = post_attention_layernorm(tmp); - x = gate_proj(x); - x = silu(x); - auto y = up_proj(tmp); - x = x * y; - x = down_proj(x); - x = x + tmp; - return {x}; + auto tmp = post_atten_res_add(atten_output, res); + + auto x = post_attn_layernorm(tmp); + + // x = pre_mlp_quantize(x); + // reshape to 32,2 + x = pre_mlp_view(x); + + auto gate_out = gate_proj(x); + auto up_out = up_proj(x); + + gate_out = post_gate_proj_dequantize(gate_out); + gate_out = silu(gate_out); + + up_out = post_up_proj_dequantize(up_out); + gate_out = mlp_mul(gate_out, up_out); + + gate_out = pre_down_proj_quantize(gate_out); + gate_out = down_proj(gate_out); + gate_out = post_down_proj_dequantize(gate_out); + + // reshape to 64,1 + gate_out = post_mlp_view(gate_out); + + gate_out = post_mlp_res_add(gate_out, tmp); + return {gate_out}; + } +}; + +class QwenDecoderNPUPart2WithShadow final : public Module { + int hidden_size; + int num_heads; + int head_dim; + int num_key_value_heads; + int num_key_value_groups; + int intermediate_size; + + // NPU part2 of attention + Layer pre_oproj_view; + Layer out_proj; + Layer post_oproj_view; + Layer post_oproj_dequantize; + + // NPU mlp + Layer pre_mlp_quantize; + Layer pre_mlp_view; + Layer gate_proj; + Layer up_proj; + Layer post_up_proj_dequantize; + Layer post_gate_proj_dequantize; + Layer silu; + Layer post_attn_layernorm; + + Layer down_proj; + Layer pre_down_proj_quantize; + Layer post_down_proj_dequantize; + Layer post_mlp_view; + + Layer post_atten_res_add; + Layer post_mlp_res_add; + Layer mlp_mul; + +public: + QwenDecoderNPUPart2WithShadow() = default; + QwenDecoderNPUPart2WithShadow(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) { + hidden_size = config.hidden_size; + num_heads = config.num_attention_heads; + head_dim = config.hidden_size / num_heads; + intermediate_size = config.intermediate_size; + num_key_value_heads = config.num_key_value_heads; + num_key_value_groups = num_heads / num_key_value_heads; + + // for QNN linear speed up + pre_oproj_view = View(1, 2, 32, head_dim * num_heads, base_name + names._attn_base_name + "or_split-00_view_"); + out_proj = Linear(hidden_size, hidden_size, false, base_name + names._attn_base_name + names._o_proj_name); + post_oproj_dequantize = Dequantize(true, base_name + names._attn_base_name + names._o_proj_name + ".dequantize"); + post_oproj_view = View(1, 1, 64, hidden_size, base_name + names._attn_base_name + names._o_proj_name + ".dequantize-00_view_"); + post_atten_res_add = Add(base_name + names._attn_base_name + "post_atten_add"); + + post_attn_layernorm = + RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._ffn_norm_name, false); + + auto mlp_base_name = base_name + names._ffn_base_name; + pre_mlp_quantize = Quantize(true, mlp_base_name + names._up_proj_name + ".quantize"); + pre_mlp_view = View(1, 2, 32, hidden_size, 
+        gate_proj = Linear(hidden_size, intermediate_size, false, mlp_base_name + names._gate_proj_name);
+        silu = SiLU(mlp_base_name + "act");
+        up_proj = Linear(hidden_size, intermediate_size, false, mlp_base_name + names._up_proj_name);
+        post_up_proj_dequantize = Dequantize(true, mlp_base_name + names._up_proj_name + ".dequantize", false);
+        post_gate_proj_dequantize = Dequantize(true, mlp_base_name + names._gate_proj_name + ".dequantize", false);
+
+        down_proj = Linear(intermediate_size, hidden_size, false, mlp_base_name + names._down_proj_name);
+        pre_down_proj_quantize = Quantize(true, mlp_base_name + names._down_proj_name + ".quantize");
+        post_down_proj_dequantize = Dequantize(true, mlp_base_name + names._down_proj_name + ".dequantize");
+        post_mlp_view = View(1, 1, 64, hidden_size, mlp_base_name + names._down_proj_name + ".dequantize-00_view_");
+
+        mlp_mul = Mul(mlp_base_name + "mul");
+        post_mlp_res_add = Add(mlp_base_name + "res_add");
+    }
+
+    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
+        auto atten_output = inputs[0];
+        auto res = inputs[1];
+
+        atten_output = pre_oproj_view(atten_output);
+        atten_output = out_proj(atten_output);
+        atten_output = post_oproj_dequantize(atten_output);
+        atten_output = post_oproj_view(atten_output);
+
+        auto tmp = post_atten_res_add(atten_output, res);
+
+        auto x = post_attn_layernorm(tmp);
+
+        // x = pre_mlp_quantize(x);
+        // reshape to 32,2
+        x = pre_mlp_view(x);
+
+        auto gate_out = gate_proj(x);
+        auto up_out = up_proj(x);
+
+        gate_out = post_gate_proj_dequantize(gate_out);
+        gate_out = silu(gate_out);
+
+        up_out = post_up_proj_dequantize(up_out);
+        gate_out = mlp_mul(gate_out, up_out);
+
+        auto shadow_input_1 = gate_out;
+
+        gate_out = pre_down_proj_quantize(gate_out);
+        gate_out = down_proj(gate_out);
+        auto shadow_input_2 = gate_out;
+        gate_out = post_down_proj_dequantize(gate_out);
+
+        // reshape to 64,1
+        gate_out = post_mlp_view(gate_out);
+
+        gate_out = post_mlp_res_add(gate_out, tmp);
+        return {shadow_input_1, shadow_input_2, gate_out};
+    }
+};

@@ -166,6 +370,7 @@ class QwenNPU_CPUDecoder final : public Module {
     int num_key_value_groups;

     Layer input_layernorm;
+    Layer pre_attn_quantize;
     QwenDecoderNPUPart1 part1;
     QwenQKVmm qkv_mm;
     QwenDecoderNPUPart2 part2;
@@ -180,11 +385,12 @@ class QwenNPU_CPUDecoder final : public Module {
         num_key_value_groups = num_heads / num_key_value_heads;

         input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._attn_norm_name);
+        pre_attn_quantize = Quantize(true, base_name + names._attn_base_name + names._q_proj_name + ".quantize");

-        part1 = QwenDecoderNPUPart1(config, names, base_name);
+        part1 = QwenDecoderNPUPart1(config, names, base_name + names._attn_base_name);
         part1.to(MLLM_QNN);

-        qkv_mm = QwenQKVmm(config, names, base_name);
+        qkv_mm = QwenQKVmm(config, names, base_name + names._attn_base_name);
         qkv_mm.to(MLLM_CPU);

         part2 = QwenDecoderNPUPart2(config, names, base_name);
@@ -193,7 +399,8 @@ class QwenNPU_CPUDecoder final : public Module {
     vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
         auto x = input_layernorm(inputs[0]);
-        // TODO: quantize x to int8
+        x = pre_attn_quantize(x);
+
         if (x.device() != MLLM_QNN) {
             x = Tensor::toQNN({x})[0];
         }
@@ -201,45 +408,139 @@ class QwenNPU_CPUDecoder final : public Module {
         auto q_k_v = part1({x}); // q,k,v
         auto o_x = qkv_mm(q_k_v)[0];

-        o_x = Tensor::toQNN({o_x})[0];
+        if (o_x.device() != MLLM_QNN) {
+            o_x = Tensor::toQNN({o_x})[0];
+        }
+        if (inputs[0].device() != MLLM_QNN) {
+            inputs[0] = Tensor::toQNN({inputs[0]})[0];
+        }
         x = part2({o_x, inputs[0]})[0];

         return {x};
     }
 };

+class QwenNPU_CPUDecoderWithShadow final : public Module {
+    int hidden_size;
+    int num_heads;
+    int head_dim;
+    int num_key_value_heads;
+    int num_key_value_groups;
+
+    Layer input_layernorm;
+    Layer pre_attn_quantize;
+    Layer shadow_linear;
+    QwenDecoderNPUPart1 part1;
+    QwenQKVmm qkv_mm;
+    QwenDecoderNPUPart2WithShadow part2;
+
+public:
+    QwenNPU_CPUDecoderWithShadow() = default;
+    QwenNPU_CPUDecoderWithShadow(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
+        hidden_size = config.hidden_size;
+        num_heads = config.num_attention_heads;
+        head_dim = config.hidden_size / num_heads;
+        num_key_value_heads = config.num_key_value_heads;
+        num_key_value_groups = num_heads / num_key_value_heads;
+
+        input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._attn_norm_name);
+        pre_attn_quantize = Quantize(true, base_name + names._attn_base_name + names._q_proj_name + ".quantize");
+
+        part1 = QwenDecoderNPUPart1(config, names, base_name + names._attn_base_name);
+        part1.to(MLLM_QNN);
+
+        qkv_mm = QwenQKVmm(config, names, base_name + names._attn_base_name);
+        qkv_mm.to(MLLM_CPU);
+
+        part2 = QwenDecoderNPUPart2WithShadow(config, names, base_name);
+        part2.to(MLLM_QNN);
+
+        shadow_linear = ShadowLinear(config.intermediate_size, hidden_size, false, base_name + names._ffn_base_name + names._down_proj_name + ".shadow");
+    }
+
+    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
+        auto x = input_layernorm(inputs[0]);
+        x = pre_attn_quantize(x);
+
+        if (x.device() != MLLM_QNN) {
+            x = Tensor::toQNN({x})[0];
+        }
+
+        auto q_k_v = part1({x}); // q,k,v
+        auto o_x = qkv_mm(q_k_v)[0];
+
+        if (o_x.device() != MLLM_QNN) {
+            o_x = Tensor::toQNN({o_x})[0];
+        }
+        if (inputs[0].device() != MLLM_QNN) {
+            inputs[0] = Tensor::toQNN({inputs[0]})[0];
+        }
+        auto decoder_out = part2({o_x, inputs[0]});
+        if (decoder_out[0].device() != MLLM_CPU) {
+            decoder_out = Tensor::toCPU(decoder_out);
+        }
+        auto shadow_input_1 = decoder_out[0];
+        auto shadow_input_2 = decoder_out[1];
+        x = decoder_out[2];
+        x = shadow_linear(shadow_input_1, shadow_input_2, x);
+
+        return {x};
+    }
+};
+
 // Copied from GemmaModel with Gemma->Qwen and set RmsNorm(without add_unit_offset)
-class QWenModel final : public Module {
+class QWenModel_NPU final : public Module {
+    template <typename T1, typename SHADOW, typename... Args>
+    static vector<unique_ptr<Module>> ListWithShadow(int n, Args &&...args) {
+        static_assert(std::is_base_of<Module, T1>::value, "T1 must be a subclass of Module");
+        static_assert(std::is_base_of<Module, SHADOW>::value, "SHADOW must be a subclass of Module");
+        listIdx = 0;
+        vector<unique_ptr<Module>> modules;
+        std::set<int> shadowLayers = {1, 2, 6};
+        // for index in shadowLayers, create shadow decoder, for others, create normal decoder
+        for (int i = 0; i < n; i++) {
+            auto new_args = change_last(args...); // build a new argument pack whose last element becomes the original value + std::to_string(listIdx) + "."
+            if (shadowLayers.find(listIdx) != shadowLayers.end()) {
+                modules.push_back(std::make_unique<SHADOW>(std::apply([&](auto &&...args) { return SHADOW(std::forward<decltype(args)>(args)...); }, new_args)));
+            } else {
+                modules.push_back(std::make_unique<T1>(std::apply([&](auto &&...args) { return T1(std::forward<decltype(args)>(args)...); }, new_args)));
+            }
+            listIdx++;
+        }
+        listIdx = 0;
+        return modules;
+    }
+
 public:
-    QWenModel() = default;
-    QWenModel(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
-        // TODO: only one block, change it to config.num_hidden_layers
-        blocks = List<QwenNPU_CPUDecoder>(1, config, names, base_name);
+    QWenModel_NPU() = default;
+    QWenModel_NPU(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
+        // blocks = List<QwenNPU_CPUDecoder>(1, config, names, base_name);
+        blocks = ListWithShadow<QwenNPU_CPUDecoder, QwenNPU_CPUDecoderWithShadow>(24, config, names, base_name);
         norm = RMSNorm(config.hidden_size, config.rms_norm_eps, names.post_norm_name);
     }

     std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
         auto x = inputs[0];
         for (auto &block : blocks) {
-            x = block({x})[0];
+            x = (*block)({x})[0];
         }
         x = norm(x);
         return {x};
     }

private:
-    std::vector<QwenNPU_CPUDecoder> blocks;
+    std::vector<unique_ptr<Module>> blocks;
     Layer norm;
 };

-class QWenForCausalLM final : public Module {
+class QWenForCausalLM_NPU final : public Module {
 public:
-    QWenForCausalLM(QWenConfig &config) {
+    QWenForCausalLM_NPU(QWenConfig &config) {
         auto names = config.names_config;
         hidden_size = config.hidden_size;
         tie_embedding_words = config.tie_embedding_words;
         embedding = Embedding(config.vocab_size, config.hidden_size, names.token_embd_name);
-        model = QWenModel(config, names, names.blk_name);
+        model = QWenModel_NPU(config, names, names.blk_name);

         // Qwen-0.5 use tied embedding
         // Others use nn.Linear()
@@ -295,7 +596,7 @@ class QWenForCausalLM final : public Module {
             auto out_token = text_generator_->generate(_out[0]);
             if (!call_back(out_token)) break;
             chatPostProcessing(out_token, input_ids, {});
-            std::cout << "========AFTER PREFILL=========" << std::endl;
+            std::cout << "\n========AFTER PREFILL=========" << std::endl;
             return;
         }
     }
@@ -306,7 +607,7 @@ class QWenForCausalLM final : public Module {
     Layer embedding;
     Parameter lm_head;
     Layer lm_head_layer;
-    QWenModel model;
+    QWenModel_NPU model;
 };

-#endif //! MODELING_QWEN_HPP
\ No newline at end of file
+#endif //! MODELING_QWENNPU_HPP
\ No newline at end of file
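
For readers following the ListWithShadow factory above, the standalone C++ sketch below illustrates the same pattern under simplified assumptions: append a per-layer suffix to the last constructor argument and instantiate a "shadow" variant only for selected layer indices. PlainDecoder, ShadowDecoder, change_last, and list_with_shadow here are hypothetical stand-ins, not mllm's actual types or helpers; the real modules are constructed from (config, names, base_name) and live behind mllm's Module/List machinery.

#include <cstddef>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>

struct Module {
    virtual ~Module() = default;
    virtual std::string describe() const = 0;
};

struct PlainDecoder : Module {
    std::string base;
    explicit PlainDecoder(std::string b) : base(std::move(b)) {}
    std::string describe() const override { return "plain  decoder @ " + base; }
};

struct ShadowDecoder : Module {
    std::string base;
    explicit ShadowDecoder(std::string b) : base(std::move(b)) {}
    std::string describe() const override { return "shadow decoder @ " + base; }
};

// Rebuild the argument tuple, keeping every element except the last,
// which is replaced by last + std::to_string(idx) + "." (the per-layer name suffix).
template <typename Tuple, std::size_t... I>
auto drop_last(const Tuple &t, std::index_sequence<I...>) {
    return std::make_tuple(std::get<I>(t)...);
}

template <typename... Args>
auto change_last(int idx, const Args &...args) {
    auto all = std::make_tuple(args...);
    constexpr std::size_t N = sizeof...(Args);
    auto head = drop_last(all, std::make_index_sequence<N - 1>{});
    auto last = std::get<N - 1>(all) + std::to_string(idx) + ".";
    return std::tuple_cat(head, std::make_tuple(last));
}

// For layer indices listed in shadow_layers build SHADOW, otherwise build T1.
template <typename T1, typename SHADOW, typename... Args>
std::vector<std::unique_ptr<Module>> list_with_shadow(int n, const std::set<int> &shadow_layers,
                                                      const Args &...args) {
    static_assert(std::is_base_of<Module, T1>::value, "T1 must be a subclass of Module");
    static_assert(std::is_base_of<Module, SHADOW>::value, "SHADOW must be a subclass of Module");
    std::vector<std::unique_ptr<Module>> modules;
    for (int i = 0; i < n; ++i) {
        auto new_args = change_last(i, args...);
        if (shadow_layers.count(i)) {
            modules.push_back(std::apply(
                [](const auto &...a) { return std::make_unique<SHADOW>(a...); }, new_args));
        } else {
            modules.push_back(std::apply(
                [](const auto &...a) { return std::make_unique<T1>(a...); }, new_args));
        }
    }
    return modules;
}

int main() {
    // Layers 1 and 2 come out as the shadow variant, the rest as the plain one.
    auto blocks = list_with_shadow<PlainDecoder, ShadowDecoder>(4, {1, 2}, std::string("model.layers."));
    for (const auto &b : blocks) std::cout << b->describe() << "\n";
}

Holding the layers as unique_ptr<Module> is what forces the (*block)({x})[0] call style seen in QWenModel_NPU::Forward: once two different decoder types share one container, the blocks can no longer be stored by value as a single concrete type.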