Develop qnn zh #52

Merged
merged 12 commits on Jan 11, 2024
15 changes: 14 additions & 1 deletion demo/qnn/MockLoader.hpp
@@ -1,6 +1,7 @@
#ifndef MLLM_MockLoader_H
#define MLLM_MockLoader_H
#include "ParamLoader.hpp"
#include <cstdint>

namespace mllm {
class MockLoader : public ParamLoader {
@@ -12,6 +13,18 @@ class MockLoader : public ParamLoader {
#ifdef DEBUG
std::cout << "MockLoader load" << std::endl;
#endif
switch (tensor->dtype()) {
case DataType::MLLM_TYPE_F32: {
tensor->fullData<float>(2.f);
break;
}
case DataType::MLLM_TYPE_I8: {
tensor->fullData<int8_t>(2);
break;
}
default:
break;
}
return true;
}
bool load(std::shared_ptr<mllm::Tensor> tensor) override {
@@ -21,7 +34,7 @@ class MockLoader : public ParamLoader {
return true;
}
DataType getDataType(string name) override {
if (name.find("wq.weight") != string::npos) {
if (name.find("q8.weight") != string::npos) {
std::cout << name << "int8" << std::endl;
return DataType::MLLM_TYPE_I8;
}
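A standalone sketch of the convention the mock loader now follows: the tensor's name selects the dtype, and load() fills every element with the constant 2. DType, dtypeFromName, and the raw vectors are illustrative stand-ins for mllm::Tensor and Tensor::fullData<T>(), not the real API; the example name assumes the linear op appends ".weight" to its weight tensor, as the getDataType check suggests.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

enum class DType { F32, I8 };

// Mirrors MockLoader::getDataType: a "q8.weight" substring in the name selects int8.
DType dtypeFromName(const std::string &name) {
    return name.find("q8.weight") != std::string::npos ? DType::I8 : DType::F32;
}

int main() {
    // Stand-ins for tensor storage; the real loader calls Tensor::fullData<T>(2).
    std::vector<float> f32(4);
    std::vector<int8_t> i8(4);

    std::string name = "attention.q.q8.weight";
    switch (dtypeFromName(name)) {
    case DType::F32: std::fill(f32.begin(), f32.end(), 2.f); break;
    case DType::I8:  std::fill(i8.begin(), i8.end(), int8_t{2}); break;
    }
    std::cout << name << " loaded as "
              << (dtypeFromName(name) == DType::I8 ? "int8" : "f32") << std::endl;
    return 0;
}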
18 changes: 15 additions & 3 deletions demo/qnn/qnn_test.cpp
@@ -16,11 +16,23 @@

using namespace mllm;

// when naming a linear layer, use a "q8" suffix so the mock loader loads int8 data
void BuildModel(Context *ctx) {
auto *i = _Input(ctx);
auto *q = _Linear(ctx, {i}, 4, 2, true, "layers." + std::to_string(0) + ".attention.wq");
auto *r = _RMSNorm(ctx, {q}, "layers." + std::to_string(0) + ".attention_norm");
r = _SiLU(ctx, {r}, "layers." + std::to_string(0) + ".ffn.activation");
i = _RoPE(ctx, {i});
i = _RMSNorm(ctx, {i});
auto *q = _Linear(ctx, {i}, 4, 4, false, "attention.q.q8");
auto *k = _Linear(ctx, {i}, 4, 4, false, "attention.k.q8");
auto *v = _Linear(ctx, {i}, 4, 4, false, "attention.v.q8");
// q = _View(ctx, {q}, {-1, 2, -1, -1}, {0, 3, 2, 3});
// k = _View(ctx, {q}, {-1, 2, -1, -1}, {0, 3, 2, 3});
// v = _View(ctx, {q}, {-1, 2, -1, -1}, {0, 3, 2, 3});
auto *qk = _Matmul(ctx, {q, k}, false, true, "attention.qk");
qk = _Scale(ctx, {qk}, 0.5f, 0.0F, false, "attention.scale");
qk = _Causalmask(ctx, {qk}, "mask");
qk = _Softmax(ctx, {qk}, 3, "softmax");
auto *o = _Matmul(ctx, {qk, v}, false, false, "qkv");
o = _View(ctx, {o}, {-1, -1, -1, -1}, {0, -1, 2, 1 + 3}, "qkv_view");
}

template <typename Dtype>
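For reference, the rewritten BuildModel wires up a single scaled-dot-product-attention block on int8 linears; the 0.5 passed to _Scale matches the usual 1/sqrt(d) factor for the width of 4 used in this test (my reading of the demo, not stated in the diff):

\[
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\mathrm{mask}\!\left(\tfrac{1}{\sqrt{d}}\, Q K^{\top}\right)\right) V,
\qquad \tfrac{1}{\sqrt{d}} = \tfrac{1}{\sqrt{4}} = 0.5
\]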
9 changes: 7 additions & 2 deletions src/backends/QNN/QNNBackend.cpp
@@ -90,7 +90,7 @@ QNNBackend::QNNBackend(shared_ptr<MemoryManager> mm) : Backend(mm) {
#endif

// TODO: make these configurable
m_debug = true; // when set true, NATIVE tensors will be regarded as APP_READ tensors
m_debug = false; // when set true, NATIVE tensors will be regarded as APP_READ tensors
m_outputDataType = iotensor::OutputDataType::FLOAT_ONLY;
m_inputDataType = iotensor::InputDataType::FLOAT;
m_profilingLevel = ProfilingLevel::OFF;
@@ -772,7 +772,12 @@ StatusCode QNNBackend::executeGraphs(std::map< std::string, std::vector<uint8_t*
QNN_DEBUG("Successfully executed graphIdx: %d ", graphIdx);
for (int oi=0; oi < graphInfo.numOutputTensors; oi ++) {
auto output = outputs[oi];

// DEBUGLOG
std::cout << "----------------" << std::endl;
std::cout << "output name:" << output.v1.name << std::endl;
std::cout << "output id:" << output.v1.clientBuf.dataSize << std::endl;
std::cout << "output type:" << output.v1.type << std::endl;
std::cout << "output type:" << output.v1.dataType << std::endl;
// m_ioTensor.writeOutputTensor(&output, outputBufferMap["graph"][oi]);
memcpy(outputBufferMap["graph"][oi], output.v1.clientBuf.data, output.v1.clientBuf.dataSize);
}
2 changes: 1 addition & 1 deletion src/backends/QNN/op/QNNAdd.cpp
@@ -47,6 +47,6 @@ ErrorCode QNNAdd::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Ten
.memType = QNN_TENSORMEMTYPE_RAW,
{.clientBuf = {.data = nullptr,
.dataSize = 0}}}}}};
return graphAddNode(name(), "ElementWiseAdd", {inputName0.c_str(), inputName1.c_str()}, out);
return graphAddNode(name(), "ElementWiseAdd", {inputName0, inputName1}, out);
}
} // namespace mllm
16 changes: 14 additions & 2 deletions src/backends/QNN/op/QNNCommonOp.cpp
@@ -3,6 +3,8 @@
#include "QnnTypes.h"
#include "QnnWrapperUtils.hpp"
#include "Types.hpp"
#include <memory>
#include <string>

namespace mllm {

@@ -33,6 +35,15 @@ ErrorCode QNNCommonOp::load(AbstructLoader &loader) {
}

ErrorCode QNNCommonOp::graphAddNode(string name, string nodeType, vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs, vector<Qnn_Param_t> params, string packageName) {
// DEBUGLOG
std::cout << "=name:" << name << std::endl;
std::cout << "=nodeType:" << nodeType << std::endl;
for (auto &inputTensorName : inputs) {
std::cout << "=input:" << inputTensorName->name() << std::endl;
}
for (auto &output : outputs) {
std::cout << "=output:" << output->name() << std::endl;
}
vector<string> inputTensorNames;
for (auto &input : inputs) {
inputTensorNames.push_back(input->name());
@@ -44,11 +55,11 @@ ErrorCode QNNCommonOp::graphAddNode(string name, string nodeType, vector<shared_
for (int i = 0; i < output->shape().size(); i++) {
dimensions[i] = output->shape()[i];
}
auto outString = output->name();
inputTensorNames_.push_back(new string(output->name()));
outputTensors.push_back({QNN_TENSOR_VERSION_1,
{.v1 = {
.id = 0,
.name = outString.c_str(),
.name = inputTensorNames_.back()->c_str(),
.type = getOutputTensorType(output),
.dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER,
.dataType = QNN_DATATYPE_FLOAT_32,
@@ -69,6 +80,7 @@ ErrorCode QNNCommonOp::graphAddNode(string name, string nodeType, vector<shared_
}

ErrorCode QNNCommonOp::graphAddNode(string name, string nodeType, vector<string> inputTensorNames, vector<Qnn_Tensor_t> outputs, vector<Qnn_Param_t> params, string packageName) {
// DEBUGLOG
std::cout << "=name:" << name << std::endl;
std::cout << "=nodeType:" << nodeType << std::endl;
for(auto &inputTensorName : inputTensorNames) {
1 change: 1 addition & 0 deletions src/backends/QNN/op/QNNCommonOp.hpp
@@ -18,6 +18,7 @@ class QNNCommonOp : public Op {
virtual ErrorCode load(AbstructLoader &loader) override;

protected:
vector<string *> inputTensorNames_;
QNNBackend *qnnBackend_;
ErrorCode graphAddNode(string name, string nodeType, vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs, vector<Qnn_Param_t> params = {}, string packageName = "qti.aisw");
ErrorCode graphAddNode(string name, string nodeType, vector<string> inputs, vector<Qnn_Tensor_t> outputs, vector<Qnn_Param_t> params = {}, string packageName = "qti.aisw");
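The new inputTensorNames_ member (declared above and filled in graphAddNode) exists to keep the output-name strings alive for as long as QNN may dereference the raw const char* stored in Qnn_Tensor_t; the local outString it replaces yields a pointer that dangles once the loop body ends. A minimal sketch of the difference, with NameRef and OpSketch as illustrative stand-ins rather than mllm types:

#include <iostream>
#include <string>
#include <vector>

// Like Qnn_Tensor_t, this struct only stores a raw pointer to the name.
struct NameRef { const char *name; };

// Buggy pattern being replaced: the local string dies when the function returns,
// so the pointer handed to the consumer dangles.
NameRef makeRefDangling(const std::string &base) {
    std::string outString = base + ".out";
    return NameRef{outString.c_str()}; // dangling after return
}

// Pattern used by inputTensorNames_: heap-allocated strings owned by the op keep
// the character buffers alive, and their addresses stay stable even if the
// owning vector reallocates.
struct OpSketch {
    std::vector<std::string *> ownedNames;
    NameRef makeRef(const std::string &base) {
        ownedNames.push_back(new std::string(base + ".out"));
        return NameRef{ownedNames.back()->c_str()}; // valid while the op lives
    }
    ~OpSketch() {
        for (auto *s : ownedNames) delete s;
    }
};

int main() {
    OpSketch op;
    NameRef r = op.makeRef("attention.q");
    std::cout << r.name << std::endl;
    return 0;
}

Plain std::string elements would not give the same guarantee, since small strings relocate their buffers when the vector grows; pointer-stable storage (heap strings as here, or something like std::deque<std::string>) avoids that.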
14 changes: 7 additions & 7 deletions src/backends/QNN/op/QNNLinear.cpp
@@ -62,13 +62,13 @@ ErrorCode QNNLinear::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<
.memType = QNN_TENSORMEMTYPE_RAW,
{.clientBuf = {.data = nullptr,
.dataSize = 0}}}}}};
graphAddNode(name() + ".quantize", "Quantize", {inputs[0]->name().c_str()}, quantizedInput);
graphAddNode(name() + ".quantize", "Quantize", {inputs[0]->name()}, quantizedInput);
// add weight tensor to qnn
uint32_t dimensionsWeight[4];
for (int i = 0; i < 4; i++) {
dimensionsWeight[i] = weight_.shape()[i];
}
qnnBackend_->modelAddTensor(weight_.name().c_str(), (Qnn_Tensor_t){
qnnBackend_->modelAddTensor(weight_.name(), (Qnn_Tensor_t){
.version = QNN_TENSOR_VERSION_1,
{.v1 = {
.id = 0,
@@ -107,7 +107,7 @@ ErrorCode QNNLinear::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<
.memType = QNN_TENSORMEMTYPE_RAW,
{.clientBuf = {.data = nullptr,
.dataSize = 0}}}}}};
graphAddNode(name() + ".matmul", "MatMul", {inputQuantizeName.c_str(), weight_.name().c_str()}, matmulOut, paramsMatmul);
graphAddNode(name() + ".matmul", "MatMul", {inputQuantizeName, weight_.name()}, matmulOut, paramsMatmul);

// if bias is not supported, just dequantize and write to the tensor named outputs[0]
if (!support_bias_) {
@@ -127,7 +127,7 @@ ErrorCode QNNLinear::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<
.memType = QNN_TENSORMEMTYPE_RAW,
{.clientBuf = {.data = nullptr,
.dataSize = 0}}}}}};
return graphAddNode(name() + ".dequantize", "Dequantize", {outQuantizedName.c_str()}, deqnOut);
return graphAddNode(name() + ".dequantize", "Dequantize", {outQuantizedName}, deqnOut);
}

// dequantize to a tensor named outputs[0] + ".dequantize"
@@ -147,10 +147,10 @@ ErrorCode QNNLinear::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<
.memType = QNN_TENSORMEMTYPE_RAW,
{.clientBuf = {.data = nullptr,
.dataSize = 0}}}}}};
graphAddNode(name() + ".dequantize", "Dequantize", {outQuantizedName.c_str()}, deqnOut);
graphAddNode(name() + ".dequantize", "Dequantize", {outQuantizedName}, deqnOut);
// add bias tensor to qnn
uint32_t dimensionsBias[4] = {1, 1, 1, (uint32_t)out_features_};
qnnBackend_->modelAddTensor(bias_.name().c_str(), (Qnn_Tensor_t){
qnnBackend_->modelAddTensor(bias_.name(), (Qnn_Tensor_t){
.version = QNN_TENSOR_VERSION_1,
{.v1 = {
.id = 0,
@@ -182,7 +182,7 @@ ErrorCode QNNLinear::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<
.memType = QNN_TENSORMEMTYPE_RAW,
{.clientBuf = {.data = nullptr,
.dataSize = 0}}}}}};
return graphAddNode(name() + ".add", "ElementWiseAdd", {outDeqnName.c_str(), bias_.name().c_str()}, biasOutput);
return graphAddNode(name() + ".add", "ElementWiseAdd", {outDeqnName, bias_.name()}, biasOutput);
}

ErrorCode QNNLinear::load(AbstructLoader &loader) {
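For context on the Quantize -> MatMul -> Dequantize (-> ElementWiseAdd) chain that QNNLinear::setUp builds, here is the usual affine int8 scheme such ops implement in the abstract; the scale and zero point below are illustrative values, and QNN's actual op semantics (per-tensor parameters, clamping behaviour) may differ:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Standard affine int8 quantization: q = clamp(round(x / s) + z, -128, 127).
int8_t quantize(float x, float scale, int zero_point) {
    int q = static_cast<int>(std::lround(x / scale)) + zero_point;
    return static_cast<int8_t>(std::clamp(q, -128, 127));
}

// And the matching dequantization: x_hat = s * (q - z).
float dequantize(int8_t q, float scale, int zero_point) {
    return scale * (static_cast<int>(q) - zero_point);
}

int main() {
    const float scale = 0.05f;  // illustrative, not taken from the diff
    const int zero_point = 0;
    const float x = 2.f;        // the constant MockLoader fills tensors with
    const int8_t q = quantize(x, scale, zero_point);
    std::cout << "q = " << static_cast<int>(q)
              << ", x_hat = " << dequantize(q, scale, zero_point) << std::endl;
    return 0;
}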
6 changes: 3 additions & 3 deletions src/backends/QNN/op/QNNMatmul.cpp
@@ -4,8 +4,8 @@
#include "QNNCommonOp.hpp"

namespace mllm {
QNNMatmul::QNNMatmul(Backend *bn, string opName) :
QNNCommonOp(bn, opName) {
QNNMatmul::QNNMatmul(Backend *bn, string opName, bool transpose0, bool transpose1) :
QNNCommonOp(bn, opName), transpose0_(transpose0), transpose1_(transpose1) {
}

ErrorCode QNNMatmul::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
@@ -63,6 +63,6 @@ ErrorCode QNNMatmul::setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<
{.paramType = QNN_PARAMTYPE_SCALAR,
.name = "transpose_in1",
{.scalarParam = (Qnn_Scalar_t){QNN_DATATYPE_BOOL_8, {.bool8Value = transpose1_}}}}};
return graphAddNode(name(), "Reshape", inputs, outputs, paramsMatmul);
return graphAddNode(name(), "MatMul", inputs, outputs, paramsMatmul);
}
} // namespace mllm
6 changes: 4 additions & 2 deletions src/backends/QNN/op/QNNMatmul.hpp
@@ -6,7 +6,7 @@
namespace mllm {
class QNNMatmul : public QNNCommonOp {
public:
QNNMatmul(Backend *bn, string opName);
QNNMatmul(Backend *bn, string opName, bool transpose0, bool transpose1);
virtual ~QNNMatmul() = default;
virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode setUp(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
@@ -19,7 +19,9 @@ class QNNMatmul : public QNNCommonOp {
class QNNMatmulCreator : public QNNBackend::Creator {
public:
virtual Op *create(OpParam op_param, Backend *bn, string name) const {
return new QNNMatmul(bn, name);
bool transpose0 = (bool)op_param["transpose0"];
bool transpose1 = (bool)op_param["transpose1"];
return new QNNMatmul(bn, name, transpose0, transpose1);
}
};

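Note: with the constructor and creator changes above, a call such as _Matmul(ctx, {q, k}, false, true, "attention.qk") in the demo propagates transpose0 = false and transpose1 = true through op_param into the QNN transpose_in0 / transpose_in1 scalar parameters, which lets the attention Q·K^T product be expressed without a separate transpose op (assuming _Matmul stores the flags under the "transpose0"/"transpose1" keys the creator reads).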