diff --git a/examples/demo_llama.cpp b/examples/demo_llama.cpp
index f24a08e7..b68603c1 100644
--- a/examples/demo_llama.cpp
+++ b/examples/demo_llama.cpp
@@ -51,6 +51,7 @@ int main(int argc, char **argv) {
             chatPostProcessing(out_token, input_tensor, {});
         }
         printf("\n");
+        model.profiling();
     }
     return 0;
diff --git a/src/backends/cpu/compute/GEMM_AArch64.cpp b/src/backends/cpu/compute/GEMM_AArch64.cpp
index 8280e561..c51696d7 100644
--- a/src/backends/cpu/compute/GEMM_AArch64.cpp
+++ b/src/backends/cpu/compute/GEMM_AArch64.cpp
@@ -2201,4 +2201,11 @@ void quantize_row_q4_0_4x4(const float * __restrict x, void * __restrict y, int
     assert(k%QK4_0 == 0);
+    std::cout<<"Quantize 4x4:"<
diff --git a/src/quantizer/QuantWriter.cpp b/src/quantizer/QuantWriter.cpp
--- a/src/quantizer/QuantWriter.cpp
+++ b/src/quantizer/QuantWriter.cpp
 vector<string> q6_layers = {"w2", "wv", "dense_h_to_4h", "v_proj", "down_proj"};
 vector<string> q4x4_2_q4_layers = {"w2", "wv", "dense_h_to_4h", "v_proj", "down_proj"};
+vector<string> q4x4_2_q4_layers_ = {"wv", "v_proj"};
 
 int tmp_hidden_dim = -1;
 
@@ -128,7 +129,7 @@ void QuantWriter::quantParams(DataType dataType) {
             break;
         case MLLM_TYPE_Q4_0_4_4:
             std::cout << "Quantize param " << name << " to " << DataTypeName(MLLM_TYPE_Q4_0) << "\t";
-            block_t = alloc_quant_block(size, dataType);
+            block_t = alloc_quant_block(size, MLLM_TYPE_Q4_0);
             quant_ptr = block_t.first;
             quantize_row_q4_0(param, quant_ptr, size);
             size = block_t.second;
@@ -268,6 +269,68 @@ void QuantWriter::quantParams(DataType dataType) {
     }
     writeIndex();
 }
+
+void QuantWriter::quantParams_q4_(DataType dataType) {
+    for (const auto &name : param_names_) {
+        // int force_quant_type = -1;
+        auto size = param_loader_->offsets_[name].second / sizeof(float);
+        if(find_names(name, {"norm"})) {
+            tmp_hidden_dim = size;
+        }
+    }
+    quant_type_ = dataType;
+    for (const auto &name : param_names_) {
+        // int force_quant_type = -1;
+        auto *param = getParam(name);
+        if (param == nullptr) {
+            __exit(-1);
+        }
+        auto size = param_loader_->offsets_[name].second / sizeof(float);
+        if(find_names(name, {"norm"})) {
+            tmp_hidden_dim = size;
+        }
+        void *quant_ptr = nullptr;
+        std::pair<void *, uint64_t> block_t;
+        if(find_names(name, fp32_layers)) {
+            std::cout << "Quantize param " << name << " to " << DataTypeName(MLLM_TYPE_F32) << "\t";
+            const auto s = param_loader_->offsets_[name].second / sizeof(float);
+            const auto tsize = alloc_quant_block(s, MLLM_TYPE_F32).second;
+            writeParam(name, MLLM_TYPE_F32, param, tsize);
+            std::cout << " size:" << tsize << std::endl;
+        }else if (find_names(name, q4x4_2_q4_layers_)) {
+            // std::cout<<"q4x4_2_q4_layers"<
 data_;
diff --git a/src/quantizer/main.cpp b/src/quantizer/main.cpp
index 0b8fd4f6..3889c3d7 100644
--- a/src/quantizer/main.cpp
+++ b/src/quantizer/main.cpp
@@ -35,7 +35,7 @@ int main(int argc, char **argv) {
     } else if (quant_type == "Q8_K") {
         quant_writer.quantParams(MLLM_TYPE_Q8_K);
     } else if (quant_type == "Q4_0_4_4") {
-        quant_writer.quantParams(MLLM_TYPE_Q4_0_4_4);
+        quant_writer.quantParams_q4_(MLLM_TYPE_Q4_0_4_4);
     } else {
        std::cout << "Quant type " << quant_type << " is not supported\n";
        return -1;