diff --git a/examples/demo_qwen.cpp b/examples/demo_qwen.cpp
index ba958662..422540ec 100644
--- a/examples/demo_qwen.cpp
+++ b/examples/demo_qwen.cpp
@@ -19,7 +19,7 @@ int main(int argc, char **argv) {
     cmdline::parser cmdParser;
     cmdParser.add<string>("vocab", 'v', "specify mllm tokenizer model path", false, "../vocab/qwen_vocab.mllm");
     cmdParser.add<string>("merge", 'e', "specify mllm merge file path", false, "../vocab/qwen_merges.txt");
-    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-0.5b-q4_k.mllm");
+    cmdParser.add<string>("model", 'm', "specify mllm model path", false, "../models/qwen-1.5-1.8b-q8_0.mllm");
     cmdParser.add<int>("limits", 'l', "max KV cache size", false, 400);
     cmdParser.add<int>("thread", 't', "num of threads", false, 4);
     cmdParser.parse_check(argc, argv);
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
     CPUBackend::cpu_threads = cmdParser.get<int>("thread");
 
     auto tokenizer = QWenTokenizer(vocab_path, merge_path);
-    QWenConfig config(tokens_limit, "0.5B", RoPEType::HFHUBROPE);
+    QWenConfig config(tokens_limit, "1.8B", RoPEType::HFHUBROPE);
     auto model = QWenForCausalLM(config);
     model.load(model_path);
 
diff --git a/src/backends/cpu/CPULinear.cpp b/src/backends/cpu/CPULinear.cpp
index 4a6680d8..319d1eed 100644
--- a/src/backends/cpu/CPULinear.cpp
+++ b/src/backends/cpu/CPULinear.cpp
@@ -1,5 +1,6 @@
 #include "CPULinear.hpp"
+#include <iostream>
 
 namespace mllm {
 
diff --git a/src/quantizer/QuantWriter.cpp b/src/quantizer/QuantWriter.cpp
index 752e9c62..2b01d09d 100644
--- a/src/quantizer/QuantWriter.cpp
+++ b/src/quantizer/QuantWriter.cpp
@@ -69,7 +69,7 @@ void QuantWriter::quantParams(DataType dataType) {
         }
         void *quant_ptr = nullptr;
         std::pair<void *, uint64_t> block_t;
-        if (find_names(name, q6_layers)) {
+        if (find_names(name, q6_layers) && (dataType == MLLM_TYPE_Q6_K || dataType == MLLM_TYPE_Q4_K)) {
             if (tmp_hidden_dim > 0 && (size / tmp_hidden_dim) % 256 != 0) {
                 std::cout << "Quantize param " << name << " to " << DataTypeName(MLLM_TYPE_F32) << "\t";
                 const auto s = param_loader_->offsets_[name].second / sizeof(float);
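
Context for the QuantWriter hunk: K-quant formats such as Q4_K and Q6_K pack weights into 256-element superblocks, so the guarded branch should only fire when the requested target type is actually a K-quant format, and a tensor whose row length is not a multiple of 256 falls back to F32. Below is a minimal, self-contained sketch of that decision logic; the enum values mirror mllm's names, while is_k_quant and effective_type are hypothetical helpers invented for illustration, not functions in the repo.

#include <cstdint>

// Subset of mllm's DataType values relevant to this patch.
enum DataType { MLLM_TYPE_F32, MLLM_TYPE_Q4_K, MLLM_TYPE_Q6_K };

// Q4_K / Q6_K quantize weights in 256-element superblocks.
constexpr uint64_t kKQuantSuperblock = 256;

// Hypothetical helper: true only for the K-quant target types,
// matching the new (dataType == MLLM_TYPE_Q6_K || dataType == MLLM_TYPE_Q4_K)
// condition added to QuantWriter::quantParams.
static bool is_k_quant(DataType t) {
    return t == MLLM_TYPE_Q6_K || t == MLLM_TYPE_Q4_K;
}

// Hypothetical helper: pick the effective storage type for one tensor.
// Rows that are not a superblock multiple fall back to plain F32,
// mirroring the (size / tmp_hidden_dim) % 256 != 0 check in the hunk.
static DataType effective_type(DataType requested, uint64_t row_size) {
    if (is_k_quant(requested) && row_size % kKQuantSuperblock != 0) {
        return MLLM_TYPE_F32;
    }
    return requested;
}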