diff --git a/.azure-pipelines/scripts/ut/run_itrex.sh b/.azure-pipelines/scripts/ut/run_itrex.sh
deleted file mode 100644
index 5adaf86579b..00000000000
--- a/.azure-pipelines/scripts/ut/run_itrex.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-set -xe
-source /neural-compressor/.azure-pipelines/scripts/change_color.sh
-python -c "import neural_compressor as nc;print(nc.version.__version__)"
-echo "run itrex ut..."
-
-# install inc 3x deps
-pip install -r /neural-compressor/requirements_pt.txt
-export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
-
-# prepare itrex
-git clone https://github.com/intel/intel-extension-for-transformers.git /intel-extension-for-transformers
-cd /intel-extension-for-transformers && git rev-parse --short HEAD
-bash /intel-extension-for-transformers/.github/workflows/script/prepare_env.sh
-bash /intel-extension-for-transformers/.github/workflows/script/install_binary.sh
-
-# prepare test env
-sed -i '/neural-compressor.git/d' /intel-extension-for-transformers/tests/requirements.txt
-pip install -r /intel-extension-for-transformers/tests/requirements.txt
-# workaround
-pip install onnx==1.16.0
-pip install onnxruntime==1.18.0
-echo "pip list itrex ut deps..."
-pip list
-LOG_DIR=/neural-compressor/log_dir
-mkdir -p ${LOG_DIR}
-ut_log_name=${LOG_DIR}/ut_itrex.log
-
-# run unit test
-cd /intel-extension-for-transformers/tests/CI
-find . -name "test*.py" | grep -v "test_tf" | sed 's,\.\/,python ,g' | sed 's/$/ --verbose/' > run.sh
-
-# run UT
-$BOLD_YELLOW && echo "cat run.sh..." && $RESET
-cat run.sh | tee ${ut_log_name}
-$BOLD_YELLOW && echo "------UT start-------" && $RESET
-bash -x run.sh 2>&1 | tee -a ${ut_log_name}
-$BOLD_YELLOW && echo "------ UT end -------" && $RESET
-
-if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then
- echo "Find errors in UT test, please check the output..."
- exit 1
-fi
-echo "UT finished successfully! "
\ No newline at end of file
diff --git a/.azure-pipelines/ut-itrex.yml b/.azure-pipelines/ut-itrex.yml
deleted file mode 100644
index 2f038270234..00000000000
--- a/.azure-pipelines/ut-itrex.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-trigger: none
-
-pr:
- autoCancel: true
- drafts: false
- branches:
- include:
- - master
- paths:
- include:
- - neural_compressor
- - setup.py
- - requirements.txt
- - .azure-pipelines/scripts/ut/run_itrex.sh
- - .azure-pipelines/ut-itrex.yml
-
-pool: MODEL_PERF_TEST
-
-variables:
- UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir
-
-stages:
- - stage:
- displayName: Unit Test of ITREX
- jobs:
- - job:
- steps:
- - template: template/ut-template.yml
- parameters:
- dockerConfigName: 'commonDockerConfig'
- utScriptFileName: 'run_itrex'
- uploadPath: $(UPLOAD_PATH)
- utArtifact: 'ut_itrex'
- utTestMode: "no-coverage"
- utContainerName: "utTest-itrex"
diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index 9f566749da0..33a7a2b06a4 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -78,19 +78,6 @@ subprojects:
- "UT-Basic (Unit Test other basic case Test other basic case)"
- "UT-Basic (Unit Test other cases baseline Test other cases baseline)"
- - id: "Unit Tests ITREX workflow"
- paths:
- - "neural_compressor/**"
- - "setup.py"
- - "requirements.txt"
- - ".azure-pipelines/scripts/ut/run_itrex.sh"
- - ".azure-pipelines/ut-itrex.yml"
- - "!neural_compressor/common/**"
- - "!neural_compressor/torch/**"
- - "!neural_compressor/tensorflow/**"
- checks:
- - "UT-ITREX"
-
- id: "Unit Tests 3x-TensorFlow workflow"
paths:
- "neural_compressor/common/**"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1b821d93eb1..2875b945c57 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -76,7 +76,7 @@ repos:
)$
- repo: https://github.com/PyCQA/docformatter
- rev: v1.7.5
+ rev: 06907d0
hooks:
- id: docformatter
args: [
diff --git a/README.md b/README.md
index e2bef73e2d7..983b80227ed 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testi
* Collaborate with cloud marketplaces such as [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [Amazon Web Services](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel), software platforms such as [Alibaba Cloud](https://www.intel.com/content/www/us/en/developer/articles/technical/quantize-ai-by-oneapi-analytics-on-alibaba-cloud.html), [Tencent TACO](https://new.qq.com/rain/a/20221202A00B9S00) and [Microsoft Olive](https://github.com/microsoft/Olive), and open AI ecosystem such as [Hugging Face](https://huggingface.co/blog/intel), [PyTorch](https://pytorch.org/tutorials/recipes/intel_neural_compressor_for_pytorch.html), [ONNX](https://github.com/onnx/models#models), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [Lightning AI](https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/advanced/post_training_quantization.rst)
## What's New
+* [2024/10] [Transformers-like API](./docs/source/3x/transformers_like_api.md) for INT4 inference on Intel CPU and GPU.
* [2024/07] From 3.0 release, framework extension API is recommended to be used for quantization.
* [2024/07] Performance optimizations and usability improvements on [client-side](./docs/source/3x/client_quant.md).
@@ -164,6 +165,16 @@ Intel Neural Compressor will convert the model format from auto-gptq to hpu form
Smooth Quantization |
+
+
+ Transformers-like APIs |
+
+
+
+
+ Overview |
+
+
Other Modules |
diff --git a/docs/source/3x/transformers_like_api.md b/docs/source/3x/transformers_like_api.md
index 9aafeed5278..55e8d964072 100644
--- a/docs/source/3x/transformers_like_api.md
+++ b/docs/source/3x/transformers_like_api.md
@@ -208,6 +208,8 @@ python run_generation_gpu_woq.py --woq --benchmark --model save_dir
>Note:
> * Saving quantized model should be executed before the optimize_transformers function is called.
> * The optimize_transformers function is designed to optimize transformer-based models within frontend Python modules, with a particular focus on Large Language Models (LLMs). It provides optimizations for both model-wise and content-generation-wise. The detail of `optimize_transformers`, please refer to [the link](https://github.com/intel/intel-extension-for-pytorch/blob/xpu-main/docs/tutorials/llm/llm_optimize_transformers.md).
+>* The quantization process runs on the CPU by default. Users can override this by setting the environment variable `INC_TARGET_DEVICE`, e.g. `export INC_TARGET_DEVICE=xpu` in a bash shell.
+>* On Linux, configure the environment for best performance: for example, set `OMP_NUM_THREADS` explicitly, and on processors with a hybrid architecture (both P-cores and E-cores) bind the workload to all P-cores with `taskset`, as illustrated in the sketch below.
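A minimal bash sketch of the two notes above, assuming a machine whose P-cores are logical CPUs 0-15 (the thread count and core range are placeholders); the benchmark command is the one shown earlier in this document:

```bash
# Redirect quantization to the XPU instead of the default CPU target.
export INC_TARGET_DEVICE=xpu

# Placeholders: one OpenMP thread per P-core, with the P-cores assumed to be CPUs 0-15.
export OMP_NUM_THREADS=16
taskset -c 0-15 python run_generation_gpu_woq.py --woq --benchmark --model save_dir
```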
## Examples
diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json
index bb18153f389..a1f33413864 100644
--- a/examples/.config/model_params_pytorch_3x.json
+++ b/examples/.config/model_params_pytorch_3x.json
@@ -1,196 +1,225 @@
{
- "pytorch": {
- "llava_woq_autoround_int4":{
- "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava",
- "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
- "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
- "input_model": "liuhaotian/llava-v1.5-7b",
- "main_script": "main.py",
- "batch_size": 1
- },
- "qwenvl_woq_autoround_int4":{
- "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL",
- "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
- "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
- "input_model": "Qwen/Qwen-VL",
- "main_script": "main.py",
- "batch_size": 8
- },
- "Phi3Vision_woq_autoround_int4":{
- "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi3-3-vision",
- "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
- "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
- "input_model": "microsoft/Phi-3-vision-128k-instruct",
- "main_script": "main.py",
- "batch_size": 1
- },
- "opt_125m_woq_gptq_int4":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 1
- },
- "opt_125m_woq_gptq_int4_dq_bnb":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 1
- },
- "opt_125m_woq_gptq_int4_dq_ggml":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "llama2_7b_gptq_int4":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "llama2_7b_gptq_int4_dq_bnb":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "llama2_7b_gptq_int4_dq_ggml":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "gpt_j_woq_rtn_int4":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "gpt_j_woq_rtn_int4_dq_bnb":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "gpt_j_woq_rtn_int4_dq_ggml":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "gpt_j_woq_gptq_int4":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "gpt_j_woq_gptq_int4_dq_bnb":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "gpt_j_woq_gptq_int4_dq_ggml":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "gpt_j_ipex":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 1
- },
- "gpt_j_ipex_sq":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 1
- },
- "llama2_7b_ipex":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 1
- },
- "llama2_7b_ipex_sq":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 1
- },
- "opt_125m_ipex":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "opt_125m_ipex_sq":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 8
- },
- "dlrm_ipex": {
- "model_src_dir": "recommendation/dlrm/static_quant/ipex",
- "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
- "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
- "main_script": "dlrm_s_pytorch.py",
- "batch_size": 16384
- },
- "resnet18_pt2e_static":{
- "model_src_dir": "cv/static_quant",
- "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
- "input_model": "",
- "main_script": "main.py",
- "batch_size": 1
- },
- "resnet18_fp8_static":{
- "model_src_dir": "cv/fp8_quant",
- "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
- "input_model": "",
- "main_script": "main.py",
- "batch_size": 1
- },
- "opt_125m_pt2e_static":{
- "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
- "dataset_location": "",
- "input_model": "",
- "main_script": "run_clm_no_trainer.py",
- "batch_size": 1
- },
- "sdxl_ipex_sq":{
- "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant",
- "dataset_location": "",
- "input_model": "",
- "main_script": "main.py",
- "batch_size": 1
- },
- "resnet18_mixed_precision": {
- "model_src_dir": "cv/mixed_precision",
- "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
- "input_model": "resnet18",
- "main_script": "main.py",
- "batch_size": 20
+ "pytorch": {
+ "llava_woq_autoround_int4":{
+ "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava",
+ "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+ "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
+ "input_model": "liuhaotian/llava-v1.5-7b",
+ "main_script": "main.py",
+ "batch_size": 1
+ },
+ "qwenvl_woq_autoround_int4":{
+ "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL",
+ "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+ "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
+ "input_model": "Qwen/Qwen-VL",
+ "main_script": "main.py",
+ "batch_size": 8
+ },
+ "Phi3Vision_woq_autoround_int4":{
+ "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi3-3-vision",
+ "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full",
+ "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json",
+ "input_model": "microsoft/Phi-3-vision-128k-instruct",
+ "main_script": "main.py",
+ "batch_size": 1
+ },
+ "opt_125m_woq_gptq_int4":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "opt_125m_woq_gptq_int4_dq_bnb":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "opt_125m_woq_gptq_int4_dq_ggml":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "llama2_7b_gptq_int4":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "llama2_7b_gptq_int4_dq_bnb":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "llama2_7b_gptq_int4_dq_ggml":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "gpt_j_woq_rtn_int4":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "gpt_j_woq_rtn_int4_dq_bnb":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "gpt_j_woq_rtn_int4_dq_ggml":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "gpt_j_woq_gptq_int4":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "gpt_j_woq_gptq_int4_dq_bnb":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "gpt_j_woq_gptq_int4_dq_ggml":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "gpt_j_woq_awq_int4":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "opt_125m_woq_awq_int4":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "opt_125m_woq_autoround_int4":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "opt_125m_woq_autotune_int4":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "gpt_j_ipex":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "gpt_j_ipex_sq":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "llama2_7b_ipex":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "llama2_7b_ipex_sq":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "opt_125m_ipex":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "opt_125m_ipex_sq":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 8
+ },
+ "dlrm_ipex": {
+ "model_src_dir": "recommendation/dlrm/static_quant/ipex",
+ "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
+ "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
+ "main_script": "dlrm_s_pytorch.py",
+ "batch_size": 16384
+ },
+ "resnet18_pt2e_static":{
+ "model_src_dir": "cv/static_quant",
+ "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+ "input_model": "",
+ "main_script": "main.py",
+ "batch_size": 1
+ },
+ "resnet18_fp8_static":{
+ "model_src_dir": "cv/fp8_quant",
+ "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+ "input_model": "",
+ "main_script": "main.py",
+ "batch_size": 1
+ },
+ "opt_125m_pt2e_static":{
+ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "run_clm_no_trainer.py",
+ "batch_size": 1
+ },
+ "sdxl_ipex_sq":{
+ "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant",
+ "dataset_location": "",
+ "input_model": "",
+ "main_script": "main.py",
+ "batch_size": 1
+ },
+ "resnet18_mixed_precision": {
+ "model_src_dir": "cv/mixed_precision",
+ "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+ "input_model": "resnet18",
+ "main_script": "main.py",
+ "batch_size": 20
+ }
}
}
}
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt
index bc70f987095..736d79c4d72 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt
@@ -2,6 +2,5 @@ transformers
torch
sentencepiece
neural-compressor
-intel-extension-for-transformers >= 1.4.1
lm-eval==0.4.2
peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py
index 40bf217c72e..6ad8e495db2 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py
@@ -62,7 +62,7 @@ def get_user_model():
user_model = convert(model=user_model)
user_model.eval()
-from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
user_model=user_model,
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
index d4155dfbf75..d9f59d178e7 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
@@ -8,7 +8,6 @@ pytest
wandb
einops
neural-compressor
-intel-extension-for-transformers
-lm_eval==0.4.2
+lm_eval==0.4.3
peft
optimum-intel
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
index 694c0505ea4..a082421f15b 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
@@ -185,7 +185,7 @@ def eval_func(model):
config = AutoConfig.from_pretrained(args.model)
setattr(model, "config", config)
- from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+ from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
user_model=model,
@@ -232,7 +232,7 @@ def eval_func(model):
if args.accuracy:
user_model.eval()
- from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+ from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt
index f0b56e558d3..5174182f312 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt
@@ -8,6 +8,5 @@ pytest
wandb
einops
neural-compressor
-intel-extension-for-transformers
-lm_eval==0.4.2
+lm_eval==0.4.3
peft
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py
index b56c01f20f5..eb97f930d29 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py
@@ -212,7 +212,7 @@ def run_fn(model):
if args.accuracy:
user_model.eval()
- from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+ from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
user_model=user_model,
@@ -232,7 +232,7 @@ def run_fn(model):
if args.performance:
user_model.eval()
- from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+ from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
import time
samples = args.iters * args.batch_size
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
index b6d9b6c55de..63959e924cb 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
@@ -2,6 +2,5 @@ transformers
torch
sentencepiece
neural-compressor
-intel-extension-for-transformers >= 1.4.1
-lm-eval==0.4.2
+lm-eval==0.4.3
peft
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
index 395bc6f9b57..a2aa6c1302a 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
@@ -116,7 +116,7 @@ def get_example_inputs(tokenizer):
if args.accuracy:
- from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+ from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
user_model=user_model,
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md
index 1abe2633ea3..f0760cc2fe1 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md
@@ -103,6 +103,8 @@ python run_generate_cpu_woq.py \
> 1. default search algorithm is beam search with num_beams = 1.
> 2. [ipex.optimize_transformers](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.10%2Bxpu/docs/tutorials/llm/llm_optimize_transformers.md) Support for the optimized inference of model types "gptj," "mistral," "qwen," and "llama" to achieve high performance and accuracy. Ensure accurate inference for other model types as well.
> 3. We provide compression technologies `WeightOnlyQuant` with `Rtn/GPTQ/AutoRound` algorithms and `load_in_4bit` and `load_in_8bit` work on intel GPU device.
+> 4. The quantization process runs on the CPU by default. Users can override this by setting the environment variable `INC_TARGET_DEVICE`, e.g. `export INC_TARGET_DEVICE=xpu` in a bash shell.
+> 5. On Linux, configure the environment for best performance: for example, set `OMP_NUM_THREADS` explicitly, and on processors with a hybrid architecture (both P-cores and E-cores) bind the workload to all P-cores with `taskset`; see the sketch after this list.
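A minimal sketch of notes 4 and 5, written as a small wrapper script. `bind_pcores.sh` is a hypothetical name, the core range and thread count are placeholders for a machine whose P-cores are logical CPUs 0-15, and the generation script's own arguments are passed through unchanged:

```bash
#!/bin/bash
# bind_pcores.sh -- hypothetical helper; adjust the core IDs to your topology
# (check with `lscpu --extended`; P-cores typically report the higher max MHz).
export INC_TARGET_DEVICE=cpu   # or xpu, to change the quantization target device
export OMP_NUM_THREADS=16      # placeholder: one OpenMP thread per P-core
exec taskset -c 0-15 python run_generation_cpu_woq.py "$@"
```

Invoke it as `bash bind_pcores.sh` followed by the same arguments shown in the commands above.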
## Prerequisite​
### Dependencies
@@ -111,7 +113,7 @@ Intel-extension-for-pytorch dependencies are in oneapi package, before install i
### Create Environment​
Pytorch and Intel-extension-for-pytorch version for intel GPU > 2.1 are required, python version requests equal or higher than 3.9 due to [text evaluation library](https://github.com/EleutherAI/lm-evaluation-harness/tree/master) limitation, the dependent packages are listed in requirements_GPU.txt, we recommend create environment as the following steps. For Intel-exension-for-pytorch, we should install from source code now, and Intel-extension-for-pytorch will add weight-only quantization in the next version.
->**Note**: please install transformers==4.40.2.
+>**Note**: Please install `transformers==4.38.1`.
```bash
pip install -r requirements_GPU.txt
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md
index 2c3b14459c8..6a5e75b5023 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md
@@ -30,12 +30,6 @@ The scripts [run_generation_sq.py](./run_generation_sq.py) and [run_generation_c
```bash
# Installation
-git clone https://github.com/intel/intel-extension-for-transformers.git
-
-# install ITREX
-cd intel-extension-for-transformers
-pip install -r requirements.txt
-pip install -v .
# install requirements
cd examples/huggingface/pytorch/text-generation/quantization
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py
index 62ef4ca2f49..8329d74b9a4 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py
@@ -293,7 +293,6 @@
_commit_hash=args._commit_hash,
)
elif args.load_in_4bit or args.load_in_8bit:
- # CPU device usage is provided by intel-extension-for-transformers.
user_model = AutoModelForCausalLM.from_pretrained(
args.model,
load_in_4bit=args.load_in_4bit,
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py
index 9245d53eb50..7b63a015600 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py
@@ -6,9 +6,9 @@
from transformers import AutoConfig, AutoTokenizer
from transformers.generation import GenerationConfig
import intel_extension_for_pytorch as ipex
-# from intel_extension_for_transformers.transformers.llm.utils.generation import _beam_search, _greedy_search
from neural_compressor.transformers import AutoModelForCausalLM, AutoRoundConfig, RtnConfig, GPTQConfig
from neural_compressor.transformers.quantization.utils import convert_dtype_str2torch
+from neural_compressor.transformers.generation import _greedy_search, _beam_search
from transformers.utils import check_min_version
import contextlib
@@ -189,7 +189,6 @@
torch_dtype=torch.float16,
)
elif args.load_in_4bit or args.load_in_8bit:
- # CPU device usage is provided by intel-extension-for-transformers.
user_model = AutoModelForCausalLM.from_pretrained(args.model,
device_map=args.device,
load_in_4bit=args.load_in_4bit,
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
index 889d7b42682..0519b490ff7 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md
@@ -35,9 +35,8 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
- --accuracy \
- --tasks "lambada_openai" \
- --double_quant_type "BNB_NF4"
+ --double_quant_type "BNB_NF4" \
+ --output_dir saved_results
# "--woq_algo RTN" is used to enable RTN algorithms
python run_clm_no_trainer.py \
@@ -48,9 +47,38 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
+ --double_quant_type "BNB_NF4" \
+ --output_dir saved_results
+
+# "--woq_algo AWQ" is used to enable AWQ algorithms
+python run_clm_no_trainer.py \
+ --model EleutherAI/gpt-j-6B \
+ --dataset NeelNanda/pile-10k \
+ --quantize \
+ --woq_algo AWQ \
+ --woq_bits 4 \
+ --woq_scheme asym \
+ --woq_group_size 128 \
+ --calib_iters 128
+
+# "--woq_algo AutoRound" is used to enable AutoRound algorithms
+python run_clm_no_trainer.py \
+ --model EleutherAI/gpt-j-6B \
+ --dataset NeelNanda/pile-10k \
+ --quantize \
+ --woq_algo AutoRound \
+ --woq_bits 4 \
+ --woq_scheme asym \
+ --woq_group_size 128
+
+# "--accuracy" for eval
+python run_clm_no_trainer.py \
+ --model EleutherAI/gpt-j-6B \
+ --dataset NeelNanda/pile-10k \
+ --int8 \
--accuracy \
--tasks "lambada_openai" \
- --double_quant_type "BNB_NF4"
+ --output_dir saved_results
```
**Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ.
@@ -72,8 +100,6 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
- --accuracy \
- --tasks "lambada_openai" \
--double_quant_type "BNB_NF4"
# "--woq_algo RTN" is used to enable RTN algorithms
@@ -85,13 +111,40 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
+ --double_quant_type "BNB_NF4"
+
+# "--woq_algo AWQ" is used to enable AWQ algorithms
+python run_clm_no_trainer.py \
+ --model facebook/opt-125m \
+ --dataset NeelNanda/pile-10k \
+ --quantize \
+ --woq_algo AWQ \
+ --woq_bits 4 \
+ --woq_scheme asym \
+ --woq_group_size 128 \
+ --calib_iters 128
+
+# "--woq_algo AutoRound" is used to enable AutoRound algorithms
+python run_clm_no_trainer.py \
+ --model facebook/opt-125m \
+ --dataset NeelNanda/pile-10k \
+ --quantize \
+ --woq_algo AutoRound \
+ --woq_bits 4 \
+ --woq_scheme asym \
+ --woq_group_size 128
+
+# "--accuracy" for eval
+python run_clm_no_trainer.py \
+ --model facebook/opt-125m \
+ --dataset NeelNanda/pile-10k \
+ --int8 \
--accuracy \
--tasks "lambada_openai" \
- --double_quant_type "BNB_NF4"
+ --output_dir saved_results
```
### LLAMA2-7b/13b/70b
->Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy.
#### Quantization
```bash
@@ -107,8 +160,6 @@ python run_clm_no_trainer.py \
--woq_group_size 128 \
--gptq_max_seq_length 2048 \
--gptq_use_max_length \
- --accuracy \
- --tasks "lambada_openai" \
--double_quant_type "BNB_NF4"
# "--woq_algo RTN" is used to enable RTN algorithms
@@ -120,8 +171,6 @@ python run_clm_no_trainer.py \
--woq_bits 4 \
--woq_scheme asym \
--woq_group_size 128 \
- --accuracy \
- --tasks "lambada_openai" \
--double_quant_type "BNB_NF4"
```
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt
index 63c4d6e10b1..4745e2dfbd7 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt
@@ -8,7 +8,6 @@ pytest
wandb
einops
neural-compressor
-intel-extension-for-transformers
lm_eval==0.4.3
peft
auto_round
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
index 9e1d766128e..6c84e27ce88 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh
@@ -70,58 +70,59 @@ function run_benchmark {
fi
echo $extra_cmd
- if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
+ if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then
model_name_or_path="facebook/opt-125m"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="facebook/opt-125m"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
- extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="facebook/opt-125m"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder"
- extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
- extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then
model_name_or_path="meta-llama/Llama-2-7b-hf"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
- extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
- extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then
- model_name_or_path="EleutherAI/gpt-j-6b"\
- extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
- extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
+ model_name_or_path="EleutherAI/gpt-j-6b"
elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then
- model_name_or_path="EleutherAI/gpt-j-6b"\
- extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search"
- extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+ model_name_or_path="EleutherAI/gpt-j-6b"
elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
- extra_cmd=$extra_cmd" --double_quant_type BNB_NF4"
elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then
model_name_or_path="EleutherAI/gpt-j-6b"
- extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
- extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+ elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then
+ model_name_or_path="EleutherAI/gpt-j-6b"
+ elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then
+ model_name_or_path="facebook/opt-125m"
+ elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then
+ model_name_or_path="facebook/opt-125m"
+ extra_cmd=$extra_cmd" --woq_algo AutoRound"
+ elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
+ model_name_or_path="facebook/opt-125m"
fi
- python -u run_clm_no_trainer.py \
- --model ${model_name_or_path} \
- --output_dir ${tuned_checkpoint} \
- --task ${task} \
- --batch_size ${batch_size} \
- ${extra_cmd} ${mode_cmd}
+ if [[ ${mode} == "accuracy" ]]; then
+ python -u run_clm_no_trainer.py \
+ --model ${model_name_or_path} \
+ --output_dir ${tuned_checkpoint} \
+ --task ${task} \
+ --batch_size ${batch_size} \
+ ${extra_cmd} ${mode_cmd}
+ elif [[ ${mode} == "performance" ]]; then
+ incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
+ --model ${model_name_or_path} \
+ --batch_size ${batch_size} \
+ --output_dir ${tuned_checkpoint} \
+ ${extra_cmd} ${mode_cmd}
+ else
+ echo "Error: No such mode: ${mode}"
+ exit 1
+ fi
+
}
main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
index 02329bd9e15..51be2900ba7 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py
@@ -53,7 +53,7 @@
type=str, help="tasks for accuracy validation")
parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
# ============WeightOnly configs===============
-parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'],
+parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound', 'AutoTune'],
help="Weight-only parameter.")
parser.add_argument("--woq_bits", type=int, default=8)
parser.add_argument("--woq_dtype", type=str, default="int")
@@ -62,6 +62,7 @@
parser.add_argument("--woq_scheme", default="sym")
parser.add_argument("--woq_use_mse_search", action="store_true")
parser.add_argument("--woq_use_full_range", action="store_true")
+parser.add_argument("--quant_lm_head", action="store_true", help="whether to quant the lm_head layer in transformers")
# =============GPTQ configs====================
parser.add_argument("--gptq_actorder", action="store_true",
help="Whether to apply the activation order GPTQ heuristic.")
@@ -78,6 +79,35 @@
help='Calibration dataset sequence max length, '
'this should align with your model config, '
'and your dataset builder args: args.pad_max_length')
+# =============AWQ configs====================
+parser.add_argument("--use_auto_scale", action="store_true",
+ help="Enables best scales search based on activation distribution.")
+parser.add_argument("--use_auto_clip", action="store_true",
+ help="Enables clip range searchc.")
+parser.add_argument("--folding", action="store_true",
+ help="Allow insert mul before linear when the scale cannot be absorbed by last layer for TEQ/AWQ.")
+parser.add_argument('--absorb_layer_dict', type=dict, default={},
+ help="The layer dict that scale can be absorbed for TEQ/AWQ.")
+# ============AUTOROUND configs==============
+parser.add_argument(
+ "--lr",
+ type=float,
+ default=None,
+ help="learning rate, if None, it will be set to 1.0/iters automatically",
+)
+parser.add_argument(
+ "--minmax_lr",
+ type=float,
+ default=None,
+ help="minmax learning rate, if None,it will beset to be the same with lr",
+)
+parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.")
+parser.add_argument("--autoround_nsamples", default=128, type=int, help="num samples for autoround calibration.")
+parser.add_argument(
+ "--disable_quanted_input",
+ action="store_true",
+ help="whether to use the output of quantized block to tune the next block",
+)
# =============DoubleQuant configs====================
parser.add_argument("--double_quant_type",
@@ -196,6 +226,8 @@ def get_user_model():
)
tokenizer = AutoTokenizer.from_pretrained(args.model)
user_model = user_model.float()
+ if args.woq_algo == 'AutoRound':
+ user_model.to(torch.float32)
# Set model's seq_len when GPTQ calibration is enabled.
if args.woq_algo == 'GPTQ':
@@ -210,6 +242,31 @@ def get_user_model():
user_model.eval()
return user_model, tokenizer
+def eval_fn(user_model=None):
+ user_model.eval()
+ from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
+ import time
+
+ samples = args.iters * args.batch_size
+ eval_args = LMEvalParser(
+ model="hf",
+ user_model=user_model,
+ tokenizer=tokenizer,
+ batch_size=args.batch_size,
+ tasks=args.tasks,
+ limit=samples,
+ device="hpu" if is_hpex_available() else "cpu",
+ )
+ start = time.time()
+ results = evaluate(eval_args)
+ end = time.time()
+ for task_name in args.tasks.split(","):
+ if task_name == "wikitext":
+ acc = results["results"][task_name]["word_perplexity,none"]
+ else:
+ acc = results["results"][task_name]["acc,none"]
+ print("Accuracy: %.5f" % acc)
+ return acc
if args.quantize:
# dataset
@@ -224,9 +281,25 @@ def get_user_model():
shuffle=False,
collate_fn=calib_evaluator.collate_batch,
)
+ def calib_func(prepared_model):
+ for i, calib_input in enumerate(calib_dataloader):
+ if i > args.calib_iters:
+ break
+ prepared_model(calib_input[0])
# 3.x api
- from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize
+ from neural_compressor.torch.quantization import (
+ RTNConfig,
+ GPTQConfig,
+ AWQConfig,
+ AutoRoundConfig,
+ TEQConfig,
+ TuningConfig,
+ autotune,
+ get_woq_tuning_config,
+ prepare,
+ convert
+ )
from neural_compressor.torch.utils import get_double_quant_config_dict
weight_sym = True if args.woq_scheme == "sym" else False
if args.double_quant_type is not None:
@@ -239,6 +312,7 @@ def get_user_model():
# TODO: add group_dim into double quant config?
"use_full_range": args.woq_use_full_range,
"use_mse_search": args.woq_use_mse_search,
+ "quant_lm_head": args.quant_lm_head,
}
)
quant_config = RTNConfig.from_dict(double_quant_config_dict)
@@ -256,8 +330,8 @@ def get_user_model():
double_quant_dtype=args.double_quant_dtype,
double_quant_use_sym=args.double_quant_use_sym,
double_quant_group_size=args.double_quant_group_size,
+ quant_lm_head=args.quant_lm_head,
)
- quant_config.set_local("lm_head", RTNConfig(dtype="fp32"))
user_model = prepare(model=user_model, quant_config=quant_config)
user_model = convert(model=user_model)
elif args.woq_algo == "GPTQ":
@@ -288,6 +362,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
"act_order": args.gptq_actorder,
"block_size": args.gptq_block_size,
"static_groups": args.gptq_static_groups,
+ "quant_lm_head": args.quant_lm_head,
}
)
quant_config = GPTQConfig.from_dict(double_quant_config_dict)
@@ -307,11 +382,109 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
double_quant_dtype=args.double_quant_dtype,
double_quant_use_sym=args.double_quant_use_sym,
double_quant_group_size=args.double_quant_group_size,
+ quant_lm_head=args.quant_lm_head,
)
- quant_config.set_local("lm_head", GPTQConfig(dtype="fp32"))
user_model = prepare(model=user_model, quant_config=quant_config)
run_fn_for_gptq(user_model, dataloader_for_calibration)
user_model = convert(user_model)
+ elif args.woq_algo == "AWQ":
+ quant_config = AWQConfig(
+ dtype=args.woq_dtype,
+ bits=args.woq_bits,
+ use_sym=weight_sym,
+ group_size=args.woq_group_size,
+ group_dim=args.woq_group_dim,
+ use_auto_scale=args.use_auto_scale,
+ use_auto_clip=args.use_auto_clip,
+ folding=args.folding,
+ absorb_layer_dict=args.absorb_layer_dict,
+ quant_lm_head=args.quant_lm_head,
+ )
+ example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long)
+ run_fn = calib_func
+ user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
+ run_fn(user_model)
+ user_model = convert(user_model)
+ elif args.woq_algo == "TEQ":
+ quant_config = TEQConfig(
+ dtype=args.woq_dtype,
+ bits=args.woq_bits,
+ use_sym=weight_sym,
+ group_size=args.woq_group_size,
+ group_dim=args.woq_group_dim,
+ folding=args.folding,
+ quant_lm_head=args.quant_lm_head,
+ )
+ example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long)
+ run_fn = calib_func
+ user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
+ run_fn(user_model)
+ user_model = convert(user_model)
+ elif args.woq_algo == "AutoRound":
+ quant_config = AutoRoundConfig(
+ dtype=args.woq_dtype,
+ bits=args.woq_bits,
+ use_sym=weight_sym,
+ group_size=args.woq_group_size,
+ enable_quanted_input=not args.disable_quanted_input,
+ lr=args.lr,
+ minmax_lr=args.minmax_lr,
+ seqlen=args.pad_max_length,
+ nsamples=args.autoround_nsamples,
+ iters=args.autoround_iters,
+ )
+ quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32"))
+ from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader
+ dataloader = get_dataloader(tokenizer=tokenizer,
+ seqlen=args.pad_max_length,
+ dataset_name=datasets,
+ seed=args.seed,
+ bs=args.batch_size,
+ nsamples=args.autoround_nsamples)
+ @torch.no_grad()
+ def run_fn_for_autoround(model, dataloader):
+ for data in dataloader:
+ if isinstance(data, tuple) or isinstance(data, list):
+ model(*data)
+ elif isinstance(data, dict):
+ model(**data)
+ else:
+ model(data)
+ run_fn = run_fn_for_autoround
+ run_args = (dataloader,)
+ user_model = prepare(model=user_model, quant_config=quant_config)
+ run_fn(user_model, *run_args)
+ user_model = convert(user_model)
+ elif args.woq_algo == "AutoTune":
+ from utils import DataloaderPreprocessor
+ dataloaderPreprocessor = DataloaderPreprocessor(
+ dataloader_original=calib_dataloader,
+ use_max_length=args.gptq_use_max_length,
+ max_seq_length=args.gptq_max_seq_length,
+ )
+ dataloader = dataloaderPreprocessor.get_prepared_dataloader()
+ custom_tune_config = TuningConfig(config_set=get_woq_tuning_config())
+ from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device
+ from tqdm import tqdm
+ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
+ for batch in tqdm(dataloader_for_calibration):
+ batch = move_input_to_device(batch, device=None)
+ if isinstance(batch, tuple) or isinstance(batch, list):
+ model(batch[0])
+ elif isinstance(batch, dict):
+ model(**batch)
+ else:
+ model(batch)
+ return
+ example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long)
+ user_model = autotune(
+ model=user_model,
+ tune_config=custom_tune_config,
+ eval_fn=eval_fn,
+ run_fn=run_fn_for_gptq,
+ run_args=(dataloader, True), # run_args should be a tuple
+ example_inputs=example_inputs,
+ )
user_model.save(args.output_dir)
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh
index a860712b697..ed4ee705726 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh
@@ -85,6 +85,19 @@ function run_tuning {
model_name_or_path="EleutherAI/gpt-j-6b"
extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length"
extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+ elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then
+ model_name_or_path="EleutherAI/gpt-j-6b"
+ extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128"
+ extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K"
+ elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then
+ model_name_or_path="facebook/opt-125m"
+ extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128"
+ elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then
+ model_name_or_path="facebook/opt-125m"
+ extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128 --woq_scheme asym --autoround_iters 200 --autoround_nsamples 500"
+ elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
+ model_name_or_path="facebook/opt-125m"
+ extra_cmd=$extra_cmd" --woq_algo AutoTune --woq_bits 4"
fi
python -u run_clm_no_trainer.py \
diff --git a/neural_compressor/transformers/generation/__init__.py b/neural_compressor/transformers/generation/__init__.py
new file mode 100644
index 00000000000..4030000c22c
--- /dev/null
+++ b/neural_compressor/transformers/generation/__init__.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .beam_search import _beam_search
+from .greedy_search import _greedy_search
diff --git a/neural_compressor/transformers/generation/beam_search.py b/neural_compressor/transformers/generation/beam_search.py
new file mode 100644
index 00000000000..d4372810078
--- /dev/null
+++ b/neural_compressor/transformers/generation/beam_search.py
@@ -0,0 +1,490 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import time
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+from torch import nn
+from transformers.generation.beam_search import BeamScorer
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
+from transformers.utils import ModelOutput
+
+
+class BeamSearchEncoderDecoderOutput(ModelOutput):
+ sequences: torch.LongTensor = None
+ sequences_scores: Optional[torch.FloatTensor] = None
+ scores: Optional[Tuple[torch.FloatTensor]] = None
+ beam_indices: Optional[torch.LongTensor] = None
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+class BeamSearchDecoderOnlyOutput(ModelOutput):
+ sequences: torch.LongTensor = None
+ sequences_scores: Optional[torch.FloatTensor] = None
+ scores: Optional[Tuple[torch.FloatTensor]] = None
+ beam_indices: Optional[torch.LongTensor] = None
+ attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput]
+
+
+def _beam_search(
+ self,
+ input_ids: torch.LongTensor,
+ beam_scorer: BeamScorer,
+ logits_processor: Optional[LogitsProcessorList] = None,
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
+ max_length: Optional[int] = None,
+ pad_token_id: Optional[int] = None,
+ eos_token_id: Optional[Union[int, List[int]]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_scores: Optional[bool] = None,
+ return_dict_in_generate: Optional[bool] = None,
+ synced_gpus: bool = False,
+ **model_kwargs,
+) -> Union[BeamSearchOutput, torch.LongTensor]:
+ r"""
+ Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
+ can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+ In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate()
+ instead. For an overview of generation strategies and code examples, check the [following
+ guide](../generation_strategies).
+
+ Parameters:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The sequence used as a prompt for the generation.
+ beam_scorer (`BeamScorer`):
+ A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored, and
+ sorted during generation. For more information, read the documentation of [`BeamScorer`].
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+ used to tell if the generation loop should stop.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+ tokens. The maximum length of the sequence to be generated.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`Union[int, List[int]]`, *optional*):
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+ model_kwargs:
+ Additional model-specific kwargs will be forwarded to the `forward` function of the model. If the model
+ is an encoder-decoder model, the kwargs should include `encoder_outputs`.
+ Return:
+ [`BeamSearchDecoderOnlyOutput`], [`BeamSearchEncoderDecoderOutput`] or
+ `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+ `return_dict_in_generate=True` or a [`BeamSearchEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
+ Examples:
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForSeq2SeqLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... BeamSearchScorer,
+ ... )
+ >>> import torch
+ >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+ >>> encoder_input_str = "translate English to German: How old are you?"
+ >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
+ >>> # let's run beam search using 3 beams
+ >>> num_beams = 3
+ >>> # define decoder start token ids
+ >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
+ >>> input_ids = input_ids * model.config.decoder_start_token_id
+ >>> # add encoder_outputs to model keyword arguments
+ >>> model_kwargs = {
+ ... "encoder_outputs": model.get_encoder()(
+ ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
+ ... )
+ ... }
+ >>> # instantiate beam scorer
+ >>> beam_scorer = BeamSearchScorer(
+ ... batch_size=1,
+ ... num_beams=num_beams,
+ ... device=model.device,
+ ... )
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList(
+ ... [
+ ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
+ ... ]
+ ... )
+ >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
+ >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+ ['Wie alt bist du?']
+ ```"""
+ # init values
+ token_latency = (self.config.token_latency if hasattr(self.config, "token_latency") else False) or (
+ self.token_latency if hasattr(self, "token_latency") else False
+ )
+
+ latency_list = []
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+ if max_length is not None:
+ warnings.warn(
+ "`max_length` is deprecated in this function, use"
+ " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
+ UserWarning,
+ )
+ stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+ if len(stopping_criteria) == 0:
+ warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning)
+ pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+ eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+ if isinstance(eos_token_id, int):
+ eos_token_id = [eos_token_id]
+ output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+ output_attentions = output_attentions if output_attentions is not None else self.generation_config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+ )
+ return_dict_in_generate = (
+ return_dict_in_generate
+ if return_dict_in_generate is not None
+ else self.generation_config.return_dict_in_generate
+ )
+
+ batch_size = len(beam_scorer._beam_hyps)
+ num_beams = beam_scorer.num_beams
+
+ batch_beam_size, cur_len = input_ids.shape
+
+ if num_beams * batch_size != batch_beam_size:
+ raise ValueError(
+ f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
+ )
+
+ # init attention / hidden states / scores tuples
+ scores = () if (return_dict_in_generate and output_scores) else None
+ beam_indices = tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+ cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+ if return_dict_in_generate and self.config.is_encoder_decoder:
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+ encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+ # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
+ # of the first beam are considered to avoid sampling the exact same tokens across all beams.
+ beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
+ beam_scores[:, 1:] = -1e9
+ beam_scores = beam_scores.view((batch_size * num_beams,))
+ this_peer_finished = False # used by synced_gpus only
+ decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder
+ while True:
+ tic = time.time()
+ if synced_gpus:
+ # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+ # The following logic allows an early break if all peers finished generating their sequence
+ this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+ # send 0.0 if we finished, 1.0 otherwise
+ dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+ # did all peers finish? the reduced sum will be 0.0 then
+ if this_peer_finished_flag.item() == 0.0:
+ break
+
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+ if (
+ re.search("GPTJ", self.config.architectures[0])
+ or re.search("llama", self.config.architectures[0], re.IGNORECASE)
+ or re.search("gptneox", self.config.architectures[0], re.IGNORECASE)
+ or re.search("OPT", self.config.architectures[0], re.IGNORECASE)
+ or re.search("falcon", self.config.architectures[0], re.IGNORECASE)
+ or re.search("rw", self.config.architectures[0], re.IGNORECASE)
+ ):
+ first_token = False
+ input_bs = input_ids.size()[0]
+ has_position_id = True
+ if model_inputs["past_key_values"] is None:
+ first_token = True
+ if first_token and hasattr(self, "trace_graph"):
+ if re.search("GPTJ", self.config.architectures[0]):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.n_layer)
+ ]
+ )
+ elif re.search("llama", self.config.architectures[0], re.IGNORECASE):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ elif re.search("gptneox", self.config.architectures[0], re.IGNORECASE):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ elif re.search("OPT", self.config.architectures[0], re.IGNORECASE):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ has_position_id = False
+ elif re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search(
+ "rw", self.config.architectures[0], re.IGNORECASE
+ ):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ has_position_id = False
+
+ if hasattr(self, "trace_graph"):
+ if first_token:
+ new_attention_mask = model_inputs["attention_mask"][:batch_size].clone()
+ new_input_ids = model_inputs["input_ids"][:batch_size].clone()
+ if has_position_id:
+ new_position_ids = model_inputs["position_ids"][:batch_size].clone()
+ for i in range(batch_size):
+ new_attention_mask[i] = model_inputs["attention_mask"][i * num_beams]
+ new_input_ids[i] = model_inputs["input_ids"][i * num_beams]
+ if has_position_id:
+ new_position_ids[i] = model_inputs["position_ids"][i * num_beams]
+ model_inputs["attention_mask"] = new_attention_mask
+ model_inputs["input_ids"] = new_input_ids
+ if has_position_id:
+ model_inputs["position_ids"] = new_position_ids
+ model_inputs.pop("use_cache", None)
+ model_inputs.pop("token_type_ids", None)
+ if first_token and hasattr(self, "trace_graph_first"):
+ outputs = self.trace_graph_first(**model_inputs)
+ else:
+ outputs = self.trace_graph(**model_inputs)
+
+ if first_token and len(model_inputs["past_key_values"][1]) == 4:
+ outputs = list(outputs)
+ outputs[0] = outputs[0].repeat_interleave(num_beams, dim=0)
+ outputs = tuple(outputs)
+ if synced_gpus and this_peer_finished:
+ cur_len = cur_len + 1
+ continue # don't waste resources running the code we don't need
+ next_token_logits = outputs[0][:, -1, :]
+ else:
+ outputs = self(
+ **model_inputs,
+ return_dict=True,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ )
+ if synced_gpus and this_peer_finished:
+ cur_len = cur_len + 1
+ continue # don't waste resources running the code we don't need
+ next_token_logits = outputs.logits[:, -1, :]
+ else:
+ outputs = self(
+ **model_inputs,
+ return_dict=True,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ )
+ if synced_gpus and this_peer_finished:
+ cur_len = cur_len + 1
+ continue # don't waste resources running the code we don't need
+ next_token_logits = outputs.logits[:, -1, :]
+ next_token_scores = nn.functional.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size)
+ next_token_scores_processed = logits_processor(input_ids, next_token_scores)
+ next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores_processed)
+ # Store scores, attentions and hidden_states when required
+ if return_dict_in_generate:
+ if output_scores:
+ scores += (next_token_scores_processed,)
+ if output_attentions:
+ decoder_attentions += (
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+ )
+ if self.config.is_encoder_decoder:
+ cross_attentions += (outputs.cross_attentions,)
+
+ if output_hidden_states:
+ decoder_hidden_states += (
+ (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,)
+ )
+
+ # reshape for beam search
+ vocab_size = next_token_scores.shape[-1]
+ next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
+ # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam.
+ n_eos_tokens = len(eos_token_id) if eos_token_id else 0
+ next_token_scores, next_tokens = torch.topk(
+ next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True
+ )
+
+ next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
+ next_tokens = next_tokens % vocab_size
+
+ # stateless
+ beam_outputs = beam_scorer.process(
+ input_ids,
+ next_token_scores,
+ next_tokens,
+ next_indices,
+ pad_token_id=pad_token_id,
+ eos_token_id=eos_token_id,
+ beam_indices=beam_indices,
+ decoder_prompt_len=decoder_prompt_len,
+ )
+ beam_scores = beam_outputs["next_beam_scores"]
+ beam_next_tokens = beam_outputs["next_beam_tokens"]
+ beam_idx = beam_outputs["next_beam_indices"]
+ input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
+ model_kwargs = self._update_model_kwargs_for_generation(
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+ )
+ if model_kwargs["past_key_values"] is not None:
+ model_kwargs["past_key_values"] = self._temporary_reorder_cache(model_kwargs["past_key_values"], beam_idx)
+
+ if return_dict_in_generate and output_scores:
+ # pylint: disable=unsubscriptable-object
+ beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
+ # increase cur_len
+ cur_len = cur_len + 1
+ if token_latency:
+ if input_ids.is_xpu:
+ torch.xpu.synchronize()
+ latency_list.append(time.time() - tic)
+
+ if beam_scorer.is_done or stopping_criteria(input_ids, scores):
+ if not synced_gpus:
+ break
+ else:
+ this_peer_finished = True
+
+ sequence_outputs = beam_scorer.finalize(
+ input_ids,
+ beam_scores,
+ next_tokens,
+ next_indices,
+ pad_token_id=pad_token_id,
+ eos_token_id=eos_token_id,
+ max_length=stopping_criteria.max_length,
+ beam_indices=beam_indices,
+ decoder_prompt_len=decoder_prompt_len,
+ )
+ if return_dict_in_generate:
+ if not output_scores:
+ sequence_outputs["sequence_scores"] = None
+
+ if self.config.is_encoder_decoder:
+ output_result = BeamSearchEncoderDecoderOutput(
+ sequences=sequence_outputs["sequences"],
+ sequences_scores=sequence_outputs["sequence_scores"],
+ scores=scores,
+ beam_indices=sequence_outputs["beam_indices"],
+ encoder_attentions=encoder_attentions,
+ encoder_hidden_states=encoder_hidden_states,
+ decoder_attentions=decoder_attentions,
+ cross_attentions=cross_attentions,
+ decoder_hidden_states=decoder_hidden_states,
+ past_key_values=model_kwargs.get("past_key_values"),
+ )
+ else:
+ output_result = BeamSearchDecoderOnlyOutput(
+ sequences=sequence_outputs["sequences"],
+ sequences_scores=sequence_outputs["sequence_scores"],
+ scores=scores,
+ beam_indices=sequence_outputs["beam_indices"],
+ attentions=decoder_attentions,
+ hidden_states=decoder_hidden_states,
+ past_key_values=model_kwargs.get("past_key_values"),
+ )
+ else:
+ output_result = sequence_outputs["sequences"]
+ # result
+ if token_latency:
+ return (output_result, latency_list)
+ else:
+ return output_result
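When `token_latency` is enabled (via `self.config.token_latency` or `self.token_latency`), the loop above returns an `(output, latency_list)` pair with one wall-clock duration per decoding step. A small sketch of how a caller might summarize those timings:

```python
def summarize_latency(latency_list):
    # latency_list holds one duration in seconds per decoding step;
    # the first entry covers the prefill/first-token step.
    first_token = latency_list[0]
    rest = latency_list[1:]
    avg_next = sum(rest) / len(rest) if rest else 0.0
    return {"first_token_s": first_token, "avg_next_token_s": avg_next}

# Example with made-up timings:
print(summarize_latency([0.251, 0.034, 0.033, 0.035]))
```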
diff --git a/neural_compressor/transformers/generation/greedy_search.py b/neural_compressor/transformers/generation/greedy_search.py
new file mode 100644
index 00000000000..f35211005ff
--- /dev/null
+++ b/neural_compressor/transformers/generation/greedy_search.py
@@ -0,0 +1,401 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+import time
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
+from transformers.generation.streamers import BaseStreamer
+from transformers.utils import ModelOutput
+
+
+class GreedySearchDecoderOnlyOutput(ModelOutput):
+ sequences: torch.LongTensor = None
+ scores: Optional[Tuple[torch.FloatTensor]] = None
+ attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+class GreedySearchEncoderDecoderOutput(ModelOutput):
+ sequences: torch.LongTensor = None
+ scores: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+ past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None
+
+
+GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput]
+
+
+def _greedy_search(
+ self,
+ input_ids: torch.LongTensor,
+ logits_processor: Optional[LogitsProcessorList] = None,
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
+ max_length: Optional[int] = None,
+ pad_token_id: Optional[int] = None,
+ eos_token_id: Optional[Union[int, List[int]]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_scores: Optional[bool] = None,
+ return_dict_in_generate: Optional[bool] = None,
+ synced_gpus: bool = False,
+ streamer: Optional["BaseStreamer"] = None,
+ **model_kwargs,
+) -> Union[GreedySearchOutput, torch.LongTensor]:
+ r"""Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be
+ used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+
+ In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate()
+ instead. For an overview of generation strategies and code examples, check the [following
+ guide](../generation_strategies).
+
+ Parameters:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The sequence used as a prompt for the generation.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+ used to tell if the generation loop should stop.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+ tokens. The maximum length of the sequence to be generated.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`Union[int, List[int]]`, *optional*):
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+ streamer (`BaseStreamer`, *optional*):
+ Streamer object that will be used to stream the generated sequences. Generated tokens are passed
+ through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
+ model_kwargs:
+ Additional model-specific keyword arguments will be forwarded to the `forward` function of the model.
+ If the model is an encoder-decoder model, the kwargs should include `encoder_outputs`.
+ Return:
+ [`GreedySearchDecoderOnlyOutput`], [`GreedySearchEncoderDecoderOutput`] or
+ `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+ `return_dict_in_generate=True` or a [`GreedySearchEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
+ Examples:
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForCausalLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... StoppingCriteriaList,
+ ... MaxLengthCriteria,
+ ... )
+ >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+ >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+ >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
+ >>> input_prompt = "It might be possible to"
+ >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList(
+ ... [
+ ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
+ ... ]
+ ... )
+ >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
+ >>> outputs = model.greedy_search(
+ ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria
+ ... )
+ >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+ ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
+ ```
+ """
+ token_latency = (self.config.token_latency if hasattr(self.config, "token_latency") else False) or (
+ self.token_latency if hasattr(self, "token_latency") else False
+ )
+
+ latency_list = []
+ # init values
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+ if max_length is not None:
+ warnings.warn(
+ "`max_length` is deprecated in this function, use"
+ " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
+ UserWarning,
+ )
+ stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+ pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+ eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+ if isinstance(eos_token_id, int):
+ eos_token_id = [eos_token_id]
+ eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
+ output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+ output_attentions = output_attentions if output_attentions is not None else self.generation_config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+ )
+ return_dict_in_generate = (
+ return_dict_in_generate
+ if return_dict_in_generate is not None
+ else self.generation_config.return_dict_in_generate
+ )
+
+ # init attention / hidden states / scores tuples
+ scores = () if (return_dict_in_generate and output_scores) else None
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+ cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+ if return_dict_in_generate and self.config.is_encoder_decoder:
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+ encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+ # keep track of which sequences are already finished
+ unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+ this_peer_finished = False # used by synced_gpus only
+ while True:
+ tic = time.time()
+ if synced_gpus:
+ # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+ # The following logic allows an early break if all peers finished generating their sequence
+ this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+ # send 0.0 if we finished, 1.0 otherwise
+ dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+ # did all peers finish? the reduced sum will be 0.0 then
+ if this_peer_finished_flag.item() == 0.0:
+ break
+
+ # prepare model inputs
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+ if (
+ re.search("GPTJ", self.config.architectures[0])
+ or re.search("llama", self.config.architectures[0], re.IGNORECASE)
+ or re.search("gptneox", self.config.architectures[0], re.IGNORECASE)
+ or re.search("OPT", self.config.architectures[0], re.IGNORECASE)
+ or re.search("falcon", self.config.architectures[0], re.IGNORECASE)
+ or re.search("rw", self.config.architectures[0], re.IGNORECASE)
+ ):
+ first_token = False
+ input_bs = input_ids.size()[0]
+ if model_inputs["past_key_values"] is None:
+ first_token = True
+ if first_token and hasattr(self, "trace_graph"):
+ if re.search("GPTJ", self.config.architectures[0]):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(input_bs)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.n_layer)
+ ]
+ )
+ elif re.search("llama", self.config.architectures[0], re.IGNORECASE):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(input_bs)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ elif re.search("gptneox", self.config.architectures[0], re.IGNORECASE):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(input_bs)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ elif re.search("OPT", self.config.architectures[0], re.IGNORECASE):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(input_bs)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ elif re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search(
+ "rw", self.config.architectures[0], re.IGNORECASE
+ ):
+ beam_idx_tmp = torch.zeros(
+ (2048, int(input_bs)), dtype=torch.long, device=input_ids.device
+ ).contiguous()
+ model_inputs["past_key_values"] = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(),
+ beam_idx_tmp,
+ )
+ for i in range(self.config.num_hidden_layers)
+ ]
+ )
+ if hasattr(self, "trace_graph"):
+ model_inputs.pop("use_cache", None)
+ model_inputs.pop("token_type_ids", None)
+ outputs = self.trace_graph(**model_inputs)
+ if synced_gpus and this_peer_finished:
+ continue # don't waste resources running the code we don't need
+ next_token_logits = outputs[0][:, -1, :]
+ else:
+ outputs = self(
+ **model_inputs,
+ return_dict=True,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ )
+ if synced_gpus and this_peer_finished:
+ continue # don't waste resources running the code we don't need
+ next_token_logits = outputs.logits[:, -1, :]
+ else:
+ outputs = self(
+ **model_inputs,
+ return_dict=True,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ )
+ if synced_gpus and this_peer_finished:
+ continue # don't waste resources running the code we don't need
+ next_token_logits = outputs.logits[:, -1, :]
+
+ # pre-process distribution
+ next_tokens_scores = logits_processor(input_ids, next_token_logits)
+
+ # Store scores, attentions and hidden_states when required
+ if return_dict_in_generate:
+ if output_scores:
+ scores += (next_tokens_scores,)
+ if output_attentions:
+ decoder_attentions += (
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+ )
+ if self.config.is_encoder_decoder:
+ cross_attentions += (outputs.cross_attentions,)
+
+ if output_hidden_states:
+ decoder_hidden_states += (
+ (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,)
+ )
+
+ # argmax
+ next_tokens = torch.argmax(next_tokens_scores, dim=-1)
+
+ # finished sentences should have their next token be a padding token
+ if eos_token_id is not None:
+ if pad_token_id is None:
+ raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+ # update generated ids, model inputs, and length for next step
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+ if streamer is not None:
+ streamer.put(next_tokens.cpu())
+ model_kwargs = self._update_model_kwargs_for_generation(
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+ )
+
+ # if eos_token was found in one sentence, set sentence to finished
+ if eos_token_id_tensor is not None:
+ unfinished_sequences = unfinished_sequences.mul(
+ next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+ )
+ # stop when each sentence is finished
+ if unfinished_sequences.max() == 0:
+ this_peer_finished = True
+ # record per-token latency for this step, then stop if we exceed the maximum length
+ if token_latency:
+ if input_ids.is_xpu:
+ torch.xpu.synchronize()
+ latency_list.append(time.time() - tic)
+ if stopping_criteria(input_ids, scores):
+ this_peer_finished = True
+ if this_peer_finished and not synced_gpus:
+ break
+ if streamer is not None:
+ streamer.end()
+
+ if return_dict_in_generate:
+ if self.config.is_encoder_decoder:
+ output_result = GreedySearchEncoderDecoderOutput(
+ sequences=input_ids,
+ scores=scores,
+ encoder_attentions=encoder_attentions,
+ encoder_hidden_states=encoder_hidden_states,
+ decoder_attentions=decoder_attentions,
+ cross_attentions=cross_attentions,
+ decoder_hidden_states=decoder_hidden_states,
+ past_key_values=model_kwargs.get("past_key_values"),
+ )
+ else:
+ output_result = GreedySearchDecoderOnlyOutput(
+ sequences=input_ids,
+ scores=scores,
+ attentions=decoder_attentions,
+ hidden_states=decoder_hidden_states,
+ past_key_values=model_kwargs.get("past_key_values"),
+ )
+ else:
+ output_result = input_ids
+
+ if token_latency:
+ return (output_result, latency_list)
+ else:
+ return output_result
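Both decoding loops rebuild the same dummy `past_key_values` tuple for every supported architecture; only the layer-count attribute (`n_layer` vs. `num_hidden_layers`) and the batch dimension differ. A hedged refactoring sketch of a shared helper (not part of this patch) is shown below:

```python
import torch

def make_dummy_past(num_layers: int, batch: int, device, max_positions: int = 2048):
    """Build the placeholder past_key_values expected by the trace_graph first-token path."""
    beam_idx_tmp = torch.zeros((max_positions, batch), dtype=torch.long, device=device).contiguous()
    return tuple(
        (
            torch.zeros(1, 0, 0, 1, dtype=torch.long, device=device).contiguous(),
            torch.zeros([1, 1, 1, 1], device=device).contiguous(),
            torch.zeros([1, 1, 1, 1], device=device).contiguous(),
            beam_idx_tmp,
        )
        for _ in range(num_layers)
    )

# e.g. for a llama-style config: make_dummy_past(config.num_hidden_layers, input_bs, input_ids.device)
```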
diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py
index e81c3295bfa..877e3be89be 100644
--- a/neural_compressor/transformers/quantization/utils.py
+++ b/neural_compressor/transformers/quantization/utils.py
@@ -351,10 +351,12 @@ def convert_to_quantized_model(model, config, device="cpu"):
import intel_extension_for_pytorch
assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!"
- os.environ["INC_TARGET_DEVICE"] = "cpu"
- logger.info(
- "Set the environment variable INC_TARGET_DEVICE='cpu' to ensure the quantization process occurs on the CPU."
- )
+ if "INC_TARGET_DEVICE" not in os.environ:
+ os.environ["INC_TARGET_DEVICE"] = "cpu"
+ logger.info(
+ "Set the environment variable INC_TARGET_DEVICE='cpu'"
+ " to ensure the quantization process occurs on the CPU."
+ )
orig_dtype = torch.float32
for param in model.parameters():