diff --git a/.azure-pipelines/scripts/ut/run_itrex.sh b/.azure-pipelines/scripts/ut/run_itrex.sh deleted file mode 100644 index 5adaf86579b..00000000000 --- a/.azure-pipelines/scripts/ut/run_itrex.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -set -xe -source /neural-compressor/.azure-pipelines/scripts/change_color.sh -python -c "import neural_compressor as nc;print(nc.version.__version__)" -echo "run itrex ut..." - -# install inc 3x deps -pip install -r /neural-compressor/requirements_pt.txt -export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH - -# prepare itrex -git clone https://github.com/intel/intel-extension-for-transformers.git /intel-extension-for-transformers -cd /intel-extension-for-transformers && git rev-parse --short HEAD -bash /intel-extension-for-transformers/.github/workflows/script/prepare_env.sh -bash /intel-extension-for-transformers/.github/workflows/script/install_binary.sh - -# prepare test env -sed -i '/neural-compressor.git/d' /intel-extension-for-transformers/tests/requirements.txt -pip install -r /intel-extension-for-transformers/tests/requirements.txt -# workaround -pip install onnx==1.16.0 -pip install onnxruntime==1.18.0 -echo "pip list itrex ut deps..." -pip list -LOG_DIR=/neural-compressor/log_dir -mkdir -p ${LOG_DIR} -ut_log_name=${LOG_DIR}/ut_itrex.log - -# run unit test -cd /intel-extension-for-transformers/tests/CI -find . -name "test*.py" | grep -v "test_tf" | sed 's,\.\/,python ,g' | sed 's/$/ --verbose/' > run.sh - -# run UT -$BOLD_YELLOW && echo "cat run.sh..." && $RESET -cat run.sh | tee ${ut_log_name} -$BOLD_YELLOW && echo "------UT start-------" && $RESET -bash -x run.sh 2>&1 | tee -a ${ut_log_name} -$BOLD_YELLOW && echo "------ UT end -------" && $RESET - -if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then - echo "Find errors in UT test, please check the output..." - exit 1 -fi -echo "UT finished successfully! 
" \ No newline at end of file diff --git a/.azure-pipelines/ut-itrex.yml b/.azure-pipelines/ut-itrex.yml deleted file mode 100644 index 2f038270234..00000000000 --- a/.azure-pipelines/ut-itrex.yml +++ /dev/null @@ -1,35 +0,0 @@ -trigger: none - -pr: - autoCancel: true - drafts: false - branches: - include: - - master - paths: - include: - - neural_compressor - - setup.py - - requirements.txt - - .azure-pipelines/scripts/ut/run_itrex.sh - - .azure-pipelines/ut-itrex.yml - -pool: MODEL_PERF_TEST - -variables: - UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir - -stages: - - stage: - displayName: Unit Test of ITREX - jobs: - - job: - steps: - - template: template/ut-template.yml - parameters: - dockerConfigName: 'commonDockerConfig' - utScriptFileName: 'run_itrex' - uploadPath: $(UPLOAD_PATH) - utArtifact: 'ut_itrex' - utTestMode: "no-coverage" - utContainerName: "utTest-itrex" diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 9f566749da0..33a7a2b06a4 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -78,19 +78,6 @@ subprojects: - "UT-Basic (Unit Test other basic case Test other basic case)" - "UT-Basic (Unit Test other cases baseline Test other cases baseline)" - - id: "Unit Tests ITREX workflow" - paths: - - "neural_compressor/**" - - "setup.py" - - "requirements.txt" - - ".azure-pipelines/scripts/ut/run_itrex.sh" - - ".azure-pipelines/ut-itrex.yml" - - "!neural_compressor/common/**" - - "!neural_compressor/torch/**" - - "!neural_compressor/tensorflow/**" - checks: - - "UT-ITREX" - - id: "Unit Tests 3x-TensorFlow workflow" paths: - "neural_compressor/common/**" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b821d93eb1..2875b945c57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -76,7 +76,7 @@ repos: )$ - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 + rev: 06907d0 hooks: - id: docformatter args: [ diff --git a/README.md b/README.md index e2bef73e2d7..983b80227ed 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testi * Collaborate with cloud marketplaces such as [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [Amazon Web Services](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel), software platforms such as [Alibaba Cloud](https://www.intel.com/content/www/us/en/developer/articles/technical/quantize-ai-by-oneapi-analytics-on-alibaba-cloud.html), [Tencent TACO](https://new.qq.com/rain/a/20221202A00B9S00) and [Microsoft Olive](https://github.com/microsoft/Olive), and open AI ecosystem such as [Hugging Face](https://huggingface.co/blog/intel), [PyTorch](https://pytorch.org/tutorials/recipes/intel_neural_compressor_for_pytorch.html), [ONNX](https://github.com/onnx/models#models), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [Lightning AI](https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/advanced/post_training_quantization.rst) ## What's New +* [2024/10] [Transformers-like API](./docs/source/3x/transformers_like_api.md) for INT4 inference on Intel CPU and GPU. * [2024/07] From 3.0 release, framework extension API is recommended to be used for quantization. 
* [2024/07] Performance optimizations and usability improvements on [client-side](./docs/source/3x/client_quant.md). @@ -164,6 +165,16 @@ Intel Neural Compressor will convert the model format from auto-gptq to hpu form Smooth Quantization + + + Transformers-like APIs + + + + + Overview + + Other Modules diff --git a/docs/source/3x/transformers_like_api.md b/docs/source/3x/transformers_like_api.md index 9aafeed5278..55e8d964072 100644 --- a/docs/source/3x/transformers_like_api.md +++ b/docs/source/3x/transformers_like_api.md @@ -208,6 +208,8 @@ python run_generation_gpu_woq.py --woq --benchmark --model save_dir >Note: > * Saving quantized model should be executed before the optimize_transformers function is called. > * The optimize_transformers function is designed to optimize transformer-based models within frontend Python modules, with a particular focus on Large Language Models (LLMs). It provides optimizations for both model-wise and content-generation-wise. The detail of `optimize_transformers`, please refer to [the link](https://github.com/intel/intel-extension-for-pytorch/blob/xpu-main/docs/tutorials/llm/llm_optimize_transformers.md). +>* The quantization process is performed on the CPU accelerator by default. Users can override this setting by specifying the environment variable `INC_TARGET_DEVICE`. Usage on bash: ```export INC_TARGET_DEVICE=xpu```. +>* For Linux systems, users need to configure the environment variables appropriately to achieve optimal performance. For example, set the OMP_NUM_THREADS explicitly. For processors with hybrid architecture (including both P-cores and E-cores), it is recommended to bind tasks to all P-cores using taskset. ## Examples diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index bb18153f389..a1f33413864 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -1,196 +1,225 @@ { - "pytorch": { - "llava_woq_autoround_int4":{ - "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava", - "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", - "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", - "input_model": "liuhaotian/llava-v1.5-7b", - "main_script": "main.py", - "batch_size": 1 - }, - "qwenvl_woq_autoround_int4":{ - "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL", - "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", - "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", - "input_model": "Qwen/Qwen-VL", - "main_script": "main.py", - "batch_size": 8 - }, - "Phi3Vision_woq_autoround_int4":{ - "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi3-3-vision", - "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", - "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", - "input_model": "microsoft/Phi-3-vision-128k-instruct", - "main_script": "main.py", - "batch_size": 1 - }, - "opt_125m_woq_gptq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_woq_gptq_int4_dq_bnb":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_woq_gptq_int4_dq_ggml":{ - 
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "llama2_7b_gptq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "llama2_7b_gptq_int4_dq_bnb":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "llama2_7b_gptq_int4_dq_ggml":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_rtn_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_rtn_int4_dq_bnb":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_rtn_int4_dq_ggml":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_gptq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_gptq_int4_dq_bnb":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_gptq_int4_dq_ggml":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "gpt_j_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "opt_125m_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", 
- "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "dlrm_ipex": { - "model_src_dir": "recommendation/dlrm/static_quant/ipex", - "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", - "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", - "main_script": "dlrm_s_pytorch.py", - "batch_size": 16384 - }, - "resnet18_pt2e_static":{ - "model_src_dir": "cv/static_quant", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", - "input_model": "", - "main_script": "main.py", - "batch_size": 1 - }, - "resnet18_fp8_static":{ - "model_src_dir": "cv/fp8_quant", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", - "input_model": "", - "main_script": "main.py", - "batch_size": 1 - }, - "opt_125m_pt2e_static":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "sdxl_ipex_sq":{ - "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "main.py", - "batch_size": 1 - }, - "resnet18_mixed_precision": { - "model_src_dir": "cv/mixed_precision", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", - "input_model": "resnet18", - "main_script": "main.py", - "batch_size": 20 + "pytorch": { + "llava_woq_autoround_int4":{ + "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava", + "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", + "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", + "input_model": "liuhaotian/llava-v1.5-7b", + "main_script": "main.py", + "batch_size": 1 + }, + "qwenvl_woq_autoround_int4":{ + "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL", + "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", + "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", + "input_model": "Qwen/Qwen-VL", + "main_script": "main.py", + "batch_size": 8 + }, + "Phi3Vision_woq_autoround_int4":{ + "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi3-3-vision", + "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", + "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", + "input_model": "microsoft/Phi-3-vision-128k-instruct", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_woq_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "llama2_7b_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "llama2_7b_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + 
"main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "llama2_7b_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_awq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_awq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_autoround_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_autotune_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": 
"nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "dlrm_ipex": { + "model_src_dir": "recommendation/dlrm/static_quant/ipex", + "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", + "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", + "main_script": "dlrm_s_pytorch.py", + "batch_size": 16384 + }, + "resnet18_pt2e_static":{ + "model_src_dir": "cv/static_quant", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "resnet18_fp8_static":{ + "model_src_dir": "cv/fp8_quant", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_pt2e_static":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "sdxl_ipex_sq":{ + "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "resnet18_mixed_precision": { + "model_src_dir": "cv/mixed_precision", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "resnet18", + "main_script": "main.py", + "batch_size": 20 + } } } } diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt index bc70f987095..736d79c4d72 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt @@ -2,6 +2,5 @@ transformers torch sentencepiece neural-compressor -intel-extension-for-transformers >= 1.4.1 lm-eval==0.4.2 peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py index 40bf217c72e..6ad8e495db2 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py @@ -62,7 +62,7 @@ def get_user_model(): user_model = convert(model=user_model) user_model.eval() -from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser +from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", user_model=user_model, diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt index d4155dfbf75..d9f59d178e7 100644 --- 
a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt @@ -8,7 +8,6 @@ pytest wandb einops neural-compressor -intel-extension-for-transformers -lm_eval==0.4.2 +lm_eval==0.4.3 peft optimum-intel diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index 694c0505ea4..a082421f15b 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -185,7 +185,7 @@ def eval_func(model): config = AutoConfig.from_pretrained(args.model) setattr(model, "config", config) - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", user_model=model, @@ -232,7 +232,7 @@ def eval_func(model): if args.accuracy: user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt index f0b56e558d3..5174182f312 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt @@ -8,6 +8,5 @@ pytest wandb einops neural-compressor -intel-extension-for-transformers -lm_eval==0.4.2 +lm_eval==0.4.3 peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py index b56c01f20f5..eb97f930d29 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py @@ -212,7 +212,7 @@ def run_fn(model): if args.accuracy: user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", user_model=user_model, @@ -232,7 +232,7 @@ def run_fn(model): if args.performance: user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser import time samples = args.iters * args.batch_size diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt index b6d9b6c55de..63959e924cb 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt @@ -2,6 +2,5 @@ transformers torch sentencepiece neural-compressor -intel-extension-for-transformers >= 1.4.1 -lm-eval==0.4.2 +lm-eval==0.4.3 peft \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py index 395bc6f9b57..a2aa6c1302a 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -116,7 +116,7 @@ def get_example_inputs(tokenizer): if args.accuracy: - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", user_model=user_model, diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md index 1abe2633ea3..f0760cc2fe1 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md @@ -103,6 +103,8 @@ python run_generate_cpu_woq.py \ > 1. default search algorithm is beam search with num_beams = 1. > 2. [ipex.optimize_transformers](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.10%2Bxpu/docs/tutorials/llm/llm_optimize_transformers.md) Support for the optimized inference of model types "gptj," "mistral," "qwen," and "llama" to achieve high performance and accuracy. Ensure accurate inference for other model types as well. > 3. We provide compression technologies `WeightOnlyQuant` with `Rtn/GPTQ/AutoRound` algorithms and `load_in_4bit` and `load_in_8bit` work on intel GPU device. +> 4. The quantization process runs on the CPU accelerator by default. Users can override this by setting the environment variable `INC_TARGET_DEVICE`, e.g. in bash: `export INC_TARGET_DEVICE=xpu`. +> 5. On Linux, configure the environment variables appropriately to achieve optimal performance, e.g. set `OMP_NUM_THREADS` explicitly. On processors with a hybrid architecture (both P-cores and E-cores), it is recommended to bind the task to all P-cores using `taskset`.
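+> A minimal sketch of such a setup on Linux is shown below; the thread count and core list are illustrative assumptions for a machine whose P-cores are exposed as logical cores 0-7, so adjust them to your hardware, and replace `<your-arguments>` with the usual `run_generate_cpu_woq.py` arguments shown above:
+> ```bash
+> export OMP_NUM_THREADS=8      # one OpenMP thread per P-core (illustrative value)
+> export INC_TARGET_DEVICE=xpu  # optional: override the default CPU target for quantization
+> taskset -c 0-7 python run_generate_cpu_woq.py <your-arguments>   # bind the run to the P-cores
+> ```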
## Prerequisite​ ### Dependencies @@ -111,7 +113,7 @@ Intel-extension-for-pytorch dependencies are in oneapi package, before install i ### Create Environment​ Pytorch and Intel-extension-for-pytorch version for intel GPU > 2.1 are required, python version requests equal or higher than 3.9 due to [text evaluation library](https://github.com/EleutherAI/lm-evaluation-harness/tree/master) limitation, the dependent packages are listed in requirements_GPU.txt, we recommend create environment as the following steps. For Intel-exension-for-pytorch, we should install from source code now, and Intel-extension-for-pytorch will add weight-only quantization in the next version. ->**Note**: please install transformers==4.40.2. +>**Note**: please install transformers==4.38.1. ```bash pip install -r requirements_GPU.txt diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md index 2c3b14459c8..6a5e75b5023 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md @@ -30,12 +30,6 @@ The scripts [run_generation_sq.py](./run_generation_sq.py) and [run_generation_c ```bash # Installation -git clone https://github.com/intel/intel-extension-for-transformers.git - -# install ITREX -cd intel-extension-for-transformers -pip install -r requirements.txt -pip install -v . # install requirements cd examples/huggingface/pytorch/text-generation/quantization diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py index 62ef4ca2f49..8329d74b9a4 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py @@ -293,7 +293,6 @@ _commit_hash=args._commit_hash, ) elif args.load_in_4bit or args.load_in_8bit: - # CPU device usage is provided by intel-extension-for-transformers. 
user_model = AutoModelForCausalLM.from_pretrained( args.model, load_in_4bit=args.load_in_4bit, diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py index 9245d53eb50..7b63a015600 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py @@ -6,9 +6,9 @@ from transformers import AutoConfig, AutoTokenizer from transformers.generation import GenerationConfig import intel_extension_for_pytorch as ipex -# from intel_extension_for_transformers.transformers.llm.utils.generation import _beam_search, _greedy_search from neural_compressor.transformers import AutoModelForCausalLM, AutoRoundConfig, RtnConfig, GPTQConfig from neural_compressor.transformers.quantization.utils import convert_dtype_str2torch +from neural_compressor.transformers.generation import _greedy_search, _beam_search from transformers.utils import check_min_version import contextlib @@ -189,7 +189,6 @@ torch_dtype=torch.float16, ) elif args.load_in_4bit or args.load_in_8bit: - # CPU device usage is provided by intel-extension-for-transformers. user_model = AutoModelForCausalLM.from_pretrained(args.model, device_map=args.device, load_in_4bit=args.load_in_4bit, diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md index 889d7b42682..0519b490ff7 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md @@ -35,9 +35,8 @@ python run_clm_no_trainer.py \ --woq_group_size 128 \ --gptq_max_seq_length 2048 \ --gptq_use_max_length \ - --accuracy \ - --tasks "lambada_openai" \ - --double_quant_type "BNB_NF4" + --double_quant_type "BNB_NF4" \ + --output_dir saved_results # "--woq_algo RTN" is used to enable RTN algorithms python run_clm_no_trainer.py \ @@ -48,9 +47,38 @@ python run_clm_no_trainer.py \ --woq_bits 4 \ --woq_scheme asym \ --woq_group_size 128 \ + --double_quant_type "BNB_NF4" + --output_dir saved_results + +# "--woq_algo AWQ" is used to enable AWQ algorithms +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --dataset NeelNanda/pile-10k \ + --quantize \ + --woq_algo AWQ \ + --woq_bits 4 \ + --woq_scheme asym \ + --woq_group_size 128 \ + --calib_iters 128 + +# "--woq_algo AutoRound" is used to enable AutoRound algorithms +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --dataset NeelNanda/pile-10k \ + --quantize \ + --woq_algo AutoRound \ + --woq_bits 4 \ + --woq_scheme asym \ + --woq_group_size 128 + +# "--accuracy" for eval +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --dataset NeelNanda/pile-10k \ + --int8 \ --accuracy \ --tasks "lambada_openai" \ - --double_quant_type "BNB_NF4" + --output_dir saved_results ``` **Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. 
For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ. @@ -72,8 +100,6 @@ python run_clm_no_trainer.py \ --woq_group_size 128 \ --gptq_max_seq_length 2048 \ --gptq_use_max_length \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" # "--woq_algo RTN" is used to enable RTN algorithms @@ -85,13 +111,40 @@ python run_clm_no_trainer.py \ --woq_bits 4 \ --woq_scheme asym \ --woq_group_size 128 \ + --double_quant_type "BNB_NF4" + +# "--woq_algo AWQ" is used to enable AWQ algorithms +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --dataset NeelNanda/pile-10k \ + --quantize \ + --woq_algo AWQ \ + --woq_bits 4 \ + --woq_scheme asym \ + --woq_group_size 128 \ + --calib_iters 128 + +# "--woq_algo AutoRound" is used to enable AutoRound algorithms +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --dataset NeelNanda/pile-10k \ + --quantize \ + --woq_algo AutoRound \ + --woq_bits 4 \ + --woq_scheme asym \ + --woq_group_size 128 + +# "--accuracy" for eval +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --dataset NeelNanda/pile-10k \ + --int8 \ --accuracy \ --tasks "lambada_openai" \ - --double_quant_type "BNB_NF4" + --output_dir saved_results ``` ### LLAMA2-7b/13b/70b ->Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy. #### Quantization ```bash @@ -107,8 +160,6 @@ python run_clm_no_trainer.py \ --woq_group_size 128 \ --gptq_max_seq_length 2048 \ --gptq_use_max_length \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" # "--woq_algo RTN" is used to enable RTN algorithms @@ -120,8 +171,6 @@ python run_clm_no_trainer.py \ --woq_bits 4 \ --woq_scheme asym \ --woq_group_size 128 \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" ``` diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt index 63c4d6e10b1..4745e2dfbd7 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt @@ -8,7 +8,6 @@ pytest wandb einops neural-compressor -intel-extension-for-transformers lm_eval==0.4.3 peft auto_round diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 9e1d766128e..6c84e27ce88 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -70,58 +70,59 @@ function run_benchmark { fi echo $extra_cmd - if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then + if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = 
"opt_125m_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then - model_name_or_path="EleutherAI/gpt-j-6b"\ - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" + model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then - model_name_or_path="EleutherAI/gpt-j-6b"\ - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then + model_name_or_path="facebook/opt-125m" + elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoRound" + elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then + model_name_or_path="facebook/opt-125m" fi - python -u 
run_clm_no_trainer.py \ - --model ${model_name_or_path} \ - --output_dir ${tuned_checkpoint} \ - --task ${task} \ - --batch_size ${batch_size} \ - ${extra_cmd} ${mode_cmd} + if [[ ${mode} == "accuracy" ]]; then + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} + elif [[ ${mode} == "performance" ]]; then + incbench --num_cores_per_instance 4 run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --batch_size ${batch_size} \ + --output_dir ${tuned_checkpoint} \ + ${extra_cmd} ${mode_cmd} + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + } main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 02329bd9e15..51be2900ba7 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -53,7 +53,7 @@ type=str, help="tasks for accuracy validation") parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============WeightOnly configs=============== -parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound', 'AutoTune'], help="Weight-only parameter.") parser.add_argument("--woq_bits", type=int, default=8) parser.add_argument("--woq_dtype", type=str, default="int") @@ -62,6 +62,7 @@ parser.add_argument("--woq_scheme", default="sym") parser.add_argument("--woq_use_mse_search", action="store_true") parser.add_argument("--woq_use_full_range", action="store_true") +parser.add_argument("--quant_lm_head", action="store_true", help="whether to quantize the lm_head layer in transformers") # =============GPTQ configs==================== parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") @@ -78,6 +79,35 @@ help='Calibration dataset sequence max length, ' 'this should align with your model config, ' 'and your dataset builder args: args.pad_max_length') +# =============AWQ configs==================== +parser.add_argument("--use_auto_scale", action="store_true", + help="Enables best scales search based on activation distribution.") +parser.add_argument("--use_auto_clip", action="store_true", + help="Enables clip range search.") +parser.add_argument("--folding", action="store_true", + help="Allow inserting mul before linear when the scale cannot be absorbed by the last layer for TEQ/AWQ.") +parser.add_argument('--absorb_layer_dict', type=dict, default={}, + help="The layer dict that scale can be absorbed for TEQ/AWQ.") +# ============AUTOROUND configs============== +parser.add_argument( + "--lr", + type=float, + default=None, + help="learning rate, if None, it will be set to 1.0/iters automatically", +) +parser.add_argument( + "--minmax_lr", + type=float, + default=None, + help="minmax learning rate; if None, it will be set to the same value as lr", +) +parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.") +parser.add_argument("--autoround_nsamples", default=128, type=int, help="num samples for 
autoround calibration.") +parser.add_argument( + "--disable_quanted_input", + action="store_true", + help="whether to use the output of quantized block to tune the next block", +) # =============DoubleQuant configs==================== parser.add_argument("--double_quant_type", @@ -196,6 +226,8 @@ def get_user_model(): ) tokenizer = AutoTokenizer.from_pretrained(args.model) user_model = user_model.float() + if args.woq_algo == 'AutoRound': + user_model.to(torch.float32) # Set model's seq_len when GPTQ calibration is enabled. if args.woq_algo == 'GPTQ': @@ -210,6 +242,31 @@ def get_user_model(): user_model.eval() return user_model, tokenizer +def eval_fn(user_model=None): + user_model.eval() + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="hpu" if is_hpex_available() else "cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + return acc if args.quantize: # dataset @@ -224,9 +281,25 @@ def get_user_model(): shuffle=False, collate_fn=calib_evaluator.collate_batch, ) + def calib_func(prepared_model): + for i, calib_input in enumerate(calib_dataloader): + if i > args.calib_iters: + break + prepared_model(calib_input[0]) # 3.x api - from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize + from neural_compressor.torch.quantization import ( + RTNConfig, + GPTQConfig, + AWQConfig, + AutoRoundConfig, + TEQConfig, + TuningConfig, + autotune, + get_woq_tuning_config, + prepare, + convert + ) from neural_compressor.torch.utils import get_double_quant_config_dict weight_sym = True if args.woq_scheme == "sym" else False if args.double_quant_type is not None: @@ -239,6 +312,7 @@ def get_user_model(): # TODO: add group_dim into double quant config? 
"use_full_range": args.woq_use_full_range, "use_mse_search": args.woq_use_mse_search, + "quant_lm_head": args.quant_lm_head, } ) quant_config = RTNConfig.from_dict(double_quant_config_dict) @@ -256,8 +330,8 @@ def get_user_model(): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + quant_lm_head=args.quant_lm_head, ) - quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) user_model = convert(model=user_model) elif args.woq_algo == "GPTQ": @@ -288,6 +362,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): "act_order": args.gptq_actorder, "block_size": args.gptq_block_size, "static_groups": args.gptq_static_groups, + "quant_lm_head": args.quant_lm_head, } ) quant_config = GPTQConfig.from_dict(double_quant_config_dict) @@ -307,11 +382,109 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + quant_lm_head=args.quant_lm_head, ) - quant_config.set_local("lm_head", GPTQConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) + elif args.woq_algo == "AWQ": + quant_config = AWQConfig( + dtype=args.woq_dtype, + bits=args.woq_bits, + use_sym=weight_sym, + group_size=args.woq_group_size, + group_dim=args.woq_group_dim, + use_auto_scale=args.use_auto_scale, + use_auto_clip=args.use_auto_clip, + folding=args.folding, + absorb_layer_dict=args.absorb_layer_dict, + quant_lm_head=args.quant_lm_head, + ) + example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) + run_fn = calib_func + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + elif args.woq_algo == "TEQ": + quant_config = TEQConfig( + dtype=args.woq_dtype, + bits=args.woq_bits, + use_sym=weight_sym, + group_size=args.woq_group_size, + group_dim=args.woq_group_dim, + folding=args.folding, + quant_lm_head=args.quant_lm_head, + ) + example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) + run_fn = calib_func + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + elif args.woq_algo == "AutoRound": + quant_config = AutoRoundConfig( + dtype=args.woq_dtype, + bits=args.woq_bits, + use_sym=weight_sym, + group_size=args.woq_group_size, + enable_quanted_input=not args.disable_quanted_input, + lr=args.lr, + minmax_lr=args.minmax_lr, + seqlen=args.pad_max_length, + nsamples=args.autoround_nsamples, + iters=args.autoround_iters, + ) + quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) + from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader + dataloader = get_dataloader(tokenizer=tokenizer, + seqlen=args.pad_max_length, + dataset_name=datasets, + seed=args.seed, + bs=args.batch_size, + nsamples=args.autoround_nsamples) + @torch.no_grad() + def run_fn_for_autoround(model, dataloader): + for data in dataloader: + if isinstance(data, tuple) or isinstance(data, list): + model(*data) + elif isinstance(data, dict): + model(**data) + else: + model(data) + run_fn = run_fn_for_autoround + run_args = (dataloader,) + user_model = 
prepare(model=user_model, quant_config=quant_config) + run_fn(user_model, *run_args) + user_model = convert(user_model) + elif args.woq_algo == "AutoTune": + from utils import DataloaderPreprocessor + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=calib_dataloader, + use_max_length=args.gptq_use_max_length, + max_seq_length=args.gptq_max_seq_length, + ) + dataloader = dataloaderPreprocessor.get_prepared_dataloader() + custom_tune_config = TuningConfig(config_set=get_woq_tuning_config()) + from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device + from tqdm import tqdm + def run_fn_for_gptq(model, dataloader_for_calibration, *args): + for batch in tqdm(dataloader_for_calibration): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + return + example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) + user_model = autotune( + model=user_model, + tune_config=custom_tune_config, + eval_fn=eval_fn, + run_fn=run_fn_for_gptq, + run_args=(dataloader, True), # run_args should be a tuple, + example_inputs=example_inputs, + ) user_model.save(args.output_dir) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index a860712b697..ed4ee705726 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -85,6 +85,19 @@ function run_tuning { model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" + extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" + elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128 --woq_scheme asym --autoround_iters 200 --autoround_nsamples 500" + elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoTune --woq_bits 4" fi python -u run_clm_no_trainer.py \ diff --git a/neural_compressor/transformers/generation/__init__.py b/neural_compressor/transformers/generation/__init__.py new file mode 100644 index 00000000000..4030000c22c --- /dev/null +++ b/neural_compressor/transformers/generation/__init__.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .beam_search import _beam_search +from .greedy_search import _greedy_search diff --git a/neural_compressor/transformers/generation/beam_search.py b/neural_compressor/transformers/generation/beam_search.py new file mode 100644 index 00000000000..d4372810078 --- /dev/null +++ b/neural_compressor/transformers/generation/beam_search.py @@ -0,0 +1,490 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import time +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +from torch import nn +from transformers.generation.beam_search import BeamScorer +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria +from transformers.utils import ModelOutput + + +class BeamSearchEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + sequences_scores: Optional[torch.FloatTensor] = None + scores: Optional[Tuple[torch.FloatTensor]] = None + beam_indices: Optional[torch.LongTensor] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class BeamSearchDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + sequences_scores: Optional[torch.FloatTensor] = None + scores: Optional[Tuple[torch.FloatTensor]] = None + beam_indices: Optional[torch.LongTensor] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] + + +def _beam_search( + self, + input_ids: torch.LongTensor, + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + 
synced_gpus: bool = False, + **model_kwargs, +) -> Union[BeamSearchOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (`BeamScorer`): + An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + Return: + [`BeamSearchDecoderOnlyOutput`], [`BeamSearchEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`BeamSearchEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... BeamSearchScorer, + ... 
) + >>> import torch + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... num_beams=num_beams, + ... device=model.device, + ... ) + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt bist du?'] + ```""" + # init values + token_latency = (self.config.token_latency if hasattr(self.config, "token_latency") else False) or ( + self.token_latency if hasattr(self, "token_latency") else False + ) + + latency_list = [] + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.generation_config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." 
+ ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + beam_indices = tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + this_peer_finished = False # used by synced_gpus only + decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder + while True: + tic = time.time() + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + if ( + re.search("GPTJ", self.config.architectures[0]) + or re.search("llama", self.config.architectures[0], re.IGNORECASE) + or re.search("gptneox", self.config.architectures[0], re.IGNORECASE) + or re.search("OPT", self.config.architectures[0], re.IGNORECASE) + or re.search("falcon", self.config.architectures[0], re.IGNORECASE) + or re.search("rw", self.config.architectures[0], re.IGNORECASE) + ): + first_token = False + input_bs = input_ids.size()[0] + has_position_id = True + if model_inputs["past_key_values"] is None: + first_token = True + if first_token and hasattr(self, "trace_graph"): + if re.search("GPTJ", self.config.architectures[0]): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.n_layer) + ] + ) + elif re.search("llama", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("gptneox", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("OPT", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + has_position_id = False + elif re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search( + "rw", self.config.architectures[0], re.IGNORECASE + ): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] 
+ ) + has_position_id = False + + if hasattr(self, "trace_graph"): + if first_token: + new_attention_mask = model_inputs["attention_mask"][:batch_size].clone() + new_input_ids = model_inputs["input_ids"][:batch_size].clone() + if has_position_id: + new_position_ids = model_inputs["position_ids"][:batch_size].clone() + for i in range(batch_size): + new_attention_mask[i] = model_inputs["attention_mask"][i * num_beams] + new_input_ids[i] = model_inputs["input_ids"][i * num_beams] + if has_position_id: + new_position_ids[i] = model_inputs["position_ids"][i * num_beams] + model_inputs["attention_mask"] = new_attention_mask + model_inputs["input_ids"] = new_input_ids + if has_position_id: + model_inputs["position_ids"] = new_position_ids + model_inputs.pop("use_cache", None) + model_inputs.pop("token_type_ids", None) + if first_token and hasattr(self, "trace_graph_first"): + outputs = self.trace_graph_first(**model_inputs) + else: + outputs = self.trace_graph(**model_inputs) + + if first_token and len(model_inputs["past_key_values"][1]) == 4: + outputs = list(outputs) + outputs[0] = outputs[0].repeat_interleave(num_beams, dim=0) + outputs = tuple(outputs) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs[0][:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + next_token_scores = nn.functional.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) + next_token_scores_processed = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores_processed) + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores_processed,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
+ n_eos_tokens = len(eos_token_id) if eos_token_id else 0 + next_token_scores, next_tokens = torch.topk( + next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past_key_values"] is not None: + model_kwargs["past_key_values"] = self._temporary_reorder_cache(model_kwargs["past_key_values"], beam_idx) + + if return_dict_in_generate and output_scores: + # pylint: disable=unsubscriptable-object + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) + # increase cur_len + cur_len = cur_len + 1 + if token_latency: + if input_ids.is_xpu: + torch.xpu.synchronize() + latency_list.append(time.time() - tic) + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + + if self.config.is_encoder_decoder: + output_result = BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + beam_indices=sequence_outputs["beam_indices"], + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + beam_indices=sequence_outputs["beam_indices"], + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = sequence_outputs["sequences"] + # result + if token_latency: + return (output_result, latency_list) + else: + return output_result diff --git a/neural_compressor/transformers/generation/greedy_search.py b/neural_compressor/transformers/generation/greedy_search.py new file mode 100644 index 00000000000..f35211005ff --- /dev/null +++ b/neural_compressor/transformers/generation/greedy_search.py @@ -0,0 +1,401 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import time +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria +from transformers.generation.streamers import BaseStreamer +from transformers.utils import ModelOutput + + +class GreedySearchDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class GreedySearchEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] + + +def _greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + streamer: Optional["BaseStreamer"] = None, + **model_kwargs, +) -> Union[GreedySearchOutput, torch.LongTensor]: + r"""Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be + used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. 
+ max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + Return: + [`GreedySearchDecoderOnlyOutput`], [`GreedySearchEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`GreedySearchEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... StoppingCriteriaList, + ... MaxLengthCriteria, + ... ) + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token + >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id + >>> input_prompt = "It might be possible to" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), + ... ] + ... ) + >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) + >>> outputs = model.greedy_search( + ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria + ... 
) + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ["It might be possible to get a better understanding of the nature of the problem, but it's not"] + ``` + """ + token_latency = (self.config.token_latency if hasattr(self.config, "token_latency") else False) or ( + self.token_latency if hasattr(self, "token_latency") else False + ) + + latency_list = [] + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.generation_config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + # keep track of which sequences are already finished + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + this_peer_finished = False # used by synced_gpus only + while True: + tic = time.time() + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + if ( + re.search("GPTJ", self.config.architectures[0]) + or re.search("llama", self.config.architectures[0], re.IGNORECASE) + or re.search("gptneox", self.config.architectures[0], re.IGNORECASE) + or re.search("OPT", self.config.architectures[0], re.IGNORECASE) + or re.search("falcon", self.config.architectures[0], re.IGNORECASE) + or re.search("rw", self.config.architectures[0], re.IGNORECASE) + ): + first_token = False + input_bs = input_ids.size()[0] + if model_inputs["past_key_values"] is None: + first_token = True + if first_token and hasattr(self, "trace_graph"): + if re.search("GPTJ", self.config.architectures[0]): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.n_layer) + ] + ) + elif re.search("llama", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("gptneox", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("OPT", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search( + "rw", self.config.architectures[0], re.IGNORECASE + ): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + if hasattr(self, "trace_graph"): + model_inputs.pop("use_cache", None) + 
model_inputs.pop("token_type_ids", None) + outputs = self.trace_graph(**model_inputs) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs[0][:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_tokens_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) + ) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + # stop when each sentence is finished + if unfinished_sequences.max() == 0: + this_peer_finished = True + # stop if we exceed the maximum length + if token_latency: + if input_ids.is_xpu: + torch.xpu.synchronize() + latency_list.append(time.time() - tic) + if stopping_criteria(input_ids, scores): + this_peer_finished = True + if this_peer_finished and not synced_gpus: + break + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + output_result = GreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = GreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + 
past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = input_ids + + if token_latency: + return (output_result, latency_list) + else: + return output_result diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index e81c3295bfa..877e3be89be 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -351,10 +351,12 @@ def convert_to_quantized_model(model, config, device="cpu"): import intel_extension_for_pytorch assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!" - os.environ["INC_TARGET_DEVICE"] = "cpu" - logger.info( - "Set the environment variable INC_TARGET_DEVICE='cpu' to ensure the quantization process occurs on the CPU." - ) + if "INC_TARGET_DEVICE" not in os.environ: + os.environ["INC_TARGET_DEVICE"] = "cpu" + logger.info( + "Set the environment variable INC_TARGET_DEVICE='cpu'" + " to ensure the quantization process occurs on the CPU." + ) orig_dtype = torch.float32 for param in model.parameters():
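
The final hunk above changes `convert_to_quantized_model` so that `INC_TARGET_DEVICE='cpu'` is set only when the variable is not already present in the environment, meaning an explicit user setting (for example, one targeting an XPU device) is no longer overwritten. Below is a minimal, self-contained sketch of that guard pattern, assuming a standard `logging` setup; the helper name `ensure_cpu_quantization_target` is illustrative and not part of the patch.

```python
import logging
import os

logger = logging.getLogger(__name__)


def ensure_cpu_quantization_target() -> str:
    """Set INC_TARGET_DEVICE to 'cpu' only if the user has not set it already."""
    if "INC_TARGET_DEVICE" not in os.environ:
        # Default to CPU so the quantization process runs on the host,
        # mirroring the guarded behaviour added in quantization/utils.py.
        os.environ["INC_TARGET_DEVICE"] = "cpu"
        logger.info(
            "Set the environment variable INC_TARGET_DEVICE='cpu'"
            " to ensure the quantization process occurs on the CPU."
        )
    else:
        # Respect an explicit user choice such as INC_TARGET_DEVICE=xpu.
        logger.info(
            "INC_TARGET_DEVICE already set to %s; leaving it unchanged.",
            os.environ["INC_TARGET_DEVICE"],
        )
    return os.environ["INC_TARGET_DEVICE"]
```

With this guard in place, exporting `INC_TARGET_DEVICE` before launching quantization keeps the process on the device the user requested instead of silently forcing it back to CPU.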