diff --git a/.azure-pipelines/scripts/ut/run_itrex.sh b/.azure-pipelines/scripts/ut/run_itrex.sh deleted file mode 100644 index 5adaf86579b..00000000000 --- a/.azure-pipelines/scripts/ut/run_itrex.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -set -xe -source /neural-compressor/.azure-pipelines/scripts/change_color.sh -python -c "import neural_compressor as nc;print(nc.version.__version__)" -echo "run itrex ut..." - -# install inc 3x deps -pip install -r /neural-compressor/requirements_pt.txt -export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH - -# prepare itrex -git clone https://github.com/intel/intel-extension-for-transformers.git /intel-extension-for-transformers -cd /intel-extension-for-transformers && git rev-parse --short HEAD -bash /intel-extension-for-transformers/.github/workflows/script/prepare_env.sh -bash /intel-extension-for-transformers/.github/workflows/script/install_binary.sh - -# prepare test env -sed -i '/neural-compressor.git/d' /intel-extension-for-transformers/tests/requirements.txt -pip install -r /intel-extension-for-transformers/tests/requirements.txt -# workaround -pip install onnx==1.16.0 -pip install onnxruntime==1.18.0 -echo "pip list itrex ut deps..." -pip list -LOG_DIR=/neural-compressor/log_dir -mkdir -p ${LOG_DIR} -ut_log_name=${LOG_DIR}/ut_itrex.log - -# run unit test -cd /intel-extension-for-transformers/tests/CI -find . -name "test*.py" | grep -v "test_tf" | sed 's,\.\/,python ,g' | sed 's/$/ --verbose/' > run.sh - -# run UT -$BOLD_YELLOW && echo "cat run.sh..." && $RESET -cat run.sh | tee ${ut_log_name} -$BOLD_YELLOW && echo "------UT start-------" && $RESET -bash -x run.sh 2>&1 | tee -a ${ut_log_name} -$BOLD_YELLOW && echo "------ UT end -------" && $RESET - -if [ $(grep -c "FAILED" ${ut_log_name}) != 0 ] || [ $(grep -c "core dumped" ${ut_log_name}) != 0 ] || [ $(grep -c "ModuleNotFoundError:" ${ut_log_name}) != 0 ] || [ $(grep -c "OK" ${ut_log_name}) == 0 ];then - echo "Find errors in UT test, please check the output..." - exit 1 -fi -echo "UT finished successfully! 
" \ No newline at end of file diff --git a/.azure-pipelines/ut-itrex.yml b/.azure-pipelines/ut-itrex.yml deleted file mode 100644 index 2f038270234..00000000000 --- a/.azure-pipelines/ut-itrex.yml +++ /dev/null @@ -1,35 +0,0 @@ -trigger: none - -pr: - autoCancel: true - drafts: false - branches: - include: - - master - paths: - include: - - neural_compressor - - setup.py - - requirements.txt - - .azure-pipelines/scripts/ut/run_itrex.sh - - .azure-pipelines/ut-itrex.yml - -pool: MODEL_PERF_TEST - -variables: - UPLOAD_PATH: $(Build.SourcesDirectory)/log_dir - -stages: - - stage: - displayName: Unit Test of ITREX - jobs: - - job: - steps: - - template: template/ut-template.yml - parameters: - dockerConfigName: 'commonDockerConfig' - utScriptFileName: 'run_itrex' - uploadPath: $(UPLOAD_PATH) - utArtifact: 'ut_itrex' - utTestMode: "no-coverage" - utContainerName: "utTest-itrex" diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 9f566749da0..33a7a2b06a4 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -78,19 +78,6 @@ subprojects: - "UT-Basic (Unit Test other basic case Test other basic case)" - "UT-Basic (Unit Test other cases baseline Test other cases baseline)" - - id: "Unit Tests ITREX workflow" - paths: - - "neural_compressor/**" - - "setup.py" - - "requirements.txt" - - ".azure-pipelines/scripts/ut/run_itrex.sh" - - ".azure-pipelines/ut-itrex.yml" - - "!neural_compressor/common/**" - - "!neural_compressor/torch/**" - - "!neural_compressor/tensorflow/**" - checks: - - "UT-ITREX" - - id: "Unit Tests 3x-TensorFlow workflow" paths: - "neural_compressor/common/**" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b821d93eb1..2875b945c57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -76,7 +76,7 @@ repos: )$ - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 + rev: 06907d0 hooks: - id: docformatter args: [ diff --git a/README.md b/README.md index e2bef73e2d7..983b80227ed 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testi * Collaborate with cloud marketplaces such as [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [Amazon Web Services](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel), software platforms such as [Alibaba Cloud](https://www.intel.com/content/www/us/en/developer/articles/technical/quantize-ai-by-oneapi-analytics-on-alibaba-cloud.html), [Tencent TACO](https://new.qq.com/rain/a/20221202A00B9S00) and [Microsoft Olive](https://github.com/microsoft/Olive), and open AI ecosystem such as [Hugging Face](https://huggingface.co/blog/intel), [PyTorch](https://pytorch.org/tutorials/recipes/intel_neural_compressor_for_pytorch.html), [ONNX](https://github.com/onnx/models#models), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [Lightning AI](https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/advanced/post_training_quantization.rst) ## What's New +* [2024/10] [Transformers-like API](./docs/source/3x/transformers_like_api.md) for INT4 inference on Intel CPU and GPU. * [2024/07] From 3.0 release, framework extension API is recommended to be used for quantization. 
* [2024/07] Performance optimizations and usability improvements on [client-side](./docs/source/3x/client_quant.md). @@ -164,6 +165,16 @@ Intel Neural Compressor will convert the model format from auto-gptq to hpu form Smooth Quantization + + + Transformers-like APIs + + + + + Overview + + Other Modules diff --git a/docs/source/3x/transformers_like_api.md b/docs/source/3x/transformers_like_api.md index 9aafeed5278..55e8d964072 100644 --- a/docs/source/3x/transformers_like_api.md +++ b/docs/source/3x/transformers_like_api.md @@ -208,6 +208,8 @@ python run_generation_gpu_woq.py --woq --benchmark --model save_dir >Note: > * Saving quantized model should be executed before the optimize_transformers function is called. > * The optimize_transformers function is designed to optimize transformer-based models within frontend Python modules, with a particular focus on Large Language Models (LLMs). It provides optimizations for both model-wise and content-generation-wise. The detail of `optimize_transformers`, please refer to [the link](https://github.com/intel/intel-extension-for-pytorch/blob/xpu-main/docs/tutorials/llm/llm_optimize_transformers.md). +>* The quantization process is performed on the CPU accelerator by default. Users can override this setting by specifying the environment variable `INC_TARGET_DEVICE`. Usage on bash: ```export INC_TARGET_DEVICE=xpu```. +>* For Linux systems, users need to configure the environment variables appropriately to achieve optimal performance. For example, set the OMP_NUM_THREADS explicitly. For processors with hybrid architecture (including both P-cores and E-cores), it is recommended to bind tasks to all P-cores using taskset. ## Examples diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index bb18153f389..a1f33413864 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -1,196 +1,225 @@ { - "pytorch": { - "llava_woq_autoround_int4":{ - "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava", - "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", - "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", - "input_model": "liuhaotian/llava-v1.5-7b", - "main_script": "main.py", - "batch_size": 1 - }, - "qwenvl_woq_autoround_int4":{ - "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL", - "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", - "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", - "input_model": "Qwen/Qwen-VL", - "main_script": "main.py", - "batch_size": 8 - }, - "Phi3Vision_woq_autoround_int4":{ - "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi3-3-vision", - "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", - "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", - "input_model": "microsoft/Phi-3-vision-128k-instruct", - "main_script": "main.py", - "batch_size": 1 - }, - "opt_125m_woq_gptq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_woq_gptq_int4_dq_bnb":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_woq_gptq_int4_dq_ggml":{ - 
"model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "llama2_7b_gptq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "llama2_7b_gptq_int4_dq_bnb":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "llama2_7b_gptq_int4_dq_ggml":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_rtn_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_rtn_int4_dq_bnb":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_rtn_int4_dq_ggml":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_gptq_int4":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_gptq_int4_dq_bnb":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_woq_gptq_int4_dq_ggml":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "gpt_j_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "gpt_j_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "llama2_7b_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "opt_125m_ipex":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "opt_125m_ipex_sq":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", - "dataset_location": "", - "input_model": "", 
- "main_script": "run_clm_no_trainer.py", - "batch_size": 8 - }, - "dlrm_ipex": { - "model_src_dir": "recommendation/dlrm/static_quant/ipex", - "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", - "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", - "main_script": "dlrm_s_pytorch.py", - "batch_size": 16384 - }, - "resnet18_pt2e_static":{ - "model_src_dir": "cv/static_quant", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", - "input_model": "", - "main_script": "main.py", - "batch_size": 1 - }, - "resnet18_fp8_static":{ - "model_src_dir": "cv/fp8_quant", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", - "input_model": "", - "main_script": "main.py", - "batch_size": 1 - }, - "opt_125m_pt2e_static":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", - "dataset_location": "", - "input_model": "", - "main_script": "run_clm_no_trainer.py", - "batch_size": 1 - }, - "sdxl_ipex_sq":{ - "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant", - "dataset_location": "", - "input_model": "", - "main_script": "main.py", - "batch_size": 1 - }, - "resnet18_mixed_precision": { - "model_src_dir": "cv/mixed_precision", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", - "input_model": "resnet18", - "main_script": "main.py", - "batch_size": 20 + "pytorch": { + "llava_woq_autoround_int4":{ + "model_src_dir": "multimodal-modeling/quantization/auto_round/Llava", + "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", + "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", + "input_model": "liuhaotian/llava-v1.5-7b", + "main_script": "main.py", + "batch_size": 1 + }, + "qwenvl_woq_autoround_int4":{ + "model_src_dir": "multimodal-modeling/quantization/auto_round/Qwen-VL", + "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", + "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", + "input_model": "Qwen/Qwen-VL", + "main_script": "main.py", + "batch_size": 8 + }, + "Phi3Vision_woq_autoround_int4":{ + "model_src_dir": "multimodal-modeling/quantization/auto_round/Phi3-3-vision", + "dataset_location": "/tf_dataset2/datasets/coco2017/coco/train2017_full", + "question_file": "/tf_dataset2/datasets/llava/llava_v1_5_mix665k.json", + "input_model": "microsoft/Phi-3-vision-128k-instruct", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_woq_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "llama2_7b_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "llama2_7b_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + 
"main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "llama2_7b_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_rtn_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4_dq_bnb":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_gptq_int4_dq_ggml":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "gpt_j_woq_awq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_awq_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_autoround_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_woq_autotune_int4":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "gpt_j_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "llama2_7b_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "opt_125m_ipex":{ + "model_src_dir": 
"nlp/huggingface_models/language-modeling/quantization/static_quant/ipex", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "opt_125m_ipex_sq":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 8 + }, + "dlrm_ipex": { + "model_src_dir": "recommendation/dlrm/static_quant/ipex", + "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input", + "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt", + "main_script": "dlrm_s_pytorch.py", + "batch_size": 16384 + }, + "resnet18_pt2e_static":{ + "model_src_dir": "cv/static_quant", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "resnet18_fp8_static":{ + "model_src_dir": "cv/fp8_quant", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_pt2e_static":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + }, + "sdxl_ipex_sq":{ + "model_src_dir": "diffusion_model/diffusers/stable_diffusion/smooth_quant", + "dataset_location": "", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "resnet18_mixed_precision": { + "model_src_dir": "cv/mixed_precision", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "input_model": "resnet18", + "main_script": "main.py", + "batch_size": 20 + } } } } diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt index bc70f987095..736d79c4d72 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/requirements.txt @@ -2,6 +2,5 @@ transformers torch sentencepiece neural-compressor -intel-extension-for-transformers >= 1.4.1 lm-eval==0.4.2 peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py index 40bf217c72e..6ad8e495db2 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant/run_clm_no_trainer.py @@ -62,7 +62,7 @@ def get_user_model(): user_model = convert(model=user_model) user_model.eval() -from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser +from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", user_model=user_model, diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt index d4155dfbf75..d9f59d178e7 100644 --- 
a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt @@ -8,7 +8,6 @@ pytest wandb einops neural-compressor -intel-extension-for-transformers -lm_eval==0.4.2 +lm_eval==0.4.3 peft optimum-intel diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py index 694c0505ea4..a082421f15b 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py @@ -185,7 +185,7 @@ def eval_func(model): config = AutoConfig.from_pretrained(args.model) setattr(model, "config", config) - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", user_model=model, @@ -232,7 +232,7 @@ def eval_func(model): if args.accuracy: user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt index f0b56e558d3..5174182f312 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt @@ -8,6 +8,5 @@ pytest wandb einops neural-compressor -intel-extension-for-transformers -lm_eval==0.4.2 +lm_eval==0.4.3 peft diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py index b56c01f20f5..eb97f930d29 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py @@ -212,7 +212,7 @@ def run_fn(model): if args.accuracy: user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", user_model=user_model, @@ -232,7 +232,7 @@ def run_fn(model): if args.performance: user_model.eval() - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser import time samples = args.iters * args.batch_size diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt 
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt index b6d9b6c55de..63959e924cb 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt @@ -2,6 +2,5 @@ transformers torch sentencepiece neural-compressor -intel-extension-for-transformers >= 1.4.1 -lm-eval==0.4.2 +lm-eval==0.4.3 peft \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py index 395bc6f9b57..a2aa6c1302a 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -116,7 +116,7 @@ def get_example_inputs(tokenizer): if args.accuracy: - from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", user_model=user_model, diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md index 1abe2633ea3..f0760cc2fe1 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/README.md @@ -103,6 +103,8 @@ python run_generate_cpu_woq.py \ > 1. default search algorithm is beam search with num_beams = 1. > 2. [ipex.optimize_transformers](https://github.com/intel/intel-extension-for-pytorch/blob/v2.1.10%2Bxpu/docs/tutorials/llm/llm_optimize_transformers.md) Support for the optimized inference of model types "gptj," "mistral," "qwen," and "llama" to achieve high performance and accuracy. Ensure accurate inference for other model types as well. > 3. We provide compression technologies `WeightOnlyQuant` with `Rtn/GPTQ/AutoRound` algorithms and `load_in_4bit` and `load_in_8bit` work on intel GPU device. +> 4. The quantization process runs on the CPU accelerator by default. Users can override this by setting the environment variable `INC_TARGET_DEVICE`, e.g. in bash: `export INC_TARGET_DEVICE=xpu`. +> 5. On Linux, configure the environment variables appropriately to achieve optimal performance, e.g. set `OMP_NUM_THREADS` explicitly. On processors with a hybrid architecture (both P-cores and E-cores), it is recommended to bind the task to all P-cores using `taskset`.
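+> A minimal sketch of such a setup on Linux is shown below; the thread count and core list are illustrative assumptions for a machine whose P-cores are exposed as logical cores 0-7, so adjust them to your hardware, and replace `<your-arguments>` with the usual `run_generate_cpu_woq.py` arguments shown above:
+> ```bash
+> export OMP_NUM_THREADS=8      # one OpenMP thread per P-core (illustrative value)
+> export INC_TARGET_DEVICE=xpu  # optional: override the default CPU target for quantization
+> taskset -c 0-7 python run_generate_cpu_woq.py <your-arguments>   # bind the run to the P-cores
+> ```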
## Prerequisite​ ### Dependencies @@ -111,7 +113,7 @@ Intel-extension-for-pytorch dependencies are in oneapi package, before install i ### Create Environment​ Pytorch and Intel-extension-for-pytorch version for intel GPU > 2.1 are required, python version requests equal or higher than 3.9 due to [text evaluation library](https://github.com/EleutherAI/lm-evaluation-harness/tree/master) limitation, the dependent packages are listed in requirements_GPU.txt, we recommend create environment as the following steps. For Intel-exension-for-pytorch, we should install from source code now, and Intel-extension-for-pytorch will add weight-only quantization in the next version. ->**Note**: please install transformers==4.40.2. +>**Note**: please install transformers==4.38.1. ```bash pip install -r requirements_GPU.txt diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md index 2c3b14459c8..6a5e75b5023 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/llm_quantization_recipes.md @@ -30,12 +30,6 @@ The scripts [run_generation_sq.py](./run_generation_sq.py) and [run_generation_c ```bash # Installation -git clone https://github.com/intel/intel-extension-for-transformers.git - -# install ITREX -cd intel-extension-for-transformers -pip install -r requirements.txt -pip install -v . # install requirements cd examples/huggingface/pytorch/text-generation/quantization diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py index 62ef4ca2f49..8329d74b9a4 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_cpu_woq.py @@ -293,7 +293,6 @@ _commit_hash=args._commit_hash, ) elif args.load_in_4bit or args.load_in_8bit: - # CPU device usage is provided by intel-extension-for-transformers. 
user_model = AutoModelForCausalLM.from_pretrained( args.model, load_in_4bit=args.load_in_4bit, diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py index 9245d53eb50..7b63a015600 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py @@ -6,9 +6,9 @@ from transformers import AutoConfig, AutoTokenizer from transformers.generation import GenerationConfig import intel_extension_for_pytorch as ipex -# from intel_extension_for_transformers.transformers.llm.utils.generation import _beam_search, _greedy_search from neural_compressor.transformers import AutoModelForCausalLM, AutoRoundConfig, RtnConfig, GPTQConfig from neural_compressor.transformers.quantization.utils import convert_dtype_str2torch +from neural_compressor.transformers.generation import _greedy_search, _beam_search from transformers.utils import check_min_version import contextlib @@ -189,7 +189,6 @@ torch_dtype=torch.float16, ) elif args.load_in_4bit or args.load_in_8bit: - # CPU device usage is provided by intel-extension-for-transformers. user_model = AutoModelForCausalLM.from_pretrained(args.model, device_map=args.device, load_in_4bit=args.load_in_4bit, diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md index 889d7b42682..0519b490ff7 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md @@ -35,9 +35,8 @@ python run_clm_no_trainer.py \ --woq_group_size 128 \ --gptq_max_seq_length 2048 \ --gptq_use_max_length \ - --accuracy \ - --tasks "lambada_openai" \ - --double_quant_type "BNB_NF4" + --double_quant_type "BNB_NF4" \ + --output_dir saved_results # "--woq_algo RTN" is used to enable RTN algorithms python run_clm_no_trainer.py \ @@ -48,9 +47,38 @@ python run_clm_no_trainer.py \ --woq_bits 4 \ --woq_scheme asym \ --woq_group_size 128 \ + --double_quant_type "BNB_NF4" + --output_dir saved_results + +# "--woq_algo AWQ" is used to enable AWQ algorithms +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --dataset NeelNanda/pile-10k \ + --quantize \ + --woq_algo AWQ \ + --woq_bits 4 \ + --woq_scheme asym \ + --woq_group_size 128 \ + --calib_iters 128 + +# "--woq_algo AutoRound" is used to enable AutoRound algorithms +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --dataset NeelNanda/pile-10k \ + --quantize \ + --woq_algo AutoRound \ + --woq_bits 4 \ + --woq_scheme asym \ + --woq_group_size 128 + +# "--accuracy" for eval +python run_clm_no_trainer.py \ + --model EleutherAI/gpt-j-6B \ + --dataset NeelNanda/pile-10k \ + --int8 \ --accuracy \ --tasks "lambada_openai" \ - --double_quant_type "BNB_NF4" + --output_dir saved_results ``` **Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN, GPTQ[1], AWQ[2], TEQ algorithms. 
For more details, please refer to [link](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md). Our GPTQ API support various CLMs including GPTJ, OPTs, Blooms, Llamas, Falcons, MPTs, ChatGLMs, etc. Simply replace the "--model" argument with other models to quantize different CLMs with GPTQ. @@ -72,8 +100,6 @@ python run_clm_no_trainer.py \ --woq_group_size 128 \ --gptq_max_seq_length 2048 \ --gptq_use_max_length \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" # "--woq_algo RTN" is used to enable RTN algorithms @@ -85,13 +111,40 @@ python run_clm_no_trainer.py \ --woq_bits 4 \ --woq_scheme asym \ --woq_group_size 128 \ + --double_quant_type "BNB_NF4" + +# "--woq_algo AWQ" is used to enable AWQ algorithms +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --dataset NeelNanda/pile-10k \ + --quantize \ + --woq_algo AWQ \ + --woq_bits 4 \ + --woq_scheme asym \ + --woq_group_size 128 \ + --calib_iters 128 + +# "--woq_algo AutoRound" is used to enable AutoRound algorithms +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --dataset NeelNanda/pile-10k \ + --quantize \ + --woq_algo AutoRound \ + --woq_bits 4 \ + --woq_scheme asym \ + --woq_group_size 128 + +# "--accuracy" for eval +python run_clm_no_trainer.py \ + --model facebook/opt-125m \ + --dataset NeelNanda/pile-10k \ + --int8 \ --accuracy \ --tasks "lambada_openai" \ - --double_quant_type "BNB_NF4" + --output_dir saved_results ``` ### LLAMA2-7b/13b/70b ->Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy. #### Quantization ```bash @@ -107,8 +160,6 @@ python run_clm_no_trainer.py \ --woq_group_size 128 \ --gptq_max_seq_length 2048 \ --gptq_use_max_length \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" # "--woq_algo RTN" is used to enable RTN algorithms @@ -120,8 +171,6 @@ python run_clm_no_trainer.py \ --woq_bits 4 \ --woq_scheme asym \ --woq_group_size 128 \ - --accuracy \ - --tasks "lambada_openai" \ --double_quant_type "BNB_NF4" ``` diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt index 63c4d6e10b1..4745e2dfbd7 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/requirements.txt @@ -8,7 +8,6 @@ pytest wandb einops neural-compressor -intel-extension-for-transformers lm_eval==0.4.3 peft auto_round diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh index 9e1d766128e..6c84e27ce88 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_benchmark.sh @@ -70,58 +70,59 @@ function run_benchmark { fi echo $extra_cmd - if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then + if [ "${topology}" = "opt_125m_woq_gptq_int4" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = 
"opt_125m_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "opt_125m_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="facebook/opt-125m" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length --gptq_percdamp 0.1 --gptq_actorder" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "llama2_7b_gptq_int4" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_bnb" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "llama2_7b_gptq_int4_dq_ggml" ]; then model_name_or_path="meta-llama/Llama-2-7b-hf" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" elif [ "${topology}" = "gpt_j_woq_rtn_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_bnb" ]; then - model_name_or_path="EleutherAI/gpt-j-6b"\ - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" + model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_rtn_int4_dq_ggml" ]; then - model_name_or_path="EleutherAI/gpt-j-6b"\ - extra_cmd=$extra_cmd" --woq_algo RTN --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + model_name_or_path="EleutherAI/gpt-j-6b" elif [ "${topology}" = "gpt_j_woq_gptq_int4" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_bnb" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type BNB_NF4" elif [ "${topology}" = "gpt_j_woq_gptq_int4_dq_ggml" ]; then model_name_or_path="EleutherAI/gpt-j-6b" - extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" - extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then + model_name_or_path="facebook/opt-125m" + elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoRound" + elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then + model_name_or_path="facebook/opt-125m" fi - python -u 
run_clm_no_trainer.py \ - --model ${model_name_or_path} \ - --output_dir ${tuned_checkpoint} \ - --task ${task} \ - --batch_size ${batch_size} \ - ${extra_cmd} ${mode_cmd} + if [[ ${mode} == "accuracy" ]]; then + python -u run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --output_dir ${tuned_checkpoint} \ + --task ${task} \ + --batch_size ${batch_size} \ + ${extra_cmd} ${mode_cmd} + elif [[ ${mode} == "performance" ]]; then + incbench --num_cores_per_instance 4 run_clm_no_trainer.py \ + --model ${model_name_or_path} \ + --batch_size ${batch_size} \ + --output_dir ${tuned_checkpoint} \ + ${extra_cmd} ${mode_cmd} + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + } main "$@" diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py index 02329bd9e15..51be2900ba7 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_clm_no_trainer.py @@ -53,7 +53,7 @@ type=str, help="tasks for accuracy validation") parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model") # ============WeightOnly configs=============== -parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ'], +parser.add_argument("--woq_algo", default="RTN", choices=['RTN', 'AWQ', 'TEQ', 'GPTQ', 'AutoRound', 'AutoTune'], help="Weight-only parameter.") parser.add_argument("--woq_bits", type=int, default=8) parser.add_argument("--woq_dtype", type=str, default="int") @@ -62,6 +62,7 @@ parser.add_argument("--woq_scheme", default="sym") parser.add_argument("--woq_use_mse_search", action="store_true") parser.add_argument("--woq_use_full_range", action="store_true") +parser.add_argument("--quant_lm_head", action="store_true", help="whether to quantize the lm_head layer in transformers") # =============GPTQ configs==================== parser.add_argument("--gptq_actorder", action="store_true", help="Whether to apply the activation order GPTQ heuristic.") @@ -78,6 +79,35 @@ help='Calibration dataset sequence max length, ' 'this should align with your model config, ' 'and your dataset builder args: args.pad_max_length') +# =============AWQ configs==================== +parser.add_argument("--use_auto_scale", action="store_true", + help="Enables best scales search based on activation distribution.") +parser.add_argument("--use_auto_clip", action="store_true", + help="Enables clip range search.") +parser.add_argument("--folding", action="store_true", + help="Allow inserting mul before linear when the scale cannot be absorbed by the last layer for TEQ/AWQ.") +parser.add_argument('--absorb_layer_dict', type=dict, default={}, + help="The layer dict that scale can be absorbed for TEQ/AWQ.") +# ============AUTOROUND configs============== +parser.add_argument( + "--lr", + type=float, + default=None, + help="learning rate, if None, it will be set to 1.0/iters automatically", +) +parser.add_argument( + "--minmax_lr", + type=float, + default=None, + help="minmax learning rate; if None, it will be set to the same value as lr", +) +parser.add_argument("--autoround_iters", default=200, type=int, help="num iters for autoround calibration.") +parser.add_argument("--autoround_nsamples", default=128, type=int, help="num samples for 
autoround calibration.") +parser.add_argument( + "--disable_quanted_input", + action="store_true", + help="whether to use the output of quantized block to tune the next block", +) # =============DoubleQuant configs==================== parser.add_argument("--double_quant_type", @@ -196,6 +226,8 @@ def get_user_model(): ) tokenizer = AutoTokenizer.from_pretrained(args.model) user_model = user_model.float() + if args.woq_algo == 'AutoRound': + user_model.to(torch.float32) # Set model's seq_len when GPTQ calibration is enabled. if args.woq_algo == 'GPTQ': @@ -210,6 +242,31 @@ def get_user_model(): user_model.eval() return user_model, tokenizer +def eval_fn(user_model=None): + user_model.eval() + from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser + import time + + samples = args.iters * args.batch_size + eval_args = LMEvalParser( + model="hf", + user_model=user_model, + tokenizer=tokenizer, + batch_size=args.batch_size, + tasks=args.tasks, + limit=samples, + device="hpu" if is_hpex_available() else "cpu", + ) + start = time.time() + results = evaluate(eval_args) + end = time.time() + for task_name in args.tasks.split(","): + if task_name == "wikitext": + acc = results["results"][task_name]["word_perplexity,none"] + else: + acc = results["results"][task_name]["acc,none"] + print("Accuracy: %.5f" % acc) + return acc if args.quantize: # dataset @@ -224,9 +281,25 @@ def get_user_model(): shuffle=False, collate_fn=calib_evaluator.collate_batch, ) + def calib_func(prepared_model): + for i, calib_input in enumerate(calib_dataloader): + if i > args.calib_iters: + break + prepared_model(calib_input[0]) # 3.x api - from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize + from neural_compressor.torch.quantization import ( + RTNConfig, + GPTQConfig, + AWQConfig, + AutoRoundConfig, + TEQConfig, + TuningConfig, + autotune, + get_woq_tuning_config, + prepare, + convert + ) from neural_compressor.torch.utils import get_double_quant_config_dict weight_sym = True if args.woq_scheme == "sym" else False if args.double_quant_type is not None: @@ -239,6 +312,7 @@ def get_user_model(): # TODO: add group_dim into double quant config? 
"use_full_range": args.woq_use_full_range, "use_mse_search": args.woq_use_mse_search, + "quant_lm_head": args.quant_lm_head, } ) quant_config = RTNConfig.from_dict(double_quant_config_dict) @@ -256,8 +330,8 @@ def get_user_model(): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + quant_lm_head=args.quant_lm_head, ) - quant_config.set_local("lm_head", RTNConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) user_model = convert(model=user_model) elif args.woq_algo == "GPTQ": @@ -288,6 +362,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): "act_order": args.gptq_actorder, "block_size": args.gptq_block_size, "static_groups": args.gptq_static_groups, + "quant_lm_head": args.quant_lm_head, } ) quant_config = GPTQConfig.from_dict(double_quant_config_dict) @@ -307,11 +382,109 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args): double_quant_dtype=args.double_quant_dtype, double_quant_use_sym=args.double_quant_use_sym, double_quant_group_size=args.double_quant_group_size, + quant_lm_head=args.quant_lm_head, ) - quant_config.set_local("lm_head", GPTQConfig(dtype="fp32")) user_model = prepare(model=user_model, quant_config=quant_config) run_fn_for_gptq(user_model, dataloader_for_calibration) user_model = convert(user_model) + elif args.woq_algo == "AWQ": + quant_config = AWQConfig( + dtype=args.woq_dtype, + bits=args.woq_bits, + use_sym=weight_sym, + group_size=args.woq_group_size, + group_dim=args.woq_group_dim, + use_auto_scale=args.use_auto_scale, + use_auto_clip=args.use_auto_clip, + folding=args.folding, + absorb_layer_dict=args.absorb_layer_dict, + quant_lm_head=args.quant_lm_head, + ) + example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) + run_fn = calib_func + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + elif args.woq_algo == "TEQ": + quant_config = TEQConfig( + dtype=args.woq_dtype, + bits=args.woq_bits, + use_sym=weight_sym, + group_size=args.woq_group_size, + group_dim=args.woq_group_dim, + folding=args.folding, + quant_lm_head=args.quant_lm_head, + ) + example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) + run_fn = calib_func + user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs) + run_fn(user_model) + user_model = convert(user_model) + elif args.woq_algo == "AutoRound": + quant_config = AutoRoundConfig( + dtype=args.woq_dtype, + bits=args.woq_bits, + use_sym=weight_sym, + group_size=args.woq_group_size, + enable_quanted_input=not args.disable_quanted_input, + lr=args.lr, + minmax_lr=args.minmax_lr, + seqlen=args.pad_max_length, + nsamples=args.autoround_nsamples, + iters=args.autoround_iters, + ) + quant_config.set_local("lm_head", AutoRoundConfig(dtype="fp32")) + from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader + dataloader = get_dataloader(tokenizer=tokenizer, + seqlen=args.pad_max_length, + dataset_name=datasets, + seed=args.seed, + bs=args.batch_size, + nsamples=args.autoround_nsamples) + @torch.no_grad() + def run_fn_for_autoround(model, dataloader): + for data in dataloader: + if isinstance(data, tuple) or isinstance(data, list): + model(*data) + elif isinstance(data, dict): + model(**data) + else: + model(data) + run_fn = run_fn_for_autoround + run_args = (dataloader,) + user_model = 
prepare(model=user_model, quant_config=quant_config) + run_fn(user_model, *run_args) + user_model = convert(user_model) + elif args.woq_algo == "AutoTune": + from utils import DataloaderPreprocessor + dataloaderPreprocessor = DataloaderPreprocessor( + dataloader_original=calib_dataloader, + use_max_length=args.gptq_use_max_length, + max_seq_length=args.gptq_max_seq_length, + ) + dataloader = dataloaderPreprocessor.get_prepared_dataloader() + custom_tune_config = TuningConfig(config_set=get_woq_tuning_config()) + from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device + from tqdm import tqdm + def run_fn_for_gptq(model, dataloader_for_calibration, *args): + for batch in tqdm(dataloader_for_calibration): + batch = move_input_to_device(batch, device=None) + if isinstance(batch, tuple) or isinstance(batch, list): + model(batch[0]) + elif isinstance(batch, dict): + model(**batch) + else: + model(batch) + return + example_inputs = torch.ones([1, args.pad_max_length], dtype=torch.long) + user_model = autotune( + model=user_model, + tune_config=custom_tune_config, + eval_fn=eval_fn, + run_fn=run_fn_for_gptq, + run_args=(dataloader, True), # run_args should be a tuple, + example_inputs=example_inputs, + ) user_model.save(args.output_dir) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh index a860712b697..ed4ee705726 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/run_quant.sh @@ -85,6 +85,19 @@ function run_tuning { model_name_or_path="EleutherAI/gpt-j-6b" extra_cmd=$extra_cmd" --woq_algo GPTQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --woq_use_mse_search --gptq_use_max_length" extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "gpt_j_woq_awq_int4" ]; then + model_name_or_path="EleutherAI/gpt-j-6b" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" + extra_cmd=$extra_cmd" --double_quant_type GGML_TYPE_Q4_K" + elif [ "${topology}" = "opt_125m_woq_awq_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AWQ --woq_bits 4 --woq_group_size 128 --woq_scheme asym --calib_iters 128" + elif [ "${topology}" = "opt_125m_woq_autoround_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoRound --woq_bits 4 --woq_group_size 128 --woq_scheme asym --autoround_iters 200 --autoround_nsamples 500" + elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then + model_name_or_path="facebook/opt-125m" + extra_cmd=$extra_cmd" --woq_algo AutoTune --woq_bits 4" fi python -u run_clm_no_trainer.py \ diff --git a/neural_compressor/transformers/generation/__init__.py b/neural_compressor/transformers/generation/__init__.py new file mode 100644 index 00000000000..4030000c22c --- /dev/null +++ b/neural_compressor/transformers/generation/__init__.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .beam_search import _beam_search +from .greedy_search import _greedy_search diff --git a/neural_compressor/transformers/generation/beam_search.py b/neural_compressor/transformers/generation/beam_search.py new file mode 100644 index 00000000000..d4372810078 --- /dev/null +++ b/neural_compressor/transformers/generation/beam_search.py @@ -0,0 +1,490 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import time +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +from torch import nn +from transformers.generation.beam_search import BeamScorer +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria +from transformers.utils import ModelOutput + + +class BeamSearchEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + sequences_scores: Optional[torch.FloatTensor] = None + scores: Optional[Tuple[torch.FloatTensor]] = None + beam_indices: Optional[torch.LongTensor] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class BeamSearchDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + sequences_scores: Optional[torch.FloatTensor] = None + scores: Optional[Tuple[torch.FloatTensor]] = None + beam_indices: Optional[torch.LongTensor] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] + + +def _beam_search( + self, + input_ids: torch.LongTensor, + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + 
synced_gpus: bool = False, + **model_kwargs, +) -> Union[BeamSearchOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (`BeamScorer`): + An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + Return: + [`BeamSearchDecoderOnlyOutput`], [`BeamSearchEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`BeamSearchEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... BeamSearchScorer, + ... 
) + >>> import torch + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... num_beams=num_beams, + ... device=model.device, + ... ) + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt bist du?'] + ```""" + # init values + token_latency = (self.config.token_latency if hasattr(self.config, "token_latency") else False) or ( + self.token_latency if hasattr(self, "token_latency") else False + ) + + latency_list = [] + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.generation_config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." 
+ ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + beam_indices = tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + this_peer_finished = False # used by synced_gpus only + decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder + while True: + tic = time.time() + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + if ( + re.search("GPTJ", self.config.architectures[0]) + or re.search("llama", self.config.architectures[0], re.IGNORECASE) + or re.search("gptneox", self.config.architectures[0], re.IGNORECASE) + or re.search("OPT", self.config.architectures[0], re.IGNORECASE) + or re.search("falcon", self.config.architectures[0], re.IGNORECASE) + or re.search("rw", self.config.architectures[0], re.IGNORECASE) + ): + first_token = False + input_bs = input_ids.size()[0] + has_position_id = True + if model_inputs["past_key_values"] is None: + first_token = True + if first_token and hasattr(self, "trace_graph"): + if re.search("GPTJ", self.config.architectures[0]): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.n_layer) + ] + ) + elif re.search("llama", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("gptneox", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("OPT", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + has_position_id = False + elif re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search( + "rw", self.config.architectures[0], re.IGNORECASE + ): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] 
+ ) + has_position_id = False + + if hasattr(self, "trace_graph"): + if first_token: + new_attention_mask = model_inputs["attention_mask"][:batch_size].clone() + new_input_ids = model_inputs["input_ids"][:batch_size].clone() + if has_position_id: + new_position_ids = model_inputs["position_ids"][:batch_size].clone() + for i in range(batch_size): + new_attention_mask[i] = model_inputs["attention_mask"][i * num_beams] + new_input_ids[i] = model_inputs["input_ids"][i * num_beams] + if has_position_id: + new_position_ids[i] = model_inputs["position_ids"][i * num_beams] + model_inputs["attention_mask"] = new_attention_mask + model_inputs["input_ids"] = new_input_ids + if has_position_id: + model_inputs["position_ids"] = new_position_ids + model_inputs.pop("use_cache", None) + model_inputs.pop("token_type_ids", None) + if first_token and hasattr(self, "trace_graph_first"): + outputs = self.trace_graph_first(**model_inputs) + else: + outputs = self.trace_graph(**model_inputs) + + if first_token and len(model_inputs["past_key_values"][1]) == 4: + outputs = list(outputs) + outputs[0] = outputs[0].repeat_interleave(num_beams, dim=0) + outputs = tuple(outputs) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs[0][:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + next_token_scores = nn.functional.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) + next_token_scores_processed = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores_processed) + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores_processed,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
+ n_eos_tokens = len(eos_token_id) if eos_token_id else 0 + next_token_scores, next_tokens = torch.topk( + next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past_key_values"] is not None: + model_kwargs["past_key_values"] = self._temporary_reorder_cache(model_kwargs["past_key_values"], beam_idx) + + if return_dict_in_generate and output_scores: + # pylint: disable=unsubscriptable-object + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) + # increase cur_len + cur_len = cur_len + 1 + if token_latency: + if input_ids.is_xpu: + torch.xpu.synchronize() + latency_list.append(time.time() - tic) + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + + if self.config.is_encoder_decoder: + output_result = BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + beam_indices=sequence_outputs["beam_indices"], + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + beam_indices=sequence_outputs["beam_indices"], + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = sequence_outputs["sequences"] + # result + if token_latency: + return (output_result, latency_list) + else: + return output_result diff --git a/neural_compressor/transformers/generation/greedy_search.py b/neural_compressor/transformers/generation/greedy_search.py new file mode 100644 index 00000000000..f35211005ff --- /dev/null +++ b/neural_compressor/transformers/generation/greedy_search.py @@ -0,0 +1,401 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import time +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria +from transformers.generation.streamers import BaseStreamer +from transformers.utils import ModelOutput + + +class GreedySearchDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class GreedySearchEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] + + +def _greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + streamer: Optional["BaseStreamer"] = None, + **model_kwargs, +) -> Union[GreedySearchOutput, torch.LongTensor]: + r"""Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be + used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. 
+ max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + Return: + [`GreedySearchDecoderOnlyOutput`], [`GreedySearchEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`GreedySearchEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... StoppingCriteriaList, + ... MaxLengthCriteria, + ... ) + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token + >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id + >>> input_prompt = "It might be possible to" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), + ... ] + ... ) + >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) + >>> outputs = model.greedy_search( + ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria + ... 
) + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ["It might be possible to get a better understanding of the nature of the problem, but it's not"] + ``` + """ + token_latency = (self.config.token_latency if hasattr(self.config, "token_latency") else False) or ( + self.token_latency if hasattr(self, "token_latency") else False + ) + + latency_list = [] + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.generation_config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + # keep track of which sequences are already finished + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + this_peer_finished = False # used by synced_gpus only + while True: + tic = time.time() + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + if ( + re.search("GPTJ", self.config.architectures[0]) + or re.search("llama", self.config.architectures[0], re.IGNORECASE) + or re.search("gptneox", self.config.architectures[0], re.IGNORECASE) + or re.search("OPT", self.config.architectures[0], re.IGNORECASE) + or re.search("falcon", self.config.architectures[0], re.IGNORECASE) + or re.search("rw", self.config.architectures[0], re.IGNORECASE) + ): + first_token = False + input_bs = input_ids.size()[0] + if model_inputs["past_key_values"] is None: + first_token = True + if first_token and hasattr(self, "trace_graph"): + if re.search("GPTJ", self.config.architectures[0]): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.n_layer) + ] + ) + elif re.search("llama", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("gptneox", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("OPT", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search( + "rw", self.config.architectures[0], re.IGNORECASE + ): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + if hasattr(self, "trace_graph"): + model_inputs.pop("use_cache", None) + 
model_inputs.pop("token_type_ids", None) + outputs = self.trace_graph(**model_inputs) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs[0][:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_tokens_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) + ) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + # stop when each sentence is finished + if unfinished_sequences.max() == 0: + this_peer_finished = True + # stop if we exceed the maximum length + if token_latency: + if input_ids.is_xpu: + torch.xpu.synchronize() + latency_list.append(time.time() - tic) + if stopping_criteria(input_ids, scores): + this_peer_finished = True + if this_peer_finished and not synced_gpus: + break + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + output_result = GreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = GreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + 
past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = input_ids + + if token_latency: + return (output_result, latency_list) + else: + return output_result diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py index e81c3295bfa..877e3be89be 100644 --- a/neural_compressor/transformers/quantization/utils.py +++ b/neural_compressor/transformers/quantization/utils.py @@ -351,10 +351,12 @@ def convert_to_quantized_model(model, config, device="cpu"): import intel_extension_for_pytorch assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!" - os.environ["INC_TARGET_DEVICE"] = "cpu" - logger.info( - "Set the environment variable INC_TARGET_DEVICE='cpu' to ensure the quantization process occurs on the CPU." - ) + if "INC_TARGET_DEVICE" not in os.environ: + os.environ["INC_TARGET_DEVICE"] = "cpu" + logger.info( + "Set the environment variable INC_TARGET_DEVICE='cpu'" + " to ensure the quantization process occurs on the CPU." + ) orig_dtype = torch.float32 for param in model.parameters():
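
The final hunk above changes `convert_to_quantized_model` so that `INC_TARGET_DEVICE='cpu'` is set only when the variable is not already present in the environment, meaning an explicit user setting (for example, one targeting an XPU device) is no longer overwritten. Below is a minimal, self-contained sketch of that guard pattern, assuming a standard `logging` setup; the helper name `ensure_cpu_quantization_target` is illustrative and not part of the patch.

```python
import logging
import os

logger = logging.getLogger(__name__)


def ensure_cpu_quantization_target() -> str:
    """Set INC_TARGET_DEVICE to 'cpu' only if the user has not set it already."""
    if "INC_TARGET_DEVICE" not in os.environ:
        # Default to CPU so the quantization process runs on the host,
        # mirroring the guarded behaviour added in quantization/utils.py.
        os.environ["INC_TARGET_DEVICE"] = "cpu"
        logger.info(
            "Set the environment variable INC_TARGET_DEVICE='cpu'"
            " to ensure the quantization process occurs on the CPU."
        )
    else:
        # Respect an explicit user choice such as INC_TARGET_DEVICE=xpu.
        logger.info(
            "INC_TARGET_DEVICE already set to %s; leaving it unchanged.",
            os.environ["INC_TARGET_DEVICE"],
        )
    return os.environ["INC_TARGET_DEVICE"]
```

With this guard in place, exporting `INC_TARGET_DEVICE` before launching quantization keeps the process on the device the user requested instead of silently forcing it back to CPU.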