diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 1ede5e193..81d102bc0 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] pip install intel-extension-for-transformers pip install peft @@ -43,7 +43,6 @@ jobs: - name: Test IPEX run: | pip uninstall -y intel-extension-for-transformers - pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu - pip install intel-extension-for-pytorch==2.1.100 + pip install intel-extension-for-pytorch==2.3.0 pytest tests/neural_compressor/test_ipex.py diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 42f884b72..8e02bd551 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] - name: Test with Pytest run: | diff --git a/README.md b/README.md index 49f0d7976..0226b5d47 100644 --- a/README.md +++ b/README.md @@ -239,3 +239,8 @@ Do not forget to install requirements for every example: cd pip install -r requirements.txt ``` + + +## Gaudi + +To train your model on [Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html), check out [Optimum Habana](https://github.com/huggingface/optimum-habana) which provides a set of tools enabling easy model loading, training and inference on single- and multi-HPU settings for different downstream tasks. After training your model, feel free to submit it to the Intel [leaderboard](https://huggingface.co/spaces/Intel/powered_by_intel_llm_leaderboard) which is designed to evaluate, score, and rank open-source LLMs that have been pre-trained or fine-tuned on Intel Hardwares. Models submitted to the leaderboard will be evaluated on the Intel Developer Cloud. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from the Eleuther AI Language Model Evaluation Harness. 
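Not part of the patch: a minimal Gaudi fine-tuning sketch to accompany the new README section above, assuming the standard Optimum Habana `GaudiTrainer`/`GaudiTrainingArguments` API. The model id, Gaudi config name, and toy dataset are illustrative placeholders, and running it requires an HPU machine with `optimum-habana` and the SynapseAI stack installed.

```python
# Minimal sketch of fine-tuning on Intel Gaudi with Optimum Habana (placeholders throughout).
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.habana import GaudiTrainer, GaudiTrainingArguments

model_id = "bert-base-uncased"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

# Tiny in-memory dataset, purely illustrative.
raw = Dataset.from_dict({"text": ["great product", "terrible service"], "label": [1, 0]})
train_dataset = raw.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=64))

training_args = GaudiTrainingArguments(
    output_dir="./gaudi-out",
    use_habana=True,                               # run on HPU
    use_lazy_mode=True,                            # lazy-mode graph execution
    gaudi_config_name="Habana/bert-base-uncased",  # Gaudi config hosted on the Hub (placeholder)
    num_train_epochs=1,
    per_device_train_batch_size=2,
)

trainer = GaudiTrainer(model=model, args=training_args, train_dataset=train_dataset, tokenizer=tokenizer)
trainer.train()
```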
diff --git a/examples/openvino/image-classification/configs/swin-base-jpqd.json b/examples/openvino/image-classification/configs/swin-base-jpqd.json index 3f03c276a..23b2fd3d8 100644 --- a/examples/openvino/image-classification/configs/swin-base-jpqd.json +++ b/examples/openvino/image-classification/configs/swin-base-jpqd.json @@ -36,8 +36,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/examples/openvino/question-answering/configs/bert-base-jpqd.json b/examples/openvino/question-answering/configs/bert-base-jpqd.json index 425bd9f31..342d327a3 100644 --- a/examples/openvino/question-answering/configs/bert-base-jpqd.json +++ b/examples/openvino/question-answering/configs/bert-base-jpqd.json @@ -36,8 +36,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/examples/openvino/text-classification/configs/bert-base-jpqd.json b/examples/openvino/text-classification/configs/bert-base-jpqd.json index 25c8f2886..d177e4efd 100644 --- a/examples/openvino/text-classification/configs/bert-base-jpqd.json +++ b/examples/openvino/text-classification/configs/bert-base-jpqd.json @@ -40,8 +40,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index 767106408..5673243cb 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ b/notebooks/openvino/quantized_generation_demo.ipynb @@ -32,7 +32,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install optimum[openvino,nncf] torch" + "# ! pip install optimum[openvino,nncf] torch==2.2.2" ] }, { diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index 41969b162..142cde492 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -52,7 +52,8 @@ "import transformers\n", "from pathlib import Path\n", "from openvino.runtime import Core\n", - "from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", + "from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", + "from optimum.intel.openvino.configuration import OVQuantizationMethod\n", "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" @@ -198,9 +199,14 @@ }, "outputs": [], "source": [ - "quantization_config = OVWeightQuantizationConfig(bits=8, dataset=calibration_dataset, num_samples=NUM_SAMPLES)\n", - "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True, quantization_config=quantization_config)\n", - "int8_pipe.save_pretrained(int8_model_path)" + "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n", + "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID)\n", + "quantizer = OVQuantizer(int8_pipe)\n", + "quantizer.quantize(\n", + " ov_config=OVConfig(quantization_config=quantization_config),\n", + " calibration_dataset=calibration_dataset,\n", + " save_directory=int8_model_path\n", + ")" ] }, { diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 4fed3f6f8..ffd084d4e 100644 --- 
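Not part of the patch: a self-contained sketch of the new hybrid-quantization flow introduced in the notebook hunk above, i.e. quantizing through `OVQuantizer` with `OVQuantizationMethod.HYBRID` instead of passing the calibration dataset to `OVWeightQuantizationConfig`. The model id and the list of calibration prompts are placeholders; the notebook builds its calibration set from a captions dataset.

```python
# Hybrid weight/activation quantization of a Stable Diffusion pipeline, mirroring the notebook change.
from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig
from optimum.intel.openvino.configuration import OVQuantizationMethod

MODEL_ID = "runwayml/stable-diffusion-v1-5"  # placeholder
NUM_SAMPLES = 200                            # placeholder

# Export the pipeline to OpenVINO first, then quantize it explicitly.
pipe = OVStableDiffusionPipeline.from_pretrained(MODEL_ID, export=True)

# Calibration prompts; assumed stand-in for the dataset prepared earlier in the notebook.
calibration_dataset = ["a photo of an astronaut riding a horse on mars"] * NUM_SAMPLES

quantization_config = OVWeightQuantizationConfig(
    bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID
)
quantizer = OVQuantizer(pipe)
quantizer.quantize(
    ov_config=OVConfig(quantization_config=quantization_config),
    calibration_dataset=calibration_dataset,
    save_directory="sd-int8-hybrid",
)
```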
a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -119,6 +119,15 @@ def parse_args_openvino(parser: "ArgumentParser"): "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models." ), ) + optional_group.add_argument( + "--all-layers", + action="store_true", + default=None, + help=( + "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight " + "compression is applied, they are compressed to INT8." + ), + ) optional_group.add_argument( "--disable-stateful", action="store_true", @@ -198,6 +207,7 @@ def run(self): and self.args.ratio is None and self.args.group_size is None and self.args.sym is None + and self.args.all_layers is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model] @@ -207,6 +217,7 @@ def run(self): "ratio": 1 if is_int8 else (self.args.ratio or 0.8), "sym": self.args.sym or False, "group_size": -1 if is_int8 else self.args.group_size, + "all_layers": None if is_int8 else self.args.all_layers, } if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: @@ -226,6 +237,9 @@ def run(self): ) library_name = "transformers" + if self.args.convert_tokenizer: + logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") + if ( library_name == "diffusers" and ov_config @@ -261,10 +275,21 @@ def run(self): ) model.save_pretrained(self.args.output) - else: - if self.args.convert_tokenizer: - logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") + if self.args.disable_convert_tokenizer: + return + + # avoid import when using other exporters (IPEX, INC) + from ...exporters.openvino.convert import export_tokenizer + output = Path(self.args.output) + tokenizer = getattr(model, "tokenizer", None) + if tokenizer is not None: + export_tokenizer(tokenizer, output / "tokenizer") + + tokenizer_2 = getattr(model, "tokenizer_2", None) + if tokenizer_2 is not None: + export_tokenizer(tokenizer_2, output / "tokenizer_2") + else: # TODO : add input shapes main_export( model_name_or_path=self.args.model, diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 8908c430b..9db671906 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -24,7 +24,7 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED -from optimum.exporters.openvino.convert import export_from_model, export_tokenizer +from optimum.exporters.openvino.convert import export_from_model from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -219,6 +219,10 @@ def main_export( model_type = config.model_type.replace("_", "-") if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True + if custom_export_configs is None: + raise ValueError( + f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. 
Please open an issue at https://github.com/huggingface/optimum-intel/issues if you would like the model type {model_type} to be supported natively in the OpenVINO export." + ) elif task not in TasksManager.get_supported_tasks_for_model_type( model_type, exporter="openvino", library_name=library_name ): @@ -232,6 +236,7 @@ def main_export( raise ValueError( f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}." ) + if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED: loading_kwargs["attn_implementation"] = "eager" # there are some difference between remote and in library representation of past key values for some models, @@ -355,6 +360,9 @@ class StoreAttr(object): **kwargs_shapes, ) + # hide openvino import when using other exporters + from optimum.exporters.openvino.convert import export_tokenizer + if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": tokenizer = next( @@ -373,11 +381,11 @@ class StoreAttr(object): else: tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: - export_tokenizer(tokenizer, output) + export_tokenizer(tokenizer, output / "tokenizer") tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output, suffix="_2") + export_tokenizer(tokenizer_2, output / "tokenizer_2") elif convert_tokenizer and not is_openvino_tokenizers_available(): logger.warning("Tokenizer won't be converted.") diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 3022346af..3b214f77e 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -547,7 +547,7 @@ def export_from_model( # TODO: support onnx_config.py in the model repo if custom_architecture and custom_export_configs is None: raise ValueError( - f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the model type {model_type} to be supported natively in the ONNX export." + f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum-intel/issues if you would like the model type {model_type} to be supported natively in the OpenVINO export." 
) if task.startswith("text-generation") and model.config.is_encoder_decoder: @@ -614,7 +614,12 @@ def export_from_model( model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) if generation_config is not None: - generation_config.save_pretrained(output) + try: + generation_config.save_pretrained(output) + except Exception as exception: + logger.warning( + f"The generation config will not be saved, saving failed with following error:\n{exception}" + ) model_name_or_path = model.config._name_or_path maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code) @@ -667,20 +672,21 @@ def export_tokenizer( output: Union[str, Path], suffix: Optional[str] = "", ): - from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME # avoid circular imports + # avoid circular imports + from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME + from optimum.intel.openvino.utils import maybe_convert_tokenizer_to_fast try: from openvino_tokenizers import convert_tokenizer except ModuleNotFoundError: - # avoid this message before tokenizers are part of the openvino dependencies - # logger.info( - # "Run `pip install openvino-tokenizers[transformers]` to get OpenVINO tokenizer/detokenizer models." - # ) return if not isinstance(output, Path): output = Path(output) + if output.exists(): + tokenizer = maybe_convert_tokenizer_to_fast(tokenizer, output) + try: converted = convert_tokenizer(tokenizer, with_detokenizer=True) except NotImplementedError: diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 575f1cc4d..d69adc9da 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -20,6 +20,7 @@ from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig from optimum.exporters.onnx.model_configs import ( + CodeGenOnnxConfig, FalconOnnxConfig, GemmaOnnxConfig, LlamaOnnxConfig, @@ -41,15 +42,20 @@ from optimum.utils.normalized_config import NormalizedTextConfig from .model_patcher import ( + AquilaModelPatcher, BaichuanModelPatcher, ChatGLMModelPatcher, + CodeGenModelPatcher, + DBRXModelPatcher, GemmaModelPatcher, - InternLMPatcher, + InternLM2Patcher, + InternLMModelPatcher, LlamaModelPatcher, MixtralModelPatcher, MPTModelPatcher, Phi3ModelPatcher, QwenModelPatcher, + XverseModelPatcher, ) @@ -109,6 +115,15 @@ class Qwen2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +@register_in_tasks_manager("qwen2-moe", *["text-generation", "text-generation-with-past"], library_name="transformers") +class Qwen2MoEOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + @register_in_tasks_manager("minicpm", *["text-generation", "text-generation-with-past"], library_name="transformers") class MiniCPMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 @@ -445,7 +460,7 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": - return InternLMPatcher(self, model, model_kwargs=model_kwargs) + return 
InternLM2Patcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") @@ -485,6 +500,12 @@ def patch_model_for_export( library_name="transformers", ) class Phi3OpenVINOConfig(PhiOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + MistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": @@ -592,3 +613,175 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return { "sample": {0: "batch_size", 2: "height", 3: "width"}, } + + +@register_in_tasks_manager( + "persimmon", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class PersimmonOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("biogpt", *["text-generation", "text-generation-with-past"], library_name="transformers") +class BioGPTOpenVINOConfig(TextDecoderOnnxConfig): + # BioGPT does not require position_ids input. + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager( + "gpt-neox-japanese", *["text-generation", "text-generation-with-past"], library_name="transformers" +) +class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig): + # GPTNeoxJapanese does not require position_ids input. 
+ DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager( + "cohere", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class CohereOpenVINOConfig(LlamaOpenVINOConfig): + pass + + +@register_in_tasks_manager("xglm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class XGLMConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_attention_heads="attention_heads", hidden_size="d_model" + ) + + +class AquilaDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task, + normalized_config, + batch_size, + sequence_length, + random_batch_size_range, + random_sequence_length_range, + **kwargs, + ) + self.num_key_value_heads = getattr( + normalized_config, "num_key_value_heads", normalized_config.num_attention_heads + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sequence_length, + self.hidden_size // self.num_attention_heads, + ) + return [ + ( + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + +@register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers") +class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return AquilaModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers") +class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return XverseModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) + 
DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return InternLMModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "codegen", + *["feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past"], + library_name="transformers", +) +class CodeGenOpenVINOConfig(CodeGenOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return CodeGenModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "dbrx", + *["text-generation", "text-generation-with-past"], + library_name="transformers", +) +class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_attention_heads="n_heads", + hidden_size="d_model", + num_layers="n_layers", + num_key_value_heads="attn_config.kv_n_heads", + allow_new=True, + ) + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return DBRXModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f68e873d4..0265b3a5f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -20,6 +20,7 @@ import torch import torch.nn.functional as F +from transformers.cache_utils import Cache, StaticCache from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.utils import is_tf_available @@ -43,6 +44,9 @@ from transformers.modeling_tf_utils import TFPreTrainedModel +BETTERTRANSFORMER_IGNORE = ("codegen",) + + def patch_model_with_bettertransformer(model): COLOR_RED = "\033[1;31m" COLOR_RESET = "\033[0m" @@ -81,6 +85,10 @@ def patch_model_with_bettertransformer(model): # model already has required SDPA implementation if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa": return model + + if model.config.model_type in BETTERTRANSFORMER_IGNORE: + return model + try: model = model.to_bettertransformer() except Exception as e: @@ -293,7 +301,7 @@ def __exit__(self, exc_type, exc_value, traceback): # adopted from # https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/gemma/modeling_gemma.py#L965 # https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/llama/modeling_llama.py#L1058 -def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_position, past_seen_tokens=None): +def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, cache_position, past_seen_tokens=None): from transformers.modeling_attn_mask_utils import AttentionMaskConverter if self.config._attn_implementation == "sdpa" and past_seen_tokens is not None: @@ -306,10 +314,12 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po dtype, device = input_tensor.dtype, 
input_tensor.device + # difference with original modeling # using minimum from dtype with larger bandwith (floa32) may lead to overflow # during execution on platforms with default lower precision (bfloat16, float16) min_dtype = torch.finfo(torch.float16).min sequence_length = input_tensor.shape[1] + # difference with original modeling if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"): # static cache target_length = self.config.max_position_embeddings else: # dynamic cache @@ -321,7 +331,9 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + # difference with original modeling causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) @@ -358,6 +370,104 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po return causal_mask +# adopted from https://github.com/huggingface/transformers/blob/f4014e75db0190792b3feeccfc5dc5b5f9f0ce7b/src/transformers/models/llama/modeling_llama.py#L1036 +def _llama_gemma_update_causal_mask_latest( + self, + attention_mask, + input_tensor, + cache_position, + past_key_values, + output_attentions, +): + from transformers.cache_utils import StaticCache + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + # difference with original modeling + causal_mask = ( + torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + ) + + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# TODO : deprecate _llama_gemma_update_causal_mask_legacy when transformers>=4.41.0 +if is_transformers_version(">", "4.40.2"): + _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_latest +else: + _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_legacy + + class GemmaModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() @@ -665,6 +775,72 @@ def _baichuan13b_atten_forward( return attn_output, attn_weights, past_key_value +# Adapted from https://huggingface.co/baichuan-inc/Baichuan-7B/blob/262c8cb58b6d3615c208d9230baa869fddee2adb/modeling_baichuan.py#L181 +def _baichuan7b_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + bsz, q_len, _ = hidden_states.size() + + proj = self.W_pack(hidden_states) + proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) + query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + if not output_attentions: + attn_weights = None + attn_output = F.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=attention_mask, scale=1 / math.sqrt(self.head_dim) + ) + else: + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + 
return attn_output, attn_weights, past_key_value + + class BaichuanModelPatcher(DecoderModelPatcher): def __init__( self, @@ -712,13 +888,18 @@ def forward( for layer in self._model.model.layers: layer.self_attn._orig_forward = layer.self_attn.forward layer.self_attn.forward = types.MethodType(_baichuan13b_atten_forward, layer.self_attn) + else: + for layer in self._model.model.layers: + layer.self_attn._orig_forward = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_baichuan7b_attn_forward, layer.self_attn) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) if hasattr(self._model, "_orig_forward"): self._model.forward = self._model._orig_forward - for layer in self._model.model.layers: + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): layer.self_attn.forward = layer.self_attn._orig_forward @@ -844,7 +1025,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -def _internlm_attention_forward( +def _internlm2_attention_forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, @@ -935,14 +1116,14 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return attn_output, attn_weights, past_key_value -class InternLMPatcher(DecoderModelPatcher): +class InternLM2Patcher(DecoderModelPatcher): def __enter__(self): super().__enter__() if is_torch_version(">=", "2.1.0"): for block in self._model.model.layers: block.attention._orig_forward = block.attention.forward - block.attention.forward = types.MethodType(_internlm_attention_forward, block.attention) + block.attention.forward = types.MethodType(_internlm2_attention_forward, block.attention) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -951,15 +1132,642 @@ def __exit__(self, exc_type, exc_value, traceback): block.attention.forward = block.attention._orig_forward +# Adapted from https://github.com/huggingface/transformers/blob/ccdabc5642bf84849af93f591e207dc625c8e1e1/src/transformers/models/phi3/modeling_phi3.py#L426 +def _phi3_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + return self._orig_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + # TO DO: remove llama imports when transformers with phi3 support will be released + try: + from transformers.models.phi3.modelling_phi3 import apply_rotary_pos_emb, repeat_kv + except ImportError: + from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, 
self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + class Phi3ModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() - # https://github.com/huggingface/transformers/blob/30ee508c6c92a1c0aa0281d193c7c0fb815b8d2f/src/transformers/models/phi3/modeling_phi3.py#L113 # init inv_freq for torchscript tracing for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_phi3_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + if layer.self_attn.rotary_emb.inv_freq is None: rotary_emb = layer.self_attn.rotary_emb layer.self_attn.rotary_emb.inv_freq = 1.0 / ( rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward + + +def _aquila_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ 
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + bsz, q_len, _ = hidden_states.size() + + if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, getattr(self, "num_key_value_heads", self.num_heads), self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, getattr(self, "num_key_value_heads", self.num_heads), self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + if hasattr(self, "num_key_value_groups"): + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_output = 
torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + +class AquilaModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_aquila_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward + + +def _xverse_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + +def _internlm_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + cos = cos[position_ids].unsqueeze(1) + sin = sin[position_ids].unsqueeze(1) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = 
torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights, past_key_value + + +class XverseModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_xverse_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward + + +class InternLMModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_internlm_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward + + +class CodeGenModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + + # whole codegen bettertransformer patch include attn.forward and does not cover codegen2. + # For avoiding breaking model on tracing stage, we reduce area of bettertransformer patch only for _attn. 
+ from optimum.bettertransformer.models.attention import codegen_wrapped_scaled_dot_product + + for layer in self._model.transformer.h: + if is_torch_version(">=", "2.1.0") and not self._model.config.output_attentions: + orig_self_attn_fwd = layer.attn._attn + layer.attn._attn = types.MethodType(codegen_wrapped_scaled_dot_product, layer.attn) + layer.attn._orig_attn = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.transformer.h: + if hasattr(layer.attn, "_orig_attn"): + layer.attn._attn = layer.attn._orig_attn + + +# adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L763 +def _dbrx_experts_forward( + self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor +): + bsz, q_len, hidden_size = x.shape + x = x.view(-1, hidden_size) + out = torch.zeros_like(x) + + expert_mask = torch.nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) + # Chunk experts at once to avoid storing full parameter multiple times in autograd + w1_chunked = self.mlp.w1.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + v1_chunked = self.mlp.v1.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + w2_chunked = self.mlp.w2.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + w1_chunked = [w1.squeeze(dim=0) for w1 in w1_chunked] + v1_chunked = [v1.squeeze(dim=0) for v1 in v1_chunked] + w2_chunked = [w2.squeeze(dim=0) for w2 in w2_chunked] + for expert_idx in range(0, self.moe_num_experts): + topk_idx, token_idx = torch.where(expert_mask[expert_idx]) + + # Difference with original: removal + # if token_idx.shape[0] == 0: + # continue + # loop interruption depends on input data and may affect torchscript tracing + + token_list = token_idx + topk_list = topk_idx + + expert_tokens = x[None, token_list].reshape(-1, hidden_size) + expert_out = ( + self.mlp(expert_tokens, w1_chunked[expert_idx], v1_chunked[expert_idx], w2_chunked[expert_idx]) + * top_weights[token_list, topk_list, None] + ) + + out.index_add_(0, token_idx, expert_out) + + out = out.reshape(bsz, q_len, hidden_size) + return out + + +# adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L1228 +def _dbrx_update_causal_mask_legacy( + self, attention_mask: Optional[torch.Tensor], input_tensor: torch.Tensor, cache_position: torch.Tensor +) -> Optional[torch.Tensor]: + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + sequence_length = input_tensor.shape[1] + if hasattr(self.blocks[0].norm_attn_norm.attn, "past_key_value"): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = ( + attention_mask.shape[-1] if 
isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + ) + # difference with original modeling + # removed target_length = int(target_length). + # Casting to int leads to constant folding during tracing that makes impossible to use model for sequence of different length + causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + elif attention_mask.dim() == 4: + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[-2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). + is_tracing = ( + torch.jit.is_tracing() + or isinstance(input_tensor, torch.fx.Proxy) + or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) + ) + if not is_tracing and torch.any(attention_mask != 1): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# adopted from https://github.com/huggingface/transformers/blob/1b3dba9417eebe16b7c206d1dfca6a4c7f11dbec/src/transformers/models/dbrx/modeling_dbrx.py#L1204 +def _dbrx_update_causal_mask_latest( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, +): + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. 
See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + # difference with original modeling + causal_mask = ( + torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +if is_transformers_version(">", "4.40.2"): + _dbrx_update_causal_mask = _dbrx_update_causal_mask_latest +else: + _dbrx_update_causal_mask = _dbrx_update_causal_mask_legacy + + +class DBRXModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + # dbrx has some accuracy issues with bf16 with transformers >= 4.40 + # fill the causal mask in a slightly different way to avoid overflow on some platforms + self._model.transformer._orig_update_causal_mask = self._model.transformer._update_causal_mask + self._model.transformer._update_causal_mask = types.MethodType( + _dbrx_update_causal_mask, self._model.transformer + ) + + for block in self._model.transformer.blocks: + rotary_emb = block.norm_attn_norm.attn.rotary_emb + # initialize inv_freq for torchscript tracing + if rotary_emb.inv_freq is None: + inv_freq = 1.0 / ( + rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) + ) + rotary_emb.inv_freq = inv_freq + # remove the continue operator from the iteration loop over experts + block.ffn.experts._orig_forward = block.ffn.experts.forward + block.ffn.experts.forward = types.MethodType(_dbrx_experts_forward, block.ffn.experts) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.transformer._update_causal_mask = self._model.transformer._orig_update_causal_mask + for block in self._model.transformer.blocks: + block.ffn.experts.forward = block.ffn.experts._orig_forward diff --git a/optimum/intel/ipex/inference.py b/optimum/intel/ipex/inference.py index ccf2da9d8..a628ebe12 100644 --- a/optimum/intel/ipex/inference.py +++ b/optimum/intel/ipex/inference.py @@ -97,6 +97,10 @@ def __init__( jit (`boolean = False`, *optional*): Enable jit to accelerate inference speed """ + logger.warning( + "`inference_mode` is deprecated and will be removed in v1.18.0. Use `pipeline` to load and export your model to TorchScript instead."
+ ) + if not is_ipex_available(): raise ImportError(IPEX_NOT_AVAILABLE_ERROR_MSG) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 2b739ea50..e929a4ddb 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -39,6 +39,7 @@ GenerationConfig, GenerationMixin, PretrainedConfig, + is_torch_xpu_available, ) from transformers.dynamic_module_utils import get_class_from_dynamic_module from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput @@ -52,7 +53,7 @@ from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_TASK, _patch_model from ..generation.modeling import prepare_jit_inputs from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version -from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask +from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask, recursive_to_device logger = logging.getLogger(__name__) @@ -128,10 +129,14 @@ def __init__( **kwargs, ): OptimizedModel.__init__(self, model=model, config=config) - # To do: add XPU support - self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self._dtype = self.config.torch_dtype if self.config.torch_dtype is not None else torch.float32 + if is_torch_xpu_available(check_device=True): + self._device = torch.device("xpu:0") + elif torch.cuda.is_available(): + self._device = torch.device("cuda:0") + else: + self._device = torch.device("cpu") self.model.to(self._device) + self._dtype = self.config.torch_dtype if self.config.torch_dtype is not None else torch.float32 self.model_save_dir = model_save_dir self._is_ipex_exported = _is_patched_with_ipex(model, self.export_feature) @@ -161,6 +166,7 @@ def _from_transformers( local_files_only: bool = False, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: bool = False, + _commit_hash: str = None, ): if use_auth_token is not None: warnings.warn( @@ -186,6 +192,7 @@ def _from_transformers( "force_download": force_download, "torch_dtype": torch_dtype, "trust_remote_code": trust_remote_code, + "_commit_hash": _commit_hash, } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) @@ -319,6 +326,8 @@ def _init_warmup(self): if not self._is_ipex_exported: use_cache = "past_key_values" in self.input_names dummy_inputs = prepare_jit_inputs(self, self.export_feature, use_cache) + if self._device.type != "cpu": + dummy_inputs = recursive_to_device(value=dummy_inputs, device=self._device) for _ in range(2): self(**dummy_inputs) diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 2556a6048..bb3d2fe8c 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -22,6 +22,7 @@ import torch from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import EntryNotFoundError from neural_compressor.utils.pytorch import load from transformers import ( AutoConfig, @@ -40,6 +41,7 @@ ) from transformers.modeling_utils import no_init_weights from transformers.models.auto.auto_factory import _get_model_class +from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME from transformers.utils.generic import ContextManagers from optimum.intel.generation import BaseModelForCausalLM @@ -47,7 +49,7 @@ from ...modeling_base import OptimizedModel from 
..utils.import_utils import _torch_version, is_itrex_available, is_torch_version from .configuration import INCConfig -from .utils import WEIGHTS_NAME +from .utils import QUANTIZATION_CONFIG_NAME logger = logging.getLogger(__name__) @@ -119,42 +121,79 @@ def _from_pretrained( raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") token = use_auth_token - model_name_or_path = kwargs.pop("model_name_or_path", None) - if model_name_or_path is not None: - logger.warning("`model_name_or_path` is deprecated please use `model_id`") - model_id = model_id or model_name_or_path - model_path = Path(model_id) - - if model_path.is_dir(): - model_cache_path = model_path / file_name + is_local = model_path.is_dir() + model_cache_path = None + inc_config = None + msg = None + if is_local: + if (model_path / subfolder / SAFE_WEIGHTS_NAME).is_file(): + file_name = SAFE_WEIGHTS_NAME + elif not (model_path / subfolder / file_name).is_file(): + raise EnvironmentError( + f"Error no file named {SAFE_WEIGHTS_NAME} or {file_name} found in directory {model_path / subfolder}" + ) + model_cache_path = model_path / subfolder / file_name else: - model_cache_path = hf_hub_download( - repo_id=model_id, - filename=file_name, - subfolder=subfolder, - token=token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) + # Try download safetensors if exist + try: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=SAFE_WEIGHTS_NAME, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + except EntryNotFoundError: + pass + + if model_cache_path is None: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=file_name, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) model_save_dir = Path(model_cache_path).parent - inc_config = None - msg = None + if is_itrex_available(): - try: - quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json") + quantization_config_path = None + if is_local: + quantization_config_path = model_path / subfolder / QUANTIZATION_CONFIG_NAME + else: + try: + quantization_config_path = hf_hub_download( + repo_id=model_id, + filename=QUANTIZATION_CONFIG_NAME, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + except EntryNotFoundError: + pass + + if quantization_config_path and Path(quantization_config_path).is_file(): + quantization_config = PretrainedConfig.from_pretrained(quantization_config_path) algorithm = getattr(quantization_config, "quant_method", None) - if algorithm in {"rtn", "gptq", "awq", "autoaround"}: + if algorithm in {"rtn", "gptq", "awq", "autoround"}: from intel_extension_for_transformers.transformers.modeling.modeling_auto import ( _BaseQBitsAutoModelClass, ) _BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class - return _BaseQBitsAutoModelClass.from_pretrained( + model = _BaseQBitsAutoModelClass.from_pretrained( pretrained_model_name_or_path=model_id, token=token, revision=revision, @@ -163,12 +202,16 @@ def _from_pretrained( local_files_only=local_files_only, subfolder=subfolder, trust_remote_code=trust_remote_code, + use_neural_speed=False, **kwargs, ) - except EnvironmentError: - msg = "The model 
is not quantized with weight-only quantization." + + return cls( + model, config=config, model_save_dir=model_save_dir, q_config=quantization_config, **kwargs + ) + try: - inc_config = INCConfig.from_pretrained(model_id) + inc_config = INCConfig.from_pretrained(model_id, subfolder=subfolder, revision=revision) if not is_torch_version("==", inc_config.torch_version): msg = f"Quantized model was obtained with torch version {inc_config.torch_version} but {_torch_version} was found." logger.warning(f"{msg}") @@ -209,15 +252,19 @@ def _from_pretrained( ) def _save_pretrained(self, save_directory: Union[str, Path]): - output_path = os.path.join(save_directory, WEIGHTS_NAME) - if isinstance(self.model, torch.nn.Module): - state_dict = self.model.state_dict() - if self._q_config: - state_dict["best_configure"] = self._q_config - torch.save(state_dict, output_path) + # For ITREX model + if isinstance(self._q_config, PretrainedConfig): + self._q_config.to_json_file(os.path.join(save_directory, QUANTIZATION_CONFIG_NAME)) + self.model.save_pretrained(save_directory) + # For INC model the state dictionary needs to be modified to include the quantization parameters + else: + state_dict = self.model.state_dict() + if isinstance(self._q_config, dict): + state_dict["best_configure"] = self._q_config + torch.save(state_dict, os.path.join(save_directory, WEIGHTS_NAME)) else: - torch.jit.save(self.model, output_path) + torch.jit.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) if self.inc_config: self.inc_config.save_pretrained(save_directory) diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 9ee436593..500478712 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -29,6 +29,7 @@ from neural_compressor.model.onnx_model import ONNXModel from neural_compressor.model.torch_model import IPEXModel, PyTorchModel from neural_compressor.quantization import fit +from packaging.version import parse from torch.utils.data import DataLoader, RandomSampler from transformers import ( DataCollator, @@ -85,6 +86,7 @@ f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_itrex_version}, " f"but only version {ITREX_MINIMUM_VERSION} or higher is supported." ) + from intel_extension_for_transformers.transformers.llm.quantization.utils import convert_to_quantized_model from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit from intel_extension_for_transformers.transformers.utils.config import ( @@ -226,6 +228,14 @@ def quantize( # ITREX Weight Only Quantization if not isinstance(quantization_config, PostTrainingQuantConfig): + if is_itrex_version("==", "1.4.2") and ( + is_torch_version("!=", "2.3.0") or parse(_torch_version).local != "cpu" + ): + raise ImportError( + f"Found an incompatible version of `intel-extension-for-transformers` and `torch`. Found version itrex {_itrex_version} and torch {_torch_version}, " + f"but only torch 2.3.0+cpu is compatible with ITREX v1.4.2." 
+ ) + # check neural-compressor version if is_neural_compressor_version("<", NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION): raise ImportError( diff --git a/optimum/intel/neural_compressor/utils.py b/optimum/intel/neural_compressor/utils.py index 3173f5e1c..84c1d6dc2 100644 --- a/optimum/intel/neural_compressor/utils.py +++ b/optimum/intel/neural_compressor/utils.py @@ -28,6 +28,7 @@ CONFIG_NAME = "best_configure.yaml" +QUANTIZATION_CONFIG_NAME = "quantize_config.json" NEURAL_COMPRESSOR_MINIMUM_VERSION = "2.1.0" NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION = "2.3.0" diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 5de672b70..eb233f3d1 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -33,18 +33,25 @@ logger = logging.getLogger(__name__) _DEFAULT_4BIT_CONFIGS = { - "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, + "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64}, "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8}, "bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6}, "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128}, - "HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6}, - "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6}, + "HuggingFaceH4/zephyr-7b-beta": { + "bits": 4, + "sym": True, + "group_size": 128, + "ratio": 0.8, + "dataset": "wikitext2", + "awq": True, + }, + "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7}, "meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, "meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, "stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, "stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, - "stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8}, + "stabilityai/stablelm-zephyr-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 1.0}, "pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72}, "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6}, @@ -52,11 +59,25 @@ "tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, "psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, "mistralai/Mixtral-8x7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, + "facebook/opt-2.7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7}, + "togethercomputer/RedPajama-INCITE-Chat-3B-v1": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "lmsys/vicuna-7b-v1.5": {"bits": 4, "sym": False, "group_size": 128, "ratio": 1.0}, + "stabilityai/stablelm-tuned-alpha-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "mistralai/Mistral-7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.9}, + "baichuan-inc/Baichuan2-7B-Chat": { + "bits": 4, + "sym": True, + "group_size": 128, + "ratio": 0.8, + "dataset": "wikitext2", + "awq": True, + }, } class OVQuantizationMethod(str, Enum): DEFAULT = "default" + HYBRID = "hybrid" 
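Illustrative sketch (not part of the patch): the per-model entries above are defaults applied when a model is compressed to 4 bits without an explicit config; the same parameters can also be passed by hand through `OVWeightQuantizationConfig`. The snippet mirrors the updated zephyr entry; the model id and output directory are assumptions, and the `awq` flag from those defaults is left out to keep the example minimal.

from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Mirror the "HuggingFaceH4/zephyr-7b-beta" defaults added above.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    sym=True,
    group_size=128,
    ratio=0.8,
    dataset="wikitext2",  # data-aware compression needs a calibration dataset
)

model = OVModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("zephyr-7b-beta-int4-ov")  # hypothetical output directory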
@dataclass diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 8c09e6c0b..bc2cd93c8 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,7 +17,7 @@ import warnings from pathlib import Path from tempfile import TemporaryDirectory -from typing import Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import numpy as np import openvino @@ -28,6 +28,10 @@ from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin +from transformers.generation.configuration_utils import GenerationConfig +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.utils import GenerateOutput, GenerationMode from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.utils.normalized_config import NormalizedConfigManager @@ -38,7 +42,12 @@ from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .configuration import _DEFAULT_4BIT_CONFIGS, OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel -from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE +from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, STR_TO_OV_TYPE + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.streamers import BaseStreamer logger = logging.getLogger(__name__) @@ -122,6 +131,8 @@ def __init__( self._pkv_precision = Type.f32 self.next_beam_idx = None self._past_length = 0 + self._first_iter_beam_search = False + self._second_iter_beam_search = False self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -376,7 +387,9 @@ def prepare_inputs( inputs = {} if not self.stateful: if past_key_values is not None: - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -397,6 +410,7 @@ def prepare_inputs( elif self.use_cache: for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) + dtype = OV_TO_NP_TYPE[model_inputs.get_element_type().get_type_name()] shape = model_inputs.get_partial_shape() if self.config.model_type == "chatglm": shape[0] = 0 @@ -407,7 +421,7 @@ def prepare_inputs( shape[2] = 0 else: shape[1] = 0 - inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) + inputs[input_name] = np.empty([dim.get_length() for dim in shape], dtype=dtype) else: # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: @@ -424,7 +438,6 @@ def prepare_inputs( self.next_beam_idx = np.arange(batch_size, dtype=int) self._past_length = 0 past_len = self._get_past_length(past_key_values) - inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed if "attention_mask" in self.input_names or "position_ids" in self.input_names: @@ -474,6 +487,7 @@ def forward( 
infer_req = self.compiled_model.create_infer_request() self.request_dict[tid] = infer_req else: + tid = -1 infer_req = self.request inputs = self.prepare_inputs( @@ -484,9 +498,11 @@ **kwargs, ) + if self._first_iter_beam_search: + inputs, duplication_indices = self._deduplicate_inputs(inputs) # Run inference infer_req.start_async(inputs, share_inputs=True) infer_req.wait() logits = torch.from_numpy(infer_req.get_tensor("logits").data).to(self.device) if self.stateful: # Need a marker to differentiate the first generate iteration from the others in @@ -499,7 +517,9 @@ if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) past_key_values = tuple(infer_req.get_tensor(key).data for key in self.key_value_output_names) - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) @@ -507,6 +527,10 @@ else: past_key_values = None + if self._first_iter_beam_search: + logits, past_key_values = self._expand_outputs_for_generation(duplication_indices, logits, past_key_values) + self._first_iter_beam_search = False + return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation @@ -536,7 +560,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg if past_key_values: position_ids = position_ids[:, -input_ids.shape[1] :] - return { + model_inputs = { "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache, @@ -544,12 +568,114 @@ "attention_mask": attention_mask, } + return model_inputs + + def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple): + batch_size = logits.shape[0] + if indicies.shape[0] != 1: + logits = logits[indicies] + if past_key_values and not self.stateful: + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): + past_key_values = tuple( + tuple( + past_state[indicies] + if not self.config.model_type == "chatglm" + else past_state[:, indicies, ...]
+ for past_state in layer_past + ) + for layer_past in past_key_values + ) + else: + past_key_values = tuple([past_state[indicies] for past_state in past_key_values]) + if self.stateful: + self.next_beam_idx = ( + self.next_beam_idx[indicies] + if self.next_beam_idx is not None + else np.arange(batch_size, dtype=int)[indicies] + ) + self._second_iter_beam_search = True + return logits, past_key_values + + def _deduplicate_inputs(self, model_inputs: Dict): + input_ids = model_inputs["input_ids"] + upd_model_inputs = {} + unique_input_ids, indicies, reverse_indicies = np.unique( + input_ids, axis=0, return_index=True, return_inverse=True + ) + for input_name, input_tensor in model_inputs.items(): + if input_name not in ["input_ids", "beam_idx"]: + if input_name not in self.key_value_input_names: + upd_model_inputs[input_name] = input_tensor[indicies] + else: + shape = input_tensor.shape if isinstance(input_tensor, Tensor) else list(input_tensor.shape) + dtype = input_tensor.element_type if isinstance(input_tensor, Tensor) else Type(input_tensor.dtype) + upd_batch_size = indicies.shape[0] + if self.config.model_type == "bloom": + upd_batch_size *= self.config.num_attention_heads + shape[0 if not self.config.model_type == "chatglm" else 1] = upd_batch_size + upd_model_inputs[input_name] = Tensor(dtype, shape) + upd_model_inputs["input_ids"] = unique_input_ids + if "beam_idx" in model_inputs: + beam_range = ( + unique_input_ids.shape[0] + if self.config.model_type != "bloom" + else unique_input_ids.shape[0] * self.config.num_attention_heads + ) + beam_idx = np.arange(beam_range, dtype=int) + upd_model_inputs["beam_idx"] = beam_idx + return upd_model_inputs, reverse_indicies + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + _generation_config, _ = self._prepare_generation_config(generation_config, **kwargs) + generation_mode = _generation_config.get_generation_mode(assistant_model) + + is_beam_search = generation_mode in [ + GenerationMode.BEAM_SEARCH, + GenerationMode.BEAM_SAMPLE, + GenerationMode.GROUP_BEAM_SEARCH, + GenerationMode.CONSTRAINED_BEAM_SEARCH, + ] + if is_beam_search: + self._first_iter_beam_search = True + result = super().generate( + inputs, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + assistant_model, + streamer, + negative_prompt_ids, + negative_prompt_attention_mask, + **kwargs, + ) + return result + def _get_past_length(self, past_key_values=None): if past_key_values is None: return 0 if self.stateful: return self._past_length - if self.config.model_type in MULTI_QUERY_ATTN_MODELS: + if self.config.model_type in MULTI_QUERY_ATTN_MODELS and not ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): return past_key_values[0].shape[-2] seq_length_dim = -2 if self.config.model_type == "chatglm": @@ -574,12 +700,20 @@ def _reorder_cache( if self.stateful: # TODO: Apply it differently 
based on model type # TODO: At least for bloom we need to replicate values for each attention head - self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration + self.next_beam_idx = ( + np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx + ) # save beam_idx to be used as an input in the next iteration + self._second_iter_beam_search = False return past_key_values else: - return tuple( - tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values - ) + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): + return tuple( + tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) + for layer_past in past_key_values + ) + return tuple(np.take(past_state, beam_idx, 0) for past_state in past_key_values) def can_generate(self): """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" @@ -700,11 +834,12 @@ def _reorder_cache( This is required to match `past_key_values` with the correct beam_idx at every generation step. """ if self.stateful: - beam_idx = np.array(beam_idx) batch_size = beam_idx.shape[0] + beam_idx = np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx indices = np.array(range(batch_size * self.config.num_attention_heads)) indices = indices.reshape([batch_size, self.config.num_attention_heads]) self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() + self._second_iter_beam_search = False return past_key_values else: standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) @@ -754,6 +889,24 @@ def _convert_to_standard_cache( for layer_past in past_key_value ) + def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple): + batch_size = logits.shape[0] + if indicies.shape[0] != 1: + logits = logits[indicies] + if past_key_values and not self.stateful: + pkv_standard = self._convert_to_standard_cache(past_key_values, batch_size) + pkv = tuple(tuple(past_state[indicies] for past_state in layer_past) for layer_past in pkv_standard) + past_key_values = self._convert_to_bloom_cache(pkv) + + if self.stateful: + self.next_beam_idx = ( + self.next_beam_idx[indicies] + if self.next_beam_idx is not None + else np.arange(batch_size, dtype=int)[indicies] + ) + self._second_iter_beam_search = True + return logits, past_key_values + class OVGPTBigCodeForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache @@ -761,7 +914,9 @@ def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: if self.stateful: - self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration + # save beam_idx to be used as an input in the next iteration + self.next_beam_idx = np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx + self._second_iter_beam_search = False return past_key_values else: return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 2de7cb815..1b880e736 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -57,14 +57,13 @@ ) from 
...exporters.openvino import main_export -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .loaders import OVTextualInversionLoaderMixin from .modeling_base import OVBaseModel from .utils import ( ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, - PREDEFINED_SD_DATASETS, _print_compiled_model_properties, ) @@ -293,21 +292,7 @@ def _from_pretrained( else: kwargs[name] = load_method(new_model_save_dir) - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) - unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name - if quantization_config is not None and quantization_config.dataset is not None: - # load the UNet model uncompressed to apply hybrid quantization further - unet = cls.load_model(unet_path) - # Apply weights compression to other `components` without dataset - weight_quantization_params = { - param: value for param, value in quantization_config.__dict__.items() if param != "dataset" - } - weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_params) - else: - weight_quantization_config = quantization_config - unet = cls.load_model(unet_path, weight_quantization_config) - components = { "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "vae_decoder": new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, @@ -315,13 +300,19 @@ def _from_pretrained( "text_encoder_2": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, } - for key, value in components.items(): - components[key] = cls.load_model(value, weight_quantization_config) if value.is_file() else None - if model_save_dir is None: model_save_dir = new_model_save_dir - if quantization_config is not None and quantization_config.dataset is not None: + quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + if quantization_config is None or quantization_config.dataset is None: + unet = cls.load_model(unet_path, quantization_config) + for key, value in components.items(): + components[key] = cls.load_model(value, quantization_config) if value.is_file() else None + else: + # Load uncompressed models to apply hybrid quantization further + unet = cls.load_model(unet_path) + for key, value in components.items(): + components[key] = cls.load_model(value) if value.is_file() else None sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) supported_pipelines = ( @@ -332,12 +323,14 @@ def _from_pretrained( if not isinstance(sd_model, supported_pipelines): raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}") - nsamples = quantization_config.num_samples if quantization_config.num_samples else 200 - unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples) + from optimum.intel import OVQuantizer - from .quantization import _hybrid_quantization + hybrid_quantization_config = deepcopy(quantization_config) + hybrid_quantization_config.quant_method = OVQuantizationMethod.HYBRID + quantizer = OVQuantizer(sd_model) + quantizer.quantize(ov_config=OVConfig(quantization_config=hybrid_quantization_config)) - unet = _hybrid_quantization(sd_model.unet.model, weight_quantization_config, dataset=unet_inputs) + return sd_model return cls( unet=unet, @@ -348,62 +341,6 @@ def _from_pretrained( 
**kwargs, ) - def _prepare_unet_inputs( - self, - dataset: Union[str, List[Any]], - num_samples: int, - height: Optional[int] = None, - width: Optional[int] = None, - seed: Optional[int] = 42, - **kwargs, - ) -> Dict[str, Any]: - self.compile() - - size = self.unet.config.get("sample_size", 64) * self.vae_scale_factor - height = height or min(size, 512) - width = width or min(size, 512) - - if isinstance(dataset, str): - dataset = deepcopy(dataset) - available_datasets = PREDEFINED_SD_DATASETS.keys() - if dataset not in available_datasets: - raise ValueError( - f"""You have entered a string value for dataset. You can only choose between - {list(available_datasets)}, but the {dataset} was found""" - ) - - from datasets import load_dataset - - dataset_metadata = PREDEFINED_SD_DATASETS[dataset] - dataset = load_dataset(dataset, split=dataset_metadata["split"], streaming=True).shuffle(seed=seed) - input_names = dataset_metadata["inputs"] - dataset = dataset.select_columns(list(input_names.values())) - - def transform_fn(data_item): - return {inp_name: data_item[column] for inp_name, column in input_names.items()} - - else: - - def transform_fn(data_item): - return data_item if isinstance(data_item, (list, dict)) else [data_item] - - from .quantization import InferRequestWrapper - - calibration_data = [] - self.unet.request = InferRequestWrapper(self.unet.request, calibration_data) - - for inputs in dataset: - inputs = transform_fn(inputs) - if isinstance(inputs, dict): - self.__call__(**inputs, height=height, width=width) - else: - self.__call__(*inputs, height=height, width=width) - if len(calibration_data) >= num_samples: - break - - self.unet.request = self.unet.request.request - return calibration_data[:num_samples] - @classmethod def _from_transformers( cls, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 86e473fd1..43cf1dd93 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import collections.abc import copy import inspect import logging @@ -21,7 +22,6 @@ from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union -import datasets import nncf import openvino import torch @@ -48,15 +48,16 @@ from ...exporters.openvino.model_patcher import patch_model_with_bettertransformer from ...exporters.openvino.stateful import ensure_export_task_support_stateful, ensure_stateful_is_available from ..utils.constant import _TASK_ALIASES -from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available +from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available, is_diffusers_available from ..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, MIN_ONNX_QDQ_OPSET, ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, + PREDEFINED_SD_DATASETS, ) @@ -200,7 +201,7 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, save_directory: Optional[Union[str, Path]] = None, ov_config: OVConfig = None, file_name: Optional[str] = None, @@ -318,80 +319,106 @@ def _quantize_ovbasemodel( self, ov_config: OVConfig, save_directory: Union[str, Path] = None, - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, **kwargs, ): + if is_diffusers_available(): + from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase + if save_directory is not None: save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) - quantization_config = ov_config.quantization_config + + if calibration_dataset is not None: + # Process custom calibration dataset + + if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + calibration_dataset = self._prepare_unet_dataset( + quantization_config.num_samples, dataset=calibration_dataset + ) + elif is_datasets_available() and isinstance(calibration_dataset, Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) + if self.model.export_feature == "text-generation" and self.model.use_cache: + calibration_dataset = self._prepare_text_generation_dataset( + quantization_config, calibration_dataloader + ) + else: + calibration_dataset = nncf.Dataset(calibration_dataloader) + elif isinstance(calibration_dataset, collections.abc.Iterable): + calibration_dataset = nncf.Dataset(calibration_dataset) + elif not isinstance(calibration_dataset, nncf.Dataset): + raise ValueError( + "`calibration_dataset` must be either an `Iterable` object or an instance of " + f"`nncf.Dataset` or `datasets.Dataset`. Found: {type(calibration_dataset)}." 
+ ) + if isinstance(quantization_config, OVWeightQuantizationConfig): + if quantization_config.dataset is not None and calibration_dataset is not None: + logger.info( + "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " + "quantization. Will rely on `calibration_dataset`." + ) + if calibration_dataset is None and isinstance(quantization_config.dataset, str): from optimum.intel import OVModelForCausalLM if isinstance(self.model, OVModelForCausalLM): - from optimum.gptq.data import get_dataset, prepare_dataset - - tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) - nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 - calibration_dataset = get_dataset( - quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples + calibration_dataset = self._prepare_builtin_dataset(quantization_config) + elif is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + calibration_dataset = self._prepare_unet_dataset( + quantization_config.num_samples, dataset_name=quantization_config.dataset ) - calibration_dataset = prepare_dataset(calibration_dataset) - calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) else: raise ValueError( f"Can't create weight compression calibration dataset from string for {type(self.model)}" ) - _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) + if quantization_config.quant_method == OVQuantizationMethod.HYBRID: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run hybrid quantization.") + if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + # Apply weight-only quantization to all SD submodels except UNet + quantization_config_copy = copy.deepcopy(quantization_config) + quantization_config_copy.dataset = None + quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT + for sd_submodel_name in ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"]: + sd_submodel = getattr(self.model, sd_submodel_name) + if sd_submodel is not None: + _weight_only_quantization(sd_submodel.model, quantization_config_copy) + + # Apply hybrid quantization to UNet + self.model.unet.model = _hybrid_quantization( + self.model.unet.model, quantization_config, calibration_dataset + ) + else: + # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. 
+ self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) + else: + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) if save_directory is not None: self.model.save_pretrained(save_directory) ov_config.save_pretrained(save_directory) return + if not isinstance(quantization_config, OVQuantizationConfig): raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - if isinstance(calibration_dataset, nncf.Dataset): - quantization_dataset = calibration_dataset - elif isinstance(calibration_dataset, datasets.Dataset): - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) - - if self.model.export_feature == "text-generation" and self.model.use_cache: - # Prefetch past_key_values - self.model.update_pkv_precision(True) - self.model.compile() - collected_inputs = [] - - self.model.request = InferRequestWrapper(self.model.request, collected_inputs) - try: - for data in calibration_dataloader: - self.model.generate(**data, max_new_tokens=1) - if len(collected_inputs) >= quantization_config.num_samples: - break - finally: - self.model.request = self.model.request.request - quantization_dataset = nncf.Dataset(collected_inputs) - else: - quantization_dataset = nncf.Dataset(calibration_dataloader) - else: - if calibration_dataset is None: - raise ValueError("Calibration dataset is required to run quantization.") - quantization_dataset = nncf.Dataset(calibration_dataset) + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") # Actual model quantization quantized_model = nncf.quantize( self.model.model, - quantization_dataset, + calibration_dataset, subset_size=quantization_config.num_samples, ignored_scope=quantization_config.get_ignored_scope_instance(), model_type=nncf.ModelType(quantization_config.model_type), @@ -402,6 +429,7 @@ def _quantize_ovbasemodel( ), **kwargs, ) + self.model.model = quantized_model if save_directory is not None: self.model.save_pretrained(save_directory) @@ -411,7 +439,7 @@ def _quantize_torchmodel( self, ov_config: OVConfig, save_directory: Union[str, Path], - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, @@ -482,7 +510,7 @@ def _quantize_torchmodel( if isinstance(calibration_dataset, nncf.Dataset): quantization_dataset = calibration_dataset - elif isinstance(calibration_dataset, datasets.Dataset): + elif isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, @@ -567,7 +595,7 @@ def get_calibration_dataset( use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, - ) -> datasets.Dataset: + ) -> "Dataset": """ Create the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. 
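Illustrative sketch (not part of the patch) of the calibration handling above: `quantize()` now normalises whatever it receives, a `datasets.Dataset`, a plain iterable, or a ready-made `nncf.Dataset`, into an `nncf.Dataset` before calling `nncf.quantize()`. The example below exercises the static-quantization path together with `get_calibration_dataset`; the model id, GLUE/SST-2 dataset, preprocessing and output directory are assumptions.

from functools import partial

from transformers import AutoTokenizer

from optimum.intel import OVConfig, OVModelForSequenceClassification, OVQuantizationConfig, OVQuantizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

def preprocess_fn(examples, tokenizer):
    # Tokenize the raw sentences so the calibration dataloader yields model-ready inputs
    return tokenizer(examples["sentence"], padding=True, truncation=True, max_length=128)

quantizer = OVQuantizer.from_pretrained(model)
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    dataset_split="train",
    num_samples=128,
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
)
quantizer.quantize(
    ov_config=OVConfig(quantization_config=OVQuantizationConfig()),
    calibration_dataset=calibration_dataset,
    save_directory="distilbert-sst2-int8-ov",  # hypothetical output directory
)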
@@ -654,6 +682,104 @@ def _remove_unused_columns(self, dataset: "Dataset"): ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) return dataset.remove_columns(ignored_columns) + def _prepare_builtin_dataset(self, quantization_config: OVWeightQuantizationConfig): + from optimum.gptq.data import get_dataset, prepare_dataset + + tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) + nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 + calibration_dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) + calibration_dataset = prepare_dataset(calibration_dataset) + calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) + + return calibration_dataset + + def _prepare_text_generation_dataset( + self, quantization_config: OVQuantizationConfig, calibration_dataloader: OVDataLoader + ) -> nncf.Dataset: + # Prefetch past_key_values + self.model.update_pkv_precision(True) + self.model.compile() + collected_inputs = [] + + num_samples = quantization_config.num_samples or 200 + + self.model.request = InferRequestWrapper(self.model.request, collected_inputs) + try: + for data in calibration_dataloader: + self.model.generate(**data, max_new_tokens=1) + if len(collected_inputs) >= num_samples: + break + finally: + self.model.request = self.model.request.request + calibration_dataset = nncf.Dataset(collected_inputs) + + return calibration_dataset + + def _prepare_unet_dataset( + self, + num_samples: Optional[int] = None, + dataset_name: Optional[str] = None, + dataset: Optional[Union[Iterable, "Dataset"]] = None, + ) -> nncf.Dataset: + self.model.compile() + + size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor + height, width = 2 * (min(size, 512),) + num_samples = num_samples or 200 + + if dataset is not None: + if isinstance(dataset, nncf.Dataset): + return dataset + if is_datasets_available() and isinstance(dataset, Dataset): + dataset = dataset.select_columns(["caption"]) + + def transform_fn(data_item): + return data_item if isinstance(data_item, (list, dict)) else [data_item] + + elif isinstance(dataset_name, str): + available_datasets = PREDEFINED_SD_DATASETS.keys() + if dataset_name not in available_datasets: + raise ValueError( + f"""You have entered a string value for dataset. You can only choose between + {list(available_datasets)}, but the {dataset_name} was found""" + ) + + from datasets import load_dataset + + dataset_metadata = PREDEFINED_SD_DATASETS[dataset_name] + dataset = load_dataset(dataset_name, split=dataset_metadata["split"], streaming=True).shuffle( + seed=self.seed + ) + input_names = dataset_metadata["inputs"] + dataset = dataset.select_columns(list(input_names.values())) + + def transform_fn(data_item): + return {inp_name: data_item[column] for inp_name, column in input_names.items()} + + else: + raise ValueError( + "For UNet inputs collection either quantization_config.dataset or custom " + "calibration_dataset must be provided." 
+ ) + + calibration_data = [] + try: + self.model.unet.request = InferRequestWrapper(self.model.unet.request, calibration_data) + + for inputs in dataset: + inputs = transform_fn(inputs) + if isinstance(inputs, dict): + self.model(**inputs, height=height, width=width) + else: + self.model(*inputs, height=height, width=width) + if len(calibration_data) >= num_samples: + break + finally: + self.model.unet.request = self.model.unet.request.request + + calibration_dataset = nncf.Dataset(calibration_data[:num_samples]) + return calibration_dataset + def _weight_only_quantization( model: openvino.runtime.Model, @@ -664,14 +790,9 @@ def _weight_only_quantization( if isinstance(config, dict): config = OVWeightQuantizationConfig.from_dict(quantization_config) - if config.dataset is not None and calibration_dataset is not None: - logger.info( - "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " - "quantization. Will rely on `calibration_dataset`." - ) dataset = None if calibration_dataset is not None: - if isinstance(calibration_dataset, datasets.Dataset): + if is_datasets_available() and isinstance(calibration_dataset, Dataset): raise ValueError( "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " "quantization is not supported. Please provide it as `nncf.Dataset` or as iterable of " @@ -751,7 +872,7 @@ def _collect_ops_with_weights(model): def _hybrid_quantization( - model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig, dataset: Dict[str, Any] + model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig, dataset: nncf.Dataset ) -> openvino.runtime.Model: """ Quantize a model in hybrid mode with NNCF which means that we quantize: @@ -763,7 +884,7 @@ def _hybrid_quantization( The OpenVINO Runtime model for applying hybrid quantization. quantization_config (`OVWeightQuantizationConfig`): The configuration containing the parameters related to quantization. - dataset (`Dict[str, Any]`): + dataset (`nncf.Dataset`): The dataset used for hybrid quantization. Returns: The OpenVINO Runtime model with applied hybrid quantization. 
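Illustrative sketch (not part of the patch) of the user-facing path that ends in `_hybrid_quantization`: for a Stable Diffusion pipeline, submodels other than the UNet receive weight-only compression, while the UNet is calibrated through `_prepare_unet_dataset` and quantized in hybrid mode. A plain list of prompts is enough as calibration data; the model id, prompts and output directory are assumptions, and the constructor arguments follow the `OVWeightQuantizationConfig`/`OVQuantizationMethod` definitions touched in this patch.

from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig
from optimum.intel.openvino.configuration import OVQuantizationMethod

pipe = OVStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", export=True)

# Any iterable of prompts works: _prepare_unet_dataset runs the pipeline on each
# prompt and records the real UNet inputs through InferRequestWrapper.
prompts = [
    "a photo of an astronaut riding a horse on mars",
    "a watercolor painting of a lighthouse at sunrise",
]

quantization_config = OVWeightQuantizationConfig(bits=8, quant_method=OVQuantizationMethod.HYBRID)
quantizer = OVQuantizer(pipe)
quantizer.quantize(
    ov_config=OVConfig(quantization_config=quantization_config),
    calibration_dataset=prompts,
    save_directory="sd-v1-5-hybrid-int8-ov",  # hypothetical output directory
)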
@@ -780,7 +901,7 @@ def _hybrid_quantization( subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( model=compressed_model, - calibration_dataset=nncf.Dataset(dataset), + calibration_dataset=dataset, model_type=nncf.ModelType.TRANSFORMER, ignored_scope=ptq_ignored_scope, # SQ algo should be disabled for MatMul nodes because their weights are already compressed diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 0745a1cd7..0a1f5209a 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -153,8 +153,6 @@ "{re}.*Embedding.*", "{re}.*add___.*", "{re}.*layer_norm_.*", - "{re}.*matmul_1", - "{re}.*__truediv__.*", ], } @@ -906,7 +904,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): output_path = os.path.join(output_dir, OV_XML_FILE_NAME) self.compression_controller.prepare_for_export() model_type = self.model.config.model_type.replace("_", "-") - onnx_config_class = TasksManager.get_exporter_config_constructor( + exporter_config_class = TasksManager.get_exporter_config_constructor( exporter="onnx", model=self.model, task=self.task, @@ -914,9 +912,9 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): ) if self.task == "text-generation": - onnx_config = onnx_config_class(self.model.config, use_past=self.model.config.use_cache) + onnx_config = exporter_config_class(self.model.config, use_past=self.model.config.use_cache) else: - onnx_config = onnx_config_class(self.model.config) + onnx_config = exporter_config_class(self.model.config) num_parameters = self.model.num_parameters() save_as_external_data = use_external_data_format(num_parameters) or self.ov_config.save_onnx_model diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 4d1479f73..69a750fb6 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -17,10 +17,13 @@ import logging import os from glob import glob +from pathlib import Path +from typing import Tuple, Union import numpy as np from huggingface_hub import model_info from openvino.runtime import Core, Type, properties +from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size @@ -107,6 +110,24 @@ } +NEED_CONVERT_TO_FAST_TOKENIZER: Tuple[type(PreTrainedTokenizer)] = (CLIPTokenizer,) + + +def maybe_convert_tokenizer_to_fast( + hf_tokenizer: PreTrainedTokenizer, tokenizer_path: Path +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if isinstance(hf_tokenizer, PreTrainedTokenizerFast): + return hf_tokenizer + + if isinstance(hf_tokenizer, NEED_CONVERT_TO_FAST_TOKENIZER): + try: + return AutoTokenizer.from_pretrained(tokenizer_path) + except Exception: + return hf_tokenizer + + return hf_tokenizer + + def use_external_data_format(num_parameters: int) -> bool: """ Returns whether or not the model requires using external data format for the ONNX export diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py new file mode 100644 index 000000000..40a1e3ca5 --- /dev/null +++ b/optimum/intel/pipelines/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pipeline_base import pipeline diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py new file mode 100644 index 000000000..a6c6a36b0 --- /dev/null +++ b/optimum/intel/pipelines/pipeline_base.py @@ -0,0 +1,293 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, Optional, Union + +import torch +from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer +from transformers import pipeline as transformers_pipeline +from transformers.feature_extraction_utils import PreTrainedFeatureExtractor +from transformers.pipelines import ( + AudioClassificationPipeline, + FillMaskPipeline, + ImageClassificationPipeline, + QuestionAnsweringPipeline, + TextClassificationPipeline, + TextGenerationPipeline, + TokenClassificationPipeline, +) +from transformers.pipelines.base import Pipeline +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +from optimum.intel.utils import is_ipex_available + + +if is_ipex_available(): + from ..ipex.modeling_base import ( + IPEXModel, + IPEXModelForAudioClassification, + IPEXModelForCausalLM, + IPEXModelForImageClassification, + IPEXModelForMaskedLM, + IPEXModelForQuestionAnswering, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, + ) + + IPEX_SUPPORTED_TASKS = { + "text-generation": { + "impl": TextGenerationPipeline, + "class": (IPEXModelForCausalLM,), + "default": "gpt2", + "type": "text", + }, + "fill-mask": { + "impl": FillMaskPipeline, + "class": (IPEXModelForMaskedLM,), + "default": "bert-base-cased", + "type": "text", + }, + "question-answering": { + "impl": QuestionAnsweringPipeline, + "class": (IPEXModelForQuestionAnswering,), + "default": "distilbert-base-cased-distilled-squad", + "type": "text", + }, + "image-classification": { + "impl": ImageClassificationPipeline, + "class": (IPEXModelForImageClassification,), + "default": "google/vit-base-patch16-224", + "type": "image", + }, + "text-classification": { + "impl": TextClassificationPipeline, + "class": (IPEXModelForSequenceClassification,), + "default": "distilbert-base-uncased-finetuned-sst-2-english", + "type": "text", + }, + "token-classification": { + "impl": TokenClassificationPipeline, + "class": (IPEXModelForTokenClassification,), + "default": "dbmdz/bert-large-cased-finetuned-conll03-english", + "type": "text", + }, + "audio-classification": { + "impl": AudioClassificationPipeline, + "class": (IPEXModelForAudioClassification,), + 
"default": "superb/hubert-base-superb-ks", + "type": "audio", + }, + } +else: + IPEX_SUPPORTED_TASKS = {} + + +def load_ipex_model( + model, + targeted_task, + SUPPORTED_TASKS, + model_kwargs: Optional[Dict[str, Any]] = None, + hub_kwargs: Optional[Dict[str, Any]] = None, +): + if model_kwargs is None: + model_kwargs = {} + + ipex_model_class = SUPPORTED_TASKS[targeted_task]["class"][0] + + if model is None: + model_id = SUPPORTED_TASKS[targeted_task]["default"] + model = ipex_model_class.from_pretrained(model_id, export=True, **model_kwargs, **hub_kwargs) + elif isinstance(model, str): + model_id = model + try: + config = AutoConfig.from_pretrained(model) + export = not getattr(config, "torchscript", False) + except RuntimeError: + logger.warning("We will use IPEXModel with export=True to export the model") + export = True + model = ipex_model_class.from_pretrained(model, export=export, **model_kwargs, **hub_kwargs) + elif isinstance(model, IPEXModel): + model_id = getattr(model.config, "name_or_path", None) + else: + raise ValueError( + f"""Model {model} is not supported. Please provide a valid model name or path or a IPEXModel. + You can also provide non model then a default one will be used""" + ) + + return model, model_id + + +MAPPING_LOADING_FUNC = { + "ipex": load_ipex_model, +} + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + + +logger = logging.get_logger(__name__) + + +def pipeline( + task: str = None, + model: Optional[Union[str, "PreTrainedModel"]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, + feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, + use_fast: bool = True, + token: Optional[Union[str, bool]] = None, + accelerator: Optional[str] = None, + revision: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + torch_dtype: Optional[Union[str, torch.dtype]] = None, + commit_hash: Optional[str] = None, + **model_kwargs, +) -> Pipeline: + """ + Utility factory method to build a [`Pipeline`]. + + Pipelines are made of: + + - A [tokenizer](tokenizer) in charge of mapping raw textual input to token. + - A [model](model) to make predictions from the inputs. + - Some (optional) post processing for enhancing model's output. + + Args: + task (`str`): + The task defining which pipeline will be returned. Currently accepted tasks are: + + - `"text-generation"`: will return a [`TextGenerationPipeline`]:. + + model (`str` or [`PreTrainedModel`], *optional*): + The model that will be used by the pipeline to make predictions. This can be a model identifier or an + actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch). + + If not provided, the default for the `task` will be loaded. + tokenizer (`str` or [`PreTrainedTokenizer`], *optional*): + The tokenizer that will be used by the pipeline to encode data for the model. This can be a model + identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`]. + + If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model` + is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string). + However, if `config` is also not given or not a string, then the default tokenizer for the given `task` + will be loaded. 
+ accelerator (`str`, *optional*, defaults to `"ipex"`): + The optimization backends, choose from ["ipex", "inc", "openvino"]. + use_fast (`bool`, *optional*, defaults to `True`): + Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). + torch_dtype (`str` or `torch.dtype`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model + (`torch.float16`, `torch.bfloat16`, ... or `"auto"`). + model_kwargs (`Dict[str, Any]`, *optional*): + Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., + **model_kwargs)` function. + + Returns: + [`Pipeline`]: A suitable pipeline for the task. + + Examples: + + ```python + >>> import torch + >>> from optimum.intel.pipelines import pipeline + + >>> pipe = pipeline('text-generation', 'gpt2', torch_dtype=torch.bfloat16) + >>> pipe("Describe a real-world application of AI in sustainable energy.") + ```""" + if model_kwargs is None: + model_kwargs = {} + + if task is None and model is None: + raise RuntimeError( + "Impossible to instantiate a pipeline without either a task or a model " + "being specified. " + "Please provide a task class or a model" + ) + + if model is None and tokenizer is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer" + " may not be compatible with the default model. Please provide a PreTrainedModel class or a" + " path/identifier to a pretrained model when providing tokenizer." + ) + + if accelerator not in MAPPING_LOADING_FUNC: + if accelerator is None: + msg = "Impossible to instantiate a pipeline without specifying an `accelerator`." + else: + msg = f"`accelerator` {accelerator} is not supported." + + raise ValueError(msg + f" Supported list of `accelerator` is : {', '.join(MAPPING_LOADING_FUNC)}.") + + if accelerator == "ipex": + if task not in list(IPEX_SUPPORTED_TASKS.keys()): + raise ValueError( + f"Task {task} is not supported for the IPEX pipeline. Supported tasks are { list(IPEX_SUPPORTED_TASKS.keys())}" + ) + + supported_tasks = IPEX_SUPPORTED_TASKS if accelerator == "ipex" else None + + no_feature_extractor_tasks = set() + no_tokenizer_tasks = set() + for _task, values in supported_tasks.items(): + if values["type"] == "text": + no_feature_extractor_tasks.add(_task) + elif values["type"] in {"image", "video"}: + no_tokenizer_tasks.add(_task) + elif values["type"] in {"audio"}: + no_tokenizer_tasks.add(_task) + elif values["type"] not in ["multimodal", "audio", "video"]: + raise ValueError(f"SUPPORTED_TASK {_task} contains invalid type {values['type']}") + + load_tokenizer = task not in no_tokenizer_tasks + load_feature_extractor = task not in no_feature_extractor_tasks + + hub_kwargs = { + "revision": revision, + "token": token, + "trust_remote_code": trust_remote_code, + "_commit_hash": commit_hash, + } + + if isinstance(model, Path): + model = str(model) + + if torch_dtype is not None: + if "torch_dtype" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... 
torch_dtype=..., model_kwargs={"torch_dtype":...})` as those' + " arguments might conflict, use only one.)" + ) + model_kwargs["torch_dtype"] = torch_dtype + + # Load the correct model if possible + # Infer the framework from the model if not already defined + model, model_id = MAPPING_LOADING_FUNC[accelerator](model, task, supported_tasks, model_kwargs, hub_kwargs) + + if load_tokenizer and tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs, **model_kwargs) + if load_feature_extractor and feature_extractor is None: + feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **hub_kwargs, **model_kwargs) + + return transformers_pipeline( + task, + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + use_fast=use_fast, + torch_dtype=torch_dtype, + ) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 99ad42aaf..a2cd72835 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -169,3 +169,16 @@ def get_model_device(model: torch.nn.Module) -> torch.device: # The model had no parameters at all, doesn't matter which device to choose device = torch.device("cpu") return device + + +def recursive_to_device(value, device): + """ + Recursivley move the tensor element in `value` to `device` + """ + if isinstance(value, (tuple, list)): + return type(value)(recursive_to_device(v, device) for v in value) + elif isinstance(value, dict): + return {k: recursive_to_device(v, device) for k, v in value.items()} + elif isinstance(value, torch.Tensor): + return value.to(device) + return value diff --git a/optimum/intel/version.py b/optimum/intel/version.py index 9668d6215..a2a857944 100644 --- a/optimum/intel/version.py +++ b/optimum/intel/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.17.0.dev0" +__version__ = "1.18.0.dev0" diff --git a/setup.py b/setup.py index 251ec61cd..02d7f2845 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "transformers>=4.36.0,<4.41.0", - "optimum~=1.19", + "transformers>=4.36.0,<4.42.0", + "optimum~=1.20", "datasets>=1.4.0", "sentencepiece", "scipy", @@ -53,7 +53,7 @@ "transformers_stream_generator", "einops", "tiktoken", - "sentence_transformers", + "sentence-transformers", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] diff --git a/tests/ipex/test_inference.py b/tests/ipex/test_inference.py index b65d3c9b8..1a452fe40 100644 --- a/tests/ipex/test_inference.py +++ b/tests/ipex/test_inference.py @@ -16,8 +16,6 @@ import torch from parameterized import parameterized - -# TODO : add more tasks from transformers import ( AutoModelForCausalLM, AutoModelForQuestionAnswering, @@ -26,60 +24,51 @@ AutoTokenizer, pipeline, ) +from utils_tests import MODEL_NAMES from optimum.intel import inference_mode as ipex_inference_mode from optimum.intel.ipex.modeling_base import IPEXModel -MODEL_NAMES = { - "bert": "hf-internal-testing/tiny-random-bert", - "bloom": "hf-internal-testing/tiny-random-BloomModel", - "distilbert": "hf-internal-testing/tiny-random-distilbert", - "roberta": "hf-internal-testing/tiny-random-roberta", - "gptj": "hf-internal-testing/tiny-random-gptj", - "gpt2": "hf-internal-testing/tiny-random-gpt2", - "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", - "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "llama": "fxmarty/tiny-llama-fast-tokenizer", - "llama2": "Jiqing/tiny_random_llama2", - "opt": "hf-internal-testing/tiny-random-OPTModel", - "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", -} - _CLASSIFICATION_TASK_TO_AUTOMODELS = { "text-classification": AutoModelForSequenceClassification, "token-classification": AutoModelForTokenClassification, } -class IPEXIntegrationTest(unittest.TestCase): - CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( +class IPEXClassificationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( "bert", "distilbert", "roberta", ) - TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( - "bloom", - "gptj", - "gpt2", - "gpt_neo", - "gpt_bigcode", - "llama", - "llama2", - "opt", - "mpt", - ) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = "This is a sample input" + for task, auto_model_class in _CLASSIFICATION_TASK_TO_AUTOMODELS.items(): + model = auto_model_class.from_pretrained(model_id, torch_dtype=torch.float32) + pipe = pipeline(task, model=model, tokenizer=tokenizer) - QA_SUPPORTED_ARCHITECTURES = ( + with torch.inference_mode(): + outputs = pipe(inputs) + with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: + outputs_ipex = ipex_pipe(inputs) + self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(outputs[0]["score"], outputs_ipex[0]["score"]) + + +class IPEXQuestionAnsweringTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( "bert", "distilbert", "roberta", ) - @parameterized.expand(QA_SUPPORTED_ARCHITECTURES) - def test_question_answering_pipeline_inference(self, model_arch): + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): model_id = 
MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForQuestionAnswering.from_pretrained(model_id, torch_dtype=torch.float32) @@ -95,24 +84,22 @@ def test_question_answering_pipeline_inference(self, model_arch): self.assertEqual(outputs["start"], outputs_ipex["start"]) self.assertEqual(outputs["end"], outputs_ipex["end"]) - @parameterized.expand(CLASSIFICATION_SUPPORTED_ARCHITECTURES) - def test_classification_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" - for task, auto_model_class in _CLASSIFICATION_TASK_TO_AUTOMODELS.items(): - model = auto_model_class.from_pretrained(model_id, torch_dtype=torch.float32) - pipe = pipeline(task, model=model, tokenizer=tokenizer) - with torch.inference_mode(): - outputs = pipe(inputs) - with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: - outputs_ipex = ipex_pipe(inputs) - self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(outputs[0]["score"], outputs_ipex[0]["score"]) +class IPEXTextGenerationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( + "bloom", + "gptj", + "gpt2", + "gpt_neo", + "gpt_bigcode", + "llama", + "llama2", + "opt", + "mpt", + ) - @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) - def test_text_generation_pipeline_inference(self, model_arch): + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, return_dict=False) model = model.eval() diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 94a5ca9e1..2a2f18f6f 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -45,53 +45,11 @@ ) from optimum.intel.utils.import_utils import is_ipex_version from optimum.utils.testing_utils import grid_parameters +from utils_tests import MODEL_NAMES SEED = 42 -MODEL_NAMES = { - "albert": "hf-internal-testing/tiny-random-albert", - "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", - "bert": "hf-internal-testing/tiny-random-bert", - "bart": "hf-internal-testing/tiny-random-bart", - "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", - "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", - "bloom": "hf-internal-testing/tiny-random-BloomModel", - "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", - "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", - "convnext": "hf-internal-testing/tiny-random-convnext", - "distilbert": "hf-internal-testing/tiny-random-distilbert", - "electra": "hf-internal-testing/tiny-random-electra", - "flaubert": "hf-internal-testing/tiny-random-flaubert", - "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "gpt2": "hf-internal-testing/tiny-random-gpt2", - "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", - "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gptj": "hf-internal-testing/tiny-random-GPTJModel", - "levit": "hf-internal-testing/tiny-random-LevitModel", - "llama": "fxmarty/tiny-llama-fast-tokenizer", - "llama2": "Jiqing/tiny_random_llama2", - "marian": "sshleifer/tiny-marian-en-de", - "mbart": "hf-internal-testing/tiny-random-mbart", - "mistral": "echarlaix/tiny-random-mistral", - "mobilenet_v1": 
"google/mobilenet_v1_0.75_192", - "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", - "mobilevit": "hf-internal-testing/tiny-random-mobilevit", - "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", - "mt5": "stas/mt5-tiny-random", - "opt": "hf-internal-testing/tiny-random-OPTModel", - "phi": "echarlaix/tiny-random-PhiForCausalLM", - "resnet": "hf-internal-testing/tiny-random-resnet", - "roberta": "hf-internal-testing/tiny-random-roberta", - "roformer": "hf-internal-testing/tiny-random-roformer", - "squeezebert": "hf-internal-testing/tiny-random-squeezebert", - "t5": "hf-internal-testing/tiny-random-t5", - "unispeech": "hf-internal-testing/tiny-random-unispeech", - "vit": "hf-internal-testing/tiny-random-vit", - "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", - "xlm": "hf-internal-testing/tiny-random-xlm", -} - class Timer(object): def __enter__(self): diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py new file mode 100644 index 000000000..c4ae471a0 --- /dev/null +++ b/tests/ipex/test_pipelines.py @@ -0,0 +1,222 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from tempfile import TemporaryDirectory + +import numpy as np +import torch +from parameterized import parameterized +from transformers import AutoTokenizer +from transformers.pipelines import pipeline as transformers_pipeline +from utils_tests import MODEL_NAMES + +from optimum.intel.ipex.modeling_base import ( + IPEXModelForAudioClassification, + IPEXModelForCausalLM, + IPEXModelForImageClassification, + IPEXModelForMaskedLM, + IPEXModelForQuestionAnswering, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, +) +from optimum.intel.pipelines import pipeline as ipex_pipeline + + +class PipelinesIntegrationTest(unittest.TestCase): + COMMON_SUPPORTED_ARCHITECTURES = ( + "albert", + "bert", + "distilbert", + "electra", + "flaubert", + "roberta", + "roformer", + "squeezebert", + "xlm", + ) + TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( + "bart", + "gpt_bigcode", + "blenderbot", + "blenderbot-small", + "bloom", + "codegen", + "gpt2", + "gpt_neo", + "gpt_neox", + "llama", + "llama2", + "mistral", + "mpt", + "opt", + ) + QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES = ( + "bert", + "distilbert", + "roberta", + ) + AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( + "unispeech", + "wav2vec2", + ) + IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( + "beit", + "mobilenet_v1", + "mobilenet_v2", + "mobilevit", + "resnet", + "vit", + ) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_token_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("token-classification", model_id) + ipex_generator = ipex_pipeline("token-classification", model_id, accelerator="ipex") + inputs = "Hello I'm Omar and I live in Zürich." 
+ with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForTokenClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_sequence_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("text-classification", model_id) + ipex_generator = ipex_pipeline("text-classification", model_id, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(transformers_output[0]["label"], ipex_output[0]["label"]) + self.assertAlmostEqual(transformers_output[0]["score"], ipex_output[0]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_fill_mask_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + inputs = "The Milky Way is a <mask> galaxy." + transformers_generator = transformers_pipeline("fill-mask", model_id) + ipex_generator = ipex_pipeline("fill-mask", model_id, accelerator="ipex") + mask_token = transformers_generator.tokenizer.mask_token + inputs = inputs.replace("<mask>", mask_token) + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForMaskedLM)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertEqual(transformers_output[i]["token"], ipex_output[i]["token"]) + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) + def test_text_generation_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("text-generation", model_id) + ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") + inputs = "Describe a real-world application of AI."
+ with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForCausalLM)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) + + @parameterized.expand(QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES) + def test_question_answering_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("question-answering", model_id) + ipex_generator = ipex_pipeline("question-answering", model_id, accelerator="ipex") + question = "How many programming languages does BLOOM support?" + context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." + with torch.inference_mode(): + transformers_output = transformers_generator(question=question, context=context) + with torch.inference_mode(): + ipex_output = ipex_generator(question=question, context=context) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForQuestionAnswering)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertAlmostEqual(transformers_output["score"], ipex_output["score"], delta=1e-4) + self.assertEqual(transformers_output["start"], ipex_output["start"]) + self.assertEqual(transformers_output["end"], ipex_output["end"]) + + @parameterized.expand(AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES) + def test_audio_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("audio-classification", model_id) + ipex_generator = ipex_pipeline("audio-classification", model_id, accelerator="ipex") + inputs = [np.random.random(16000)] + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForAudioClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertAlmostEqual(transformers_output[0][0]["score"], ipex_output[0][0]["score"], delta=1e-2) + self.assertAlmostEqual(transformers_output[0][1]["score"], ipex_output[0][1]["score"], delta=1e-2) + + @parameterized.expand(IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES) + def test_image_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("image-classification", model_id) + ipex_generator = ipex_pipeline("image-classification", model_id, accelerator="ipex") + inputs = "http://images.cocodataset.org/val2017/000000039769.jpg" + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForImageClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertEqual(transformers_output[i]["label"], ipex_output[i]["label"]) + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + 
@parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_pipeline_load_from_ipex_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_generator = ipex_pipeline("text-classification", model, tokenizer=tokenizer, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertGreaterEqual(ipex_output[0]["score"], 0.0) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_pipeline_load_from_jit_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True) + save_dir = TemporaryDirectory().name + model.save_pretrained(save_dir) + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_generator = ipex_pipeline("text-classification", save_dir, tokenizer=tokenizer, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertGreaterEqual(ipex_output[0]["score"], 0.0) diff --git a/tests/ipex/utils_tests.py b/tests/ipex/utils_tests.py new file mode 100644 index 000000000..a14f0bf7c --- /dev/null +++ b/tests/ipex/utils_tests.py @@ -0,0 +1,57 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +MODEL_NAMES = { + "albert": "hf-internal-testing/tiny-random-albert", + "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", + "bert": "hf-internal-testing/tiny-random-bert", + "bart": "hf-internal-testing/tiny-random-bart", + "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", + "bloom": "hf-internal-testing/tiny-random-BloomModel", + "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "convnext": "hf-internal-testing/tiny-random-convnext", + "distilbert": "hf-internal-testing/tiny-random-distilbert", + "electra": "hf-internal-testing/tiny-random-electra", + "flaubert": "hf-internal-testing/tiny-random-flaubert", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", + "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "levit": "hf-internal-testing/tiny-random-LevitModel", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", + "marian": "sshleifer/tiny-marian-en-de", + "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", + "mobilevit": "hf-internal-testing/tiny-random-mobilevit", + "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", + "mt5": "stas/mt5-tiny-random", + "opt": "hf-internal-testing/tiny-random-OPTModel", + "phi": "echarlaix/tiny-random-PhiForCausalLM", + "resnet": "hf-internal-testing/tiny-random-resnet", + "roberta": "hf-internal-testing/tiny-random-roberta", + "roformer": "hf-internal-testing/tiny-random-roformer", + "squeezebert": "hf-internal-testing/tiny-random-squeezebert", + "t5": "hf-internal-testing/tiny-random-t5", + "unispeech": "hf-internal-testing/tiny-random-unispeech", + "vit": "hf-internal-testing/tiny-random-vit", + "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "xlm": "hf-internal-testing/tiny-random-xlm", +} diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py index e6ce4763f..0c3e60969 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -16,10 +16,12 @@ import os import tempfile import unittest +from pathlib import Path import torch from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed +from transformers.utils import SAFE_WEIGHTS_NAME from optimum.exporters import TasksManager from optimum.intel import ( # noqa @@ -37,7 +39,8 @@ INCStableDiffusionPipeline, INCTrainer, ) -from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME +from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, QUANTIZATION_CONFIG_NAME, WEIGHTS_NAME +from optimum.intel.utils.import_utils import is_itrex_available os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -52,7 +55,7 @@ MODEL_NAMES_TO_TASK = ( - ("hf-internal-testing/tiny-random-gpt2", "text-generation"), + ("hf-internal-testing/tiny-random-GPT2LMHeadModel", "text-generation"), ("hf-internal-testing/tiny-random-BertForMaskedLM", "fill-mask"), ("hf-internal-testing/tiny-random-DistilBertForSequenceClassification", "text-classification"), 
("hf-internal-testing/tiny-random-DebertaV2Model", "feature-extraction"), @@ -86,7 +89,7 @@ def test_compare_to_transformers(self, model_id, task): outputs = inc_model(**model_inputs) with tempfile.TemporaryDirectory() as tmpdirname: inc_model.save_pretrained(tmpdirname) - loaded_model = model_class.from_pretrained(tmpdirname, file_name=WEIGHTS_NAME) + loaded_model = model_class.from_pretrained(tmpdirname) outputs_loaded = loaded_model(**model_inputs) if task == "feature-extraction": @@ -143,3 +146,57 @@ def test_compare_with_and_without_past_key_values(self): self.assertEqual(outputs_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_without_pkv.shape[1], self.GENERATION_LENGTH) self.assertTrue(torch.equal(outputs_with_pkv, outputs_without_pkv)) + + @unittest.skipIf(not is_itrex_available(), reason="ITREX not available") + def test_saving_loading_woq_itrex_model(self): + model_name = "echarlaix/tiny-random-PhiForCausalLM" + subfolder = "itrex" + model = INCModelForCausalLM.from_pretrained(model_name, revision="itrex", subfolder=subfolder) + tokenizer = AutoTokenizer.from_pretrained(model_name, revision="itrex") + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + tokens = tokenizer("This is a sample output", return_tensors="pt") + + with tempfile.TemporaryDirectory() as tmp_dir: + model_save_dir = Path(tmp_dir) / subfolder + model.save_pretrained(model_save_dir) + folder_contents = os.listdir(model_save_dir) + self.assertIn(SAFE_WEIGHTS_NAME, folder_contents) + self.assertIn(QUANTIZATION_CONFIG_NAME, folder_contents) + loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir, subfolder=subfolder) + + with torch.no_grad(): + outputs = model(**tokens) + loaded_outputs = loaded_model(**tokens) + + self.assertTrue("logits" in loaded_outputs) + self.assertIsInstance(loaded_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in loaded_outputs) + self.assertIsInstance(loaded_outputs.past_key_values, tuple) + self.assertTrue(torch.allclose(outputs.logits, loaded_outputs.logits, atol=1e-5)) + + def test_saving_loading_inc_model(self): + model_name = "echarlaix/tiny-random-PhiForCausalLM" + subfolder = "inc" + model = INCModelForCausalLM.from_pretrained(model_name, revision="inc", subfolder=subfolder) + tokenizer = AutoTokenizer.from_pretrained(model_name, revision="inc") + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + tokens = tokenizer("This is a sample output", return_tensors="pt") + + with tempfile.TemporaryDirectory() as tmp_dir: + model_save_dir = Path(tmp_dir) / subfolder + model.save_pretrained(model_save_dir) + folder_contents = os.listdir(model_save_dir) + self.assertIn(WEIGHTS_NAME, folder_contents) + self.assertIn("inc_config.json", folder_contents) + loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir, subfolder=subfolder) + self.assertIsInstance(loaded_model.inc_config, INCConfig) + + with torch.no_grad(): + outputs = model(**tokens) + loaded_outputs = loaded_model(**tokens) + + self.assertTrue("logits" in loaded_outputs) + self.assertIsInstance(loaded_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in loaded_outputs) + self.assertIsInstance(loaded_outputs.past_key_values, tuple) + self.assertTrue(torch.allclose(outputs.logits, loaded_outputs.logits, atol=1e-5)) diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index da4258613..56f2a5bac 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -47,7 +47,6 
@@ from utils_tests import MODEL_NAMES, SEED, INCTestMixin, _generate_dataset from optimum.intel.utils.import_utils import is_torch_version, is_itrex_available - from optimum.intel import ( INCConfig, INCModelForCausalLM, diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py index a6d09954f..210623758 100644 --- a/tests/neural_compressor/utils_tests.py +++ b/tests/neural_compressor/utils_tests.py @@ -81,7 +81,7 @@ "electra": "hf-internal-testing/tiny-random-electra", "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt2": "hf-internal-testing/tiny-random-GPT2LMHeadModel", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 851f8355f..8f61d9a36 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -18,8 +18,10 @@ from tempfile import TemporaryDirectory from typing import Optional +import torch from parameterized import parameterized -from transformers import AutoConfig +from sentence_transformers import SentenceTransformer, models +from transformers import AutoConfig, AutoTokenizer from utils_tests import MODEL_NAMES from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED @@ -124,7 +126,7 @@ def test_export(self, model_type: str): class CustomExportModelTest(unittest.TestCase): - def test_export_custom_model(self): + def test_custom_export_config_model(self): class BertOnnxConfigWithPooler(BertOnnxConfig): @property def outputs(self): @@ -157,3 +159,26 @@ def outputs(self): self.assertIsInstance(ov_model, OVBaseModel) self.assertTrue(ov_model.output_names == {"last_hidden_state": 0, "pooler_output": 1}) + + def test_export_custom_model(self): + model_id = "hf-internal-testing/tiny-random-BertModel" + word_embedding_model = models.Transformer(model_id, max_seq_length=256) + pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) + dense_model = models.Dense( + in_features=pooling_model.get_sentence_embedding_dimension(), + out_features=256, + ) + model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model]) + + with TemporaryDirectory() as tmpdirname: + export_from_model(model, output=tmpdirname, task="feature-extraction") + ov_model = OVModelForCustomTasks.from_pretrained(tmpdirname) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer("This is a sample input", return_tensors="pt") + with torch.no_grad(): + model_outputs = model(tokens) + + ov_outputs = ov_model(**tokens) + self.assertTrue(torch.allclose(ov_outputs.token_embeddings, model_outputs.token_embeddings, atol=1e-4)) + self.assertTrue(torch.allclose(ov_outputs.sentence_embedding, model_outputs.sentence_embedding, atol=1e-4)) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 09fad5d77..cce25bbae 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -18,7 +18,6 @@ from parameterized import parameterized from utils_tests import ( - _ARCHITECTURES_TO_EXPECTED_INT4_INT8, _ARCHITECTURES_TO_EXPECTED_INT8, MODEL_NAMES, get_num_quantized_nodes, @@ -74,8 +73,8 @@ class OVCLIExportTestCase(unittest.TestCase): "wav2vec2": 0, # no tokenizer "bert": 1, # no 
detokenizer "blenderbot": 2, - "stable-diffusion": 0, # not supported - "stable-diffusion-xl": 0, # not supported + "stable-diffusion": 2, + "stable-diffusion-xl": 4, } SUPPORTED_SD_HYBRID_ARCHITECTURES = ( @@ -84,14 +83,13 @@ class OVCLIExportTestCase(unittest.TestCase): ("latent-consistency", 50, 135), ) - SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),) - - SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"] - - TEST_4BIT_CONFIGURATONS = [] - for arch in SUPPORTED_4BIT_ARCHITECTURES: - for option in SUPPORTED_4BIT_OPTIONS: - TEST_4BIT_CONFIGURATONS.append([arch[0], arch[1], option]) + TEST_4BIT_CONFIGURATONS = [ + ("text-generation-with-past", "opt125m", "int4_sym_g128", 62, 86), + ("text-generation-with-past", "opt125m", "int4_asym_g128", 62, 86), + ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86), + ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86), + ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32), + ] def _openvino_export( self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None @@ -197,17 +195,16 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in self.assertEqual(exp_num_fq, num_fq) @parameterized.expand(TEST_4BIT_CONFIGURATONS) - def test_exporters_cli_int4(self, task: str, model_type: str, option: str): + def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int): with TemporaryDirectory() as tmpdir: subprocess.run( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", shell=True, check=True, ) model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {} model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs) - expected_int8, expected_int4 = _ARCHITECTURES_TO_EXPECTED_INT4_INT8[model_type] _, num_int8, num_int4 = get_num_quantized_nodes(model) self.assertEqual(expected_int8, num_int8) self.assertEqual(expected_int4, num_int4) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index d4f55c683..0cb332276 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -527,6 +527,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "bloom", "chatglm", "codegen", + "codegen2", # "data2vec-text", # TODO : enable when enabled in exporters "gemma", "gpt2", @@ -552,6 +553,17 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "orion", "falcon", "falcon-40b", + "persimmon", + "biogpt", + "gpt_neox_japanese", + "cohere", + "xglm", + "aquila", + "aquila2", + "xverse", + "internlm", + "dbrx", + "qwen2-moe", ) GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( @@ -564,6 +576,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "internlm2", "orion", "phi3", + "aquila", + "aquila2", + "xverse", + "internlm", + "codegen2", ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -591,6 +608,7 @@ def test_compare_to_transformers(self, model_arch): self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokens = tokenizer("This is a sample output", 
return_tensors="pt") + tokens.pop("token_type_ids", None) ov_outputs = ov_model(**tokens) self.assertTrue("logits" in ov_outputs) @@ -617,11 +635,15 @@ def test_compare_to_transformers(self, model_arch): if model_arch == "qwen": return - if model_arch != "chatglm": + if model_arch not in ["chatglm", "persimmon"]: tokenizer.pad_token_id = tokenizer.eos_token_id + + if model_arch == "persimmon": + tokenizer.pad_token_id = tokenizer.bos_token_id # Compare batched generation tokenizer.padding_side = "left" tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens.pop("token_type_ids", None) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -778,6 +800,94 @@ def test_default_filling_attention_mask_and_position_ids(self): del model_with_cache gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow + def test_beam_search(self, model_arch): + model_kwargs = {} + model_id = MODEL_NAMES[model_arch] + if model_arch in self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + # Qwen tokenizer does not support padding, chatgm testing model produces nan that incompatible with beam search + if model_arch in ["qwen", "chatglm"]: + return + + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + if model_arch == "persimmon": + tokenizer.pad_token_id = tokenizer.bos_token_id + tokenizer.eos_token_id = tokenizer.bos_token_id + + beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + ) + beam_sample_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=True, + eos_token_id=None, + top_k=1, + ) + + group_beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + num_beam_groups=2, + diversity_penalty=0.0000001, + ) + force_word = "cat" + force_words_ids = [tokenizer([force_word], add_special_tokens=False).input_ids] + constrained_beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + force_words_ids=force_words_ids, + ) + + gen_configs = [ + beam_search_gen_config, + beam_sample_gen_config, + group_beam_search_gen_config, + constrained_beam_search_gen_config, + ] + ov_model_stateful = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=True, stateful=True, **model_kwargs + ) + ov_model_stateless = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=True, stateful=False, **model_kwargs + ) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + tokenizer.pad_token_id = tokenizer.eos_token_id + tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens.pop("token_type_ids", None) + ov_model_stateful.generation_config.eos_token_id = None + ov_model_stateless.generation_config.eos_token_id = None + transformers_model.generation_config.eos_token_id = None + ov_model_stateful.config.eos_token_id = None + ov_model_stateless.config.eos_token_id = None + transformers_model.config.eos_token_id = None + + for idx, gen_config in enumerate(gen_configs): + if 
gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]: + continue + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) + self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs), f"generation config : {idx}") + ov_stateless_outputs = ov_model_stateless.generate(**tokens, generation_config=gen_config) + self.assertTrue(torch.allclose(ov_stateless_outputs, transformers_outputs), f"generation config : {idx}") + class OVModelForMaskedLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( @@ -1569,7 +1679,7 @@ def test_compare_output_attentions(self, model_arch): preprocessor = AutoFeatureExtractor.from_pretrained(model_id) inputs = preprocessor(images=image, return_tensors="pt") - transformers_model = AutoModelForImageClassification.from_pretrained(model_id) + transformers_model = AutoModelForImageClassification.from_pretrained(model_id, attn_implementation="eager") transformers_model.eval() with torch.no_grad(): transformers_outputs = transformers_model(**inputs, output_attentions=True) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 896f37d76..b7ed36d3e 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -21,21 +21,17 @@ from collections import defaultdict from enum import Enum from functools import partial -from typing import List, Union +from typing import Union import evaluate import numpy as np import torch from datasets import load_dataset -from nncf.quantization.advanced_parameters import OverflowFix from parameterized import parameterized -import openvino.runtime as ov import nncf from transformers import ( AutoModelForQuestionAnswering, AutoModelForSequenceClassification, - AutoModelForCausalLM, - AutoModelForTokenClassification, AutoTokenizer, AutoProcessor, TrainingArguments, @@ -77,12 +73,16 @@ class OVQuantizerTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( + SUPPORTED_ARCHITECTURES_TORCH_MODEL = ( + (OVModelForSequenceClassification, "bert", 22, 35), + (OVModelForCausalLM, "gpt2", 41, 3), + ) + SUPPORTED_ARCHITECTURES_OV_MODEL = ( (OVModelForSequenceClassification, "bert", 32, 35), - # (OVModelForCausalLM, "gpt2", 41, 23), + (OVModelForCausalLM, "gpt2", 31, 22), ) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) + @parameterized.expand(SUPPORTED_ARCHITECTURES_TORCH_MODEL) def test_automodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): model_id = MODEL_NAMES[model_name] task = model_cls.export_feature @@ -127,23 +127,21 @@ def preprocess_function(examples, tokenizer): loaded_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) + @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): model_id = MODEL_NAMES[model_name] task = model_cls.export_feature dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task] - if "gpt2" in model_id: - expected_int8 -= 1 def preprocess_function(examples, tokenizer): return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True) with tempfile.TemporaryDirectory() as 
tmp_dir: - transformers_model = model_cls.from_pretrained(model_id, export=True) + ov_model = model_cls.from_pretrained(model_id, export=True) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) + quantizer = OVQuantizer.from_pretrained(ov_model, task=task) calibration_dataset = quantizer.get_calibration_dataset( dataset_name, @@ -413,8 +411,12 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3), ) + quantizer = OVQuantizer(model) + quantization_config = OVWeightQuantizationConfig( + bits=8, num_samples=3, quant_method=OVQuantizationMethod.HYBRID + ) + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) self.assertEqual(expected_ov_int8, num_int8) @@ -663,7 +665,7 @@ def preprocess_function(examples, tokenizer): class OVTrainerTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 49, 38),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 67, 38),) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8): diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index c998d00d8..89d644319 100644 --- a/tests/openvino/test_training.py +++ b/tests/openvino/test_training.py @@ -322,7 +322,7 @@ def tearDown(self): "default_quantization": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss"], ), @@ -330,14 +330,14 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], ), "customized_quantization": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss"], ), @@ -345,7 +345,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], ), @@ -365,7 +365,7 @@ def tearDown(self): "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -376,7 +376,7 
@@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -385,7 +385,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -397,7 +397,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -418,7 +418,7 @@ def tearDown(self): "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -429,7 +429,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -438,7 +438,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -450,7 +450,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -553,7 +553,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, compression_metrics=["compression_loss"], ), @@ -572,7 +572,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss"], @@ -580,7 +580,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + 
expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss"], @@ -589,7 +589,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): model_id="yujiepan/tiny-random-swin-patch4-window7-224", teacher_model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -598,7 +598,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): model_id="yujiepan/tiny-random-swin-patch4-window7-224", teacher_model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -797,7 +797,9 @@ def prepare_model_and_dataset(self, desc: OVTrainerTestDescriptor): self.feature_extractor = AutoFeatureExtractor.from_pretrained(desc.model_id) self.tokenizer = self.feature_extractor - self.model = AutoModelForAudioClassification.from_pretrained(desc.model_id, num_labels=self.num_labels) + self.model = AutoModelForAudioClassification.from_pretrained( + desc.model_id, num_labels=self.num_labels, attn_implementation="eager" + ) self.teacher_model = None if desc.teacher_model_id: self.teacher_model = AutoModelForAudioClassification.from_pretrained( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 9f28e40a4..0789f1983 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -18,6 +18,8 @@ MODEL_NAMES = { "albert": "hf-internal-testing/tiny-random-albert", + "aquila": "katuni4ka/tiny-random-aquilachat", + "aquila2": "katuni4ka/tiny-random-aquila2", "audio_spectrogram_transformer": "Ericwang/tiny-random-ast", "bge": "BAAI/bge-small-en-v1.5", "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", @@ -26,16 +28,20 @@ "baichuan2": "katuni4ka/tiny-random-baichuan2", "baichuan2-13b": "katuni4ka/tiny-random-baichuan2-13b", "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", + "biogpt": "hf-tiny-model-private/tiny-random-BioGptForCausalLM", "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "cohere": "hf-internal-testing/tiny-random-CohereForCausalLM", "chatglm": "katuni4ka/tiny-random-chatglm2", "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "codegen2": "katuni4ka/tiny-random-codegen2", "data2vec_text": "hf-internal-testing/tiny-random-Data2VecTextModel", "data2vec_vision": "hf-internal-testing/tiny-random-Data2VecVisionModel", "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", + "dbrx": "katuni4ka/tiny-random-dbrx", "deberta": "hf-internal-testing/tiny-random-deberta", "deberta_v2": "hf-internal-testing/tiny-random-DebertaV2Model", "deit": "hf-internal-testing/tiny-random-deit", @@ -51,13 +57,15 @@ "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": 
"hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gpt_neox_japanese": "hf-internal-testing/tiny-random-GPTNeoXJapaneseForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-ibert", + "internlm": "katuni4ka/tiny-random-internlm", "internlm2": "katuni4ka/tiny-random-internlm2", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", - "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama": "HuggingFaceM4/tiny-random-LlamaForCausalLM", "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM", "llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", @@ -78,12 +86,14 @@ "olmo": "katuni4ka/tiny-random-olmo-hf", "orion": "katuni4ka/tiny-random-orion", "pegasus": "hf-internal-testing/tiny-random-pegasus", + "persimmon": "hf-internal-testing/tiny-random-PersimmonForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", "phi": "echarlaix/tiny-random-PhiForCausalLM", "phi3": "katuni4ka/tiny-random-phi3", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen": "katuni4ka/tiny-random-qwen", "qwen2": "Qwen/Qwen1.5-0.5B", + "qwen2-moe": "katuni4ka/tiny-random-qwen1.5-moe", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer", @@ -115,6 +125,8 @@ "whisper": "openai/whisper-tiny.en", "xlm": "hf-internal-testing/tiny-random-xlm", "xlm_roberta": "hf-internal-testing/tiny-xlm-roberta", + "xglm": "hf-internal-testing/tiny-random-XGLMForCausalLM", + "xverse": "katuni4ka/tiny-random-xverse", } @@ -140,8 +152,6 @@ "stable-diffusion-xl-refiner": (366, 34, 42, 66), } -_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 86)} - def get_num_quantized_nodes(ov_model): num_fake_quantize = 0