From fa1bc56f151f5e50f19a0b856eba83cd822ce7be Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 30 Apr 2024 15:12:25 +0200 Subject: [PATCH 01/47] Proper datasets.Dataset importing --- optimum/intel/openvino/quantization.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 86e473fd1..d4889c561 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -21,7 +21,6 @@ from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union -import datasets import nncf import openvino import torch @@ -62,6 +61,8 @@ if is_datasets_available(): from datasets import Dataset +else: + Dataset = None register_module(ignored_algorithms=[])(Conv1D) @@ -318,7 +319,7 @@ def _quantize_ovbasemodel( self, ov_config: OVConfig, save_directory: Union[str, Path] = None, - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, @@ -358,7 +359,7 @@ def _quantize_ovbasemodel( if isinstance(calibration_dataset, nncf.Dataset): quantization_dataset = calibration_dataset - elif isinstance(calibration_dataset, datasets.Dataset): + elif Dataset is not None and isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, @@ -411,7 +412,7 @@ def _quantize_torchmodel( self, ov_config: OVConfig, save_directory: Union[str, Path], - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, @@ -482,7 +483,7 @@ def _quantize_torchmodel( if isinstance(calibration_dataset, nncf.Dataset): quantization_dataset = calibration_dataset - elif isinstance(calibration_dataset, datasets.Dataset): + elif isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, @@ -567,7 +568,7 @@ def get_calibration_dataset( use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, - ) -> datasets.Dataset: + ) -> "Dataset": """ Create the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -671,7 +672,7 @@ def _weight_only_quantization( ) dataset = None if calibration_dataset is not None: - if isinstance(calibration_dataset, datasets.Dataset): + if Dataset is not None and isinstance(calibration_dataset, Dataset): raise ValueError( "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " "quantization is not supported. 
Please provide it as `nncf.Dataset` or as iterable of " From d02e281f2f94c791890f99277d0926e2ae7810e0 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 14:23:57 +0100 Subject: [PATCH 02/47] OV Tokenizer Leftovers - Support SD Pipeline Slow Tokenizer Conversion - Support SD Mixed Quantization - Move Converted OV Tokenizers to a Separate Folder --- optimum/commands/export/openvino.py | 9 +++++++++ optimum/exporters/openvino/__main__.py | 7 ++++--- optimum/exporters/openvino/convert.py | 11 ++++++----- optimum/intel/openvino/utils.py | 24 ++++++++++++++++++++++++ tests/openvino/test_exporters_cli.py | 4 ++-- 5 files changed, 45 insertions(+), 10 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 4fed3f6f8..c225c50d7 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -21,6 +21,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager +from ...exporters.openvino.convert import export_tokenizer from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -261,6 +262,14 @@ def run(self): ) model.save_pretrained(self.args.output) + output = Path(self.args.output) + tokenizer = getattr(model, "tokenizer", None) + if tokenizer is not None: + export_tokenizer(tokenizer, output / "tokenizer") + + tokenizer_2 = getattr(model, "tokenizer_2", None) + if tokenizer_2 is not None: + export_tokenizer(tokenizer_2, output / "tokenizer_2") else: if self.args.convert_tokenizer: logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 8908c430b..3fa4fb0eb 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -25,6 +25,7 @@ from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model, export_tokenizer +from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -364,7 +365,7 @@ class StoreAttr(object): if tokenizer is not None: try: - export_tokenizer(tokenizer, output) + export_tokenizer(tokenizer, output / OV_TOKENIZER_FLOLDER) except Exception as exception: logger.warning( "Could not load tokenizer using specified model ID or path. 
OpenVINO tokenizer/detokenizer " @@ -373,11 +374,11 @@ class StoreAttr(object): else: tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: - export_tokenizer(tokenizer, output) + export_tokenizer(tokenizer, output / "tokenizer") tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output, suffix="_2") + export_tokenizer(tokenizer_2, output / "tokenizer_2") elif convert_tokenizer and not is_openvino_tokenizers_available(): logger.warning("Tokenizer won't be converted.") diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 6c86c2c2d..bb781a690 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -667,20 +667,21 @@ def export_tokenizer( output: Union[str, Path], suffix: Optional[str] = "", ): - from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME # avoid circular imports + # avoid circular imports + from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME + from optimum.intel.openvino.utils import maybe_convert_tokenizer_to_fast try: from openvino_tokenizers import convert_tokenizer except ModuleNotFoundError: - # avoid this message before tokenizers are part of the openvino dependencies - # logger.info( - # "Run `pip install openvino-tokenizers[transformers]` to get OpenVINO tokenizer/detokenizer models." - # ) return if not isinstance(output, Path): output = Path(output) + if output.exists(): + tokenizer = maybe_convert_tokenizer_to_fast(tokenizer, output) + try: converted = convert_tokenizer(tokenizer, with_detokenizer=True) except NotImplementedError: diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 4d1479f73..89994a7ac 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -17,10 +17,13 @@ import logging import os from glob import glob +from pathlib import Path +from typing import List, Union import numpy as np from huggingface_hub import model_info from openvino.runtime import Core, Type, properties +from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size @@ -31,6 +34,7 @@ OV_DECODER_NAME = "openvino_decoder_model.xml" OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml" +OV_TOKENIZER_FLOLDER = "openvino_tokenizer" OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml" OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml" @@ -107,6 +111,26 @@ } +NEED_CONVERT_TO_FAST_TOKENIZER: List[PreTrainedTokenizer] = [ + CLIPTokenizer, +] + + +def maybe_convert_tokenizer_to_fast( + hf_tokenizer: PreTrainedTokenizer, tokenizer_path: Path +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if isinstance(hf_tokenizer, PreTrainedTokenizerFast): + return hf_tokenizer + + if any(isinstance(type(hf_tokenizer), slow_class) for slow_class in NEED_CONVERT_TO_FAST_TOKENIZER): + try: + return AutoTokenizer.from_pretrained(tokenizer_path) + except Exception: + return hf_tokenizer + + return hf_tokenizer + + def use_external_data_format(num_parameters: int) -> bool: """ Returns whether or not the model requires using external data format for the ONNX export diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 09fad5d77..c91f28ba5 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -74,8 +74,8 @@ class 
OVCLIExportTestCase(unittest.TestCase): "wav2vec2": 0, # no tokenizer "bert": 1, # no detokenizer "blenderbot": 2, - "stable-diffusion": 0, # not supported - "stable-diffusion-xl": 0, # not supported + "stable-diffusion": 2, + "stable-diffusion-xl": 2, } SUPPORTED_SD_HYBRID_ARCHITECTURES = ( From 135c2e9b8f96e54b95baa7c626fc4be3fb0cdc08 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 15:01:49 +0100 Subject: [PATCH 03/47] Fix Circular Import --- optimum/commands/export/openvino.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index c225c50d7..5a6cfeb02 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -21,7 +21,6 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager -from ...exporters.openvino.convert import export_tokenizer from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -262,6 +261,9 @@ def run(self): ) model.save_pretrained(self.args.output) + # avoid circular import + from ...exporters.openvino.convert import export_tokenizer + output = Path(self.args.output) tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: From 1766570e78ca7a3f38e1d8d47326c6fb70e7ba7c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 15:12:20 +0100 Subject: [PATCH 04/47] Fix Circular Import --- optimum/commands/export/openvino.py | 4 +--- optimum/exporters/openvino/__main__.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 5a6cfeb02..c225c50d7 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -21,6 +21,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager +from ...exporters.openvino.convert import export_tokenizer from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -261,9 +262,6 @@ def run(self): ) model.save_pretrained(self.args.output) - # avoid circular import - from ...exporters.openvino.convert import export_tokenizer - output = Path(self.args.output) tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 3fa4fb0eb..41eb0200d 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -25,7 +25,6 @@ from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model, export_tokenizer -from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -356,6 +355,9 @@ class StoreAttr(object): **kwargs_shapes, ) + # avoid circular import + from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER + if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": tokenizer = next( From a1ee74970357e5c8ab2164d0bd381993cc035f35 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 17:07:30 +0100 Subject: [PATCH 
05/47] Fix Tests --- optimum/intel/openvino/utils.py | 4 ++-- tests/openvino/test_exporters_cli.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 89994a7ac..6b49f7a83 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -111,7 +111,7 @@ } -NEED_CONVERT_TO_FAST_TOKENIZER: List[PreTrainedTokenizer] = [ +NEED_CONVERT_TO_FAST_TOKENIZER: List[type(PreTrainedTokenizer)] = [ CLIPTokenizer, ] @@ -122,7 +122,7 @@ def maybe_convert_tokenizer_to_fast( if isinstance(hf_tokenizer, PreTrainedTokenizerFast): return hf_tokenizer - if any(isinstance(type(hf_tokenizer), slow_class) for slow_class in NEED_CONVERT_TO_FAST_TOKENIZER): + if any(isinstance(hf_tokenizer, slow_class) for slow_class in NEED_CONVERT_TO_FAST_TOKENIZER): try: return AutoTokenizer.from_pretrained(tokenizer_path) except Exception: diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index c91f28ba5..cac79abae 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -75,7 +75,7 @@ class OVCLIExportTestCase(unittest.TestCase): "bert": 1, # no detokenizer "blenderbot": 2, "stable-diffusion": 2, - "stable-diffusion-xl": 2, + "stable-diffusion-xl": 4, } SUPPORTED_SD_HYBRID_ARCHITECTURES = ( From ef9e5df7d23596a1b95a02698ab58d5e018634c0 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 18:40:37 +0100 Subject: [PATCH 06/47] Fix INC Tests --- optimum/commands/export/openvino.py | 4 +++- optimum/exporters/openvino/__main__.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index c225c50d7..a7302ef88 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -21,7 +21,6 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager -from ...exporters.openvino.convert import export_tokenizer from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -262,6 +261,9 @@ def run(self): ) model.save_pretrained(self.args.output) + # not export when using other exporters + from ...exporters.openvino.convert import export_tokenizer + output = Path(self.args.output) tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 41eb0200d..234e34aa9 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -24,7 +24,7 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED -from optimum.exporters.openvino.convert import export_from_model, export_tokenizer +from optimum.exporters.openvino.convert import export_from_model from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -357,6 +357,8 @@ class StoreAttr(object): # avoid circular import from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER + # hide openvino import when using other exporters + from optimum.exporters.openvino.convert import export_tokenizer if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": From 
1f44ce9da35249f08276ac8affa56ec0d63ac503 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 18:41:27 +0100 Subject: [PATCH 07/47] Make Style --- optimum/exporters/openvino/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 234e34aa9..a43c42e44 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -355,10 +355,10 @@ class StoreAttr(object): **kwargs_shapes, ) - # avoid circular import - from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER # hide openvino import when using other exporters + # avoid circular import from optimum.exporters.openvino.convert import export_tokenizer + from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": From ca30de156918069eb1af2d13bd2545a7f2b5a851 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 10:32:19 +0200 Subject: [PATCH 08/47] SD calibration dataset collection refactoring --- optimum/intel/openvino/configuration.py | 1 + optimum/intel/openvino/modeling_diffusion.py | 82 ++------ optimum/intel/openvino/quantization.py | 210 ++++++++++++++----- tests/openvino/test_quantization.py | 4 +- 4 files changed, 174 insertions(+), 123 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 5de672b70..30dfe5ae6 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -57,6 +57,7 @@ class OVQuantizationMethod(str, Enum): DEFAULT = "default" + HYBRID = "hybrid" @dataclass diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 2de7cb815..ae86ea2bf 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import copy import importlib import logging import os @@ -57,7 +57,7 @@ ) from ...exporters.openvino import main_export -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVWeightQuantizationConfig, OVQuantizationMethod from .loaders import OVTextualInversionLoaderMixin from .modeling_base import OVBaseModel from .utils import ( @@ -300,13 +300,11 @@ def _from_pretrained( # load the UNet model uncompressed to apply hybrid quantization further unet = cls.load_model(unet_path) # Apply weights compression to other `components` without dataset - weight_quantization_params = { - param: value for param, value in quantization_config.__dict__.items() if param != "dataset" - } - weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_params) + quantization_config_without_dataset = copy.deepcopy(quantization_config) + quantization_config_without_dataset.dataset = None else: - weight_quantization_config = quantization_config - unet = cls.load_model(unet_path, weight_quantization_config) + quantization_config_without_dataset = quantization_config + unet = cls.load_model(unet_path, quantization_config_without_dataset) components = { "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, @@ -316,7 +314,7 @@ def _from_pretrained( } for key, value in components.items(): - components[key] = cls.load_model(value, weight_quantization_config) if value.is_file() else None + components[key] = cls.load_model(value, quantization_config_without_dataset) if value.is_file() else None if model_save_dir is None: model_save_dir = new_model_save_dir @@ -332,12 +330,14 @@ def _from_pretrained( if not isinstance(sd_model, supported_pipelines): raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}") - nsamples = quantization_config.num_samples if quantization_config.num_samples else 200 - unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples) + from optimum.intel import OVQuantizer - from .quantization import _hybrid_quantization + quantizer = OVQuantizer(sd_model) + quantization_config_copy = copy.deepcopy(quantization_config) + quantization_config_copy.quant_method = OVQuantizationMethod.HYBRID + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy)) - unet = _hybrid_quantization(sd_model.unet.model, weight_quantization_config, dataset=unet_inputs) + return sd_model return cls( unet=unet, @@ -348,62 +348,6 @@ def _from_pretrained( **kwargs, ) - def _prepare_unet_inputs( - self, - dataset: Union[str, List[Any]], - num_samples: int, - height: Optional[int] = None, - width: Optional[int] = None, - seed: Optional[int] = 42, - **kwargs, - ) -> Dict[str, Any]: - self.compile() - - size = self.unet.config.get("sample_size", 64) * self.vae_scale_factor - height = height or min(size, 512) - width = width or min(size, 512) - - if isinstance(dataset, str): - dataset = deepcopy(dataset) - available_datasets = PREDEFINED_SD_DATASETS.keys() - if dataset not in available_datasets: - raise ValueError( - f"""You have entered a string value for dataset. 
You can only choose between - {list(available_datasets)}, but the {dataset} was found""" - ) - - from datasets import load_dataset - - dataset_metadata = PREDEFINED_SD_DATASETS[dataset] - dataset = load_dataset(dataset, split=dataset_metadata["split"], streaming=True).shuffle(seed=seed) - input_names = dataset_metadata["inputs"] - dataset = dataset.select_columns(list(input_names.values())) - - def transform_fn(data_item): - return {inp_name: data_item[column] for inp_name, column in input_names.items()} - - else: - - def transform_fn(data_item): - return data_item if isinstance(data_item, (list, dict)) else [data_item] - - from .quantization import InferRequestWrapper - - calibration_data = [] - self.unet.request = InferRequestWrapper(self.unet.request, calibration_data) - - for inputs in dataset: - inputs = transform_fn(inputs) - if isinstance(inputs, dict): - self.__call__(**inputs, height=height, width=width) - else: - self.__call__(*inputs, height=height, width=width) - if len(calibration_data) >= num_samples: - break - - self.unet.request = self.unet.request.request - return calibration_data[:num_samples] - @classmethod def _from_transformers( cls, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index d4889c561..f2258864a 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import collections.abc import copy import inspect import logging @@ -49,13 +50,14 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from ..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig, OVQuantizationMethod from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, MIN_ONNX_QDQ_OPSET, ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, + PREDEFINED_SD_DATASETS, ) @@ -201,7 +203,7 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, save_directory: Optional[Union[str, Path]] = None, ov_config: OVConfig = None, file_name: Optional[str] = None, @@ -325,74 +327,84 @@ def _quantize_ovbasemodel( remove_unused_columns: bool = True, **kwargs, ): + from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase + if save_directory is not None: save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) - quantization_config = ov_config.quantization_config + + if calibration_dataset is not None: + # Process custom calibration dataset + + if isinstance(self.model, OVStableDiffusionPipelineBase): + calibration_dataset = self._prepare_unet_dataset( + quantization_config.num_samples, + dataset=calibration_dataset) + elif Dataset is not None and isinstance(calibration_dataset, Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) + + if self.model.export_feature == "text-generation" and self.model.use_cache: + calibration_dataset = self._prepare_text_generation_dataset( 
+ quantization_config, calibration_dataloader) + else: + calibration_dataset = nncf.Dataset(calibration_dataloader) + elif isinstance(calibration_dataset, collections.abc.Iterable): + calibration_dataset = nncf.Dataset(calibration_dataset) + elif not isinstance(calibration_dataset, nncf.Dataset): + raise ValueError("`calibration_dataset` must be either an `Iterable` object or an instance of " + f"`nncf.Dataset` or `datasets.Dataset`. Found: {type(calibration_dataset)}.") + if isinstance(quantization_config, OVWeightQuantizationConfig): + if quantization_config.dataset is not None and calibration_dataset is not None: + logger.info( + "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " + "quantization. Will rely on `calibration_dataset`." + ) + if calibration_dataset is None and isinstance(quantization_config.dataset, str): from optimum.intel import OVModelForCausalLM if isinstance(self.model, OVModelForCausalLM): - from optimum.gptq.data import get_dataset, prepare_dataset - - tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) - nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 - calibration_dataset = get_dataset( - quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples - ) - calibration_dataset = prepare_dataset(calibration_dataset) - calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) + calibration_dataset = self._prepare_gptq_dataset(quantization_config) + elif isinstance(self.model, OVStableDiffusionPipelineBase): + calibration_dataset = self._prepare_unet_dataset( + quantization_config.num_samples, + dataset_name=quantization_config.dataset) else: raise ValueError( f"Can't create weight compression calibration dataset from string for {type(self.model)}" ) - _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) + if quantization_config.quant_method == OVQuantizationMethod.HYBRID: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run hybrid quantization.") + if isinstance(self.model, OVStableDiffusionPipelineBase): + self.model.unet.model = _hybrid_quantization(self.model.unet.model, quantization_config, calibration_dataset) + else: + self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) + else: + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) if save_directory is not None: self.model.save_pretrained(save_directory) ov_config.save_pretrained(save_directory) return + if not isinstance(quantization_config, OVQuantizationConfig): raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - if isinstance(calibration_dataset, nncf.Dataset): - quantization_dataset = calibration_dataset - elif Dataset is not None and isinstance(calibration_dataset, Dataset): - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) - - if self.model.export_feature == "text-generation" and self.model.use_cache: - # Prefetch past_key_values - self.model.update_pkv_precision(True) - self.model.compile() - collected_inputs = [] - - self.model.request = InferRequestWrapper(self.model.request, collected_inputs) - try: - for data in calibration_dataloader: - self.model.generate(**data, max_new_tokens=1) - if 
len(collected_inputs) >= quantization_config.num_samples: - break - finally: - self.model.request = self.model.request.request - quantization_dataset = nncf.Dataset(collected_inputs) - else: - quantization_dataset = nncf.Dataset(calibration_dataloader) - else: - if calibration_dataset is None: - raise ValueError("Calibration dataset is required to run quantization.") - quantization_dataset = nncf.Dataset(calibration_dataset) + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") # Actual model quantization quantized_model = nncf.quantize( self.model.model, - quantization_dataset, + calibration_dataset, subset_size=quantization_config.num_samples, ignored_scope=quantization_config.get_ignored_scope_instance(), model_type=nncf.ModelType(quantization_config.model_type), @@ -655,6 +667,103 @@ def _remove_unused_columns(self, dataset: "Dataset"): ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) return dataset.remove_columns(ignored_columns) + def _prepare_gptq_dataset(self, quantization_config: OVWeightQuantizationConfig): + from optimum.gptq.data import get_dataset, prepare_dataset + + tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) + nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 + calibration_dataset = get_dataset( + quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples + ) + calibration_dataset = prepare_dataset(calibration_dataset) + calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) + + return calibration_dataset + + def _prepare_text_generation_dataset( + self, + quantization_config: OVQuantizationConfig, + calibration_dataloader: OVDataLoader) -> nncf.Dataset: + # TODO: this function is not covered by tests, remove if not relevant anymore or cover by tests otherwise + + # Prefetch past_key_values + self.model.update_pkv_precision(True) + self.model.compile() + collected_inputs = [] + + num_samples = quantization_config.num_samples or 200 + + self.model.request = InferRequestWrapper(self.model.model.request, collected_inputs) + try: + for data in calibration_dataloader: + self.model.generate(**data, max_new_tokens=1) + if len(collected_inputs) >= num_samples: + break + finally: + self.model.model.request = self.model.model.request.request + calibration_dataset = nncf.Dataset(collected_inputs) + return calibration_dataset + + def _prepare_unet_dataset( + self, + num_samples: Optional[int] = None, + dataset_name: Optional[str] = None, + dataset: Optional[Union[Iterable, "Dataset"]] = None) -> nncf.Dataset: + self.model.compile() + + size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor + height, width = 2 * (min(size, 512),) + num_samples = num_samples or 200 + + if dataset is not None: + if isinstance(dataset, nncf.Dataset): + return dataset + if Dataset is not None and isinstance(dataset, Dataset): + dataset = dataset.select_columns(["caption"]) + + def transform_fn(data_item): + return data_item if isinstance(data_item, (list, dict)) else [data_item] + + elif isinstance(dataset_name, str): + available_datasets = PREDEFINED_SD_DATASETS.keys() + if dataset_name not in available_datasets: + raise ValueError( + f"""You have entered a string value for dataset. 
You can only choose between + {list(available_datasets)}, but the {dataset_name} was found""" + ) + + from datasets import load_dataset + + dataset_metadata = PREDEFINED_SD_DATASETS[dataset_name] + dataset = load_dataset(dataset_name, split=dataset_metadata["split"], streaming=True).shuffle(seed=self.seed) + input_names = dataset_metadata["inputs"] + dataset = dataset.select_columns(list(input_names.values())) + + def transform_fn(data_item): + return {inp_name: data_item[column] for inp_name, column in input_names.items()} + + else: + raise ValueError("For UNet inputs collection either quantization_config.dataset or custom " + "calibration_dataset must be provided.") + + calibration_data = [] + try: + self.model.unet.request = InferRequestWrapper(self.model.unet.request, calibration_data) + + for inputs in dataset: + inputs = transform_fn(inputs) + if isinstance(inputs, dict): + self.model(**inputs, height=height, width=width) + else: + self.model(*inputs, height=height, width=width) + if len(calibration_data) >= num_samples: + break + finally: + self.model.unet.request = self.model.unet.request.request + + calibration_dataset = nncf.Dataset(calibration_data[:num_samples]) + return calibration_dataset + def _weight_only_quantization( model: openvino.runtime.Model, @@ -665,11 +774,6 @@ def _weight_only_quantization( if isinstance(config, dict): config = OVWeightQuantizationConfig.from_dict(quantization_config) - if config.dataset is not None and calibration_dataset is not None: - logger.info( - "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " - "quantization. Will rely on `calibration_dataset`." - ) dataset = None if calibration_dataset is not None: if Dataset is not None and isinstance(calibration_dataset, Dataset): @@ -752,7 +856,7 @@ def _collect_ops_with_weights(model): def _hybrid_quantization( - model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig, dataset: Dict[str, Any] + model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig, dataset: nncf.Dataset ) -> openvino.runtime.Model: """ Quantize a model in hybrid mode with NNCF which means that we quantize: @@ -764,7 +868,7 @@ def _hybrid_quantization( The OpenVINO Runtime model for applying hybrid quantization. quantization_config (`OVWeightQuantizationConfig`): The configuration containing the parameters related to quantization. - dataset (`Dict[str, Any]`): + dataset (`nncf.Dataset`): The dataset used for hybrid quantization. Returns: The OpenVINO Runtime model with applied hybrid quantization. 
@@ -781,7 +885,7 @@ def _hybrid_quantization( subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( model=compressed_model, - calibration_dataset=nncf.Dataset(dataset), + calibration_dataset=dataset, model_type=nncf.ModelType.TRANSFORMER, ignored_scope=ptq_ignored_scope, # SQ algo should be disabled for MatMul nodes because their weights are already compressed diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 896f37d76..de6b80827 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -413,8 +413,10 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3), ) + quantizer = OVQuantizer(model) + quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=3, quant_method=OVQuantizationMethod.HYBRID) + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) self.assertEqual(expected_ov_int8, num_int8) From de9b5c18c0b508422361a6562cbdd90b144aa776 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 10:39:36 +0200 Subject: [PATCH 09/47] linters --- optimum/intel/openvino/modeling_diffusion.py | 9 ++-- optimum/intel/openvino/quantization.py | 51 +++++++++++--------- tests/openvino/test_quantization.py | 10 ++-- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index ae86ea2bf..c5afb2c14 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import copy + import importlib import logging import os @@ -57,14 +57,13 @@ ) from ...exporters.openvino import main_export -from .configuration import OVConfig, OVWeightQuantizationConfig, OVQuantizationMethod +from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .loaders import OVTextualInversionLoaderMixin from .modeling_base import OVBaseModel from .utils import ( ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, - PREDEFINED_SD_DATASETS, _print_compiled_model_properties, ) @@ -300,7 +299,7 @@ def _from_pretrained( # load the UNet model uncompressed to apply hybrid quantization further unet = cls.load_model(unet_path) # Apply weights compression to other `components` without dataset - quantization_config_without_dataset = copy.deepcopy(quantization_config) + quantization_config_without_dataset = deepcopy(quantization_config) quantization_config_without_dataset.dataset = None else: quantization_config_without_dataset = quantization_config @@ -333,7 +332,7 @@ def _from_pretrained( from optimum.intel import OVQuantizer quantizer = OVQuantizer(sd_model) - quantization_config_copy = copy.deepcopy(quantization_config) + quantization_config_copy = deepcopy(quantization_config) quantization_config_copy.quant_method = OVQuantizationMethod.HYBRID quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy)) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f2258864a..a749f38e6 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -50,7 +50,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from ..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig, OVQuantizationMethod +from .configuration import OVConfig, OVQuantizationConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, @@ -339,8 +339,8 @@ def _quantize_ovbasemodel( if isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( - quantization_config.num_samples, - dataset=calibration_dataset) + quantization_config.num_samples, dataset=calibration_dataset + ) elif Dataset is not None and isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, @@ -351,14 +351,17 @@ def _quantize_ovbasemodel( if self.model.export_feature == "text-generation" and self.model.use_cache: calibration_dataset = self._prepare_text_generation_dataset( - quantization_config, calibration_dataloader) + quantization_config, calibration_dataloader + ) else: calibration_dataset = nncf.Dataset(calibration_dataloader) elif isinstance(calibration_dataset, collections.abc.Iterable): calibration_dataset = nncf.Dataset(calibration_dataset) elif not isinstance(calibration_dataset, nncf.Dataset): - raise ValueError("`calibration_dataset` must be either an `Iterable` object or an instance of " - f"`nncf.Dataset` or `datasets.Dataset`. Found: {type(calibration_dataset)}.") + raise ValueError( + "`calibration_dataset` must be either an `Iterable` object or an instance of " + f"`nncf.Dataset` or `datasets.Dataset`. Found: {type(calibration_dataset)}." 
+ ) if isinstance(quantization_config, OVWeightQuantizationConfig): if quantization_config.dataset is not None and calibration_dataset is not None: @@ -374,8 +377,8 @@ def _quantize_ovbasemodel( calibration_dataset = self._prepare_gptq_dataset(quantization_config) elif isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( - quantization_config.num_samples, - dataset_name=quantization_config.dataset) + quantization_config.num_samples, dataset_name=quantization_config.dataset + ) else: raise ValueError( f"Can't create weight compression calibration dataset from string for {type(self.model)}" @@ -385,7 +388,9 @@ def _quantize_ovbasemodel( if calibration_dataset is None: raise ValueError("Calibration dataset is required to run hybrid quantization.") if isinstance(self.model, OVStableDiffusionPipelineBase): - self.model.unet.model = _hybrid_quantization(self.model.unet.model, quantization_config, calibration_dataset) + self.model.unet.model = _hybrid_quantization( + self.model.unet.model, quantization_config, calibration_dataset + ) else: self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) else: @@ -672,18 +677,15 @@ def _prepare_gptq_dataset(self, quantization_config: OVWeightQuantizationConfig) tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 - calibration_dataset = get_dataset( - quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples - ) + calibration_dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) calibration_dataset = prepare_dataset(calibration_dataset) calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) return calibration_dataset def _prepare_text_generation_dataset( - self, - quantization_config: OVQuantizationConfig, - calibration_dataloader: OVDataLoader) -> nncf.Dataset: + self, quantization_config: OVQuantizationConfig, calibration_dataloader: OVDataLoader + ) -> nncf.Dataset: # TODO: this function is not covered by tests, remove if not relevant anymore or cover by tests otherwise # Prefetch past_key_values @@ -705,10 +707,11 @@ def _prepare_text_generation_dataset( return calibration_dataset def _prepare_unet_dataset( - self, - num_samples: Optional[int] = None, - dataset_name: Optional[str] = None, - dataset: Optional[Union[Iterable, "Dataset"]] = None) -> nncf.Dataset: + self, + num_samples: Optional[int] = None, + dataset_name: Optional[str] = None, + dataset: Optional[Union[Iterable, "Dataset"]] = None, + ) -> nncf.Dataset: self.model.compile() size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor @@ -735,7 +738,9 @@ def transform_fn(data_item): from datasets import load_dataset dataset_metadata = PREDEFINED_SD_DATASETS[dataset_name] - dataset = load_dataset(dataset_name, split=dataset_metadata["split"], streaming=True).shuffle(seed=self.seed) + dataset = load_dataset(dataset_name, split=dataset_metadata["split"], streaming=True).shuffle( + seed=self.seed + ) input_names = dataset_metadata["inputs"] dataset = dataset.select_columns(list(input_names.values())) @@ -743,8 +748,10 @@ def transform_fn(data_item): return {inp_name: data_item[column] for inp_name, column in input_names.items()} else: - raise ValueError("For UNet inputs collection either quantization_config.dataset or custom " - "calibration_dataset must be provided.") + raise 
ValueError( + "For UNet inputs collection either quantization_config.dataset or custom " + "calibration_dataset must be provided." + ) calibration_data = [] try: diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index de6b80827..98eb121d7 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -21,21 +21,17 @@ from collections import defaultdict from enum import Enum from functools import partial -from typing import List, Union +from typing import Union import evaluate import numpy as np import torch from datasets import load_dataset -from nncf.quantization.advanced_parameters import OverflowFix from parameterized import parameterized -import openvino.runtime as ov import nncf from transformers import ( AutoModelForQuestionAnswering, AutoModelForSequenceClassification, - AutoModelForCausalLM, - AutoModelForTokenClassification, AutoTokenizer, AutoProcessor, TrainingArguments, @@ -415,7 +411,9 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( export=True, ) quantizer = OVQuantizer(model) - quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=3, quant_method=OVQuantizationMethod.HYBRID) + quantization_config = OVWeightQuantizationConfig( + bits=8, num_samples=3, quant_method=OVQuantizationMethod.HYBRID + ) quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) From 4a007f5adad1b2bbbf7dd0587b6f3b0280032b71 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 13:45:27 +0200 Subject: [PATCH 10/47] Addressed comments --- optimum/intel/openvino/quantization.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index a749f38e6..d1f28b290 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -341,7 +341,7 @@ def _quantize_ovbasemodel( calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset=calibration_dataset ) - elif Dataset is not None and isinstance(calibration_dataset, Dataset): + elif is_datasets_available() and isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, @@ -374,7 +374,7 @@ def _quantize_ovbasemodel( from optimum.intel import OVModelForCausalLM if isinstance(self.model, OVModelForCausalLM): - calibration_dataset = self._prepare_gptq_dataset(quantization_config) + calibration_dataset = self._prepare_builtin_dataset(quantization_config) elif isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset_name=quantization_config.dataset @@ -392,6 +392,7 @@ def _quantize_ovbasemodel( self.model.unet.model, quantization_config, calibration_dataset ) else: + # This may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. 
self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) else: _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) @@ -672,7 +673,7 @@ def _remove_unused_columns(self, dataset: "Dataset"): ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) return dataset.remove_columns(ignored_columns) - def _prepare_gptq_dataset(self, quantization_config: OVWeightQuantizationConfig): + def _prepare_builtin_dataset(self, quantization_config: OVWeightQuantizationConfig): from optimum.gptq.data import get_dataset, prepare_dataset tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) @@ -721,7 +722,7 @@ def _prepare_unet_dataset( if dataset is not None: if isinstance(dataset, nncf.Dataset): return dataset - if Dataset is not None and isinstance(dataset, Dataset): + if is_datasets_available() and isinstance(dataset, Dataset): dataset = dataset.select_columns(["caption"]) def transform_fn(data_item): @@ -783,7 +784,7 @@ def _weight_only_quantization( dataset = None if calibration_dataset is not None: - if Dataset is not None and isinstance(calibration_dataset, Dataset): + if is_datasets_available() and isinstance(calibration_dataset, Dataset): raise ValueError( "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " "quantization is not supported. Please provide it as `nncf.Dataset` or as iterable of " From 583e43514ba0721fd7dfd87a75fe8f627f4fef58 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 13:46:46 +0200 Subject: [PATCH 11/47] Updated SD HQ notebook --- .../stable_diffusion_hybrid_quantization.ipynb | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index 41969b162..efe413a9e 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -52,7 +52,7 @@ "import transformers\n", "from pathlib import Path\n", "from openvino.runtime import Core\n", - "from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", + "from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" @@ -198,9 +198,14 @@ }, "outputs": [], "source": [ - "quantization_config = OVWeightQuantizationConfig(bits=8, dataset=calibration_dataset, num_samples=NUM_SAMPLES)\n", - "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True, quantization_config=quantization_config)\n", - "int8_pipe.save_pretrained(int8_model_path)" + "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES)\n", + "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n", + "quantizer = OVQuantizer(int8_pipe)\n", + "quantizer.quantize(\n", + " ov_config=OVConfig(quantization_config=quantization_config),\n", + " calibration_dataset=calibration_dataset,\n", + " save_directory=int8_model_path\n", + ")" ] }, { @@ -613,7 +618,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.8.10" } }, "nbformat": 4, From 349350c2c7524aa4be33c0baf680a1e45f894745 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 16:10:22 +0200 Subject: [PATCH 
12/47] Quantize SD submodels in OVQuantizer --- ...stable_diffusion_hybrid_quantization.ipynb | 5 +-- optimum/intel/openvino/modeling_diffusion.py | 33 ++++++++----------- optimum/intel/openvino/quantization.py | 14 ++++++-- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index efe413a9e..142cde492 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -53,6 +53,7 @@ "from pathlib import Path\n", "from openvino.runtime import Core\n", "from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", + "from optimum.intel.openvino.configuration import OVQuantizationMethod\n", "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" @@ -198,8 +199,8 @@ }, "outputs": [], "source": [ - "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES)\n", "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n", + "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID)\n", "quantizer = OVQuantizer(int8_pipe)\n", "quantizer.quantize(\n", " ov_config=OVConfig(quantization_config=quantization_config),\n", @@ -618,7 +619,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c5afb2c14..c92d20e3e 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -292,19 +292,7 @@ def _from_pretrained( else: kwargs[name] = load_method(new_model_save_dir) - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) - unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name - if quantization_config is not None and quantization_config.dataset is not None: - # load the UNet model uncompressed to apply hybrid quantization further - unet = cls.load_model(unet_path) - # Apply weights compression to other `components` without dataset - quantization_config_without_dataset = deepcopy(quantization_config) - quantization_config_without_dataset.dataset = None - else: - quantization_config_without_dataset = quantization_config - unet = cls.load_model(unet_path, quantization_config_without_dataset) - components = { "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "vae_decoder": new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, @@ -312,13 +300,19 @@ def _from_pretrained( "text_encoder_2": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, } - for key, value in components.items(): - components[key] = cls.load_model(value, quantization_config_without_dataset) if value.is_file() else None - if model_save_dir is None: model_save_dir = new_model_save_dir - if quantization_config is not None and quantization_config.dataset is not None: + quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + if quantization_config is None or quantization_config.dataset is None: + unet = cls.load_model(unet_path, quantization_config) + for key, 
value in components.items(): + components[key] = cls.load_model(value, quantization_config) if value.is_file() else None + else: + # Load uncompressed models to apply hybrid quantization further + unet = cls.load_model(unet_path) + for key, value in components.items(): + components[key] = cls.load_model(value) if value.is_file() else None sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) supported_pipelines = ( @@ -331,10 +325,10 @@ def _from_pretrained( from optimum.intel import OVQuantizer + hybrid_quantization_config = deepcopy(quantization_config) + hybrid_quantization_config.quant_method = OVQuantizationMethod.HYBRID quantizer = OVQuantizer(sd_model) - quantization_config_copy = deepcopy(quantization_config) - quantization_config_copy.quant_method = OVQuantizationMethod.HYBRID - quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy)) + quantizer.quantize(ov_config=OVConfig(quantization_config=hybrid_quantization_config)) return sd_model @@ -347,6 +341,7 @@ def _from_pretrained( **kwargs, ) + @classmethod def _from_transformers( cls, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index d1f28b290..45961a86f 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -63,8 +63,6 @@ if is_datasets_available(): from datasets import Dataset -else: - Dataset = None register_module(ignored_algorithms=[])(Conv1D) @@ -388,11 +386,21 @@ def _quantize_ovbasemodel( if calibration_dataset is None: raise ValueError("Calibration dataset is required to run hybrid quantization.") if isinstance(self.model, OVStableDiffusionPipelineBase): + # Apply weight-only quantization to all SD submodels except UNet + quantization_config_copy = copy.deepcopy(quantization_config) + quantization_config_copy.dataset = None + quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT + for sd_submodel_name in ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"]: + sd_submodel = getattr(self.model, sd_submodel_name) + if sd_submodel is not None: + _weight_only_quantization(sd_submodel.model, quantization_config_copy) + + # Apply hybrid quantization to UNet self.model.unet.model = _hybrid_quantization( self.model.unet.model, quantization_config, calibration_dataset ) else: - # This may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. + # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. 
self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) else: _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) From 068236dcb585c8af01f4b76793a2aaed5e58ca0b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 17:13:33 +0200 Subject: [PATCH 13/47] Black --- optimum/intel/openvino/modeling_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c92d20e3e..1b880e736 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -341,7 +341,6 @@ def _from_pretrained( **kwargs, ) - @classmethod def _from_transformers( cls, From ed5cbb91e02c56e53784351f3befb69c56903171 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 7 May 2024 14:55:10 +0100 Subject: [PATCH 14/47] Apply Review Comments --- optimum/commands/export/openvino.py | 9 ++++++--- optimum/exporters/openvino/__main__.py | 4 ++-- optimum/intel/openvino/utils.py | 10 ++++------ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index a7302ef88..56abc6b7c 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -226,6 +226,9 @@ def run(self): ) library_name = "transformers" + if self.args.convert_tokenizer: + logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") + if ( library_name == "diffusers" and ov_config @@ -261,6 +264,9 @@ def run(self): ) model.save_pretrained(self.args.output) + if self.args.disable_convert_tokenizer: + return + # not export when using other exporters from ...exporters.openvino.convert import export_tokenizer @@ -273,9 +279,6 @@ def run(self): if tokenizer_2 is not None: export_tokenizer(tokenizer_2, output / "tokenizer_2") else: - if self.args.convert_tokenizer: - logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") - # TODO : add input shapes main_export( model_name_or_path=self.args.model, diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index a43c42e44..0d80101a5 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -358,7 +358,7 @@ class StoreAttr(object): # hide openvino import when using other exporters # avoid circular import from optimum.exporters.openvino.convert import export_tokenizer - from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER + from optimum.intel.openvino.utils import OV_TOKENIZER_FOLDER if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": @@ -369,7 +369,7 @@ class StoreAttr(object): if tokenizer is not None: try: - export_tokenizer(tokenizer, output / OV_TOKENIZER_FLOLDER) + export_tokenizer(tokenizer, output / OV_TOKENIZER_FOLDER) except Exception as exception: logger.warning( "Could not load tokenizer using specified model ID or path. 
OpenVINO tokenizer/detokenizer " diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 6b49f7a83..3bf00f071 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -18,7 +18,7 @@ import os from glob import glob from pathlib import Path -from typing import List, Union +from typing import Tuple, Union import numpy as np from huggingface_hub import model_info @@ -34,7 +34,7 @@ OV_DECODER_NAME = "openvino_decoder_model.xml" OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml" -OV_TOKENIZER_FLOLDER = "openvino_tokenizer" +OV_TOKENIZER_FOLDER = "openvino_tokenizer" OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml" OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml" @@ -111,9 +111,7 @@ } -NEED_CONVERT_TO_FAST_TOKENIZER: List[type(PreTrainedTokenizer)] = [ - CLIPTokenizer, -] +NEED_CONVERT_TO_FAST_TOKENIZER: Tuple[type(PreTrainedTokenizer)] = (CLIPTokenizer,) def maybe_convert_tokenizer_to_fast( @@ -122,7 +120,7 @@ def maybe_convert_tokenizer_to_fast( if isinstance(hf_tokenizer, PreTrainedTokenizerFast): return hf_tokenizer - if any(isinstance(hf_tokenizer, slow_class) for slow_class in NEED_CONVERT_TO_FAST_TOKENIZER): + if isinstance(hf_tokenizer, NEED_CONVERT_TO_FAST_TOKENIZER): try: return AutoTokenizer.from_pretrained(tokenizer_path) except Exception: From 4fa40a259faf85fd30d7d930565533f4b1e11f32 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 7 May 2024 14:58:44 +0100 Subject: [PATCH 15/47] Apply Review Comments --- optimum/commands/export/openvino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 56abc6b7c..025a40e05 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -267,7 +267,7 @@ def run(self): if self.args.disable_convert_tokenizer: return - # not export when using other exporters + # avoid import when using other exporters (IPEX, INC) from ...exporters.openvino.convert import export_tokenizer output = Path(self.args.output) From 0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 May 2024 13:06:20 +0100 Subject: [PATCH 16/47] Move OV tokenizer to the root folder --- optimum/exporters/openvino/__main__.py | 4 +--- optimum/intel/openvino/utils.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 0d80101a5..31abd0f32 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -356,9 +356,7 @@ class StoreAttr(object): ) # hide openvino import when using other exporters - # avoid circular import from optimum.exporters.openvino.convert import export_tokenizer - from optimum.intel.openvino.utils import OV_TOKENIZER_FOLDER if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": @@ -369,7 +367,7 @@ class StoreAttr(object): if tokenizer is not None: try: - export_tokenizer(tokenizer, output / OV_TOKENIZER_FOLDER) + export_tokenizer(tokenizer, output) except Exception as exception: logger.warning( "Could not load tokenizer using specified model ID or path. 
OpenVINO tokenizer/detokenizer " diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 3bf00f071..69a750fb6 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -34,7 +34,6 @@ OV_DECODER_NAME = "openvino_decoder_model.xml" OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml" -OV_TOKENIZER_FOLDER = "openvino_tokenizer" OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml" OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml" From 0474b26dd453ecb8dc15966eef4d9198e82791c1 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 14 May 2024 08:52:05 +0200 Subject: [PATCH 17/47] unpin torch --- .github/workflows/test_inc.yml | 2 +- .github/workflows/test_ipex.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 1ede5e193..d4ad06660 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] pip install intel-extension-for-transformers pip install peft diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 42f884b72..8e02bd551 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] - name: Test with Pytest run: | From a814adf9d7794403f20608278c4c44c80ae61c8b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 14 May 2024 09:20:36 +0200 Subject: [PATCH 18/47] itrex still in 2.2 --- .github/workflows/test_inc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index d4ad06660..1ede5e193 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] pip install intel-extension-for-transformers pip install peft From d0217982beef9b76a1bad406659d1e14dac2ffc3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 14 May 2024 21:24:20 +0200 Subject: [PATCH 19/47] Bump test torch version (#708) --- .github/workflows/test_inc.yml | 6 +++--- optimum/intel/neural_compressor/modeling_base.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 1ede5e193..6435d0b71 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.2 torchaudio torchvision --extra-index-url 
https://download.pytorch.org/whl/cpu + pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] pip install intel-extension-for-transformers pip install peft @@ -43,7 +43,7 @@ jobs: - name: Test IPEX run: | pip uninstall -y intel-extension-for-transformers - pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu - pip install intel-extension-for-pytorch==2.1.100 + pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --extra-index-url https://download.pytorch.org/whl/cpu + pip install intel-extension-for-pytorch==2.3.0 pytest tests/neural_compressor/test_ipex.py diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 2556a6048..c6d5e7bac 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -147,7 +147,7 @@ def _from_pretrained( try: quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json") algorithm = getattr(quantization_config, "quant_method", None) - if algorithm in {"rtn", "gptq", "awq", "autoaround"}: + if algorithm in {"rtn", "gptq", "awq", "autoround"}: from intel_extension_for_transformers.transformers.modeling.modeling_auto import ( _BaseQBitsAutoModelClass, ) From d9c8f9f1589c78289fced36c5d856d74c80dd2a6 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 15 May 2024 17:04:41 +0800 Subject: [PATCH 20/47] Add IPEX pipeline (#501) * define optimum-intel pipeline * add tests and readme * fix pipelines example * fix readme codestyle * add _load_model in pipeline * update pipeline for optimum intel * update tests * remove readme * Update optimum/intel/pipelines/__init__.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * fix pipelines * add all supported tasks testing * add hub_kwargs and model_kwargs on tokenizer and feature_extractor * add hub_kwargs and default pipeline tests * fix _from_transformers args * rm default pipeline test * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * fix comments * Update optimum/exporters/openvino/model_patcher.py * Update optimum/intel/ipex/modeling_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update 
optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * fix style --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/ipex/inference.py | 4 + optimum/intel/ipex/modeling_base.py | 2 + optimum/intel/pipelines/__init__.py | 15 ++ optimum/intel/pipelines/pipeline_base.py | 290 +++++++++++++++++++++++ tests/ipex/test_pipelines.py | 265 +++++++++++++++++++++ 5 files changed, 576 insertions(+) create mode 100644 optimum/intel/pipelines/__init__.py create mode 100644 optimum/intel/pipelines/pipeline_base.py create mode 100644 tests/ipex/test_pipelines.py diff --git a/optimum/intel/ipex/inference.py b/optimum/intel/ipex/inference.py index ccf2da9d8..a628ebe12 100644 --- a/optimum/intel/ipex/inference.py +++ b/optimum/intel/ipex/inference.py @@ -97,6 +97,10 @@ def __init__( jit (`boolean = False`, *optional*): Enable jit to accelerate inference speed """ + logger.warning( + "`inference_mode` is deprecated and will be removed in v1.18.0. Use `pipeline` to load and export your model to TorchScript instead." + ) + if not is_ipex_available(): raise ImportError(IPEX_NOT_AVAILABLE_ERROR_MSG) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 2b739ea50..d2963d55a 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -161,6 +161,7 @@ def _from_transformers( local_files_only: bool = False, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: bool = False, + _commit_hash: str = None, ): if use_auth_token is not None: warnings.warn( @@ -186,6 +187,7 @@ def _from_transformers( "force_download": force_download, "torch_dtype": torch_dtype, "trust_remote_code": trust_remote_code, + "_commit_hash": _commit_hash, } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py new file mode 100644 index 000000000..40a1e3ca5 --- /dev/null +++ b/optimum/intel/pipelines/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pipeline_base import pipeline diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py new file mode 100644 index 000000000..65e6cfb78 --- /dev/null +++ b/optimum/intel/pipelines/pipeline_base.py @@ -0,0 +1,290 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, Optional, Union + +import torch +from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer +from transformers import pipeline as transformers_pipeline +from transformers.feature_extraction_utils import PreTrainedFeatureExtractor +from transformers.pipelines import ( + AudioClassificationPipeline, + FillMaskPipeline, + ImageClassificationPipeline, + QuestionAnsweringPipeline, + TextClassificationPipeline, + TextGenerationPipeline, + TokenClassificationPipeline, +) +from transformers.pipelines.base import Pipeline +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +from optimum.intel.utils import is_ipex_available + + +if is_ipex_available(): + from ..ipex.modeling_base import ( + IPEXModel, + IPEXModelForAudioClassification, + IPEXModelForCausalLM, + IPEXModelForImageClassification, + IPEXModelForMaskedLM, + IPEXModelForQuestionAnswering, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, + ) + + IPEX_SUPPORTED_TASKS = { + "text-generation": { + "impl": TextGenerationPipeline, + "class": (IPEXModelForCausalLM,), + "default": "gpt2", + "type": "text", + }, + "fill-mask": { + "impl": FillMaskPipeline, + "class": (IPEXModelForMaskedLM,), + "default": "bert-base-cased", + "type": "text", + }, + "question-answering": { + "impl": QuestionAnsweringPipeline, + "class": (IPEXModelForQuestionAnswering,), + "default": "distilbert-base-cased-distilled-squad", + "type": "text", + }, + "image-classification": { + "impl": ImageClassificationPipeline, + "class": (IPEXModelForImageClassification,), + "default": "google/vit-base-patch16-224", + "type": "image", + }, + "text-classification": { + "impl": TextClassificationPipeline, + "class": (IPEXModelForSequenceClassification,), + "default": "distilbert-base-uncased-finetuned-sst-2-english", + "type": "text", + }, + "token-classification": { + "impl": TokenClassificationPipeline, + "class": (IPEXModelForTokenClassification,), + "default": "dbmdz/bert-large-cased-finetuned-conll03-english", + "type": "text", + }, + "audio-classification": { + "impl": AudioClassificationPipeline, + "class": (IPEXModelForAudioClassification,), + "default": "superb/hubert-base-superb-ks", + "type": "audio", + }, + } +else: + IPEX_SUPPORTED_TASKS = {} + + +def load_ipex_model( + model, + targeted_task, + SUPPORTED_TASKS, + model_kwargs: Optional[Dict[str, Any]] = None, + hub_kwargs: Optional[Dict[str, Any]] = None, +): + if model_kwargs is None: + model_kwargs = {} + + ipex_model_class = SUPPORTED_TASKS[targeted_task]["class"][0] + + if model is None: + model_id = SUPPORTED_TASKS[targeted_task]["default"] + model = ipex_model_class.from_pretrained(model_id, export=True, **model_kwargs, **hub_kwargs) + elif isinstance(model, str): + model_id = model + try: + config = AutoConfig.from_pretrained(model) + export = not getattr(config, "torchscript", False) + except RuntimeError: + logger.warning("We will use IPEXModel with export=True to export the model") + export = True + model = 
ipex_model_class.from_pretrained(model, export=export, **model_kwargs, **hub_kwargs) + elif isinstance(model, IPEXModel): + model_id = getattr(model.config, "name_or_path", None) + else: + raise ValueError( + f"""Model {model} is not supported. Please provide a valid model name or path or a IPEXModel. + You can also provide non model then a default one will be used""" + ) + + return model, model_id + + +MAPPING_LOADING_FUNC = { + "ipex": load_ipex_model, +} + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + + +logger = logging.get_logger(__name__) + + +def pipeline( + task: str = None, + model: Optional[Union[str, "PreTrainedModel"]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, + feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, + use_fast: bool = True, + token: Optional[Union[str, bool]] = None, + accelerator: Optional[str] = "ort", + revision: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + torch_dtype: Optional[Union[str, torch.dtype]] = None, + commit_hash: Optional[str] = None, + **model_kwargs, +) -> Pipeline: + """ + Utility factory method to build a [`Pipeline`]. + + Pipelines are made of: + + - A [tokenizer](tokenizer) in charge of mapping raw textual input to token. + - A [model](model) to make predictions from the inputs. + - Some (optional) post processing for enhancing model's output. + + Args: + task (`str`): + The task defining which pipeline will be returned. Currently accepted tasks are: + + - `"text-generation"`: will return a [`TextGenerationPipeline`]:. + + model (`str` or [`PreTrainedModel`], *optional*): + The model that will be used by the pipeline to make predictions. This can be a model identifier or an + actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch). + + If not provided, the default for the `task` will be loaded. + tokenizer (`str` or [`PreTrainedTokenizer`], *optional*): + The tokenizer that will be used by the pipeline to encode data for the model. This can be a model + identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`]. + + If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model` + is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string). + However, if `config` is also not given or not a string, then the default tokenizer for the given `task` + will be loaded. + accelerator (`str`, *optional*, defaults to `"ipex"`): + The optimization backends, choose from ["ipex", "inc", "openvino"]. + use_fast (`bool`, *optional*, defaults to `True`): + Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). + torch_dtype (`str` or `torch.dtype`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model + (`torch.float16`, `torch.bfloat16`, ... or `"auto"`). + model_kwargs (`Dict[str, Any]`, *optional*): + Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., + **model_kwargs)` function. + + Returns: + [`Pipeline`]: A suitable pipeline for the task. 
+ + Examples: + + ```python + >>> import torch + >>> from optimum.intel.pipelines import pipeline + + >>> pipe = pipeline('text-generation', 'gpt2', torch_dtype=torch.bfloat16) + >>> pipe("Describe a real-world application of AI in sustainable energy.") + ```""" + if model_kwargs is None: + model_kwargs = {} + + if task is None and model is None: + raise RuntimeError( + "Impossible to instantiate a pipeline without either a task or a model " + "being specified. " + "Please provide a task class or a model" + ) + + if model is None and tokenizer is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer" + " may not be compatible with the default model. Please provide a PreTrainedModel class or a" + " path/identifier to a pretrained model when providing tokenizer." + ) + + if accelerator not in MAPPING_LOADING_FUNC: + raise ValueError( + f'Accelerator {accelerator} is not supported. Supported accelerator is {", ".join(MAPPING_LOADING_FUNC)}.' + ) + + if accelerator == "ipex": + if task not in list(IPEX_SUPPORTED_TASKS.keys()): + raise ValueError( + f"Task {task} is not supported for the IPEX pipeline. Supported tasks are { list(IPEX_SUPPORTED_TASKS.keys())}" + ) + + supported_tasks = IPEX_SUPPORTED_TASKS if accelerator == "ipex" else None + + no_feature_extractor_tasks = set() + no_tokenizer_tasks = set() + for _task, values in supported_tasks.items(): + if values["type"] == "text": + no_feature_extractor_tasks.add(_task) + elif values["type"] in {"image", "video"}: + no_tokenizer_tasks.add(_task) + elif values["type"] in {"audio"}: + no_tokenizer_tasks.add(_task) + elif values["type"] not in ["multimodal", "audio", "video"]: + raise ValueError(f"SUPPORTED_TASK {_task} contains invalid type {values['type']}") + + load_tokenizer = task not in no_tokenizer_tasks + load_feature_extractor = task not in no_feature_extractor_tasks + + hub_kwargs = { + "revision": revision, + "token": token, + "trust_remote_code": trust_remote_code, + "_commit_hash": commit_hash, + } + + if isinstance(model, Path): + model = str(model) + + if torch_dtype is not None: + if "torch_dtype" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... torch_dtype=..., model_kwargs={"torch_dtype":...})` as those' + " arguments might conflict, use only one.)" + ) + model_kwargs["torch_dtype"] = torch_dtype + + # Load the correct model if possible + # Infer the framework from the model if not already defined + model, model_id = MAPPING_LOADING_FUNC[accelerator](model, task, supported_tasks, model_kwargs, hub_kwargs) + + if load_tokenizer and tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs, **model_kwargs) + if load_feature_extractor and feature_extractor is None: + feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **hub_kwargs, **model_kwargs) + + return transformers_pipeline( + task, + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + use_fast=use_fast, + torch_dtype=torch_dtype, + ) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py new file mode 100644 index 000000000..89a27ab2c --- /dev/null +++ b/tests/ipex/test_pipelines.py @@ -0,0 +1,265 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from tempfile import TemporaryDirectory + +import numpy as np +import torch +from parameterized import parameterized +from transformers import AutoTokenizer +from transformers.pipelines import pipeline as transformers_pipeline + +from optimum.intel.ipex.modeling_base import ( + IPEXModelForAudioClassification, + IPEXModelForCausalLM, + IPEXModelForImageClassification, + IPEXModelForMaskedLM, + IPEXModelForQuestionAnswering, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, +) +from optimum.intel.pipelines import pipeline as ipex_pipeline + + +MODEL_NAMES = { + "albert": "hf-internal-testing/tiny-random-albert", + "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", + "bert": "hf-internal-testing/tiny-random-bert", + "bart": "hf-internal-testing/tiny-random-bart", + "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", + "bloom": "hf-internal-testing/tiny-random-BloomModel", + "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "convnext": "hf-internal-testing/tiny-random-convnext", + "distilbert": "hf-internal-testing/tiny-random-distilbert", + "electra": "hf-internal-testing/tiny-random-electra", + "flaubert": "hf-internal-testing/tiny-random-flaubert", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", + "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "levit": "hf-internal-testing/tiny-random-LevitModel", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", + "marian": "sshleifer/tiny-marian-en-de", + "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", + "mobilevit": "hf-internal-testing/tiny-random-mobilevit", + "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", + "mt5": "stas/mt5-tiny-random", + "opt": "hf-internal-testing/tiny-random-OPTModel", + "phi": "echarlaix/tiny-random-PhiForCausalLM", + "resnet": "hf-internal-testing/tiny-random-resnet", + "roberta": "hf-internal-testing/tiny-random-roberta", + "roformer": "hf-internal-testing/tiny-random-roformer", + "squeezebert": "hf-internal-testing/tiny-random-squeezebert", + "t5": "hf-internal-testing/tiny-random-t5", + "unispeech": "hf-internal-testing/tiny-random-unispeech", + "vit": "hf-internal-testing/tiny-random-vit", + "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "xlm": "hf-internal-testing/tiny-random-xlm", +} + + +class PipelinesIntegrationTest(unittest.TestCase): + COMMON_SUPPORTED_ARCHITECTURES = ( + "albert", + "bert", + "distilbert", + "electra", + "flaubert", + "roberta", + "roformer", + "squeezebert", + "xlm", + ) + TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( + 
"bart", + "gpt_bigcode", + "blenderbot", + "blenderbot-small", + "bloom", + "codegen", + "gpt2", + "gpt_neo", + "gpt_neox", + "llama", + "llama2", + "mistral", + "mpt", + "opt", + ) + QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES = ( + "bert", + "distilbert", + "roberta", + ) + AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( + "unispeech", + "wav2vec2", + ) + IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( + "beit", + "mobilenet_v1", + "mobilenet_v2", + "mobilevit", + "resnet", + "vit", + ) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_token_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("token-classification", model_id) + ipex_generator = ipex_pipeline("token-classification", model_id, accelerator="ipex") + inputs = "Hello I'm Omar and I live in Zürich." + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForTokenClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_sequence_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("text-classification", model_id) + ipex_generator = ipex_pipeline("text-classification", model_id, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(transformers_output[0]["label"], ipex_output[0]["label"]) + self.assertAlmostEqual(transformers_output[0]["score"], ipex_output[0]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_fill_mask_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + inputs = "The Milky Way is a galaxy." 
+ transformers_generator = transformers_pipeline("fill-mask", model_id) + ipex_generator = ipex_pipeline("fill-mask", model_id, accelerator="ipex") + mask_token = transformers_generator.tokenizer.mask_token + inputs = inputs.replace("", mask_token) + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForMaskedLM)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertEqual(transformers_output[i]["token"], ipex_output[i]["token"]) + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) + def test_text_generation_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("text-generation", model_id) + ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") + inputs = "Describe a real-world application of AI." + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForCausalLM)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) + + @parameterized.expand(QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES) + def test_question_answering_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("question-answering", model_id) + ipex_generator = ipex_pipeline("question-answering", model_id, accelerator="ipex") + question = "How many programming languages does BLOOM support?" + context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." 
+ with torch.inference_mode(): + transformers_output = transformers_generator(question=question, context=context) + with torch.inference_mode(): + ipex_output = ipex_generator(question=question, context=context) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForQuestionAnswering)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertAlmostEqual(transformers_output["score"], ipex_output["score"], delta=1e-4) + self.assertEqual(transformers_output["start"], ipex_output["start"]) + self.assertEqual(transformers_output["end"], ipex_output["end"]) + + @parameterized.expand(AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES) + def test_audio_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("audio-classification", model_id) + ipex_generator = ipex_pipeline("audio-classification", model_id, accelerator="ipex") + inputs = [np.random.random(16000)] + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForAudioClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertAlmostEqual(transformers_output[0][0]["score"], ipex_output[0][0]["score"], delta=1e-2) + self.assertAlmostEqual(transformers_output[0][1]["score"], ipex_output[0][1]["score"], delta=1e-2) + + @parameterized.expand(IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES) + def test_image_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("image-classification", model_id) + ipex_generator = ipex_pipeline("image-classification", model_id, accelerator="ipex") + inputs = "http://images.cocodataset.org/val2017/000000039769.jpg" + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForImageClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertEqual(transformers_output[i]["label"], ipex_output[i]["label"]) + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_pipeline_load_from_ipex_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_generator = ipex_pipeline("text-classification", model, tokenizer=tokenizer, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertGreaterEqual(ipex_output[0]["score"], 0.0) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_pipeline_load_from_jit_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True) + save_dir = 
TemporaryDirectory().name + model.save_pretrained(save_dir) + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_generator = ipex_pipeline("text-classification", save_dir, tokenizer=tokenizer, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertGreaterEqual(ipex_output[0]["score"], 0.0) From bfc86637aa328cce6eb66fbfe22fcd38b34db081 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 17:22:24 +0400 Subject: [PATCH 21/47] Prevent loading model for export if it is not supported (#710) --- optimum/exporters/openvino/__main__.py | 5 +++++ optimum/exporters/openvino/convert.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 31abd0f32..9db671906 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -219,6 +219,10 @@ def main_export( model_type = config.model_type.replace("_", "-") if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True + if custom_export_configs is None: + raise ValueError( + f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum-intel/issues if you would like the model type {model_type} to be supported natively in the OpenVINO export." + ) elif task not in TasksManager.get_supported_tasks_for_model_type( model_type, exporter="openvino", library_name=library_name ): @@ -232,6 +236,7 @@ def main_export( raise ValueError( f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}." ) + if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED: loading_kwargs["attn_implementation"] = "eager" # there are some difference between remote and in library representation of past key values for some models, diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 3634a493c..baa34a5cd 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -547,7 +547,7 @@ def export_from_model( # TODO: support onnx_config.py in the model repo if custom_architecture and custom_export_configs is None: raise ValueError( - f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. 
Please open an issue at https://github.com/huggingface/optimum/issues if you would like the model type {model_type} to be supported natively in the ONNX export." + f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum-intel/issues if you would like the model type {model_type} to be supported natively in the OpenVINO export." ) if task.startswith("text-generation") and model.config.is_encoder_decoder: From 2b902bbef97d7ebe486487cf89c1737c580c36bd Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 17:30:00 +0400 Subject: [PATCH 22/47] Optimize first latency beam search for OVModelForCausalLM (#695) * WIP: beam search only * other beam search algos * add test * do not touch decoding cycles * fix stateless model support * fix quantization * move inputs modification into forward * refactor test --- optimum/intel/openvino/modeling_decoder.py | 181 +++++++++++++++++++-- tests/openvino/test_modeling.py | 81 +++++++++ 2 files changed, 250 insertions(+), 12 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 9ab494be6..e4dc1ed78 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,7 +17,7 @@ import warnings from pathlib import Path from tempfile import TemporaryDirectory -from typing import Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import numpy as np import openvino @@ -28,6 +28,10 @@ from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin +from transformers.generation.configuration_utils import GenerationConfig, GenerationMode +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.utils import GenerateOutput from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.utils.normalized_config import NormalizedConfigManager @@ -41,6 +45,11 @@ from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.streamers import BaseStreamer + + logger = logging.getLogger(__name__) core = Core() @@ -122,6 +131,8 @@ def __init__( self._pkv_precision = Type.f32 self.next_beam_idx = None self._past_length = 0 + self._first_iter_beam_search = False + self._second_iter_beam_search = False self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -375,7 +386,11 @@ def prepare_inputs( inputs = {} if not self.stateful: if past_key_values is not None: - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + if ( + self.config.model_type not in MULTI_QUERY_ATTN_MODELS + or self.config.model_type == "falcon" + and self.config.new_decoder_architecture + ): if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -418,7 +433,6 @@ def 
prepare_inputs( self.next_beam_idx = np.arange(batch_size, dtype=int) self._past_length = 0 past_len = self._get_past_length(past_key_values) - inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed if "attention_mask" in self.input_names or "position_ids" in self.input_names: @@ -468,6 +482,8 @@ def forward( **kwargs, ) + if self._first_iter_beam_search: + inputs, duplication_indices = self._deduplicate_inputs(inputs) # Run inference self.request.start_async(inputs, share_inputs=True) self.request.wait() @@ -483,7 +499,11 @@ def forward( if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + if ( + self.config.model_type not in MULTI_QUERY_ATTN_MODELS + or self.config.model_type == "falcon" + and self.config.new_decoder_architecture + ): # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) @@ -491,6 +511,10 @@ def forward( else: past_key_values = None + if self._first_iter_beam_search: + logits, past_key_values = self._expand_outputs_for_generation(duplication_indices, logits, past_key_values) + self._first_iter_beam_search = False + return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation @@ -520,7 +544,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg if past_key_values: position_ids = position_ids[:, -input_ids.shape[1] :] - return { + model_inputs = { "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache, @@ -528,12 +552,116 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "attention_mask": attention_mask, } + return model_inputs + + def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple): + batch_size = logits.shape[0] + if indicies.shape[0] != 1: + logits = logits[indicies] + if past_key_values and not self.stateful: + if ( + self.config.model_type not in MULTI_QUERY_ATTN_MODELS + or self.config.model_type == "falcon" + and self.config.new_decoder_architecture + ): + past_key_values = tuple( + tuple( + past_state[indicies] + if not self.config.model_type == "chatglm" + else past_state[:, indicies, ...] 
+ for past_state in layer_past + ) + for layer_past in past_key_values + ) + else: + past_key_values = tuple([past_state[indicies] for past_state in past_key_values]) + if self.stateful: + self.next_beam_idx = ( + self.next_beam_idx[indicies] + if self.next_beam_idx is not None + else np.arange(batch_size, dtype=int)[indicies] + ) + self._second_iter_beam_search = True + return logits, past_key_values + + def _deduplicate_inputs(self, model_inputs: Dict): + input_ids = model_inputs["input_ids"] + upd_model_inputs = {} + unique_input_ids, indicies, reverse_indicies = np.unique( + input_ids, axis=0, return_index=True, return_inverse=True + ) + for input_name, input_tensor in model_inputs.items(): + if input_name not in ["input_ids", "beam_idx"]: + if not isinstance(input_tensor, Tensor): + upd_model_inputs[input_name] = input_tensor[indicies] + else: + shape = input_tensor.shape + dtype = input_tensor.element_type + upd_batch_size = indicies.shape[0] + if self.config.model_type == "bloom": + upd_batch_size *= self.config.num_attention_heads + shape[0 if not self.config.model_type == "chatglm" else 1] = upd_batch_size + upd_model_inputs[input_name] = Tensor(dtype, shape) + upd_model_inputs["input_ids"] = unique_input_ids + if "beam_idx" in model_inputs: + beam_range = ( + unique_input_ids.shape[0] + if self.config.model_type != "bloom" + else unique_input_ids.shape[0] * self.config.num_attention_heads + ) + beam_idx = np.arange(beam_range, dtype=int) + upd_model_inputs["beam_idx"] = beam_idx + return upd_model_inputs, reverse_indicies + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + _generation_config, _ = self._prepare_generation_config(generation_config, **kwargs) + generation_mode = _generation_config.get_generation_mode(assistant_model) + + is_beam_search = generation_mode in [ + GenerationMode.BEAM_SEARCH, + GenerationMode.BEAM_SAMPLE, + GenerationMode.GROUP_BEAM_SEARCH, + GenerationMode.CONSTRAINED_BEAM_SEARCH, + ] + if is_beam_search: + self._first_iter_beam_search = True + result = super().generate( + inputs, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + assistant_model, + streamer, + negative_prompt_ids, + negative_prompt_attention_mask, + **kwargs, + ) + return result + def _get_past_length(self, past_key_values=None): if past_key_values is None: return 0 if self.stateful: return self._past_length - if self.config.model_type in MULTI_QUERY_ATTN_MODELS: + if self.config.model_type in MULTI_QUERY_ATTN_MODELS and not ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): return past_key_values[0].shape[-2] seq_length_dim = -2 if self.config.model_type == "chatglm": @@ -558,12 +686,20 @@ def _reorder_cache( if self.stateful: # TODO: Apply it differently based on model type # TODO: At least for bloom we need to replicate values for each attention head - self.next_beam_idx = np.array(beam_idx) 
# save beam_idx to be used as an input in the next iteration + self.next_beam_idx = ( + np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx + ) # save beam_idx to be used as an input in the next iteration + self._second_iter_beam_search = False return past_key_values else: - return tuple( - tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values - ) + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS and not ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): + return tuple( + tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) + for layer_past in past_key_values + ) + return tuple(np.take(past_state, beam_idx, 0) for past_state in past_key_values) def can_generate(self): """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" @@ -684,11 +820,12 @@ def _reorder_cache( This is required to match `past_key_values` with the correct beam_idx at every generation step. """ if self.stateful: - beam_idx = np.array(beam_idx) batch_size = beam_idx.shape[0] + beam_idx = np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx indices = np.array(range(batch_size * self.config.num_attention_heads)) indices = indices.reshape([batch_size, self.config.num_attention_heads]) self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() + self._second_iter_beam_search = False return past_key_values else: standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) @@ -738,6 +875,24 @@ def _convert_to_standard_cache( for layer_past in past_key_value ) + def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple): + batch_size = logits.shape[0] + if indicies.shape[0] != 1: + logits = logits[indicies] + if past_key_values and not self.stateful: + pkv_standard = self._convert_to_standard_cache(past_key_values, batch_size) + pkv = tuple(tuple(past_state[indicies] for past_state in layer_past) for layer_past in pkv_standard) + past_key_values = self._convert_to_bloom_cache(pkv) + + if self.stateful: + self.next_beam_idx = ( + self.next_beam_idx[indicies] + if self.next_beam_idx is not None + else np.arange(batch_size, dtype=int)[indicies] + ) + self._second_iter_beam_search = True + return logits, past_key_values + class OVGPTBigCodeForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache @@ -745,7 +900,9 @@ def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: if self.stateful: - self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration + # save beam_idx to be used as an input in the next iteration + self.next_beam_idx = np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx + self._second_iter_beam_search = False return past_key_values else: return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index d4f55c683..75c95c156 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -778,6 +778,87 @@ def test_default_filling_attention_mask_and_position_ids(self): del model_with_cache gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow + def 
test_beam_search(self, model_arch): + model_kwargs = {} + model_id = MODEL_NAMES[model_arch] + if model_arch in self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + # Qwen tokenizer does not support padding, chatgm testing model produces nan that incompatible with beam search + if model_arch in ["qwen", "chatglm"]: + return + + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + ) + beam_sample_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=True, + eos_token_id=None, + top_k=1, + ) + + group_beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + num_beam_groups=2, + diversity_penalty=0.0000001, + ) + force_word = "cat" + force_words_ids = [tokenizer([force_word], add_special_tokens=False).input_ids] + constrained_beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + force_words_ids=force_words_ids, + ) + + gen_configs = [ + beam_search_gen_config, + beam_sample_gen_config, + group_beam_search_gen_config, + constrained_beam_search_gen_config, + ] + ov_model_stateful = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=True, stateful=True, **model_kwargs + ) + ov_model_stateless = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=True, stateful=False, **model_kwargs + ) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + tokenizer.pad_token_id = tokenizer.eos_token_id + tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + ov_model_stateful.generation_config.eos_token_id = None + ov_model_stateless.generation_config.eos_token_id = None + transformers_model.generation_config.eos_token_id = None + ov_model_stateful.config.eos_token_id = None + ov_model_stateless.config.eos_token_id = None + transformers_model.config.eos_token_id = None + + for gen_config in gen_configs: + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) + self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs)) + ov_stateless_outputs = ov_model_stateless.generate(**tokens, generation_config=gen_config) + self.assertTrue(torch.allclose(ov_stateless_outputs, transformers_outputs)) + class OVModelForMaskedLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( From 3cfbc38e466896b8e2f2f8142a9c538218f1294b Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Wed, 15 May 2024 22:47:06 +0800 Subject: [PATCH 23/47] add XPU support for `IPEXModel.from_pretrained` (#704) * add xpu support * Apply suggestions from code review no device_map Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * add recursive_to_device * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/ipex/modeling_base.py | 15 +++++++++++---- optimum/intel/utils/modeling_utils.py | 
13 +++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index d2963d55a..e929a4ddb 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -39,6 +39,7 @@ GenerationConfig, GenerationMixin, PretrainedConfig, + is_torch_xpu_available, ) from transformers.dynamic_module_utils import get_class_from_dynamic_module from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput @@ -52,7 +53,7 @@ from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_TASK, _patch_model from ..generation.modeling import prepare_jit_inputs from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version -from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask +from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask, recursive_to_device logger = logging.getLogger(__name__) @@ -128,10 +129,14 @@ def __init__( **kwargs, ): OptimizedModel.__init__(self, model=model, config=config) - # To do: add XPU support - self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self._dtype = self.config.torch_dtype if self.config.torch_dtype is not None else torch.float32 + if is_torch_xpu_available(check_device=True): + self._device = torch.device("xpu:0") + elif torch.cuda.is_available(): + self._device = torch.device("cuda:0") + else: + self._device = torch.device("cpu") self.model.to(self._device) + self._dtype = self.config.torch_dtype if self.config.torch_dtype is not None else torch.float32 self.model_save_dir = model_save_dir self._is_ipex_exported = _is_patched_with_ipex(model, self.export_feature) @@ -321,6 +326,8 @@ def _init_warmup(self): if not self._is_ipex_exported: use_cache = "past_key_values" in self.input_names dummy_inputs = prepare_jit_inputs(self, self.export_feature, use_cache) + if self._device.type != "cpu": + dummy_inputs = recursive_to_device(value=dummy_inputs, device=self._device) for _ in range(2): self(**dummy_inputs) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 99ad42aaf..a2cd72835 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -169,3 +169,16 @@ def get_model_device(model: torch.nn.Module) -> torch.device: # The model had no parameters at all, doesn't matter which device to choose device = torch.device("cpu") return device + + +def recursive_to_device(value, device): + """ + Recursivley move the tensor element in `value` to `device` + """ + if isinstance(value, (tuple, list)): + return type(value)(recursive_to_device(v, device) for v in value) + elif isinstance(value, dict): + return {k: recursive_to_device(v, device) for k, v in value.items()} + elif isinstance(value, torch.Tensor): + return value.to(device) + return value From 02d5e4eee1dd94babb29fff39d988d08d039a126 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 19:18:05 +0400 Subject: [PATCH 24/47] Cover more models with openvino export (#709) * cover more models with openvino export * xglm * fix tests --- optimum/exporters/openvino/model_configs.py | 55 +++++++++++++++++++++ tests/openvino/test_modeling.py | 10 +++- tests/openvino/utils_tests.py | 5 ++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 575f1cc4d..47ca4ff24 100644 --- 
a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -592,3 +592,58 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return { "sample": {0: "batch_size", 2: "height", 3: "width"}, } + + +@register_in_tasks_manager( + "persimmon", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class PersimmonOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("biogpt", *["text-generation", "text-generation-with-past"], library_name="transformers") +class BioGPTOpenVINOConfig(TextDecoderOnnxConfig): + # BioGPT does not require position_ids input. + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager( + "gpt-neox-japanese", *["text-generation", "text-generation-with-past"], library_name="transformers" +) +class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig): + # GPTNeoxJapanese does not require position_ids input. + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager( + "cohere", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class CohereOpenVINOConfig(LlamaOpenVINOConfig): + pass + + +@register_in_tasks_manager("xglm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class XGLMConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_attention_heads="attention_heads", hidden_size="d_model" + ) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 75c95c156..0a0b66b86 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -552,6 +552,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "orion", "falcon", "falcon-40b", + "persimmon", + "biogpt", + "gpt_neox_japanese", + "cohere", + "xglm", ) GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( @@ -617,8 +622,11 @@ def test_compare_to_transformers(self, model_arch): if model_arch == "qwen": return - if model_arch != "chatglm": + if model_arch not in ["chatglm", "persimmon"]: tokenizer.pad_token_id = tokenizer.eos_token_id + + if model_arch == "persimmon": + tokenizer.pad_token_id = tokenizer.bos_token_id # Compare batched generation tokenizer.padding_side = "left" tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 9f28e40a4..aa3ea5f33 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -26,11 +26,13 @@ "baichuan2": "katuni4ka/tiny-random-baichuan2", "baichuan2-13b": "katuni4ka/tiny-random-baichuan2-13b", "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", + "biogpt": "hf-tiny-model-private/tiny-random-BioGptForCausalLM", "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "cohere": 
"hf-internal-testing/tiny-random-CohereForCausalLM", "chatglm": "katuni4ka/tiny-random-chatglm2", "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", "data2vec_text": "hf-internal-testing/tiny-random-Data2VecTextModel", @@ -51,6 +53,7 @@ "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gpt_neox_japanese": "hf-internal-testing/tiny-random-GPTNeoXJapaneseForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-ibert", @@ -78,6 +81,7 @@ "olmo": "katuni4ka/tiny-random-olmo-hf", "orion": "katuni4ka/tiny-random-orion", "pegasus": "hf-internal-testing/tiny-random-pegasus", + "persimmon": "hf-internal-testing/tiny-random-PersimmonForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", "phi": "echarlaix/tiny-random-PhiForCausalLM", "phi3": "katuni4ka/tiny-random-phi3", @@ -115,6 +119,7 @@ "whisper": "openai/whisper-tiny.en", "xlm": "hf-internal-testing/tiny-random-xlm", "xlm_roberta": "hf-internal-testing/tiny-xlm-roberta", + "xglm": "hf-internal-testing/tiny-random-XGLMForCausalLM", } From c74388603300c077ccba9cbc82d67b703666daec Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 15 May 2024 17:18:40 +0200 Subject: [PATCH 25/47] IPEX test refactorization (#711) --- tests/ipex/test_inference.py | 87 +++++++++++++++--------------------- tests/ipex/test_modeling.py | 44 +----------------- tests/ipex/test_pipelines.py | 45 +------------------ tests/ipex/utils_tests.py | 57 +++++++++++++++++++++++ 4 files changed, 96 insertions(+), 137 deletions(-) create mode 100644 tests/ipex/utils_tests.py diff --git a/tests/ipex/test_inference.py b/tests/ipex/test_inference.py index b65d3c9b8..1a452fe40 100644 --- a/tests/ipex/test_inference.py +++ b/tests/ipex/test_inference.py @@ -16,8 +16,6 @@ import torch from parameterized import parameterized - -# TODO : add more tasks from transformers import ( AutoModelForCausalLM, AutoModelForQuestionAnswering, @@ -26,60 +24,51 @@ AutoTokenizer, pipeline, ) +from utils_tests import MODEL_NAMES from optimum.intel import inference_mode as ipex_inference_mode from optimum.intel.ipex.modeling_base import IPEXModel -MODEL_NAMES = { - "bert": "hf-internal-testing/tiny-random-bert", - "bloom": "hf-internal-testing/tiny-random-BloomModel", - "distilbert": "hf-internal-testing/tiny-random-distilbert", - "roberta": "hf-internal-testing/tiny-random-roberta", - "gptj": "hf-internal-testing/tiny-random-gptj", - "gpt2": "hf-internal-testing/tiny-random-gpt2", - "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", - "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "llama": "fxmarty/tiny-llama-fast-tokenizer", - "llama2": "Jiqing/tiny_random_llama2", - "opt": "hf-internal-testing/tiny-random-OPTModel", - "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", -} - _CLASSIFICATION_TASK_TO_AUTOMODELS = { "text-classification": AutoModelForSequenceClassification, "token-classification": AutoModelForTokenClassification, } -class IPEXIntegrationTest(unittest.TestCase): - CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( +class IPEXClassificationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( "bert", "distilbert", "roberta", ) - TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( - "bloom", - "gptj", - "gpt2", - 
"gpt_neo", - "gpt_bigcode", - "llama", - "llama2", - "opt", - "mpt", - ) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = "This is a sample input" + for task, auto_model_class in _CLASSIFICATION_TASK_TO_AUTOMODELS.items(): + model = auto_model_class.from_pretrained(model_id, torch_dtype=torch.float32) + pipe = pipeline(task, model=model, tokenizer=tokenizer) - QA_SUPPORTED_ARCHITECTURES = ( + with torch.inference_mode(): + outputs = pipe(inputs) + with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: + outputs_ipex = ipex_pipe(inputs) + self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(outputs[0]["score"], outputs_ipex[0]["score"]) + + +class IPEXQuestionAnsweringTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( "bert", "distilbert", "roberta", ) - @parameterized.expand(QA_SUPPORTED_ARCHITECTURES) - def test_question_answering_pipeline_inference(self, model_arch): + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForQuestionAnswering.from_pretrained(model_id, torch_dtype=torch.float32) @@ -95,24 +84,22 @@ def test_question_answering_pipeline_inference(self, model_arch): self.assertEqual(outputs["start"], outputs_ipex["start"]) self.assertEqual(outputs["end"], outputs_ipex["end"]) - @parameterized.expand(CLASSIFICATION_SUPPORTED_ARCHITECTURES) - def test_classification_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" - for task, auto_model_class in _CLASSIFICATION_TASK_TO_AUTOMODELS.items(): - model = auto_model_class.from_pretrained(model_id, torch_dtype=torch.float32) - pipe = pipeline(task, model=model, tokenizer=tokenizer) - with torch.inference_mode(): - outputs = pipe(inputs) - with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: - outputs_ipex = ipex_pipe(inputs) - self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(outputs[0]["score"], outputs_ipex[0]["score"]) +class IPEXTextGenerationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( + "bloom", + "gptj", + "gpt2", + "gpt_neo", + "gpt_bigcode", + "llama", + "llama2", + "opt", + "mpt", + ) - @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) - def test_text_generation_pipeline_inference(self, model_arch): + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, return_dict=False) model = model.eval() diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 94a5ca9e1..2a2f18f6f 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -45,53 +45,11 @@ ) from optimum.intel.utils.import_utils import is_ipex_version from optimum.utils.testing_utils import grid_parameters +from utils_tests import MODEL_NAMES SEED = 42 -MODEL_NAMES = { - "albert": "hf-internal-testing/tiny-random-albert", - "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", - "bert": 
"hf-internal-testing/tiny-random-bert", - "bart": "hf-internal-testing/tiny-random-bart", - "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", - "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", - "bloom": "hf-internal-testing/tiny-random-BloomModel", - "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", - "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", - "convnext": "hf-internal-testing/tiny-random-convnext", - "distilbert": "hf-internal-testing/tiny-random-distilbert", - "electra": "hf-internal-testing/tiny-random-electra", - "flaubert": "hf-internal-testing/tiny-random-flaubert", - "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "gpt2": "hf-internal-testing/tiny-random-gpt2", - "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", - "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gptj": "hf-internal-testing/tiny-random-GPTJModel", - "levit": "hf-internal-testing/tiny-random-LevitModel", - "llama": "fxmarty/tiny-llama-fast-tokenizer", - "llama2": "Jiqing/tiny_random_llama2", - "marian": "sshleifer/tiny-marian-en-de", - "mbart": "hf-internal-testing/tiny-random-mbart", - "mistral": "echarlaix/tiny-random-mistral", - "mobilenet_v1": "google/mobilenet_v1_0.75_192", - "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", - "mobilevit": "hf-internal-testing/tiny-random-mobilevit", - "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", - "mt5": "stas/mt5-tiny-random", - "opt": "hf-internal-testing/tiny-random-OPTModel", - "phi": "echarlaix/tiny-random-PhiForCausalLM", - "resnet": "hf-internal-testing/tiny-random-resnet", - "roberta": "hf-internal-testing/tiny-random-roberta", - "roformer": "hf-internal-testing/tiny-random-roformer", - "squeezebert": "hf-internal-testing/tiny-random-squeezebert", - "t5": "hf-internal-testing/tiny-random-t5", - "unispeech": "hf-internal-testing/tiny-random-unispeech", - "vit": "hf-internal-testing/tiny-random-vit", - "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", - "xlm": "hf-internal-testing/tiny-random-xlm", -} - class Timer(object): def __enter__(self): diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index 89a27ab2c..c4ae471a0 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -20,6 +20,7 @@ from parameterized import parameterized from transformers import AutoTokenizer from transformers.pipelines import pipeline as transformers_pipeline +from utils_tests import MODEL_NAMES from optimum.intel.ipex.modeling_base import ( IPEXModelForAudioClassification, @@ -33,50 +34,6 @@ from optimum.intel.pipelines import pipeline as ipex_pipeline -MODEL_NAMES = { - "albert": "hf-internal-testing/tiny-random-albert", - "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", - "bert": "hf-internal-testing/tiny-random-bert", - "bart": "hf-internal-testing/tiny-random-bart", - "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", - "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", - "bloom": "hf-internal-testing/tiny-random-BloomModel", - "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", - "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", - "convnext": "hf-internal-testing/tiny-random-convnext", - "distilbert": "hf-internal-testing/tiny-random-distilbert", - "electra": "hf-internal-testing/tiny-random-electra", - "flaubert": "hf-internal-testing/tiny-random-flaubert", - "gpt_bigcode": 
"hf-internal-testing/tiny-random-GPTBigCodeModel", - "gpt2": "hf-internal-testing/tiny-random-gpt2", - "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", - "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gptj": "hf-internal-testing/tiny-random-GPTJModel", - "levit": "hf-internal-testing/tiny-random-LevitModel", - "llama": "fxmarty/tiny-llama-fast-tokenizer", - "llama2": "Jiqing/tiny_random_llama2", - "marian": "sshleifer/tiny-marian-en-de", - "mbart": "hf-internal-testing/tiny-random-mbart", - "mistral": "echarlaix/tiny-random-mistral", - "mobilenet_v1": "google/mobilenet_v1_0.75_192", - "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", - "mobilevit": "hf-internal-testing/tiny-random-mobilevit", - "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", - "mt5": "stas/mt5-tiny-random", - "opt": "hf-internal-testing/tiny-random-OPTModel", - "phi": "echarlaix/tiny-random-PhiForCausalLM", - "resnet": "hf-internal-testing/tiny-random-resnet", - "roberta": "hf-internal-testing/tiny-random-roberta", - "roformer": "hf-internal-testing/tiny-random-roformer", - "squeezebert": "hf-internal-testing/tiny-random-squeezebert", - "t5": "hf-internal-testing/tiny-random-t5", - "unispeech": "hf-internal-testing/tiny-random-unispeech", - "vit": "hf-internal-testing/tiny-random-vit", - "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", - "xlm": "hf-internal-testing/tiny-random-xlm", -} - - class PipelinesIntegrationTest(unittest.TestCase): COMMON_SUPPORTED_ARCHITECTURES = ( "albert", diff --git a/tests/ipex/utils_tests.py b/tests/ipex/utils_tests.py new file mode 100644 index 000000000..a14f0bf7c --- /dev/null +++ b/tests/ipex/utils_tests.py @@ -0,0 +1,57 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +MODEL_NAMES = { + "albert": "hf-internal-testing/tiny-random-albert", + "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", + "bert": "hf-internal-testing/tiny-random-bert", + "bart": "hf-internal-testing/tiny-random-bart", + "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", + "bloom": "hf-internal-testing/tiny-random-BloomModel", + "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "convnext": "hf-internal-testing/tiny-random-convnext", + "distilbert": "hf-internal-testing/tiny-random-distilbert", + "electra": "hf-internal-testing/tiny-random-electra", + "flaubert": "hf-internal-testing/tiny-random-flaubert", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", + "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "levit": "hf-internal-testing/tiny-random-LevitModel", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", + "marian": "sshleifer/tiny-marian-en-de", + "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", + "mobilevit": "hf-internal-testing/tiny-random-mobilevit", + "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", + "mt5": "stas/mt5-tiny-random", + "opt": "hf-internal-testing/tiny-random-OPTModel", + "phi": "echarlaix/tiny-random-PhiForCausalLM", + "resnet": "hf-internal-testing/tiny-random-resnet", + "roberta": "hf-internal-testing/tiny-random-roberta", + "roformer": "hf-internal-testing/tiny-random-roformer", + "squeezebert": "hf-internal-testing/tiny-random-squeezebert", + "t5": "hf-internal-testing/tiny-random-t5", + "unispeech": "hf-internal-testing/tiny-random-unispeech", + "vit": "hf-internal-testing/tiny-random-vit", + "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "xlm": "hf-internal-testing/tiny-random-xlm", +} From 8c2b787cc75a45ae4670d37970a5394eba90eedc Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 19:33:55 +0400 Subject: [PATCH 26/47] Add sdpa for phi3 openvino model (#705) * add sdpa for phi3 openvino model * fix pkv filling according model code * Update optimum/exporters/openvino/model_patcher.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * import helpers from phi3 if available --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/openvino/model_configs.py | 6 ++ optimum/exporters/openvino/model_patcher.py | 94 ++++++++++++++++++++- 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 47ca4ff24..dc1351211 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -485,6 +485,12 @@ def patch_model_for_export( library_name="transformers", ) class Phi3OpenVINOConfig(PhiOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + MistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + 
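    # The Mistral dummy past-key-values generator is reused here because phi3 also uses
    # grouped-query attention, so its KV-cache shapes depend on num_key_value_heads
    # (exposed via the normalized config just below).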
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f68e873d4..55afb0ffe 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -951,15 +951,107 @@ def __exit__(self, exc_type, exc_value, traceback): block.attention.forward = block.attention._orig_forward +# Adapted from https://github.com/huggingface/transformers/blob/ccdabc5642bf84849af93f591e207dc625c8e1e1/src/transformers/models/phi3/modeling_phi3.py#L426 +def _phi3_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + return self._orig_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + # TO DO: remove llama imports when transformers with phi3 support will be released + try: + from transformers.models.phi3.modelling_phi3 import apply_rotary_pos_emb, repeat_kv + except ImportError: + from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
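+        # Calling .contiguous() below works around that kernel issue; it is a no-op when the
+        # tensors are already contiguous, and only the CUDA-with-attention-mask path is affected.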
+ if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + class Phi3ModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() - # https://github.com/huggingface/transformers/blob/30ee508c6c92a1c0aa0281d193c7c0fb815b8d2f/src/transformers/models/phi3/modeling_phi3.py#L113 # init inv_freq for torchscript tracing for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_phi3_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + if layer.self_attn.rotary_emb.inv_freq is None: rotary_emb = layer.self_attn.rotary_emb layer.self_attn.rotary_emb.inv_freq = 1.0 / ( rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward From c30d488f8a46a3980795951edf1f0dc53c0efb0a Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Thu, 16 May 2024 16:51:15 +0200 Subject: [PATCH 27/47] Fix diffusers requirement for quantizing models (#712) --- optimum/intel/openvino/quantization.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 45961a86f..17305b947 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -48,7 +48,7 @@ from ...exporters.openvino.model_patcher import patch_model_with_bettertransformer from ...exporters.openvino.stateful import ensure_export_task_support_stateful, ensure_stateful_is_available from ..utils.constant import _TASK_ALIASES -from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available +from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available, is_diffusers_available from ..utils.modeling_utils import get_model_device from .configuration import OVConfig, OVQuantizationConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .modeling_base import OVBaseModel @@ -325,7 +325,8 @@ def _quantize_ovbasemodel( remove_unused_columns: bool = True, **kwargs, ): - from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase + if is_diffusers_available(): + from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase if save_directory is not None: save_directory = Path(save_directory) @@ -335,7 +336,7 @@ def _quantize_ovbasemodel( if calibration_dataset is not None: # Process custom calibration dataset - if isinstance(self.model, 
OVStableDiffusionPipelineBase): + if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset=calibration_dataset ) @@ -373,7 +374,7 @@ def _quantize_ovbasemodel( if isinstance(self.model, OVModelForCausalLM): calibration_dataset = self._prepare_builtin_dataset(quantization_config) - elif isinstance(self.model, OVStableDiffusionPipelineBase): + elif is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset_name=quantization_config.dataset ) @@ -385,7 +386,7 @@ def _quantize_ovbasemodel( if quantization_config.quant_method == OVQuantizationMethod.HYBRID: if calibration_dataset is None: raise ValueError("Calibration dataset is required to run hybrid quantization.") - if isinstance(self.model, OVStableDiffusionPipelineBase): + if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): # Apply weight-only quantization to all SD submodels except UNet quantization_config_copy = copy.deepcopy(quantization_config) quantization_config_copy.dataset = None From 715c054360946b312e94beec91c55a1223258954 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 17 May 2024 13:38:58 +0400 Subject: [PATCH 28/47] Skip saving gen config if saving failed (#717) --- optimum/exporters/openvino/convert.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index baa34a5cd..3b214f77e 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -614,7 +614,12 @@ def export_from_model( model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) if generation_config is not None: - generation_config.save_pretrained(output) + try: + generation_config.save_pretrained(output) + except Exception as exception: + logger.warning( + f"The generation config will not be saved, saving failed with following error:\n{exception}" + ) model_name_or_path = model.config._name_or_path maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code) From 60d5bf6e0c67f3813fb2148ce3e7258ed84d27a4 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 17 May 2024 20:01:56 +0400 Subject: [PATCH 29/47] Add support export for new architectures (#716) * support export more models * update aquila to support v1 and v2 --- optimum/exporters/openvino/model_configs.py | 89 ++++++- optimum/exporters/openvino/model_patcher.py | 279 +++++++++++++++++++- tests/openvino/test_modeling.py | 11 + tests/openvino/utils_tests.py | 4 + 4 files changed, 378 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index dc1351211..8feeafd61 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -41,15 +41,18 @@ from optimum.utils.normalized_config import NormalizedTextConfig from .model_patcher import ( + AquilaModelPatcher, BaichuanModelPatcher, ChatGLMModelPatcher, GemmaModelPatcher, - InternLMPatcher, + InternLM2Patcher, + InternLMModelPatcher, LlamaModelPatcher, MixtralModelPatcher, MPTModelPatcher, Phi3ModelPatcher, QwenModelPatcher, + XverseModelPatcher, ) @@ -445,7 +448,7 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): def 
patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": - return InternLMPatcher(self, model, model_kwargs=model_kwargs) + return InternLM2Patcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") @@ -653,3 +656,85 @@ class XGLMConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="attention_heads", hidden_size="d_model" ) + + +class AquilaDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task, + normalized_config, + batch_size, + sequence_length, + random_batch_size_range, + random_sequence_length_range, + **kwargs, + ) + self.num_key_value_heads = getattr( + normalized_config, "num_key_value_heads", normalized_config.num_attention_heads + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sequence_length, + self.hidden_size // self.num_attention_heads, + ) + return [ + ( + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + +@register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers") +class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return AquilaModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers") +class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return XverseModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + def 
patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return InternLMModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 55afb0ffe..33fd77cba 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -844,7 +844,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -def _internlm_attention_forward( +def _internlm2_attention_forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, @@ -935,14 +935,14 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return attn_output, attn_weights, past_key_value -class InternLMPatcher(DecoderModelPatcher): +class InternLM2Patcher(DecoderModelPatcher): def __enter__(self): super().__enter__() if is_torch_version(">=", "2.1.0"): for block in self._model.model.layers: block.attention._orig_forward = block.attention.forward - block.attention.forward = types.MethodType(_internlm_attention_forward, block.attention) + block.attention.forward = types.MethodType(_internlm2_attention_forward, block.attention) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -1055,3 +1055,276 @@ def __exit__(self, exc_type, exc_value, traceback): for layer in self._model.model.layers: if hasattr(layer.self_attn, "_orig_forward"): layer.self_attn.forward = layer.self_attn._orig_forward + + +def _aquila_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
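+        # After squeezing, the cos/sin rows for the requested positions are gathered and the
+        # rotary transform q * cos + rotate_half(q) * sin is applied to queries and keys.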
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + bsz, q_len, _ = hidden_states.size() + + if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, getattr(self, "num_key_value_heads", self.num_heads), self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, getattr(self, "num_key_value_heads", self.num_heads), self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + if hasattr(self, "num_key_value_groups"): + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + +class AquilaModelPatcher(DecoderModelPatcher): + def 
__enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_aquila_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward + + +def _xverse_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + +def _internlm_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of 
the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + cos = cos[position_ids].unsqueeze(1) + sin = sin[position_ids].unsqueeze(1) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights, past_key_value + + +class XverseModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_xverse_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward + + +class InternLMModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_internlm_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 0a0b66b86..1191a9390 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -557,6 +557,10 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neox_japanese", "cohere", "xglm", + "aquila", + "aquila2", + "xverse", + "internlm", ) GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( @@ -569,6 +573,10 @@ class 
OVModelForCausalLMIntegrationTest(unittest.TestCase): "internlm2", "orion", "phi3", + "aquila", + "aquila2", + "xverse", + "internlm", ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -596,6 +604,7 @@ def test_compare_to_transformers(self, model_arch): self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokens = tokenizer("This is a sample output", return_tensors="pt") + tokens.pop("token_type_ids", None) ov_outputs = ov_model(**tokens) self.assertTrue("logits" in ov_outputs) @@ -630,6 +639,7 @@ def test_compare_to_transformers(self, model_arch): # Compare batched generation tokenizer.padding_side = "left" tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens.pop("token_type_ids", None) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -853,6 +863,7 @@ def test_beam_search(self, model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) tokenizer.pad_token_id = tokenizer.eos_token_id tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens.pop("token_type_ids", None) ov_model_stateful.generation_config.eos_token_id = None ov_model_stateless.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index aa3ea5f33..1627112c5 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -18,6 +18,8 @@ MODEL_NAMES = { "albert": "hf-internal-testing/tiny-random-albert", + "aquila": "katuni4ka/tiny-random-aquilachat", + "aquila2": "katuni4ka/tiny-random-aquila2", "audio_spectrogram_transformer": "Ericwang/tiny-random-ast", "bge": "BAAI/bge-small-en-v1.5", "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", @@ -57,6 +59,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-ibert", + "internlm": "katuni4ka/tiny-random-internlm", "internlm2": "katuni4ka/tiny-random-internlm2", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", @@ -120,6 +123,7 @@ "xlm": "hf-internal-testing/tiny-random-xlm", "xlm_roberta": "hf-internal-testing/tiny-xlm-roberta", "xglm": "hf-internal-testing/tiny-random-XGLMForCausalLM", + "xverse": "katuni4ka/tiny-random-xverse", } From bc5051fecf4fd6b03f7e7f261ec2d466aa688049 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 17 May 2024 18:04:35 +0200 Subject: [PATCH 30/47] Add --all-layers argument to openvino CLI (#713) * Add --all-layers argument to CLI * Update description --- optimum/commands/export/openvino.py | 11 +++++++++++ tests/openvino/test_exporters_cli.py | 21 +++++++++------------ tests/openvino/utils_tests.py | 2 -- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 025a40e05..ffd084d4e 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -119,6 +119,15 @@ def parse_args_openvino(parser: "ArgumentParser"): "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models." 
), ) + optional_group.add_argument( + "--all-layers", + action="store_true", + default=None, + help=( + "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight " + "compression is applied, they are compressed to INT8." + ), + ) optional_group.add_argument( "--disable-stateful", action="store_true", @@ -198,6 +207,7 @@ def run(self): and self.args.ratio is None and self.args.group_size is None and self.args.sym is None + and self.args.all_layers is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model] @@ -207,6 +217,7 @@ def run(self): "ratio": 1 if is_int8 else (self.args.ratio or 0.8), "sym": self.args.sym or False, "group_size": -1 if is_int8 else self.args.group_size, + "all_layers": None if is_int8 else self.args.all_layers, } if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index cac79abae..cce25bbae 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -18,7 +18,6 @@ from parameterized import parameterized from utils_tests import ( - _ARCHITECTURES_TO_EXPECTED_INT4_INT8, _ARCHITECTURES_TO_EXPECTED_INT8, MODEL_NAMES, get_num_quantized_nodes, @@ -84,14 +83,13 @@ class OVCLIExportTestCase(unittest.TestCase): ("latent-consistency", 50, 135), ) - SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),) - - SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"] - - TEST_4BIT_CONFIGURATONS = [] - for arch in SUPPORTED_4BIT_ARCHITECTURES: - for option in SUPPORTED_4BIT_OPTIONS: - TEST_4BIT_CONFIGURATONS.append([arch[0], arch[1], option]) + TEST_4BIT_CONFIGURATONS = [ + ("text-generation-with-past", "opt125m", "int4_sym_g128", 62, 86), + ("text-generation-with-past", "opt125m", "int4_asym_g128", 62, 86), + ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86), + ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86), + ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32), + ] def _openvino_export( self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None @@ -197,17 +195,16 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in self.assertEqual(exp_num_fq, num_fq) @parameterized.expand(TEST_4BIT_CONFIGURATONS) - def test_exporters_cli_int4(self, task: str, model_type: str, option: str): + def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int): with TemporaryDirectory() as tmpdir: subprocess.run( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", shell=True, check=True, ) model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {} model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs) - expected_int8, expected_int4 = _ARCHITECTURES_TO_EXPECTED_INT4_INT8[model_type] _, num_int8, num_int4 = get_num_quantized_nodes(model) self.assertEqual(expected_int8, num_int8) self.assertEqual(expected_int4, num_int4) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 1627112c5..d4364d192 
100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -149,8 +149,6 @@ "stable-diffusion-xl-refiner": (366, 34, 42, 66), } -_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 86)} - def get_num_quantized_nodes(ov_model): num_fake_quantize = 0 From 7114900cd3d80fdfc6bc18aff1d016bd6b626e31 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Sun, 19 May 2024 01:53:17 +0400 Subject: [PATCH 31/47] fix beam search test reported issues (#718) * fix beam search test reported issues * test beam search * refactor applying code style with preserve logic for olmo --- optimum/intel/openvino/modeling_decoder.py | 22 ++++++++-------------- tests/openvino/test_modeling.py | 6 ++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index e4dc1ed78..2ad04ab14 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -386,10 +386,8 @@ def prepare_inputs( inputs = {} if not self.stateful: if past_key_values is not None: - if ( - self.config.model_type not in MULTI_QUERY_ATTN_MODELS - or self.config.model_type == "falcon" - and self.config.new_decoder_architecture + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture ): if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 @@ -499,10 +497,8 @@ def forward( if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) - if ( - self.config.model_type not in MULTI_QUERY_ATTN_MODELS - or self.config.model_type == "falcon" - and self.config.new_decoder_architecture + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture ): # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( @@ -559,10 +555,8 @@ def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_ke if indicies.shape[0] != 1: logits = logits[indicies] if past_key_values and not self.stateful: - if ( - self.config.model_type not in MULTI_QUERY_ATTN_MODELS - or self.config.model_type == "falcon" - and self.config.new_decoder_architecture + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture ): past_key_values = tuple( tuple( @@ -581,7 +575,7 @@ def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_ke if self.next_beam_idx is not None else np.arange(batch_size, dtype=int)[indicies] ) - self._second_iter_beam_search = True + self._second_iter_beam_search = True return logits, past_key_values def _deduplicate_inputs(self, model_inputs: Dict): @@ -692,7 +686,7 @@ def _reorder_cache( self._second_iter_beam_search = False return past_key_values else: - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS and not ( + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( self.config.model_type == "falcon" and self.config.new_decoder_architecture ): return tuple( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 1191a9390..692720a97 100644 --- a/tests/openvino/test_modeling.py +++ 
b/tests/openvino/test_modeling.py @@ -812,6 +812,10 @@ def test_beam_search(self, model_arch): return tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + if model_arch == "persimmon": + tokenizer.pad_token_id = tokenizer.bos_token_id + tokenizer.eos_token_id = tokenizer.bos_token_id + beam_search_gen_config = GenerationConfig( max_new_tokens=10, min_new_tokens=10, @@ -872,6 +876,8 @@ def test_beam_search(self, model_arch): transformers_model.config.eos_token_id = None for gen_config in gen_configs: + if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]: + continue transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs)) From 7a929e8d6da0dac4fbd8995add32a663e7b9afc5 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 21 May 2024 13:35:53 +0400 Subject: [PATCH 32/47] Fix backward compatibility for GenerationMode import (#719) --- optimum/intel/openvino/modeling_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 2ad04ab14..933d92a50 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -28,10 +28,10 @@ from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin -from transformers.generation.configuration_utils import GenerationConfig, GenerationMode +from transformers.generation.configuration_utils import GenerationConfig from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.stopping_criteria import StoppingCriteriaList -from transformers.generation.utils import GenerateOutput +from transformers.generation.utils import GenerateOutput, GenerationMode from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.utils.normalized_config import NormalizedConfigManager From c69fe32c638e52433016df8d1a6746db3e7e70da Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 21 May 2024 20:52:25 +0400 Subject: [PATCH 33/47] Add support export for new architectures (#720) * update codegen config for support codegen2 * add support DBRX * add qwen2moe support * fix test models * buichuan sdpa * apply review comments * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/openvino/model_configs.py | 47 +++ optimum/exporters/openvino/model_patcher.py | 343 +++++++++++++++++++- tests/openvino/test_modeling.py | 4 + tests/openvino/utils_tests.py | 3 + 4 files changed, 396 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 8feeafd61..d69adc9da 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -20,6 +20,7 @@ from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig from 
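As an aside, patch 32 above simply moves the `GenerationMode` import to the module path that exists across the supported transformers range. A more defensive variant of the same idea (a sketch, not what the patch does) would try the two paths touched in the hunk in order:

try:
    from transformers.generation.utils import GenerationMode
except ImportError:
    from transformers.generation.configuration_utils import GenerationMode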
optimum.exporters.onnx.model_configs import ( + CodeGenOnnxConfig, FalconOnnxConfig, GemmaOnnxConfig, LlamaOnnxConfig, @@ -44,6 +45,8 @@ AquilaModelPatcher, BaichuanModelPatcher, ChatGLMModelPatcher, + CodeGenModelPatcher, + DBRXModelPatcher, GemmaModelPatcher, InternLM2Patcher, InternLMModelPatcher, @@ -112,6 +115,15 @@ class Qwen2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +@register_in_tasks_manager("qwen2-moe", *["text-generation", "text-generation-with-past"], library_name="transformers") +class Qwen2MoEOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + @register_in_tasks_manager("minicpm", *["text-generation", "text-generation-with-past"], library_name="transformers") class MiniCPMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 @@ -738,3 +750,38 @@ def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return InternLMModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "codegen", + *["feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past"], + library_name="transformers", +) +class CodeGenOpenVINOConfig(CodeGenOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return CodeGenModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "dbrx", + *["text-generation", "text-generation-with-past"], + library_name="transformers", +) +class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_attention_heads="n_heads", + hidden_size="d_model", + num_layers="n_layers", + num_key_value_heads="attn_config.kv_n_heads", + allow_new=True, + ) + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return DBRXModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 33fd77cba..93a843052 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -20,6 +20,7 @@ import torch import torch.nn.functional as F +from transformers.cache_utils import Cache, StaticCache from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.utils import is_tf_available @@ -43,6 +44,9 @@ from transformers.modeling_tf_utils import TFPreTrainedModel +BETTERTRANSFORMER_IGNORE = ("codegen",) + + def patch_model_with_bettertransformer(model): COLOR_RED = "\033[1;31m" COLOR_RESET = "\033[0m" @@ -81,6 +85,10 @@ def patch_model_with_bettertransformer(model): # model already has required SDPA implementation if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa": return model + + if 
model.config.model_type in BETTERTRANSFORMER_IGNORE: + return model + try: model = model.to_bettertransformer() except Exception as e: @@ -665,6 +673,72 @@ def _baichuan13b_atten_forward( return attn_output, attn_weights, past_key_value +# Adapted from https://huggingface.co/baichuan-inc/Baichuan-7B/blob/262c8cb58b6d3615c208d9230baa869fddee2adb/modeling_baichuan.py#L181 +def _baichuan7b_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + bsz, q_len, _ = hidden_states.size() + + proj = self.W_pack(hidden_states) + proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) + query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + if not output_attentions: + attn_weights = None + attn_output = F.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=attention_mask, scale=1 / math.sqrt(self.head_dim) + ) + else: + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + class BaichuanModelPatcher(DecoderModelPatcher): def __init__( self, @@ -712,13 +786,18 @@ def forward( for layer in self._model.model.layers: layer.self_attn._orig_forward = layer.self_attn.forward layer.self_attn.forward = types.MethodType(_baichuan13b_atten_forward, layer.self_attn) + else: + for layer in self._model.model.layers: + 
layer.self_attn._orig_forward = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_baichuan7b_attn_forward, layer.self_attn) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) if hasattr(self._model, "_orig_forward"): self._model.forward = self._model._orig_forward - for layer in self._model.model.layers: + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): layer.self_attn.forward = layer.self_attn._orig_forward @@ -1328,3 +1407,265 @@ def __exit__(self, exc_type, exc_value, traceback): for layer in self._model.model.layers: if hasattr(layer.self_attn, "_orig_forward"): layer.self_attn.forward = layer.self_attn._orig_forward + + +class CodeGenModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + + # whole codegen bettertransformer patch include attn.forward and does not cover codegen2. + # For avoiding breaking model on tracing stage, we reduce area of bettertransformer patch only for _attn. + from optimum.bettertransformer.models.attention import codegen_wrapped_scaled_dot_product + + for layer in self._model.transformer.h: + if is_torch_version(">=", "2.1.0") and not self._model.config.output_attentions: + orig_self_attn_fwd = layer.attn._attn + layer.attn._attn = types.MethodType(codegen_wrapped_scaled_dot_product, layer.attn) + layer.attn._orig_attn = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.transformer.h: + if hasattr(layer.attn, "_orig_attn"): + layer.attn._attn = layer.attn._orig_attn + + +# adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L763 +def _dbrx_experts_forward( + self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor +): + bsz, q_len, hidden_size = x.shape + x = x.view(-1, hidden_size) + out = torch.zeros_like(x) + + expert_mask = torch.nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) + # Chunk experts at once to avoid storing full parameter multiple times in autograd + w1_chunked = self.mlp.w1.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + v1_chunked = self.mlp.v1.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + w2_chunked = self.mlp.w2.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + w1_chunked = [w1.squeeze(dim=0) for w1 in w1_chunked] + v1_chunked = [v1.squeeze(dim=0) for v1 in v1_chunked] + w2_chunked = [w2.squeeze(dim=0) for w2 in w2_chunked] + for expert_idx in range(0, self.moe_num_experts): + topk_idx, token_idx = torch.where(expert_mask[expert_idx]) + + # Difference with original: removal + # if token_idx.shape[0] == 0: + # continue + # loop interruption depends on input data and may affect torchscript tracing + + token_list = token_idx + topk_list = topk_idx + + expert_tokens = x[None, token_list].reshape(-1, hidden_size) + expert_out = ( + self.mlp(expert_tokens, w1_chunked[expert_idx], v1_chunked[expert_idx], w2_chunked[expert_idx]) + * top_weights[token_list, topk_list, None] + ) + + out.index_add_(0, token_idx, expert_out) + + out = out.reshape(bsz, q_len, hidden_size) + return out + + +# adapted from 
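For illustration only (not part of the patch): `_baichuan7b_attn_forward` above routes the common path through `F.scaled_dot_product_attention` with an explicit `1/sqrt(head_dim)` scale and keeps the eager softmax path only for `output_attentions=True`. A small check that the two paths agree on random inputs (assumes PyTorch >= 2.1 for the `scale` argument; shapes are arbitrary):

import math

import torch
import torch.nn.functional as F

q, k, v = (torch.randn(1, 2, 4, 8) for _ in range(3))  # (batch, heads, seq_len, head_dim)
scale = 1 / math.sqrt(q.shape[-1])

sdpa = F.scaled_dot_product_attention(q, k, v, scale=scale)
eager = torch.softmax((q @ k.transpose(-2, -1)) * scale, dim=-1) @ v
assert torch.allclose(sdpa, eager, atol=1e-6)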
https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L1228 +def _dbrx_update_causal_mask_legacy( + self, attention_mask: Optional[torch.Tensor], input_tensor: torch.Tensor, cache_position: torch.Tensor +) -> Optional[torch.Tensor]: + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + sequence_length = input_tensor.shape[1] + if hasattr(self.blocks[0].norm_attn_norm.attn, "past_key_value"): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = ( + attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + ) + # difference with original modeling + # removed target_length = int(target_length). + # Casting to int leads to constant folding during tracing that makes impossible to use model for sequence of different length + causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + elif attention_mask.dim() == 4: + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[-2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). + is_tracing = ( + torch.jit.is_tracing() + or isinstance(input_tensor, torch.fx.Proxy) + or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) + ) + if not is_tracing and torch.any(attention_mask != 1): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
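A minimal illustration, not from the patch: the causal-mask builders in this series (the `_dbrx_update_causal_mask_*` functions here, and the llama/gemma equivalents in a later patch) fill the mask with `torch.finfo(torch.float16).min` rather than the float32 minimum. As the in-code comment notes, the float32 minimum overflows once the model runs in half precision:

import torch

f32_min = torch.finfo(torch.float32).min
print(f32_min)                                     # about -3.4e38
print(torch.tensor(f32_min, dtype=torch.float16))  # -inf: overflows the float16 range
print(torch.finfo(torch.float16).min)              # -65504.0: still a finite, usable mask value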
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# adopted from https://github.com/huggingface/transformers/blob/1b3dba9417eebe16b7c206d1dfca6a4c7f11dbec/src/transformers/models/dbrx/modeling_dbrx.py#L1204 +def _dbrx_update_causal_mask_latest( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, +): + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + # difference with original modeling + causal_mask = ( + torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + 
mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +if is_transformers_version(">", "4.40.2"): + _dbrx_update_causal_mask = _dbrx_update_causal_mask_latest +else: + _dbrx_update_causal_mask = _dbrx_update_causal_mask_legacy + + +class DBRXModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + # dbrx has some accuracy issues with bf16 with transformers >= 4.40 + # fill causal mask in slightly different way for avoid overflow on some platforms + self._model.transformer._orig_update_causal_mask = self._model.transformer._update_causal_mask + self._model.transformer._update_causal_mask = types.MethodType( + _dbrx_update_causal_mask, self._model.transformer + ) + + for block in self._model.transformer.blocks: + rotary_emb = block.norm_attn_norm.attn.rotary_emb + # initialize inv_freq for torchscript tracing + if rotary_emb.inv_freq is None: + inv_freq = 1.0 / ( + rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) + ) + rotary_emb.inv_freq = inv_freq + # remove continue-operator from iteration loop over experts + block.ffn.experts._orig_forward = block.ffn.experts.forward + block.ffn.experts.forward = types.MethodType(_dbrx_experts_forward, block.ffn.experts) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.transformer._update_causal_mask = self._model.transformer._orig_update_causal_mask + for block in self._model.transformer.blocks: + block.ffn.experts.forward = block.ffn.experts._orig_forward diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 692720a97..cb5ac52ed 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -527,6 +527,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "bloom", "chatglm", "codegen", + "codegen2", # "data2vec-text", # TODO : enable when enabled in exporters "gemma", "gpt2", @@ -561,6 +562,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "aquila2", "xverse", "internlm", + "dbrx", + "qwen2-moe", ) GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( @@ -577,6 +580,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "aquila2", "xverse", "internlm", + "codegen2", ) @parameterized.expand(SUPPORTED_ARCHITECTURES) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index d4364d192..91500cfc6 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -37,9 +37,11 @@ "cohere": "hf-internal-testing/tiny-random-CohereForCausalLM", "chatglm": "katuni4ka/tiny-random-chatglm2", "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "codegen2": "katuni4ka/tiny-random-codegen2", "data2vec_text": 
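A toy aside, not part of the patch: `DBRXModelPatcher` above swaps in `_dbrx_experts_forward`, whose key change (per its in-code comment) is dropping the data-dependent `continue` so torchscript tracing sees the same control flow for every input. The expert bookkeeping it keeps is small enough to show on a toy routing example (all values hypothetical):

import torch

top_experts = torch.tensor([[0, 2], [1, 2], [0, 1]])  # 3 tokens, top_k=2, routed over 3 experts
expert_mask = torch.nn.functional.one_hot(top_experts, num_classes=3).permute(2, 1, 0)

for expert_idx in range(3):
    topk_idx, token_idx = torch.where(expert_mask[expert_idx])
    # token_idx: tokens served by this expert; topk_idx: which of their top-k slots it fills
    print(expert_idx, token_idx.tolist(), topk_idx.tolist())
# 0 [0, 2] [0, 0]
# 1 [1, 2] [0, 1]
# 2 [0, 1] [1, 1]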
"hf-internal-testing/tiny-random-Data2VecTextModel", "data2vec_vision": "hf-internal-testing/tiny-random-Data2VecVisionModel", "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", + "dbrx": "katuni4ka/tiny-random-dbrx", "deberta": "hf-internal-testing/tiny-random-deberta", "deberta_v2": "hf-internal-testing/tiny-random-DebertaV2Model", "deit": "hf-internal-testing/tiny-random-deit", @@ -91,6 +93,7 @@ "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen": "katuni4ka/tiny-random-qwen", "qwen2": "Qwen/Qwen1.5-0.5B", + "qwen2-moe": "katuni4ka/tiny-random-qwen1.5-moe", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer", From 1319d7bec80622abdb39b7d0307df6e453e4e903 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 23 May 2024 10:32:53 +0200 Subject: [PATCH 34/47] Fix llama and gemma modeling patching for openvino export (#714) * Fix compatibility for transformers v4.41.0 llama and gemma modeling patching * fix for dev transformers version * update setup --- optimum/exporters/openvino/model_patcher.py | 104 +++++++++++++++++++- optimum/intel/openvino/trainer.py | 6 +- 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 93a843052..0265b3a5f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -301,7 +301,7 @@ def __exit__(self, exc_type, exc_value, traceback): # adopted from # https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/gemma/modeling_gemma.py#L965 # https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/llama/modeling_llama.py#L1058 -def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_position, past_seen_tokens=None): +def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, cache_position, past_seen_tokens=None): from transformers.modeling_attn_mask_utils import AttentionMaskConverter if self.config._attn_implementation == "sdpa" and past_seen_tokens is not None: @@ -314,10 +314,12 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling # using minimum from dtype with larger bandwith (floa32) may lead to overflow # during execution on platforms with default lower precision (bfloat16, float16) min_dtype = torch.finfo(torch.float16).min sequence_length = input_tensor.shape[1] + # difference with original modeling if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"): # static cache target_length = self.config.max_position_embeddings else: # dynamic cache @@ -329,7 +331,9 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + # difference with original modeling causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) @@ -366,6 +370,104 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po return causal_mask +# 
adopted from https://github.com/huggingface/transformers/blob/f4014e75db0190792b3feeccfc5dc5b5f9f0ce7b/src/transformers/models/llama/modeling_llama.py#L1036 +def _llama_gemma_update_causal_mask_latest( + self, + attention_mask, + input_tensor, + cache_position, + past_key_values, + output_attentions, +): + from transformers.cache_utils import StaticCache + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + # difference with original modeling + causal_mask = ( + torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + ) + + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + 
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# TODO : deprecate _llama_gemma_update_causal_mask_legacy when transformers>=4.41.0 +if is_transformers_version(">", "4.40.2"): + _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_latest +else: + _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_legacy + + class GemmaModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 0745a1cd7..c8b29800f 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -906,7 +906,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): output_path = os.path.join(output_dir, OV_XML_FILE_NAME) self.compression_controller.prepare_for_export() model_type = self.model.config.model_type.replace("_", "-") - onnx_config_class = TasksManager.get_exporter_config_constructor( + exporter_config_class = TasksManager.get_exporter_config_constructor( exporter="onnx", model=self.model, task=self.task, @@ -914,9 +914,9 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): ) if self.task == "text-generation": - onnx_config = onnx_config_class(self.model.config, use_past=self.model.config.use_cache) + onnx_config = exporter_config_class(self.model.config, use_past=self.model.config.use_cache) else: - onnx_config = onnx_config_class(self.model.config) + onnx_config = exporter_config_class(self.model.config) num_parameters = self.model.num_parameters() save_as_external_data = use_external_data_format(num_parameters) or self.ov_config.save_onnx_model From e22b2fdb1e709ff20d752cb58a3fe0a891ef924e Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 24 May 2024 17:59:38 +0200 Subject: [PATCH 35/47] Fix nncf quantization for decoder models (#727) * Fix nncf quantization for decoder models * add test * update op quant op * remove deprecated warning * update expected quantized * enable stateful * style --- optimum/intel/openvino/modeling_decoder.py | 5 +++-- optimum/intel/openvino/quantization.py | 9 ++++----- tests/openvino/test_quantization.py | 18 ++++++++++-------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 933d92a50..72cd1b648 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -42,7 +42,7 @@ from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .configuration import _DEFAULT_4BIT_CONFIGS, OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel -from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE +from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, STR_TO_OV_TYPE if 
TYPE_CHECKING: @@ -409,6 +409,7 @@ def prepare_inputs( elif self.use_cache: for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) + dtype = OV_TO_NP_TYPE[model_inputs.get_element_type().get_type_name()] shape = model_inputs.get_partial_shape() if self.config.model_type == "chatglm": shape[0] = 0 @@ -419,7 +420,7 @@ def prepare_inputs( shape[2] = 0 else: shape[1] = 0 - inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) + inputs[input_name] = np.empty([dim.get_length() for dim in shape], dtype=dtype) else: # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 17305b947..43cf1dd93 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -347,7 +347,6 @@ def _quantize_ovbasemodel( remove_unused_columns=remove_unused_columns, data_collator=data_collator, ) - if self.model.export_feature == "text-generation" and self.model.use_cache: calibration_dataset = self._prepare_text_generation_dataset( quantization_config, calibration_dataloader @@ -430,6 +429,7 @@ def _quantize_ovbasemodel( ), **kwargs, ) + self.model.model = quantized_model if save_directory is not None: self.model.save_pretrained(save_directory) @@ -696,8 +696,6 @@ def _prepare_builtin_dataset(self, quantization_config: OVWeightQuantizationConf def _prepare_text_generation_dataset( self, quantization_config: OVQuantizationConfig, calibration_dataloader: OVDataLoader ) -> nncf.Dataset: - # TODO: this function is not covered by tests, remove if not relevant anymore or cover by tests otherwise - # Prefetch past_key_values self.model.update_pkv_precision(True) self.model.compile() @@ -705,15 +703,16 @@ def _prepare_text_generation_dataset( num_samples = quantization_config.num_samples or 200 - self.model.request = InferRequestWrapper(self.model.model.request, collected_inputs) + self.model.request = InferRequestWrapper(self.model.request, collected_inputs) try: for data in calibration_dataloader: self.model.generate(**data, max_new_tokens=1) if len(collected_inputs) >= num_samples: break finally: - self.model.model.request = self.model.model.request.request + self.model.request = self.model.request.request calibration_dataset = nncf.Dataset(collected_inputs) + return calibration_dataset def _prepare_unet_dataset( diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 98eb121d7..09b395ea1 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -73,12 +73,16 @@ class OVQuantizerTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( + SUPPORTED_ARCHITECTURES_TORCH_MODEL = ( (OVModelForSequenceClassification, "bert", 32, 35), - # (OVModelForCausalLM, "gpt2", 41, 23), + (OVModelForCausalLM, "gpt2", 41, 3), + ) + SUPPORTED_ARCHITECTURES_OV_MODEL = ( + (OVModelForSequenceClassification, "bert", 32, 35), + (OVModelForCausalLM, "gpt2", 31, 22), ) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) + @parameterized.expand(SUPPORTED_ARCHITECTURES_TORCH_MODEL) def test_automodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): model_id = MODEL_NAMES[model_name] task = model_cls.export_feature @@ -123,23 +127,21 @@ def preprocess_function(examples, tokenizer): loaded_config = OVConfig.from_pretrained(tmp_dir) 
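A brief sketch, not from the patch: the `prepare_inputs` hunk above builds the first-step past-key-value placeholders with `np.empty` and the `OV_TO_NP_TYPE` mapping instead of an openvino `Tensor`. The essential property is a zero-sized sequence axis, so no past data is actually allocated (the layout and dtype below are hypothetical):

import numpy as np

# e.g. (batch_size, num_heads, past_sequence_length, head_dim) with no past yet
placeholder = np.empty((1, 12, 0, 64), dtype=np.float32)
print(placeholder.shape, placeholder.size)  # (1, 12, 0, 64) 0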
self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) + @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): model_id = MODEL_NAMES[model_name] task = model_cls.export_feature dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task] - if "gpt2" in model_id: - expected_int8 -= 1 def preprocess_function(examples, tokenizer): return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True) with tempfile.TemporaryDirectory() as tmp_dir: - transformers_model = model_cls.from_pretrained(model_id, export=True) + ov_model = model_cls.from_pretrained(model_id, export=True) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) + quantizer = OVQuantizer.from_pretrained(ov_model, task=task) calibration_dataset = quantizer.get_calibration_dataset( dataset_name, From 7b4e50f15f2facf08b52f710d1f6b56b6065b7f8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 27 May 2024 14:03:53 +0200 Subject: [PATCH 36/47] Limit ITREX version for WOQ (#729) * remove latest ITREX release compatibility * update workflow --- .github/workflows/test_inc.yml | 2 +- optimum/intel/neural_compressor/quantization.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 6435d0b71..e3a7518a6 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -34,7 +34,7 @@ jobs: pip install py-cpuinfo pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] - pip install intel-extension-for-transformers + pip install intel-extension-for-transformers==1.4.1 pip install peft - name: Test with Pytest diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 9ee436593..57bc3ae7a 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -79,12 +79,15 @@ ) +_ITREX_EXCLUDED_VERSION = "1.4.2" + if is_itrex_available(): if is_itrex_version("<", ITREX_MINIMUM_VERSION): raise ImportError( f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_itrex_version}, " f"but only version {ITREX_MINIMUM_VERSION} or higher is supported." ) + from intel_extension_for_transformers.transformers.llm.quantization.utils import convert_to_quantized_model from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit from intel_extension_for_transformers.transformers.utils.config import ( @@ -226,6 +229,12 @@ def quantize( # ITREX Weight Only Quantization if not isinstance(quantization_config, PostTrainingQuantConfig): + if is_itrex_version("==", _ITREX_EXCLUDED_VERSION): + raise ImportError( + f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_itrex_version}, " + f"but {_ITREX_EXCLUDED_VERSION} is not compatible." 
+ ) + # check neural-compressor version if is_neural_compressor_version("<", NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION): raise ImportError( From bfd0767c53b2567810e676fd7e4228fb37f984e5 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 28 May 2024 17:34:59 +0200 Subject: [PATCH 37/47] Fix itrex WOQ model loading (#730) * Fix loading ITREX model * add test * fix loading WOQ and quantization config * add test * add revision and subfolder parameters when loading inc config * style * update test model id --- .../intel/neural_compressor/modeling_base.py | 115 ++++++++++++------ optimum/intel/neural_compressor/utils.py | 1 + tests/neural_compressor/test_modeling.py | 63 +++++++++- tests/neural_compressor/test_optimization.py | 1 - tests/neural_compressor/utils_tests.py | 2 +- 5 files changed, 143 insertions(+), 39 deletions(-) diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index c6d5e7bac..bb3d2fe8c 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -22,6 +22,7 @@ import torch from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import EntryNotFoundError from neural_compressor.utils.pytorch import load from transformers import ( AutoConfig, @@ -40,6 +41,7 @@ ) from transformers.modeling_utils import no_init_weights from transformers.models.auto.auto_factory import _get_model_class +from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME from transformers.utils.generic import ContextManagers from optimum.intel.generation import BaseModelForCausalLM @@ -47,7 +49,7 @@ from ...modeling_base import OptimizedModel from ..utils.import_utils import _torch_version, is_itrex_available, is_torch_version from .configuration import INCConfig -from .utils import WEIGHTS_NAME +from .utils import QUANTIZATION_CONFIG_NAME logger = logging.getLogger(__name__) @@ -119,33 +121,70 @@ def _from_pretrained( raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") token = use_auth_token - model_name_or_path = kwargs.pop("model_name_or_path", None) - if model_name_or_path is not None: - logger.warning("`model_name_or_path` is deprecated please use `model_id`") - model_id = model_id or model_name_or_path - model_path = Path(model_id) - - if model_path.is_dir(): - model_cache_path = model_path / file_name + is_local = model_path.is_dir() + model_cache_path = None + inc_config = None + msg = None + if is_local: + if (model_path / subfolder / SAFE_WEIGHTS_NAME).is_file(): + file_name = SAFE_WEIGHTS_NAME + elif not (model_path / subfolder / file_name).is_file(): + raise EnvironmentError( + f"Error no file named {SAFE_WEIGHTS_NAME} or {file_name} found in directory {model_path / subfolder}" + ) + model_cache_path = model_path / subfolder / file_name else: - model_cache_path = hf_hub_download( - repo_id=model_id, - filename=file_name, - subfolder=subfolder, - token=token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) + # Try download safetensors if exist + try: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=SAFE_WEIGHTS_NAME, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + except EntryNotFoundError: + pass + + if 
model_cache_path is None: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=file_name, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) model_save_dir = Path(model_cache_path).parent - inc_config = None - msg = None + if is_itrex_available(): - try: - quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json") + quantization_config_path = None + if is_local: + quantization_config_path = model_path / subfolder / QUANTIZATION_CONFIG_NAME + else: + try: + quantization_config_path = hf_hub_download( + repo_id=model_id, + filename=QUANTIZATION_CONFIG_NAME, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + except EntryNotFoundError: + pass + + if quantization_config_path and Path(quantization_config_path).is_file(): + quantization_config = PretrainedConfig.from_pretrained(quantization_config_path) algorithm = getattr(quantization_config, "quant_method", None) if algorithm in {"rtn", "gptq", "awq", "autoround"}: from intel_extension_for_transformers.transformers.modeling.modeling_auto import ( @@ -154,7 +193,7 @@ def _from_pretrained( _BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class - return _BaseQBitsAutoModelClass.from_pretrained( + model = _BaseQBitsAutoModelClass.from_pretrained( pretrained_model_name_or_path=model_id, token=token, revision=revision, @@ -163,12 +202,16 @@ def _from_pretrained( local_files_only=local_files_only, subfolder=subfolder, trust_remote_code=trust_remote_code, + use_neural_speed=False, **kwargs, ) - except EnvironmentError: - msg = "The model is not quantized with weight-only quantization." + + return cls( + model, config=config, model_save_dir=model_save_dir, q_config=quantization_config, **kwargs + ) + try: - inc_config = INCConfig.from_pretrained(model_id) + inc_config = INCConfig.from_pretrained(model_id, subfolder=subfolder, revision=revision) if not is_torch_version("==", inc_config.torch_version): msg = f"Quantized model was obtained with torch version {inc_config.torch_version} but {_torch_version} was found." 
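Stripped to a sketch (not part of the patch): the checkpoint lookup added to `_from_pretrained` above prefers safetensors and only falls back to the torch pickle file when the Hub reports `EntryNotFoundError`. Without the subfolder/revision/caching plumbing, the order amounts to the following; the `resolve_checkpoint` helper and the repo id are hypothetical:

from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME

def resolve_checkpoint(repo_id: str) -> str:
    try:
        # prefer model.safetensors when the repository provides it
        return hf_hub_download(repo_id=repo_id, filename=SAFE_WEIGHTS_NAME)
    except EntryNotFoundError:
        # otherwise fall back to pytorch_model.bin, as the patch does via `file_name`
        return hf_hub_download(repo_id=repo_id, filename=WEIGHTS_NAME)

# path = resolve_checkpoint("some-org/some-quantized-model")  # hypothetical repo id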
logger.warning(f"{msg}") @@ -209,15 +252,19 @@ def _from_pretrained( ) def _save_pretrained(self, save_directory: Union[str, Path]): - output_path = os.path.join(save_directory, WEIGHTS_NAME) - if isinstance(self.model, torch.nn.Module): - state_dict = self.model.state_dict() - if self._q_config: - state_dict["best_configure"] = self._q_config - torch.save(state_dict, output_path) + # For ITREX model + if isinstance(self._q_config, PretrainedConfig): + self._q_config.to_json_file(os.path.join(save_directory, QUANTIZATION_CONFIG_NAME)) + self.model.save_pretrained(save_directory) + # For INC model the state dictionary needs to be modified to include the quantization parameters + else: + state_dict = self.model.state_dict() + if isinstance(self._q_config, dict): + state_dict["best_configure"] = self._q_config + torch.save(state_dict, os.path.join(save_directory, WEIGHTS_NAME)) else: - torch.jit.save(self.model, output_path) + torch.jit.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) if self.inc_config: self.inc_config.save_pretrained(save_directory) diff --git a/optimum/intel/neural_compressor/utils.py b/optimum/intel/neural_compressor/utils.py index 3173f5e1c..84c1d6dc2 100644 --- a/optimum/intel/neural_compressor/utils.py +++ b/optimum/intel/neural_compressor/utils.py @@ -28,6 +28,7 @@ CONFIG_NAME = "best_configure.yaml" +QUANTIZATION_CONFIG_NAME = "quantize_config.json" NEURAL_COMPRESSOR_MINIMUM_VERSION = "2.1.0" NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION = "2.3.0" diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py index e6ce4763f..0c3e60969 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -16,10 +16,12 @@ import os import tempfile import unittest +from pathlib import Path import torch from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed +from transformers.utils import SAFE_WEIGHTS_NAME from optimum.exporters import TasksManager from optimum.intel import ( # noqa @@ -37,7 +39,8 @@ INCStableDiffusionPipeline, INCTrainer, ) -from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME +from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, QUANTIZATION_CONFIG_NAME, WEIGHTS_NAME +from optimum.intel.utils.import_utils import is_itrex_available os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -52,7 +55,7 @@ MODEL_NAMES_TO_TASK = ( - ("hf-internal-testing/tiny-random-gpt2", "text-generation"), + ("hf-internal-testing/tiny-random-GPT2LMHeadModel", "text-generation"), ("hf-internal-testing/tiny-random-BertForMaskedLM", "fill-mask"), ("hf-internal-testing/tiny-random-DistilBertForSequenceClassification", "text-classification"), ("hf-internal-testing/tiny-random-DebertaV2Model", "feature-extraction"), @@ -86,7 +89,7 @@ def test_compare_to_transformers(self, model_id, task): outputs = inc_model(**model_inputs) with tempfile.TemporaryDirectory() as tmpdirname: inc_model.save_pretrained(tmpdirname) - loaded_model = model_class.from_pretrained(tmpdirname, file_name=WEIGHTS_NAME) + loaded_model = model_class.from_pretrained(tmpdirname) outputs_loaded = loaded_model(**model_inputs) if task == "feature-extraction": @@ -143,3 +146,57 @@ def test_compare_with_and_without_past_key_values(self): self.assertEqual(outputs_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_without_pkv.shape[1], self.GENERATION_LENGTH) self.assertTrue(torch.equal(outputs_with_pkv, outputs_without_pkv)) + + 
@unittest.skipIf(not is_itrex_available(), reason="ITREX not available") + def test_saving_loading_woq_itrex_model(self): + model_name = "echarlaix/tiny-random-PhiForCausalLM" + subfolder = "itrex" + model = INCModelForCausalLM.from_pretrained(model_name, revision="itrex", subfolder=subfolder) + tokenizer = AutoTokenizer.from_pretrained(model_name, revision="itrex") + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + tokens = tokenizer("This is a sample output", return_tensors="pt") + + with tempfile.TemporaryDirectory() as tmp_dir: + model_save_dir = Path(tmp_dir) / subfolder + model.save_pretrained(model_save_dir) + folder_contents = os.listdir(model_save_dir) + self.assertIn(SAFE_WEIGHTS_NAME, folder_contents) + self.assertIn(QUANTIZATION_CONFIG_NAME, folder_contents) + loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir, subfolder=subfolder) + + with torch.no_grad(): + outputs = model(**tokens) + loaded_outputs = loaded_model(**tokens) + + self.assertTrue("logits" in loaded_outputs) + self.assertIsInstance(loaded_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in loaded_outputs) + self.assertIsInstance(loaded_outputs.past_key_values, tuple) + self.assertTrue(torch.allclose(outputs.logits, loaded_outputs.logits, atol=1e-5)) + + def test_saving_loading_inc_model(self): + model_name = "echarlaix/tiny-random-PhiForCausalLM" + subfolder = "inc" + model = INCModelForCausalLM.from_pretrained(model_name, revision="inc", subfolder=subfolder) + tokenizer = AutoTokenizer.from_pretrained(model_name, revision="inc") + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + tokens = tokenizer("This is a sample output", return_tensors="pt") + + with tempfile.TemporaryDirectory() as tmp_dir: + model_save_dir = Path(tmp_dir) / subfolder + model.save_pretrained(model_save_dir) + folder_contents = os.listdir(model_save_dir) + self.assertIn(WEIGHTS_NAME, folder_contents) + self.assertIn("inc_config.json", folder_contents) + loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir, subfolder=subfolder) + self.assertIsInstance(loaded_model.inc_config, INCConfig) + + with torch.no_grad(): + outputs = model(**tokens) + loaded_outputs = loaded_model(**tokens) + + self.assertTrue("logits" in loaded_outputs) + self.assertIsInstance(loaded_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in loaded_outputs) + self.assertIsInstance(loaded_outputs.past_key_values, tuple) + self.assertTrue(torch.allclose(outputs.logits, loaded_outputs.logits, atol=1e-5)) diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index da4258613..56f2a5bac 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -47,7 +47,6 @@ from utils_tests import MODEL_NAMES, SEED, INCTestMixin, _generate_dataset from optimum.intel.utils.import_utils import is_torch_version, is_itrex_available - from optimum.intel import ( INCConfig, INCModelForCausalLM, diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py index a6d09954f..210623758 100644 --- a/tests/neural_compressor/utils_tests.py +++ b/tests/neural_compressor/utils_tests.py @@ -81,7 +81,7 @@ "electra": "hf-internal-testing/tiny-random-electra", "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt2": "hf-internal-testing/tiny-random-GPT2LMHeadModel", "gpt_neo": 
"hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", From aefabf0b443ef485c6cb8e1e8a51b8c62625739d Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 29 May 2024 10:23:48 +0200 Subject: [PATCH 38/47] Enable ITREX v1.4.2 for torch 2.3.0+cpu (#733) * Enable ITREX v1.4.2 for specific torch version * fix * fix style * update itrex version * fix * fix warning --- .github/workflows/test_inc.yml | 5 ++--- optimum/intel/neural_compressor/quantization.py | 11 ++++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index e3a7518a6..81d102bc0 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,9 +32,9 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] - pip install intel-extension-for-transformers==1.4.1 + pip install intel-extension-for-transformers pip install peft - name: Test with Pytest @@ -43,7 +43,6 @@ jobs: - name: Test IPEX run: | pip uninstall -y intel-extension-for-transformers - pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --extra-index-url https://download.pytorch.org/whl/cpu pip install intel-extension-for-pytorch==2.3.0 pytest tests/neural_compressor/test_ipex.py diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 57bc3ae7a..500478712 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -29,6 +29,7 @@ from neural_compressor.model.onnx_model import ONNXModel from neural_compressor.model.torch_model import IPEXModel, PyTorchModel from neural_compressor.quantization import fit +from packaging.version import parse from torch.utils.data import DataLoader, RandomSampler from transformers import ( DataCollator, @@ -79,8 +80,6 @@ ) -_ITREX_EXCLUDED_VERSION = "1.4.2" - if is_itrex_available(): if is_itrex_version("<", ITREX_MINIMUM_VERSION): raise ImportError( @@ -229,10 +228,12 @@ def quantize( # ITREX Weight Only Quantization if not isinstance(quantization_config, PostTrainingQuantConfig): - if is_itrex_version("==", _ITREX_EXCLUDED_VERSION): + if is_itrex_version("==", "1.4.2") and ( + is_torch_version("!=", "2.3.0") or parse(_torch_version).local != "cpu" + ): raise ImportError( - f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_itrex_version}, " - f"but {_ITREX_EXCLUDED_VERSION} is not compatible." + f"Found an incompatible version of `intel-extension-for-transformers` and `torch`. Found version itrex {_itrex_version} and torch {_torch_version}, " + f"but only torch 2.3.0+cpu is compatible with ITREX v1.4.2." 
) # check neural-compressor version From ca05db034cfb961cbf6efa8137b2c4a7749911db Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 29 May 2024 16:32:57 +0200 Subject: [PATCH 39/47] Bump transformers version (#724) * Bump transformers version * fix default ignored scope for model using sdpa by default * fix quant ops test * update setup * add ops quant num gpt2 * fix expected ops quant in test * update optimum version --- .../configs/swin-base-jpqd.json | 2 - .../configs/bert-base-jpqd.json | 2 - .../configs/bert-base-jpqd.json | 2 - optimum/intel/openvino/trainer.py | 2 - setup.py | 4 +- tests/openvino/test_modeling.py | 2 +- tests/openvino/test_quantization.py | 4 +- tests/openvino/test_training.py | 38 ++++++++++--------- 8 files changed, 25 insertions(+), 31 deletions(-) diff --git a/examples/openvino/image-classification/configs/swin-base-jpqd.json b/examples/openvino/image-classification/configs/swin-base-jpqd.json index 3f03c276a..23b2fd3d8 100644 --- a/examples/openvino/image-classification/configs/swin-base-jpqd.json +++ b/examples/openvino/image-classification/configs/swin-base-jpqd.json @@ -36,8 +36,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/examples/openvino/question-answering/configs/bert-base-jpqd.json b/examples/openvino/question-answering/configs/bert-base-jpqd.json index 425bd9f31..342d327a3 100644 --- a/examples/openvino/question-answering/configs/bert-base-jpqd.json +++ b/examples/openvino/question-answering/configs/bert-base-jpqd.json @@ -36,8 +36,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/examples/openvino/text-classification/configs/bert-base-jpqd.json b/examples/openvino/text-classification/configs/bert-base-jpqd.json index 25c8f2886..d177e4efd 100644 --- a/examples/openvino/text-classification/configs/bert-base-jpqd.json +++ b/examples/openvino/text-classification/configs/bert-base-jpqd.json @@ -40,8 +40,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index c8b29800f..0a1f5209a 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -153,8 +153,6 @@ "{re}.*Embedding.*", "{re}.*add___.*", "{re}.*layer_norm_.*", - "{re}.*matmul_1", - "{re}.*__truediv__.*", ], } diff --git a/setup.py b/setup.py index 251ec61cd..d00ce1dd9 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "transformers>=4.36.0,<4.41.0", - "optimum~=1.19", + "transformers>=4.36.0,<4.42.0", + "optimum~=1.20", "datasets>=1.4.0", "sentencepiece", "scipy", diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index cb5ac52ed..1e18fb066 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1679,7 +1679,7 @@ def test_compare_output_attentions(self, model_arch): preprocessor = AutoFeatureExtractor.from_pretrained(model_id) inputs = preprocessor(images=image, return_tensors="pt") - transformers_model = AutoModelForImageClassification.from_pretrained(model_id) + transformers_model = AutoModelForImageClassification.from_pretrained(model_id, attn_implementation="eager") transformers_model.eval() with torch.no_grad(): transformers_outputs = transformers_model(**inputs, output_attentions=True) diff --git 
a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 09b395ea1..b7ed36d3e 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -74,7 +74,7 @@ class OVQuantizerTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_TORCH_MODEL = ( - (OVModelForSequenceClassification, "bert", 32, 35), + (OVModelForSequenceClassification, "bert", 22, 35), (OVModelForCausalLM, "gpt2", 41, 3), ) SUPPORTED_ARCHITECTURES_OV_MODEL = ( @@ -665,7 +665,7 @@ def preprocess_function(examples, tokenizer): class OVTrainerTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 49, 38),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 67, 38),) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8): diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index c998d00d8..89d644319 100644 --- a/tests/openvino/test_training.py +++ b/tests/openvino/test_training.py @@ -322,7 +322,7 @@ def tearDown(self): "default_quantization": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss"], ), @@ -330,14 +330,14 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], ), "customized_quantization": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss"], ), @@ -345,7 +345,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], ), @@ -365,7 +365,7 @@ def tearDown(self): "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -376,7 +376,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -385,7 +385,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ 
-397,7 +397,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -418,7 +418,7 @@ def tearDown(self): "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -429,7 +429,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -438,7 +438,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -450,7 +450,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -553,7 +553,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, compression_metrics=["compression_loss"], ), @@ -572,7 +572,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss"], @@ -580,7 +580,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss"], @@ -589,7 +589,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): model_id="yujiepan/tiny-random-swin-patch4-window7-224", teacher_model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -598,7 +598,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): 
model_id="yujiepan/tiny-random-swin-patch4-window7-224", teacher_model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -797,7 +797,9 @@ def prepare_model_and_dataset(self, desc: OVTrainerTestDescriptor): self.feature_extractor = AutoFeatureExtractor.from_pretrained(desc.model_id) self.tokenizer = self.feature_extractor - self.model = AutoModelForAudioClassification.from_pretrained(desc.model_id, num_labels=self.num_labels) + self.model = AutoModelForAudioClassification.from_pretrained( + desc.model_id, num_labels=self.num_labels, attn_implementation="eager" + ) self.teacher_model = None if desc.teacher_model_id: self.teacher_model = AutoModelForAudioClassification.from_pretrained( From 70c1475d24a1ff85838e37b1b5703a08917fe267 Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Wed, 29 May 2024 16:37:53 +0200 Subject: [PATCH 40/47] Update default 4bit configs (#702) * Update configuration.py * Update configuration.py * fix style * fix style * Fix stabilityai/stablelm-zephyr-3b id Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/openvino/configuration.py | 28 +++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 30dfe5ae6..eb233f3d1 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -33,18 +33,25 @@ logger = logging.getLogger(__name__) _DEFAULT_4BIT_CONFIGS = { - "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, + "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64}, "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8}, "bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6}, "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128}, - "HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6}, - "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6}, + "HuggingFaceH4/zephyr-7b-beta": { + "bits": 4, + "sym": True, + "group_size": 128, + "ratio": 0.8, + "dataset": "wikitext2", + "awq": True, + }, + "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7}, "meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, "meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, "stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, "stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, - "stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8}, + "stabilityai/stablelm-zephyr-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 1.0}, "pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72}, "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, 
"ratio": 0.6}, @@ -52,6 +59,19 @@ "tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, "psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, "mistralai/Mixtral-8x7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, + "facebook/opt-2.7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7}, + "togethercomputer/RedPajama-INCITE-Chat-3B-v1": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "lmsys/vicuna-7b-v1.5": {"bits": 4, "sym": False, "group_size": 128, "ratio": 1.0}, + "stabilityai/stablelm-tuned-alpha-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "mistralai/Mistral-7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.9}, + "baichuan-inc/Baichuan2-7B-Chat": { + "bits": 4, + "sym": True, + "group_size": 128, + "ratio": 0.8, + "dataset": "wikitext2", + "awq": True, + }, } From ebc2d3a268c321ab20c3316271b2a11cbdf9bacb Mon Sep 17 00:00:00 2001 From: rbrugaro Date: Thu, 30 May 2024 06:11:31 -0700 Subject: [PATCH 41/47] fix accelerator default to ipex not ort (#737) --- optimum/intel/pipelines/pipeline_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 65e6cfb78..a7dba9310 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -150,7 +150,7 @@ def pipeline( feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, - accelerator: Optional[str] = "ort", + accelerator: Optional[str] = "ipex", revision: Optional[str] = None, trust_remote_code: Optional[bool] = None, torch_dtype: Optional[Union[str, torch.dtype]] = None, From 5dfbcbc47eb301e758000018fc5d5bb7446e18c6 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 30 May 2024 18:29:17 +0200 Subject: [PATCH 42/47] Remove default pipeline accelerator (#739) --- optimum/intel/pipelines/pipeline_base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index a7dba9310..a6c6a36b0 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -150,7 +150,7 @@ def pipeline( feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, - accelerator: Optional[str] = "ipex", + accelerator: Optional[str] = None, revision: Optional[str] = None, trust_remote_code: Optional[bool] = None, torch_dtype: Optional[Union[str, torch.dtype]] = None, @@ -226,9 +226,12 @@ def pipeline( ) if accelerator not in MAPPING_LOADING_FUNC: - raise ValueError( - f'Accelerator {accelerator} is not supported. Supported accelerator is {", ".join(MAPPING_LOADING_FUNC)}.' - ) + if accelerator is None: + msg = "Impossible to instantiate a pipeline without specifying an `accelerator`." + else: + msg = f"`accelerator` {accelerator} is not supported." 
+ + raise ValueError(msg + f" Supported list of `accelerator` is : {', '.join(MAPPING_LOADING_FUNC)}.") if accelerator == "ipex": if task not in list(IPEX_SUPPORTED_TASKS.keys()): From b21d14d675d893e0d8a4481cf5c51013eba56422 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 30 May 2024 18:51:56 +0200 Subject: [PATCH 43/47] Fix bloom generation (#736) * Fix bloom generation * remove unused variable * add style * add message error * update model id --- optimum/intel/openvino/modeling_decoder.py | 6 +++--- tests/openvino/test_modeling.py | 6 +++--- tests/openvino/utils_tests.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 72cd1b648..fe7cf14c1 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -587,11 +587,11 @@ def _deduplicate_inputs(self, model_inputs: Dict): ) for input_name, input_tensor in model_inputs.items(): if input_name not in ["input_ids", "beam_idx"]: - if not isinstance(input_tensor, Tensor): + if input_name not in self.key_value_input_names: upd_model_inputs[input_name] = input_tensor[indicies] else: - shape = input_tensor.shape - dtype = input_tensor.element_type + shape = input_tensor.shape if isinstance(input_tensor, Tensor) else list(input_tensor.shape) + dtype = input_tensor.element_type if isinstance(input_tensor, Tensor) else Type(input_tensor.dtype) upd_batch_size = indicies.shape[0] if self.config.model_type == "bloom": upd_batch_size *= self.config.num_attention_heads diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 1e18fb066..0cb332276 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -879,14 +879,14 @@ def test_beam_search(self, model_arch): ov_model_stateless.config.eos_token_id = None transformers_model.config.eos_token_id = None - for gen_config in gen_configs: + for idx, gen_config in enumerate(gen_configs): if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]: continue transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) - self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs)) + self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs), f"generation config : {idx}") ov_stateless_outputs = ov_model_stateless.generate(**tokens, generation_config=gen_config) - self.assertTrue(torch.allclose(ov_stateless_outputs, transformers_outputs)) + self.assertTrue(torch.allclose(ov_stateless_outputs, transformers_outputs), f"generation config : {idx}") class OVModelForMaskedLMIntegrationTest(unittest.TestCase): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 91500cfc6..0789f1983 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -65,7 +65,7 @@ "internlm2": "katuni4ka/tiny-random-internlm2", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", - "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama": "HuggingFaceM4/tiny-random-LlamaForCausalLM", "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM", "llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", From 683133f617a2fd0bfa4d05739b3a99fe7e7557e1 Mon Sep 17 00:00:00 2001 
From: Ella Charlaix Date: Thu, 30 May 2024 19:31:04 +0200 Subject: [PATCH 44/47] Dev version --- optimum/intel/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/version.py b/optimum/intel/version.py index 9668d6215..a2a857944 100644 --- a/optimum/intel/version.py +++ b/optimum/intel/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.17.0.dev0" +__version__ = "1.18.0.dev0" From 813d7c0fdbe0013f2ca249aecc06fac668d271c3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 31 May 2024 14:39:53 +0200 Subject: [PATCH 45/47] Add custom model export test (#677) * Add custom model export test * format --- setup.py | 2 +- tests/openvino/test_export.py | 29 +++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index d00ce1dd9..02d7f2845 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "transformers_stream_generator", "einops", "tiktoken", - "sentence_transformers", + "sentence-transformers", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 851f8355f..8f61d9a36 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -18,8 +18,10 @@ from tempfile import TemporaryDirectory from typing import Optional +import torch from parameterized import parameterized -from transformers import AutoConfig +from sentence_transformers import SentenceTransformer, models +from transformers import AutoConfig, AutoTokenizer from utils_tests import MODEL_NAMES from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED @@ -124,7 +126,7 @@ def test_export(self, model_type: str): class CustomExportModelTest(unittest.TestCase): - def test_export_custom_model(self): + def test_custom_export_config_model(self): class BertOnnxConfigWithPooler(BertOnnxConfig): @property def outputs(self): @@ -157,3 +159,26 @@ def outputs(self): self.assertIsInstance(ov_model, OVBaseModel) self.assertTrue(ov_model.output_names == {"last_hidden_state": 0, "pooler_output": 1}) + + def test_export_custom_model(self): + model_id = "hf-internal-testing/tiny-random-BertModel" + word_embedding_model = models.Transformer(model_id, max_seq_length=256) + pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) + dense_model = models.Dense( + in_features=pooling_model.get_sentence_embedding_dimension(), + out_features=256, + ) + model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model]) + + with TemporaryDirectory() as tmpdirname: + export_from_model(model, output=tmpdirname, task="feature-extraction") + ov_model = OVModelForCustomTasks.from_pretrained(tmpdirname) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer("This is a sample input", return_tensors="pt") + with torch.no_grad(): + model_outputs = model(tokens) + + ov_outputs = ov_model(**tokens) + self.assertTrue(torch.allclose(ov_outputs.token_embeddings, model_outputs.token_embeddings, atol=1e-4)) + self.assertTrue(torch.allclose(ov_outputs.sentence_embedding, model_outputs.sentence_embedding, atol=1e-4)) From 6529306738d829bd2c62d9cfaab19615d4b96ab4 Mon Sep 17 00:00:00 2001 From: Nicolas Oliver Date: Mon, 3 Jun 2024 05:55:35 -0700 Subject: [PATCH 46/47] Update NNCF quantization notebook (#715) * Update quantized_generation_demo.ipynb * Update 
quantized_generation_demo.ipynb --- notebooks/openvino/quantized_generation_demo.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index 767106408..5673243cb 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ b/notebooks/openvino/quantized_generation_demo.ipynb @@ -32,7 +32,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install optimum[openvino,nncf] torch" + "# ! pip install optimum[openvino,nncf] torch==2.2.2" ] }, { From 096d94b9933d76a661d342cdcd42cae75dbda5ef Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 3 Jun 2024 18:40:15 +0200 Subject: [PATCH 47/47] Add gaudi section (#699) --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 49f0d7976..0226b5d47 100644 --- a/README.md +++ b/README.md @@ -239,3 +239,8 @@ Do not forget to install requirements for every example: cd pip install -r requirements.txt ``` + + +## Gaudi + +To train your model on [Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html), check out [Optimum Habana](https://github.com/huggingface/optimum-habana) which provides a set of tools enabling easy model loading, training and inference on single- and multi-HPU settings for different downstream tasks. After training your model, feel free to submit it to the Intel [leaderboard](https://huggingface.co/spaces/Intel/powered_by_intel_llm_leaderboard) which is designed to evaluate, score, and rank open-source LLMs that have been pre-trained or fine-tuned on Intel Hardwares. Models submitted to the leaderboard will be evaluated on the Intel Developer Cloud. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from the Eleuther AI Language Model Evaluation Harness.
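
Patches 41/47 and 42/47 together change how the `accelerator` argument of the `pipeline` factory in `optimum/intel/pipelines/pipeline_base.py` is resolved: the `"ort"` default first becomes `"ipex"` and is then dropped entirely, so callers are now expected to name a backend explicitly and otherwise get a `ValueError`. A minimal usage sketch of the resulting behaviour, assuming the public `from optimum.intel.pipelines import pipeline` import path, an installed `intel-extension-for-pytorch`, and a placeholder model ID (none of these are taken from the diffs themselves):

```python
# Sketch only: illustrates the explicit `accelerator` argument that patches
# 41-42 make mandatory. Import path, model ID and generation kwargs are
# assumptions for illustration, not part of the patch series.
from optimum.intel.pipelines import pipeline

pipe = pipeline(
    "text-generation",
    model="gpt2",        # hypothetical checkpoint; any causal LM ID would do
    accelerator="ipex",  # no default any more; leaving it unset raises ValueError
)

print(pipe("OpenVINO and IPEX make Intel inference", max_new_tokens=16))
```

The `accelerator not in MAPPING_LOADING_FUNC` check introduced in patch 42 is what turns a missing value into the explicit error message above rather than a silent fallback to a particular backend.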