From fa1bc56f151f5e50f19a0b856eba83cd822ce7be Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 30 Apr 2024 15:12:25 +0200 Subject: [PATCH 01/47] Proper datasets.Dataset importing --- optimum/intel/openvino/quantization.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 86e473fd1..d4889c561 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -21,7 +21,6 @@ from pathlib import Path from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union -import datasets import nncf import openvino import torch @@ -62,6 +61,8 @@ if is_datasets_available(): from datasets import Dataset +else: + Dataset = None register_module(ignored_algorithms=[])(Conv1D) @@ -318,7 +319,7 @@ def _quantize_ovbasemodel( self, ov_config: OVConfig, save_directory: Union[str, Path] = None, - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, @@ -358,7 +359,7 @@ def _quantize_ovbasemodel( if isinstance(calibration_dataset, nncf.Dataset): quantization_dataset = calibration_dataset - elif isinstance(calibration_dataset, datasets.Dataset): + elif Dataset is not None and isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, @@ -411,7 +412,7 @@ def _quantize_torchmodel( self, ov_config: OVConfig, save_directory: Union[str, Path], - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, file_name: Optional[str] = None, batch_size: int = 1, data_collator: Optional[DataCollator] = None, @@ -482,7 +483,7 @@ def _quantize_torchmodel( if isinstance(calibration_dataset, nncf.Dataset): quantization_dataset = calibration_dataset - elif isinstance(calibration_dataset, datasets.Dataset): + elif isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, @@ -567,7 +568,7 @@ def get_calibration_dataset( use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, cache_dir: str = HUGGINGFACE_HUB_CACHE, - ) -> datasets.Dataset: + ) -> "Dataset": """ Create the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -671,7 +672,7 @@ def _weight_only_quantization( ) dataset = None if calibration_dataset is not None: - if isinstance(calibration_dataset, datasets.Dataset): + if Dataset is not None and isinstance(calibration_dataset, Dataset): raise ValueError( "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " "quantization is not supported. 
Please provide it as `nncf.Dataset` or as iterable of " From d02e281f2f94c791890f99277d0926e2ae7810e0 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 14:23:57 +0100 Subject: [PATCH 02/47] OV Tokenizer Leftovers - Support SD Pipeline Slow Tokenizer Conversion - Support SD Mixed Quantization - Move Converted OV Tokenizers to a Separate Folder --- optimum/commands/export/openvino.py | 9 +++++++++ optimum/exporters/openvino/__main__.py | 7 ++++--- optimum/exporters/openvino/convert.py | 11 ++++++----- optimum/intel/openvino/utils.py | 24 ++++++++++++++++++++++++ tests/openvino/test_exporters_cli.py | 4 ++-- 5 files changed, 45 insertions(+), 10 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 4fed3f6f8..c225c50d7 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -21,6 +21,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager +from ...exporters.openvino.convert import export_tokenizer from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -261,6 +262,14 @@ def run(self): ) model.save_pretrained(self.args.output) + output = Path(self.args.output) + tokenizer = getattr(model, "tokenizer", None) + if tokenizer is not None: + export_tokenizer(tokenizer, output / "tokenizer") + + tokenizer_2 = getattr(model, "tokenizer_2", None) + if tokenizer_2 is not None: + export_tokenizer(tokenizer_2, output / "tokenizer_2") else: if self.args.convert_tokenizer: logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 8908c430b..3fa4fb0eb 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -25,6 +25,7 @@ from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model, export_tokenizer +from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -364,7 +365,7 @@ class StoreAttr(object): if tokenizer is not None: try: - export_tokenizer(tokenizer, output) + export_tokenizer(tokenizer, output / OV_TOKENIZER_FLOLDER) except Exception as exception: logger.warning( "Could not load tokenizer using specified model ID or path. 
OpenVINO tokenizer/detokenizer " @@ -373,11 +374,11 @@ class StoreAttr(object): else: tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: - export_tokenizer(tokenizer, output) + export_tokenizer(tokenizer, output / "tokenizer") tokenizer_2 = getattr(model, "tokenizer_2", None) if tokenizer_2 is not None: - export_tokenizer(tokenizer_2, output, suffix="_2") + export_tokenizer(tokenizer_2, output / "tokenizer_2") elif convert_tokenizer and not is_openvino_tokenizers_available(): logger.warning("Tokenizer won't be converted.") diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 6c86c2c2d..bb781a690 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -667,20 +667,21 @@ def export_tokenizer( output: Union[str, Path], suffix: Optional[str] = "", ): - from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME # avoid circular imports + # avoid circular imports + from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME + from optimum.intel.openvino.utils import maybe_convert_tokenizer_to_fast try: from openvino_tokenizers import convert_tokenizer except ModuleNotFoundError: - # avoid this message before tokenizers are part of the openvino dependencies - # logger.info( - # "Run `pip install openvino-tokenizers[transformers]` to get OpenVINO tokenizer/detokenizer models." - # ) return if not isinstance(output, Path): output = Path(output) + if output.exists(): + tokenizer = maybe_convert_tokenizer_to_fast(tokenizer, output) + try: converted = convert_tokenizer(tokenizer, with_detokenizer=True) except NotImplementedError: diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 4d1479f73..89994a7ac 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -17,10 +17,13 @@ import logging import os from glob import glob +from pathlib import Path +from typing import List, Union import numpy as np from huggingface_hub import model_info from openvino.runtime import Core, Type, properties +from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size @@ -31,6 +34,7 @@ OV_DECODER_NAME = "openvino_decoder_model.xml" OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml" +OV_TOKENIZER_FLOLDER = "openvino_tokenizer" OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml" OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml" @@ -107,6 +111,26 @@ } +NEED_CONVERT_TO_FAST_TOKENIZER: List[PreTrainedTokenizer] = [ + CLIPTokenizer, +] + + +def maybe_convert_tokenizer_to_fast( + hf_tokenizer: PreTrainedTokenizer, tokenizer_path: Path +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if isinstance(hf_tokenizer, PreTrainedTokenizerFast): + return hf_tokenizer + + if any(isinstance(type(hf_tokenizer), slow_class) for slow_class in NEED_CONVERT_TO_FAST_TOKENIZER): + try: + return AutoTokenizer.from_pretrained(tokenizer_path) + except Exception: + return hf_tokenizer + + return hf_tokenizer + + def use_external_data_format(num_parameters: int) -> bool: """ Returns whether or not the model requires using external data format for the ONNX export diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 09fad5d77..c91f28ba5 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -74,8 +74,8 @@ class 
OVCLIExportTestCase(unittest.TestCase): "wav2vec2": 0, # no tokenizer "bert": 1, # no detokenizer "blenderbot": 2, - "stable-diffusion": 0, # not supported - "stable-diffusion-xl": 0, # not supported + "stable-diffusion": 2, + "stable-diffusion-xl": 2, } SUPPORTED_SD_HYBRID_ARCHITECTURES = ( From 135c2e9b8f96e54b95baa7c626fc4be3fb0cdc08 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 15:01:49 +0100 Subject: [PATCH 03/47] Fix Circular Import --- optimum/commands/export/openvino.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index c225c50d7..5a6cfeb02 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -21,7 +21,6 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager -from ...exporters.openvino.convert import export_tokenizer from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -262,6 +261,9 @@ def run(self): ) model.save_pretrained(self.args.output) + # avoid circular import + from ...exporters.openvino.convert import export_tokenizer + output = Path(self.args.output) tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: From 1766570e78ca7a3f38e1d8d47326c6fb70e7ba7c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 15:12:20 +0100 Subject: [PATCH 04/47] Fix Circular Import --- optimum/commands/export/openvino.py | 4 +--- optimum/exporters/openvino/__main__.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 5a6cfeb02..c225c50d7 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -21,6 +21,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager +from ...exporters.openvino.convert import export_tokenizer from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -261,9 +262,6 @@ def run(self): ) model.save_pretrained(self.args.output) - # avoid circular import - from ...exporters.openvino.convert import export_tokenizer - output = Path(self.args.output) tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 3fa4fb0eb..41eb0200d 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -25,7 +25,6 @@ from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model, export_tokenizer -from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -356,6 +355,9 @@ class StoreAttr(object): **kwargs_shapes, ) + # avoid circular import + from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER + if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": tokenizer = next( From a1ee74970357e5c8ab2164d0bd381993cc035f35 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 17:07:30 +0100 Subject: [PATCH 
05/47] Fix Tests --- optimum/intel/openvino/utils.py | 4 ++-- tests/openvino/test_exporters_cli.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 89994a7ac..6b49f7a83 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -111,7 +111,7 @@ } -NEED_CONVERT_TO_FAST_TOKENIZER: List[PreTrainedTokenizer] = [ +NEED_CONVERT_TO_FAST_TOKENIZER: List[type(PreTrainedTokenizer)] = [ CLIPTokenizer, ] @@ -122,7 +122,7 @@ def maybe_convert_tokenizer_to_fast( if isinstance(hf_tokenizer, PreTrainedTokenizerFast): return hf_tokenizer - if any(isinstance(type(hf_tokenizer), slow_class) for slow_class in NEED_CONVERT_TO_FAST_TOKENIZER): + if any(isinstance(hf_tokenizer, slow_class) for slow_class in NEED_CONVERT_TO_FAST_TOKENIZER): try: return AutoTokenizer.from_pretrained(tokenizer_path) except Exception: diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index c91f28ba5..cac79abae 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -75,7 +75,7 @@ class OVCLIExportTestCase(unittest.TestCase): "bert": 1, # no detokenizer "blenderbot": 2, "stable-diffusion": 2, - "stable-diffusion-xl": 2, + "stable-diffusion-xl": 4, } SUPPORTED_SD_HYBRID_ARCHITECTURES = ( From ef9e5df7d23596a1b95a02698ab58d5e018634c0 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 18:40:37 +0100 Subject: [PATCH 06/47] Fix INC Tests --- optimum/commands/export/openvino.py | 4 +++- optimum/exporters/openvino/__main__.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index c225c50d7..a7302ef88 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -21,7 +21,6 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager -from ...exporters.openvino.convert import export_tokenizer from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available from ..base import BaseOptimumCLICommand, CommandInfo @@ -262,6 +261,9 @@ def run(self): ) model.save_pretrained(self.args.output) + # not export when using other exporters + from ...exporters.openvino.convert import export_tokenizer + output = Path(self.args.output) tokenizer = getattr(model, "tokenizer", None) if tokenizer is not None: diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 41eb0200d..234e34aa9 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -24,7 +24,7 @@ from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED -from optimum.exporters.openvino.convert import export_from_model, export_tokenizer +from optimum.exporters.openvino.convert import export_from_model from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version from optimum.utils.save_utils import maybe_load_preprocessors @@ -357,6 +357,8 @@ class StoreAttr(object): # avoid circular import from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER + # hide openvino import when using other exporters + from optimum.exporters.openvino.convert import export_tokenizer if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": From 
1f44ce9da35249f08276ac8affa56ec0d63ac503 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 2 May 2024 18:41:27 +0100 Subject: [PATCH 07/47] Make Style --- optimum/exporters/openvino/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 234e34aa9..a43c42e44 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -355,10 +355,10 @@ class StoreAttr(object): **kwargs_shapes, ) - # avoid circular import - from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER # hide openvino import when using other exporters + # avoid circular import from optimum.exporters.openvino.convert import export_tokenizer + from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": From ca30de156918069eb1af2d13bd2545a7f2b5a851 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 10:32:19 +0200 Subject: [PATCH 08/47] SD calibration dataset collection refactoring --- optimum/intel/openvino/configuration.py | 1 + optimum/intel/openvino/modeling_diffusion.py | 82 ++------ optimum/intel/openvino/quantization.py | 210 ++++++++++++++----- tests/openvino/test_quantization.py | 4 +- 4 files changed, 174 insertions(+), 123 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 5de672b70..30dfe5ae6 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -57,6 +57,7 @@ class OVQuantizationMethod(str, Enum): DEFAULT = "default" + HYBRID = "hybrid" @dataclass diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 2de7cb815..ae86ea2bf 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import copy import importlib import logging import os @@ -57,7 +57,7 @@ ) from ...exporters.openvino import main_export -from .configuration import OVConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVWeightQuantizationConfig, OVQuantizationMethod from .loaders import OVTextualInversionLoaderMixin from .modeling_base import OVBaseModel from .utils import ( @@ -300,13 +300,11 @@ def _from_pretrained( # load the UNet model uncompressed to apply hybrid quantization further unet = cls.load_model(unet_path) # Apply weights compression to other `components` without dataset - weight_quantization_params = { - param: value for param, value in quantization_config.__dict__.items() if param != "dataset" - } - weight_quantization_config = OVWeightQuantizationConfig.from_dict(weight_quantization_params) + quantization_config_without_dataset = copy.deepcopy(quantization_config) + quantization_config_without_dataset.dataset = None else: - weight_quantization_config = quantization_config - unet = cls.load_model(unet_path, weight_quantization_config) + quantization_config_without_dataset = quantization_config + unet = cls.load_model(unet_path, quantization_config_without_dataset) components = { "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, @@ -316,7 +314,7 @@ def _from_pretrained( } for key, value in components.items(): - components[key] = cls.load_model(value, weight_quantization_config) if value.is_file() else None + components[key] = cls.load_model(value, quantization_config_without_dataset) if value.is_file() else None if model_save_dir is None: model_save_dir = new_model_save_dir @@ -332,12 +330,14 @@ def _from_pretrained( if not isinstance(sd_model, supported_pipelines): raise NotImplementedError(f"Quantization in hybrid mode is not supported for {cls.__name__}") - nsamples = quantization_config.num_samples if quantization_config.num_samples else 200 - unet_inputs = sd_model._prepare_unet_inputs(quantization_config.dataset, nsamples) + from optimum.intel import OVQuantizer - from .quantization import _hybrid_quantization + quantizer = OVQuantizer(sd_model) + quantization_config_copy = copy.deepcopy(quantization_config) + quantization_config_copy.quant_method = OVQuantizationMethod.HYBRID + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy)) - unet = _hybrid_quantization(sd_model.unet.model, weight_quantization_config, dataset=unet_inputs) + return sd_model return cls( unet=unet, @@ -348,62 +348,6 @@ def _from_pretrained( **kwargs, ) - def _prepare_unet_inputs( - self, - dataset: Union[str, List[Any]], - num_samples: int, - height: Optional[int] = None, - width: Optional[int] = None, - seed: Optional[int] = 42, - **kwargs, - ) -> Dict[str, Any]: - self.compile() - - size = self.unet.config.get("sample_size", 64) * self.vae_scale_factor - height = height or min(size, 512) - width = width or min(size, 512) - - if isinstance(dataset, str): - dataset = deepcopy(dataset) - available_datasets = PREDEFINED_SD_DATASETS.keys() - if dataset not in available_datasets: - raise ValueError( - f"""You have entered a string value for dataset. 
You can only choose between - {list(available_datasets)}, but the {dataset} was found""" - ) - - from datasets import load_dataset - - dataset_metadata = PREDEFINED_SD_DATASETS[dataset] - dataset = load_dataset(dataset, split=dataset_metadata["split"], streaming=True).shuffle(seed=seed) - input_names = dataset_metadata["inputs"] - dataset = dataset.select_columns(list(input_names.values())) - - def transform_fn(data_item): - return {inp_name: data_item[column] for inp_name, column in input_names.items()} - - else: - - def transform_fn(data_item): - return data_item if isinstance(data_item, (list, dict)) else [data_item] - - from .quantization import InferRequestWrapper - - calibration_data = [] - self.unet.request = InferRequestWrapper(self.unet.request, calibration_data) - - for inputs in dataset: - inputs = transform_fn(inputs) - if isinstance(inputs, dict): - self.__call__(**inputs, height=height, width=width) - else: - self.__call__(*inputs, height=height, width=width) - if len(calibration_data) >= num_samples: - break - - self.unet.request = self.unet.request.request - return calibration_data[:num_samples] - @classmethod def _from_transformers( cls, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index d4889c561..f2258864a 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import collections.abc import copy import inspect import logging @@ -49,13 +50,14 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from ..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig +from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig, OVQuantizationMethod from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, MIN_ONNX_QDQ_OPSET, ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, + PREDEFINED_SD_DATASETS, ) @@ -201,7 +203,7 @@ def from_pretrained(cls, model: PreTrainedModel, **kwargs): def quantize( self, - calibration_dataset: Optional[Union[datasets.Dataset, nncf.Dataset, Iterable]] = None, + calibration_dataset: Optional[Union["Dataset", nncf.Dataset, Iterable]] = None, save_directory: Optional[Union[str, Path]] = None, ov_config: OVConfig = None, file_name: Optional[str] = None, @@ -325,74 +327,84 @@ def _quantize_ovbasemodel( remove_unused_columns: bool = True, **kwargs, ): + from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase + if save_directory is not None: save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) - quantization_config = ov_config.quantization_config + + if calibration_dataset is not None: + # Process custom calibration dataset + + if isinstance(self.model, OVStableDiffusionPipelineBase): + calibration_dataset = self._prepare_unet_dataset( + quantization_config.num_samples, + dataset=calibration_dataset) + elif Dataset is not None and isinstance(calibration_dataset, Dataset): + calibration_dataloader = self._get_calibration_dataloader( + calibration_dataset=calibration_dataset, + batch_size=batch_size, + remove_unused_columns=remove_unused_columns, + data_collator=data_collator, + ) + + if self.model.export_feature == "text-generation" and self.model.use_cache: + calibration_dataset = self._prepare_text_generation_dataset( 
+ quantization_config, calibration_dataloader) + else: + calibration_dataset = nncf.Dataset(calibration_dataloader) + elif isinstance(calibration_dataset, collections.abc.Iterable): + calibration_dataset = nncf.Dataset(calibration_dataset) + elif not isinstance(calibration_dataset, nncf.Dataset): + raise ValueError("`calibration_dataset` must be either an `Iterable` object or an instance of " + f"`nncf.Dataset` or `datasets.Dataset`. Found: {type(calibration_dataset)}.") + if isinstance(quantization_config, OVWeightQuantizationConfig): + if quantization_config.dataset is not None and calibration_dataset is not None: + logger.info( + "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " + "quantization. Will rely on `calibration_dataset`." + ) + if calibration_dataset is None and isinstance(quantization_config.dataset, str): from optimum.intel import OVModelForCausalLM if isinstance(self.model, OVModelForCausalLM): - from optimum.gptq.data import get_dataset, prepare_dataset - - tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) - nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 - calibration_dataset = get_dataset( - quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples - ) - calibration_dataset = prepare_dataset(calibration_dataset) - calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) + calibration_dataset = self._prepare_gptq_dataset(quantization_config) + elif isinstance(self.model, OVStableDiffusionPipelineBase): + calibration_dataset = self._prepare_unet_dataset( + quantization_config.num_samples, + dataset_name=quantization_config.dataset) else: raise ValueError( f"Can't create weight compression calibration dataset from string for {type(self.model)}" ) - _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) + if quantization_config.quant_method == OVQuantizationMethod.HYBRID: + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run hybrid quantization.") + if isinstance(self.model, OVStableDiffusionPipelineBase): + self.model.unet.model = _hybrid_quantization(self.model.unet.model, quantization_config, calibration_dataset) + else: + self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) + else: + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) if save_directory is not None: self.model.save_pretrained(save_directory) ov_config.save_pretrained(save_directory) return + if not isinstance(quantization_config, OVQuantizationConfig): raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}") - if isinstance(calibration_dataset, nncf.Dataset): - quantization_dataset = calibration_dataset - elif Dataset is not None and isinstance(calibration_dataset, Dataset): - calibration_dataloader = self._get_calibration_dataloader( - calibration_dataset=calibration_dataset, - batch_size=batch_size, - remove_unused_columns=remove_unused_columns, - data_collator=data_collator, - ) - - if self.model.export_feature == "text-generation" and self.model.use_cache: - # Prefetch past_key_values - self.model.update_pkv_precision(True) - self.model.compile() - collected_inputs = [] - - self.model.request = InferRequestWrapper(self.model.request, collected_inputs) - try: - for data in calibration_dataloader: - self.model.generate(**data, max_new_tokens=1) - if 
len(collected_inputs) >= quantization_config.num_samples: - break - finally: - self.model.request = self.model.request.request - quantization_dataset = nncf.Dataset(collected_inputs) - else: - quantization_dataset = nncf.Dataset(calibration_dataloader) - else: - if calibration_dataset is None: - raise ValueError("Calibration dataset is required to run quantization.") - quantization_dataset = nncf.Dataset(calibration_dataset) + if calibration_dataset is None: + raise ValueError("Calibration dataset is required to run quantization.") # Actual model quantization quantized_model = nncf.quantize( self.model.model, - quantization_dataset, + calibration_dataset, subset_size=quantization_config.num_samples, ignored_scope=quantization_config.get_ignored_scope_instance(), model_type=nncf.ModelType(quantization_config.model_type), @@ -655,6 +667,103 @@ def _remove_unused_columns(self, dataset: "Dataset"): ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) return dataset.remove_columns(ignored_columns) + def _prepare_gptq_dataset(self, quantization_config: OVWeightQuantizationConfig): + from optimum.gptq.data import get_dataset, prepare_dataset + + tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) + nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 + calibration_dataset = get_dataset( + quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples + ) + calibration_dataset = prepare_dataset(calibration_dataset) + calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) + + return calibration_dataset + + def _prepare_text_generation_dataset( + self, + quantization_config: OVQuantizationConfig, + calibration_dataloader: OVDataLoader) -> nncf.Dataset: + # TODO: this function is not covered by tests, remove if not relevant anymore or cover by tests otherwise + + # Prefetch past_key_values + self.model.update_pkv_precision(True) + self.model.compile() + collected_inputs = [] + + num_samples = quantization_config.num_samples or 200 + + self.model.request = InferRequestWrapper(self.model.model.request, collected_inputs) + try: + for data in calibration_dataloader: + self.model.generate(**data, max_new_tokens=1) + if len(collected_inputs) >= num_samples: + break + finally: + self.model.model.request = self.model.model.request.request + calibration_dataset = nncf.Dataset(collected_inputs) + return calibration_dataset + + def _prepare_unet_dataset( + self, + num_samples: Optional[int] = None, + dataset_name: Optional[str] = None, + dataset: Optional[Union[Iterable, "Dataset"]] = None) -> nncf.Dataset: + self.model.compile() + + size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor + height, width = 2 * (min(size, 512),) + num_samples = num_samples or 200 + + if dataset is not None: + if isinstance(dataset, nncf.Dataset): + return dataset + if Dataset is not None and isinstance(dataset, Dataset): + dataset = dataset.select_columns(["caption"]) + + def transform_fn(data_item): + return data_item if isinstance(data_item, (list, dict)) else [data_item] + + elif isinstance(dataset_name, str): + available_datasets = PREDEFINED_SD_DATASETS.keys() + if dataset_name not in available_datasets: + raise ValueError( + f"""You have entered a string value for dataset. 
You can only choose between + {list(available_datasets)}, but the {dataset_name} was found""" + ) + + from datasets import load_dataset + + dataset_metadata = PREDEFINED_SD_DATASETS[dataset_name] + dataset = load_dataset(dataset_name, split=dataset_metadata["split"], streaming=True).shuffle(seed=self.seed) + input_names = dataset_metadata["inputs"] + dataset = dataset.select_columns(list(input_names.values())) + + def transform_fn(data_item): + return {inp_name: data_item[column] for inp_name, column in input_names.items()} + + else: + raise ValueError("For UNet inputs collection either quantization_config.dataset or custom " + "calibration_dataset must be provided.") + + calibration_data = [] + try: + self.model.unet.request = InferRequestWrapper(self.model.unet.request, calibration_data) + + for inputs in dataset: + inputs = transform_fn(inputs) + if isinstance(inputs, dict): + self.model(**inputs, height=height, width=width) + else: + self.model(*inputs, height=height, width=width) + if len(calibration_data) >= num_samples: + break + finally: + self.model.unet.request = self.model.unet.request.request + + calibration_dataset = nncf.Dataset(calibration_data[:num_samples]) + return calibration_dataset + def _weight_only_quantization( model: openvino.runtime.Model, @@ -665,11 +774,6 @@ def _weight_only_quantization( if isinstance(config, dict): config = OVWeightQuantizationConfig.from_dict(quantization_config) - if config.dataset is not None and calibration_dataset is not None: - logger.info( - "Both `quantization_config.dataset` and `calibration_dataset` were provided for weight only " - "quantization. Will rely on `calibration_dataset`." - ) dataset = None if calibration_dataset is not None: if Dataset is not None and isinstance(calibration_dataset, Dataset): @@ -752,7 +856,7 @@ def _collect_ops_with_weights(model): def _hybrid_quantization( - model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig, dataset: Dict[str, Any] + model: openvino.runtime.Model, quantization_config: OVWeightQuantizationConfig, dataset: nncf.Dataset ) -> openvino.runtime.Model: """ Quantize a model in hybrid mode with NNCF which means that we quantize: @@ -764,7 +868,7 @@ def _hybrid_quantization( The OpenVINO Runtime model for applying hybrid quantization. quantization_config (`OVWeightQuantizationConfig`): The configuration containing the parameters related to quantization. - dataset (`Dict[str, Any]`): + dataset (`nncf.Dataset`): The dataset used for hybrid quantization. Returns: The OpenVINO Runtime model with applied hybrid quantization. 
@@ -781,7 +885,7 @@ def _hybrid_quantization( subset_size = quantization_config.num_samples if quantization_config.num_samples else 200 quantized_model = nncf.quantize( model=compressed_model, - calibration_dataset=nncf.Dataset(dataset), + calibration_dataset=dataset, model_type=nncf.ModelType.TRANSFORMER, ignored_scope=ptq_ignored_scope, # SQ algo should be disabled for MatMul nodes because their weights are already compressed diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 896f37d76..de6b80827 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -413,8 +413,10 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( model = model_cls.from_pretrained( model_id, export=True, - quantization_config=OVWeightQuantizationConfig(bits=8, dataset=dataset, num_samples=3), ) + quantizer = OVQuantizer(model) + quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=3, quant_method=OVQuantizationMethod.HYBRID) + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) self.assertEqual(expected_ov_int8, num_int8) From de9b5c18c0b508422361a6562cbdd90b144aa776 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 10:39:36 +0200 Subject: [PATCH 09/47] linters --- optimum/intel/openvino/modeling_diffusion.py | 9 ++-- optimum/intel/openvino/quantization.py | 51 +++++++++++--------- tests/openvino/test_quantization.py | 10 ++-- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index ae86ea2bf..c5afb2c14 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import copy + import importlib import logging import os @@ -57,14 +57,13 @@ ) from ...exporters.openvino import main_export -from .configuration import OVConfig, OVWeightQuantizationConfig, OVQuantizationMethod +from .configuration import OVConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .loaders import OVTextualInversionLoaderMixin from .modeling_base import OVBaseModel from .utils import ( ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, - PREDEFINED_SD_DATASETS, _print_compiled_model_properties, ) @@ -300,7 +299,7 @@ def _from_pretrained( # load the UNet model uncompressed to apply hybrid quantization further unet = cls.load_model(unet_path) # Apply weights compression to other `components` without dataset - quantization_config_without_dataset = copy.deepcopy(quantization_config) + quantization_config_without_dataset = deepcopy(quantization_config) quantization_config_without_dataset.dataset = None else: quantization_config_without_dataset = quantization_config @@ -333,7 +332,7 @@ def _from_pretrained( from optimum.intel import OVQuantizer quantizer = OVQuantizer(sd_model) - quantization_config_copy = copy.deepcopy(quantization_config) + quantization_config_copy = deepcopy(quantization_config) quantization_config_copy.quant_method = OVQuantizationMethod.HYBRID quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy)) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f2258864a..a749f38e6 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -50,7 +50,7 @@ from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available from ..utils.modeling_utils import get_model_device -from .configuration import OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig, OVQuantizationMethod +from .configuration import OVConfig, OVQuantizationConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .modeling_base import OVBaseModel from .utils import ( MAX_ONNX_OPSET, @@ -339,8 +339,8 @@ def _quantize_ovbasemodel( if isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( - quantization_config.num_samples, - dataset=calibration_dataset) + quantization_config.num_samples, dataset=calibration_dataset + ) elif Dataset is not None and isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, @@ -351,14 +351,17 @@ def _quantize_ovbasemodel( if self.model.export_feature == "text-generation" and self.model.use_cache: calibration_dataset = self._prepare_text_generation_dataset( - quantization_config, calibration_dataloader) + quantization_config, calibration_dataloader + ) else: calibration_dataset = nncf.Dataset(calibration_dataloader) elif isinstance(calibration_dataset, collections.abc.Iterable): calibration_dataset = nncf.Dataset(calibration_dataset) elif not isinstance(calibration_dataset, nncf.Dataset): - raise ValueError("`calibration_dataset` must be either an `Iterable` object or an instance of " - f"`nncf.Dataset` or `datasets.Dataset`. Found: {type(calibration_dataset)}.") + raise ValueError( + "`calibration_dataset` must be either an `Iterable` object or an instance of " + f"`nncf.Dataset` or `datasets.Dataset`. Found: {type(calibration_dataset)}." 
+ ) if isinstance(quantization_config, OVWeightQuantizationConfig): if quantization_config.dataset is not None and calibration_dataset is not None: @@ -374,8 +377,8 @@ def _quantize_ovbasemodel( calibration_dataset = self._prepare_gptq_dataset(quantization_config) elif isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( - quantization_config.num_samples, - dataset_name=quantization_config.dataset) + quantization_config.num_samples, dataset_name=quantization_config.dataset + ) else: raise ValueError( f"Can't create weight compression calibration dataset from string for {type(self.model)}" @@ -385,7 +388,9 @@ def _quantize_ovbasemodel( if calibration_dataset is None: raise ValueError("Calibration dataset is required to run hybrid quantization.") if isinstance(self.model, OVStableDiffusionPipelineBase): - self.model.unet.model = _hybrid_quantization(self.model.unet.model, quantization_config, calibration_dataset) + self.model.unet.model = _hybrid_quantization( + self.model.unet.model, quantization_config, calibration_dataset + ) else: self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) else: @@ -672,18 +677,15 @@ def _prepare_gptq_dataset(self, quantization_config: OVWeightQuantizationConfig) tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 - calibration_dataset = get_dataset( - quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples - ) + calibration_dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) calibration_dataset = prepare_dataset(calibration_dataset) calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) return calibration_dataset def _prepare_text_generation_dataset( - self, - quantization_config: OVQuantizationConfig, - calibration_dataloader: OVDataLoader) -> nncf.Dataset: + self, quantization_config: OVQuantizationConfig, calibration_dataloader: OVDataLoader + ) -> nncf.Dataset: # TODO: this function is not covered by tests, remove if not relevant anymore or cover by tests otherwise # Prefetch past_key_values @@ -705,10 +707,11 @@ def _prepare_text_generation_dataset( return calibration_dataset def _prepare_unet_dataset( - self, - num_samples: Optional[int] = None, - dataset_name: Optional[str] = None, - dataset: Optional[Union[Iterable, "Dataset"]] = None) -> nncf.Dataset: + self, + num_samples: Optional[int] = None, + dataset_name: Optional[str] = None, + dataset: Optional[Union[Iterable, "Dataset"]] = None, + ) -> nncf.Dataset: self.model.compile() size = self.model.unet.config.get("sample_size", 64) * self.model.vae_scale_factor @@ -735,7 +738,9 @@ def transform_fn(data_item): from datasets import load_dataset dataset_metadata = PREDEFINED_SD_DATASETS[dataset_name] - dataset = load_dataset(dataset_name, split=dataset_metadata["split"], streaming=True).shuffle(seed=self.seed) + dataset = load_dataset(dataset_name, split=dataset_metadata["split"], streaming=True).shuffle( + seed=self.seed + ) input_names = dataset_metadata["inputs"] dataset = dataset.select_columns(list(input_names.values())) @@ -743,8 +748,10 @@ def transform_fn(data_item): return {inp_name: data_item[column] for inp_name, column in input_names.items()} else: - raise ValueError("For UNet inputs collection either quantization_config.dataset or custom " - "calibration_dataset must be provided.") + raise 
ValueError( + "For UNet inputs collection either quantization_config.dataset or custom " + "calibration_dataset must be provided." + ) calibration_data = [] try: diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index de6b80827..98eb121d7 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -21,21 +21,17 @@ from collections import defaultdict from enum import Enum from functools import partial -from typing import List, Union +from typing import Union import evaluate import numpy as np import torch from datasets import load_dataset -from nncf.quantization.advanced_parameters import OverflowFix from parameterized import parameterized -import openvino.runtime as ov import nncf from transformers import ( AutoModelForQuestionAnswering, AutoModelForSequenceClassification, - AutoModelForCausalLM, - AutoModelForTokenClassification, AutoTokenizer, AutoProcessor, TrainingArguments, @@ -415,7 +411,9 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( export=True, ) quantizer = OVQuantizer(model) - quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=3, quant_method=OVQuantizationMethod.HYBRID) + quantization_config = OVWeightQuantizationConfig( + bits=8, num_samples=3, quant_method=OVQuantizationMethod.HYBRID + ) quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) From 4a007f5adad1b2bbbf7dd0587b6f3b0280032b71 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 13:45:27 +0200 Subject: [PATCH 10/47] Addressed comments --- optimum/intel/openvino/quantization.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index a749f38e6..d1f28b290 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -341,7 +341,7 @@ def _quantize_ovbasemodel( calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset=calibration_dataset ) - elif Dataset is not None and isinstance(calibration_dataset, Dataset): + elif is_datasets_available() and isinstance(calibration_dataset, Dataset): calibration_dataloader = self._get_calibration_dataloader( calibration_dataset=calibration_dataset, batch_size=batch_size, @@ -374,7 +374,7 @@ def _quantize_ovbasemodel( from optimum.intel import OVModelForCausalLM if isinstance(self.model, OVModelForCausalLM): - calibration_dataset = self._prepare_gptq_dataset(quantization_config) + calibration_dataset = self._prepare_builtin_dataset(quantization_config) elif isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset_name=quantization_config.dataset @@ -392,6 +392,7 @@ def _quantize_ovbasemodel( self.model.unet.model, quantization_config, calibration_dataset ) else: + # This may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. 
self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) else: _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) @@ -672,7 +673,7 @@ def _remove_unused_columns(self, dataset: "Dataset"): ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) return dataset.remove_columns(ignored_columns) - def _prepare_gptq_dataset(self, quantization_config: OVWeightQuantizationConfig): + def _prepare_builtin_dataset(self, quantization_config: OVWeightQuantizationConfig): from optimum.gptq.data import get_dataset, prepare_dataset tokenizer = AutoTokenizer.from_pretrained(quantization_config.tokenizer) @@ -721,7 +722,7 @@ def _prepare_unet_dataset( if dataset is not None: if isinstance(dataset, nncf.Dataset): return dataset - if Dataset is not None and isinstance(dataset, Dataset): + if is_datasets_available() and isinstance(dataset, Dataset): dataset = dataset.select_columns(["caption"]) def transform_fn(data_item): @@ -783,7 +784,7 @@ def _weight_only_quantization( dataset = None if calibration_dataset is not None: - if Dataset is not None and isinstance(calibration_dataset, Dataset): + if is_datasets_available() and isinstance(calibration_dataset, Dataset): raise ValueError( "Providing calibration dataset as an instance of `datasets.Dataset` for OV weight-only " "quantization is not supported. Please provide it as `nncf.Dataset` or as iterable of " From 583e43514ba0721fd7dfd87a75fe8f627f4fef58 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 13:46:46 +0200 Subject: [PATCH 11/47] Updated SD HQ notebook --- .../stable_diffusion_hybrid_quantization.ipynb | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index 41969b162..efe413a9e 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -52,7 +52,7 @@ "import transformers\n", "from pathlib import Path\n", "from openvino.runtime import Core\n", - "from optimum.intel import OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", + "from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" @@ -198,9 +198,14 @@ }, "outputs": [], "source": [ - "quantization_config = OVWeightQuantizationConfig(bits=8, dataset=calibration_dataset, num_samples=NUM_SAMPLES)\n", - "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True, quantization_config=quantization_config)\n", - "int8_pipe.save_pretrained(int8_model_path)" + "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES)\n", + "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n", + "quantizer = OVQuantizer(int8_pipe)\n", + "quantizer.quantize(\n", + " ov_config=OVConfig(quantization_config=quantization_config),\n", + " calibration_dataset=calibration_dataset,\n", + " save_directory=int8_model_path\n", + ")" ] }, { @@ -613,7 +618,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.8.10" } }, "nbformat": 4, From 349350c2c7524aa4be33c0baf680a1e45f894745 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 16:10:22 +0200 Subject: [PATCH 
12/47] Quantize SD submodels in OVQuantizer --- ...stable_diffusion_hybrid_quantization.ipynb | 5 +-- optimum/intel/openvino/modeling_diffusion.py | 33 ++++++++----------- optimum/intel/openvino/quantization.py | 14 ++++++-- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index efe413a9e..142cde492 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -53,6 +53,7 @@ "from pathlib import Path\n", "from openvino.runtime import Core\n", "from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", + "from optimum.intel.openvino.configuration import OVQuantizationMethod\n", "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" @@ -198,8 +199,8 @@ }, "outputs": [], "source": [ - "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES)\n", "int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n", + "quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID)\n", "quantizer = OVQuantizer(int8_pipe)\n", "quantizer.quantize(\n", " ov_config=OVConfig(quantization_config=quantization_config),\n", @@ -618,7 +619,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.11.7" } }, "nbformat": 4, diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c5afb2c14..c92d20e3e 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -292,19 +292,7 @@ def _from_pretrained( else: kwargs[name] = load_method(new_model_save_dir) - quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) - unet_path = new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name - if quantization_config is not None and quantization_config.dataset is not None: - # load the UNet model uncompressed to apply hybrid quantization further - unet = cls.load_model(unet_path) - # Apply weights compression to other `components` without dataset - quantization_config_without_dataset = deepcopy(quantization_config) - quantization_config_without_dataset.dataset = None - else: - quantization_config_without_dataset = quantization_config - unet = cls.load_model(unet_path, quantization_config_without_dataset) - components = { "vae_encoder": new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "vae_decoder": new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, @@ -312,13 +300,19 @@ def _from_pretrained( "text_encoder_2": new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, } - for key, value in components.items(): - components[key] = cls.load_model(value, quantization_config_without_dataset) if value.is_file() else None - if model_save_dir is None: model_save_dir = new_model_save_dir - if quantization_config is not None and quantization_config.dataset is not None: + quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit) + if quantization_config is None or quantization_config.dataset is None: + unet = cls.load_model(unet_path, quantization_config) + for key, 
value in components.items(): + components[key] = cls.load_model(value, quantization_config) if value.is_file() else None + else: + # Load uncompressed models to apply hybrid quantization further + unet = cls.load_model(unet_path) + for key, value in components.items(): + components[key] = cls.load_model(value) if value.is_file() else None sd_model = cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) supported_pipelines = ( @@ -331,10 +325,10 @@ def _from_pretrained( from optimum.intel import OVQuantizer + hybrid_quantization_config = deepcopy(quantization_config) + hybrid_quantization_config.quant_method = OVQuantizationMethod.HYBRID quantizer = OVQuantizer(sd_model) - quantization_config_copy = deepcopy(quantization_config) - quantization_config_copy.quant_method = OVQuantizationMethod.HYBRID - quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy)) + quantizer.quantize(ov_config=OVConfig(quantization_config=hybrid_quantization_config)) return sd_model @@ -347,6 +341,7 @@ def _from_pretrained( **kwargs, ) + @classmethod def _from_transformers( cls, diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index d1f28b290..45961a86f 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -63,8 +63,6 @@ if is_datasets_available(): from datasets import Dataset -else: - Dataset = None register_module(ignored_algorithms=[])(Conv1D) @@ -388,11 +386,21 @@ def _quantize_ovbasemodel( if calibration_dataset is None: raise ValueError("Calibration dataset is required to run hybrid quantization.") if isinstance(self.model, OVStableDiffusionPipelineBase): + # Apply weight-only quantization to all SD submodels except UNet + quantization_config_copy = copy.deepcopy(quantization_config) + quantization_config_copy.dataset = None + quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT + for sd_submodel_name in ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"]: + sd_submodel = getattr(self.model, sd_submodel_name) + if sd_submodel is not None: + _weight_only_quantization(sd_submodel.model, quantization_config_copy) + + # Apply hybrid quantization to UNet self.model.unet.model = _hybrid_quantization( self.model.unet.model, quantization_config, calibration_dataset ) else: - # This may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. + # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. 
self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) else: _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) From 068236dcb585c8af01f4b76793a2aaed5e58ca0b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 6 May 2024 17:13:33 +0200 Subject: [PATCH 13/47] Black --- optimum/intel/openvino/modeling_diffusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index c92d20e3e..1b880e736 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -341,7 +341,6 @@ def _from_pretrained( **kwargs, ) - @classmethod def _from_transformers( cls, From ed5cbb91e02c56e53784351f3befb69c56903171 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 7 May 2024 14:55:10 +0100 Subject: [PATCH 14/47] Apply Review Comments --- optimum/commands/export/openvino.py | 9 ++++++--- optimum/exporters/openvino/__main__.py | 4 ++-- optimum/intel/openvino/utils.py | 10 ++++------ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index a7302ef88..56abc6b7c 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -226,6 +226,9 @@ def run(self): ) library_name = "transformers" + if self.args.convert_tokenizer: + logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") + if ( library_name == "diffusers" and ov_config @@ -261,6 +264,9 @@ def run(self): ) model.save_pretrained(self.args.output) + if self.args.disable_convert_tokenizer: + return + # not export when using other exporters from ...exporters.openvino.convert import export_tokenizer @@ -273,9 +279,6 @@ def run(self): if tokenizer_2 is not None: export_tokenizer(tokenizer_2, output / "tokenizer_2") else: - if self.args.convert_tokenizer: - logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.") - # TODO : add input shapes main_export( model_name_or_path=self.args.model, diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index a43c42e44..0d80101a5 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -358,7 +358,7 @@ class StoreAttr(object): # hide openvino import when using other exporters # avoid circular import from optimum.exporters.openvino.convert import export_tokenizer - from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER + from optimum.intel.openvino.utils import OV_TOKENIZER_FOLDER if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": @@ -369,7 +369,7 @@ class StoreAttr(object): if tokenizer is not None: try: - export_tokenizer(tokenizer, output / OV_TOKENIZER_FLOLDER) + export_tokenizer(tokenizer, output / OV_TOKENIZER_FOLDER) except Exception as exception: logger.warning( "Could not load tokenizer using specified model ID or path. 
OpenVINO tokenizer/detokenizer " diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 6b49f7a83..3bf00f071 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -18,7 +18,7 @@ import os from glob import glob from pathlib import Path -from typing import List, Union +from typing import Tuple, Union import numpy as np from huggingface_hub import model_info @@ -34,7 +34,7 @@ OV_DECODER_NAME = "openvino_decoder_model.xml" OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml" -OV_TOKENIZER_FLOLDER = "openvino_tokenizer" +OV_TOKENIZER_FOLDER = "openvino_tokenizer" OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml" OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml" @@ -111,9 +111,7 @@ } -NEED_CONVERT_TO_FAST_TOKENIZER: List[type(PreTrainedTokenizer)] = [ - CLIPTokenizer, -] +NEED_CONVERT_TO_FAST_TOKENIZER: Tuple[type(PreTrainedTokenizer)] = (CLIPTokenizer,) def maybe_convert_tokenizer_to_fast( @@ -122,7 +120,7 @@ def maybe_convert_tokenizer_to_fast( if isinstance(hf_tokenizer, PreTrainedTokenizerFast): return hf_tokenizer - if any(isinstance(hf_tokenizer, slow_class) for slow_class in NEED_CONVERT_TO_FAST_TOKENIZER): + if isinstance(hf_tokenizer, NEED_CONVERT_TO_FAST_TOKENIZER): try: return AutoTokenizer.from_pretrained(tokenizer_path) except Exception: From 4fa40a259faf85fd30d7d930565533f4b1e11f32 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 7 May 2024 14:58:44 +0100 Subject: [PATCH 15/47] Apply Review Comments --- optimum/commands/export/openvino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 56abc6b7c..025a40e05 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -267,7 +267,7 @@ def run(self): if self.args.disable_convert_tokenizer: return - # not export when using other exporters + # avoid import when using other exporters (IPEX, INC) from ...exporters.openvino.convert import export_tokenizer output = Path(self.args.output) From 0029e9165a2dad4cfcf787aa63181d9dc0cd49d5 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Thu, 9 May 2024 13:06:20 +0100 Subject: [PATCH 16/47] Move OV tokenizer to the root folder --- optimum/exporters/openvino/__main__.py | 4 +--- optimum/intel/openvino/utils.py | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 0d80101a5..31abd0f32 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -356,9 +356,7 @@ class StoreAttr(object): ) # hide openvino import when using other exporters - # avoid circular import from optimum.exporters.openvino.convert import export_tokenizer - from optimum.intel.openvino.utils import OV_TOKENIZER_FOLDER if convert_tokenizer and is_openvino_tokenizers_available(): if library_name != "diffusers": @@ -369,7 +367,7 @@ class StoreAttr(object): if tokenizer is not None: try: - export_tokenizer(tokenizer, output / OV_TOKENIZER_FOLDER) + export_tokenizer(tokenizer, output) except Exception as exception: logger.warning( "Could not load tokenizer using specified model ID or path. 
OpenVINO tokenizer/detokenizer " diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py index 3bf00f071..69a750fb6 100644 --- a/optimum/intel/openvino/utils.py +++ b/optimum/intel/openvino/utils.py @@ -34,7 +34,6 @@ OV_DECODER_NAME = "openvino_decoder_model.xml" OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml" -OV_TOKENIZER_FOLDER = "openvino_tokenizer" OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml" OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml" From 0474b26dd453ecb8dc15966eef4d9198e82791c1 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 14 May 2024 08:52:05 +0200 Subject: [PATCH 17/47] unpin torch --- .github/workflows/test_inc.yml | 2 +- .github/workflows/test_ipex.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 1ede5e193..d4ad06660 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] pip install intel-extension-for-transformers pip install peft diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 42f884b72..8e02bd551 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] - name: Test with Pytest run: | From a814adf9d7794403f20608278c4c44c80ae61c8b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 14 May 2024 09:20:36 +0200 Subject: [PATCH 18/47] itrex still in 2.2 --- .github/workflows/test_inc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index d4ad06660..1ede5e193 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu + pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] pip install intel-extension-for-transformers pip install peft From d0217982beef9b76a1bad406659d1e14dac2ffc3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 14 May 2024 21:24:20 +0200 Subject: [PATCH 19/47] Bump test torch version (#708) --- .github/workflows/test_inc.yml | 6 +++--- optimum/intel/neural_compressor/modeling_base.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 1ede5e193..6435d0b71 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,7 +32,7 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.2 torchaudio torchvision --extra-index-url 
https://download.pytorch.org/whl/cpu + pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] pip install intel-extension-for-transformers pip install peft @@ -43,7 +43,7 @@ jobs: - name: Test IPEX run: | pip uninstall -y intel-extension-for-transformers - pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu - pip install intel-extension-for-pytorch==2.1.100 + pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --extra-index-url https://download.pytorch.org/whl/cpu + pip install intel-extension-for-pytorch==2.3.0 pytest tests/neural_compressor/test_ipex.py diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 2556a6048..c6d5e7bac 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -147,7 +147,7 @@ def _from_pretrained( try: quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json") algorithm = getattr(quantization_config, "quant_method", None) - if algorithm in {"rtn", "gptq", "awq", "autoaround"}: + if algorithm in {"rtn", "gptq", "awq", "autoround"}: from intel_extension_for_transformers.transformers.modeling.modeling_auto import ( _BaseQBitsAutoModelClass, ) From d9c8f9f1589c78289fced36c5d856d74c80dd2a6 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 15 May 2024 17:04:41 +0800 Subject: [PATCH 20/47] Add IPEX pipeline (#501) * define optimum-intel pipeline * add tests and readme * fix pipelines example * fix readme codestyle * add _load_model in pipeline * update pipeline for optimum intel * update tests * remove readme * Update optimum/intel/pipelines/__init__.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * fix pipelines * add all supported tasks testing * add hub_kwargs and model_kwargs on tokenizer and feature_extractor * add hub_kwargs and default pipeline tests * fix _from_transformers args * rm default pipeline test * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * fix comments * Update optimum/exporters/openvino/model_patcher.py * Update optimum/intel/ipex/modeling_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update 
optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * fix style --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/ipex/inference.py | 4 + optimum/intel/ipex/modeling_base.py | 2 + optimum/intel/pipelines/__init__.py | 15 ++ optimum/intel/pipelines/pipeline_base.py | 290 +++++++++++++++++++++++ tests/ipex/test_pipelines.py | 265 +++++++++++++++++++++ 5 files changed, 576 insertions(+) create mode 100644 optimum/intel/pipelines/__init__.py create mode 100644 optimum/intel/pipelines/pipeline_base.py create mode 100644 tests/ipex/test_pipelines.py diff --git a/optimum/intel/ipex/inference.py b/optimum/intel/ipex/inference.py index ccf2da9d8..a628ebe12 100644 --- a/optimum/intel/ipex/inference.py +++ b/optimum/intel/ipex/inference.py @@ -97,6 +97,10 @@ def __init__( jit (`boolean = False`, *optional*): Enable jit to accelerate inference speed """ + logger.warning( + "`inference_mode` is deprecated and will be removed in v1.18.0. Use `pipeline` to load and export your model to TorchScript instead." + ) + if not is_ipex_available(): raise ImportError(IPEX_NOT_AVAILABLE_ERROR_MSG) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 2b739ea50..d2963d55a 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -161,6 +161,7 @@ def _from_transformers( local_files_only: bool = False, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: bool = False, + _commit_hash: str = None, ): if use_auth_token is not None: warnings.warn( @@ -186,6 +187,7 @@ def _from_transformers( "force_download": force_download, "torch_dtype": torch_dtype, "trust_remote_code": trust_remote_code, + "_commit_hash": _commit_hash, } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py new file mode 100644 index 000000000..40a1e3ca5 --- /dev/null +++ b/optimum/intel/pipelines/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pipeline_base import pipeline diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py new file mode 100644 index 000000000..65e6cfb78 --- /dev/null +++ b/optimum/intel/pipelines/pipeline_base.py @@ -0,0 +1,290 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, Optional, Union + +import torch +from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer +from transformers import pipeline as transformers_pipeline +from transformers.feature_extraction_utils import PreTrainedFeatureExtractor +from transformers.pipelines import ( + AudioClassificationPipeline, + FillMaskPipeline, + ImageClassificationPipeline, + QuestionAnsweringPipeline, + TextClassificationPipeline, + TextGenerationPipeline, + TokenClassificationPipeline, +) +from transformers.pipelines.base import Pipeline +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +from optimum.intel.utils import is_ipex_available + + +if is_ipex_available(): + from ..ipex.modeling_base import ( + IPEXModel, + IPEXModelForAudioClassification, + IPEXModelForCausalLM, + IPEXModelForImageClassification, + IPEXModelForMaskedLM, + IPEXModelForQuestionAnswering, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, + ) + + IPEX_SUPPORTED_TASKS = { + "text-generation": { + "impl": TextGenerationPipeline, + "class": (IPEXModelForCausalLM,), + "default": "gpt2", + "type": "text", + }, + "fill-mask": { + "impl": FillMaskPipeline, + "class": (IPEXModelForMaskedLM,), + "default": "bert-base-cased", + "type": "text", + }, + "question-answering": { + "impl": QuestionAnsweringPipeline, + "class": (IPEXModelForQuestionAnswering,), + "default": "distilbert-base-cased-distilled-squad", + "type": "text", + }, + "image-classification": { + "impl": ImageClassificationPipeline, + "class": (IPEXModelForImageClassification,), + "default": "google/vit-base-patch16-224", + "type": "image", + }, + "text-classification": { + "impl": TextClassificationPipeline, + "class": (IPEXModelForSequenceClassification,), + "default": "distilbert-base-uncased-finetuned-sst-2-english", + "type": "text", + }, + "token-classification": { + "impl": TokenClassificationPipeline, + "class": (IPEXModelForTokenClassification,), + "default": "dbmdz/bert-large-cased-finetuned-conll03-english", + "type": "text", + }, + "audio-classification": { + "impl": AudioClassificationPipeline, + "class": (IPEXModelForAudioClassification,), + "default": "superb/hubert-base-superb-ks", + "type": "audio", + }, + } +else: + IPEX_SUPPORTED_TASKS = {} + + +def load_ipex_model( + model, + targeted_task, + SUPPORTED_TASKS, + model_kwargs: Optional[Dict[str, Any]] = None, + hub_kwargs: Optional[Dict[str, Any]] = None, +): + if model_kwargs is None: + model_kwargs = {} + + ipex_model_class = SUPPORTED_TASKS[targeted_task]["class"][0] + + if model is None: + model_id = SUPPORTED_TASKS[targeted_task]["default"] + model = ipex_model_class.from_pretrained(model_id, export=True, **model_kwargs, **hub_kwargs) + elif isinstance(model, str): + model_id = model + try: + config = AutoConfig.from_pretrained(model) + export = not getattr(config, "torchscript", False) + except RuntimeError: + logger.warning("We will use IPEXModel with export=True to export the model") + export = True + model = 
ipex_model_class.from_pretrained(model, export=export, **model_kwargs, **hub_kwargs) + elif isinstance(model, IPEXModel): + model_id = getattr(model.config, "name_or_path", None) + else: + raise ValueError( + f"""Model {model} is not supported. Please provide a valid model name or path or a IPEXModel. + You can also provide non model then a default one will be used""" + ) + + return model, model_id + + +MAPPING_LOADING_FUNC = { + "ipex": load_ipex_model, +} + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + + +logger = logging.get_logger(__name__) + + +def pipeline( + task: str = None, + model: Optional[Union[str, "PreTrainedModel"]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, + feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, + use_fast: bool = True, + token: Optional[Union[str, bool]] = None, + accelerator: Optional[str] = "ort", + revision: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + torch_dtype: Optional[Union[str, torch.dtype]] = None, + commit_hash: Optional[str] = None, + **model_kwargs, +) -> Pipeline: + """ + Utility factory method to build a [`Pipeline`]. + + Pipelines are made of: + + - A [tokenizer](tokenizer) in charge of mapping raw textual input to token. + - A [model](model) to make predictions from the inputs. + - Some (optional) post processing for enhancing model's output. + + Args: + task (`str`): + The task defining which pipeline will be returned. Currently accepted tasks are: + + - `"text-generation"`: will return a [`TextGenerationPipeline`]:. + + model (`str` or [`PreTrainedModel`], *optional*): + The model that will be used by the pipeline to make predictions. This can be a model identifier or an + actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch). + + If not provided, the default for the `task` will be loaded. + tokenizer (`str` or [`PreTrainedTokenizer`], *optional*): + The tokenizer that will be used by the pipeline to encode data for the model. This can be a model + identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`]. + + If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model` + is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string). + However, if `config` is also not given or not a string, then the default tokenizer for the given `task` + will be loaded. + accelerator (`str`, *optional*, defaults to `"ipex"`): + The optimization backends, choose from ["ipex", "inc", "openvino"]. + use_fast (`bool`, *optional*, defaults to `True`): + Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). + torch_dtype (`str` or `torch.dtype`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model + (`torch.float16`, `torch.bfloat16`, ... or `"auto"`). + model_kwargs (`Dict[str, Any]`, *optional*): + Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., + **model_kwargs)` function. + + Returns: + [`Pipeline`]: A suitable pipeline for the task. 
+ + Examples: + + ```python + >>> import torch + >>> from optimum.intel.pipelines import pipeline + + >>> pipe = pipeline('text-generation', 'gpt2', torch_dtype=torch.bfloat16) + >>> pipe("Describe a real-world application of AI in sustainable energy.") + ```""" + if model_kwargs is None: + model_kwargs = {} + + if task is None and model is None: + raise RuntimeError( + "Impossible to instantiate a pipeline without either a task or a model " + "being specified. " + "Please provide a task class or a model" + ) + + if model is None and tokenizer is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer" + " may not be compatible with the default model. Please provide a PreTrainedModel class or a" + " path/identifier to a pretrained model when providing tokenizer." + ) + + if accelerator not in MAPPING_LOADING_FUNC: + raise ValueError( + f'Accelerator {accelerator} is not supported. Supported accelerator is {", ".join(MAPPING_LOADING_FUNC)}.' + ) + + if accelerator == "ipex": + if task not in list(IPEX_SUPPORTED_TASKS.keys()): + raise ValueError( + f"Task {task} is not supported for the IPEX pipeline. Supported tasks are { list(IPEX_SUPPORTED_TASKS.keys())}" + ) + + supported_tasks = IPEX_SUPPORTED_TASKS if accelerator == "ipex" else None + + no_feature_extractor_tasks = set() + no_tokenizer_tasks = set() + for _task, values in supported_tasks.items(): + if values["type"] == "text": + no_feature_extractor_tasks.add(_task) + elif values["type"] in {"image", "video"}: + no_tokenizer_tasks.add(_task) + elif values["type"] in {"audio"}: + no_tokenizer_tasks.add(_task) + elif values["type"] not in ["multimodal", "audio", "video"]: + raise ValueError(f"SUPPORTED_TASK {_task} contains invalid type {values['type']}") + + load_tokenizer = task not in no_tokenizer_tasks + load_feature_extractor = task not in no_feature_extractor_tasks + + hub_kwargs = { + "revision": revision, + "token": token, + "trust_remote_code": trust_remote_code, + "_commit_hash": commit_hash, + } + + if isinstance(model, Path): + model = str(model) + + if torch_dtype is not None: + if "torch_dtype" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... torch_dtype=..., model_kwargs={"torch_dtype":...})` as those' + " arguments might conflict, use only one.)" + ) + model_kwargs["torch_dtype"] = torch_dtype + + # Load the correct model if possible + # Infer the framework from the model if not already defined + model, model_id = MAPPING_LOADING_FUNC[accelerator](model, task, supported_tasks, model_kwargs, hub_kwargs) + + if load_tokenizer and tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs, **model_kwargs) + if load_feature_extractor and feature_extractor is None: + feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **hub_kwargs, **model_kwargs) + + return transformers_pipeline( + task, + model=model, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + use_fast=use_fast, + torch_dtype=torch_dtype, + ) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py new file mode 100644 index 000000000..89a27ab2c --- /dev/null +++ b/tests/ipex/test_pipelines.py @@ -0,0 +1,265 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from tempfile import TemporaryDirectory + +import numpy as np +import torch +from parameterized import parameterized +from transformers import AutoTokenizer +from transformers.pipelines import pipeline as transformers_pipeline + +from optimum.intel.ipex.modeling_base import ( + IPEXModelForAudioClassification, + IPEXModelForCausalLM, + IPEXModelForImageClassification, + IPEXModelForMaskedLM, + IPEXModelForQuestionAnswering, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, +) +from optimum.intel.pipelines import pipeline as ipex_pipeline + + +MODEL_NAMES = { + "albert": "hf-internal-testing/tiny-random-albert", + "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", + "bert": "hf-internal-testing/tiny-random-bert", + "bart": "hf-internal-testing/tiny-random-bart", + "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", + "bloom": "hf-internal-testing/tiny-random-BloomModel", + "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "convnext": "hf-internal-testing/tiny-random-convnext", + "distilbert": "hf-internal-testing/tiny-random-distilbert", + "electra": "hf-internal-testing/tiny-random-electra", + "flaubert": "hf-internal-testing/tiny-random-flaubert", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", + "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "levit": "hf-internal-testing/tiny-random-LevitModel", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", + "marian": "sshleifer/tiny-marian-en-de", + "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", + "mobilevit": "hf-internal-testing/tiny-random-mobilevit", + "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", + "mt5": "stas/mt5-tiny-random", + "opt": "hf-internal-testing/tiny-random-OPTModel", + "phi": "echarlaix/tiny-random-PhiForCausalLM", + "resnet": "hf-internal-testing/tiny-random-resnet", + "roberta": "hf-internal-testing/tiny-random-roberta", + "roformer": "hf-internal-testing/tiny-random-roformer", + "squeezebert": "hf-internal-testing/tiny-random-squeezebert", + "t5": "hf-internal-testing/tiny-random-t5", + "unispeech": "hf-internal-testing/tiny-random-unispeech", + "vit": "hf-internal-testing/tiny-random-vit", + "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "xlm": "hf-internal-testing/tiny-random-xlm", +} + + +class PipelinesIntegrationTest(unittest.TestCase): + COMMON_SUPPORTED_ARCHITECTURES = ( + "albert", + "bert", + "distilbert", + "electra", + "flaubert", + "roberta", + "roformer", + "squeezebert", + "xlm", + ) + TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( + 
"bart", + "gpt_bigcode", + "blenderbot", + "blenderbot-small", + "bloom", + "codegen", + "gpt2", + "gpt_neo", + "gpt_neox", + "llama", + "llama2", + "mistral", + "mpt", + "opt", + ) + QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES = ( + "bert", + "distilbert", + "roberta", + ) + AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( + "unispeech", + "wav2vec2", + ) + IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( + "beit", + "mobilenet_v1", + "mobilenet_v2", + "mobilevit", + "resnet", + "vit", + ) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_token_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("token-classification", model_id) + ipex_generator = ipex_pipeline("token-classification", model_id, accelerator="ipex") + inputs = "Hello I'm Omar and I live in Zürich." + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForTokenClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_sequence_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("text-classification", model_id) + ipex_generator = ipex_pipeline("text-classification", model_id, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(transformers_output[0]["label"], ipex_output[0]["label"]) + self.assertAlmostEqual(transformers_output[0]["score"], ipex_output[0]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_fill_mask_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + inputs = "The Milky Way is a galaxy." 
+ transformers_generator = transformers_pipeline("fill-mask", model_id) + ipex_generator = ipex_pipeline("fill-mask", model_id, accelerator="ipex") + mask_token = transformers_generator.tokenizer.mask_token + inputs = inputs.replace("", mask_token) + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForMaskedLM)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertEqual(transformers_output[i]["token"], ipex_output[i]["token"]) + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) + def test_text_generation_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("text-generation", model_id) + ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") + inputs = "Describe a real-world application of AI." + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForCausalLM)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) + + @parameterized.expand(QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES) + def test_question_answering_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("question-answering", model_id) + ipex_generator = ipex_pipeline("question-answering", model_id, accelerator="ipex") + question = "How many programming languages does BLOOM support?" + context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." 
+ with torch.inference_mode(): + transformers_output = transformers_generator(question=question, context=context) + with torch.inference_mode(): + ipex_output = ipex_generator(question=question, context=context) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForQuestionAnswering)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertAlmostEqual(transformers_output["score"], ipex_output["score"], delta=1e-4) + self.assertEqual(transformers_output["start"], ipex_output["start"]) + self.assertEqual(transformers_output["end"], ipex_output["end"]) + + @parameterized.expand(AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES) + def test_audio_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("audio-classification", model_id) + ipex_generator = ipex_pipeline("audio-classification", model_id, accelerator="ipex") + inputs = [np.random.random(16000)] + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForAudioClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertAlmostEqual(transformers_output[0][0]["score"], ipex_output[0][0]["score"], delta=1e-2) + self.assertAlmostEqual(transformers_output[0][1]["score"], ipex_output[0][1]["score"], delta=1e-2) + + @parameterized.expand(IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES) + def test_image_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("image-classification", model_id) + ipex_generator = ipex_pipeline("image-classification", model_id, accelerator="ipex") + inputs = "http://images.cocodataset.org/val2017/000000039769.jpg" + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForImageClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertEqual(transformers_output[i]["label"], ipex_output[i]["label"]) + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_pipeline_load_from_ipex_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_generator = ipex_pipeline("text-classification", model, tokenizer=tokenizer, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertGreaterEqual(ipex_output[0]["score"], 0.0) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_pipeline_load_from_jit_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True) + save_dir = 
TemporaryDirectory().name + model.save_pretrained(save_dir) + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_generator = ipex_pipeline("text-classification", save_dir, tokenizer=tokenizer, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertGreaterEqual(ipex_output[0]["score"], 0.0) From bfc86637aa328cce6eb66fbfe22fcd38b34db081 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 17:22:24 +0400 Subject: [PATCH 21/47] Prevent loading model for export if it is not supported (#710) --- optimum/exporters/openvino/__main__.py | 5 +++++ optimum/exporters/openvino/convert.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 31abd0f32..9db671906 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -219,6 +219,10 @@ def main_export( model_type = config.model_type.replace("_", "-") if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True + if custom_export_configs is None: + raise ValueError( + f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum-intel/issues if you would like the model type {model_type} to be supported natively in the OpenVINO export." + ) elif task not in TasksManager.get_supported_tasks_for_model_type( model_type, exporter="openvino", library_name=library_name ): @@ -232,6 +236,7 @@ def main_export( raise ValueError( f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}." ) + if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED: loading_kwargs["attn_implementation"] = "eager" # there are some difference between remote and in library representation of past key values for some models, diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 3634a493c..baa34a5cd 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -547,7 +547,7 @@ def export_from_model( # TODO: support onnx_config.py in the model repo if custom_architecture and custom_export_configs is None: raise ValueError( - f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. 
Please open an issue at https://github.com/huggingface/optimum/issues if you would like the model type {model_type} to be supported natively in the ONNX export." + f"Trying to export a {model_type} model, that is a custom or unsupported architecture, but no custom export configuration was passed as `custom_export_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. Please open an issue at https://github.com/huggingface/optimum-intel/issues if you would like the model type {model_type} to be supported natively in the OpenVINO export." ) if task.startswith("text-generation") and model.config.is_encoder_decoder: From 2b902bbef97d7ebe486487cf89c1737c580c36bd Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 17:30:00 +0400 Subject: [PATCH 22/47] Optimize first latency beam search for OVModelForCausalLM (#695) * WIP: beam search only * other beam search algos * add test * do not touch decoding cycles * fix stateless model support * fix quantization * move inputs modification into forward * refactor test --- optimum/intel/openvino/modeling_decoder.py | 181 +++++++++++++++++++-- tests/openvino/test_modeling.py | 81 +++++++++ 2 files changed, 250 insertions(+), 12 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 9ab494be6..e4dc1ed78 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -17,7 +17,7 @@ import warnings from pathlib import Path from tempfile import TemporaryDirectory -from typing import Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import numpy as np import openvino @@ -28,6 +28,10 @@ from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin +from transformers.generation.configuration_utils import GenerationConfig, GenerationMode +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.utils import GenerateOutput from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.utils.normalized_config import NormalizedConfigManager @@ -41,6 +45,11 @@ from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.streamers import BaseStreamer + + logger = logging.getLogger(__name__) core = Core() @@ -122,6 +131,8 @@ def __init__( self._pkv_precision = Type.f32 self.next_beam_idx = None self._past_length = 0 + self._first_iter_beam_search = False + self._second_iter_beam_search = False self.update_pkv_precision() if self.is_dynamic: self.model = self._reshape(self.model, -1, -1) @@ -375,7 +386,11 @@ def prepare_inputs( inputs = {} if not self.stateful: if past_key_values is not None: - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + if ( + self.config.model_type not in MULTI_QUERY_ATTN_MODELS + or self.config.model_type == "falcon" + and self.config.new_decoder_architecture + ): if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 past_key_values = tuple( @@ -418,7 +433,6 @@ def 
prepare_inputs( self.next_beam_idx = np.arange(batch_size, dtype=int) self._past_length = 0 past_len = self._get_past_length(past_key_values) - inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed if "attention_mask" in self.input_names or "position_ids" in self.input_names: @@ -468,6 +482,8 @@ def forward( **kwargs, ) + if self._first_iter_beam_search: + inputs, duplication_indices = self._deduplicate_inputs(inputs) # Run inference self.request.start_async(inputs, share_inputs=True) self.request.wait() @@ -483,7 +499,11 @@ def forward( if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + if ( + self.config.model_type not in MULTI_QUERY_ATTN_MODELS + or self.config.model_type == "falcon" + and self.config.new_decoder_architecture + ): # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) @@ -491,6 +511,10 @@ def forward( else: past_key_values = None + if self._first_iter_beam_search: + logits, past_key_values = self._expand_outputs_for_generation(duplication_indices, logits, past_key_values) + self._first_iter_beam_search = False + return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation @@ -520,7 +544,7 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg if past_key_values: position_ids = position_ids[:, -input_ids.shape[1] :] - return { + model_inputs = { "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache, @@ -528,12 +552,116 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg "attention_mask": attention_mask, } + return model_inputs + + def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple): + batch_size = logits.shape[0] + if indicies.shape[0] != 1: + logits = logits[indicies] + if past_key_values and not self.stateful: + if ( + self.config.model_type not in MULTI_QUERY_ATTN_MODELS + or self.config.model_type == "falcon" + and self.config.new_decoder_architecture + ): + past_key_values = tuple( + tuple( + past_state[indicies] + if not self.config.model_type == "chatglm" + else past_state[:, indicies, ...] 
+ for past_state in layer_past + ) + for layer_past in past_key_values + ) + else: + past_key_values = tuple([past_state[indicies] for past_state in past_key_values]) + if self.stateful: + self.next_beam_idx = ( + self.next_beam_idx[indicies] + if self.next_beam_idx is not None + else np.arange(batch_size, dtype=int)[indicies] + ) + self._second_iter_beam_search = True + return logits, past_key_values + + def _deduplicate_inputs(self, model_inputs: Dict): + input_ids = model_inputs["input_ids"] + upd_model_inputs = {} + unique_input_ids, indicies, reverse_indicies = np.unique( + input_ids, axis=0, return_index=True, return_inverse=True + ) + for input_name, input_tensor in model_inputs.items(): + if input_name not in ["input_ids", "beam_idx"]: + if not isinstance(input_tensor, Tensor): + upd_model_inputs[input_name] = input_tensor[indicies] + else: + shape = input_tensor.shape + dtype = input_tensor.element_type + upd_batch_size = indicies.shape[0] + if self.config.model_type == "bloom": + upd_batch_size *= self.config.num_attention_heads + shape[0 if not self.config.model_type == "chatglm" else 1] = upd_batch_size + upd_model_inputs[input_name] = Tensor(dtype, shape) + upd_model_inputs["input_ids"] = unique_input_ids + if "beam_idx" in model_inputs: + beam_range = ( + unique_input_ids.shape[0] + if self.config.model_type != "bloom" + else unique_input_ids.shape[0] * self.config.num_attention_heads + ) + beam_idx = np.arange(beam_range, dtype=int) + upd_model_inputs["beam_idx"] = beam_idx + return upd_model_inputs, reverse_indicies + + @torch.no_grad() + def generate( + self, + inputs: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> Union[GenerateOutput, torch.LongTensor]: + _generation_config, _ = self._prepare_generation_config(generation_config, **kwargs) + generation_mode = _generation_config.get_generation_mode(assistant_model) + + is_beam_search = generation_mode in [ + GenerationMode.BEAM_SEARCH, + GenerationMode.BEAM_SAMPLE, + GenerationMode.GROUP_BEAM_SEARCH, + GenerationMode.CONSTRAINED_BEAM_SEARCH, + ] + if is_beam_search: + self._first_iter_beam_search = True + result = super().generate( + inputs, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + assistant_model, + streamer, + negative_prompt_ids, + negative_prompt_attention_mask, + **kwargs, + ) + return result + def _get_past_length(self, past_key_values=None): if past_key_values is None: return 0 if self.stateful: return self._past_length - if self.config.model_type in MULTI_QUERY_ATTN_MODELS: + if self.config.model_type in MULTI_QUERY_ATTN_MODELS and not ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): return past_key_values[0].shape[-2] seq_length_dim = -2 if self.config.model_type == "chatglm": @@ -558,12 +686,20 @@ def _reorder_cache( if self.stateful: # TODO: Apply it differently based on model type # TODO: At least for bloom we need to replicate values for each attention head - self.next_beam_idx = np.array(beam_idx) 
# save beam_idx to be used as an input in the next iteration + self.next_beam_idx = ( + np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx + ) # save beam_idx to be used as an input in the next iteration + self._second_iter_beam_search = False return past_key_values else: - return tuple( - tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values - ) + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS and not ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture + ): + return tuple( + tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) + for layer_past in past_key_values + ) + return tuple(np.take(past_state, beam_idx, 0) for past_state in past_key_values) def can_generate(self): """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" @@ -684,11 +820,12 @@ def _reorder_cache( This is required to match `past_key_values` with the correct beam_idx at every generation step. """ if self.stateful: - beam_idx = np.array(beam_idx) batch_size = beam_idx.shape[0] + beam_idx = np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx indices = np.array(range(batch_size * self.config.num_attention_heads)) indices = indices.reshape([batch_size, self.config.num_attention_heads]) self.next_beam_idx = np.take(indices, beam_idx, 0).flatten() + self._second_iter_beam_search = False return past_key_values else: standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) @@ -738,6 +875,24 @@ def _convert_to_standard_cache( for layer_past in past_key_value ) + def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_key_values: Tuple): + batch_size = logits.shape[0] + if indicies.shape[0] != 1: + logits = logits[indicies] + if past_key_values and not self.stateful: + pkv_standard = self._convert_to_standard_cache(past_key_values, batch_size) + pkv = tuple(tuple(past_state[indicies] for past_state in layer_past) for layer_past in pkv_standard) + past_key_values = self._convert_to_bloom_cache(pkv) + + if self.stateful: + self.next_beam_idx = ( + self.next_beam_idx[indicies] + if self.next_beam_idx is not None + else np.arange(batch_size, dtype=int)[indicies] + ) + self._second_iter_beam_search = True + return logits, past_key_values + class OVGPTBigCodeForCausalLM(OVModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache @@ -745,7 +900,9 @@ def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: if self.stateful: - self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration + # save beam_idx to be used as an input in the next iteration + self.next_beam_idx = np.array(beam_idx) if not self._second_iter_beam_search else self.next_beam_idx + self._second_iter_beam_search = False return past_key_values else: return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index d4f55c683..75c95c156 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -778,6 +778,87 @@ def test_default_filling_attention_mask_and_position_ids(self): del model_with_cache gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @pytest.mark.run_slow + @slow + def 
test_beam_search(self, model_arch): + model_kwargs = {} + model_id = MODEL_NAMES[model_arch] + if model_arch in self.REMOTE_CODE_MODELS: + model_kwargs = { + "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), + "trust_remote_code": True, + } + # Qwen tokenizer does not support padding, chatgm testing model produces nan that incompatible with beam search + if model_arch in ["qwen", "chatglm"]: + return + + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + ) + beam_sample_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=True, + eos_token_id=None, + top_k=1, + ) + + group_beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + num_beam_groups=2, + diversity_penalty=0.0000001, + ) + force_word = "cat" + force_words_ids = [tokenizer([force_word], add_special_tokens=False).input_ids] + constrained_beam_search_gen_config = GenerationConfig( + max_new_tokens=10, + min_new_tokens=10, + num_beams=4, + do_sample=False, + eos_token_id=None, + force_words_ids=force_words_ids, + ) + + gen_configs = [ + beam_search_gen_config, + beam_sample_gen_config, + group_beam_search_gen_config, + constrained_beam_search_gen_config, + ] + ov_model_stateful = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=True, stateful=True, **model_kwargs + ) + ov_model_stateless = OVModelForCausalLM.from_pretrained( + model_id, export=True, use_cache=True, stateful=False, **model_kwargs + ) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + tokenizer.pad_token_id = tokenizer.eos_token_id + tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + ov_model_stateful.generation_config.eos_token_id = None + ov_model_stateless.generation_config.eos_token_id = None + transformers_model.generation_config.eos_token_id = None + ov_model_stateful.config.eos_token_id = None + ov_model_stateless.config.eos_token_id = None + transformers_model.config.eos_token_id = None + + for gen_config in gen_configs: + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) + self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs)) + ov_stateless_outputs = ov_model_stateless.generate(**tokens, generation_config=gen_config) + self.assertTrue(torch.allclose(ov_stateless_outputs, transformers_outputs)) + class OVModelForMaskedLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( From 3cfbc38e466896b8e2f2f8142a9c538218f1294b Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Wed, 15 May 2024 22:47:06 +0800 Subject: [PATCH 23/47] add XPU support for `IPEXModel.from_pretrained` (#704) * add xpu support * Apply suggestions from code review no device_map Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * add recursive_to_device * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/ipex/modeling_base.py | 15 +++++++++++---- optimum/intel/utils/modeling_utils.py | 
13 +++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index d2963d55a..e929a4ddb 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -39,6 +39,7 @@ GenerationConfig, GenerationMixin, PretrainedConfig, + is_torch_xpu_available, ) from transformers.dynamic_module_utils import get_class_from_dynamic_module from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput @@ -52,7 +53,7 @@ from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_TASK, _patch_model from ..generation.modeling import prepare_jit_inputs from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version -from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask +from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask, recursive_to_device logger = logging.getLogger(__name__) @@ -128,10 +129,14 @@ def __init__( **kwargs, ): OptimizedModel.__init__(self, model=model, config=config) - # To do: add XPU support - self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - self._dtype = self.config.torch_dtype if self.config.torch_dtype is not None else torch.float32 + if is_torch_xpu_available(check_device=True): + self._device = torch.device("xpu:0") + elif torch.cuda.is_available(): + self._device = torch.device("cuda:0") + else: + self._device = torch.device("cpu") self.model.to(self._device) + self._dtype = self.config.torch_dtype if self.config.torch_dtype is not None else torch.float32 self.model_save_dir = model_save_dir self._is_ipex_exported = _is_patched_with_ipex(model, self.export_feature) @@ -321,6 +326,8 @@ def _init_warmup(self): if not self._is_ipex_exported: use_cache = "past_key_values" in self.input_names dummy_inputs = prepare_jit_inputs(self, self.export_feature, use_cache) + if self._device.type != "cpu": + dummy_inputs = recursive_to_device(value=dummy_inputs, device=self._device) for _ in range(2): self(**dummy_inputs) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 99ad42aaf..a2cd72835 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -169,3 +169,16 @@ def get_model_device(model: torch.nn.Module) -> torch.device: # The model had no parameters at all, doesn't matter which device to choose device = torch.device("cpu") return device + + +def recursive_to_device(value, device): + """ + Recursivley move the tensor element in `value` to `device` + """ + if isinstance(value, (tuple, list)): + return type(value)(recursive_to_device(v, device) for v in value) + elif isinstance(value, dict): + return {k: recursive_to_device(v, device) for k, v in value.items()} + elif isinstance(value, torch.Tensor): + return value.to(device) + return value From 02d5e4eee1dd94babb29fff39d988d08d039a126 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 19:18:05 +0400 Subject: [PATCH 24/47] Cover more models with openvino export (#709) * cover more models with openvino export * xglm * fix tests --- optimum/exporters/openvino/model_configs.py | 55 +++++++++++++++++++++ tests/openvino/test_modeling.py | 10 +++- tests/openvino/utils_tests.py | 5 ++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 575f1cc4d..47ca4ff24 100644 --- 
a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -592,3 +592,58 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return { "sample": {0: "batch_size", 2: "height", 3: "width"}, } + + +@register_in_tasks_manager( + "persimmon", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class PersimmonOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager("biogpt", *["text-generation", "text-generation-with-past"], library_name="transformers") +class BioGPTOpenVINOConfig(TextDecoderOnnxConfig): + # BioGPT does not require position_ids input. + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager( + "gpt-neox-japanese", *["text-generation", "text-generation-with-past"], library_name="transformers" +) +class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig): + # GPTNeoxJapanese does not require position_ids input. + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + +@register_in_tasks_manager( + "cohere", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class CohereOpenVINOConfig(LlamaOpenVINOConfig): + pass + + +@register_in_tasks_manager("xglm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class XGLMConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_attention_heads="attention_heads", hidden_size="d_model" + ) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 75c95c156..0a0b66b86 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -552,6 +552,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "orion", "falcon", "falcon-40b", + "persimmon", + "biogpt", + "gpt_neox_japanese", + "cohere", + "xglm", ) GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( @@ -617,8 +622,11 @@ def test_compare_to_transformers(self, model_arch): if model_arch == "qwen": return - if model_arch != "chatglm": + if model_arch not in ["chatglm", "persimmon"]: tokenizer.pad_token_id = tokenizer.eos_token_id + + if model_arch == "persimmon": + tokenizer.pad_token_id = tokenizer.bos_token_id # Compare batched generation tokenizer.padding_side = "left" tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 9f28e40a4..aa3ea5f33 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -26,11 +26,13 @@ "baichuan2": "katuni4ka/tiny-random-baichuan2", "baichuan2-13b": "katuni4ka/tiny-random-baichuan2-13b", "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", + "biogpt": "hf-tiny-model-private/tiny-random-BioGptForCausalLM", "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "cohere": 
"hf-internal-testing/tiny-random-CohereForCausalLM", "chatglm": "katuni4ka/tiny-random-chatglm2", "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", "data2vec_text": "hf-internal-testing/tiny-random-Data2VecTextModel", @@ -51,6 +53,7 @@ "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gpt_neox_japanese": "hf-internal-testing/tiny-random-GPTNeoXJapaneseForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-ibert", @@ -78,6 +81,7 @@ "olmo": "katuni4ka/tiny-random-olmo-hf", "orion": "katuni4ka/tiny-random-orion", "pegasus": "hf-internal-testing/tiny-random-pegasus", + "persimmon": "hf-internal-testing/tiny-random-PersimmonForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", "phi": "echarlaix/tiny-random-PhiForCausalLM", "phi3": "katuni4ka/tiny-random-phi3", @@ -115,6 +119,7 @@ "whisper": "openai/whisper-tiny.en", "xlm": "hf-internal-testing/tiny-random-xlm", "xlm_roberta": "hf-internal-testing/tiny-xlm-roberta", + "xglm": "hf-internal-testing/tiny-random-XGLMForCausalLM", } From c74388603300c077ccba9cbc82d67b703666daec Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 15 May 2024 17:18:40 +0200 Subject: [PATCH 25/47] IPEX test refactorization (#711) --- tests/ipex/test_inference.py | 87 +++++++++++++++--------------------- tests/ipex/test_modeling.py | 44 +----------------- tests/ipex/test_pipelines.py | 45 +------------------ tests/ipex/utils_tests.py | 57 +++++++++++++++++++++++ 4 files changed, 96 insertions(+), 137 deletions(-) create mode 100644 tests/ipex/utils_tests.py diff --git a/tests/ipex/test_inference.py b/tests/ipex/test_inference.py index b65d3c9b8..1a452fe40 100644 --- a/tests/ipex/test_inference.py +++ b/tests/ipex/test_inference.py @@ -16,8 +16,6 @@ import torch from parameterized import parameterized - -# TODO : add more tasks from transformers import ( AutoModelForCausalLM, AutoModelForQuestionAnswering, @@ -26,60 +24,51 @@ AutoTokenizer, pipeline, ) +from utils_tests import MODEL_NAMES from optimum.intel import inference_mode as ipex_inference_mode from optimum.intel.ipex.modeling_base import IPEXModel -MODEL_NAMES = { - "bert": "hf-internal-testing/tiny-random-bert", - "bloom": "hf-internal-testing/tiny-random-BloomModel", - "distilbert": "hf-internal-testing/tiny-random-distilbert", - "roberta": "hf-internal-testing/tiny-random-roberta", - "gptj": "hf-internal-testing/tiny-random-gptj", - "gpt2": "hf-internal-testing/tiny-random-gpt2", - "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", - "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "llama": "fxmarty/tiny-llama-fast-tokenizer", - "llama2": "Jiqing/tiny_random_llama2", - "opt": "hf-internal-testing/tiny-random-OPTModel", - "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", -} - _CLASSIFICATION_TASK_TO_AUTOMODELS = { "text-classification": AutoModelForSequenceClassification, "token-classification": AutoModelForTokenClassification, } -class IPEXIntegrationTest(unittest.TestCase): - CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( +class IPEXClassificationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( "bert", "distilbert", "roberta", ) - TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( - "bloom", - "gptj", - "gpt2", - 
"gpt_neo", - "gpt_bigcode", - "llama", - "llama2", - "opt", - "mpt", - ) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = "This is a sample input" + for task, auto_model_class in _CLASSIFICATION_TASK_TO_AUTOMODELS.items(): + model = auto_model_class.from_pretrained(model_id, torch_dtype=torch.float32) + pipe = pipeline(task, model=model, tokenizer=tokenizer) - QA_SUPPORTED_ARCHITECTURES = ( + with torch.inference_mode(): + outputs = pipe(inputs) + with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: + outputs_ipex = ipex_pipe(inputs) + self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(outputs[0]["score"], outputs_ipex[0]["score"]) + + +class IPEXQuestionAnsweringTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( "bert", "distilbert", "roberta", ) - @parameterized.expand(QA_SUPPORTED_ARCHITECTURES) - def test_question_answering_pipeline_inference(self, model_arch): + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForQuestionAnswering.from_pretrained(model_id, torch_dtype=torch.float32) @@ -95,24 +84,22 @@ def test_question_answering_pipeline_inference(self, model_arch): self.assertEqual(outputs["start"], outputs_ipex["start"]) self.assertEqual(outputs["end"], outputs_ipex["end"]) - @parameterized.expand(CLASSIFICATION_SUPPORTED_ARCHITECTURES) - def test_classification_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" - for task, auto_model_class in _CLASSIFICATION_TASK_TO_AUTOMODELS.items(): - model = auto_model_class.from_pretrained(model_id, torch_dtype=torch.float32) - pipe = pipeline(task, model=model, tokenizer=tokenizer) - with torch.inference_mode(): - outputs = pipe(inputs) - with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: - outputs_ipex = ipex_pipe(inputs) - self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(outputs[0]["score"], outputs_ipex[0]["score"]) +class IPEXTextGenerationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ( + "bloom", + "gptj", + "gpt2", + "gpt_neo", + "gpt_bigcode", + "llama", + "llama2", + "opt", + "mpt", + ) - @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) - def test_text_generation_pipeline_inference(self, model_arch): + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, return_dict=False) model = model.eval() diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 94a5ca9e1..2a2f18f6f 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -45,53 +45,11 @@ ) from optimum.intel.utils.import_utils import is_ipex_version from optimum.utils.testing_utils import grid_parameters +from utils_tests import MODEL_NAMES SEED = 42 -MODEL_NAMES = { - "albert": "hf-internal-testing/tiny-random-albert", - "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", - "bert": 
"hf-internal-testing/tiny-random-bert", - "bart": "hf-internal-testing/tiny-random-bart", - "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", - "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", - "bloom": "hf-internal-testing/tiny-random-BloomModel", - "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", - "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", - "convnext": "hf-internal-testing/tiny-random-convnext", - "distilbert": "hf-internal-testing/tiny-random-distilbert", - "electra": "hf-internal-testing/tiny-random-electra", - "flaubert": "hf-internal-testing/tiny-random-flaubert", - "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "gpt2": "hf-internal-testing/tiny-random-gpt2", - "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", - "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gptj": "hf-internal-testing/tiny-random-GPTJModel", - "levit": "hf-internal-testing/tiny-random-LevitModel", - "llama": "fxmarty/tiny-llama-fast-tokenizer", - "llama2": "Jiqing/tiny_random_llama2", - "marian": "sshleifer/tiny-marian-en-de", - "mbart": "hf-internal-testing/tiny-random-mbart", - "mistral": "echarlaix/tiny-random-mistral", - "mobilenet_v1": "google/mobilenet_v1_0.75_192", - "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", - "mobilevit": "hf-internal-testing/tiny-random-mobilevit", - "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", - "mt5": "stas/mt5-tiny-random", - "opt": "hf-internal-testing/tiny-random-OPTModel", - "phi": "echarlaix/tiny-random-PhiForCausalLM", - "resnet": "hf-internal-testing/tiny-random-resnet", - "roberta": "hf-internal-testing/tiny-random-roberta", - "roformer": "hf-internal-testing/tiny-random-roformer", - "squeezebert": "hf-internal-testing/tiny-random-squeezebert", - "t5": "hf-internal-testing/tiny-random-t5", - "unispeech": "hf-internal-testing/tiny-random-unispeech", - "vit": "hf-internal-testing/tiny-random-vit", - "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", - "xlm": "hf-internal-testing/tiny-random-xlm", -} - class Timer(object): def __enter__(self): diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index 89a27ab2c..c4ae471a0 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -20,6 +20,7 @@ from parameterized import parameterized from transformers import AutoTokenizer from transformers.pipelines import pipeline as transformers_pipeline +from utils_tests import MODEL_NAMES from optimum.intel.ipex.modeling_base import ( IPEXModelForAudioClassification, @@ -33,50 +34,6 @@ from optimum.intel.pipelines import pipeline as ipex_pipeline -MODEL_NAMES = { - "albert": "hf-internal-testing/tiny-random-albert", - "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", - "bert": "hf-internal-testing/tiny-random-bert", - "bart": "hf-internal-testing/tiny-random-bart", - "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", - "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", - "bloom": "hf-internal-testing/tiny-random-BloomModel", - "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", - "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", - "convnext": "hf-internal-testing/tiny-random-convnext", - "distilbert": "hf-internal-testing/tiny-random-distilbert", - "electra": "hf-internal-testing/tiny-random-electra", - "flaubert": "hf-internal-testing/tiny-random-flaubert", - "gpt_bigcode": 
"hf-internal-testing/tiny-random-GPTBigCodeModel", - "gpt2": "hf-internal-testing/tiny-random-gpt2", - "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", - "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gptj": "hf-internal-testing/tiny-random-GPTJModel", - "levit": "hf-internal-testing/tiny-random-LevitModel", - "llama": "fxmarty/tiny-llama-fast-tokenizer", - "llama2": "Jiqing/tiny_random_llama2", - "marian": "sshleifer/tiny-marian-en-de", - "mbart": "hf-internal-testing/tiny-random-mbart", - "mistral": "echarlaix/tiny-random-mistral", - "mobilenet_v1": "google/mobilenet_v1_0.75_192", - "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", - "mobilevit": "hf-internal-testing/tiny-random-mobilevit", - "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", - "mt5": "stas/mt5-tiny-random", - "opt": "hf-internal-testing/tiny-random-OPTModel", - "phi": "echarlaix/tiny-random-PhiForCausalLM", - "resnet": "hf-internal-testing/tiny-random-resnet", - "roberta": "hf-internal-testing/tiny-random-roberta", - "roformer": "hf-internal-testing/tiny-random-roformer", - "squeezebert": "hf-internal-testing/tiny-random-squeezebert", - "t5": "hf-internal-testing/tiny-random-t5", - "unispeech": "hf-internal-testing/tiny-random-unispeech", - "vit": "hf-internal-testing/tiny-random-vit", - "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", - "xlm": "hf-internal-testing/tiny-random-xlm", -} - - class PipelinesIntegrationTest(unittest.TestCase): COMMON_SUPPORTED_ARCHITECTURES = ( "albert", diff --git a/tests/ipex/utils_tests.py b/tests/ipex/utils_tests.py new file mode 100644 index 000000000..a14f0bf7c --- /dev/null +++ b/tests/ipex/utils_tests.py @@ -0,0 +1,57 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +MODEL_NAMES = { + "albert": "hf-internal-testing/tiny-random-albert", + "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", + "bert": "hf-internal-testing/tiny-random-bert", + "bart": "hf-internal-testing/tiny-random-bart", + "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", + "bloom": "hf-internal-testing/tiny-random-BloomModel", + "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "convnext": "hf-internal-testing/tiny-random-convnext", + "distilbert": "hf-internal-testing/tiny-random-distilbert", + "electra": "hf-internal-testing/tiny-random-electra", + "flaubert": "hf-internal-testing/tiny-random-flaubert", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", + "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "levit": "hf-internal-testing/tiny-random-LevitModel", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", + "marian": "sshleifer/tiny-marian-en-de", + "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", + "mobilevit": "hf-internal-testing/tiny-random-mobilevit", + "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", + "mt5": "stas/mt5-tiny-random", + "opt": "hf-internal-testing/tiny-random-OPTModel", + "phi": "echarlaix/tiny-random-PhiForCausalLM", + "resnet": "hf-internal-testing/tiny-random-resnet", + "roberta": "hf-internal-testing/tiny-random-roberta", + "roformer": "hf-internal-testing/tiny-random-roformer", + "squeezebert": "hf-internal-testing/tiny-random-squeezebert", + "t5": "hf-internal-testing/tiny-random-t5", + "unispeech": "hf-internal-testing/tiny-random-unispeech", + "vit": "hf-internal-testing/tiny-random-vit", + "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "xlm": "hf-internal-testing/tiny-random-xlm", +} From 8c2b787cc75a45ae4670d37970a5394eba90eedc Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 19:33:55 +0400 Subject: [PATCH 26/47] Add sdpa for phi3 openvino model (#705) * add sdpa for phi3 openvino model * fix pkv filling according model code * Update optimum/exporters/openvino/model_patcher.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * import helpers from phi3 if available --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/openvino/model_configs.py | 6 ++ optimum/exporters/openvino/model_patcher.py | 94 ++++++++++++++++++++- 2 files changed, 99 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 47ca4ff24..dc1351211 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -485,6 +485,12 @@ def patch_model_for_export( library_name="transformers", ) class Phi3OpenVINOConfig(PhiOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + MistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + 
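    # The Mistral dummy past-key-values generator is reused here because phi3 also uses
    # grouped-query attention, so its KV-cache shapes depend on num_key_value_heads
    # (exposed via the normalized config just below).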
NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f68e873d4..55afb0ffe 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -951,15 +951,107 @@ def __exit__(self, exc_type, exc_value, traceback): block.attention.forward = block.attention._orig_forward +# Adapted from https://github.com/huggingface/transformers/blob/ccdabc5642bf84849af93f591e207dc625c8e1e1/src/transformers/models/phi3/modeling_phi3.py#L426 +def _phi3_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + return self._orig_forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + # TO DO: remove llama imports when transformers with phi3 support will be released + try: + from transformers.models.phi3.modelling_phi3 import apply_rotary_pos_emb, repeat_kv + except ImportError: + from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
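+        # Calling .contiguous() below works around that kernel issue; it is a no-op when the
+        # tensors are already contiguous, and only the CUDA-with-attention-mask path is affected.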
+ if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + class Phi3ModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() - # https://github.com/huggingface/transformers/blob/30ee508c6c92a1c0aa0281d193c7c0fb815b8d2f/src/transformers/models/phi3/modeling_phi3.py#L113 # init inv_freq for torchscript tracing for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_phi3_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + if layer.self_attn.rotary_emb.inv_freq is None: rotary_emb = layer.self_attn.rotary_emb layer.self_attn.rotary_emb.inv_freq = 1.0 / ( rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) ) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward From c30d488f8a46a3980795951edf1f0dc53c0efb0a Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Thu, 16 May 2024 16:51:15 +0200 Subject: [PATCH 27/47] Fix diffusers requirement for quantizing models (#712) --- optimum/intel/openvino/quantization.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 45961a86f..17305b947 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -48,7 +48,7 @@ from ...exporters.openvino.model_patcher import patch_model_with_bettertransformer from ...exporters.openvino.stateful import ensure_export_task_support_stateful, ensure_stateful_is_available from ..utils.constant import _TASK_ALIASES -from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available +from ..utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available, is_diffusers_available from ..utils.modeling_utils import get_model_device from .configuration import OVConfig, OVQuantizationConfig, OVQuantizationMethod, OVWeightQuantizationConfig from .modeling_base import OVBaseModel @@ -325,7 +325,8 @@ def _quantize_ovbasemodel( remove_unused_columns: bool = True, **kwargs, ): - from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase + if is_diffusers_available(): + from optimum.intel.openvino.modeling_diffusion import OVStableDiffusionPipelineBase if save_directory is not None: save_directory = Path(save_directory) @@ -335,7 +336,7 @@ def _quantize_ovbasemodel( if calibration_dataset is not None: # Process custom calibration dataset - if isinstance(self.model, 
OVStableDiffusionPipelineBase): + if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset=calibration_dataset ) @@ -373,7 +374,7 @@ def _quantize_ovbasemodel( if isinstance(self.model, OVModelForCausalLM): calibration_dataset = self._prepare_builtin_dataset(quantization_config) - elif isinstance(self.model, OVStableDiffusionPipelineBase): + elif is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset_name=quantization_config.dataset ) @@ -385,7 +386,7 @@ def _quantize_ovbasemodel( if quantization_config.quant_method == OVQuantizationMethod.HYBRID: if calibration_dataset is None: raise ValueError("Calibration dataset is required to run hybrid quantization.") - if isinstance(self.model, OVStableDiffusionPipelineBase): + if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): # Apply weight-only quantization to all SD submodels except UNet quantization_config_copy = copy.deepcopy(quantization_config) quantization_config_copy.dataset = None From 715c054360946b312e94beec91c55a1223258954 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 17 May 2024 13:38:58 +0400 Subject: [PATCH 28/47] Skip saving gen config if saving failed (#717) --- optimum/exporters/openvino/convert.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index baa34a5cd..3b214f77e 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -614,7 +614,12 @@ def export_from_model( model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) if generation_config is not None: - generation_config.save_pretrained(output) + try: + generation_config.save_pretrained(output) + except Exception as exception: + logger.warning( + f"The generation config will not be saved, saving failed with following error:\n{exception}" + ) model_name_or_path = model.config._name_or_path maybe_save_preprocessors(model_name_or_path, output, trust_remote_code=trust_remote_code) From 60d5bf6e0c67f3813fb2148ce3e7258ed84d27a4 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 17 May 2024 20:01:56 +0400 Subject: [PATCH 29/47] Add support export for new architectures (#716) * support export more models * update aquila to support v1 and v2 --- optimum/exporters/openvino/model_configs.py | 89 ++++++- optimum/exporters/openvino/model_patcher.py | 279 +++++++++++++++++++- tests/openvino/test_modeling.py | 11 + tests/openvino/utils_tests.py | 4 + 4 files changed, 378 insertions(+), 5 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index dc1351211..8feeafd61 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -41,15 +41,18 @@ from optimum.utils.normalized_config import NormalizedTextConfig from .model_patcher import ( + AquilaModelPatcher, BaichuanModelPatcher, ChatGLMModelPatcher, GemmaModelPatcher, - InternLMPatcher, + InternLM2Patcher, + InternLMModelPatcher, LlamaModelPatcher, MixtralModelPatcher, MPTModelPatcher, Phi3ModelPatcher, QwenModelPatcher, + XverseModelPatcher, ) @@ -445,7 +448,7 @@ class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): def 
patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": - return InternLMPatcher(self, model, model_kwargs=model_kwargs) + return InternLM2Patcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") @@ -653,3 +656,85 @@ class XGLMConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="attention_heads", hidden_size="d_model" ) + + +class AquilaDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task, + normalized_config, + batch_size, + sequence_length, + random_batch_size_range, + random_sequence_length_range, + **kwargs, + ) + self.num_key_value_heads = getattr( + normalized_config, "num_key_value_heads", normalized_config.num_attention_heads + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sequence_length, + self.hidden_size // self.num_attention_heads, + ) + return [ + ( + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + +@register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers") +class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return AquilaModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers") +class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return XverseModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], library_name="transformers") +class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + def 
patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return InternLMModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 55afb0ffe..33fd77cba 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -844,7 +844,7 @@ def __exit__(self, exc_type, exc_value, traceback): block.attn.forward = block.attn._orig_forward -def _internlm_attention_forward( +def _internlm2_attention_forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, @@ -935,14 +935,14 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return attn_output, attn_weights, past_key_value -class InternLMPatcher(DecoderModelPatcher): +class InternLM2Patcher(DecoderModelPatcher): def __enter__(self): super().__enter__() if is_torch_version(">=", "2.1.0"): for block in self._model.model.layers: block.attention._orig_forward = block.attention.forward - block.attention.forward = types.MethodType(_internlm_attention_forward, block.attention) + block.attention.forward = types.MethodType(_internlm2_attention_forward, block.attention) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -1055,3 +1055,276 @@ def __exit__(self, exc_type, exc_value, traceback): for layer in self._model.model.layers: if hasattr(layer.self_attn, "_orig_forward"): layer.self_attn.forward = layer.self_attn._orig_forward + + +def _aquila_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
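+        # After squeezing, the cos/sin rows for the requested positions are gathered and the
+        # rotary transform q * cos + rotate_half(q) * sin is applied to queries and keys.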
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + bsz, q_len, _ = hidden_states.size() + + if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, getattr(self, "num_key_value_heads", self.num_heads), self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, getattr(self, "num_key_value_heads", self.num_heads), self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + if hasattr(self, "num_key_value_groups"): + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + +class AquilaModelPatcher(DecoderModelPatcher): + def 
__enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_aquila_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward + + +def _xverse_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + +def _internlm_self_attn_sdpa_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of 
the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + cos = cos[position_ids].unsqueeze(1) + sin = sin[position_ids].unsqueeze(1) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + if output_attentions: + return self._orig_forward( + hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache + ) + + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, scale=(1 / math.sqrt(self.head_dim)) + ) + attn_weights = None + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights, past_key_value + + +class XverseModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_xverse_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward + + +class InternLMModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_internlm_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 0a0b66b86..1191a9390 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -557,6 +557,10 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neox_japanese", "cohere", "xglm", + "aquila", + "aquila2", + "xverse", + "internlm", ) GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( @@ -569,6 +573,10 @@ class 
OVModelForCausalLMIntegrationTest(unittest.TestCase): "internlm2", "orion", "phi3", + "aquila", + "aquila2", + "xverse", + "internlm", ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -596,6 +604,7 @@ def test_compare_to_transformers(self, model_arch): self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful) tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) tokens = tokenizer("This is a sample output", return_tensors="pt") + tokens.pop("token_type_ids", None) ov_outputs = ov_model(**tokens) self.assertTrue("logits" in ov_outputs) @@ -630,6 +639,7 @@ def test_compare_to_transformers(self, model_arch): # Compare batched generation tokenizer.padding_side = "left" tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens.pop("token_type_ids", None) ov_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None ov_model.config.eos_token_id = None @@ -853,6 +863,7 @@ def test_beam_search(self, model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) tokenizer.pad_token_id = tokenizer.eos_token_id tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens.pop("token_type_ids", None) ov_model_stateful.generation_config.eos_token_id = None ov_model_stateless.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index aa3ea5f33..1627112c5 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -18,6 +18,8 @@ MODEL_NAMES = { "albert": "hf-internal-testing/tiny-random-albert", + "aquila": "katuni4ka/tiny-random-aquilachat", + "aquila2": "katuni4ka/tiny-random-aquila2", "audio_spectrogram_transformer": "Ericwang/tiny-random-ast", "bge": "BAAI/bge-small-en-v1.5", "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", @@ -57,6 +59,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-ibert", + "internlm": "katuni4ka/tiny-random-internlm", "internlm2": "katuni4ka/tiny-random-internlm2", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", @@ -120,6 +123,7 @@ "xlm": "hf-internal-testing/tiny-random-xlm", "xlm_roberta": "hf-internal-testing/tiny-xlm-roberta", "xglm": "hf-internal-testing/tiny-random-XGLMForCausalLM", + "xverse": "katuni4ka/tiny-random-xverse", } From bc5051fecf4fd6b03f7e7f261ec2d466aa688049 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 17 May 2024 18:04:35 +0200 Subject: [PATCH 30/47] Add --all-layers argument to openvino CLI (#713) * Add --all-layers argument to CLI * Update description --- optimum/commands/export/openvino.py | 11 +++++++++++ tests/openvino/test_exporters_cli.py | 21 +++++++++------------ tests/openvino/utils_tests.py | 2 -- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 025a40e05..ffd084d4e 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -119,6 +119,15 @@ def parse_args_openvino(parser: "ArgumentParser"): "or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models." 
), ) + optional_group.add_argument( + "--all-layers", + action="store_true", + default=None, + help=( + "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight " + "compression is applied, they are compressed to INT8." + ), + ) optional_group.add_argument( "--disable-stateful", action="store_true", @@ -198,6 +207,7 @@ def run(self): and self.args.ratio is None and self.args.group_size is None and self.args.sym is None + and self.args.all_layers is None and self.args.model in _DEFAULT_4BIT_CONFIGS ): quantization_config = _DEFAULT_4BIT_CONFIGS[self.args.model] @@ -207,6 +217,7 @@ def run(self): "ratio": 1 if is_int8 else (self.args.ratio or 0.8), "sym": self.args.sym or False, "group_size": -1 if is_int8 else self.args.group_size, + "all_layers": None if is_int8 else self.args.all_layers, } if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index cac79abae..cce25bbae 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -18,7 +18,6 @@ from parameterized import parameterized from utils_tests import ( - _ARCHITECTURES_TO_EXPECTED_INT4_INT8, _ARCHITECTURES_TO_EXPECTED_INT8, MODEL_NAMES, get_num_quantized_nodes, @@ -84,14 +83,13 @@ class OVCLIExportTestCase(unittest.TestCase): ("latent-consistency", 50, 135), ) - SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),) - - SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"] - - TEST_4BIT_CONFIGURATONS = [] - for arch in SUPPORTED_4BIT_ARCHITECTURES: - for option in SUPPORTED_4BIT_OPTIONS: - TEST_4BIT_CONFIGURATONS.append([arch[0], arch[1], option]) + TEST_4BIT_CONFIGURATONS = [ + ("text-generation-with-past", "opt125m", "int4_sym_g128", 62, 86), + ("text-generation-with-past", "opt125m", "int4_asym_g128", 62, 86), + ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86), + ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86), + ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32), + ] def _openvino_export( self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None @@ -197,17 +195,16 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in self.assertEqual(exp_num_fq, num_fq) @parameterized.expand(TEST_4BIT_CONFIGURATONS) - def test_exporters_cli_int4(self, task: str, model_type: str, option: str): + def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int): with TemporaryDirectory() as tmpdir: subprocess.run( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", shell=True, check=True, ) model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {} model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs) - expected_int8, expected_int4 = _ARCHITECTURES_TO_EXPECTED_INT4_INT8[model_type] _, num_int8, num_int4 = get_num_quantized_nodes(model) self.assertEqual(expected_int8, num_int8) self.assertEqual(expected_int4, num_int4) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 1627112c5..d4364d192 
100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -149,8 +149,6 @@ "stable-diffusion-xl-refiner": (366, 34, 42, 66), } -_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (62, 86)} - def get_num_quantized_nodes(ov_model): num_fake_quantize = 0 From 7114900cd3d80fdfc6bc18aff1d016bd6b626e31 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Sun, 19 May 2024 01:53:17 +0400 Subject: [PATCH 31/47] fix beam search test reported issues (#718) * fix beam search test reported issues * test beam search * refactor applying code style with preserve logic for olmo --- optimum/intel/openvino/modeling_decoder.py | 22 ++++++++-------------- tests/openvino/test_modeling.py | 6 ++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index e4dc1ed78..2ad04ab14 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -386,10 +386,8 @@ def prepare_inputs( inputs = {} if not self.stateful: if past_key_values is not None: - if ( - self.config.model_type not in MULTI_QUERY_ATTN_MODELS - or self.config.model_type == "falcon" - and self.config.new_decoder_architecture + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture ): if self._pkv_precision == Type.bf16: # numpy does not support bf16, pretending f16, should change to bf16 @@ -499,10 +497,8 @@ def forward( if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) - if ( - self.config.model_type not in MULTI_QUERY_ATTN_MODELS - or self.config.model_type == "falcon" - and self.config.new_decoder_architecture + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture ): # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) past_key_values = tuple( @@ -559,10 +555,8 @@ def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_ke if indicies.shape[0] != 1: logits = logits[indicies] if past_key_values and not self.stateful: - if ( - self.config.model_type not in MULTI_QUERY_ATTN_MODELS - or self.config.model_type == "falcon" - and self.config.new_decoder_architecture + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( + self.config.model_type == "falcon" and self.config.new_decoder_architecture ): past_key_values = tuple( tuple( @@ -581,7 +575,7 @@ def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_ke if self.next_beam_idx is not None else np.arange(batch_size, dtype=int)[indicies] ) - self._second_iter_beam_search = True + self._second_iter_beam_search = True return logits, past_key_values def _deduplicate_inputs(self, model_inputs: Dict): @@ -692,7 +686,7 @@ def _reorder_cache( self._second_iter_beam_search = False return past_key_values else: - if self.config.model_type not in MULTI_QUERY_ATTN_MODELS and not ( + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or ( self.config.model_type == "falcon" and self.config.new_decoder_architecture ): return tuple( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 1191a9390..692720a97 100644 --- a/tests/openvino/test_modeling.py +++ 
b/tests/openvino/test_modeling.py @@ -812,6 +812,10 @@ def test_beam_search(self, model_arch): return tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) + if model_arch == "persimmon": + tokenizer.pad_token_id = tokenizer.bos_token_id + tokenizer.eos_token_id = tokenizer.bos_token_id + beam_search_gen_config = GenerationConfig( max_new_tokens=10, min_new_tokens=10, @@ -872,6 +876,8 @@ def test_beam_search(self, model_arch): transformers_model.config.eos_token_id = None for gen_config in gen_configs: + if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]: + continue transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs)) From 7a929e8d6da0dac4fbd8995add32a663e7b9afc5 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 21 May 2024 13:35:53 +0400 Subject: [PATCH 32/47] Fix backward compatibility for GenerationMode import (#719) --- optimum/intel/openvino/modeling_decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 2ad04ab14..933d92a50 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -28,10 +28,10 @@ from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin -from transformers.generation.configuration_utils import GenerationConfig, GenerationMode +from transformers.generation.configuration_utils import GenerationConfig from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.stopping_criteria import StoppingCriteriaList -from transformers.generation.utils import GenerateOutput +from transformers.generation.utils import GenerateOutput, GenerationMode from transformers.modeling_outputs import CausalLMOutputWithPast from optimum.utils.normalized_config import NormalizedConfigManager From c69fe32c638e52433016df8d1a6746db3e7e70da Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 21 May 2024 20:52:25 +0400 Subject: [PATCH 33/47] Add support export for new architectures (#720) * update codegen config for support codegen2 * add support DBRX * add qwen2moe support * fix test models * buichuan sdpa * apply review comments * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/openvino/model_configs.py | 47 +++ optimum/exporters/openvino/model_patcher.py | 343 +++++++++++++++++++- tests/openvino/test_modeling.py | 4 + tests/openvino/utils_tests.py | 3 + 4 files changed, 396 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 8feeafd61..d69adc9da 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -20,6 +20,7 @@ from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig from 
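As an aside, patch 32 above simply moves the `GenerationMode` import to the module path that exists across the supported transformers range. A more defensive variant of the same idea (a sketch, not what the patch does) would try the two paths touched in the hunk in order:

try:
    from transformers.generation.utils import GenerationMode
except ImportError:
    from transformers.generation.configuration_utils import GenerationMode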
optimum.exporters.onnx.model_configs import ( + CodeGenOnnxConfig, FalconOnnxConfig, GemmaOnnxConfig, LlamaOnnxConfig, @@ -44,6 +45,8 @@ AquilaModelPatcher, BaichuanModelPatcher, ChatGLMModelPatcher, + CodeGenModelPatcher, + DBRXModelPatcher, GemmaModelPatcher, InternLM2Patcher, InternLMModelPatcher, @@ -112,6 +115,15 @@ class Qwen2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +@register_in_tasks_manager("qwen2-moe", *["text-generation", "text-generation-with-past"], library_name="transformers") +class Qwen2MoEOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + @register_in_tasks_manager("minicpm", *["text-generation", "text-generation-with-past"], library_name="transformers") class MiniCPMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 @@ -738,3 +750,38 @@ def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return InternLMModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "codegen", + *["feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past"], + library_name="transformers", +) +class CodeGenOpenVINOConfig(CodeGenOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return CodeGenModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "dbrx", + *["text-generation", "text-generation-with-past"], + library_name="transformers", +) +class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( + num_attention_heads="n_heads", + hidden_size="d_model", + num_layers="n_layers", + num_key_value_heads="attn_config.kv_n_heads", + allow_new=True, + ) + DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return DBRXModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 33fd77cba..93a843052 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -20,6 +20,7 @@ import torch import torch.nn.functional as F +from transformers.cache_utils import Cache, StaticCache from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.utils import is_tf_available @@ -43,6 +44,9 @@ from transformers.modeling_tf_utils import TFPreTrainedModel +BETTERTRANSFORMER_IGNORE = ("codegen",) + + def patch_model_with_bettertransformer(model): COLOR_RED = "\033[1;31m" COLOR_RESET = "\033[0m" @@ -81,6 +85,10 @@ def patch_model_with_bettertransformer(model): # model already has required SDPA implementation if getattr(model, "_supports_sdpa", False) and getattr(model.config, "_attn_implementation", "eager") == "sdpa": return model + + if 
model.config.model_type in BETTERTRANSFORMER_IGNORE: + return model + try: model = model.to_bettertransformer() except Exception as e: @@ -665,6 +673,72 @@ def _baichuan13b_atten_forward( return attn_output, attn_weights, past_key_value +# Adapted from https://huggingface.co/baichuan-inc/Baichuan-7B/blob/262c8cb58b6d3615c208d9230baa869fddee2adb/modeling_baichuan.py#L181 +def _baichuan7b_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + bsz, q_len, _ = hidden_states.size() + + proj = self.W_pack(hidden_states) + proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) + query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + if not output_attentions: + attn_weights = None + attn_output = F.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=attention_mask, scale=1 / math.sqrt(self.head_dim) + ) + else: + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + return attn_output, attn_weights, past_key_value + + class BaichuanModelPatcher(DecoderModelPatcher): def __init__( self, @@ -712,13 +786,18 @@ def forward( for layer in self._model.model.layers: layer.self_attn._orig_forward = layer.self_attn.forward layer.self_attn.forward = types.MethodType(_baichuan13b_atten_forward, layer.self_attn) + else: + for layer in self._model.model.layers: + 
layer.self_attn._orig_forward = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_baichuan7b_attn_forward, layer.self_attn) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) if hasattr(self._model, "_orig_forward"): self._model.forward = self._model._orig_forward - for layer in self._model.model.layers: + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): layer.self_attn.forward = layer.self_attn._orig_forward @@ -1328,3 +1407,265 @@ def __exit__(self, exc_type, exc_value, traceback): for layer in self._model.model.layers: if hasattr(layer.self_attn, "_orig_forward"): layer.self_attn.forward = layer.self_attn._orig_forward + + +class CodeGenModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + + # whole codegen bettertransformer patch include attn.forward and does not cover codegen2. + # For avoiding breaking model on tracing stage, we reduce area of bettertransformer patch only for _attn. + from optimum.bettertransformer.models.attention import codegen_wrapped_scaled_dot_product + + for layer in self._model.transformer.h: + if is_torch_version(">=", "2.1.0") and not self._model.config.output_attentions: + orig_self_attn_fwd = layer.attn._attn + layer.attn._attn = types.MethodType(codegen_wrapped_scaled_dot_product, layer.attn) + layer.attn._orig_attn = orig_self_attn_fwd + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + for layer in self._model.transformer.h: + if hasattr(layer.attn, "_orig_attn"): + layer.attn._attn = layer.attn._orig_attn + + +# adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L763 +def _dbrx_experts_forward( + self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor +): + bsz, q_len, hidden_size = x.shape + x = x.view(-1, hidden_size) + out = torch.zeros_like(x) + + expert_mask = torch.nn.functional.one_hot(top_experts, num_classes=self.moe_num_experts).permute(2, 1, 0) + # Chunk experts at once to avoid storing full parameter multiple times in autograd + w1_chunked = self.mlp.w1.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + v1_chunked = self.mlp.v1.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + w2_chunked = self.mlp.w2.view(self.mlp.moe_num_experts, self.mlp.ffn_hidden_size, self.mlp.hidden_size).chunk( + self.moe_num_experts, dim=0 + ) + w1_chunked = [w1.squeeze(dim=0) for w1 in w1_chunked] + v1_chunked = [v1.squeeze(dim=0) for v1 in v1_chunked] + w2_chunked = [w2.squeeze(dim=0) for w2 in w2_chunked] + for expert_idx in range(0, self.moe_num_experts): + topk_idx, token_idx = torch.where(expert_mask[expert_idx]) + + # Difference with original: removal + # if token_idx.shape[0] == 0: + # continue + # loop interruption depends on input data and may affect torchscript tracing + + token_list = token_idx + topk_list = topk_idx + + expert_tokens = x[None, token_list].reshape(-1, hidden_size) + expert_out = ( + self.mlp(expert_tokens, w1_chunked[expert_idx], v1_chunked[expert_idx], w2_chunked[expert_idx]) + * top_weights[token_list, topk_list, None] + ) + + out.index_add_(0, token_idx, expert_out) + + out = out.reshape(bsz, q_len, hidden_size) + return out + + +# adapted from 
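For illustration only (not part of the patch): `_baichuan7b_attn_forward` above routes the common path through `F.scaled_dot_product_attention` with an explicit `1/sqrt(head_dim)` scale and keeps the eager softmax path only for `output_attentions=True`. A small check that the two paths agree on random inputs (assumes PyTorch >= 2.1 for the `scale` argument; shapes are arbitrary):

import math

import torch
import torch.nn.functional as F

q, k, v = (torch.randn(1, 2, 4, 8) for _ in range(3))  # (batch, heads, seq_len, head_dim)
scale = 1 / math.sqrt(q.shape[-1])

sdpa = F.scaled_dot_product_attention(q, k, v, scale=scale)
eager = torch.softmax((q @ k.transpose(-2, -1)) * scale, dim=-1) @ v
assert torch.allclose(sdpa, eager, atol=1e-6)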
https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L1228 +def _dbrx_update_causal_mask_legacy( + self, attention_mask: Optional[torch.Tensor], input_tensor: torch.Tensor, cache_position: torch.Tensor +) -> Optional[torch.Tensor]: + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + sequence_length = input_tensor.shape[1] + if hasattr(self.blocks[0].norm_attn_norm.attn, "past_key_value"): # static cache + target_length = self.config.max_position_embeddings + else: # dynamic cache + target_length = ( + attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + ) + # difference with original modeling + # removed target_length = int(target_length). + # Casting to int leads to constant folding during tracing that makes impossible to use model for sequence of different length + causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + elif attention_mask.dim() == 4: + # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with + # cache. In that case, the 4D attention mask attends to the newest tokens only. + if attention_mask.shape[-2] < cache_position[0] + sequence_length: + offset = cache_position[0] + else: + offset = 0 + mask_shape = attention_mask.shape + mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + ): + # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). + is_tracing = ( + torch.jit.is_tracing() + or isinstance(input_tensor, torch.fx.Proxy) + or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()) + ) + if not is_tracing and torch.any(attention_mask != 1): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
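A minimal illustration, not from the patch: the causal-mask builders in this series (the `_dbrx_update_causal_mask_*` functions here, and the llama/gemma equivalents in a later patch) fill the mask with `torch.finfo(torch.float16).min` rather than the float32 minimum. As the in-code comment notes, the float32 minimum overflows once the model runs in half precision:

import torch

f32_min = torch.finfo(torch.float32).min
print(f32_min)                                     # about -3.4e38
print(torch.tensor(f32_min, dtype=torch.float16))  # -inf: overflows the float16 range
print(torch.finfo(torch.float16).min)              # -65504.0: still a finite, usable mask value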
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# adopted from https://github.com/huggingface/transformers/blob/1b3dba9417eebe16b7c206d1dfca6a4c7f11dbec/src/transformers/models/dbrx/modeling_dbrx.py#L1204 +def _dbrx_update_causal_mask_latest( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, +): + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + # difference with original modeling + causal_mask = ( + torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + 
mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +if is_transformers_version(">", "4.40.2"): + _dbrx_update_causal_mask = _dbrx_update_causal_mask_latest +else: + _dbrx_update_causal_mask = _dbrx_update_causal_mask_legacy + + +class DBRXModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + # dbrx has some accuracy issues with bf16 with transformers >= 4.40 + # fill causal mask in slightly different way for avoid overflow on some platforms + self._model.transformer._orig_update_causal_mask = self._model.transformer._update_causal_mask + self._model.transformer._update_causal_mask = types.MethodType( + _dbrx_update_causal_mask, self._model.transformer + ) + + for block in self._model.transformer.blocks: + rotary_emb = block.norm_attn_norm.attn.rotary_emb + # initialize inv_freq for torchscript tracing + if rotary_emb.inv_freq is None: + inv_freq = 1.0 / ( + rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim) + ) + rotary_emb.inv_freq = inv_freq + # remove continue-operator from iteration loop over experts + block.ffn.experts._orig_forward = block.ffn.experts.forward + block.ffn.experts.forward = types.MethodType(_dbrx_experts_forward, block.ffn.experts) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.transformer._update_causal_mask = self._model.transformer._orig_update_causal_mask + for block in self._model.transformer.blocks: + block.ffn.experts.forward = block.ffn.experts._orig_forward diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 692720a97..cb5ac52ed 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -527,6 +527,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "bloom", "chatglm", "codegen", + "codegen2", # "data2vec-text", # TODO : enable when enabled in exporters "gemma", "gpt2", @@ -561,6 +562,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "aquila2", "xverse", "internlm", + "dbrx", + "qwen2-moe", ) GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( @@ -577,6 +580,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "aquila2", "xverse", "internlm", + "codegen2", ) @parameterized.expand(SUPPORTED_ARCHITECTURES) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index d4364d192..91500cfc6 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -37,9 +37,11 @@ "cohere": "hf-internal-testing/tiny-random-CohereForCausalLM", "chatglm": "katuni4ka/tiny-random-chatglm2", "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "codegen2": "katuni4ka/tiny-random-codegen2", "data2vec_text": 
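A toy aside, not part of the patch: `DBRXModelPatcher` above swaps in `_dbrx_experts_forward`, whose key change (per its in-code comment) is dropping the data-dependent `continue` so torchscript tracing sees the same control flow for every input. The expert bookkeeping it keeps is small enough to show on a toy routing example (all values hypothetical):

import torch

top_experts = torch.tensor([[0, 2], [1, 2], [0, 1]])  # 3 tokens, top_k=2, routed over 3 experts
expert_mask = torch.nn.functional.one_hot(top_experts, num_classes=3).permute(2, 1, 0)

for expert_idx in range(3):
    topk_idx, token_idx = torch.where(expert_mask[expert_idx])
    # token_idx: tokens served by this expert; topk_idx: which of their top-k slots it fills
    print(expert_idx, token_idx.tolist(), topk_idx.tolist())
# 0 [0, 2] [0, 0]
# 1 [1, 2] [0, 1]
# 2 [0, 1] [1, 1]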
"hf-internal-testing/tiny-random-Data2VecTextModel", "data2vec_vision": "hf-internal-testing/tiny-random-Data2VecVisionModel", "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", + "dbrx": "katuni4ka/tiny-random-dbrx", "deberta": "hf-internal-testing/tiny-random-deberta", "deberta_v2": "hf-internal-testing/tiny-random-DebertaV2Model", "deit": "hf-internal-testing/tiny-random-deit", @@ -91,6 +93,7 @@ "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen": "katuni4ka/tiny-random-qwen", "qwen2": "Qwen/Qwen1.5-0.5B", + "qwen2-moe": "katuni4ka/tiny-random-qwen1.5-moe", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer", From 1319d7bec80622abdb39b7d0307df6e453e4e903 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 23 May 2024 10:32:53 +0200 Subject: [PATCH 34/47] Fix llama and gemma modeling patching for openvino export (#714) * Fix compatibility for transformers v4.41.0 llama and gemma modeling patching * fix for dev transformers version * update setup --- optimum/exporters/openvino/model_patcher.py | 104 +++++++++++++++++++- optimum/intel/openvino/trainer.py | 6 +- 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 93a843052..0265b3a5f 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -301,7 +301,7 @@ def __exit__(self, exc_type, exc_value, traceback): # adopted from # https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/gemma/modeling_gemma.py#L965 # https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/llama/modeling_llama.py#L1058 -def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_position, past_seen_tokens=None): +def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, cache_position, past_seen_tokens=None): from transformers.modeling_attn_mask_utils import AttentionMaskConverter if self.config._attn_implementation == "sdpa" and past_seen_tokens is not None: @@ -314,10 +314,12 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling # using minimum from dtype with larger bandwith (floa32) may lead to overflow # during execution on platforms with default lower precision (bfloat16, float16) min_dtype = torch.finfo(torch.float16).min sequence_length = input_tensor.shape[1] + # difference with original modeling if hasattr(getattr(self.layers[0], "self_attn", {}), "past_key_value"): # static cache target_length = self.config.max_position_embeddings else: # dynamic cache @@ -329,7 +331,9 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + # difference with original modeling causal_mask = torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + if sequence_length != 1: causal_mask = torch.triu(causal_mask, diagonal=1) causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) @@ -366,6 +370,104 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po return causal_mask +# 
adopted from https://github.com/huggingface/transformers/blob/f4014e75db0190792b3feeccfc5dc5b5f9f0ce7b/src/transformers/models/llama/modeling_llama.py#L1036 +def _llama_gemma_update_causal_mask_latest( + self, + attention_mask, + input_tensor, + cache_position, + past_key_values, + output_attentions, +): + from transformers.cache_utils import StaticCache + from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + # difference with original modeling + # using minimum from dtype with larger bandwith (floa32) may lead to overflow + # during execution on platforms with default lower precision (bfloat16, float16) + min_dtype = torch.finfo(torch.float16).min + + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + # difference with original modeling + causal_mask = ( + torch.full((sequence_length, target_length), fill_value=1, dtype=dtype, device=device) * min_dtype + ) + + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + 
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# TODO : deprecate _llama_gemma_update_causal_mask_legacy when transformers>=4.41.0 +if is_transformers_version(">", "4.40.2"): + _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_latest +else: + _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_legacy + + class GemmaModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 0745a1cd7..c8b29800f 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -906,7 +906,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): output_path = os.path.join(output_dir, OV_XML_FILE_NAME) self.compression_controller.prepare_for_export() model_type = self.model.config.model_type.replace("_", "-") - onnx_config_class = TasksManager.get_exporter_config_constructor( + exporter_config_class = TasksManager.get_exporter_config_constructor( exporter="onnx", model=self.model, task=self.task, @@ -914,9 +914,9 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): ) if self.task == "text-generation": - onnx_config = onnx_config_class(self.model.config, use_past=self.model.config.use_cache) + onnx_config = exporter_config_class(self.model.config, use_past=self.model.config.use_cache) else: - onnx_config = onnx_config_class(self.model.config) + onnx_config = exporter_config_class(self.model.config) num_parameters = self.model.num_parameters() save_as_external_data = use_external_data_format(num_parameters) or self.ov_config.save_onnx_model From e22b2fdb1e709ff20d752cb58a3fe0a891ef924e Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 24 May 2024 17:59:38 +0200 Subject: [PATCH 35/47] Fix nncf quantization for decoder models (#727) * Fix nncf quantization for decoder models * add test * update op quant op * remove deprecated warning * update expected quantized * enable stateful * style --- optimum/intel/openvino/modeling_decoder.py | 5 +++-- optimum/intel/openvino/quantization.py | 9 ++++----- tests/openvino/test_quantization.py | 18 ++++++++++-------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 933d92a50..72cd1b648 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -42,7 +42,7 @@ from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .configuration import _DEFAULT_4BIT_CONFIGS, OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel -from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE +from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, STR_TO_OV_TYPE if 
TYPE_CHECKING: @@ -409,6 +409,7 @@ def prepare_inputs( elif self.use_cache: for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) + dtype = OV_TO_NP_TYPE[model_inputs.get_element_type().get_type_name()] shape = model_inputs.get_partial_shape() if self.config.model_type == "chatglm": shape[0] = 0 @@ -419,7 +420,7 @@ def prepare_inputs( shape[2] = 0 else: shape[1] = 0 - inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) + inputs[input_name] = np.empty([dim.get_length() for dim in shape], dtype=dtype) else: # past_key_values are not used explicitly, instead they are handled inside the model if past_key_values is None: diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 17305b947..43cf1dd93 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -347,7 +347,6 @@ def _quantize_ovbasemodel( remove_unused_columns=remove_unused_columns, data_collator=data_collator, ) - if self.model.export_feature == "text-generation" and self.model.use_cache: calibration_dataset = self._prepare_text_generation_dataset( quantization_config, calibration_dataloader @@ -430,6 +429,7 @@ def _quantize_ovbasemodel( ), **kwargs, ) + self.model.model = quantized_model if save_directory is not None: self.model.save_pretrained(save_directory) @@ -696,8 +696,6 @@ def _prepare_builtin_dataset(self, quantization_config: OVWeightQuantizationConf def _prepare_text_generation_dataset( self, quantization_config: OVQuantizationConfig, calibration_dataloader: OVDataLoader ) -> nncf.Dataset: - # TODO: this function is not covered by tests, remove if not relevant anymore or cover by tests otherwise - # Prefetch past_key_values self.model.update_pkv_precision(True) self.model.compile() @@ -705,15 +703,16 @@ def _prepare_text_generation_dataset( num_samples = quantization_config.num_samples or 200 - self.model.request = InferRequestWrapper(self.model.model.request, collected_inputs) + self.model.request = InferRequestWrapper(self.model.request, collected_inputs) try: for data in calibration_dataloader: self.model.generate(**data, max_new_tokens=1) if len(collected_inputs) >= num_samples: break finally: - self.model.model.request = self.model.model.request.request + self.model.request = self.model.request.request calibration_dataset = nncf.Dataset(collected_inputs) + return calibration_dataset def _prepare_unet_dataset( diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 98eb121d7..09b395ea1 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -73,12 +73,16 @@ class OVQuantizerTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( + SUPPORTED_ARCHITECTURES_TORCH_MODEL = ( (OVModelForSequenceClassification, "bert", 32, 35), - # (OVModelForCausalLM, "gpt2", 41, 23), + (OVModelForCausalLM, "gpt2", 41, 3), + ) + SUPPORTED_ARCHITECTURES_OV_MODEL = ( + (OVModelForSequenceClassification, "bert", 32, 35), + (OVModelForCausalLM, "gpt2", 31, 22), ) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) + @parameterized.expand(SUPPORTED_ARCHITECTURES_TORCH_MODEL) def test_automodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): model_id = MODEL_NAMES[model_name] task = model_cls.export_feature @@ -123,23 +127,21 @@ def preprocess_function(examples, tokenizer): loaded_config = OVConfig.from_pretrained(tmp_dir) 
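A brief sketch, not from the patch: the `prepare_inputs` hunk above builds the first-step past-key-value placeholders with `np.empty` and the `OV_TO_NP_TYPE` mapping instead of an openvino `Tensor`. The essential property is a zero-sized sequence axis, so no past data is actually allocated (the layout and dtype below are hypothetical):

import numpy as np

# e.g. (batch_size, num_heads, past_sequence_length, head_dim) with no past yet
placeholder = np.empty((1, 12, 0, 64), dtype=np.float32)
print(placeholder.shape, placeholder.size)  # (1, 12, 0, 64) 0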
self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict()) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) + @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL) def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): model_id = MODEL_NAMES[model_name] task = model_cls.export_feature dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task] - if "gpt2" in model_id: - expected_int8 -= 1 def preprocess_function(examples, tokenizer): return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True) with tempfile.TemporaryDirectory() as tmp_dir: - transformers_model = model_cls.from_pretrained(model_id, export=True) + ov_model = model_cls.from_pretrained(model_id, export=True) tokenizer = AutoTokenizer.from_pretrained(model_id) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) + quantizer = OVQuantizer.from_pretrained(ov_model, task=task) calibration_dataset = quantizer.get_calibration_dataset( dataset_name, From 7b4e50f15f2facf08b52f710d1f6b56b6065b7f8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 27 May 2024 14:03:53 +0200 Subject: [PATCH 36/47] Limit ITREX version for WOQ (#729) * remove latest ITREX release compatibility * update workflow --- .github/workflows/test_inc.yml | 2 +- optimum/intel/neural_compressor/quantization.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index 6435d0b71..e3a7518a6 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -34,7 +34,7 @@ jobs: pip install py-cpuinfo pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] - pip install intel-extension-for-transformers + pip install intel-extension-for-transformers==1.4.1 pip install peft - name: Test with Pytest diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 9ee436593..57bc3ae7a 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -79,12 +79,15 @@ ) +_ITREX_EXCLUDED_VERSION = "1.4.2" + if is_itrex_available(): if is_itrex_version("<", ITREX_MINIMUM_VERSION): raise ImportError( f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_itrex_version}, " f"but only version {ITREX_MINIMUM_VERSION} or higher is supported." ) + from intel_extension_for_transformers.transformers.llm.quantization.utils import convert_to_quantized_model from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit from intel_extension_for_transformers.transformers.utils.config import ( @@ -226,6 +229,12 @@ def quantize( # ITREX Weight Only Quantization if not isinstance(quantization_config, PostTrainingQuantConfig): + if is_itrex_version("==", _ITREX_EXCLUDED_VERSION): + raise ImportError( + f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_itrex_version}, " + f"but {_ITREX_EXCLUDED_VERSION} is not compatible." 
+ ) + # check neural-compressor version if is_neural_compressor_version("<", NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION): raise ImportError( From bfd0767c53b2567810e676fd7e4228fb37f984e5 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 28 May 2024 17:34:59 +0200 Subject: [PATCH 37/47] Fix itrex WOQ model loading (#730) * Fix loading ITREX model * add test * fix loading WOQ and quantization config * add test * add revision and subfolder parameters when loading inc config * style * update test model id --- .../intel/neural_compressor/modeling_base.py | 115 ++++++++++++------ optimum/intel/neural_compressor/utils.py | 1 + tests/neural_compressor/test_modeling.py | 63 +++++++++- tests/neural_compressor/test_optimization.py | 1 - tests/neural_compressor/utils_tests.py | 2 +- 5 files changed, 143 insertions(+), 39 deletions(-) diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index c6d5e7bac..bb3d2fe8c 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -22,6 +22,7 @@ import torch from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import EntryNotFoundError from neural_compressor.utils.pytorch import load from transformers import ( AutoConfig, @@ -40,6 +41,7 @@ ) from transformers.modeling_utils import no_init_weights from transformers.models.auto.auto_factory import _get_model_class +from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME from transformers.utils.generic import ContextManagers from optimum.intel.generation import BaseModelForCausalLM @@ -47,7 +49,7 @@ from ...modeling_base import OptimizedModel from ..utils.import_utils import _torch_version, is_itrex_available, is_torch_version from .configuration import INCConfig -from .utils import WEIGHTS_NAME +from .utils import QUANTIZATION_CONFIG_NAME logger = logging.getLogger(__name__) @@ -119,33 +121,70 @@ def _from_pretrained( raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") token = use_auth_token - model_name_or_path = kwargs.pop("model_name_or_path", None) - if model_name_or_path is not None: - logger.warning("`model_name_or_path` is deprecated please use `model_id`") - model_id = model_id or model_name_or_path - model_path = Path(model_id) - - if model_path.is_dir(): - model_cache_path = model_path / file_name + is_local = model_path.is_dir() + model_cache_path = None + inc_config = None + msg = None + if is_local: + if (model_path / subfolder / SAFE_WEIGHTS_NAME).is_file(): + file_name = SAFE_WEIGHTS_NAME + elif not (model_path / subfolder / file_name).is_file(): + raise EnvironmentError( + f"Error no file named {SAFE_WEIGHTS_NAME} or {file_name} found in directory {model_path / subfolder}" + ) + model_cache_path = model_path / subfolder / file_name else: - model_cache_path = hf_hub_download( - repo_id=model_id, - filename=file_name, - subfolder=subfolder, - token=token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) + # Try download safetensors if exist + try: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=SAFE_WEIGHTS_NAME, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + except EntryNotFoundError: + pass + + if 
model_cache_path is None: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=file_name, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) model_save_dir = Path(model_cache_path).parent - inc_config = None - msg = None + if is_itrex_available(): - try: - quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json") + quantization_config_path = None + if is_local: + quantization_config_path = model_path / subfolder / QUANTIZATION_CONFIG_NAME + else: + try: + quantization_config_path = hf_hub_download( + repo_id=model_id, + filename=QUANTIZATION_CONFIG_NAME, + subfolder=subfolder, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + except EntryNotFoundError: + pass + + if quantization_config_path and Path(quantization_config_path).is_file(): + quantization_config = PretrainedConfig.from_pretrained(quantization_config_path) algorithm = getattr(quantization_config, "quant_method", None) if algorithm in {"rtn", "gptq", "awq", "autoround"}: from intel_extension_for_transformers.transformers.modeling.modeling_auto import ( @@ -154,7 +193,7 @@ def _from_pretrained( _BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class - return _BaseQBitsAutoModelClass.from_pretrained( + model = _BaseQBitsAutoModelClass.from_pretrained( pretrained_model_name_or_path=model_id, token=token, revision=revision, @@ -163,12 +202,16 @@ def _from_pretrained( local_files_only=local_files_only, subfolder=subfolder, trust_remote_code=trust_remote_code, + use_neural_speed=False, **kwargs, ) - except EnvironmentError: - msg = "The model is not quantized with weight-only quantization." + + return cls( + model, config=config, model_save_dir=model_save_dir, q_config=quantization_config, **kwargs + ) + try: - inc_config = INCConfig.from_pretrained(model_id) + inc_config = INCConfig.from_pretrained(model_id, subfolder=subfolder, revision=revision) if not is_torch_version("==", inc_config.torch_version): msg = f"Quantized model was obtained with torch version {inc_config.torch_version} but {_torch_version} was found." 
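Stripped to a sketch (not part of the patch): the checkpoint lookup added to `_from_pretrained` above prefers safetensors and only falls back to the torch pickle file when the Hub reports `EntryNotFoundError`. Without the subfolder/revision/caching plumbing, the order amounts to the following; the `resolve_checkpoint` helper and the repo id are hypothetical:

from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError
from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME

def resolve_checkpoint(repo_id: str) -> str:
    try:
        # prefer model.safetensors when the repository provides it
        return hf_hub_download(repo_id=repo_id, filename=SAFE_WEIGHTS_NAME)
    except EntryNotFoundError:
        # otherwise fall back to pytorch_model.bin, as the patch does via `file_name`
        return hf_hub_download(repo_id=repo_id, filename=WEIGHTS_NAME)

# path = resolve_checkpoint("some-org/some-quantized-model")  # hypothetical repo id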
logger.warning(f"{msg}") @@ -209,15 +252,19 @@ def _from_pretrained( ) def _save_pretrained(self, save_directory: Union[str, Path]): - output_path = os.path.join(save_directory, WEIGHTS_NAME) - if isinstance(self.model, torch.nn.Module): - state_dict = self.model.state_dict() - if self._q_config: - state_dict["best_configure"] = self._q_config - torch.save(state_dict, output_path) + # For ITREX model + if isinstance(self._q_config, PretrainedConfig): + self._q_config.to_json_file(os.path.join(save_directory, QUANTIZATION_CONFIG_NAME)) + self.model.save_pretrained(save_directory) + # For INC model the state dictionary needs to be modified to include the quantization parameters + else: + state_dict = self.model.state_dict() + if isinstance(self._q_config, dict): + state_dict["best_configure"] = self._q_config + torch.save(state_dict, os.path.join(save_directory, WEIGHTS_NAME)) else: - torch.jit.save(self.model, output_path) + torch.jit.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) if self.inc_config: self.inc_config.save_pretrained(save_directory) diff --git a/optimum/intel/neural_compressor/utils.py b/optimum/intel/neural_compressor/utils.py index 3173f5e1c..84c1d6dc2 100644 --- a/optimum/intel/neural_compressor/utils.py +++ b/optimum/intel/neural_compressor/utils.py @@ -28,6 +28,7 @@ CONFIG_NAME = "best_configure.yaml" +QUANTIZATION_CONFIG_NAME = "quantize_config.json" NEURAL_COMPRESSOR_MINIMUM_VERSION = "2.1.0" NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION = "2.3.0" diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py index e6ce4763f..0c3e60969 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -16,10 +16,12 @@ import os import tempfile import unittest +from pathlib import Path import torch from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed +from transformers.utils import SAFE_WEIGHTS_NAME from optimum.exporters import TasksManager from optimum.intel import ( # noqa @@ -37,7 +39,8 @@ INCStableDiffusionPipeline, INCTrainer, ) -from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME +from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, QUANTIZATION_CONFIG_NAME, WEIGHTS_NAME +from optimum.intel.utils.import_utils import is_itrex_available os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -52,7 +55,7 @@ MODEL_NAMES_TO_TASK = ( - ("hf-internal-testing/tiny-random-gpt2", "text-generation"), + ("hf-internal-testing/tiny-random-GPT2LMHeadModel", "text-generation"), ("hf-internal-testing/tiny-random-BertForMaskedLM", "fill-mask"), ("hf-internal-testing/tiny-random-DistilBertForSequenceClassification", "text-classification"), ("hf-internal-testing/tiny-random-DebertaV2Model", "feature-extraction"), @@ -86,7 +89,7 @@ def test_compare_to_transformers(self, model_id, task): outputs = inc_model(**model_inputs) with tempfile.TemporaryDirectory() as tmpdirname: inc_model.save_pretrained(tmpdirname) - loaded_model = model_class.from_pretrained(tmpdirname, file_name=WEIGHTS_NAME) + loaded_model = model_class.from_pretrained(tmpdirname) outputs_loaded = loaded_model(**model_inputs) if task == "feature-extraction": @@ -143,3 +146,57 @@ def test_compare_with_and_without_past_key_values(self): self.assertEqual(outputs_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_without_pkv.shape[1], self.GENERATION_LENGTH) self.assertTrue(torch.equal(outputs_with_pkv, outputs_without_pkv)) + + 
@unittest.skipIf(not is_itrex_available(), reason="ITREX not available") + def test_saving_loading_woq_itrex_model(self): + model_name = "echarlaix/tiny-random-PhiForCausalLM" + subfolder = "itrex" + model = INCModelForCausalLM.from_pretrained(model_name, revision="itrex", subfolder=subfolder) + tokenizer = AutoTokenizer.from_pretrained(model_name, revision="itrex") + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + tokens = tokenizer("This is a sample output", return_tensors="pt") + + with tempfile.TemporaryDirectory() as tmp_dir: + model_save_dir = Path(tmp_dir) / subfolder + model.save_pretrained(model_save_dir) + folder_contents = os.listdir(model_save_dir) + self.assertIn(SAFE_WEIGHTS_NAME, folder_contents) + self.assertIn(QUANTIZATION_CONFIG_NAME, folder_contents) + loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir, subfolder=subfolder) + + with torch.no_grad(): + outputs = model(**tokens) + loaded_outputs = loaded_model(**tokens) + + self.assertTrue("logits" in loaded_outputs) + self.assertIsInstance(loaded_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in loaded_outputs) + self.assertIsInstance(loaded_outputs.past_key_values, tuple) + self.assertTrue(torch.allclose(outputs.logits, loaded_outputs.logits, atol=1e-5)) + + def test_saving_loading_inc_model(self): + model_name = "echarlaix/tiny-random-PhiForCausalLM" + subfolder = "inc" + model = INCModelForCausalLM.from_pretrained(model_name, revision="inc", subfolder=subfolder) + tokenizer = AutoTokenizer.from_pretrained(model_name, revision="inc") + tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + tokens = tokenizer("This is a sample output", return_tensors="pt") + + with tempfile.TemporaryDirectory() as tmp_dir: + model_save_dir = Path(tmp_dir) / subfolder + model.save_pretrained(model_save_dir) + folder_contents = os.listdir(model_save_dir) + self.assertIn(WEIGHTS_NAME, folder_contents) + self.assertIn("inc_config.json", folder_contents) + loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir, subfolder=subfolder) + self.assertIsInstance(loaded_model.inc_config, INCConfig) + + with torch.no_grad(): + outputs = model(**tokens) + loaded_outputs = loaded_model(**tokens) + + self.assertTrue("logits" in loaded_outputs) + self.assertIsInstance(loaded_outputs.logits, torch.Tensor) + self.assertTrue("past_key_values" in loaded_outputs) + self.assertIsInstance(loaded_outputs.past_key_values, tuple) + self.assertTrue(torch.allclose(outputs.logits, loaded_outputs.logits, atol=1e-5)) diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index da4258613..56f2a5bac 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -47,7 +47,6 @@ from utils_tests import MODEL_NAMES, SEED, INCTestMixin, _generate_dataset from optimum.intel.utils.import_utils import is_torch_version, is_itrex_available - from optimum.intel import ( INCConfig, INCModelForCausalLM, diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py index a6d09954f..210623758 100644 --- a/tests/neural_compressor/utils_tests.py +++ b/tests/neural_compressor/utils_tests.py @@ -81,7 +81,7 @@ "electra": "hf-internal-testing/tiny-random-electra", "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", - "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt2": "hf-internal-testing/tiny-random-GPT2LMHeadModel", "gpt_neo": 
"hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", From aefabf0b443ef485c6cb8e1e8a51b8c62625739d Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 29 May 2024 10:23:48 +0200 Subject: [PATCH 38/47] Enable ITREX v1.4.2 for torch 2.3.0+cpu (#733) * Enable ITREX v1.4.2 for specific torch version * fix * fix style * update itrex version * fix * fix warning --- .github/workflows/test_inc.yml | 5 ++--- optimum/intel/neural_compressor/quantization.py | 11 ++++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index e3a7518a6..81d102bc0 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -32,9 +32,9 @@ jobs: python -m pip install --upgrade pip pip install cmake pip install py-cpuinfo - pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu pip install .[neural-compressor,diffusers,tests] - pip install intel-extension-for-transformers==1.4.1 + pip install intel-extension-for-transformers pip install peft - name: Test with Pytest @@ -43,7 +43,6 @@ jobs: - name: Test IPEX run: | pip uninstall -y intel-extension-for-transformers - pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --extra-index-url https://download.pytorch.org/whl/cpu pip install intel-extension-for-pytorch==2.3.0 pytest tests/neural_compressor/test_ipex.py diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 57bc3ae7a..500478712 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -29,6 +29,7 @@ from neural_compressor.model.onnx_model import ONNXModel from neural_compressor.model.torch_model import IPEXModel, PyTorchModel from neural_compressor.quantization import fit +from packaging.version import parse from torch.utils.data import DataLoader, RandomSampler from transformers import ( DataCollator, @@ -79,8 +80,6 @@ ) -_ITREX_EXCLUDED_VERSION = "1.4.2" - if is_itrex_available(): if is_itrex_version("<", ITREX_MINIMUM_VERSION): raise ImportError( @@ -229,10 +228,12 @@ def quantize( # ITREX Weight Only Quantization if not isinstance(quantization_config, PostTrainingQuantConfig): - if is_itrex_version("==", _ITREX_EXCLUDED_VERSION): + if is_itrex_version("==", "1.4.2") and ( + is_torch_version("!=", "2.3.0") or parse(_torch_version).local != "cpu" + ): raise ImportError( - f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_itrex_version}, " - f"but {_ITREX_EXCLUDED_VERSION} is not compatible." + f"Found an incompatible version of `intel-extension-for-transformers` and `torch`. Found version itrex {_itrex_version} and torch {_torch_version}, " + f"but only torch 2.3.0+cpu is compatible with ITREX v1.4.2." 
) # check neural-compressor version From ca05db034cfb961cbf6efa8137b2c4a7749911db Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 29 May 2024 16:32:57 +0200 Subject: [PATCH 39/47] Bump transformers version (#724) * Bump transformers version * fix default ignored scope for model using sdpa by default * fix quant ops test * update setup * add ops quant num gpt2 * fix expected ops quant in test * update optimum version --- .../configs/swin-base-jpqd.json | 2 - .../configs/bert-base-jpqd.json | 2 - .../configs/bert-base-jpqd.json | 2 - optimum/intel/openvino/trainer.py | 2 - setup.py | 4 +- tests/openvino/test_modeling.py | 2 +- tests/openvino/test_quantization.py | 4 +- tests/openvino/test_training.py | 38 ++++++++++--------- 8 files changed, 25 insertions(+), 31 deletions(-) diff --git a/examples/openvino/image-classification/configs/swin-base-jpqd.json b/examples/openvino/image-classification/configs/swin-base-jpqd.json index 3f03c276a..23b2fd3d8 100644 --- a/examples/openvino/image-classification/configs/swin-base-jpqd.json +++ b/examples/openvino/image-classification/configs/swin-base-jpqd.json @@ -36,8 +36,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/examples/openvino/question-answering/configs/bert-base-jpqd.json b/examples/openvino/question-answering/configs/bert-base-jpqd.json index 425bd9f31..342d327a3 100644 --- a/examples/openvino/question-answering/configs/bert-base-jpqd.json +++ b/examples/openvino/question-answering/configs/bert-base-jpqd.json @@ -36,8 +36,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/examples/openvino/text-classification/configs/bert-base-jpqd.json b/examples/openvino/text-classification/configs/bert-base-jpqd.json index 25c8f2886..d177e4efd 100644 --- a/examples/openvino/text-classification/configs/bert-base-jpqd.json +++ b/examples/openvino/text-classification/configs/bert-base-jpqd.json @@ -40,8 +40,6 @@ "ignored_scopes": [ "{re}.*__add___[0-1]", "{re}.*layer_norm_0", - "{re}.*matmul_1", - "{re}.*__truediv__*" ] } ] diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index c8b29800f..0a1f5209a 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -153,8 +153,6 @@ "{re}.*Embedding.*", "{re}.*add___.*", "{re}.*layer_norm_.*", - "{re}.*matmul_1", - "{re}.*__truediv__.*", ], } diff --git a/setup.py b/setup.py index 251ec61cd..d00ce1dd9 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "transformers>=4.36.0,<4.41.0", - "optimum~=1.19", + "transformers>=4.36.0,<4.42.0", + "optimum~=1.20", "datasets>=1.4.0", "sentencepiece", "scipy", diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index cb5ac52ed..1e18fb066 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1679,7 +1679,7 @@ def test_compare_output_attentions(self, model_arch): preprocessor = AutoFeatureExtractor.from_pretrained(model_id) inputs = preprocessor(images=image, return_tensors="pt") - transformers_model = AutoModelForImageClassification.from_pretrained(model_id) + transformers_model = AutoModelForImageClassification.from_pretrained(model_id, attn_implementation="eager") transformers_model.eval() with torch.no_grad(): transformers_outputs = transformers_model(**inputs, output_attentions=True) diff --git 
a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 09b395ea1..b7ed36d3e 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -74,7 +74,7 @@ class OVQuantizerTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_TORCH_MODEL = ( - (OVModelForSequenceClassification, "bert", 32, 35), + (OVModelForSequenceClassification, "bert", 22, 35), (OVModelForCausalLM, "gpt2", 41, 3), ) SUPPORTED_ARCHITECTURES_OV_MODEL = ( @@ -665,7 +665,7 @@ def preprocess_function(examples, tokenizer): class OVTrainerTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 49, 38),) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (("distilbert-base-uncased", 67, 38),) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_aware_training_quantization(self, model_name, expected_fake_quantize, expected_int8): diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index c998d00d8..89d644319 100644 --- a/tests/openvino/test_training.py +++ b/tests/openvino/test_training.py @@ -322,7 +322,7 @@ def tearDown(self): "default_quantization": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss"], ), @@ -330,14 +330,14 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], ), "customized_quantization": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss"], ), @@ -345,7 +345,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG, - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], ), @@ -365,7 +365,7 @@ def tearDown(self): "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -376,7 +376,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -385,7 +385,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ 
-397,7 +397,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -418,7 +418,7 @@ def tearDown(self): "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -429,7 +429,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -438,7 +438,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -450,7 +450,7 @@ def tearDown(self): CUSTOMIZED_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT, ], - expected_fake_quantize=34, + expected_fake_quantize=22, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -553,7 +553,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, compression_metrics=["compression_loss"], ), @@ -572,7 +572,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss"], @@ -580,7 +580,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor( model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss"], @@ -589,7 +589,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): model_id="yujiepan/tiny-random-swin-patch4-window7-224", teacher_model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -598,7 +598,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel): 
model_id="yujiepan/tiny-random-swin-patch4-window7-224", teacher_model_id="yujiepan/tiny-random-swin-patch4-window7-224", nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG], - expected_fake_quantize=28, + expected_fake_quantize=36, expected_int8=28, expected_binary_masks=48, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -797,7 +797,9 @@ def prepare_model_and_dataset(self, desc: OVTrainerTestDescriptor): self.feature_extractor = AutoFeatureExtractor.from_pretrained(desc.model_id) self.tokenizer = self.feature_extractor - self.model = AutoModelForAudioClassification.from_pretrained(desc.model_id, num_labels=self.num_labels) + self.model = AutoModelForAudioClassification.from_pretrained( + desc.model_id, num_labels=self.num_labels, attn_implementation="eager" + ) self.teacher_model = None if desc.teacher_model_id: self.teacher_model = AutoModelForAudioClassification.from_pretrained( From 70c1475d24a1ff85838e37b1b5703a08917fe267 Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Wed, 29 May 2024 16:37:53 +0200 Subject: [PATCH 40/47] Update default 4bit configs (#702) * Update configuration.py * Update configuration.py * fix style * fix style * Fix stabilityai/stablelm-zephyr-3b id Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/openvino/configuration.py | 28 +++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 30dfe5ae6..eb233f3d1 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -33,18 +33,25 @@ logger = logging.getLogger(__name__) _DEFAULT_4BIT_CONFIGS = { - "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5}, + "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64}, "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8}, "bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6}, "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128}, - "HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6}, - "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6}, + "HuggingFaceH4/zephyr-7b-beta": { + "bits": 4, + "sym": True, + "group_size": 128, + "ratio": 0.8, + "dataset": "wikitext2", + "awq": True, + }, + "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7}, "meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, "meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, "stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, "stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8}, - "stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8}, + "stabilityai/stablelm-zephyr-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 1.0}, "pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72}, "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, 
"ratio": 0.6}, @@ -52,6 +59,19 @@ "tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, "psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, "mistralai/Mixtral-8x7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, + "facebook/opt-2.7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7}, + "togethercomputer/RedPajama-INCITE-Chat-3B-v1": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "lmsys/vicuna-7b-v1.5": {"bits": 4, "sym": False, "group_size": 128, "ratio": 1.0}, + "stabilityai/stablelm-tuned-alpha-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "mistralai/Mistral-7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.9}, + "baichuan-inc/Baichuan2-7B-Chat": { + "bits": 4, + "sym": True, + "group_size": 128, + "ratio": 0.8, + "dataset": "wikitext2", + "awq": True, + }, } From ebc2d3a268c321ab20c3316271b2a11cbdf9bacb Mon Sep 17 00:00:00 2001 From: rbrugaro Date: Thu, 30 May 2024 06:11:31 -0700 Subject: [PATCH 41/47] fix accelerator default to ipex not ort (#737) --- optimum/intel/pipelines/pipeline_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 65e6cfb78..a7dba9310 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -150,7 +150,7 @@ def pipeline( feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, - accelerator: Optional[str] = "ort", + accelerator: Optional[str] = "ipex", revision: Optional[str] = None, trust_remote_code: Optional[bool] = None, torch_dtype: Optional[Union[str, torch.dtype]] = None, From 5dfbcbc47eb301e758000018fc5d5bb7446e18c6 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 30 May 2024 18:29:17 +0200 Subject: [PATCH 42/47] Remove default pipeline accelerator (#739) --- optimum/intel/pipelines/pipeline_base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index a7dba9310..a6c6a36b0 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -150,7 +150,7 @@ def pipeline( feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, use_fast: bool = True, token: Optional[Union[str, bool]] = None, - accelerator: Optional[str] = "ipex", + accelerator: Optional[str] = None, revision: Optional[str] = None, trust_remote_code: Optional[bool] = None, torch_dtype: Optional[Union[str, torch.dtype]] = None, @@ -226,9 +226,12 @@ def pipeline( ) if accelerator not in MAPPING_LOADING_FUNC: - raise ValueError( - f'Accelerator {accelerator} is not supported. Supported accelerator is {", ".join(MAPPING_LOADING_FUNC)}.' - ) + if accelerator is None: + msg = "Impossible to instantiate a pipeline without specifying an `accelerator`." + else: + msg = f"`accelerator` {accelerator} is not supported." 
+ + raise ValueError(msg + f" Supported list of `accelerator` is : {', '.join(MAPPING_LOADING_FUNC)}.") if accelerator == "ipex": if task not in list(IPEX_SUPPORTED_TASKS.keys()): From b21d14d675d893e0d8a4481cf5c51013eba56422 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 30 May 2024 18:51:56 +0200 Subject: [PATCH 43/47] Fix bloom generation (#736) * Fix bloom generation * remove unused variable * add style * add message error * update model id --- optimum/intel/openvino/modeling_decoder.py | 6 +++--- tests/openvino/test_modeling.py | 6 +++--- tests/openvino/utils_tests.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 72cd1b648..fe7cf14c1 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -587,11 +587,11 @@ def _deduplicate_inputs(self, model_inputs: Dict): ) for input_name, input_tensor in model_inputs.items(): if input_name not in ["input_ids", "beam_idx"]: - if not isinstance(input_tensor, Tensor): + if input_name not in self.key_value_input_names: upd_model_inputs[input_name] = input_tensor[indicies] else: - shape = input_tensor.shape - dtype = input_tensor.element_type + shape = input_tensor.shape if isinstance(input_tensor, Tensor) else list(input_tensor.shape) + dtype = input_tensor.element_type if isinstance(input_tensor, Tensor) else Type(input_tensor.dtype) upd_batch_size = indicies.shape[0] if self.config.model_type == "bloom": upd_batch_size *= self.config.num_attention_heads diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 1e18fb066..0cb332276 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -879,14 +879,14 @@ def test_beam_search(self, model_arch): ov_model_stateless.config.eos_token_id = None transformers_model.config.eos_token_id = None - for gen_config in gen_configs: + for idx, gen_config in enumerate(gen_configs): if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]: continue transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) - self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs)) + self.assertTrue(torch.allclose(ov_stateful_outputs, transformers_outputs), f"generation config : {idx}") ov_stateless_outputs = ov_model_stateless.generate(**tokens, generation_config=gen_config) - self.assertTrue(torch.allclose(ov_stateless_outputs, transformers_outputs)) + self.assertTrue(torch.allclose(ov_stateless_outputs, transformers_outputs), f"generation config : {idx}") class OVModelForMaskedLMIntegrationTest(unittest.TestCase): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 91500cfc6..0789f1983 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -65,7 +65,7 @@ "internlm2": "katuni4ka/tiny-random-internlm2", "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-longt5", - "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama": "HuggingFaceM4/tiny-random-LlamaForCausalLM", "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM", "llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", From 683133f617a2fd0bfa4d05739b3a99fe7e7557e1 Mon Sep 17 00:00:00 2001 
From: Ella Charlaix Date: Thu, 30 May 2024 19:31:04 +0200 Subject: [PATCH 44/47] Dev version --- optimum/intel/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/version.py b/optimum/intel/version.py index 9668d6215..a2a857944 100644 --- a/optimum/intel/version.py +++ b/optimum/intel/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.17.0.dev0" +__version__ = "1.18.0.dev0" From 813d7c0fdbe0013f2ca249aecc06fac668d271c3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 31 May 2024 14:39:53 +0200 Subject: [PATCH 45/47] Add custom model export test (#677) * Add custom model export test * format --- setup.py | 2 +- tests/openvino/test_export.py | 29 +++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index d00ce1dd9..02d7f2845 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ "transformers_stream_generator", "einops", "tiktoken", - "sentence_transformers", + "sentence-transformers", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"] diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 851f8355f..8f61d9a36 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -18,8 +18,10 @@ from tempfile import TemporaryDirectory from typing import Optional +import torch from parameterized import parameterized -from transformers import AutoConfig +from sentence_transformers import SentenceTransformer, models +from transformers import AutoConfig, AutoTokenizer from utils_tests import MODEL_NAMES from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED @@ -124,7 +126,7 @@ def test_export(self, model_type: str): class CustomExportModelTest(unittest.TestCase): - def test_export_custom_model(self): + def test_custom_export_config_model(self): class BertOnnxConfigWithPooler(BertOnnxConfig): @property def outputs(self): @@ -157,3 +159,26 @@ def outputs(self): self.assertIsInstance(ov_model, OVBaseModel) self.assertTrue(ov_model.output_names == {"last_hidden_state": 0, "pooler_output": 1}) + + def test_export_custom_model(self): + model_id = "hf-internal-testing/tiny-random-BertModel" + word_embedding_model = models.Transformer(model_id, max_seq_length=256) + pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) + dense_model = models.Dense( + in_features=pooling_model.get_sentence_embedding_dimension(), + out_features=256, + ) + model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model]) + + with TemporaryDirectory() as tmpdirname: + export_from_model(model, output=tmpdirname, task="feature-extraction") + ov_model = OVModelForCustomTasks.from_pretrained(tmpdirname) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer("This is a sample input", return_tensors="pt") + with torch.no_grad(): + model_outputs = model(tokens) + + ov_outputs = ov_model(**tokens) + self.assertTrue(torch.allclose(ov_outputs.token_embeddings, model_outputs.token_embeddings, atol=1e-4)) + self.assertTrue(torch.allclose(ov_outputs.sentence_embedding, model_outputs.sentence_embedding, atol=1e-4)) From 6529306738d829bd2c62d9cfaab19615d4b96ab4 Mon Sep 17 00:00:00 2001 From: Nicolas Oliver Date: Mon, 3 Jun 2024 05:55:35 -0700 Subject: [PATCH 46/47] Update NNCF quantization notebook (#715) * Update quantized_generation_demo.ipynb * Update 
quantized_generation_demo.ipynb --- notebooks/openvino/quantized_generation_demo.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index 767106408..5673243cb 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ b/notebooks/openvino/quantized_generation_demo.ipynb @@ -32,7 +32,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ! pip install optimum[openvino,nncf] torch" + "# ! pip install optimum[openvino,nncf] torch==2.2.2" ] }, { From 096d94b9933d76a661d342cdcd42cae75dbda5ef Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 3 Jun 2024 18:40:15 +0200 Subject: [PATCH 47/47] Add gaudi section (#699) --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 49f0d7976..0226b5d47 100644 --- a/README.md +++ b/README.md @@ -239,3 +239,8 @@ Do not forget to install requirements for every example: cd pip install -r requirements.txt ``` + + +## Gaudi + +To train your model on [Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html), check out [Optimum Habana](https://github.com/huggingface/optimum-habana) which provides a set of tools enabling easy model loading, training and inference on single- and multi-HPU settings for different downstream tasks. After training your model, feel free to submit it to the Intel [leaderboard](https://huggingface.co/spaces/Intel/powered_by_intel_llm_leaderboard) which is designed to evaluate, score, and rank open-source LLMs that have been pre-trained or fine-tuned on Intel Hardwares. Models submitted to the leaderboard will be evaluated on the Intel Developer Cloud. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from the Eleuther AI Language Model Evaluation Harness.
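
Patches 41/47 and 42/47 together change how the `accelerator` argument of the `pipeline` factory in `optimum/intel/pipelines/pipeline_base.py` is resolved: the `"ort"` default first becomes `"ipex"` and is then dropped entirely, so callers are now expected to name a backend explicitly and otherwise get a `ValueError`. A minimal usage sketch of the resulting behaviour, assuming the public `from optimum.intel.pipelines import pipeline` import path, an installed `intel-extension-for-pytorch`, and a placeholder model ID (none of these are taken from the diffs themselves):

```python
# Sketch only: illustrates the explicit `accelerator` argument that patches
# 41-42 make mandatory. Import path, model ID and generation kwargs are
# assumptions for illustration, not part of the patch series.
from optimum.intel.pipelines import pipeline

pipe = pipeline(
    "text-generation",
    model="gpt2",        # hypothetical checkpoint; any causal LM ID would do
    accelerator="ipex",  # no default any more; leaving it unset raises ValueError
)

print(pipe("OpenVINO and IPEX make Intel inference", max_new_tokens=16))
```

The `accelerator not in MAPPING_LOADING_FUNC` check introduced in patch 42 is what turns a missing value into the explicit error message above rather than a silent fallback to a particular backend.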