From d9377b826d24b2e1c206632bc40f39aab02b3d43 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 15 Oct 2024 15:28:37 +0800 Subject: [PATCH] Support generation search for transformers examples (#2029) Signed-off-by: Kaihui-intel --- .pre-commit-config.yaml | 2 +- .../text-generation/run_generation_gpu_woq.py | 1 + .../transformers/generation/__init__.py | 19 + .../transformers/generation/beam_search.py | 490 ++++++++++++++++++ .../transformers/generation/greedy_search.py | 401 ++++++++++++++ 5 files changed, 912 insertions(+), 1 deletion(-) create mode 100644 neural_compressor/transformers/generation/__init__.py create mode 100644 neural_compressor/transformers/generation/beam_search.py create mode 100644 neural_compressor/transformers/generation/greedy_search.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b821d93eb1..2875b945c57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -76,7 +76,7 @@ repos: )$ - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 + rev: 06907d0 hooks: - id: docformatter args: [ diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py index f92a2ff6b8c..7b63a015600 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/run_generation_gpu_woq.py @@ -8,6 +8,7 @@ import intel_extension_for_pytorch as ipex from neural_compressor.transformers import AutoModelForCausalLM, AutoRoundConfig, RtnConfig, GPTQConfig from neural_compressor.transformers.quantization.utils import convert_dtype_str2torch +from neural_compressor.transformers.generation import _greedy_search, _beam_search from transformers.utils import check_min_version import contextlib diff --git a/neural_compressor/transformers/generation/__init__.py b/neural_compressor/transformers/generation/__init__.py new file mode 100644 index 00000000000..4030000c22c --- /dev/null +++ b/neural_compressor/transformers/generation/__init__.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .beam_search import _beam_search +from .greedy_search import _greedy_search diff --git a/neural_compressor/transformers/generation/beam_search.py b/neural_compressor/transformers/generation/beam_search.py new file mode 100644 index 00000000000..d4372810078 --- /dev/null +++ b/neural_compressor/transformers/generation/beam_search.py @@ -0,0 +1,490 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import time +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +from torch import nn +from transformers.generation.beam_search import BeamScorer +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria +from transformers.utils import ModelOutput + + +class BeamSearchEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + sequences_scores: Optional[torch.FloatTensor] = None + scores: Optional[Tuple[torch.FloatTensor]] = None + beam_indices: Optional[torch.LongTensor] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class BeamSearchDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + sequences_scores: Optional[torch.FloatTensor] = None + scores: Optional[Tuple[torch.FloatTensor]] = None + beam_indices: Optional[torch.LongTensor] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] + + +def _beam_search( + self, + input_ids: torch.LongTensor, + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + **model_kwargs, +) -> Union[BeamSearchOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() + instead. 
For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (`BeamScorer`): + An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + Return: + [`BeamSearchDecoderOnlyOutput`], [`BeamSearchEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`BeamSearchEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... BeamSearchScorer, + ... ) + >>> import torch + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> encoder_input_str = "translate English to German: How old are you?" 
+ >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... num_beams=num_beams, + ... device=model.device, + ... ) + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt bist du?'] + ```""" + # init values + token_latency = (self.config.token_latency if hasattr(self.config, "token_latency") else False) or ( + self.token_latency if hasattr(self, "token_latency") else False + ) + + latency_list = [] + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.generation_config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." 
+ ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + beam_indices = tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens + # of the first beam are considered to avoid sampling the exact same tokens across all beams. + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + this_peer_finished = False # used by synced_gpus only + decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder + while True: + tic = time.time() + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + if ( + re.search("GPTJ", self.config.architectures[0]) + or re.search("llama", self.config.architectures[0], re.IGNORECASE) + or re.search("gptneox", self.config.architectures[0], re.IGNORECASE) + or re.search("OPT", self.config.architectures[0], re.IGNORECASE) + or re.search("falcon", self.config.architectures[0], re.IGNORECASE) + or re.search("rw", self.config.architectures[0], re.IGNORECASE) + ): + first_token = False + input_bs = input_ids.size()[0] + has_position_id = True + if model_inputs["past_key_values"] is None: + first_token = True + if first_token and hasattr(self, "trace_graph"): + if re.search("GPTJ", self.config.architectures[0]): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.n_layer) + ] + ) + elif re.search("llama", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("gptneox", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("OPT", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + has_position_id = False + elif re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search( + "rw", self.config.architectures[0], re.IGNORECASE + ): + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] 
+ ) + has_position_id = False + + if hasattr(self, "trace_graph"): + if first_token: + new_attention_mask = model_inputs["attention_mask"][:batch_size].clone() + new_input_ids = model_inputs["input_ids"][:batch_size].clone() + if has_position_id: + new_position_ids = model_inputs["position_ids"][:batch_size].clone() + for i in range(batch_size): + new_attention_mask[i] = model_inputs["attention_mask"][i * num_beams] + new_input_ids[i] = model_inputs["input_ids"][i * num_beams] + if has_position_id: + new_position_ids[i] = model_inputs["position_ids"][i * num_beams] + model_inputs["attention_mask"] = new_attention_mask + model_inputs["input_ids"] = new_input_ids + if has_position_id: + model_inputs["position_ids"] = new_position_ids + model_inputs.pop("use_cache", None) + model_inputs.pop("token_type_ids", None) + if first_token and hasattr(self, "trace_graph_first"): + outputs = self.trace_graph_first(**model_inputs) + else: + outputs = self.trace_graph(**model_inputs) + + if first_token and len(model_inputs["past_key_values"][1]) == 4: + outputs = list(outputs) + outputs[0] = outputs[0].repeat_interleave(num_beams, dim=0) + outputs = tuple(outputs) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs[0][:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + next_token_scores = nn.functional.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) + next_token_scores_processed = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(next_token_scores_processed) + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores_processed,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
+ n_eos_tokens = len(eos_token_id) if eos_token_id else 0 + next_token_scores, next_tokens = torch.topk( + next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past_key_values"] is not None: + model_kwargs["past_key_values"] = self._temporary_reorder_cache(model_kwargs["past_key_values"], beam_idx) + + if return_dict_in_generate and output_scores: + # pylint: disable=unsubscriptable-object + beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) + # increase cur_len + cur_len = cur_len + 1 + if token_latency: + if input_ids.is_xpu: + torch.xpu.synchronize() + latency_list.append(time.time() - tic) + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, + ) + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + + if self.config.is_encoder_decoder: + output_result = BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + beam_indices=sequence_outputs["beam_indices"], + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + beam_indices=sequence_outputs["beam_indices"], + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = sequence_outputs["sequences"] + # result + if token_latency: + return (output_result, latency_list) + else: + return output_result diff --git a/neural_compressor/transformers/generation/greedy_search.py b/neural_compressor/transformers/generation/greedy_search.py new file mode 100644 index 00000000000..f35211005ff --- /dev/null +++ b/neural_compressor/transformers/generation/greedy_search.py @@ -0,0 +1,401 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import time +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.distributed as dist +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria +from transformers.generation.streamers import BaseStreamer +from transformers.utils import ModelOutput + + +class GreedySearchDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class GreedySearchEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] + + +def _greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[Union[int, List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: bool = False, + streamer: Optional["BaseStreamer"] = None, + **model_kwargs, +) -> Union[GreedySearchOutput, torch.LongTensor]: + r"""Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be + used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + + In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. 
+ max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + model_kwargs: + Additional model specific keyword arguments will be forwarded to the `forward` function of the model. + If model is an encoder-decoder model the kwargs should include `encoder_outputs`. + Return: + [`GreedySearchDecoderOnlyOutput`], [`GreedySearchEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`GreedySearchEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... StoppingCriteriaList, + ... MaxLengthCriteria, + ... ) + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token + >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id + >>> input_prompt = "It might be possible to" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), + ... ] + ... ) + >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) + >>> outputs = model.greedy_search( + ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria + ... 
) + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ["It might be possible to get a better understanding of the nature of the problem, but it's not"] + ``` + """ + token_latency = (self.config.token_latency if hasattr(self.config, "token_latency") else False) or ( + self.token_latency if hasattr(self, "token_latency") else False + ) + + latency_list = [] + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None + output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.generation_config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + # keep track of which sequences are already finished + unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) + this_peer_finished = False # used by synced_gpus only + while True: + tic = time.time() + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + if ( + re.search("GPTJ", self.config.architectures[0]) + or re.search("llama", self.config.architectures[0], re.IGNORECASE) + or re.search("gptneox", self.config.architectures[0], re.IGNORECASE) + or re.search("OPT", self.config.architectures[0], re.IGNORECASE) + or re.search("falcon", self.config.architectures[0], re.IGNORECASE) + or re.search("rw", self.config.architectures[0], re.IGNORECASE) + ): + first_token = False + input_bs = input_ids.size()[0] + if model_inputs["past_key_values"] is None: + first_token = True + if first_token and hasattr(self, "trace_graph"): + if re.search("GPTJ", self.config.architectures[0]): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.n_layer) + ] + ) + elif re.search("llama", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("gptneox", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("OPT", self.config.architectures[0], re.IGNORECASE): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + elif re.search("falcon", self.config.architectures[0], re.IGNORECASE) or re.search( + "rw", self.config.architectures[0], re.IGNORECASE + ): + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long, device=input_ids.device + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long, device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + torch.zeros([1, 1, 1, 1], device=input_ids.device).contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + if hasattr(self, "trace_graph"): + model_inputs.pop("use_cache", None) + 
model_inputs.pop("token_type_ids", None) + outputs = self.trace_graph(**model_inputs) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs[0][:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + else: + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_tokens_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) + ) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id_tensor is not None: + unfinished_sequences = unfinished_sequences.mul( + next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) + ) + # stop when each sentence is finished + if unfinished_sequences.max() == 0: + this_peer_finished = True + # stop if we exceed the maximum length + if token_latency: + if input_ids.is_xpu: + torch.xpu.synchronize() + latency_list.append(time.time() - tic) + if stopping_criteria(input_ids, scores): + this_peer_finished = True + if this_peer_finished and not synced_gpus: + break + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + output_result = GreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = GreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + 
past_key_values=model_kwargs.get("past_key_values"), + ) + else: + output_result = input_ids + + if token_latency: + return (output_result, latency_list) + else: + return output_result