From 1ec8aaf30161a9f45b7db34f9c41993e1a5b83e0 Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com>
Date: Fri, 11 Oct 2024 09:13:26 -0700
Subject: [PATCH 1/5] Added sccache timeout for vllm build (#230)

Co-authored-by: maleksan85
---
 Dockerfile.rocm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index e2f21b2b6105c..832d6f0fa7db1 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -173,7 +173,7 @@ RUN cd vllm \
     && python3 -m pip install -r requirements-rocm.txt \
     && python3 setup.py clean --all \
     && if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
-    && python3 setup.py bdist_wheel --dist-dir=dist
+    && SCCACHE_IDLE_TIMEOUT=1800 python3 setup.py bdist_wheel --dist-dir=dist
 # Build gradlib
 RUN cd vllm/gradlib \
     && python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist

From 0e0e96818d1d14b1562bd8c262dd147a574ab204 Mon Sep 17 00:00:00 2001
From: Charlie Fu
Date: Mon, 14 Oct 2024 10:03:44 -0500
Subject: [PATCH 2/5] Add fp8 for dbrx (#231)

* add fp8 for dbrx
* linting
---
 vllm/model_executor/models/dbrx.py | 70 ++++++++++++++++++------------
 1 file changed, 43 insertions(+), 27 deletions(-)

diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index d9b358aa7925d..77ebef8eda51d 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -18,7 +18,8 @@
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
@@ -82,33 +83,45 @@ def __init__(

     # Define custom weight loader for dbrx model
     def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
-                      weight_name: str):
+                      weight_name: str, param_name: str):
         tp_rank = get_tensor_model_parallel_rank()
         param_data = param.data
         shard_size = self.intermediate_size
         shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
         # DBRX uses GLU for each experts.
         # GLU has 3 linear layers: w1, v1 and w2.
- if weight_name.endswith("w1."): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ) - param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :] - if weight_name.endswith("v1."): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ) - param_data[:, - shard_size:2 * shard_size, :] = loaded_weight[:, - shard, :] - if weight_name.endswith("w2."): - loaded_weight = torch.reshape( - loaded_weight, - [-1, self.intermediate_size * self.tp_size, self.d_model], - ).transpose(1, 2) - param_data[:] = loaded_weight[:, :, shard] + if weight_name.endswith("w1"): + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ) + param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :] + elif param_name.endswith("weight_scale"): + param_data[:, 0] = loaded_weight + else: + param_data = loaded_weight + if weight_name.endswith("v1"): + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ) + param_data[:, shard_size:2 * + shard_size, :] = loaded_weight[:, shard, :] + elif param_name.endswith("weight_scale"): + param_data[:, 1] = loaded_weight + else: + param_data[:] = loaded_weight + if weight_name.endswith("w2"): + if param_name.endswith("weight"): + loaded_weight = torch.reshape( + loaded_weight, + [-1, self.intermediate_size * self.tp_size, self.d_model], + ).transpose(1, 2) + param_data[:] = loaded_weight[:, :, shard] + else: + param_data[:] = loaded_weight class DbrxMoE(nn.Module): @@ -409,13 +422,13 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): expert_params_mapping = [( - "w13_" if weight_name in ["w1", "v1"] else "w2_", - f"mlp.{weight_name}.", + "w13" if weight_name in ["w1", "v1"] else "w2", + f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) for name, loaded_weight in weights: - if name.endswith(("w1", "v1", "w2")): - name = name + ".weight" + if name.endswith(("w1", "w2", "v1")): + name = name + "_weight" for param_name, weight_name in expert_params_mapping: if weight_name not in name: continue @@ -424,11 +437,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): continue param = params_dict[name] weight_loader = param.weight_loader - weight_loader(param, loaded_weight, weight_name) + weight_loader(param, loaded_weight, weight_name, name) break else: if is_pp_missing_parameter(name, self): continue + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From 35e2c542f347375f37f8e6122cabd08a3428c028 Mon Sep 17 00:00:00 2001 From: dhonnappa-amd Date: Mon, 14 Oct 2024 10:39:28 -0500 Subject: [PATCH 3/5] Update Buildkite env variable (#232) --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 6397674563aea..f50fdd4fa583c 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -5,7 +5,7 @@ set -o pipefail echo "--- Confirming Clean Initial State" while true; do sleep 3 - if grep -q clean ${BUILDKITE_META_DATA_RESET_TARGET}; then + if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then echo "GPUs state is 
\"clean\"" break fi From 82cfa5a61e35c9736322ebb5d693966bf0d999c8 Mon Sep 17 00:00:00 2001 From: seungrokj <144636725+seungrokj@users.noreply.github.com> Date: Thu, 17 Oct 2024 00:25:41 +0900 Subject: [PATCH 4/5] cuda graph + num-scheduler-steps bug fix (#236) * cuda graph + num-scheduler-steps bug fix * cuda graph + num-scheduler-steps bug fix * linting --- vllm/attention/backends/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 2b8c373178ab3..e451cd5522d18 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -218,9 +218,18 @@ def build(self, seq_lens: List[int], query_lens: List[int], # The shape of graph_block_tables is # [max batch size, max context len // block size]. input_block_tables = self.runner.graph_block_tables[:batch_size] + max_blocks = input_block_tables.shape[1] for i, block_table in enumerate(self.block_tables): if block_table: - input_block_tables[i, :len(block_table)] = block_table + num_blocks = len(block_table) + if num_blocks <= max_blocks: + input_block_tables[i, :num_blocks] = block_table + else: + # It may be possible to have more blocks allocated due + # to lookahead slots of multi-step, however, they are + # not used anyway, so can be safely ignored. + input_block_tables[ + i, :max_blocks] = block_table[:max_blocks] block_tables = torch.from_numpy(input_block_tables).to( device, non_blocking=True) else: From 165837071089100caeabb364abe81085d976bc72 Mon Sep 17 00:00:00 2001 From: TJian Date: Wed, 16 Oct 2024 08:34:26 -0700 Subject: [PATCH 5/5] [Model] [BUG] Fix code path logic to load mllama model (#234) * fix code path logic to load mllama model * fix lint error * fix lint error --------- Co-authored-by: tjtanaa --- vllm/attention/backends/utils.py | 57 +++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 16 deletions(-) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index e451cd5522d18..f3e5670b7b110 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -7,7 +7,7 @@ from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder, AttentionState) -from vllm.utils import async_tensor_h2d, make_tensor_with_pad +from vllm.utils import async_tensor_h2d, is_hip, make_tensor_with_pad if TYPE_CHECKING: from vllm.worker.model_runner_base import ModelRunnerBase @@ -334,11 +334,19 @@ def graph_capture_get_metadata_for_batch( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. 
-            assert self.runner.attn_backend.get_name() == "xformers", \
-                f"Expected attn_backend name to be 'xformers', but "\
-                f" got '{self.runner.attn_backend.get_name()}'"
-            self._update_captured_metadata_for_enc_dec_model(
-                batch_size=batch_size, attn_metadata=attn_metadata)
+            if is_hip():
+                assert (
+                    self.runner.attn_backend.get_name() == "rocm-flash-attn"
+                ), (f"Expected attn_backend name to be 'rocm-flash-attn', but "
+                    f" got '{self.runner.attn_backend.get_name()}'")
+                self._update_captured_metadata_for_enc_dec_model(
+                    batch_size=batch_size, attn_metadata=attn_metadata)
+            else:
+                assert self.runner.attn_backend.get_name() == "xformers", \
+                    f"Expected attn_backend name to be 'xformers', but "\
+                    f" got '{self.runner.attn_backend.get_name()}'"
+                self._update_captured_metadata_for_enc_dec_model(
+                    batch_size=batch_size, attn_metadata=attn_metadata)

         return attn_metadata

@@ -354,11 +362,19 @@ def get_graph_input_buffers(
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.
-            assert self.runner.attn_backend.get_name() == "xformers", \
-                f"Expected attn_backend name to be 'xformers', but "\
-                f" got '{self.runner.attn_backend.get_name()}'"
-            self._add_additonal_input_buffers_for_enc_dec_model(
-                attn_metadata=attn_metadata, input_buffers=input_buffers)
+            if is_hip():
+                assert (
+                    self.runner.attn_backend.get_name() == "rocm-flash-attn"
+                ), (f"Expected attn_backend name to be 'rocm-flash-attn', but "
+                    f" got '{self.runner.attn_backend.get_name()}'")
+                self._add_additonal_input_buffers_for_enc_dec_model(
+                    attn_metadata=attn_metadata, input_buffers=input_buffers)
+            else:
+                assert self.runner.attn_backend.get_name() == "xformers", \
+                    f"Expected attn_backend name to be 'xformers', but "\
+                    f" got '{self.runner.attn_backend.get_name()}'"
+                self._add_additonal_input_buffers_for_enc_dec_model(
+                    attn_metadata=attn_metadata, input_buffers=input_buffers)
         return input_buffers

     def prepare_graph_input_buffers(
@@ -373,11 +389,20 @@ def prepare_graph_input_buffers(
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.
-            assert self.runner.attn_backend.get_name() == "xformers", \
-                f"Expected attn_backend name to be 'xformers', but "\
-                f" got '{self.runner.attn_backend.get_name()}'"
-            self._prepare_input_buffers_for_enc_dec_model(
-                attn_metadata, input_buffers)
+
+            if is_hip():
+                assert (
+                    self.runner.attn_backend.get_name() == "rocm-flash-attn"
+                ), (f"Expected attn_backend name to be 'rocm-flash-attn', but "
+                    f" got '{self.runner.attn_backend.get_name()}'")
+                self._prepare_input_buffers_for_enc_dec_model(
+                    attn_metadata, input_buffers)
+            else:
+                assert self.runner.attn_backend.get_name() == "xformers", \
+                    f"Expected attn_backend name to be 'xformers', but "\
+                    f" got '{self.runner.attn_backend.get_name()}'"
+                self._prepare_input_buffers_for_enc_dec_model(
+                    attn_metadata, input_buffers)

     def begin_forward(self, model_input) -> None:
         return
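
Note on PATCH 5/5: all three hunks enforce the same rule, namely that the
encoder-decoder CUDA-graph path expects the "rocm-flash-attn" backend when
running on HIP (is_hip()) and the "xformers" backend everywhere else. The
snippet below is a minimal standalone sketch of that invariant, not part of
the patch: check_enc_dec_backend, backend_name, and on_hip are hypothetical
stand-ins for self.runner.attn_backend.get_name() and vllm.utils.is_hip().

    # Illustrative only (not part of the patch): the check the three hunks
    # repeat, expressed as one standalone helper.
    def check_enc_dec_backend(backend_name: str, on_hip: bool) -> None:
        expected = "rocm-flash-attn" if on_hip else "xformers"
        assert backend_name == expected, (
            f"Expected attn_backend name to be '{expected}', "
            f"but got '{backend_name}'")

    # Example usage:
    check_enc_dec_backend("rocm-flash-attn", on_hip=True)   # passes
    check_enc_dec_backend("xformers", on_hip=False)         # passes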