calling .cuda on tokens before mask construction (#111)
* calling .cuda on tokens before mask construction

Signed-off-by: sahilj <sahilj@nvidia.com>

* Remove unnecessary .cuda calls

Signed-off-by: sahilj <sahilj@nvidia.com>

---------

Signed-off-by: sahilj <sahilj@nvidia.com>
SahilJain314 authored Feb 26, 2024
1 parent 3d5bce9 commit e71e918
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions nemo_aligner/models/nlp/gpt/megatron_gpt_critic.py
@@ -116,7 +116,7 @@ def get_forward_output_and_loss_func(self):
         # validation step is not used
         def fwd_output_and_loss_func(data_iterator, model):
             batch = next(data_iterator)
-            tokens = batch["tokens"]
+            tokens = batch["tokens"].cuda()
             returns = batch["returns"]
             prev_values = batch["prev_values"]
             mask = batch["mask"]
@@ -125,7 +125,7 @@ def fwd_output_and_loss_func(data_iterator, model):
                 tokens, self.tokenizer.eos_id, False, True, False,
             )
 
-            attention_mask = attention_mask[0:1].cuda(non_blocking=True)
+            attention_mask = attention_mask[0:1]
 
             # when using PP, set the unused variables to None just to be safe
             if parallel_state.get_pipeline_model_parallel_world_size() > 1:
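
For context, a minimal sketch (not the NeMo-Aligner code) of the device-propagation behavior this commit relies on: in PyTorch, a mask built from tokens is allocated on tokens' device, so moving tokens to the GPU before mask construction means the mask is already on the GPU and the later explicit .cuda(non_blocking=True) on the mask becomes redundant. The helper name build_causal_mask below is hypothetical, standing in for get_ltor_masks_and_position_ids.

import torch

def build_causal_mask(tokens: torch.Tensor) -> torch.Tensor:
    # Hypothetical stand-in for get_ltor_masks_and_position_ids:
    # the mask is created on tokens.device, so it lives wherever
    # tokens already is.
    seq_len = tokens.size(1)
    return torch.tril(
        torch.ones(1, seq_len, seq_len, dtype=torch.bool, device=tokens.device)
    )

tokens = torch.randint(0, 1000, (4, 16))
if torch.cuda.is_available():
    tokens = tokens.cuda()               # move tokens first, as in this commit
attention_mask = build_causal_mask(tokens)
attention_mask = attention_mask[0:1]     # no extra .cuda() needed afterwards
assert attention_mask.device == tokens.device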
