Commit 3e5076d
added multigpu evaluation and prediction
Thilina Rajapakse committed Sep 24, 2020
1 parent 0710cf3 commit 3e5076d
Showing 8 changed files with 160 additions and 36 deletions.
14 changes: 12 additions & 2 deletions CHANGELOG.md
@@ -16,14 +16,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
- ClassificationModel
- NERModel
- QuestionAnsweringModel
- Seq2Seq
- T5Model
- ConvAI
- MultiModalClassificationModel
- Added multigpu prediction/eval in
- ClassificationModel
- MultiModalClassificationModel
- ConvAI
- NERModel
- QuestionAnsweringModel
- Seq2Seq
- T5Model

### Fixed

- Thread count can now be specified for MultiLabelClassificationModel.

### TODO

- Add multigpu prediction/eval
## [0.48.4] - 2020-09-23

### Fixed
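For context, a minimal usage sketch of what this changelog entry enables, assuming a two-GPU machine; the checkpoint, column names, and example texts are illustrative, not taken from the repo:

```python
# Minimal multi-GPU eval/predict sketch (illustrative data and checkpoint).
import pandas as pd

from simpletransformers.classification import ClassificationModel

eval_df = pd.DataFrame(
    [["best movie ever", 1], ["terrible acting", 0]],
    columns=["text", "labels"],
)

model = ClassificationModel(
    "roberta",
    "roberta-base",
    args={"n_gpu": 2},  # eval/predict now also fan batches out across both GPUs
)

result, model_outputs, wrong_predictions = model.eval_model(eval_df)
predictions, raw_outputs = model.predict(["an unseen example"])
```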
48 changes: 36 additions & 12 deletions simpletransformers/classification/classification_model.py
@@ -93,7 +93,16 @@

class ClassificationModel:
def __init__(
self, model_type, model_name, num_labels=None, weight=None, args=None, use_cuda=True, cuda_device=-1, onnx_execution_provider=None, **kwargs,
self,
model_type,
model_name,
num_labels=None,
weight=None,
args=None,
use_cuda=True,
cuda_device=-1,
onnx_execution_provider=None,
**kwargs,
):

"""
@@ -818,11 +827,15 @@ def evaluate(
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
model.eval()

if self.args.fp16:
from torch.cuda import amp

@@ -1096,8 +1109,10 @@ def predict(self, to_predict, multi_label=False):
preds = None
out_label_ids = None

if self.args.onnx:
model_inputs = self.tokenizer.batch_encode_plus(to_predict, return_tensors="pt", padding=True, truncation=True)
if not multi_label and self.args.onnx:
model_inputs = self.tokenizer.batch_encode_plus(
to_predict, return_tensors="pt", padding=True, truncation=True
)

for input_ids, attention_mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"]):
input_ids = input_ids.unsqueeze(0).detach().cpu().numpy()
@@ -1119,6 +1134,9 @@ def predict(self, to_predict, multi_label=False):
self._move_model_to_device()
dummy_label = 0 if not self.args.labels_map else next(iter(self.args.labels_map.keys()))

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if multi_label:
if isinstance(to_predict[0], list):
eval_examples = [
@@ -1132,7 +1150,9 @@ def predict(self, to_predict, multi_label=False):
]
else:
if isinstance(to_predict[0], list):
eval_examples = [InputExample(i, text[0], text[1], dummy_label) for i, text in enumerate(to_predict)]
eval_examples = [
InputExample(i, text[0], text[1], dummy_label) for i, text in enumerate(to_predict)
]
else:
eval_examples = [InputExample(i, text, None, dummy_label) for i, text in enumerate(to_predict)]
if args.sliding_window:
@@ -1175,7 +1195,9 @@ def predict(self, to_predict, multi_label=False):
if preds is None:
preds = logits.detach().cpu().numpy()
out_label_ids = inputs["labels"].detach().cpu().numpy()
all_layer_hidden_states = np.array([state.detach().cpu().numpy() for state in layer_hidden_states])
all_layer_hidden_states = np.array(
[state.detach().cpu().numpy() for state in layer_hidden_states]
)
all_embedding_outputs = embedding_outputs.detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
@@ -1272,7 +1294,7 @@ def convert_to_onnx(self, output_dir=None, set_onnx_arg=True):
Args:
output_dir (str, optional): If specified, ONNX model will be saved to output_dir (else args.output_dir will be used). Defaults to None.
set_onnx_arg (bool, optional): Updates the model args to set onnx=True. Defaults to True.
""" # noqa
""" # noqa
if not output_dir:
output_dir = os.path.join(self.args.output_dir, "onnx")
os.makedirs(output_dir, exist_ok=True)
@@ -1288,12 +1310,14 @@ def convert_to_onnx(self, output_dir=None, set_onnx_arg=True):
with tempfile.TemporaryDirectory() as temp_dir:
self.save_model(output_dir=temp_dir, model=self.model)

convert(framework="pt",
model=temp_dir,
tokenizer=self.tokenizer,
output=Path(onnx_model_name),
pipeline_name="sentiment-analysis",
opset=11)
convert(
framework="pt",
model=temp_dir,
tokenizer=self.tokenizer,
output=Path(onnx_model_name),
pipeline_name="sentiment-analysis",
opset=11,
)

self.args.onnx = True
self.tokenizer.save_pretrained(output_dir)
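Distilled from the hunks above: evaluation and prediction now conditionally wrap the model in torch.nn.DataParallel. A sketch of the pattern, and of why the eval loops reduce the loss with .mean(); the isinstance guard is an addition of this sketch, not part of the diff:

```python
# Sketch of the multi-GPU pattern added above: wrap only when more than
# one GPU is configured. DataParallel replicates the module per device
# and splits each batch along dim 0, so a scalar loss comes back as one
# value per GPU and must be reduced -- which is why the eval loops call
# tmp_eval_loss.mean().item().
import torch


def wrap_for_eval(model: torch.nn.Module, n_gpu: int) -> torch.nn.Module:
    # isinstance guard (not in the diff) avoids double-wrapping on repeated calls
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)
    return model
```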
simpletransformers/classification/multi_modal_classification_model.py
@@ -819,6 +819,12 @@ def evaluate(
out_label_ids = None
model.eval()

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if args.fp16:
from torch.cuda import amp

for batch in tqdm(eval_dataloader, disable=args.silent or silent, desc="Running Evaluation"):
batch = tuple(t.to(device) for t in batch)
labels = batch[5]
@@ -1018,14 +1024,24 @@ def predict(self, to_predict, image_path, image_type_extension=None):
preds = None
out_label_ids = None

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if args.fp16:
from torch.cuda import amp

for batch in tqdm(eval_dataloader, disable=args.silent, desc="Running Prediction"):
batch = tuple(t.to(device) for t in batch)
labels = batch[5]
with torch.no_grad():
inputs = self._get_inputs_dict(batch)

outputs = model(**inputs)
logits = outputs[0] # Different from default behaviour
if self.args.fp16:
with amp.autocast():
outputs = model(**inputs)
logits = outputs[0] # Different from default behaviour
else:
outputs = model(**inputs)
logits = outputs[0] # Different from default behaviour
tmp_eval_loss = self.criterion(logits, labels)

eval_loss += tmp_eval_loss.mean().item()
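The fp16 branches added to the multi-modal loops reduce to this shape; a self-contained sketch in which model and inputs stand in for the real batch objects:

```python
# Mixed-precision inference sketch matching the branches above:
# torch.cuda.amp.autocast (torch >= 1.6) runs the forward pass in half
# precision where safe; outside the context, tensors are fp32 again.
import torch
from torch.cuda import amp


def forward_logits(model, inputs, fp16: bool) -> torch.Tensor:
    with torch.no_grad():
        if fp16:
            with amp.autocast():
                outputs = model(**inputs)
        else:
            outputs = model(**inputs)
    return outputs[0]  # logits, per the "different from default behaviour" note
```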
34 changes: 31 additions & 3 deletions simpletransformers/conv_ai/conv_ai_model.py
@@ -587,13 +587,27 @@ def evaluate(self, eval_file, output_dir, verbose=True, silent=False, **kwargs):
}
model.eval()

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if args.fp16:
from torch.cuda import amp

for batch in tqdm(eval_dataloader, disable=args.silent or silent, desc="Running Evaluation"):
batch = tuple(t.to(device) for t in batch)

with torch.no_grad():
input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

lm_logits, mc_logits, *_ = model(input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,)
if args.fp16:
with amp.autocast():
lm_logits, mc_logits, *_ = model(
input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
)
else:
lm_logits, mc_logits, *_ = model(
input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids,
)
# model outputs are always tuple in pytorch-transformers (see doc)

lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
@@ -739,6 +753,9 @@ def interact(self, personality=None):
tokenizer = self.tokenizer
process_count = self.args.process_count

if self.args.fp16:
from torch.cuda import amp

self._move_model_to_device()

if not personality:
@@ -764,7 +781,11 @@
raw_text = input(">>> ")
history.append(tokenizer.encode(raw_text))
with torch.no_grad():
out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
if args.fp16:
with amp.autocast():
out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
else:
out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
history.append(out_ids)
history = history[-(2 * args.max_history + 1) :]
out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
@@ -791,6 +812,9 @@ def interact_single(self, message, history, personality=None, encode_history=Tru
tokenizer = self.tokenizer
process_count = self.args.process_count

if self.args.fp16:
from torch.cuda import amp

self._move_model_to_device()

if not personality:
@@ -813,7 +837,11 @@
history = [tokenizer.encode(sentence) for sentence in history]
history.append(tokenizer.encode(message))
with torch.no_grad():
out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
if args.fp16:
with amp.autocast():
out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
else:
out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
out_text = tokenizer.decode(out_ids, skip_special_tokens=True)

if encode_history:
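A possible consolidation, not what the diff does: the repeated fp16 if/else branches and local amp imports above could collapse into one helper. torch.cuda.amp requires torch >= 1.6, which is presumably why the import stays deferred behind the fp16 flag:

```python
# Not what the diff does -- a hypothetical consolidation of the repeated
# fp16 branches. The deferred amp import keeps torch < 1.6 installs
# working as long as fp16 stays off.
import contextlib


def autocast_if(fp16: bool):
    if fp16:
        from torch.cuda import amp  # torch >= 1.6 only
        return amp.autocast()
    return contextlib.nullcontext()


# usage sketch:
# with torch.no_grad(), autocast_if(args.fp16):
#     out_ids = self.sample_sequence(personality, history, tokenizer, model, args)
```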
41 changes: 29 additions & 12 deletions simpletransformers/ner/ner_model.py
@@ -78,7 +78,15 @@

class NERModel:
def __init__(
self, model_type, model_name, labels=None, args=None, use_cuda=True, cuda_device=-1, onnx_execution_provider=None, **kwargs,
self,
model_type,
model_name,
labels=None,
args=None,
use_cuda=True,
cuda_device=-1,
onnx_execution_provider=None,
**kwargs,
):
"""
Initializes a NERModel
@@ -717,6 +725,9 @@ def evaluate(self, eval_dataset, output_dir, verbose=True, silent=False, wandb_l
out_label_ids = None
model.eval()

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if self.args.fp16:
from torch.cuda import amp

@@ -869,7 +880,9 @@ def predict(self, to_predict, split_on_space=True):
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

if self.args.onnx:
model_inputs = self.tokenizer.batch_encode_plus(to_predict, return_tensors="pt", padding=True, truncation=True)
model_inputs = self.tokenizer.batch_encode_plus(
to_predict, return_tensors="pt", padding=True, truncation=True
)

for input_ids, attention_mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"]):
input_ids = input_ids.unsqueeze(0).detach().cpu().numpy()
@@ -886,9 +899,7 @@ def predict(self, to_predict, split_on_space=True):
else:
preds = np.append(preds, output[0], axis=0)
out_input_ids = np.append(out_input_ids, inputs_onnx["input_ids"], axis=0)
out_attention_mask = np.append(
out_attention_mask, inputs_onnx["attention_mask"], axis=0,
)
out_attention_mask = np.append(out_attention_mask, inputs_onnx["attention_mask"], axis=0,)
out_label_ids = np.zeros_like(out_input_ids)
else:
self._move_model_to_device()
@@ -898,6 +909,10 @@ def predict(self, to_predict, split_on_space=True):
preds = None
out_label_ids = None
model.eval()

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if self.args.fp16:
from torch.cuda import amp

@@ -1114,7 +1129,7 @@ def convert_to_onnx(self, output_dir=None, set_onnx_arg=True):
Args:
output_dir (str, optional): If specified, ONNX model will be saved to output_dir (else args.output_dir will be used). Defaults to None.
set_onnx_arg (bool, optional): Updates the model args to set onnx=True. Defaults to True.
""" # noqa
""" # noqa
if not output_dir:
output_dir = os.path.join(self.args.output_dir, "onnx")
os.makedirs(output_dir, exist_ok=True)
@@ -1130,12 +1145,14 @@ def convert_to_onnx(self, output_dir=None, set_onnx_arg=True):
with tempfile.TemporaryDirectory() as temp_dir:
self.save_model(output_dir=temp_dir, model=self.model)

convert(framework="pt",
model=temp_dir,
tokenizer=self.tokenizer,
output=Path(onnx_model_name),
pipeline_name="ner",
opset=11)
convert(
framework="pt",
model=temp_dir,
tokenizer=self.tokenizer,
output=Path(onnx_model_name),
pipeline_name="ner",
opset=11,
)

self.args.onnx = True
self.tokenizer.save_pretrained(output_dir)
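For reference, a hedged sketch of the ONNX round-trip that convert_to_onnx() enables. Reloading the export with args={"onnx": True} and a CPU execution provider is an assumption inferred from the self.args.onnx branch in predict() above, not from documented API:

```python
# Hedged ONNX round-trip sketch. "outputs/onnx" mirrors the default
# os.path.join(args.output_dir, "onnx"); loading the export back via
# args={"onnx": True} is an assumption based on the onnx branch in
# predict() above.
from simpletransformers.ner import NERModel

model = NERModel("bert", "bert-base-cased")
model.convert_to_onnx("outputs/onnx")  # exports graph + tokenizer, sets onnx=True

onnx_model = NERModel("bert", "outputs/onnx", args={"onnx": True}, use_cuda=False)
predictions, raw_outputs = onnx_model.predict(["Ada Lovelace was born in London"])
```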
simpletransformers/question_answering/question_answering_model.py
@@ -753,6 +753,9 @@ def evaluate(self, eval_data, output_dir, verbose_logging=False):
nb_eval_steps = 0
model.eval()

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if self.args.fp16:
from torch.cuda import amp

@@ -899,6 +902,10 @@ def predict(self, to_predict, n_best_size=None):
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

model.eval()

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if self.args.fp16:
from torch.cuda import amp

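Since DataParallel splits each step's batch across the replicas, the configured eval_batch_size can usually scale with the device count; an illustrative sizing sketch (the per-GPU baseline of 8 is arbitrary):

```python
# Illustrative sizing only: DataParallel gives each of the n replicas
# roughly eval_batch_size / n examples per step, so the configured
# batch size can typically grow with the GPU count.
import torch

n = max(1, torch.cuda.device_count())
args = {"n_gpu": n, "eval_batch_size": 8 * n}  # 8 per GPU: arbitrary baseline
```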
15 changes: 13 additions & 2 deletions simpletransformers/seq2seq/seq2seq_model.py
@@ -751,13 +751,24 @@ def evaluate(self, eval_dataset, output_dir, verbose=True, silent=False, **kwarg
nb_eval_steps = 0
model.eval()

if args.n_gpu > 1:
model = torch.nn.DataParallel(model)

if self.args.fp16:
from torch.cuda import amp

for batch in tqdm(eval_dataloader, disable=args.silent or silent, desc="Running Evaluation"):
# batch = tuple(t.to(device) for t in batch)

inputs = self._get_inputs_dict(batch)
with torch.no_grad():
outputs = model(**inputs)
loss = outputs[0]
if self.args.fp16:
with amp.autocast():
outputs = model(**inputs)
loss = outputs[0]
else:
outputs = model(**inputs)
loss = outputs[0]
eval_loss += loss.mean().item()
nb_eval_steps += 1

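The seq2seq hunk combines all three additions in one loop. A self-contained sketch of the resulting shape, with eval_dataloader and inputs_fn standing in for the real dataloader and _get_inputs_dict:

```python
# Self-contained sketch of the evaluation-loop shape after this commit:
# optional DataParallel wrap, optional autocast, mean-reduced loss.
import torch
from torch.cuda import amp


def evaluate_loss(model, eval_dataloader, inputs_fn, n_gpu=1, fp16=False):
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    model.eval()
    eval_loss, nb_eval_steps = 0.0, 0
    for batch in eval_dataloader:
        inputs = inputs_fn(batch)
        with torch.no_grad():
            if fp16:
                with amp.autocast():
                    loss = model(**inputs)[0]
            else:
                loss = model(**inputs)[0]
        eval_loss += loss.mean().item()  # per-GPU losses under DataParallel
        nb_eval_steps += 1
    return eval_loss / nb_eval_steps
```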
(Diffs for the remaining changed files did not load.)
