From b1872e1ed8542d9aee8e659f2336289a89588062 Mon Sep 17 00:00:00 2001 From: Angel Gonzalez Date: Tue, 7 May 2024 11:35:59 +0200 Subject: [PATCH] Adding checkpoint after training ends --- src/nanotron/config/config.py | 1 + src/nanotron/trainer.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index d9946f26..e26fac75 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -129,6 +129,7 @@ class CheckpointsArgs: checkpoints_path: Path checkpoint_interval: int save_initial_state: Optional[bool] = False + save_final_state: Optional[bool] = False resume_checkpoint_path: Optional[Path] = None checkpoints_path_is_shared_file_system: Optional[bool] = False diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 0eda00dc..70d023fb 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -442,7 +442,10 @@ def train( self.save_checkpoint() dist.barrier() # let's wait for everyone before leaving - + + if self.config.checkpoints.save_final_state: + self.save_checkpoint() + self.post_training() def training_step(