diff --git a/examples/imagenet/train_new.py b/examples/imagenet/train_new.py index e3ebcc22..68c1ba4b 100644 --- a/examples/imagenet/train_new.py +++ b/examples/imagenet/train_new.py @@ -531,7 +531,9 @@ def main_worker(gpu, ngpus_per_node, args): if args.local_rank == 0 or args.gpu is not None: monitor = ZeusMonitor( - gpu_indices=list(range(args.local_world_size)) if args.gpu is None else [args.gpu], + gpu_indices=list(range(args.local_world_size)) + if args.gpu is None + else [args.gpu], ) plo = GlobalPowerLimitOptimizer( monitor=monitor, @@ -545,8 +547,6 @@ def main_worker(gpu, ngpus_per_node, args): monitor = None plo = None - - for epoch in range(args.start_epoch, args.epochs): if args.local_rank == 0 or args.gpu is not None: plo.on_epoch_begin() diff --git a/pyproject.toml b/pyproject.toml index 7e792114..e9eaf44e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ ignore = [ "B019", # Usage of functools.lru_cache "PLR0913", # Too many function arguments "B905", # zip strict argument + "PLR0915", # Too many statements ] line-length = 120 diff --git a/zeus/optimizer/power_limit.py b/zeus/optimizer/power_limit.py index 50bfc987..2c15fb57 100644 --- a/zeus/optimizer/power_limit.py +++ b/zeus/optimizer/power_limit.py @@ -81,6 +81,7 @@ class Done: @dataclass class Measurement: """POD for GPU energy and time measurements for one power limit.""" + power_limit: int # In Watts. energy: float time: float @@ -187,9 +188,15 @@ def __init__( else: measurements = json.load(self.profile_path.open())["measurements"] self.measurements = [Measurement(**m) for m in measurements] - self.logger.info("Loaded previous profiling results from '%s'.", str(self.profile_path)) + self.logger.info( + "Loaded previous profiling results from '%s'.", str(self.profile_path) + ) optimal_power_limit = self._compute_optimal_power_limit() - self.logger.info("Optimal power limit is %d W.", optimal_power_limit // 1000) + self.logger.info( + "Optimal power limit is %d W for eta_knob %f.", + optimal_power_limit // 1000, + self.eta_knob, + ) self.state = Done(optimal_power_limit=optimal_power_limit) self._set_power_limit(self.state.optimal_power_limit) @@ -257,11 +264,13 @@ def on_step_begin(self) -> None: "Finished profiling for power limit %d W.", self.state.current_power_limit // 1000, ) - self.measurements.append(Measurement( - power_limit=self.state.current_power_limit // 1000, - energy=measurement.total_energy, - time=measurement.time, - )) + self.measurements.append( + Measurement( + power_limit=self.state.current_power_limit // 1000, + energy=measurement.total_energy, + time=measurement.time, + ) + ) # If we're done profiling all power limits, compute the optimal # power limit and transition to the Done state. Otherwise, move # on to the Warmup phase for the next power limit. @@ -317,7 +326,8 @@ def _compute_optimal_power_limit(self) -> int: """ max_power = max(self.power_limits) // 1000 * len(self.monitor.gpu_indices) cost_map = { - measurement.power_limit * 1000: zeus_cost( + measurement.power_limit + * 1000: zeus_cost( energy=measurement.energy, time=measurement.time, eta_knob=self.eta_knob,