added amdsmi dep

ml-energy · Oct 17, 2024 · 156961c · 156961c
1 parent 45e33df
commit 156961c
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 17 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,7 +29,8 @@ dependencies = [
     "pydantic",  # The `zeus.utils.pydantic_v1` compatibility layer allows us to unpin Pydantic in most cases.
     "rich",
     "tyro",
-    "httpx"
+    "httpx",
+    "amdsmi"
 ]
 dynamic = ["version"]
 

diff --git a/zeus/device/gpu/amd.py b/zeus/device/gpu/amd.py
@@ -1,6 +1,6 @@
 """AMD GPUs."""
 
-from __future__ import annotations 
+from __future__ import annotations
 import functools
 import os
 import concurrent.futures
@@ -13,7 +13,7 @@
     import amdsmi  # type: ignore
 # Must catch all exceptions, since ImportError is not the only exception that can be raised (e.g., OSError on version mismatch).
 # Specific exceptions are handled when import and initialization are retested in `amdsmi_is_available`.
-except Exception: 
+except Exception:
 
     class MockAMDSMI:
         """Mock class for AMD SMI library."""
@@ -80,13 +80,15 @@ def wrapper(*args, **kwargs):
 class AMDGPU(gpu_common.GPU):
     """Implementation of `GPU` for AMD GPUs."""
 
-    def __init__(self, gpu_index: int, executor: concurrent.futures.ThreadPoolExecutor) -> None:
+    def __init__(
+        self, gpu_index: int, executor: concurrent.futures.ThreadPoolExecutor
+    ) -> None:
         """Initialize the GPU object."""
         super().__init__(gpu_index)
         self._get_handle()
 
         # Check whether `_supportsGetTotalEnergyConsumption` should be true or false. The check returns a future object so that the constructor is non-blocking.
-        self.supports_energy_future = self.supportsGetTotalEnergyConsumption(executor) 
+        self.supports_energy_future = self.supportsGetTotalEnergyConsumption(executor)
 
     _exception_map = {
         1: gpu_common.ZeusGPUInvalidArgError,  # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL
@@ -253,21 +255,25 @@ def getAverageMemoryPowerUsage(self) -> int:
         )
 
     @_handle_amdsmi_errors
-    def supportsGetTotalEnergyConsumption(self, executor: concurrent.futures.ThreadPoolExecutor) -> concurrent.futures.Future:
+    def supportsGetTotalEnergyConsumption(
+        self, executor: concurrent.futures.ThreadPoolExecutor
+    ) -> concurrent.futures.Future:
         """Check if the GPU supports retrieving total energy consumption. Returns a future object of the result."""
 
         def check_energy_consumption():
             try:
-                wait_time = 0.5 # seconds
-                threshold = 0.01 # 1% threshold
+                wait_time = 0.5  # seconds
+                threshold = 0.01  # 1% threshold
 
                 power = self.getInstantPowerUsage()
                 initial_energy = self.getTotalEnergyConsumption()
                 time.sleep(wait_time)
                 final_energy = self.getTotalEnergyConsumption()
 
                 measured_energy = final_energy - initial_energy
-                expected_energy = power * wait_time # power is in mW, wait_time is in seconds
+                expected_energy = (
+                    power * wait_time
+                )  # power is in mW, wait_time is in seconds
 
                 # if the difference between measured and expected energy is less than 1% of the expected energy, then the API is supported
                 if abs(measured_energy - expected_energy) < threshold * expected_energy:
@@ -287,25 +293,24 @@ def check_energy_consumption():
                     self._supportsGetTotalEnergyConsumption = False
                 else:
                     raise e
-        
+
         future = executor.submit(check_energy_consumption)
         return future
-
 
     @_handle_amdsmi_errors
     def getTotalEnergyConsumption(self) -> int:
         """Return the total energy consumption of the GPU since driver load. Units: mJ."""
         energy_dict = amdsmi.amdsmi_get_energy_count(self.handle)
         if "energy_accumulator" in energy_dict:  # New API
-            energy = energy_dict["energy_accumulator"] * energy_dict["counter_resolution"]
+            energy = (
+                energy_dict["energy_accumulator"] * energy_dict["counter_resolution"]
+            )
         elif "power" in energy_dict and "counter_resolution" in energy_dict:  # Old API
             energy = energy_dict["power"] * energy_dict["counter_resolution"]
         else:
             raise ValueError("Unexpected energy dictionary format")
 
-        return int(
-            energy / 1e3
-        )  # returns in micro Joules, convert to mili Joules
+        return int(energy / 1e3)  # energy is in microjoules; convert to millijoules
 
 
 class AMDGPUs(gpu_common.GPUs):
@@ -359,9 +364,11 @@ def _init_gpus(self) -> None:
             visible_indices = [int(idx) for idx in visible_device.split(",")]
         else:
             visible_indices = list(range(len(amdsmi.amdsmi_get_processor_handles())))
-        
+
         # Create a thread pool with one worker per visible GPU.
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(visible_indices)) as executor:
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=len(visible_indices)
+        ) as executor:
             self._gpus = [AMDGPU(gpu_num, executor) for gpu_num in visible_indices]
 
             for gpu in self._gpus: