Skip to content

Commit

Permalink
added amdsmi dep
Browse files Browse the repository at this point in the history
  • Loading branch information
parthraut committed Oct 17, 2024
1 parent 45e33df commit 156961c
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 17 deletions.
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ dependencies = [
"pydantic", # The `zeus.utils.pydantic_v1` compatibility layer allows us to unpin Pydantic in most cases.
"rich",
"tyro",
"httpx"
"httpx",
"amdsmi"
]
dynamic = ["version"]

Expand Down
39 changes: 23 additions & 16 deletions zeus/device/gpu/amd.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""AMD GPUs."""

from __future__ import annotations
from __future__ import annotations
import functools
import os
import concurrent.futures
Expand All @@ -13,7 +13,7 @@
import amdsmi # type: ignore
# Catch every exception here: ImportError is not the only one the import can raise (e.g., OSError on a version mismatch).
# Specific exceptions are handled when import and initialization are re-tested in `amdsmi_is_available`.
except Exception:
except Exception:

class MockAMDSMI:
"""Mock class for AMD SMI library."""
Expand Down Expand Up @@ -80,13 +80,15 @@ def wrapper(*args, **kwargs):
class AMDGPU(gpu_common.GPU):
"""Implementation of `GPU` for AMD GPUs."""

def __init__(self, gpu_index: int, executor: concurrent.futures.ThreadPoolExecutor) -> None:
def __init__(
self, gpu_index: int, executor: concurrent.futures.ThreadPoolExecutor
) -> None:
"""Initialize the GPU object."""
super().__init__(gpu_index)
self._get_handle()

# Probe whether total energy consumption readings are supported. The check is submitted to the executor and a future is stored, so the constructor does not block on it.
self.supports_energy_future = self.supportsGetTotalEnergyConsumption(executor)
self.supports_energy_future = self.supportsGetTotalEnergyConsumption(executor)

_exception_map = {
1: gpu_common.ZeusGPUInvalidArgError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL
Expand Down Expand Up @@ -253,21 +255,25 @@ def getAverageMemoryPowerUsage(self) -> int:
)

@_handle_amdsmi_errors
def supportsGetTotalEnergyConsumption(self, executor: concurrent.futures.ThreadPoolExecutor) -> concurrent.futures.Future:
def supportsGetTotalEnergyConsumption(
self, executor: concurrent.futures.ThreadPoolExecutor
) -> concurrent.futures.Future:
"""Check if the GPU supports retrieving total energy consumption. Returns a future object of the result."""

def check_energy_consumption():
try:
wait_time = 0.5 # seconds
threshold = 0.01 # 1% threshold
wait_time = 0.5 # seconds
threshold = 0.01 # 1% threshold

power = self.getInstantPowerUsage()
initial_energy = self.getTotalEnergyConsumption()
time.sleep(wait_time)
final_energy = self.getTotalEnergyConsumption()

measured_energy = final_energy - initial_energy
expected_energy = power * wait_time # power is in mW, wait_time is in seconds
expected_energy = (
power * wait_time
) # power is in mW, wait_time is in seconds

# if the difference between measured and expected energy is less than 1% of the expected energy, then the API is supported
if abs(measured_energy - expected_energy) < threshold * expected_energy:
Expand All @@ -287,25 +293,24 @@ def check_energy_consumption():
self._supportsGetTotalEnergyConsumption = False
else:
raise e

future = executor.submit(check_energy_consumption)
return future


@_handle_amdsmi_errors
def getTotalEnergyConsumption(self) -> int:
"""Return the total energy consumption of the GPU since driver load. Units: mJ."""
energy_dict = amdsmi.amdsmi_get_energy_count(self.handle)
if "energy_accumulator" in energy_dict: # New API
energy = energy_dict["energy_accumulator"] * energy_dict["counter_resolution"]
energy = (
energy_dict["energy_accumulator"] * energy_dict["counter_resolution"]
)
elif "power" in energy_dict and "counter_resolution" in energy_dict: # Old API
energy = energy_dict["power"] * energy_dict["counter_resolution"]
else:
raise ValueError("Unexpected energy dictionary format")

return int(
energy / 1e3
) # energy is in microjoules; convert to millijoules
return int(energy / 1e3) # energy is in microjoules; convert to millijoules


class AMDGPUs(gpu_common.GPUs):
Expand Down Expand Up @@ -359,9 +364,11 @@ def _init_gpus(self) -> None:
visible_indices = [int(idx) for idx in visible_device.split(",")]
else:
visible_indices = list(range(len(amdsmi.amdsmi_get_processor_handles())))

# Create a thread pool with one worker per visible GPU.
with concurrent.futures.ThreadPoolExecutor(max_workers=len(visible_indices)) as executor:
with concurrent.futures.ThreadPoolExecutor(
max_workers=len(visible_indices)
) as executor:
self._gpus = [AMDGPU(gpu_num, executor) for gpu_num in visible_indices]

for gpu in self._gpus:
Expand Down

0 comments on commit 156961c

Please sign in to comment.