Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added AMD GPU Support to Zeus #57

Merged
merged 30 commits into from
May 2, 2024
Merged
Changes from 29 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0d657d2
init
parthraut Apr 2, 2024
d6cad1f
init from amd
parthraut Apr 2, 2024
f6d9242
init
parthraut Apr 2, 2024
0a25e77
init from amd
parthraut Apr 2, 2024
90a2be8
Merge branch 'amd_support' of https://github.com/ml-energy/zeus into …
parthraut Apr 7, 2024
dc53404
finished amd functions
parthraut Apr 7, 2024
7223562
finished impl of all amd func
parthraut Apr 7, 2024
7391159
small changes to gpu, should be ready to go
parthraut Apr 7, 2024
fbded1c
finished amdsmi impl
parthraut Apr 10, 2024
2b66ca6
output of example ran, seems to be giving 0 for output :(
parthraut Apr 10, 2024
6c1e97a
finished up exception handling
parthraut Apr 29, 2024
df75622
removed unnecessary files
parthraut Apr 29, 2024
46c4791
fixed linting and testing
parthraut Apr 29, 2024
02afe3b
fixed bug in test
parthraut Apr 29, 2024
c8b66cf
trying something new 🤪
parthraut Apr 29, 2024
f25184b
fixed bug in nvidia exception
parthraut Apr 30, 2024
13d4ee8
resolving conflicts
parthraut Apr 30, 2024
5767ae9
moved exception_map out of function
parthraut Apr 30, 2024
c91b02f
fixing merge conflicts involving setGpuLockedClocks method
parthraut Apr 30, 2024
a64d9c5
Merge branch 'master' into amd_support
parthraut Apr 30, 2024
7f611c8
fixed issue on amdgpus
parthraut Apr 30, 2024
6f3c171
removed ref to amdsmi, passing tests
parthraut Apr 30, 2024
a38b26e
fixed lint
parthraut Apr 30, 2024
efb4ac9
fixed minor doc
parthraut Apr 30, 2024
4b50fba
fixed error handling for supportsGetTotalEnergyConsumption
parthraut Apr 30, 2024
9ab0c3c
forgot to lint 🤪
parthraut Apr 30, 2024
99fcd38
fixed issues
parthraut May 2, 2024
a6da5cf
fixed linting
parthraut May 2, 2024
c217baf
changed docstring for AMDGPUs to only support 6.0
parthraut May 2, 2024
2466b81
Make the 6.0 notice prettier
jaywonchung May 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 115 additions & 62 deletions zeus/device/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@

import pynvml # necessary for testing to mock!

try:
import amdsmi # This is necessary so that AMDGPU can see the library. Import and initialization is retested in `amdsmi_is_available`.
except ImportError:
amdsmi = None

from zeus.device.exception import ZeusBaseGPUError
from zeus.utils.logging import get_logger

Expand Down Expand Up @@ -275,7 +280,7 @@ def wrapper(*args, **kwargs):
return func(*args, **kwargs)
except pynvml.NVMLError as e:
exception_class = NVIDIAGPU._exception_map.get(e.value, ZeusGPUUnknownError)
raise exception_class(e.msg) from e
raise exception_class(str(e)) from e

return wrapper

Expand Down Expand Up @@ -428,9 +433,11 @@ def _handle_amdsmi_errors(func):
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except amdsmi.AmdSmiException as e:
exception_class = AMDGPU._exception_map.get(e.value, ZeusGPUUnknownError)
raise exception_class(e.msg) from e
except amdsmi.AmdSmiLibraryException as e:
exception_class = AMDGPU._exception_map.get(
e.get_error_code(), ZeusGPUUnknownError
)
raise exception_class(e.get_error_info()) from e

return wrapper

Expand All @@ -447,40 +454,66 @@ def __init__(self, gpu_index: int) -> None:
"""Initializes the AMDGPU object with a specified GPU index. Acquires a handle to the GPU using `amdsmi.amdsmi_get_processor_handles()`."""
super().__init__(gpu_index)
self._get_handle()
self._supportsGetTotalEnergyConsumption = None

_exception_map = {}
_exception_map = {
1: ZeusGPUInvalidArgError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL
2: ZeusGPUNotSupportedError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED
8: ZeusGPUTimeoutError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_TIMEOUT
10: ZeusGPUNoPermissionError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM
15: ZeusGPUMemoryError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_OUT_OF_RESOURCES
18: ZeusGPUInitError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INIT_ERROR
31: ZeusGPUNotFoundError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NOT_FOUND
32: ZeusGPUInitError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NOT_INIT
34: ZeusGPUDriverNotLoadedError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_DRIVER_NOT_LOADED
41: ZeusGPUInsufficientSizeError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INSUFFICIENT_SIZE
45: ZeusGPUDriverNotLoadedError, # amdsmi.amdsmi_wrapper.AMDSMI_NO_ENERGY_DRV
46: ZeusGPUDriverNotLoadedError, # amdsmi.amdsmi_wrapper.AMDSMI_NO_MSR_DRV
47: ZeusGPUDriverNotLoadedError, # amdsmi.amdsmi_wrapper.AMDSMI_NO_HSMP_DRV
48: ZeusGPUNotSupportedError, # amdsmi.amdsmi_wrapper.AMDSMI_NO_HSMP_SUP
49: ZeusGPUNotSupportedError, # amdsmi.amdsmi_wrapper.AMDSMI_NO_HSMP_MSG_SUP
50: ZeusGPUTimeoutError, # amdsmi.amdsmi_wrapper.AMDSMI_HSMP_TIMEOUT
51: ZeusGPUDriverNotLoadedError, # amdsmi.amdsmi_wrapper.AMDSMI_NO_DRV
52: ZeusGPULibraryNotFoundError, # amdsmi.amdsmi_wrapper.AMDSMI_FILE_NOT_FOUND
53: ZeusGPUInvalidArgError, # amdsmi.amdsmi_wrapper.AMDSMI_ARG_PTR_NULL
4294967295: ZeusGPUUnknownError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR
}

@_handle_amdsmi_errors
def _get_handle(self):
handles = amdsmi.amdsmi_get_processor_handles()
if len(handles) <= self.gpu_index:
raise ZeusGPUNotFoundError(
f"GPU with index {self.gpu_index} not found. Found {len(handles)} GPUs."
)
self.handle = amdsmi.amdsmi_get_processor_handles()[self.gpu_index]

@_handle_amdsmi_errors
def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
"""Returns the minimum and maximum power management limits for the specified GPU. Units: mW."""
info = amdsmi.amdsmi_get_power_cap_info(self.handle)
return (info.min_power_cap, info.max_power_cap)
info = amdsmi.amdsmi_get_power_cap_info(self.handle) # Returns in W
return (info["min_power_cap"] * 1000, info["max_power_cap"] * 1000)

@_handle_amdsmi_errors
def setPersistenceMode(self, enable: bool) -> None:
"""Enables persistence mode for the specified GPU."""
raise ZeusGPUNotSupportedError(
"Persistence mode is not supported for AMD GPUs yet"
)
profile = ... # TODO: find out correct profile
amdsmi.amdsmi_set_gpu_power_profile(self.handle, 0, profile)
"""If enable = True, enables persistence mode for the specified GPU. If enable = False, disables persistence mode."""
# N/A for AMD GPUs.
pass

@_handle_amdsmi_errors
def setPowerManagementLimit(self, value: int) -> None:
"""Sets the power management limit for the specified GPU to the given value. Unit: mW."""
amdsmi.amdsmi_set_power_cap(self.handle, sensor_id=0, cap=value)
amdsmi.amdsmi_set_power_cap(
self.handle, 0, int(value * 1000)
) # Units for set_power_cap: microwatts

@_handle_amdsmi_errors
def resetPowerManagementLimit(self) -> None:
"""Resets the power management limit for the specified GPU to the default value."""
info = amdsmi.amdsmi_get_power_cap_info(self.handle)
info = amdsmi.amdsmi_get_power_cap_info(self.handle) # Returns in W
amdsmi.amdsmi_set_power_cap(
self.handle, sensor_id=0, cap=info.default_power_cap
)
self.handle, 0, cap=int(info["default_power_cap"] * 1e6)
) # expects value in microwatts
Copy link
Member

@jaywonchung jaywonchung May 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Screenshot 2024-05-01 at 8 21 27 PM]

Holy shit... ROCm 5.7 returns in microwatts but 6.0 returns in watts... Is everything based on 5.7? Then let's keep it and later figure out how to make it work for 6.0...

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image

This is what I see; I was going off the source code for 6.0, so it should be correct for ROCm 6.0.

As for PyTorch, it looks like it just got full support for ROCm 6.0 with the release of PyTorch 2.3 a week ago.

Should we stick to ROCm 6.0 then?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, if everything is consistent with 6.0, let's keep it that way!


@_handle_amdsmi_errors
def setMemoryLockedClocks(self, minMemClockMHz: int, maxMemClockMHz: int) -> None:
Expand All @@ -495,32 +528,22 @@ def setMemoryLockedClocks(self, minMemClockMHz: int, maxMemClockMHz: int) -> Non
@_handle_amdsmi_errors
def getSupportedMemoryClocks(self) -> list[int]:
"""Returns a list of supported memory clock frequencies for the specified GPU. Units: MHz."""
num_supported, current, frequency = amdsmi.amdsmi_get_clk_freq(
self.handle, clk_type=amdsmi.AmdSmiClkType.MEM
) # TODO: Figure out correct clk_type
# frequency; List of frequencies, only the first num_supported frequencies are valid"""
return frequency[:num_supported]
raise ZeusGPUNotSupportedError(
"AMDSMI does not support querying memory frequencies"
)

@_handle_amdsmi_errors
def getSupportedGraphicsClocks(self, freq: int) -> list[int]:
"""Returns a list of supported graphics clock frequencies for the specified GPU at a given frequency. Units: MHz."""
raise ZeusGPUNotSupportedError(
"Getting supported graphics clocks is not supported for AMD GPUs yet"
"AMDSMI does not support querying GFX frequencies given a memory frequency"
)

@_handle_amdsmi_errors
def getName(self) -> str:
"""Returns the name of the specified GPU."""
(
market_name,
vendor_id,
device_id,
rev_id,
asic_serial,
) = amdsmi.amdsmi_get_gpu_asic_info(
self.handle
) # TODO: Does this return correct string
return market_name
info = amdsmi.amdsmi_get_gpu_asic_info(self.handle)
return info["market_name"]

@_handle_amdsmi_errors
def setGpuLockedClocks(self, minGpuClockMHz: int, maxGpuClockMHz: int) -> None:
Expand All @@ -529,43 +552,70 @@ def setGpuLockedClocks(self, minGpuClockMHz: int, maxGpuClockMHz: int) -> None:
self.handle,
minGpuClockMHz,
maxGpuClockMHz,
clk_type=amdsmi.AMDSMI_CLK_TYPE_GFX,
clk_type=amdsmi.AmdSmiClkType.GFX,
)

@_handle_amdsmi_errors
def resetMemoryLockedClocks(self) -> None:
"""Resets the memory locked clocks of the specified GPU to their default values."""
amdsmi.amdsmi_reset_gpu_clk(
self.handle, clk_type=amdsmi.AMDSMI_CLK_TYPE_SYS
) # TODO: check docs
# Get default MEM clock values
info = amdsmi.amdsmi_get_clock_info(
self.handle, amdsmi.AmdSmiClkType.MEM
) # returns MHz

amdsmi.amdsmi_set_gpu_clk_range(
self.handle,
info["min_clk"],
info["max_clk"],
clk_type=amdsmi.AmdSmiClkType.MEM,
) # expects MHz

@_handle_amdsmi_errors
def resetGpuLockedClocks(self) -> None:
"""Resets the GPU locked clocks of the specified GPU to their default values."""
amdsmi.amdsmi_reset_gpu_clk(
self.handle, clk_type=amdsmi.AMDSMI_CLK_TYPE_GFX
) # TODO: check docs
# Get default GPU clock values
info = amdsmi.amdsmi_get_clock_info(
self.handle, amdsmi.AmdSmiClkType.GFX
) # returns MHz

amdsmi.amdsmi_set_gpu_clk_range(
self.handle,
info["min_clk"],
info["max_clk"],
clk_type=amdsmi.AmdSmiClkType.GFX,
) # expects MHz

@_handle_amdsmi_errors
def getPowerUsage(self) -> int:
"""Returns the power usage of the specified GPU. Units: mW."""
raise ZeusGPUNotSupportedError(
"Getting power usage is not supported for AMD GPUs yet"
)
return int(
amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"] * 1000
) # returns in W, convert to mW

@_handle_amdsmi_errors
def supportsGetTotalEnergyConsumption(self) -> bool:
"""Returns True if the specified GPU supports retrieving the total energy consumption."""
raise ZeusGPUNotSupportedError(
"Getting total energy consumption is not supported for AMD GPUs yet"
)
if self._supportsGetTotalEnergyConsumption is None:
try:
_ = amdsmi.amdsmi_get_energy_count(self.handle)
self._supportsGetTotalEnergyConsumption = True
except amdsmi.AmdSmiLibraryException as e:
if (
e.get_error_code() == 2
): # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED
self._supportsGetTotalEnergyConsumption = False
else:
raise e

return self._supportsGetTotalEnergyConsumption

@_handle_amdsmi_errors
def getTotalEnergyConsumption(self) -> int:
"""Returns the total energy consumption of the specified GPU. Units: mJ."""
raise ZeusGPUNotSupportedError(
"Getting total energy consumption is not supported for AMD GPUs yet"
)
info = amdsmi.amdsmi_get_energy_count(self.handle)
return int(
info["power"] / 1e3
) # returns in microjoules, convert to millijoules


class UnprivilegedAMDGPU(AMDGPU):
Expand Down Expand Up @@ -727,8 +777,6 @@ def _init_gpus(self) -> None:
# initialize all GPUs
self._gpus = [NVIDIAGPU(gpu_num) for gpu_num in self.visible_indices]

# eventually replace with: self.gpus = [NVIDIAGPU(gpu_num) for gpu_num in self.visible_indices]

def __del__(self) -> None:
"""Shuts down the NVIDIA GPU monitoring library to release resources and clean up."""
with contextlib.suppress(pynvml.NVMLError):
Expand All @@ -738,9 +786,9 @@ def __del__(self) -> None:
class AMDGPUs(GPUs):
"""AMD GPU Manager object, containing individual AMDGPU objects, abstracting amdsmi calls and handling related exceptions.

This class provides a high-level interface to interact with AMD GPUs. `ROCR_VISIBLE_DEVICES` environment variable is respected if set. For example, if there are
4 GPUs and `ROCR_VISIBLE_DEVICES=0,2`, only GPUs 0 and 2 are instantiated. In this case, to access
GPU of ROCR index 0, use the index 0, and for ROCR index 2, use the index 1.
This class provides a high-level interface to interact with AMD GPUs. `HIP_VISIBLE_DEVICES` environment variable is respected if set. For example, if there are
4 GPUs and `HIP_VISIBLE_DEVICES=0,2`, only GPUs 0 and 2 are instantiated. In this case, to access
GPU of HIP index 0, use the index 0, and for HIP index 2, use the index 1.

This class provides a 1:1 mapping between the methods and AMDSMI library functions. For example, if you want to do the following:

Expand All @@ -756,8 +804,8 @@ class AMDGPUs(GPUs):
constraints = gpus.getPowerManagementLimitConstraints(gpu_index)
```

Note: This class instantiates (grabs the handle, by calling `amdsmi.amdsmi_get_processor_handles()`) all GPUs that are visible to the system, as determined by the `ROCR_VISIBLE_DEVICES` environment variable if set.

Note: This class instantiates (grabs the handle, by calling `amdsmi.amdsmi_get_processor_handles()`) all GPUs that are visible to the system, as determined by the `HIP_VISIBLE_DEVICES` environment variable if set.
!!! Only supports ROCM 6.0 !!!
"""

def __init__(self, ensure_homogeneous: bool = False) -> None:
Expand All @@ -775,23 +823,26 @@ def __init__(self, ensure_homogeneous: bool = False) -> None:
exception_class = AMDGPU._exception_map.get(e.value, ZeusBaseGPUError)
raise exception_class(e.msg) from e

@property
def gpus(self) -> Sequence[GPU]:
"""Returns a list of AMDGPU objects being tracked."""
return self._gpus

def _init_gpus(self) -> None:
# Must respect `ROCR_VISIBLE_DEVICES` if set
if (visible_device := os.environ.get("ROCR_VISIBLE_DEVICES")) is not None:
# Must respect `HIP_VISIBLE_DEVICES` if set
if (visible_device := os.environ.get("HIP_VISIBLE_DEVICES")) is not None:
self.visible_indices = [int(idx) for idx in visible_device.split(",")]
else:
self.visible_indices = list(
range(len(amdsmi.amdsmi_get_processor_handles()))
)

self._gpus = [AMDGPU(gpu_num) for gpu_num in self.visible_indices]

def __del__(self) -> None:
"""Shuts down the AMD GPU monitoring library to release resources and clean up."""
with contextlib.suppress(amdsmi.AmdSmiException):
amdsmi.amdsmi_shut_down() # Ignore error on shutdown. Necessary for proper cleanup and test functionality
@property
def gpus(self) -> Sequence[GPU]:
"""Returns a list of AMDGPU objects being tracked."""
raise NotImplementedError("AMDGPUs.gpus is not implemented yet.")


_gpus: GPUs | None = None
Expand Down Expand Up @@ -835,6 +886,7 @@ def nvml_is_available() -> bool:
return False
try:
pynvml.nvmlInit()
logger.info("PyNVML is available and initialized.")
return True
except pynvml.NVMLError:
logger.info("PyNVML is available but could not initialize.")
Expand All @@ -850,6 +902,7 @@ def amdsmi_is_available() -> bool:
return False
try:
amdsmi.amdsmi_init()
logger.info("amdsmi is available and initialized")
return True
except amdsmi.AmdSmiLibraryException:
logger.info("amdsmi is available but could not initialize.")
Expand Down
Loading