Skip to content

Commit

Permalink
zeusd debug outputs and doc comments (#87)
Browse files Browse the repository at this point in the history
  • Loading branch information
jaywonchung authored May 30, 2024
1 parent 85bac44 commit 59f1e8f
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 9 deletions.
9 changes: 9 additions & 0 deletions zeus/device/gpu/nvidia.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ def setPowerManagementLimit(self, power_limit_mw: int, block: bool = True) -> No
)
if resp.status_code != 200:
raise ZeusdError(f"Failed to set power management limit: {resp.text}")
logger.debug("Took %s ms to set power limit", resp.elapsed.microseconds / 1000)

@_handle_nvml_errors
def resetPowerManagementLimit(self, block: bool = True) -> None:
Expand All @@ -273,6 +274,9 @@ def setPersistenceMode(self, enabled: bool, block: bool = False) -> None:
)
if resp.status_code != 200:
raise ZeusdError(f"Failed to set persistence mode: {resp.text}")
logger.debug(
"Took %s ms to set persistence mode", resp.elapsed.microseconds / 1000
)

def setMemoryLockedClocks(
self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True
Expand All @@ -286,6 +290,9 @@ def setMemoryLockedClocks(
)
if resp.status_code != 200:
raise ZeusdError(f"Failed to set memory locked clocks: {resp.text}")
logger.debug(
"Took %s ms to set memory locked clocks", resp.elapsed.microseconds / 1000
)

def resetMemoryLockedClocks(self, block: bool = True) -> None:
"""Reset the locked memory clocks to the default."""
Expand Down Expand Up @@ -369,6 +376,8 @@ def _init_gpus(self) -> None:
raise ZeusdError(
f"ZEUSD_SOCK_PATH points to non-existent file: {sock_path}"
)
if not Path(sock_path).is_socket():
raise ZeusdError(f"ZEUSD_SOCK_PATH is not a socket: {sock_path}")
if not os.access(sock_path, os.W_OK):
raise ZeusdError(f"ZEUSD_SOCK_PATH is not writable: {sock_path}")
self._gpus = [
Expand Down
13 changes: 11 additions & 2 deletions zeusd/src/devices/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,31 +27,40 @@ use crate::error::ZeusdError;
/// This trait can be used to abstract over different GPU management libraries.
/// Currently, this is done to facilitate testing.
///
/// Every method returns a `Result` whose error type is `ZeusdError`; callers
/// should treat an `Err` as the requested operation not having succeeded.
pub trait GpuManager {
/// Get the number of GPUs visible in the node.
///
/// This is an associated function (it takes no `self`), so it is called on
/// the implementing type rather than on an instance. The `Self: Sized` bound
/// excludes it from trait objects.
///
/// # Errors
///
/// Returns a `ZeusdError` if the device count could not be obtained.
fn device_count() -> Result<u32, ZeusdError>
where
Self: Sized;
/// Set the persistence mode of the GPU.
///
/// # Errors
///
/// Returns a `ZeusdError` if the persistence mode could not be set.
fn set_persistence_mode(&mut self, enabled: bool) -> Result<(), ZeusdError>;
/// Set the power management limit in milliwatts.
///
/// `power_limit` is expressed in milliwatts.
///
/// # Errors
///
/// Returns a `ZeusdError` if the power limit could not be set.
fn set_power_management_limit(&mut self, power_limit: u32) -> Result<(), ZeusdError>;
/// Set the GPU's locked clock range in MHz.
///
/// `min_clock_mhz` and `max_clock_mhz` are the two ends of the range, both in MHz.
///
/// # Errors
///
/// Returns a `ZeusdError` if the clocks could not be locked.
fn set_gpu_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError>;
/// Reset the GPU's locked clocks.
///
/// # Errors
///
/// Returns a `ZeusdError` if the locked clocks could not be reset.
fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError>;
/// Set the memory locked clock range in MHz.
///
/// `min_clock_mhz` and `max_clock_mhz` are the two ends of the range, both in MHz.
///
/// # Errors
///
/// Returns a `ZeusdError` if the memory clocks could not be locked.
fn set_mem_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError>;
/// Reset the memory locked clocks.
///
/// # Errors
///
/// Returns a `ZeusdError` if the locked memory clocks could not be reset.
fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError>;
}

/// A request to execute a GPU command.
///
/// This is the type that is sent to the GPU management background task.
/// The optional `Sender` is used to send a response back to the caller if the
/// user wanted to block until the command is executed.
/// The `Span` is used to propagate tracing context starting from the request.
/// user wanted to block until the command is done executing.
/// The `Instant` object records when the request was received by the server.
/// It's used to log how long it took until the command was executed on the GPU.
/// The `Span` object is used to propagate tracing context starting from the request.
pub type GpuCommandRequest = (
GpuCommand,
Option<Sender<Result<(), ZeusdError>>>,
Expand Down
2 changes: 2 additions & 0 deletions zeusd/src/devices/mod.rs
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
//! Interfaces for interacting with devices

pub mod gpu;
15 changes: 8 additions & 7 deletions zeusd/src/routes/gpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use crate::error::ZeusdError;
/// - The `GpuCommand` variant name is the same as the API name, but the former is upper camel case
/// and the latter is snake case (e.g., `SetPowerLimit` vs. `set_power_limit`).
macro_rules! impl_handler_for_gpu_command {
($api:ident, $path:expr, $($field:ident <$ftype:ty>,)*) => {
($api:ident, $path:expr, $($field:ident: $ftype:ty,)*) => {
paste! {
// Request payload structure.
#[derive(Serialize, Deserialize, Debug)]
Expand Down Expand Up @@ -82,20 +82,20 @@ macro_rules! impl_handler_for_gpu_command {
impl_handler_for_gpu_command!(
set_persistence_mode,
post("/{gpu_id}/set_persistence_mode"),
enabled<bool>,
enabled: bool,
);

impl_handler_for_gpu_command!(
set_power_limit,
post("/{gpu_id}/set_power_limit"),
power_limit_mw<u32>,
power_limit_mw: u32,
);

impl_handler_for_gpu_command!(
set_gpu_locked_clocks,
post("/{gpu_id}/set_gpu_locked_clocks"),
min_clock_mhz<u32>,
max_clock_mhz<u32>,
min_clock_mhz: u32,
max_clock_mhz: u32,
);

impl_handler_for_gpu_command!(
Expand All @@ -106,15 +106,16 @@ impl_handler_for_gpu_command!(
impl_handler_for_gpu_command!(
set_mem_locked_clocks,
post("/{gpu_id}/set_mem_locked_clocks"),
min_clock_mhz<u32>,
max_clock_mhz<u32>,
min_clock_mhz: u32,
max_clock_mhz: u32,
);

// Generate the handler for the `reset_mem_locked_clocks` API, routed as a
// POST on `/{gpu_id}/reset_mem_locked_clocks`. No field list is passed, so
// the macro receives no request payload fields for this command.
// NOTE(review): presumably expands to `reset_mem_locked_clocks_handler`,
// matching the `*_handler` names registered in `gpu_routes` — confirm
// against the macro body.
impl_handler_for_gpu_command!(
reset_mem_locked_clocks,
post("/{gpu_id}/reset_mem_locked_clocks"),
);

/// Register GPU routes with the Actix web server.
pub fn gpu_routes(cfg: &mut web::ServiceConfig) {
cfg.service(set_persistence_mode_handler)
.service(set_power_limit_handler)
Expand Down
2 changes: 2 additions & 0 deletions zeusd/src/routes/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//! Routes and handlers for interacting with devices

pub mod gpu;

pub use gpu::gpu_routes;

0 comments on commit 59f1e8f

Please sign in to comment.