From b0ce32dc9a66590651dbb47d93aca6917921a371 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Thu, 23 Nov 2023 03:33:32 -0500 Subject: [PATCH 1/7] add firmware versions and others in -g --- rvs/CMakeLists.txt | 11 ++++++++++- rvs/include/rvsexec.h | 2 +- rvs/src/rvsexec.cpp | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/rvs/CMakeLists.txt b/rvs/CMakeLists.txt index 527d4742..881de91b 100644 --- a/rvs/CMakeLists.txt +++ b/rvs/CMakeLists.txt @@ -111,9 +111,18 @@ if(BUILD_ADDRESS_SANITIZER) else() set(ASAN_LIB_PATH "$ENV{LD_LIBRARY_PATH}") endif() +# check if ROCM-SMI is installed +if(DEFINED RVS_ROCMSMI) + if(NOT RVS_ROCMSMI EQUAL 1) + if(NOT EXISTS "${ROCM_SMI_LIB_DIR}/lib${ROCM_SMI_LIB}.so") + message("ERROR: rocm_smi library can't be found!...") + RETURN() + endif() + endif() +endif() ## define include directories -include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS}) +include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS} ${ROCM_SMI_INC_DIR}) ## define lib directories link_directories(${CMAKE_CURRENT_BINARY_DIR} ${RVS_LIB_DIR} ${ROCT_LIB_DIR} ${ROCBLAS_LIB_DIR} ${ROCM_SMI_LIB_DIR} ${ASAN_LIB_PATH}) diff --git a/rvs/include/rvsexec.h b/rvs/include/rvsexec.h index 05e98a87..577bb3ea 100644 --- a/rvs/include/rvsexec.h +++ b/rvs/include/rvsexec.h @@ -71,7 +71,7 @@ class exec { void do_help(void); void do_version(void); int do_gpu_list(void); - + int enumerate_platform(); int do_yaml(const std::string& config_file); int do_yaml(yaml_data_type_t data_type, const std::string& data); int do_yaml_properties(const YAML::Node& node, diff --git a/rvs/src/rvsexec.cpp b/rvs/src/rvsexec.cpp index a099e69b..a1a5dac3 100644 --- a/rvs/src/rvsexec.cpp +++ b/rvs/src/rvsexec.cpp @@ -38,6 +38,7 @@ #include "include/rvsliblogger.h" #include "include/rvsoptions.h" #include "include/rvstrace.h" +#include "rocm_smi/rocm_smi.h" #define MODULE_NAME_CAPS "CLI" @@ -184,6 +185,8 @@ int rvs::exec::run() { } if (rvs::options::has_option("-g")) { + auto ret = enumerate_platform(); + std::cout << "post post post " << std::endl; int sts = do_gpu_list(); rvs::module::terminate(); logger::terminate(); @@ -445,6 +448,39 @@ void rvs::exec::do_help() { cout << "-h --help Display usage information and exit.\n"; } +std::string getOSName(){ + std::ifstream rel_file("/etc/os-release"); + if(!rel_file.good()){ + std::cout << "No /etc/os-release file, cant fetch details " << std::endl; + return std::string{}; + } + std::string line; + while (std::getline(rel_file, line)) + { + auto found = line.find("NAME") ; + if (found!=std::string::npos){ + found = line.find('\"'); + auto endquote = line.find_last_of('\"'); + if(found == std::string::npos || endquote == std::string::npos) + return std::string{}; + std::string osame = line.substr(found+1, endquote-found-1 ); + return osame; + } + } +} + + +int rvs::exec::enumerate_platform(){ + rsmi_init(0); + char vbname[1024]; + uint64_t fwver; + auto ret = rsmi_dev_vbios_version_get(0, vbname, 1024); + auto osName = getOSName(); + ret = rsmi_dev_firmware_version_get(0, RSMI_FW_BLOCK_FIRST, &fwver); + std::cout << " enumerate_platform is " << vbname << " and " < Date: Fri, 24 Nov 2023 02:05:47 -0500 Subject: [PATCH 2/7] Print details on VBIOS and FW versions --- include/rvs_util.h | 24 ++++++++++ rvs/include/rvsexec.h | 1 - rvs/src/rvsexec.cpp | 34 -------------- src/rvs_util.cpp | 100 +++++++++++++++++++++++++++++++++++++++++- 4 files changed, 122 insertions(+), 37 deletions(-) diff --git a/include/rvs_util.h b/include/rvs_util.h index 64dad7e2..950897c0 100644 --- a/include/rvs_util.h +++ b/include/rvs_util.h @@ -30,6 +30,30 @@ #include #include using std::map; +struct fm_ver{ + uint64_t asd; + uint64_t ce; + uint64_t dmcu; + uint64_t mc; + uint64_t me; + uint64_t mec; + uint64_t mec2; + uint64_t pfp; + uint64_t rlc; + uint64_t rlc_srlc; + uint64_t rlc_srlg; + uint64_t rlc_srls; + uint64_t sdma; + uint64_t sdma2; + uint64_t smc; + uint64_t sos; + uint64_t ta_ras; + uint64_t ta_xgmi; + uint64_t uvd; + uint64_t vce; + uint64_t vcn; + // end fwver +}; extern bool is_positive_integer(const std::string& str_val); diff --git a/rvs/include/rvsexec.h b/rvs/include/rvsexec.h index 577bb3ea..b04c3c02 100644 --- a/rvs/include/rvsexec.h +++ b/rvs/include/rvsexec.h @@ -71,7 +71,6 @@ class exec { void do_help(void); void do_version(void); int do_gpu_list(void); - int enumerate_platform(); int do_yaml(const std::string& config_file); int do_yaml(yaml_data_type_t data_type, const std::string& data); int do_yaml_properties(const YAML::Node& node, diff --git a/rvs/src/rvsexec.cpp b/rvs/src/rvsexec.cpp index a1a5dac3..def5265c 100644 --- a/rvs/src/rvsexec.cpp +++ b/rvs/src/rvsexec.cpp @@ -38,7 +38,6 @@ #include "include/rvsliblogger.h" #include "include/rvsoptions.h" #include "include/rvstrace.h" -#include "rocm_smi/rocm_smi.h" #define MODULE_NAME_CAPS "CLI" @@ -185,8 +184,6 @@ int rvs::exec::run() { } if (rvs::options::has_option("-g")) { - auto ret = enumerate_platform(); - std::cout << "post post post " << std::endl; int sts = do_gpu_list(); rvs::module::terminate(); logger::terminate(); @@ -448,39 +445,8 @@ void rvs::exec::do_help() { cout << "-h --help Display usage information and exit.\n"; } -std::string getOSName(){ - std::ifstream rel_file("/etc/os-release"); - if(!rel_file.good()){ - std::cout << "No /etc/os-release file, cant fetch details " << std::endl; - return std::string{}; - } - std::string line; - while (std::getline(rel_file, line)) - { - auto found = line.find("NAME") ; - if (found!=std::string::npos){ - found = line.find('\"'); - auto endquote = line.find_last_of('\"'); - if(found == std::string::npos || endquote == std::string::npos) - return std::string{}; - std::string osame = line.substr(found+1, endquote-found-1 ); - return osame; - } - } -} -int rvs::exec::enumerate_platform(){ - rsmi_init(0); - char vbname[1024]; - uint64_t fwver; - auto ret = rsmi_dev_vbios_version_get(0, vbname, 1024); - auto osName = getOSName(); - ret = rsmi_dev_firmware_version_get(0, RSMI_FW_BLOCK_FIRST, &fwver); - std::cout << " enumerate_platform is " << vbname << " and " < #include +#include #include #include +#include #include #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include "rocm_smi/rocm_smi.h" +/** +returns the current OS name +**/ +std::string getOSName(){ + std::ifstream rel_file("/etc/os-release"); + if(!rel_file.good()){ + std::cout << "No /etc/os-release file, cant fetch details " << std::endl; + return std::string{}; + } + std::string line; + while (std::getline(rel_file, line)) + { + auto found = line.find("NAME") ; + if (found!=std::string::npos){ + found = line.find('\"'); + auto endquote = line.find_last_of('\"'); + if(found == std::string::npos || endquote == std::string::npos) + return std::string{}; + std::string os_name = line.substr(found+1, endquote-found-1 ); + return os_name; + } + } +} + /** * splits a std::string based on a given delimiter * @param str_val input std::string @@ -182,6 +209,65 @@ bool fetch_gpu_list(int hip_num_gpu_devices, map& gpus_device_ind return amd_gpus_found; } +void firmware_version_get(int idx, rsmi_fw_block_t blk, uint64_t* val){ + auto ret = rsmi_dev_firmware_version_get(idx, blk, val); + if (ret != RSMI_STATUS_SUCCESS) + *val = std::numeric_limits::max(); +} + +void fill_firmware_versions(int idx, fm_ver& fmver){ + firmware_version_get(idx, RSMI_FW_BLOCK_ASD ,&fmver.asd); + firmware_version_get(idx, RSMI_FW_BLOCK_CE ,&fmver.ce); + firmware_version_get(idx, RSMI_FW_BLOCK_DMCU ,&fmver.dmcu); + firmware_version_get(idx, RSMI_FW_BLOCK_MC ,&fmver.mc); + firmware_version_get(idx, RSMI_FW_BLOCK_ME ,&fmver.me); + firmware_version_get(idx, RSMI_FW_BLOCK_MEC ,&fmver.mec); + firmware_version_get(idx, RSMI_FW_BLOCK_MEC2 ,&fmver.mec2); + firmware_version_get(idx, RSMI_FW_BLOCK_PFP ,&fmver.pfp); + firmware_version_get(idx, RSMI_FW_BLOCK_RLC ,&fmver.rlc); + firmware_version_get(idx, RSMI_FW_BLOCK_RLC_SRLC ,&fmver.rlc_srlc); + firmware_version_get(idx, RSMI_FW_BLOCK_RLC_SRLG ,&fmver.rlc_srlg); + firmware_version_get(idx, RSMI_FW_BLOCK_RLC_SRLS ,&fmver.rlc_srls); + firmware_version_get(idx, RSMI_FW_BLOCK_SDMA ,&fmver.sdma); + firmware_version_get(idx, RSMI_FW_BLOCK_SDMA2 ,&fmver.sdma2); + firmware_version_get(idx, RSMI_FW_BLOCK_SMC ,&fmver.smc); + firmware_version_get(idx, RSMI_FW_BLOCK_SOS ,&fmver.sos); + firmware_version_get(idx, RSMI_FW_BLOCK_TA_RAS ,&fmver.ta_ras); + firmware_version_get(idx, RSMI_FW_BLOCK_TA_XGMI ,&fmver.ta_xgmi); + firmware_version_get(idx, RSMI_FW_BLOCK_UVD ,&fmver.uvd); + firmware_version_get(idx, RSMI_FW_BLOCK_VCE ,&fmver.vce); + firmware_version_get(idx, RSMI_FW_BLOCK_VCN ,&fmver.vcn); +} + +std::string cleanprint(uint64_t val){ + std::string tmp{"NA"}; + if ( val != std::numeric_limits::max()) + tmp = std::to_string(val); + return tmp; +} +void print_fmversions(const fm_ver& fmver){ + std::cout << " BLOCK_ASD : " << cleanprint(fmver.asd) << "\n" + << " BLOCK_CE : " << cleanprint(fmver.ce) << "\n" + << " BLOCK_DMCU : " << cleanprint(fmver.dmcu) << "\n" + << " BLOCK_MC : " << cleanprint(fmver.mc) << "\n" + << " BLOCK_ME : " << cleanprint(fmver.me) << "\n" + << " BLOCK_MEC : " << cleanprint(fmver.mec) << "\n" + << " BLOCK_MEC2 : " << cleanprint(fmver.mec2) << "\n" + << " BLOCK_PFP : " << cleanprint(fmver.pfp) << "\n" + << " BLOCK_RLC : " << cleanprint(fmver.rlc) << "\n" + << " BLOCK_RLC_SRLC: " << cleanprint(fmver.rlc_srlc) << "\n" + << " BLOCK_RLC_SRLG: " << cleanprint(fmver.rlc_srlg) << "\n" + << " BLOCK_RLC_SRLS: " << cleanprint(fmver.rlc_srls) << "\n" + << " BLOCK_SDMA : " << cleanprint(fmver.sdma) << "\n" + << " BLOCK_SDMA2 : " << cleanprint(fmver.sdma2) << "\n" + << " BLOCK_SMC : " << cleanprint(fmver.smc) << "\n" + << " BLOCK_SOS : " << cleanprint(fmver.sos) << "\n" + << " BLOCK_TA_RAS : " << cleanprint(fmver.ta_ras) << "\n" + << " BLOCK_TA_XGMI : " << cleanprint(fmver.ta_xgmi) << "\n" + << " BLOCK_UVD : " << cleanprint(fmver.uvd) << "\n" + << " BLOCK_VCE : " << cleanprint(fmver.vce) << "\n" + << " BLOCK_VCN : " << cleanprint(fmver.vcn) << std::endl; +} int display_gpu_info (void) { @@ -191,8 +277,9 @@ int display_gpu_info (void) { int32_t node_id; int32_t gpu_id; int32_t device_id; + char vbios_ver[1024]; + fm_ver fmver; }; - char buff[1024]; int hip_num_gpu_devices; std::string errmsg = " No supported GPUs available."; @@ -203,6 +290,7 @@ int display_gpu_info (void) { std::cout << std::endl << errmsg << std::endl; return 0; } + rsmi_init(0); for (int i = 0; i < hip_num_gpu_devices; i++) { hipDeviceProp_t props; hipGetDeviceProperties(&props, i); @@ -231,9 +319,13 @@ int display_gpu_info (void) { info.node_id = node_id; info.gpu_id = gpu_id; info.device_id = dev_id; + auto ret = rsmi_dev_vbios_version_get(i, info.vbios_ver, 1024);// check returns TODO + info.fmver = {}; + fill_firmware_versions(i, info.fmver); gpu_info_list.push_back(info); } + std::cout << "Running on " << getOSName() << std::endl; std::sort(gpu_info_list.begin(), gpu_info_list.end(), [](const struct device_info& a, const struct device_info& b) { return a.node_id < b.node_id; }); @@ -242,10 +334,14 @@ int display_gpu_info (void) { for (const auto& info : gpu_info_list) { std::cout << info.bus << " - GPU[" << std::setw(2) << info.node_id << " - " << std::setw(5) << info.gpu_id << "] " << info.name - << " (Device " << info.device_id << ")\n"; + << " (Device " << info.device_id << ")" << "\n" + <<" VBIOS Version:" << std::string(info.vbios_ver) << "\n" + << " FirmWare Versions are: \n"; + print_fmversions(info.fmver); } } else { std::cout << std::endl << errmsg << std::endl; } + rsmi_shut_down(); return 0; } From 328c29813ce6c58a0566ad6bc424e70716b50b36 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Fri, 24 Nov 2023 03:46:39 -0500 Subject: [PATCH 3/7] cleanup --- src/rvs_util.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/rvs_util.cpp b/src/rvs_util.cpp index 60135633..33574ca6 100644 --- a/src/rvs_util.cpp +++ b/src/rvs_util.cpp @@ -325,7 +325,11 @@ int display_gpu_info (void) { gpu_info_list.push_back(info); } - std::cout << "Running on " << getOSName() << std::endl; + std::string rcm{ROCM_PATH}; + if (rcm.find('/') != std::string::npos){ + rcm = rcm.substr(rcm.find_last_of('/')+1); + } + std::cout << rcm << " Running on " << getOSName() << std::endl; std::sort(gpu_info_list.begin(), gpu_info_list.end(), [](const struct device_info& a, const struct device_info& b) { return a.node_id < b.node_id; }); From 39e77458634d5833d3f102300003665a47a77f05 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Fri, 12 Jan 2024 05:10:43 -0500 Subject: [PATCH 4/7] removed unused smi cmake variable --- rvs/CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/rvs/CMakeLists.txt b/rvs/CMakeLists.txt index 881de91b..454d6770 100644 --- a/rvs/CMakeLists.txt +++ b/rvs/CMakeLists.txt @@ -112,14 +112,14 @@ else() set(ASAN_LIB_PATH "$ENV{LD_LIBRARY_PATH}") endif() # check if ROCM-SMI is installed -if(DEFINED RVS_ROCMSMI) - if(NOT RVS_ROCMSMI EQUAL 1) - if(NOT EXISTS "${ROCM_SMI_LIB_DIR}/lib${ROCM_SMI_LIB}.so") - message("ERROR: rocm_smi library can't be found!...") - RETURN() - endif() - endif() -endif() +#if(DEFINED RVS_ROCMSMI) +# if(NOT RVS_ROCMSMI EQUAL 1) +# if(NOT EXISTS "${ROCM_SMI_LIB_DIR}/lib${ROCM_SMI_LIB}.so") +# message("ERROR: rocm_smi library can't be found!...") +# RETURN() +# endif() +# endif() +#endif() ## define include directories include_directories(./ ../ ${YAML_CPP_INCLUDE_DIRS} ${ROCM_SMI_INC_DIR}) From 5f7b238e5289741f34d13fca3917badc9ad3c334 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Fri, 12 Jan 2024 05:33:00 -0500 Subject: [PATCH 5/7] indentation fixes --- src/rvs_util.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rvs_util.cpp b/src/rvs_util.cpp index 33574ca6..ad0f9413 100644 --- a/src/rvs_util.cpp +++ b/src/rvs_util.cpp @@ -49,7 +49,7 @@ std::string getOSName(){ std::string line; while (std::getline(rel_file, line)) { - auto found = line.find("NAME") ; + auto found = line.find("NAME") ; if (found!=std::string::npos){ found = line.find('\"'); auto endquote = line.find_last_of('\"'); @@ -327,7 +327,7 @@ int display_gpu_info (void) { } std::string rcm{ROCM_PATH}; if (rcm.find('/') != std::string::npos){ - rcm = rcm.substr(rcm.find_last_of('/')+1); + rcm = rcm.substr(rcm.find_last_of('/')+1); } std::cout << rcm << " Running on " << getOSName() << std::endl; std::sort(gpu_info_list.begin(), gpu_info_list.end(), From bd97ca4b7ee9a7f06ce7358d4e54332ae70cc76b Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Fri, 19 Jan 2024 01:27:15 -0500 Subject: [PATCH 6/7] check return error --- src/rvs_util.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/rvs_util.cpp b/src/rvs_util.cpp index ad0f9413..dca93976 100644 --- a/src/rvs_util.cpp +++ b/src/rvs_util.cpp @@ -320,6 +320,10 @@ int display_gpu_info (void) { info.gpu_id = gpu_id; info.device_id = dev_id; auto ret = rsmi_dev_vbios_version_get(i, info.vbios_ver, 1024);// check returns TODO + if (ret != RSMI_STATUS_SUCCESS){ + info.vbios_ver = 0; + rvs::lp::Log("vbios check failure ", rvs::logresults); + } info.fmver = {}; fill_firmware_versions(i, info.fmver); gpu_info_list.push_back(info); From 37db9a758d3280333df75bf6570af396884bbd99 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Fri, 19 Jan 2024 01:31:11 -0500 Subject: [PATCH 7/7] issue fix --- src/rvs_util.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rvs_util.cpp b/src/rvs_util.cpp index dca93976..997dd719 100644 --- a/src/rvs_util.cpp +++ b/src/rvs_util.cpp @@ -321,7 +321,7 @@ int display_gpu_info (void) { info.device_id = dev_id; auto ret = rsmi_dev_vbios_version_get(i, info.vbios_ver, 1024);// check returns TODO if (ret != RSMI_STATUS_SUCCESS){ - info.vbios_ver = 0; + memset(info.vbios_ver, 0, 1024); rvs::lp::Log("vbios check failure ", rvs::logresults); } info.fmver = {};