Skip to content

Commit dc400d9

Browse files
author
Meng, Li (Jassmine)
authored
[SWDEV-230863] add two new interfaces for background health check (#4)
1. Get the bad pages threshold of a processor. 2. Verify the checksum of RAS EEPROM Signed-off-by: Meng Li <li.meng@amd.com>
1 parent d32f2a1 commit dc400d9

File tree

4 files changed

+124
-0
lines changed

4 files changed

+124
-0
lines changed

include/amd_smi/amdsmi.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,8 @@ typedef enum {
260260
AMDSMI_STATUS_ARG_PTR_NULL = 53, //!< Parsed argument is invalid
261261
AMDSMI_STATUS_AMDGPU_RESTART_ERR = 54, //!< AMDGPU restart failed
262262
AMDSMI_STATUS_SETTING_UNAVAILABLE = 55, //!< Setting is not available
263+
AMDSMI_STATUS_CORRUPTED_EEPROM = 56, //!< EEPROM is corrupted
264+
263265
// General errors
264266
AMDSMI_STATUS_MAP_ERROR = 0xFFFFFFFE, //!< The internal library error did not map to a status code
265267
AMDSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
@@ -2640,6 +2642,44 @@ amdsmi_status_t
26402642
amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages,
26412643
amdsmi_retired_page_record_t *info);
26422644

2645+
2646+
/**
2647+
* @brief Get the bad pages threshold of a processor. It is not supported on virtual
2648+
* machine guest
2649+
*
2650+
* @platform{gpu_bm_linux} @platform{host}
2651+
*
2652+
* @details This call will query the device @p processor_handle for the
2653+
* threshold of bad pages (written to @p threshold address).
2654+
* @param[in] processor_handle a processor handle
2655+
* @param[out] threshold pointer to the location that receives the bad page count threshold; must not be null
2656+
*
2657+
* @note This function requires root access
2658+
*
2659+
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
2660+
*/
2661+
amdsmi_status_t
2662+
amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold);
2663+
2664+
/**
2665+
* @brief Verify the checksum of RAS EEPROM. It is not supported on virtual
2666+
* machine guest
2667+
*
2668+
* @platform{gpu_bm_linux} @platform{host}
2669+
*
2670+
* @details This call will verify the device @p processor_handle for the
2671+
* checksum of RAS EEPROM.
2672+
* @param[in] processor_handle a processor handle
2673+
*
2674+
* @note This function requires root access
2675+
*
2676+
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success
2677+
* ::AMDSMI_STATUS_CORRUPTED_EEPROM if the device's RAS EEPROM is corrupted
2678+
* any other status code on failure
2679+
*/
2680+
amdsmi_status_t
2681+
amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle);
2682+
26432683
/**
26442684
* @brief Returns RAS features info.
26452685
*

include/amd_smi/impl/amd_smi_utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ amdsmi_status_t smi_amdgpu_get_power_cap(amd::smi::AMDSmiGPUDevice* device, int
4545
amdsmi_status_t smi_amdgpu_get_ranges(amd::smi::AMDSmiGPUDevice* device, amdsmi_clk_type_t domain, int *max_freq, int *min_freq, int *num_dpm, int *sleep_state_freq);
4646
amdsmi_status_t smi_amdgpu_get_enabled_blocks(amd::smi::AMDSmiGPUDevice* device, uint64_t *enabled_blocks);
4747
amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device, uint32_t *num_pages, amdsmi_retired_page_record_t *info);
48+
amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device, uint32_t *threshold);
49+
amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device);
4850
amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt);
4951
amdsmi_status_t smi_amdgpu_get_driver_version(amd::smi::AMDSmiGPUDevice* device, int *length, char *version);
5052
amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uint32_t *pcie_speed);

src/amd_smi/amd_smi.cc

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2354,6 +2354,40 @@ amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t
23542354
return AMDSMI_STATUS_SUCCESS;
23552355
}
23562356

2357+
// Public entry point: fetch the bad-page threshold for the GPU behind
// `processor_handle` and store it in `*threshold`.
//
// Returns AMDSMI_STATUS_INVAL when `threshold` is null, the handle-lookup
// status when the handle cannot be resolved to a GPU device, and otherwise
// whatever smi_amdgpu_get_bad_page_threshold() reports.
amdsmi_status_t
amdsmi_get_gpu_bad_page_threshold(amdsmi_processor_handle processor_handle, uint32_t *threshold) {
    AMDSMI_CHECK_INIT();

    // Reject a missing output location before touching the device.
    if (!threshold) {
        return AMDSMI_STATUS_INVAL;
    }

    amd::smi::AMDSmiGPUDevice* device = nullptr;
    const amdsmi_status_t lookup = get_gpu_device_from_handle(processor_handle, &device);
    if (lookup != AMDSMI_STATUS_SUCCESS) {
        return lookup;
    }

    // The helper's status is forwarded unchanged, success or failure.
    return smi_amdgpu_get_bad_page_threshold(device, threshold);
}
2378+
2379+
// Public entry point: validate the RAS EEPROM of the GPU behind
// `processor_handle`.
//
// Returns the handle-lookup status when the handle cannot be resolved to a
// GPU device; otherwise forwards the result of
// smi_amdgpu_validate_ras_eeprom() unchanged.
amdsmi_status_t
amdsmi_gpu_validate_ras_eeprom(amdsmi_processor_handle processor_handle) {
    AMDSMI_CHECK_INIT();

    amd::smi::AMDSmiGPUDevice* device = nullptr;
    const amdsmi_status_t lookup = get_gpu_device_from_handle(processor_handle, &device);
    if (lookup != AMDSMI_STATUS_SUCCESS) {
        return lookup;
    }

    return smi_amdgpu_validate_ras_eeprom(device);
}
2390+
23572391
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(
23582392
amdsmi_processor_handle processor_handle, amdsmi_ras_feature_t *ras_feature) {
23592393
AMDSMI_CHECK_INIT();

src/amd_smi/amd_smi_utils.cc

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,54 @@ amdsmi_status_t smi_amdgpu_get_bad_page_info(amd::smi::AMDSmiGPUDevice* device,
433433
return AMDSMI_STATUS_SUCCESS;
434434
}
435435

436+
// Extract the trailing decimal number from a device path or name
// (e.g. "card0" -> 0, "renderD128" -> 128).
//
// Returns UINT32_MAX when `s` has no trailing digits or when the number does
// not fit below UINT32_MAX. The function never throws and does not rely on
// assert(), so it behaves the same in debug and release builds. A path built
// from the sentinel will not exist, which lets callers fail on the subsequent
// file open rather than on an exception.
static uint32_t GetDeviceIndex(const std::string& s) {
    // Position of the first character of the trailing run of digits.
    const size_t last_non_digit = s.find_last_not_of("0123456789");
    const size_t first_digit =
        (last_non_digit == std::string::npos) ? 0 : last_non_digit + 1;

    if (first_digit >= s.size()) {
        return UINT32_MAX;  // no trailing digits (also covers the empty string)
    }

    uint64_t value = 0;
    for (size_t i = first_digit; i < s.size(); ++i) {
        value = value * 10 + static_cast<uint64_t>(s[i] - '0');
        // Bail out as soon as the value reaches the sentinel; this also keeps
        // the 64-bit accumulator from ever overflowing.
        if (value >= UINT32_MAX) {
            return UINT32_MAX;
        }
    }
    return static_cast<uint32_t>(value);
}
444+
445+
// Read the RAS bad-page count threshold of `device` from debugfs.
//
// device:    GPU device to query (dereferenced without a null check).
// threshold: receives the parsed value; written only when the function
//            returns AMDSMI_STATUS_SUCCESS. Must be non-null (the public
//            wrapper amdsmi_get_gpu_bad_page_threshold checks this).
//
// Returns AMDSMI_STATUS_NOT_SUPPORTED when DRM is not supported for the device
// or the debugfs node cannot be opened, AMDSMI_STATUS_API_FAILED when the node
// does not contain an unsigned decimal number, AMDSMI_STATUS_SUCCESS otherwise.
amdsmi_status_t smi_amdgpu_get_bad_page_threshold(amd::smi::AMDSmiGPUDevice* device,
        uint32_t *threshold) {
    if (!device->check_if_drm_is_supported()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }
    SMIGPUDEVICE_MUTEX(device->get_mutex())

    // TODO: Accessing the node requires root privileges, and its interface may
    // need to be exposed in another path.
    uint32_t index = GetDeviceIndex(device->get_gpu_path());
    std::string fullpath = "/sys/kernel/debug/dri/" + std::to_string(index) + std::string("/ras/bad_page_cnt_threshold");
    std::ifstream fs(fullpath.c_str());

    if (fs.fail()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }

    std::string line;
    getline(fs, line);

    // sscanf returns the number of successful conversions: 0 when the text is
    // not a number and EOF when the input is empty. Anything other than exactly
    // one conversion is a failure. Parse with %u into an unsigned int so the
    // conversion specifier matches the destination type.
    unsigned int parsed = 0;
    if (sscanf(line.c_str(), "%u", &parsed) != 1) {
        return AMDSMI_STATUS_API_FAILED;
    }
    *threshold = static_cast<uint32_t>(parsed);

    // `fs` is closed by its destructor on every return path.
    return AMDSMI_STATUS_SUCCESS;
}
471+
472+
// Validate the checksum of the RAS EEPROM table of `device`.
//
// Not implemented yet: after the DRM-support check this always returns
// AMDSMI_STATUS_NOT_SUPPORTED. `device` is dereferenced without a null check.
amdsmi_status_t smi_amdgpu_validate_ras_eeprom(amd::smi::AMDSmiGPUDevice* device) {
    if (!device->check_if_drm_is_supported()) {
        return AMDSMI_STATUS_NOT_SUPPORTED;
    }
    SMIGPUDEVICE_MUTEX(device->get_mutex())

    // TODO: need to expose the corresponding interface to validate the checksum
    // of the RAS EEPROM table. On verification failure this should return
    // AMDSMI_STATUS_CORRUPTED_EEPROM. The device index (see GetDeviceIndex) is
    // deliberately not computed until there is a node to read, so the stub
    // carries no unused local and no needless parsing.
    return AMDSMI_STATUS_NOT_SUPPORTED;
}
483+
436484
amdsmi_status_t smi_amdgpu_get_ecc_error_count(amd::smi::AMDSmiGPUDevice* device, amdsmi_error_count_t *err_cnt) {
437485
if (!device->check_if_drm_is_supported()) {
438486
return AMDSMI_STATUS_NOT_SUPPORTED;

0 commit comments

Comments
 (0)