Skip to content

Commit dcd14e3

Browse files
Hawking Zhang
authored and committed
drm/amdgpu: Update usage for bad page threshold
The driver's behavior varies based on the configuration of the amdgpu_bad_page_threshold setting.

Signed-off-by: Hawking Zhang <[email protected]>
Reviewed-by: Tao Zhou <[email protected]>
1 parent 3632ac6 commit dcd14e3

File tree

4 files changed

+44
-40
lines changed

4 files changed

+44
-40
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -980,7 +980,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0644);
980980
* result in the GPU entering bad status when the number of total
981981
* faulty pages by ECC exceeds the threshold value.
982982
*/
983-
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = driver sets threshold)");
983+
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore threshold (default value), 0 = disable bad page retirement, -2 = threshold determined by a formula, 0 < threshold < max records, user-defined threshold)");
984984
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
985985

986986
MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2882,31 +2882,29 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
28822882
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
28832883

28842884
/*
2885-
* Justification of value bad_page_cnt_threshold in ras structure
2886-
*
2887-
* Generally, 0 <= amdgpu_bad_page_threshold <= max record length
2888-
* in eeprom or amdgpu_bad_page_threshold == -2, introduce two
2889-
* scenarios accordingly.
2890-
*
2891-
* Bad page retirement enablement:
2892-
* - If amdgpu_bad_page_threshold = -2,
2893-
* bad_page_cnt_threshold = typical value by formula.
2894-
*
2895-
* - When the value from user is 0 < amdgpu_bad_page_threshold <
2896-
* max record length in eeprom, use it directly.
2897-
*
2898-
* Bad page retirement disablement:
2899-
* - If amdgpu_bad_page_threshold = 0, bad page retirement
2900-
* functionality is disabled, and bad_page_cnt_threshold will
2901-
* take no effect.
2885+
* amdgpu_bad_page_threshold is used to config
2886+
* the threshold for the number of bad pages.
2887+
* -1: Threshold is set to default value
2888+
* Driver will issue a warning message when threshold is reached
2889+
* and continue runtime services.
2890+
* 0: Disable bad page retirement
2891+
* Driver will not retire bad pages
2892+
* which is intended for debugging purpose.
2893+
* -2: Threshold is determined by a formula
2894+
* that assumes 1 bad page per 100M of local memory.
2895+
* Driver will continue runtime services when threshold is reached.
2896+
* 0 < threshold < max number of bad page records in EEPROM,
2897+
* A user-defined threshold is set
2898+
* Driver will halt runtime services when this custom threshold is reached.
29022899
*/
2903-
2904-
if (amdgpu_bad_page_threshold < 0) {
2900+
if (amdgpu_bad_page_threshold == -2) {
29052901
u64 val = adev->gmc.mc_vram_size;
29062902

29072903
do_div(val, RAS_BAD_PAGE_COVER);
29082904
con->bad_page_cnt_threshold = min(lower_32_bits(val),
29092905
max_count);
2906+
} else if (amdgpu_bad_page_threshold == -1) {
2907+
con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> 21) << 4;
29102908
} else {
29112909
con->bad_page_cnt_threshold = min_t(int, max_count,
29122910
amdgpu_bad_page_threshold);
@@ -3654,8 +3652,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
36543652
switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
36553653
case IP_VERSION(13, 0, 2):
36563654
case IP_VERSION(13, 0, 6):
3655+
con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT;
3656+
break;
36573657
case IP_VERSION(13, 0, 14):
3658-
con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
3658+
con->reserved_pages_in_bytes = (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT << 1);
36593659
break;
36603660
default:
36613661
break;

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ struct amdgpu_iv_entry;
6565

6666
/* Reserve 8 physical dram rows for possible retirement.
6767
* In worst cases, it will lose 8 * 2MB memory in vram domain */
68-
#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20)
68+
#define AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20)
6969
/* The high three bits indicate socketid */
7070
#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
7171

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -557,16 +557,16 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
557557
return false;
558558

559559
if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
560-
if (amdgpu_bad_page_threshold == -1) {
560+
if ((amdgpu_bad_page_threshold == -1) ||
561+
(amdgpu_bad_page_threshold == -2)) {
561562
dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
562563
con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
563564
dev_warn(adev->dev,
564-
"But GPU can be operated due to bad_page_threshold = -1.\n");
565+
"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
565566
return false;
566567
} else {
567-
dev_warn(adev->dev, "This GPU is in BAD status.");
568-
dev_warn(adev->dev, "Please retire it or set a larger "
569-
"threshold value when reloading driver.\n");
568+
dev_warn(adev->dev,
569+
"Please consider adjusting the customized threshold\n");
570570
return true;
571571
}
572572
}
@@ -750,7 +750,8 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
750750
control->tbl_rai.health_percent = 0;
751751
}
752752

753-
if (amdgpu_bad_page_threshold != -1)
753+
if ((amdgpu_bad_page_threshold != -1) &&
754+
(amdgpu_bad_page_threshold != -2))
754755
ras->is_rma = true;
755756

756757
/* ignore the -ENOTSUPP return value */
@@ -1385,8 +1386,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
13851386

13861387
res = __verify_ras_table_checksum(control);
13871388
if (res)
1388-
DRM_ERROR("RAS table incorrect checksum or error:%d\n",
1389-
res);
1389+
dev_err(adev->dev,
1390+
"RAS table incorrect checksum or error:%d\n",
1391+
res);
13901392

13911393
/* Warn if we are at 90% of the threshold or above
13921394
*/
@@ -1404,8 +1406,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
14041406

14051407
res = __verify_ras_table_checksum(control);
14061408
if (res)
1407-
DRM_ERROR("RAS Table incorrect checksum or error:%d\n",
1408-
res);
1409+
dev_err(adev->dev,
1410+
"RAS Table incorrect checksum or error:%d\n",
1411+
res);
14091412
if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
14101413
/* This means that, the threshold was increased since
14111414
* the last time the system was booted, and now,
@@ -1421,17 +1424,18 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
14211424
res = amdgpu_ras_eeprom_correct_header_tag(control,
14221425
RAS_TABLE_HDR_VAL);
14231426
} else {
1424-
dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
1425-
control->ras_num_recs, ras->bad_page_cnt_threshold);
1426-
if (amdgpu_bad_page_threshold == -1) {
1427-
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
1427+
dev_warn(adev->dev,
1428+
"RAS records:%d exceed threshold:%d",
1429+
control->ras_num_recs, ras->bad_page_cnt_threshold);
1430+
if ((amdgpu_bad_page_threshold == -1) ||
1431+
(amdgpu_bad_page_threshold == -2)) {
14281432
res = 0;
1433+
dev_warn(adev->dev,
1434+
"Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
14291435
} else {
14301436
ras->is_rma = true;
1431-
dev_err(adev->dev,
1432-
"RAS records:%d exceed threshold:%d, "
1433-
"GPU will not be initialized. Replace this GPU or increase the threshold",
1434-
control->ras_num_recs, ras->bad_page_cnt_threshold);
1437+
dev_warn(adev->dev,
1438+
"User defined threshold is set, runtime service will be halt when threshold is reached\n");
14351439
}
14361440
}
14371441
} else {

0 commit comments

Comments
 (0)