@@ -557,16 +557,16 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
557
557
return false;
558
558
559
559
if (con -> eeprom_control .tbl_hdr .header == RAS_TABLE_HDR_BAD ) {
560
- if (amdgpu_bad_page_threshold == -1 ) {
560
+ if ((amdgpu_bad_page_threshold == -1 ) ||
561
+ (amdgpu_bad_page_threshold == -2 )) {
561
562
dev_warn (adev -> dev , "RAS records:%d exceed threshold:%d" ,
562
563
con -> eeprom_control .ras_num_recs , con -> bad_page_cnt_threshold );
563
564
dev_warn (adev -> dev ,
564
- "But GPU can be operated due to bad_page_threshold = -1. \n" );
565
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures \n" );
565
566
return false;
566
567
} else {
567
- dev_warn (adev -> dev , "This GPU is in BAD status." );
568
- dev_warn (adev -> dev , "Please retire it or set a larger "
569
- "threshold value when reloading driver.\n" );
568
+ dev_warn (adev -> dev ,
569
+ "Please consider adjusting the customized threshold\n" );
570
570
return true;
571
571
}
572
572
}
@@ -750,7 +750,8 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
750
750
control -> tbl_rai .health_percent = 0 ;
751
751
}
752
752
753
- if (amdgpu_bad_page_threshold != -1 )
753
+ if ((amdgpu_bad_page_threshold != -1 ) &&
754
+ (amdgpu_bad_page_threshold != -2 ))
754
755
ras -> is_rma = true;
755
756
756
757
/* ignore the -ENOTSUPP return value */
@@ -1385,8 +1386,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
1385
1386
1386
1387
res = __verify_ras_table_checksum (control );
1387
1388
if (res )
1388
- DRM_ERROR ("RAS table incorrect checksum or error:%d\n" ,
1389
- res );
1389
+ dev_err (adev -> dev ,
1390
+ "RAS table incorrect checksum or error:%d\n" ,
1391
+ res );
1390
1392
1391
1393
/* Warn if we are at 90% of the threshold or above
1392
1394
*/
@@ -1404,8 +1406,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
1404
1406
1405
1407
res = __verify_ras_table_checksum (control );
1406
1408
if (res )
1407
- DRM_ERROR ("RAS Table incorrect checksum or error:%d\n" ,
1408
- res );
1409
+ dev_err (adev -> dev ,
1410
+ "RAS Table incorrect checksum or error:%d\n" ,
1411
+ res );
1409
1412
if (ras -> bad_page_cnt_threshold > control -> ras_num_recs ) {
1410
1413
/* This means that, the threshold was increased since
1411
1414
* the last time the system was booted, and now,
@@ -1421,17 +1424,18 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
1421
1424
res = amdgpu_ras_eeprom_correct_header_tag (control ,
1422
1425
RAS_TABLE_HDR_VAL );
1423
1426
} else {
1424
- dev_err (adev -> dev , "RAS records:%d exceed threshold:%d" ,
1425
- control -> ras_num_recs , ras -> bad_page_cnt_threshold );
1426
- if (amdgpu_bad_page_threshold == -1 ) {
1427
- dev_warn (adev -> dev , "GPU will be initialized due to bad_page_threshold = -1." );
1427
+ dev_warn (adev -> dev ,
1428
+ "RAS records:%d exceed threshold:%d" ,
1429
+ control -> ras_num_recs , ras -> bad_page_cnt_threshold );
1430
+ if ((amdgpu_bad_page_threshold == -1 ) ||
1431
+ (amdgpu_bad_page_threshold == -2 )) {
1428
1432
res = 0 ;
1433
+ dev_warn (adev -> dev ,
1434
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n" );
1429
1435
} else {
1430
1436
ras -> is_rma = true;
1431
- dev_err (adev -> dev ,
1432
- "RAS records:%d exceed threshold:%d, "
1433
- "GPU will not be initialized. Replace this GPU or increase the threshold" ,
1434
- control -> ras_num_recs , ras -> bad_page_cnt_threshold );
1437
+ dev_warn (adev -> dev ,
1438
+ "User defined threshold is set, runtime service will be halt when threshold is reached\n" );
1435
1439
}
1436
1440
}
1437
1441
} else {
0 commit comments