diff options
author | Stanley.Yang <Stanley.Yang@amd.com> | 2023-01-12 14:27:26 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2023-01-17 16:11:52 -0500 |
commit | 828fc79dcfa16340340382d888c03c7ea6a8c33e (patch) | |
tree | 24d14919625e0cd467f51ef7da1ae903f8ebf874 | |
parent | 1427a720273976a81d13d9d9fa60d53ce881cbd7 (diff) | |
download | linux-828fc79dcfa16340340382d888c03c7ea6a8c33e.tar.gz |
drm/amdgpu: support check xgmi/walf error mask bit for aldebaran
The pcs error count should be determined by PCS ERROR status and
PCS ERROR MASK registers, only PCS ERROR status register can not
refect error counts accurately.
Changed from V1:
remove clean noncorrectable mask registers
optimize query pcs error status
Changed from V2:
remove check mask_value bits
correct set value corresponding bit
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 102 |
1 files changed, 64 insertions, 38 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 4b9e7b050ccd..922b0e19ae66 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -35,7 +35,9 @@ #include "amdgpu_reset.h" #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c +#define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK 0x11a00218 #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 +#define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218 static DEFINE_MUTEX(xgmi_mutex); @@ -79,11 +81,27 @@ static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000 }; +static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = { + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000, + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000, + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000, + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000, + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000, + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000, + smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000 +}; + static const int walf_pcs_err_status_reg_aldebaran[] = { smnPCS_GOPX1_PCS_ERROR_STATUS, smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000 }; +static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = { + smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK, + smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 +}; + static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { {"XGMI PCS DataLossErr", SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, @@ -809,39 +827,43 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, uint32_t value, + uint32_t mask_value, uint32_t *ue_count, uint32_t *ce_count, - bool is_xgmi_pcs) + bool is_xgmi_pcs, + bool check_mask) { int i; - int ue_cnt; + int ue_cnt = 0; + int mask_bit_value = 0; + const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL; + uint32_t field_array_size = 0; if (is_xgmi_pcs) { - /* query xgmi pcs error status, - * only ue is supported */ - for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) { - ue_cnt = (value & - xgmi_pcs_ras_fields[i].pcs_err_mask) >> - xgmi_pcs_ras_fields[i].pcs_err_shift; - if (ue_cnt) { - dev_info(adev->dev, "%s detected\n", - xgmi_pcs_ras_fields[i].err_name); - *ue_count += ue_cnt; - } - } + pcs_ras_fields = &xgmi_pcs_ras_fields[0]; + field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields); } else { - /* query wafl pcs error status, - * only ue is supported */ - for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) { - ue_cnt = (value & - wafl_pcs_ras_fields[i].pcs_err_mask) >> - wafl_pcs_ras_fields[i].pcs_err_shift; - if (ue_cnt) { - dev_info(adev->dev, "%s detected\n", - wafl_pcs_ras_fields[i].err_name); - *ue_count += ue_cnt; - } + pcs_ras_fields = &wafl_pcs_ras_fields[0]; + field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields); + } + + if (check_mask) + value = value & ~mask_value; + + /* query xgmi/walf pcs error status, + * only ue is supported */ + for (i = 0; value && i < field_array_size; i++) { + ue_cnt = (value & + pcs_ras_fields[i].pcs_err_mask) >> + pcs_ras_fields[i].pcs_err_shift; + if (ue_cnt) { + dev_info(adev->dev, "%s detected\n", + pcs_ras_fields[i].err_name); + *ue_count += ue_cnt; } + + /* reset bit value if the bit is checked */ + value &= ~(pcs_ras_fields[i].pcs_err_mask); } return 0; @@ -852,7 +874,7 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; int i; - uint32_t data; + uint32_t data, mask_data = 0; uint32_t ue_cnt = 0, ce_cnt = 0; if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL)) @@ -867,15 +889,15 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) { data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]); if (data) - amdgpu_xgmi_query_pcs_error_status(adev, - data, &ue_cnt, &ce_cnt, true); + amdgpu_xgmi_query_pcs_error_status(adev, data, + mask_data, &ue_cnt, &ce_cnt, true, false); } /* check wafl pcs error */ for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) { data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]); if (data) - amdgpu_xgmi_query_pcs_error_status(adev, - data, &ue_cnt, &ce_cnt, false); + amdgpu_xgmi_query_pcs_error_status(adev, data, + mask_data, &ue_cnt, &ce_cnt, false, false); } break; case CHIP_VEGA20: @@ -883,31 +905,35 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) { data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]); if (data) - amdgpu_xgmi_query_pcs_error_status(adev, - data, &ue_cnt, &ce_cnt, true); + amdgpu_xgmi_query_pcs_error_status(adev, data, + mask_data, &ue_cnt, &ce_cnt, true, false); } /* check wafl pcs error */ for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) { data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]); if (data) - amdgpu_xgmi_query_pcs_error_status(adev, - data, &ue_cnt, &ce_cnt, false); + amdgpu_xgmi_query_pcs_error_status(adev, data, + mask_data, &ue_cnt, &ce_cnt, false, false); } break; case CHIP_ALDEBARAN: /* check xgmi3x16 pcs error */ for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); + mask_data = + RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]); if (data) - amdgpu_xgmi_query_pcs_error_status(adev, - data, &ue_cnt, &ce_cnt, true); + amdgpu_xgmi_query_pcs_error_status(adev, data, + mask_data, &ue_cnt, &ce_cnt, true, true); } /* check wafl pcs error */ for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); + mask_data = + RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]); if (data) - amdgpu_xgmi_query_pcs_error_status(adev, - data, &ue_cnt, &ce_cnt, false); + amdgpu_xgmi_query_pcs_error_status(adev, data, + mask_data, &ue_cnt, &ce_cnt, false, true); } break; default: |