diff options
author | YiPeng Chai <YiPeng.Chai@amd.com> | 2022-09-08 09:44:36 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2022-09-19 15:17:47 -0400 |
commit | 83d29a5f8a5a8ac76fdf8b8ccca65899345e6a9e (patch) | |
tree | aaaa3c7b526854b3a0793dbedd710dfc87c7bdcf | |
parent | f5c7e7797060255dbc8160734ccc5ad6183c5e04 (diff) | |
download | linux-83d29a5f8a5a8ac76fdf8b8ccca65899345e6a9e.tar.gz |
drm/amdgpu: Fixed psp fence and memory issues when removing amdgpu device
V3:
Fixed psp fence and memory issues for the asic
using smu v13_0_2 when removing amdgpu device.
[Why]:
1. psp_suspend->psp_free_shared_bufs->
psp_ta_free_shared_buf->
amdgpu_bo_free_kernel->
...->amdgpu_bo_release_notify->
amdgpu_fill_buffer
psp will free vram memory used by psp when psp_suspend
is called. But for the asic using smu v13_0_2, because
psp_suspend is called before adev->shutdown is set to
true when removing the first hive device, amdgpu fill_buffer
will be called, which will cause fence issues when evicting
all vram resources in amdgpu vram mgr_fini.
2. Since psp_hw_fini is not called after calling psp_suspend
and psp_suspend only calls psp_ring_stop, the psp ring memory
will not be released when amdgpu device is removed.
[How]:
1. Set shutdown to true before calling amdgpu_device_gpu_recover,
then amdgpu_fill_buffer will not be called when psp_suspend is
called.
2. Free psp ring memory in psp_sw_fini.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 |
3 files changed, 10 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c268bd033064..869c843c1d58 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5191,8 +5191,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ INIT_LIST_HEAD(&device_list); if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { list_add_tail(&tmp_adev->reset_list, &device_list); + if (gpu_reset_for_dev_remove && adev->shutdown) + tmp_adev->shutdown = true; + } if (!list_is_first(&adev->reset_list, &device_list)) list_rotate_to_front(&adev->reset_list, &device_list); device_list_handle = &device_list; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 2e16210bebaf..81b22c1bd8df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2207,6 +2207,7 @@ amdgpu_pci_remove(struct pci_dev *pdev) if (need_to_reset_gpu) { struct amdgpu_reset_context reset_context; + adev->shutdown = true; memset(&reset_context, 0, sizeof(reset_context)); reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index c4848522be16..effa7df3ddbf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -511,6 +511,11 @@ static int psp_sw_fini(void *handle) kfree(cmd); cmd = NULL; + if (psp->km_ring.ring_mem) + amdgpu_bo_free_kernel(&adev->firmware.rbuf, + &psp->km_ring.ring_mem_mc_addr, + (void **)&psp->km_ring.ring_mem); + amdgpu_bo_free_kernel(&psp->fw_pri_bo, &psp->fw_pri_mc_addr, &psp->fw_pri_buf); amdgpu_bo_free_kernel(&psp->fence_buf_bo, |