aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
authorPhilip Yang <Philip.Yang@amd.com>2022-01-13 21:24:20 -0500
committerAlex Deucher <alexander.deucher@amd.com>2022-06-30 15:31:14 -0400
commitc7f21978fa6aafaf7ad37155c7d3a217dc3d16b0 (patch)
treef4445b8613bde7f5aeb7e7097ab8269487333957 /drivers/gpu/drm/amd/amdkfd
parentacac270d09828edda2d530d255ee75ceb87583ec (diff)
downloadlinux-c7f21978fa6aafaf7ad37155c7d3a217dc3d16b0.tar.gz
drm/amdkfd: Add user queue eviction restore SMI event
Output user queue eviction and restore event. User queue eviction may be triggered by svm or userptr MMU notifier, TTM eviction, device suspend and CRIU checkpoint and restore. User queue restore may be rescheduled if eviction happens again while restore. Signed-off-by: Philip Yang <Philip.Yang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c15
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c35
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c6
7 files changed, 60 insertions, 10 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d07588230ed6..2b3d8bc8f0aa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2434,7 +2434,7 @@ static int criu_restore(struct file *filep,
* Set the process to evicted state to avoid running any new queues before all the memory
* mappings are ready.
*/
- ret = kfd_process_evict_queues(p);
+ ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
if (ret)
goto exit_unlock;
@@ -2553,7 +2553,7 @@ static int criu_process_info(struct file *filep,
goto err_unlock;
}
- ret = kfd_process_evict_queues(p);
+ ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
if (ret)
goto err_unlock;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index c8fee0dbfdcb..6ec0e9f0927d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -837,7 +837,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
}
-int kgd2kfd_quiesce_mm(struct mm_struct *mm)
+int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
{
struct kfd_process *p;
int r;
@@ -851,7 +851,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
return -ESRCH;
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
- r = kfd_process_evict_queues(p);
+ r = kfd_process_evict_queues(p, trigger);
kfd_unref_process(p);
return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 4c4bbd493caa..d03a3b9c9c5d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -947,7 +947,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
}
void kfd_unref_process(struct kfd_process *p);
-int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
int kfd_process_restore_queues(struct kfd_process *p);
void kfd_suspend_all_processes(void);
int kfd_resume_all_processes(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a13e60d48b73..fc38a4d81420 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -43,6 +43,7 @@ struct mm_struct;
#include "kfd_device_queue_manager.h"
#include "kfd_iommu.h"
#include "kfd_svm.h"
+#include "kfd_smi_events.h"
/*
* List of struct kfd_process (field kfd_process).
@@ -1736,7 +1737,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
* Eviction is reference-counted per process-device. This means multiple
* evictions from different sources can be nested safely.
*/
-int kfd_process_evict_queues(struct kfd_process *p)
+int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
{
int r = 0;
int i;
@@ -1745,6 +1746,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
+ kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
+ trigger);
+
r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
&pdd->qpd);
/* evict return -EIO if HWS is hang or asic is resetting, in this case
@@ -1769,6 +1773,9 @@ fail:
if (n_evicted == 0)
break;
+
+ kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
+
if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
&pdd->qpd))
pr_err("Failed to restore queues\n");
@@ -1788,6 +1795,8 @@ int kfd_process_restore_queues(struct kfd_process *p)
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
+ kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
+
r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
&pdd->qpd);
if (r) {
@@ -1849,7 +1858,7 @@ static void evict_process_worker(struct work_struct *work)
flush_delayed_work(&p->restore_work);
pr_debug("Started evicting pasid 0x%x\n", p->pasid);
- ret = kfd_process_evict_queues(p);
+ ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
if (!ret) {
dma_fence_signal(p->ef);
dma_fence_put(p->ef);
@@ -1916,7 +1925,7 @@ void kfd_suspend_all_processes(void)
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
- if (kfd_process_evict_queues(p))
+ if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
pr_err("Failed to suspend process 0x%x\n", p->pasid);
dma_fence_signal(p->ef);
dma_fence_put(p->ef);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
index ec4d278c2a47..3917c38204d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
@@ -283,6 +283,41 @@ void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
from, to, trigger);
}
+void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
+ uint32_t trigger)
+{
+ kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION,
+ "%lld -%d %x %d\n", ktime_get_boottime_ns(), pid,
+ dev->id, trigger);
+}
+
+void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid)
+{
+ kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_RESTORE,
+ "%lld -%d %x\n", ktime_get_boottime_ns(), pid,
+ dev->id);
+}
+
+void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
+{
+ struct kfd_process *p;
+ int i;
+
+ p = kfd_lookup_process_by_mm(mm);
+ if (!p)
+ return;
+
+ for (i = 0; i < p->n_pdds; i++) {
+ struct kfd_process_device *pdd = p->pdds[i];
+
+ kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
+ KFD_SMI_EVENT_QUEUE_RESTORE,
+ "%lld -%d %x %c\n", ktime_get_boottime_ns(),
+ p->lead_thread->pid, pdd->dev->id, 'R');
+ }
+ kfd_unref_process(p);
+}
+
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
{
struct kfd_smi_client *client;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
index ec5d74a2fef4..b23292637239 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
@@ -42,4 +42,8 @@ void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid,
void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
unsigned long start, unsigned long end,
uint32_t from, uint32_t to, uint32_t trigger);
+void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
+ uint32_t trigger);
+void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid);
+void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm);
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index e8ded7a02bcb..8bfb7b99e45d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1730,14 +1730,16 @@ out_reschedule:
mutex_unlock(&svms->lock);
mmap_write_unlock(mm);
mutex_unlock(&process_info->lock);
- mmput(mm);
/* If validation failed, reschedule another attempt */
if (evicted_ranges) {
pr_debug("reschedule to restore svm range\n");
schedule_delayed_work(&svms->restore_work,
msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
+
+ kfd_smi_event_queue_restore_rescheduled(mm);
}
+ mmput(mm);
}
/**
@@ -1793,7 +1795,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
prange->svms, prange->start, prange->last);
/* First eviction, stop the queues */
- r = kgd2kfd_quiesce_mm(mm);
+ r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
if (r)
pr_debug("failed to quiesce KFD\n");