Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1679ae8f authored by Felix Kuehling's avatar Felix Kuehling Committed by Oded Gabbay
Browse files

drm/amdkfd: Use ordered workqueue to restore processes



Restoring multiple processes concurrently can lead to live-locks
where each process prevents the other from validating all its BOs.

v2: fix duplicate check of same variable

Signed-off-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent 810955ba
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -133,7 +133,9 @@ static int __init kfd_module_init(void)
	if (err < 0)
		goto err_topology;

	kfd_process_create_wq();
	err = kfd_process_create_wq();
	if (err < 0)
		goto err_create_wq;

	kfd_debugfs_init();

@@ -143,6 +145,8 @@ static int __init kfd_module_init(void)

	return 0;

err_create_wq:
	kfd_topology_shutdown();
err_topology:
	kfd_chardev_exit();
err_ioctl:
+1 −1
Original line number Diff line number Diff line
@@ -674,7 +674,7 @@ struct amdkfd_ioctl_desc {
	const char *name;
};

void kfd_process_create_wq(void);
int kfd_process_create_wq(void);
void kfd_process_destroy_wq(void);
struct kfd_process *kfd_create_process(struct file *filep);
struct kfd_process *kfd_get_process(const struct task_struct *);
+26 −4
Original line number Diff line number Diff line
@@ -48,8 +48,17 @@ static DEFINE_MUTEX(kfd_processes_mutex);

DEFINE_SRCU(kfd_processes_srcu);

/* For process termination handling */
static struct workqueue_struct *kfd_process_wq;

/* Ordered, single-threaded workqueue for restoring evicted
 * processes. Restoring multiple processes concurrently under memory
 * pressure can lead to processes blocking each other from validating
 * their BOs and result in a live-lock situation where processes
 * remain evicted indefinitely.
 */
static struct workqueue_struct *kfd_restore_wq;

static struct kfd_process *find_process(const struct task_struct *thread);
static void kfd_process_ref_release(struct kref *ref);
static struct kfd_process *create_process(const struct task_struct *thread,
@@ -59,10 +68,19 @@ static void evict_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);


void kfd_process_create_wq(void)
int kfd_process_create_wq(void)
{
	if (!kfd_process_wq)
		kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
	if (!kfd_restore_wq)
		kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);

	if (!kfd_process_wq || !kfd_restore_wq) {
		kfd_process_destroy_wq();
		return -ENOMEM;
	}

	return 0;
}

void kfd_process_destroy_wq(void)
@@ -71,6 +89,10 @@ void kfd_process_destroy_wq(void)
		destroy_workqueue(kfd_process_wq);
		kfd_process_wq = NULL;
	}
	if (kfd_restore_wq) {
		destroy_workqueue(kfd_restore_wq);
		kfd_restore_wq = NULL;
	}
}

static void kfd_process_free_gpuvm(struct kgd_mem *mem,
@@ -869,7 +891,7 @@ static void evict_process_worker(struct work_struct *work)
		dma_fence_signal(p->ef);
		dma_fence_put(p->ef);
		p->ef = NULL;
		schedule_delayed_work(&p->restore_work,
		queue_delayed_work(kfd_restore_wq, &p->restore_work,
				msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));

		pr_debug("Finished evicting pasid %d\n", p->pasid);
@@ -918,7 +940,7 @@ static void restore_process_worker(struct work_struct *work)
	if (ret) {
		pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n",
			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
		ret = schedule_delayed_work(&p->restore_work,
		ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
				msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
		WARN(!ret, "reschedule restore work failed\n");
		return;
@@ -957,7 +979,7 @@ int kfd_resume_all_processes(void)
	int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);

	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
		if (!schedule_delayed_work(&p->restore_work, 0)) {
		if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
			pr_err("Restore process %d failed during resume\n",
			       p->pasid);
			ret = -EFAULT;