Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1679ae8f authored by Felix Kuehling, committed by Oded Gabbay
Browse files

drm/amdkfd: Use ordered workqueue to restore processes



Restoring multiple processes concurrently can lead to live-locks
where each process prevents the other from validating all its BOs.

v2: fix duplicate check of same variable

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent 810955ba
Loading
Loading
Loading
Loading
+5 −1
Original line number Original line Diff line number Diff line
@@ -133,7 +133,9 @@ static int __init kfd_module_init(void)
	if (err < 0)
	if (err < 0)
		goto err_topology;
		goto err_topology;


	kfd_process_create_wq();
	err = kfd_process_create_wq();
	if (err < 0)
		goto err_create_wq;


	kfd_debugfs_init();
	kfd_debugfs_init();


@@ -143,6 +145,8 @@ static int __init kfd_module_init(void)


	return 0;
	return 0;


err_create_wq:
	kfd_topology_shutdown();
err_topology:
err_topology:
	kfd_chardev_exit();
	kfd_chardev_exit();
err_ioctl:
err_ioctl:
+1 −1
Original line number Original line Diff line number Diff line
@@ -674,7 +674,7 @@ struct amdkfd_ioctl_desc {
	const char *name;
	const char *name;
};
};


void kfd_process_create_wq(void);
int kfd_process_create_wq(void);
void kfd_process_destroy_wq(void);
void kfd_process_destroy_wq(void);
struct kfd_process *kfd_create_process(struct file *filep);
struct kfd_process *kfd_create_process(struct file *filep);
struct kfd_process *kfd_get_process(const struct task_struct *);
struct kfd_process *kfd_get_process(const struct task_struct *);
+26 −4
Original line number Original line Diff line number Diff line
@@ -48,8 +48,17 @@ static DEFINE_MUTEX(kfd_processes_mutex);


DEFINE_SRCU(kfd_processes_srcu);
DEFINE_SRCU(kfd_processes_srcu);


/* For process termination handling */
static struct workqueue_struct *kfd_process_wq;
static struct workqueue_struct *kfd_process_wq;


/* Ordered, single-threaded workqueue for restoring evicted
 * processes. Restoring multiple processes concurrently under memory
 * pressure can lead to processes blocking each other from validating
 * their BOs and result in a live-lock situation where processes
 * remain evicted indefinitely.
 */
static struct workqueue_struct *kfd_restore_wq;

static struct kfd_process *find_process(const struct task_struct *thread);
static struct kfd_process *find_process(const struct task_struct *thread);
static void kfd_process_ref_release(struct kref *ref);
static void kfd_process_ref_release(struct kref *ref);
static struct kfd_process *create_process(const struct task_struct *thread,
static struct kfd_process *create_process(const struct task_struct *thread,
@@ -59,10 +68,19 @@ static void evict_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);
static void restore_process_worker(struct work_struct *work);




void kfd_process_create_wq(void)
int kfd_process_create_wq(void)
{
{
	if (!kfd_process_wq)
	if (!kfd_process_wq)
		kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
		kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
	if (!kfd_restore_wq)
		kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);

	if (!kfd_process_wq || !kfd_restore_wq) {
		kfd_process_destroy_wq();
		return -ENOMEM;
	}

	return 0;
}
}


void kfd_process_destroy_wq(void)
void kfd_process_destroy_wq(void)
@@ -71,6 +89,10 @@ void kfd_process_destroy_wq(void)
		destroy_workqueue(kfd_process_wq);
		destroy_workqueue(kfd_process_wq);
		kfd_process_wq = NULL;
		kfd_process_wq = NULL;
	}
	}
	if (kfd_restore_wq) {
		destroy_workqueue(kfd_restore_wq);
		kfd_restore_wq = NULL;
	}
}
}


static void kfd_process_free_gpuvm(struct kgd_mem *mem,
static void kfd_process_free_gpuvm(struct kgd_mem *mem,
@@ -869,7 +891,7 @@ static void evict_process_worker(struct work_struct *work)
		dma_fence_signal(p->ef);
		dma_fence_signal(p->ef);
		dma_fence_put(p->ef);
		dma_fence_put(p->ef);
		p->ef = NULL;
		p->ef = NULL;
		schedule_delayed_work(&p->restore_work,
		queue_delayed_work(kfd_restore_wq, &p->restore_work,
				msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
				msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));


		pr_debug("Finished evicting pasid %d\n", p->pasid);
		pr_debug("Finished evicting pasid %d\n", p->pasid);
@@ -918,7 +940,7 @@ static void restore_process_worker(struct work_struct *work)
	if (ret) {
	if (ret) {
		pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n",
		pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n",
			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
		ret = schedule_delayed_work(&p->restore_work,
		ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
				msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
				msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
		WARN(!ret, "reschedule restore work failed\n");
		WARN(!ret, "reschedule restore work failed\n");
		return;
		return;
@@ -957,7 +979,7 @@ int kfd_resume_all_processes(void)
	int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
	int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);


	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
		if (!schedule_delayed_work(&p->restore_work, 0)) {
		if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
			pr_err("Restore process %d failed during resume\n",
			pr_err("Restore process %d failed during resume\n",
			       p->pasid);
			       p->pasid);
			ret = -EFAULT;
			ret = -EFAULT;