Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e42051d2 authored by Shaoyun Liu's avatar Shaoyun Liu Committed by Oded Gabbay
Browse files

drm/amdkfd: Implement GPU reset handlers in KFD



Lock KFD and evict existing queues on reset. Notify user mode by
signaling hw_exception events.

Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent 5c6dd71e
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -122,6 +122,9 @@ static int kfd_open(struct inode *inode, struct file *filep)
	if (IS_ERR(process))
		return PTR_ERR(process);

	if (kfd_is_locked())
		return -EAGAIN;

	dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
		process->pasid, process->is_32bit_user_mode);

+40 −3
Original line number Diff line number Diff line
@@ -30,7 +30,13 @@
#include "kfd_iommu.h"

#define MQD_SIZE_ALIGNED 768
static atomic_t kfd_device_suspended = ATOMIC_INIT(0);

/*
 * kfd_locked is used to lock the KFD driver during suspend or reset.
 * Once locked, the KFD driver stops any further GPU execution and
 * creating a process (open) returns -EAGAIN.
 */
static atomic_t kfd_locked = ATOMIC_INIT(0);

#ifdef KFD_SUPPORT_IOMMU_V2
static const struct kfd_device_info kaveri_device_info = {
@@ -516,12 +522,43 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)

/*
 * kgd2kfd_pre_reset - quiesce a KFD device ahead of a GPU reset
 * @kfd: device that is about to be reset
 *
 * Does nothing for a device whose initialization never completed.
 * Otherwise the device is suspended, the device queue manager lock is
 * taken and the reset event is signalled via kfd_signal_reset_event().
 *
 * Return: always 0.
 */
int kgd2kfd_pre_reset(struct kfd_dev *kfd)
{
	if (kfd->init_complete) {
		kgd2kfd_suspend(kfd);

		/* Hold dqm->lock to prevent further execution. */
		dqm_lock(kfd->dqm);

		kfd_signal_reset_event(kfd);
	}

	return 0;
}

/*
 * FIXME: KFD is not able to resume existing processes after a reset
 * for now. All existing processes are kept in an evicted state until
 * they are terminated.
 */

int kgd2kfd_post_reset(struct kfd_dev *kfd)
{
	int ret, count;

	if (!kfd->init_complete)
		return 0;

	dqm_unlock(kfd->dqm);

	ret = kfd_resume(kfd);
	if (ret)
		return ret;
	count = atomic_dec_return(&kfd_locked);
	WARN_ONCE(count != 0, "KFD reset ref. error");
	return 0;
}

/*
 * kfd_is_locked - report whether the KFD driver is currently locked
 *
 * Return: true while the kfd_locked counter is greater than zero,
 * false otherwise.
 */
bool kfd_is_locked(void)
{
	int lock_count = atomic_read(&kfd_locked);

	return lock_count > 0;
}

void kgd2kfd_suspend(struct kfd_dev *kfd)
@@ -530,7 +567,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
		return;

	/* For first KFD device suspend all the KFD processes */
	if (atomic_inc_return(&kfd_device_suspended) == 1)
	if (atomic_inc_return(&kfd_locked) == 1)
		kfd_suspend_all_processes();

	kfd->dqm->ops.stop(kfd->dqm);
@@ -549,7 +586,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
	if (ret)
		return ret;

	count = atomic_dec_return(&kfd_device_suspended);
	count = atomic_dec_return(&kfd_locked);
	WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
	if (count == 0)
		ret = kfd_resume_all_processes();
+27 −0
Original line number Diff line number Diff line
@@ -1000,3 +1000,30 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
	mutex_unlock(&p->event_mutex);
	kfd_unref_process(p);
}

void kfd_signal_reset_event(struct kfd_dev *dev)
{
	struct kfd_hsa_hw_exception_data hw_exception_data;
	struct kfd_process *p;
	struct kfd_event *ev;
	unsigned int temp;
	uint32_t id, idx;

	/* Whole gpu reset caused by GPU hang and memory is lost */
	memset(&hw_exception_data, 0, sizeof(hw_exception_data));
	hw_exception_data.gpu_id = dev->id;
	hw_exception_data.memory_lost = 1;

	idx = srcu_read_lock(&kfd_processes_srcu);
	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
		mutex_lock(&p->event_mutex);
		id = KFD_FIRST_NONSIGNAL_EVENT_ID;
		idr_for_each_entry_continue(&p->event_idr, ev, id)
			if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
				ev->hw_exception_data = hw_exception_data;
				set_event(ev);
			}
		mutex_unlock(&p->event_mutex);
	}
	srcu_read_unlock(&kfd_processes_srcu, idx);
}
+1 −0
Original line number Diff line number Diff line
@@ -66,6 +66,7 @@ struct kfd_event {
	/* type specific data */
	union {
		struct kfd_hsa_memory_exception_data memory_exception_data;
		struct kfd_hsa_hw_exception_data hw_exception_data;
	};
};

+4 −0
Original line number Diff line number Diff line
@@ -975,10 +975,14 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
				struct kfd_vm_fault_info *info);

/* Signal HW exception events to the KFD processes when @dev is reset. */
void kfd_signal_reset_event(struct kfd_dev *dev);

void kfd_flush_tlb(struct kfd_process_device *pdd);

int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);

/* Returns true while KFD is locked (suspend or reset); open() then fails with -EAGAIN. */
bool kfd_is_locked(void);

/* Debugfs */
#if defined(CONFIG_DEBUG_FS)