Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 26103436 authored by Felix Kuehling's avatar Felix Kuehling Committed by Oded Gabbay
Browse files

drm/amdkfd: Implement KFD process eviction/restore



When the TTM memory manager in KGD evicts BOs, all user mode queues
potentially accessing these BOs must be evicted temporarily. Once
user mode queues are evicted, the eviction fence is signaled,
allowing the migration of the BO to proceed.

A delayed worker is scheduled to restore all the BOs belonging to
the evicted process and restart its queues.

During suspend/resume of the GPU we also evict all processes to allow
KGD to save BOs in system memory, since VRAM will be lost.

v2:
* Account for eviction when updating of q->is_active in MQD manager

Signed-off-by: default avatarHarish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Acked-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent 403575c4
Loading
Loading
Loading
Loading
+64 −1
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@
#include "kfd_iommu.h"

#define MQD_SIZE_ALIGNED 768
static atomic_t kfd_device_suspended = ATOMIC_INIT(0);

#ifdef KFD_SUPPORT_IOMMU_V2
static const struct kfd_device_info kaveri_device_info = {
@@ -469,6 +470,10 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
	if (!kfd->init_complete)
		return;

	/* For first KFD device suspend all the KFD processes */
	if (atomic_inc_return(&kfd_device_suspended) == 1)
		kfd_suspend_all_processes();

	kfd->dqm->ops.stop(kfd->dqm);

	kfd_iommu_suspend(kfd);
@@ -476,11 +481,21 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)

int kgd2kfd_resume(struct kfd_dev *kfd)
{
	int ret, count;

	if (!kfd->init_complete)
		return 0;

	return kfd_resume(kfd);
	ret = kfd_resume(kfd);
	if (ret)
		return ret;

	count = atomic_dec_return(&kfd_device_suspended);
	WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
	if (count == 0)
		ret = kfd_resume_all_processes();

	return ret;
}

static int kfd_resume(struct kfd_dev *kfd)
@@ -526,6 +541,54 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
	spin_unlock(&kfd->interrupt_lock);
}

/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will
 *   prepare for safe eviction of KFD BOs that belong to the specified
 *   process.
 *
 * @mm: mm_struct that identifies the specified KFD process
 * @fence: eviction fence attached to KFD process BOs
 *
 */
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
					       struct dma_fence *fence)
{
	struct kfd_process *p;
	unsigned long active_time;
	unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS);

	if (!fence)
		return -EINVAL;

	if (dma_fence_is_signaled(fence))
		return 0;

	p = kfd_lookup_process_by_mm(mm);
	if (!p)
		return -ENODEV;

	if (fence->seqno == p->last_eviction_seqno)
		goto out;

	p->last_eviction_seqno = fence->seqno;

	/* Avoid KFD process starvation. Wait for at least
	 * PROCESS_ACTIVE_TIME_MS before evicting the process again
	 */
	active_time = get_jiffies_64() - p->last_restore_timestamp;
	if (delay_jiffies > active_time)
		delay_jiffies -= active_time;
	else
		delay_jiffies = 0;

	/* During process initialization eviction_work.dwork is initialized
	 * to kfd_evict_bo_worker
	 */
	schedule_delayed_work(&p->eviction_work, delay_jiffies);
out:
	kfd_unref_process(p);
	return 0;
}

static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
				unsigned int chunk_size)
{
+218 −1
Original line number Diff line number Diff line
@@ -21,10 +21,11 @@
 *
 */

#include <linux/ratelimit.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/printk.h>
#include <linux/bitops.h>
#include <linux/sched.h>
#include "kfd_priv.h"
@@ -180,6 +181,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
			goto out_unlock;
	}
	q->properties.vmid = qpd->vmid;
	/*
	 * Eviction state logic: we only mark active queues as evicted
	 * to avoid the overhead of restoring inactive queues later
	 */
	if (qpd->evicted)
		q->properties.is_evicted = (q->properties.queue_size > 0 &&
					    q->properties.queue_percent > 0 &&
					    q->properties.queue_address != 0);

	q->properties.tba_addr = qpd->tba_addr;
	q->properties.tma_addr = qpd->tma_addr;
@@ -377,15 +386,29 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
{
	int retval;
	struct mqd_manager *mqd;
	struct kfd_process_device *pdd;
	bool prev_active = false;

	mutex_lock(&dqm->lock);
	pdd = kfd_get_process_device_data(q->device, q->process);
	if (!pdd) {
		retval = -ENODEV;
		goto out_unlock;
	}
	mqd = dqm->ops.get_mqd_manager(dqm,
			get_mqd_type_from_queue_type(q->properties.type));
	if (!mqd) {
		retval = -ENOMEM;
		goto out_unlock;
	}
	/*
	 * Eviction state logic: we only mark active queues as evicted
	 * to avoid the overhead of restoring inactive queues later
	 */
	if (pdd->qpd.evicted)
		q->properties.is_evicted = (q->properties.queue_size > 0 &&
					    q->properties.queue_percent > 0 &&
					    q->properties.queue_address != 0);

	/* Save previous activity state for counters */
	prev_active = q->properties.is_active;
@@ -457,6 +480,187 @@ static struct mqd_manager *get_mqd_manager(
	return mqd;
}

static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
					struct qcm_process_device *qpd)
{
	struct queue *q;
	struct mqd_manager *mqd;
	struct kfd_process_device *pdd;
	int retval = 0;

	mutex_lock(&dqm->lock);
	if (qpd->evicted++ > 0) /* already evicted, do nothing */
		goto out;

	pdd = qpd_to_pdd(qpd);
	pr_info_ratelimited("Evicting PASID %u queues\n",
			    pdd->process->pasid);

	/* unactivate all active queues on the qpd */
	list_for_each_entry(q, &qpd->queues_list, list) {
		if (!q->properties.is_active)
			continue;
		mqd = dqm->ops.get_mqd_manager(dqm,
			get_mqd_type_from_queue_type(q->properties.type));
		if (!mqd) { /* should not be here */
			pr_err("Cannot evict queue, mqd mgr is NULL\n");
			retval = -ENOMEM;
			goto out;
		}
		q->properties.is_evicted = true;
		q->properties.is_active = false;
		retval = mqd->destroy_mqd(mqd, q->mqd,
				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
		if (retval)
			goto out;
		dqm->queue_count--;
	}

out:
	mutex_unlock(&dqm->lock);
	return retval;
}

static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
				      struct qcm_process_device *qpd)
{
	struct queue *q;
	struct kfd_process_device *pdd;
	int retval = 0;

	mutex_lock(&dqm->lock);
	if (qpd->evicted++ > 0) /* already evicted, do nothing */
		goto out;

	pdd = qpd_to_pdd(qpd);
	pr_info_ratelimited("Evicting PASID %u queues\n",
			    pdd->process->pasid);

	/* unactivate all active queues on the qpd */
	list_for_each_entry(q, &qpd->queues_list, list) {
		if (!q->properties.is_active)
			continue;
		q->properties.is_evicted = true;
		q->properties.is_active = false;
		dqm->queue_count--;
	}
	retval = execute_queues_cpsch(dqm,
				qpd->is_debug ?
				KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);

out:
	mutex_unlock(&dqm->lock);
	return retval;
}

static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
					  struct qcm_process_device *qpd)
{
	struct queue *q;
	struct mqd_manager *mqd;
	struct kfd_process_device *pdd;
	uint32_t pd_base;
	int retval = 0;

	pdd = qpd_to_pdd(qpd);
	/* Retrieve PD base */
	pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);

	mutex_lock(&dqm->lock);
	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
		goto out;
	if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
		qpd->evicted--;
		goto out;
	}

	pr_info_ratelimited("Restoring PASID %u queues\n",
			    pdd->process->pasid);

	/* Update PD Base in QPD */
	qpd->page_table_base = pd_base;
	pr_debug("Updated PD address to 0x%08x\n", pd_base);

	if (!list_empty(&qpd->queues_list)) {
		dqm->dev->kfd2kgd->set_vm_context_page_table_base(
				dqm->dev->kgd,
				qpd->vmid,
				qpd->page_table_base);
		kfd_flush_tlb(pdd);
	}

	/* activate all active queues on the qpd */
	list_for_each_entry(q, &qpd->queues_list, list) {
		if (!q->properties.is_evicted)
			continue;
		mqd = dqm->ops.get_mqd_manager(dqm,
			get_mqd_type_from_queue_type(q->properties.type));
		if (!mqd) { /* should not be here */
			pr_err("Cannot restore queue, mqd mgr is NULL\n");
			retval = -ENOMEM;
			goto out;
		}
		q->properties.is_evicted = false;
		q->properties.is_active = true;
		retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
				       q->queue, &q->properties,
				       q->process->mm);
		if (retval)
			goto out;
		dqm->queue_count++;
	}
	qpd->evicted = 0;
out:
	mutex_unlock(&dqm->lock);
	return retval;
}

static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
					struct qcm_process_device *qpd)
{
	struct queue *q;
	struct kfd_process_device *pdd;
	uint32_t pd_base;
	int retval = 0;

	pdd = qpd_to_pdd(qpd);
	/* Retrieve PD base */
	pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);

	mutex_lock(&dqm->lock);
	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
		goto out;
	if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
		qpd->evicted--;
		goto out;
	}

	pr_info_ratelimited("Restoring PASID %u queues\n",
			    pdd->process->pasid);

	/* Update PD Base in QPD */
	qpd->page_table_base = pd_base;
	pr_debug("Updated PD address to 0x%08x\n", pd_base);

	/* activate all active queues on the qpd */
	list_for_each_entry(q, &qpd->queues_list, list) {
		if (!q->properties.is_evicted)
			continue;
		q->properties.is_evicted = false;
		q->properties.is_active = true;
		dqm->queue_count++;
	}
	retval = execute_queues_cpsch(dqm,
				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
	if (!retval)
		qpd->evicted = 0;
out:
	mutex_unlock(&dqm->lock);
	return retval;
}

static int register_process(struct device_queue_manager *dqm,
					struct qcm_process_device *qpd)
{
@@ -853,6 +1057,14 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
		retval = -ENOMEM;
		goto out;
	}
	/*
	 * Eviction state logic: we only mark active queues as evicted
	 * to avoid the overhead of restoring inactive queues later
	 */
	if (qpd->evicted)
		q->properties.is_evicted = (q->properties.queue_size > 0 &&
					    q->properties.queue_percent > 0 &&
					    q->properties.queue_address != 0);

	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);

@@ -1291,6 +1503,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
		dqm->ops.set_trap_handler = set_trap_handler;
		dqm->ops.process_termination = process_termination_cpsch;
		dqm->ops.evict_process_queues = evict_process_queues_cpsch;
		dqm->ops.restore_process_queues = restore_process_queues_cpsch;
		break;
	case KFD_SCHED_POLICY_NO_HWS:
		/* initialize dqm for no cp scheduling */
@@ -1307,6 +1521,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
		dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
		dqm->ops.set_trap_handler = set_trap_handler;
		dqm->ops.process_termination = process_termination_nocpsch;
		dqm->ops.evict_process_queues = evict_process_queues_nocpsch;
		dqm->ops.restore_process_queues =
			restore_process_queues_nocpsch;
		break;
	default:
		pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
+9 −0
Original line number Diff line number Diff line
@@ -79,6 +79,10 @@ struct device_process_node {
 *
 * @process_termination: Clears all process queues belongs to that device.
 *
 * @evict_process_queues: Evict all active queues of a process
 *
 * @restore_process_queues: Restore all evicted queues queues of a process
 *
 */

struct device_queue_manager_ops {
@@ -129,6 +133,11 @@ struct device_queue_manager_ops {

	int (*process_termination)(struct device_queue_manager *dqm,
			struct qcm_process_device *qpd);

	int (*evict_process_queues)(struct device_queue_manager *dqm,
				    struct qcm_process_device *qpd);
	int (*restore_process_queues)(struct device_queue_manager *dqm,
				      struct qcm_process_device *qpd);
};

struct device_queue_manager_asic_ops {
+2 −0
Original line number Diff line number Diff line
@@ -43,6 +43,8 @@ static const struct kgd2kfd_calls kgd2kfd = {
	.interrupt	= kgd2kfd_interrupt,
	.suspend	= kgd2kfd_suspend,
	.resume		= kgd2kfd_resume,
	.schedule_evict_and_restore_process =
			  kgd2kfd_schedule_evict_and_restore_process,
};

int sched_policy = KFD_SCHED_POLICY_HWS;
+6 −3
Original line number Diff line number Diff line
@@ -202,7 +202,8 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,

	q->is_active = (q->queue_size > 0 &&
			q->queue_address != 0 &&
			q->queue_percent > 0);
			q->queue_percent > 0 &&
			!q->is_evicted);

	return 0;
}
@@ -245,7 +246,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,

	q->is_active = (q->queue_size > 0 &&
			q->queue_address != 0 &&
			q->queue_percent > 0);
			q->queue_percent > 0 &&
			!q->is_evicted);

	return 0;
}
@@ -377,7 +379,8 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,

	q->is_active = (q->queue_size > 0 &&
			q->queue_address != 0 &&
			q->queue_percent > 0);
			q->queue_percent > 0 &&
			!q->is_evicted);

	return 0;
}
Loading