Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 2640c3fa authored by shaoyunl's avatar shaoyunl Committed by Oded Gabbay
Browse files

drm/amdkfd: Handle VM faults in KFD



1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address per
   per-vmid. amdkfd needs to get the information from amdgpu through the
   new get_vm_fault_info interface. On GFX9 and later, all the required
   information is in the IH ring
2. amdkfd unmaps all queues from the faulting process and create new
   run-list without the guilty process
3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY

Signed-off-by: default avatarshaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: default avatarFelix Kuehling <Felix.Kuehling@amd.com>
Acked-by: default avatarChristian König <christian.koenig@amd.com>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent b97dfa27
Loading
Loading
Loading
Loading
+21 −4
Original line number Diff line number Diff line
@@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
	return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
		ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
		ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
		ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
		ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
		ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
}

static void cik_event_interrupt_wq(struct kfd_dev *dev,
					const uint32_t *ih_ring_entry)
{
	unsigned int pasid;
	const struct cik_ih_ring_entry *ihre =
			(const struct cik_ih_ring_entry *)ih_ring_entry;
	uint32_t context_id = ihre->data & 0xfffffff;

	pasid = (ihre->ring_id & 0xffff0000) >> 16;
	unsigned int vmid  = (ihre->ring_id & 0x0000ff00) >> 8;
	unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;

	if (pasid == 0)
		return;
@@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
		kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
	else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
		kfd_signal_hw_exception_event(pasid);
	else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
		struct kfd_vm_fault_info info;

		kfd_process_vm_fault(dev->dqm, pasid);

		memset(&info, 0, sizeof(info));
		dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
		if (!info.page_addr && !info.status)
			return;

		if (info.vmid == vmid)
			kfd_signal_vm_fault_event(dev, pasid, &info);
		else
			kfd_signal_vm_fault_event(dev, pasid, NULL);
	}
}

const struct kfd_event_interrupt_class event_interrupt_class_cik = {
+2 −0
Original line number Diff line number Diff line
@@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
#define CIK_INTSRC_DEQUEUE_COMPLETE	0xC6
#define CIK_INTSRC_SDMA_TRAP		0xE0
#define CIK_INTSRC_SQ_INTERRUPT_MSG	0xEF
#define CIK_INTSRC_GFX_PAGE_INV_FAULT	0x92
#define CIK_INTSRC_GFX_MEM_PROT_FAULT	0x93

#endif
+17 −0
Original line number Diff line number Diff line
@@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
	kfree(dqm);
}

int kfd_process_vm_fault(struct device_queue_manager *dqm,
			 unsigned int pasid)
{
	struct kfd_process_device *pdd;
	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
	int ret = 0;

	if (!p)
		return -EINVAL;
	pdd = kfd_get_process_device_data(dqm->dev, p);
	if (pdd)
		ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
	kfd_unref_process(p);

	return ret;
}

#if defined(CONFIG_DEBUG_FS)

static void seq_reg_dump(struct seq_file *m,
+37 −0
Original line number Diff line number Diff line
@@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
	mutex_unlock(&p->event_mutex);
	kfd_unref_process(p);
}

void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
				struct kfd_vm_fault_info *info)
{
	struct kfd_event *ev;
	uint32_t id;
	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
	struct kfd_hsa_memory_exception_data memory_exception_data;

	if (!p)
		return; /* Presumably process exited. */
	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
	memory_exception_data.gpu_id = dev->id;
	memory_exception_data.failure.imprecise = 1;
	/* Set failure reason */
	if (info) {
		memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
		memory_exception_data.failure.NotPresent =
			info->prot_valid ? 1 : 0;
		memory_exception_data.failure.NoExecute =
			info->prot_exec ? 1 : 0;
		memory_exception_data.failure.ReadOnly =
			info->prot_write ? 1 : 0;
		memory_exception_data.failure.imprecise = 0;
	}
	mutex_lock(&p->event_mutex);

	id = KFD_FIRST_NONSIGNAL_EVENT_ID;
	idr_for_each_entry_continue(&p->event_idr, ev, id)
		if (ev->type == KFD_EVENT_TYPE_MEMORY) {
			ev->memory_exception_data = memory_exception_data;
			set_event(ev);
		}

	mutex_unlock(&p->event_mutex);
	kfd_unref_process(p);
}
+16 −2
Original line number Diff line number Diff line
@@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
	return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
		source_id == SOC15_INTSRC_SDMA_TRAP ||
		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
		source_id == SOC15_INTSRC_CP_BAD_OPCODE;
		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
		client_id == SOC15_IH_CLIENTID_VMC ||
		client_id == SOC15_IH_CLIENTID_UTCL2;
}

static void event_interrupt_wq_v9(struct kfd_dev *dev,
@@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
		kfd_signal_hw_exception_event(pasid);
	else if (client_id == SOC15_IH_CLIENTID_VMC ||
		 client_id == SOC15_IH_CLIENTID_UTCL2) {
		/* TODO */
		struct kfd_vm_fault_info info = {0};
		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);

		info.vmid = vmid;
		info.mc_id = client_id;
		info.page_addr = ih_ring_entry[4] |
			(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
		info.prot_valid = ring_id & 0x08;
		info.prot_read  = ring_id & 0x10;
		info.prot_write = ring_id & 0x20;

		kfd_process_vm_fault(dev->dqm, pasid);
		kfd_signal_vm_fault_event(dev, pasid, &info);
	}
}

Loading