Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9494a8dd authored by Oded Gabbay, committed by Greg Kroah-Hartman
Browse files

habanalabs: add h/w queues module



This patch adds the H/W queues module and the code to initialize Goya's
various compute and DMA engines and their queues.

Goya has 5 DMA channels, 8 TPC engines and a single MME engine. For each
channel/engine, there is a H/W queue logic which is used to pass commands
from the user to the H/W. That logic is called QMAN.

There are two types of QMANs: external and internal. The DMA QMANs are
considered external while the TPC and MME QMANs are considered internal.
For each external queue there is a completion queue, which is located on
the Host memory.

The differences between external and internal QMANs are:

1. The location of the queue's memory. External QMANs are located on the
   Host memory while internal QMANs are located on the on-chip memory.

2. The external QMAN writes an entry to a completion queue and sends an
   MSI-X interrupt upon completion of a command buffer that was given to
   it. The internal QMAN doesn't do that.

Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 839c4803
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@
obj-m	:= habanalabs.o

habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
		command_buffer.o
		command_buffer.o hw_queue.o irq.o

include $(src)/goya/Makefile
habanalabs-y += $(HL_GOYA_FILES)
+73 −2
Original line number Diff line number Diff line
@@ -174,13 +174,23 @@ static int device_early_init(struct hl_device *hdev)
	if (rc)
		goto early_fini;

	hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
	if (hdev->cq_wq == NULL) {
		dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
		rc = -ENOMEM;
		goto asid_fini;
	}

	hl_cb_mgr_init(&hdev->kernel_cb_mgr);

	mutex_init(&hdev->fd_open_cnt_lock);
	mutex_init(&hdev->send_cpu_message_lock);
	atomic_set(&hdev->fd_open_cnt, 0);

	return 0;

asid_fini:
	hl_asid_fini(hdev);
early_fini:
	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);
@@ -196,9 +206,12 @@ static int device_early_init(struct hl_device *hdev)
 */
static void device_early_fini(struct hl_device *hdev)
{
	mutex_destroy(&hdev->send_cpu_message_lock);

	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);

	destroy_workqueue(hdev->cq_wq);

	hl_asid_fini(hdev);

	if (hdev->asic_funcs->early_fini)
@@ -277,7 +290,7 @@ int hl_device_resume(struct hl_device *hdev)
 */
int hl_device_init(struct hl_device *hdev, struct class *hclass)
{
	int rc;
	int i, rc, cq_ready_cnt;

	/* Create device */
	rc = device_setup_cdev(hdev, hclass, hdev->id, &hl_ops);
@@ -298,11 +311,48 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
	if (rc)
		goto early_fini;

	/*
	 * Initialize the H/W queues. Must be done before hw_init, because
	 * there the addresses of the kernel queue are being written to the
	 * registers of the device
	 */
	rc = hl_hw_queues_create(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel queues\n");
		goto sw_fini;
	}

	/*
	 * Initialize the completion queues. Must be done before hw_init,
	 * because there the addresses of the completion queues are being
	 * passed as arguments to request_irq
	 */
	hdev->completion_queue =
			kcalloc(hdev->asic_prop.completion_queues_count,
				sizeof(*hdev->completion_queue), GFP_KERNEL);

	if (!hdev->completion_queue) {
		dev_err(hdev->dev, "failed to allocate completion queues\n");
		rc = -ENOMEM;
		goto hw_queues_destroy;
	}

	for (i = 0, cq_ready_cnt = 0;
			i < hdev->asic_prop.completion_queues_count;
			i++, cq_ready_cnt++) {
		rc = hl_cq_init(hdev, &hdev->completion_queue[i], i);
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize completion queue\n");
			goto cq_fini;
		}
	}

	/* Allocate the kernel context */
	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
	if (!hdev->kernel_ctx) {
		rc = -ENOMEM;
		goto sw_fini;
		goto cq_fini;
	}

	hdev->user_ctx = NULL;
@@ -328,6 +378,14 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)

	hdev->disabled = false;

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive\n");
		rc = 0;
		goto out_disabled;
	}

	dev_notice(hdev->dev,
		"Successfully added device to habanalabs driver\n");

@@ -339,6 +397,12 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
			"kernel ctx is still alive on initialization failure\n");
free_ctx:
	kfree(hdev->kernel_ctx);
cq_fini:
	for (i = 0 ; i < cq_ready_cnt ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);
hw_queues_destroy:
	hl_hw_queues_destroy(hdev);
sw_fini:
	hdev->asic_funcs->sw_fini(hdev);
early_fini:
@@ -368,6 +432,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 */
void hl_device_fini(struct hl_device *hdev)
{
	int i;
	dev_info(hdev->dev, "Removing device\n");

	/* Mark device as disabled */
@@ -382,6 +447,12 @@ void hl_device_fini(struct hl_device *hdev)
	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, true);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);

	hl_hw_queues_destroy(hdev);

	/* Call ASIC S/W finalize function */
	hdev->asic_funcs->sw_fini(hdev);

+1415 −112

File changed.

Preview size limit exceeded, changes collapsed.

+7 −0
Original line number Diff line number Diff line
@@ -11,7 +11,9 @@
#include <uapi/misc/habanalabs.h>
#include "habanalabs.h"
#include "include/hl_boot_if.h"
#include "include/goya/goya_packets.h"
#include "include/goya/goya.h"
#include "include/goya/goya_async_events.h"
#include "include/goya/goya_fw_if.h"

#define NUMBER_OF_CMPLT_QUEUES		5
@@ -145,12 +147,17 @@ enum goya_fw_component {
};

/**
 * struct goya_device - Goya-specific private device data.
 * @test_cpu_queue: callback that receives the hl_device and returns an int
 *                  status. NOTE(review): presumably set to
 *                  goya_test_cpu_queue() - confirm against the init code.
 * @hw_queues_lock: spinlock for the H/W queues. Temporary; see the TODO on
 *                  the field.
 * @ddr_bar_cur_addr: NOTE(review): looks like the device address the DDR BAR
 *                    is currently set to - confirm.
 * @hw_cap_initialized: NOTE(review): looks like a bitmask of H/W capabilities
 *                      that were already initialized - confirm.
 */
struct goya_device {
	int (*test_cpu_queue)(struct hl_device *hdev);

	/* TODO: remove hw_queues_lock after moving to scheduler code */
	spinlock_t	hw_queues_lock;
	u64		ddr_bar_cur_addr;
	u32		hw_cap_initialized;
};

int goya_test_cpu_queue(struct hl_device *hdev);
int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
				u32 timeout, long *result);
void goya_init_security(struct hl_device *hdev);

#endif /* GOYAP_H_ */
+173 −1
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@
#define HABANALABSP_H_

#include "include/armcp_if.h"
#include "include/qman_if.h"

#define pr_fmt(fmt)			"habanalabs: " fmt

@@ -26,9 +27,36 @@
struct hl_device;
struct hl_fpriv;

/**
 * enum hl_queue_type - Supported QUEUE types.
 * @QUEUE_TYPE_NA: queue is not available.
 * @QUEUE_TYPE_EXT: external queue which is a DMA channel that may access the
 *                  host.
 * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's
 *                  memories and/or operates the compute engines.
 * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU.
 *
 * QUEUE_TYPE_NA is the first enumerator and therefore has the value 0, so a
 * zero-initialized struct hw_queue_properties describes an unavailable queue.
 * Do not reorder the enumerators without checking users that rely on this.
 */
enum hl_queue_type {
	QUEUE_TYPE_NA,
	QUEUE_TYPE_EXT,
	QUEUE_TYPE_INT,
	QUEUE_TYPE_CPU
};

/**
 * struct hw_queue_properties - static description of a single H/W queue.
 * @type: queue type, one of enum hl_queue_type.
 * @kmd_only: non-zero if only KMD is allowed to send a job to this queue,
 *            zero otherwise. Stored in a u8 and used as a boolean.
 *
 * struct asic_fixed_properties holds an array of HL_MAX_QUEUES of these
 * (hw_queues_props).
 */
struct hw_queue_properties {
	enum hl_queue_type	type;
	u8			kmd_only;
};

/**
 * struct asic_fixed_properties - ASIC specific immutable properties.
 * @hw_queues_props: H/W queues properties.
 * @uboot_ver: F/W U-boot version.
 * @preboot_ver: F/W Preboot version.
 * @sram_base_address: SRAM physical start address.
@@ -59,6 +87,7 @@ struct hl_fpriv;
 * @tpc_enabled_mask: which TPCs are enabled.
 */
struct asic_fixed_properties {
	struct hw_queue_properties	hw_queues_props[HL_MAX_QUEUES];
	char			uboot_ver[VERSION_MAX_LEN];
	char			preboot_ver[VERSION_MAX_LEN];
	u64			sram_base_address;
@@ -132,7 +161,89 @@ struct hl_cb {
};


/*
 * QUEUES
 */

struct hl_cs_job;

/*
 * Currently, there are two limitations on the maximum length of a queue:
 *
 * 1. The memory footprint of the queue. The current allocated space for the
 *    queue is PAGE_SIZE. Because each entry in the queue is HL_BD_SIZE,
 *    the maximum length of the queue can be PAGE_SIZE / HL_BD_SIZE,
 *    which currently is 4096/16 = 256 entries.
 *
 *    To increase that, we need either to decrease the size of the
 *    BD (difficult), or allocate more than a single page (easier).
 *
 * 2. Because the size of the JOB handle field in the BD CTL / completion queue
 *    is 10-bit, we can have up to 1024 open jobs per hardware queue.
 *    Therefore, each queue can hold at most 1024 entries.
 *
 * With the current sizes the first limitation is the tighter one, hence
 * HL_QUEUE_LENGTH is 256.
 *
 * HL_QUEUE_LENGTH is in units of struct hl_bd.
 * HL_QUEUE_LENGTH * sizeof(struct hl_bd) should be <= HL_PAGE_SIZE
 */

#define HL_PAGE_SIZE			4096 /* minimum page size */
/*
 * Must be a power of 2, because hl_pi_2_offset() wraps an index by masking it
 * with (HL_QUEUE_LENGTH - 1). Currently equals HL_PAGE_SIZE / HL_BD_SIZE.
 */
#define HL_QUEUE_LENGTH			256
#define HL_QUEUE_SIZE_IN_BYTES		(HL_QUEUE_LENGTH * HL_BD_SIZE)

/*
 * HL_CQ_LENGTH is in units of struct hl_cq_entry.
 * HL_CQ_SIZE_IN_BYTES (HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE) should be
 * <= HL_PAGE_SIZE.
 * NOTE(review): the previous wording compared HL_CQ_LENGTH (an entry count)
 * to HL_PAGE_SIZE (bytes); the byte size looks like the intended quantity,
 * in parallel with the H/W queue rule above - confirm.
 */
#define HL_CQ_LENGTH			HL_QUEUE_LENGTH
#define HL_CQ_SIZE_IN_BYTES		(HL_CQ_LENGTH * HL_CQ_ENTRY_SIZE)



/**
 * struct hl_hw_queue - describes a H/W transport queue.
 * @shadow_queue: pointer to a shadow queue that holds pointers to jobs
 *                (struct hl_cs_job).
 * @queue_type: type of queue, one of enum hl_queue_type.
 * @kernel_address: holds the queue's kernel virtual address (stored as u64,
 *                  not as a pointer).
 * @bus_address: holds the queue's DMA address.
 * @pi: holds the queue's pi value.
 * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci).
 * @hw_queue_id: the id of the H/W queue.
 * @int_queue_len: length of internal queue (number of entries).
 * @valid: is the queue valid (we have an array of 32 queues, not all of them
 *         exist).
 *
 * NOTE(review): pi/ci presumably stand for producer index / consumer index -
 * confirm against hw_queue.c.
 */
struct hl_hw_queue {
	struct hl_cs_job	**shadow_queue;
	enum hl_queue_type	queue_type;
	u64			kernel_address;
	dma_addr_t		bus_address;
	u32			pi;
	u32			ci;
	u32			hw_queue_id;
	u16			int_queue_len;
	u8			valid;
};

/**
 * struct hl_cq - describes a completion queue.
 * @hdev: pointer to the device structure that owns this queue.
 * @kernel_address: holds the queue's kernel virtual address (stored as u64,
 *                  not as a pointer).
 * @bus_address: holds the queue's DMA address.
 * @hw_queue_id: the id of the matching H/W queue.
 * @ci: ci inside the queue.
 * @pi: pi inside the queue.
 * @free_slots_cnt: counter of free slots in queue, kept as an atomic_t.
 *
 * Instances are set up by hl_cq_init() and torn down by hl_cq_fini();
 * hl_device.completion_queue is an array of these.
 */
struct hl_cq {
	struct hl_device	*hdev;
	u64			kernel_address;
	dma_addr_t		bus_address;
	u32			hw_queue_id;
	u32			ci;
	u32			pi;
	atomic_t		free_slots_cnt;
};


/*
@@ -164,6 +275,8 @@ enum hl_asic_type {
 * @resume: handles IP specific H/W or SW changes for resume.
 * @mmap: mmap function, does nothing.
 * @cb_mmap: maps a CB.
 * @ring_doorbell: increment PI on a given QMAN.
 * @flush_pq_write: flush PQ entry write if necessary, WARN if flushing failed.
 * @dma_alloc_coherent: Allocate coherent DMA memory by calling
 *                      dma_alloc_coherent(). This is ASIC function because its
 *                      implementation is not trivial when the driver is loaded
@@ -172,6 +285,16 @@ enum hl_asic_type {
 *                     This is ASIC function because its implementation is not
 *                     trivial when the driver is loaded in simulation mode
 *                     (not upstreamed).
 * @get_int_queue_base: get the internal queue base address.
 * @test_queues: run simple test on all queues for sanity check.
 * @dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
 *                   size of allocation is HL_DMA_POOL_BLK_SIZE.
 * @dma_pool_free: free small DMA allocation from pool.
 * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
 * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
 * @hw_queues_lock: acquire H/W queues lock.
 * @hw_queues_unlock: release H/W queues lock.
 * @send_cpu_message: send buffer to ArmCP.
 */
struct hl_asic_funcs {
	int (*early_init)(struct hl_device *hdev);
@@ -185,10 +308,27 @@ struct hl_asic_funcs {
	int (*mmap)(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
	int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
			u64 kaddress, phys_addr_t paddress, u32 size);
	void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
	void (*flush_pq_write)(struct hl_device *hdev, u64 *pq, u64 exp_val);
	void* (*dma_alloc_coherent)(struct hl_device *hdev, size_t size,
					dma_addr_t *dma_handle, gfp_t flag);
	void (*dma_free_coherent)(struct hl_device *hdev, size_t size,
					void *cpu_addr, dma_addr_t dma_handle);
	void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
				dma_addr_t *dma_handle, u16 *queue_len);
	int (*test_queues)(struct hl_device *hdev);
	void* (*dma_pool_zalloc)(struct hl_device *hdev, size_t size,
				gfp_t mem_flags, dma_addr_t *dma_handle);
	void (*dma_pool_free)(struct hl_device *hdev, void *vaddr,
				dma_addr_t dma_addr);
	void* (*cpu_accessible_dma_pool_alloc)(struct hl_device *hdev,
				size_t size, dma_addr_t *dma_handle);
	void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
				size_t size, void *vaddr);
	void (*hw_queues_lock)(struct hl_device *hdev);
	void (*hw_queues_unlock)(struct hl_device *hdev);
	int (*send_cpu_message)(struct hl_device *hdev, u32 *msg,
				u16 len, u32 timeout, long *result);
};


@@ -224,6 +364,17 @@ struct hl_ctx_mgr {
};




/**
 * struct hl_cs_job - command submission job.
 * @finish_work: workqueue object to run when job is completed.
 * @id: the id of this job inside a CS.
 *
 * Pointers to jobs are what struct hl_hw_queue keeps in its shadow_queue.
 */
struct hl_cs_job {
	struct work_struct	finish_work;
	u32			id;
};
/*
 * FILE PRIVATE STRUCTURE
 */
@@ -298,7 +449,11 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 * @dev: related kernel basic device structure.
 * @asic_name: ASIC specific name.
 * @asic_type: ASIC specific type.
 * @completion_queue: array of hl_cq.
 * @cq_wq: work queue of completion queues for executing work in process context
 * @eq_wq: work queue of event queue for executing work in process context.
 * @kernel_ctx: KMD context structure.
 * @kernel_queues: array of hl_hw_queue.
 * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CBs.
 * @dma_pool: DMA pool for small allocations.
 * @cpu_accessible_dma_mem: KMD <-> ArmCP shared memory CPU address.
@@ -312,6 +467,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 *                    only a single process at a time. In addition, we need a
 *                    lock here so we can flush user processes which are opening
 *                    the device while we are trying to hard reset it
 * @send_cpu_message_lock: enforces only one message in KMD <-> ArmCP queue.
 * @asic_prop: ASIC specific immutable properties.
 * @asic_funcs: ASIC specific functions.
 * @asic_specific: ASIC specific information to use only from ASIC files.
@@ -331,7 +487,10 @@ struct hl_device {
	struct device			*dev;
	char				asic_name[16];
	enum hl_asic_type		asic_type;
	struct hl_cq			*completion_queue;
	struct workqueue_struct		*cq_wq;
	struct hl_ctx			*kernel_ctx;
	struct hl_hw_queue		*kernel_queues;
	struct hl_cb_mgr		kernel_cb_mgr;
	struct dma_pool			*dma_pool;
	void				*cpu_accessible_dma_mem;
@@ -341,6 +500,7 @@ struct hl_device {
	struct mutex			asid_mutex;
	/* TODO: remove fd_open_cnt_lock for multiple process support */
	struct mutex			fd_open_cnt_lock;
	struct mutex			send_cpu_message_lock;
	struct asic_fixed_properties	asic_prop;
	const struct hl_asic_funcs	*asic_funcs;
	void				*asic_specific;
@@ -358,6 +518,7 @@ struct hl_device {
	/* Parameters for bring-up */
	u8				cpu_enable;
	u8				reset_pcilink;
	u8				cpu_queues_enable;
	u8				fw_loading;
	u8				pldm;
};
@@ -400,7 +561,18 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr, u32 timeout_us,
				u32 *val);
int hl_poll_timeout_device_memory(struct hl_device *hdev, void __iomem *addr,
				u32 timeout_us, u32 *val);

int hl_hw_queues_create(struct hl_device *hdev);
void hl_hw_queues_destroy(struct hl_device *hdev);
int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
				u32 cb_size, u64 cb_ptr);
u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);

/* Shorthand for hl_hw_queue_add_ptr(p, 1). */
#define hl_queue_inc_ptr(p)		hl_hw_queue_add_ptr(p, 1)
/*
 * Reduce pi to an offset in the range [0, HL_QUEUE_LENGTH) by masking.
 * Correct only while HL_QUEUE_LENGTH is a power of 2.
 */
#define hl_pi_2_offset(pi)		((pi) & (HL_QUEUE_LENGTH - 1))

int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id);
void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q);
int hl_asid_init(struct hl_device *hdev);
void hl_asid_fini(struct hl_device *hdev);
unsigned long hl_asid_alloc(struct hl_device *hdev);
Loading