Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f8c8c7d5 authored by Oded Gabbay's avatar Oded Gabbay Committed by Greg Kroah-Hartman
Browse files

habanalabs: add device reset support



This patch adds support for doing various on-the-fly reset of Goya.

The driver supports two types of resets:
1. soft-reset
2. hard-reset

Soft-reset is done when the device detects a timeout of a command
submission that was given to the device. The soft-reset process only resets
the engines that are relevant for the submission of compute jobs, i.e. the
DMA channels, the TPCs and the MME. The purpose is to bring the device as
fast as possible to a working state.

Hard-reset is done in several cases:
1. After soft-reset is done but the device is not responding
2. When fatal errors occur inside the device, e.g. ECC error
3. When the driver is removed

Hard-reset performs a reset of the entire chip except for the PCI
controller and the PLLs. It is a much longer process then soft-reset but it
helps to recover the device without the need to reboot the Host.

After hard-reset, the driver will restore the max power attribute and in
case of manual power management, the frequencies that were set.

This patch also adds two entries to the sysfs, which allows the root user
to initiate a soft or hard reset.

Reviewed-by: default avatarMike Rapoport <rppt@linux.ibm.com>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent d91389bc
Loading
Loading
Loading
Loading
+7 −2
Original line number Diff line number Diff line
@@ -91,9 +91,14 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
	bool alloc_new_cb = true;
	int rc;

	if (hdev->disabled) {
	/*
	 * Can't use generic function to check this because of special case
	 * where we create a CB as part of the reset process
	 */
	if ((hdev->disabled) || ((atomic_read(&hdev->in_reset)) &&
					(ctx_id != HL_KERNEL_ASID_ID))) {
		dev_warn_ratelimited(hdev->dev,
			"Device is disabled. Can't create new CBs\n");
			"Device is disabled or in reset. Can't create new CBs\n");
		rc = -EBUSY;
		goto out_err;
	}
+323 −2
Original line number Diff line number Diff line
@@ -8,9 +8,17 @@
#include "habanalabs.h"

#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/sched/signal.h>
#include <linux/hwmon.h>

bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
{
	if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
		return true;
	else
		return false;
}

static void hpriv_release(struct kref *ref)
{
	struct hl_fpriv *hpriv;
@@ -200,6 +208,7 @@ static int device_early_init(struct hl_device *hdev)

	mutex_init(&hdev->fd_open_cnt_lock);
	mutex_init(&hdev->send_cpu_message_lock);
	atomic_set(&hdev->in_reset, 0);
	atomic_set(&hdev->fd_open_cnt, 0);

	return 0;
@@ -254,6 +263,27 @@ static void set_freq_to_low_job(struct work_struct *work)
			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
}

static void hl_device_heartbeat(struct work_struct *work)
{
	struct hl_device *hdev = container_of(work, struct hl_device,
						work_heartbeat.work);

	if (hl_device_disabled_or_in_reset(hdev))
		goto reschedule;

	if (!hdev->asic_funcs->send_heartbeat(hdev))
		goto reschedule;

	dev_err(hdev->dev, "Device heartbeat failed!\n");
	hl_device_reset(hdev, true, false);

	return;

reschedule:
	schedule_delayed_work(&hdev->work_heartbeat,
			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
}

/*
 * device_late_init - do late stuff initialization for the habanalabs device
 *
@@ -289,6 +319,12 @@ static int device_late_init(struct hl_device *hdev)
	schedule_delayed_work(&hdev->work_freq,
			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));

	if (hdev->heartbeat) {
		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
		schedule_delayed_work(&hdev->work_heartbeat,
				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
	}

	hdev->late_init_done = true;

	return 0;
@@ -306,6 +342,8 @@ static void device_late_fini(struct hl_device *hdev)
		return;

	cancel_delayed_work_sync(&hdev->work_freq);
	if (hdev->heartbeat)
		cancel_delayed_work_sync(&hdev->work_heartbeat);

	if (hdev->asic_funcs->late_fini)
		hdev->asic_funcs->late_fini(hdev);
@@ -413,6 +451,260 @@ int hl_device_resume(struct hl_device *hdev)
	return 0;
}

static void hl_device_hard_reset_pending(struct work_struct *work)
{
	struct hl_device_reset_work *device_reset_work =
		container_of(work, struct hl_device_reset_work, reset_work);
	struct hl_device *hdev = device_reset_work->hdev;
	u16 pending_cnt = HL_PENDING_RESET_PER_SEC;
	struct task_struct *task = NULL;

	/* Flush all processes that are inside hl_open */
	mutex_lock(&hdev->fd_open_cnt_lock);

	while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {

		pending_cnt--;

		dev_info(hdev->dev,
			"Can't HARD reset, waiting for user to close FD\n");
		ssleep(1);
	}

	if (atomic_read(&hdev->fd_open_cnt)) {
		task = get_pid_task(hdev->user_ctx->hpriv->taskpid,
					PIDTYPE_PID);
		if (task) {
			dev_info(hdev->dev, "Killing user processes\n");
			send_sig(SIGKILL, task, 1);
			msleep(100);

			put_task_struct(task);
		}
	}

	mutex_unlock(&hdev->fd_open_cnt_lock);

	hl_device_reset(hdev, true, true);

	kfree(device_reset_work);
}

/*
 * hl_device_reset - reset the device
 *
 * @hdev: pointer to habanalabs device structure
 * @hard_reset: should we do hard reset to all engines or just reset the
 *              compute/dma engines
 *
 * Block future CS and wait for pending CS to be enqueued
 * Call ASIC H/W fini
 * Flush all completions
 * Re-initialize all internal data structures
 * Call ASIC H/W init, late_init
 * Test queues
 * Enable device
 *
 * Returns 0 for success or an error on failure.
 */
int hl_device_reset(struct hl_device *hdev, bool hard_reset,
			bool from_hard_reset_thread)
{
	int i, rc;

	if (!hdev->init_done) {
		dev_err(hdev->dev,
			"Can't reset before initialization is done\n");
		return 0;
	}

	/*
	 * Prevent concurrency in this function - only one reset should be
	 * done at any given time. Only need to perform this if we didn't
	 * get from the dedicated hard reset thread
	 */
	if (!from_hard_reset_thread) {
		/* Block future CS/VM/JOB completion operations */
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (rc)
			return 0;

		/* This also blocks future CS/VM/JOB completion operations */
		hdev->disabled = true;

		/*
		 * Flush anyone that is inside the critical section of enqueue
		 * jobs to the H/W
		 */
		hdev->asic_funcs->hw_queues_lock(hdev);
		hdev->asic_funcs->hw_queues_unlock(hdev);

		dev_err(hdev->dev, "Going to RESET device!\n");
	}

again:
	if ((hard_reset) && (!from_hard_reset_thread)) {
		struct hl_device_reset_work *device_reset_work;

		if (!hdev->pdev) {
			dev_err(hdev->dev,
				"Reset action is NOT supported in simulator\n");
			rc = -EINVAL;
			goto out_err;
		}

		hdev->hard_reset_pending = true;

		device_reset_work = kzalloc(sizeof(*device_reset_work),
						GFP_ATOMIC);
		if (!device_reset_work) {
			rc = -ENOMEM;
			goto out_err;
		}

		/*
		 * Because the reset function can't run from interrupt or
		 * from heartbeat work, we need to call the reset function
		 * from a dedicated work
		 */
		INIT_WORK(&device_reset_work->reset_work,
				hl_device_hard_reset_pending);
		device_reset_work->hdev = hdev;
		schedule_work(&device_reset_work->reset_work);

		return 0;
	}

	if (hard_reset) {
		device_late_fini(hdev);

		/*
		 * Now that the heartbeat thread is closed, flush processes
		 * which are sending messages to CPU
		 */
		mutex_lock(&hdev->send_cpu_message_lock);
		mutex_unlock(&hdev->send_cpu_message_lock);
	}

	/*
	 * Halt the engines and disable interrupts so we won't get any more
	 * completions from H/W and we won't have any accesses from the
	 * H/W to the host machine
	 */
	hdev->asic_funcs->halt_engines(hdev, hard_reset);

	if (hard_reset) {
		/* Release kernel context */
		if (hl_ctx_put(hdev->kernel_ctx) != 1) {
			dev_err(hdev->dev,
				"kernel ctx is alive during hard reset\n");
			rc = -EBUSY;
			goto out_err;
		}

		hdev->kernel_ctx = NULL;
	}

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, hard_reset);

	if (hard_reset)
		hl_eq_reset(hdev, &hdev->event_queue);

	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
	hl_hw_queue_reset(hdev, hard_reset);
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_reset(hdev, &hdev->completion_queue[i]);

	/* Finished tear-down, starting to re-initialize */

	if (hard_reset) {
		/* Allocate the kernel context */
		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
						GFP_KERNEL);
		if (!hdev->kernel_ctx) {
			rc = -ENOMEM;
			goto out_err;
		}

		hdev->user_ctx = NULL;

		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init kernel ctx in hard reset\n");
			kfree(hdev->kernel_ctx);
			hdev->kernel_ctx = NULL;
			goto out_err;
		}
	}

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to initialize the H/W after reset\n");
		goto out_err;
	}

	hdev->disabled = false;

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to detect if device is alive after reset\n");
		goto out_err;
	}

	if (hard_reset) {
		rc = device_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed late init after hard reset\n");
			goto out_err;
		}

		hl_set_max_power(hdev, hdev->max_power);

		hdev->hard_reset_pending = false;
	} else {
		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed late init after soft reset\n");
			goto out_err;
		}
	}

	atomic_set(&hdev->in_reset, 0);

	if (hard_reset)
		hdev->hard_reset_cnt++;
	else
		hdev->soft_reset_cnt++;

	return 0;

out_err:
	hdev->disabled = true;

	if (hard_reset) {
		dev_err(hdev->dev,
			"Failed to reset! Device is NOT usable\n");
		hdev->hard_reset_cnt++;
	} else {
		dev_err(hdev->dev,
			"Failed to do soft-reset, trying hard reset\n");
		hdev->soft_reset_cnt++;
		hard_reset = true;
		goto again;
	}

	atomic_set(&hdev->in_reset, 0);

	return rc;
}

/*
 * hl_device_init - main initialization function for habanalabs device
 *
@@ -520,6 +812,12 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
		goto free_cb_pool;
	}

	if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
		dev_info(hdev->dev,
			"H/W state is dirty, must reset before initializing\n");
		hdev->asic_funcs->hw_fini(hdev, true);
	}

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W\n");
@@ -565,6 +863,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
	dev_notice(hdev->dev,
		"Successfully added device to habanalabs driver\n");

	hdev->init_done = true;

	return 0;

free_cb_pool:
@@ -612,9 +912,30 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 */
void hl_device_fini(struct hl_device *hdev)
{
	int i;
	int i, rc;
	ktime_t timeout;

	dev_info(hdev->dev, "Removing device\n");

	/*
	 * This function is competing with the reset function, so try to
	 * take the reset atomic and if we are already in middle of reset,
	 * wait until reset function is finished. Reset function is designed
	 * to always finish (could take up to a few seconds in worst case).
	 */

	timeout = ktime_add_us(ktime_get(),
				HL_PENDING_RESET_PER_SEC * 1000 * 1000 * 4);
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	while (rc) {
		usleep_range(50, 200);
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (ktime_compare(ktime_get(), timeout) > 0) {
			WARN(1, "Failed to remove device because reset function did not finish\n");
			return;
		}
	};

	/* Mark device as disabled */
	hdev->disabled = true;

+225 −1
Original line number Diff line number Diff line
@@ -120,6 +120,130 @@ static const char *goya_axi_name[GOYA_MAX_INITIATORS] = {

#define GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE 121

static u32 goya_non_fatal_events[GOYA_ASYC_EVENT_GROUP_NON_FATAL_SIZE] = {
	GOYA_ASYNC_EVENT_ID_PCIE_IF,
	GOYA_ASYNC_EVENT_ID_TPC0_ECC,
	GOYA_ASYNC_EVENT_ID_TPC1_ECC,
	GOYA_ASYNC_EVENT_ID_TPC2_ECC,
	GOYA_ASYNC_EVENT_ID_TPC3_ECC,
	GOYA_ASYNC_EVENT_ID_TPC4_ECC,
	GOYA_ASYNC_EVENT_ID_TPC5_ECC,
	GOYA_ASYNC_EVENT_ID_TPC6_ECC,
	GOYA_ASYNC_EVENT_ID_TPC7_ECC,
	GOYA_ASYNC_EVENT_ID_MME_ECC,
	GOYA_ASYNC_EVENT_ID_MME_ECC_EXT,
	GOYA_ASYNC_EVENT_ID_MMU_ECC,
	GOYA_ASYNC_EVENT_ID_DMA_MACRO,
	GOYA_ASYNC_EVENT_ID_DMA_ECC,
	GOYA_ASYNC_EVENT_ID_CPU_IF_ECC,
	GOYA_ASYNC_EVENT_ID_PSOC_MEM,
	GOYA_ASYNC_EVENT_ID_PSOC_CORESIGHT,
	GOYA_ASYNC_EVENT_ID_SRAM0,
	GOYA_ASYNC_EVENT_ID_SRAM1,
	GOYA_ASYNC_EVENT_ID_SRAM2,
	GOYA_ASYNC_EVENT_ID_SRAM3,
	GOYA_ASYNC_EVENT_ID_SRAM4,
	GOYA_ASYNC_EVENT_ID_SRAM5,
	GOYA_ASYNC_EVENT_ID_SRAM6,
	GOYA_ASYNC_EVENT_ID_SRAM7,
	GOYA_ASYNC_EVENT_ID_SRAM8,
	GOYA_ASYNC_EVENT_ID_SRAM9,
	GOYA_ASYNC_EVENT_ID_SRAM10,
	GOYA_ASYNC_EVENT_ID_SRAM11,
	GOYA_ASYNC_EVENT_ID_SRAM12,
	GOYA_ASYNC_EVENT_ID_SRAM13,
	GOYA_ASYNC_EVENT_ID_SRAM14,
	GOYA_ASYNC_EVENT_ID_SRAM15,
	GOYA_ASYNC_EVENT_ID_SRAM16,
	GOYA_ASYNC_EVENT_ID_SRAM17,
	GOYA_ASYNC_EVENT_ID_SRAM18,
	GOYA_ASYNC_EVENT_ID_SRAM19,
	GOYA_ASYNC_EVENT_ID_SRAM20,
	GOYA_ASYNC_EVENT_ID_SRAM21,
	GOYA_ASYNC_EVENT_ID_SRAM22,
	GOYA_ASYNC_EVENT_ID_SRAM23,
	GOYA_ASYNC_EVENT_ID_SRAM24,
	GOYA_ASYNC_EVENT_ID_SRAM25,
	GOYA_ASYNC_EVENT_ID_SRAM26,
	GOYA_ASYNC_EVENT_ID_SRAM27,
	GOYA_ASYNC_EVENT_ID_SRAM28,
	GOYA_ASYNC_EVENT_ID_SRAM29,
	GOYA_ASYNC_EVENT_ID_GIC500,
	GOYA_ASYNC_EVENT_ID_PLL0,
	GOYA_ASYNC_EVENT_ID_PLL1,
	GOYA_ASYNC_EVENT_ID_PLL3,
	GOYA_ASYNC_EVENT_ID_PLL4,
	GOYA_ASYNC_EVENT_ID_PLL5,
	GOYA_ASYNC_EVENT_ID_PLL6,
	GOYA_ASYNC_EVENT_ID_AXI_ECC,
	GOYA_ASYNC_EVENT_ID_L2_RAM_ECC,
	GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET,
	GOYA_ASYNC_EVENT_ID_PSOC_GPIO_10_VRHOT_ICRIT,
	GOYA_ASYNC_EVENT_ID_PCIE_DEC,
	GOYA_ASYNC_EVENT_ID_TPC0_DEC,
	GOYA_ASYNC_EVENT_ID_TPC1_DEC,
	GOYA_ASYNC_EVENT_ID_TPC2_DEC,
	GOYA_ASYNC_EVENT_ID_TPC3_DEC,
	GOYA_ASYNC_EVENT_ID_TPC4_DEC,
	GOYA_ASYNC_EVENT_ID_TPC5_DEC,
	GOYA_ASYNC_EVENT_ID_TPC6_DEC,
	GOYA_ASYNC_EVENT_ID_TPC7_DEC,
	GOYA_ASYNC_EVENT_ID_MME_WACS,
	GOYA_ASYNC_EVENT_ID_MME_WACSD,
	GOYA_ASYNC_EVENT_ID_CPU_AXI_SPLITTER,
	GOYA_ASYNC_EVENT_ID_PSOC_AXI_DEC,
	GOYA_ASYNC_EVENT_ID_PSOC,
	GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR,
	GOYA_ASYNC_EVENT_ID_TPC1_KRN_ERR,
	GOYA_ASYNC_EVENT_ID_TPC2_KRN_ERR,
	GOYA_ASYNC_EVENT_ID_TPC3_KRN_ERR,
	GOYA_ASYNC_EVENT_ID_TPC4_KRN_ERR,
	GOYA_ASYNC_EVENT_ID_TPC5_KRN_ERR,
	GOYA_ASYNC_EVENT_ID_TPC6_KRN_ERR,
	GOYA_ASYNC_EVENT_ID_TPC7_KRN_ERR,
	GOYA_ASYNC_EVENT_ID_TPC0_CMDQ,
	GOYA_ASYNC_EVENT_ID_TPC1_CMDQ,
	GOYA_ASYNC_EVENT_ID_TPC2_CMDQ,
	GOYA_ASYNC_EVENT_ID_TPC3_CMDQ,
	GOYA_ASYNC_EVENT_ID_TPC4_CMDQ,
	GOYA_ASYNC_EVENT_ID_TPC5_CMDQ,
	GOYA_ASYNC_EVENT_ID_TPC6_CMDQ,
	GOYA_ASYNC_EVENT_ID_TPC7_CMDQ,
	GOYA_ASYNC_EVENT_ID_TPC0_QM,
	GOYA_ASYNC_EVENT_ID_TPC1_QM,
	GOYA_ASYNC_EVENT_ID_TPC2_QM,
	GOYA_ASYNC_EVENT_ID_TPC3_QM,
	GOYA_ASYNC_EVENT_ID_TPC4_QM,
	GOYA_ASYNC_EVENT_ID_TPC5_QM,
	GOYA_ASYNC_EVENT_ID_TPC6_QM,
	GOYA_ASYNC_EVENT_ID_TPC7_QM,
	GOYA_ASYNC_EVENT_ID_MME_QM,
	GOYA_ASYNC_EVENT_ID_MME_CMDQ,
	GOYA_ASYNC_EVENT_ID_DMA0_QM,
	GOYA_ASYNC_EVENT_ID_DMA1_QM,
	GOYA_ASYNC_EVENT_ID_DMA2_QM,
	GOYA_ASYNC_EVENT_ID_DMA3_QM,
	GOYA_ASYNC_EVENT_ID_DMA4_QM,
	GOYA_ASYNC_EVENT_ID_DMA0_CH,
	GOYA_ASYNC_EVENT_ID_DMA1_CH,
	GOYA_ASYNC_EVENT_ID_DMA2_CH,
	GOYA_ASYNC_EVENT_ID_DMA3_CH,
	GOYA_ASYNC_EVENT_ID_DMA4_CH,
	GOYA_ASYNC_EVENT_ID_TPC0_BMON_SPMU,
	GOYA_ASYNC_EVENT_ID_TPC1_BMON_SPMU,
	GOYA_ASYNC_EVENT_ID_TPC2_BMON_SPMU,
	GOYA_ASYNC_EVENT_ID_TPC3_BMON_SPMU,
	GOYA_ASYNC_EVENT_ID_TPC4_BMON_SPMU,
	GOYA_ASYNC_EVENT_ID_TPC5_BMON_SPMU,
	GOYA_ASYNC_EVENT_ID_TPC6_BMON_SPMU,
	GOYA_ASYNC_EVENT_ID_TPC7_BMON_SPMU,
	GOYA_ASYNC_EVENT_ID_DMA_BM_CH0,
	GOYA_ASYNC_EVENT_ID_DMA_BM_CH1,
	GOYA_ASYNC_EVENT_ID_DMA_BM_CH2,
	GOYA_ASYNC_EVENT_ID_DMA_BM_CH3,
	GOYA_ASYNC_EVENT_ID_DMA_BM_CH4
};

static int goya_armcp_info_get(struct hl_device *hdev);

static void goya_get_fixed_properties(struct hl_device *hdev)
@@ -2447,6 +2571,14 @@ static int goya_hw_init(struct hl_device *hdev)
	/* Perform read from the device to make sure device is up */
	val = RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG);

	/*
	 * Let's mark in the H/W that we have reached this point. We check
	 * this value in the reset_before_init function to understand whether
	 * we need to reset the chip before doing H/W init. This register is
	 * cleared by the H/W upon H/W reset
	 */
	WREG32(mmPSOC_GLOBAL_CONF_APP_STATUS, HL_DEVICE_HW_STATE_DIRTY);

	rc = goya_init_cpu(hdev, GOYA_CPU_TIMEOUT_USEC);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize CPU\n");
@@ -2575,6 +2707,14 @@ static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
			"Timeout while waiting for device to reset 0x%x\n",
			status);

	if (!hard_reset) {
		goya->hw_cap_initialized &= ~(HW_CAP_DMA | HW_CAP_MME |
						HW_CAP_GOLDEN | HW_CAP_TPC);
		WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
				GOYA_ASYNC_EVENT_ID_SOFT_RESET);
		return;
	}

	/* Chicken bit to re-initiate boot sequencer flow */
	WREG32(mmPSOC_GLOBAL_CONF_BOOT_SEQ_RE_START,
		1 << PSOC_GLOBAL_CONF_BOOT_SEQ_RE_START_IND_SHIFT);
@@ -3184,6 +3324,57 @@ static void goya_print_irq_info(struct hl_device *hdev, u16 event_type)
	}
}

static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
		size_t irq_arr_size)
{
	struct armcp_unmask_irq_arr_packet *pkt;
	size_t total_pkt_size;
	long result;
	int rc;

	total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) +
			irq_arr_size;

	/* data should be aligned to 8 bytes in order to ArmCP to copy it */
	total_pkt_size = (total_pkt_size + 0x7) & ~0x7;

	/* total_pkt_size is casted to u16 later on */
	if (total_pkt_size > USHRT_MAX) {
		dev_err(hdev->dev, "too many elements in IRQ array\n");
		return -EINVAL;
	}

	pkt = kzalloc(total_pkt_size, GFP_KERNEL);
	if (!pkt)
		return -ENOMEM;

	pkt->length = irq_arr_size / sizeof(irq_arr[0]);
	memcpy(&pkt->irqs, irq_arr, irq_arr_size);

	pkt->armcp_pkt.ctl = ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
						ARMCP_PKT_CTL_OPCODE_SHIFT;

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
			total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result);

	if (rc)
		dev_err(hdev->dev, "failed to unmask IRQ array\n");

	kfree(pkt);

	return rc;
}

static int goya_soft_reset_late_init(struct hl_device *hdev)
{
	/*
	 * Unmask all IRQs since some could have been received
	 * during the soft reset
	 */
	return goya_unmask_irq_arr(hdev, goya_non_fatal_events,
			sizeof(goya_non_fatal_events));
}

static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
{
	struct armcp_packet pkt;
@@ -3245,6 +3436,7 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
		dev_err(hdev->dev,
			"Received H/W interrupt %d, reset the chip\n",
			event_type);
		hl_device_reset(hdev, true, false);
		break;

	case GOYA_ASYNC_EVENT_ID_PCIE_DEC:
@@ -3310,6 +3502,30 @@ void *goya_get_events_stat(struct hl_device *hdev, u32 *size)
	return goya->events_stat;
}

int goya_send_heartbeat(struct hl_device *hdev)
{
	struct goya_device *goya = hdev->asic_specific;
	struct armcp_packet hb_pkt;
	long result;
	int rc;

	if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q))
		return 0;

	memset(&hb_pkt, 0, sizeof(hb_pkt));

	hb_pkt.ctl = ARMCP_PACKET_TEST << ARMCP_PKT_CTL_OPCODE_SHIFT;
	hb_pkt.value = ARMCP_PACKET_FENCE_VAL;

	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
			sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result);

	if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
		rc = -EIO;

	return rc;
}

static int goya_armcp_info_get(struct hl_device *hdev)
{
	struct goya_device *goya = hdev->asic_specific;
@@ -3455,6 +3671,11 @@ int goya_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
	return rc;
}

static enum hl_device_hw_state goya_get_hw_state(struct hl_device *hdev)
{
	return RREG32(mmPSOC_GLOBAL_CONF_APP_STATUS);
}

static const struct hl_asic_funcs goya_funcs = {
	.early_init = goya_early_init,
	.early_fini = goya_early_fini,
@@ -3484,12 +3705,15 @@ static const struct hl_asic_funcs goya_funcs = {
	.handle_eqe = goya_handle_eqe,
	.set_pll_profile = goya_set_pll_profile,
	.get_events_stat = goya_get_events_stat,
	.send_heartbeat = goya_send_heartbeat,
	.enable_clock_gating = goya_init_clock_gating,
	.disable_clock_gating = goya_disable_clock_gating,
	.soft_reset_late_init = goya_soft_reset_late_init,
	.hw_queues_lock = goya_hw_queues_lock,
	.hw_queues_unlock = goya_hw_queues_unlock,
	.get_eeprom_data = goya_get_eeprom_data,
	.send_cpu_message = goya_send_cpu_message
	.send_cpu_message = goya_send_cpu_message,
	.get_hw_state = goya_get_hw_state
};

/*
+9 −9
Original line number Diff line number Diff line
@@ -38,7 +38,7 @@ static ssize_t mme_clk_show(struct device *dev, struct device_attribute *attr,
	struct hl_device *hdev = dev_get_drvdata(dev);
	long value;

	if (hdev->disabled)
	if (hl_device_disabled_or_in_reset(hdev))
		return -ENODEV;

	value = hl_get_frequency(hdev, MME_PLL, false);
@@ -57,7 +57,7 @@ static ssize_t mme_clk_store(struct device *dev, struct device_attribute *attr,
	int rc;
	long value;

	if (hdev->disabled) {
	if (hl_device_disabled_or_in_reset(hdev)) {
		count = -ENODEV;
		goto fail;
	}
@@ -87,7 +87,7 @@ static ssize_t tpc_clk_show(struct device *dev, struct device_attribute *attr,
	struct hl_device *hdev = dev_get_drvdata(dev);
	long value;

	if (hdev->disabled)
	if (hl_device_disabled_or_in_reset(hdev))
		return -ENODEV;

	value = hl_get_frequency(hdev, TPC_PLL, false);
@@ -106,7 +106,7 @@ static ssize_t tpc_clk_store(struct device *dev, struct device_attribute *attr,
	int rc;
	long value;

	if (hdev->disabled) {
	if (hl_device_disabled_or_in_reset(hdev)) {
		count = -ENODEV;
		goto fail;
	}
@@ -136,7 +136,7 @@ static ssize_t ic_clk_show(struct device *dev, struct device_attribute *attr,
	struct hl_device *hdev = dev_get_drvdata(dev);
	long value;

	if (hdev->disabled)
	if (hl_device_disabled_or_in_reset(hdev))
		return -ENODEV;

	value = hl_get_frequency(hdev, IC_PLL, false);
@@ -155,7 +155,7 @@ static ssize_t ic_clk_store(struct device *dev, struct device_attribute *attr,
	int rc;
	long value;

	if (hdev->disabled) {
	if (hl_device_disabled_or_in_reset(hdev)) {
		count = -ENODEV;
		goto fail;
	}
@@ -185,7 +185,7 @@ static ssize_t mme_clk_curr_show(struct device *dev,
	struct hl_device *hdev = dev_get_drvdata(dev);
	long value;

	if (hdev->disabled)
	if (hl_device_disabled_or_in_reset(hdev))
		return -ENODEV;

	value = hl_get_frequency(hdev, MME_PLL, true);
@@ -202,7 +202,7 @@ static ssize_t tpc_clk_curr_show(struct device *dev,
	struct hl_device *hdev = dev_get_drvdata(dev);
	long value;

	if (hdev->disabled)
	if (hl_device_disabled_or_in_reset(hdev))
		return -ENODEV;

	value = hl_get_frequency(hdev, TPC_PLL, true);
@@ -219,7 +219,7 @@ static ssize_t ic_clk_curr_show(struct device *dev,
	struct hl_device *hdev = dev_get_drvdata(dev);
	long value;

	if (hdev->disabled)
	if (hl_device_disabled_or_in_reset(hdev))
		return -ENODEV;

	value = hl_get_frequency(hdev, IC_PLL, true);
+52 −0

File changed.

Preview size limit exceeded, changes collapsed.

Loading