Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a28ce422 authored by Oded Gabbay, committed by Greg Kroah-Hartman
Browse files

habanalabs: disable CPU access on timeouts



This patch provides a workaround for a bug in the F/W where the response
time for a request from KMD may take more than 100ms. This could cause the
queue between KMD and the F/W to get out of sync.

The workaround is to:
1. Increase the timeout of ALL requests to 1s.
2. In case a request isn't answered in time, mark the state as
"cpu_disabled" and prevent sending further requests from KMD to the F/W.
This will eventually lead to a heartbeat failure and hard reset of the
device.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 27ca384c
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
@@ -723,7 +723,7 @@ static ssize_t hl_device_read(struct file *f, char __user *buf,
		return 0;

	sprintf(tmp_buf,
		"Valid values are: disable, enable, suspend, resume\n");
		"Valid values: disable, enable, suspend, resume, cpu_timeout\n");
	rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
			strlen(tmp_buf) + 1);

@@ -751,9 +751,11 @@ static ssize_t hl_device_write(struct file *f, const char __user *buf,
		hdev->asic_funcs->suspend(hdev);
	} else if (strncmp("resume", data, strlen("resume")) == 0) {
		hdev->asic_funcs->resume(hdev);
	} else if (strncmp("cpu_timeout", data, strlen("cpu_timeout")) == 0) {
		hdev->device_cpu_disabled = true;
	} else {
		dev_err(hdev->dev,
			"Valid values are: disable, enable, suspend, resume\n");
			"Valid values: disable, enable, suspend, resume, cpu_timeout\n");
		count = -EINVAL;
	}

+2 −0
Original line number Diff line number Diff line
@@ -636,6 +636,8 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
	/* Finished tear-down, starting to re-initialize */

	if (hard_reset) {
		hdev->device_cpu_disabled = false;

		/* Allocate the kernel context */
		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
						GFP_KERNEL);
+7 −2
Original line number Diff line number Diff line
@@ -3232,6 +3232,11 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
	if (hdev->disabled)
		goto out;

	if (hdev->device_cpu_disabled) {
		rc = -EIO;
		goto out;
	}

	rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_CPU_PQ, len,
			pkt_dma_addr);
	if (rc) {
@@ -3245,8 +3250,8 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
	hl_hw_queue_inc_ci_kernel(hdev, GOYA_QUEUE_ID_CPU_PQ);

	if (rc == -ETIMEDOUT) {
		dev_err(hdev->dev,
			"Timeout while waiting for CPU packet fence\n");
		dev_err(hdev->dev, "Timeout while waiting for device CPU\n");
		hdev->device_cpu_disabled = true;
		goto out;
	}

+2 −0
Original line number Diff line number Diff line
@@ -1079,6 +1079,7 @@ struct hl_device_reset_work {
 * @dram_default_page_mapping: is DRAM default page mapping enabled.
 * @init_done: is the initialization of the device done.
 * @mmu_enable: is MMU enabled.
 * @device_cpu_disabled: is the device CPU disabled (due to timeouts)
 */
struct hl_device {
	struct pci_dev			*pdev;
@@ -1146,6 +1147,7 @@ struct hl_device {
	u8				dram_supports_virtual_memory;
	u8				dram_default_page_mapping;
	u8				init_done;
	u8				device_cpu_disabled;

	/* Parameters for bring-up */
	u8				mmu_enable;
+1 −1
Original line number Diff line number Diff line
@@ -10,7 +10,7 @@
#include <linux/pci.h>
#include <linux/hwmon.h>

#define SENSORS_PKT_TIMEOUT		100000	/* 100ms */
#define SENSORS_PKT_TIMEOUT		1000000	/* 1s */
#define HWMON_NR_SENSOR_TYPES		(hwmon_pwm + 1)

int hl_build_hwmon_channel_info(struct hl_device *hdev,
Loading