Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 22546b74 authored by Tadeusz Struk's avatar Tadeusz Struk Committed by Doug Ledford
Browse files

IB/hfi1: Fix softlockup issue



Soft lockups can occur because the mad processing on different CPUs acquire
the spin lock dc8051_lock:

[534552.835870]  [<ffffffffa026f993>] ? read_dev_port_cntr.isra.37+0x23/0x160 [hfi1]
[534552.835880]  [<ffffffffa02775af>] read_dev_cntr+0x4f/0x60 [hfi1]
[534552.835893]  [<ffffffffa028d7cd>] pma_get_opa_portstatus+0x64d/0x8c0 [hfi1]
[534552.835904]  [<ffffffffa0290e7d>] hfi1_process_mad+0x48d/0x18c0 [hfi1]
[534552.835908]  [<ffffffff811dc1f1>] ? __slab_free+0x81/0x2f0
[534552.835936]  [<ffffffffa024c34e>] ? ib_mad_recv_done+0x21e/0xa30 [ib_core]
[534552.835939]  [<ffffffff811dd153>] ? __kmalloc+0x1f3/0x240
[534552.835947]  [<ffffffffa024c3fb>] ib_mad_recv_done+0x2cb/0xa30 [ib_core]
[534552.835955]  [<ffffffffa0237c85>] __ib_process_cq+0x55/0xd0 [ib_core]
[534552.835962]  [<ffffffffa0237d70>] ib_cq_poll_work+0x20/0x60 [ib_core]
[534552.835964]  [<ffffffff810a7f3b>] process_one_work+0x17b/0x470
[534552.835966]  [<ffffffff810a8d76>] worker_thread+0x126/0x410
[534552.835969]  [<ffffffff810a8c50>] ? rescuer_thread+0x460/0x460
[534552.835971]  [<ffffffff810b052f>] kthread+0xcf/0xe0
[534552.835974]  [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140
[534552.835977]  [<ffffffff81696418>] ret_from_fork+0x58/0x90
[534552.835980]  [<ffffffff810b0460>] ? kthread_create_on_node+0x140/0x140

This issue is made worse when the 8051 is busy and the reads take longer.
Fix by using a non-spinning lock procure.

Reviewed-by: default avatarMichael J. Ruhl <michael.j.ruhl@intel.com>
Reviewed-by: default avatarMike Marciszyn <mike.marciniszyn@intel.com>
Signed-off-by: default avatarTadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: default avatarDennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: default avatarDoug Ledford <dledford@redhat.com>
parent b6eac931
Loading
Loading
Loading
Loading
+52 −34
Original line number Diff line number Diff line
@@ -6410,18 +6410,17 @@ static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
 *
 * The expectation is that the caller of this routine would have taken
 * care of properly transitioning the link into the correct state.
 * NOTE: the caller needs to acquire the dd->dc8051_lock lock
 *       before calling this function.
 */
static void dc_shutdown(struct hfi1_devdata *dd)
static void _dc_shutdown(struct hfi1_devdata *dd)
{
	unsigned long flags;
	lockdep_assert_held(&dd->dc8051_lock);

	spin_lock_irqsave(&dd->dc8051_lock, flags);
	if (dd->dc_shutdown) {
		spin_unlock_irqrestore(&dd->dc8051_lock, flags);
	if (dd->dc_shutdown)
		return;
	}

	dd->dc_shutdown = 1;
	spin_unlock_irqrestore(&dd->dc8051_lock, flags);
	/* Shutdown the LCB */
	lcb_shutdown(dd, 1);
	/*
@@ -6432,35 +6431,45 @@ static void dc_shutdown(struct hfi1_devdata *dd)
	write_csr(dd, DC_DC8051_CFG_RST, 0x1);
}

static void dc_shutdown(struct hfi1_devdata *dd)
{
	mutex_lock(&dd->dc8051_lock);
	_dc_shutdown(dd);
	mutex_unlock(&dd->dc8051_lock);
}

/*
 * Calling this after the DC has been brought out of reset should not
 * do any damage.
 * NOTE: the caller needs to acquire the dd->dc8051_lock lock
 *       before calling this function.
 */
static void dc_start(struct hfi1_devdata *dd)
static void _dc_start(struct hfi1_devdata *dd)
{
	unsigned long flags;
	int ret;
	lockdep_assert_held(&dd->dc8051_lock);

	spin_lock_irqsave(&dd->dc8051_lock, flags);
	if (!dd->dc_shutdown)
		goto done;
	spin_unlock_irqrestore(&dd->dc8051_lock, flags);
		return;

	/* Take the 8051 out of reset */
	write_csr(dd, DC_DC8051_CFG_RST, 0ull);
	/* Wait until 8051 is ready */
	ret = wait_fm_ready(dd, TIMEOUT_8051_START);
	if (ret) {
	if (wait_fm_ready(dd, TIMEOUT_8051_START))
		dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
			   __func__);
	}

	/* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
	write_csr(dd, DCC_CFG_RESET, 0x10);
	/* lcb_shutdown() with abort=1 does not restore these */
	write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
	spin_lock_irqsave(&dd->dc8051_lock, flags);
	dd->dc_shutdown = 0;
done:
	spin_unlock_irqrestore(&dd->dc8051_lock, flags);
}

static void dc_start(struct hfi1_devdata *dd)
{
	mutex_lock(&dd->dc8051_lock);
	_dc_start(dd);
	mutex_unlock(&dd->dc8051_lock);
}

/*
@@ -8513,16 +8522,11 @@ static int do_8051_command(
{
	u64 reg, completed;
	int return_code;
	unsigned long flags;
	unsigned long timeout;

	hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);

	/*
	 * Alternative to holding the lock for a long time:
	 * - keep busy wait - have other users bounce off
	 */
	spin_lock_irqsave(&dd->dc8051_lock, flags);
	mutex_lock(&dd->dc8051_lock);

	/* We can't send any commands to the 8051 if it's in reset */
	if (dd->dc_shutdown) {
@@ -8548,10 +8552,8 @@ static int do_8051_command(
			return_code = -ENXIO;
			goto fail;
		}
		spin_unlock_irqrestore(&dd->dc8051_lock, flags);
		dc_shutdown(dd);
		dc_start(dd);
		spin_lock_irqsave(&dd->dc8051_lock, flags);
		_dc_shutdown(dd);
		_dc_start(dd);
	}

	/*
@@ -8632,8 +8634,7 @@ static int do_8051_command(
	write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);

fail:
	spin_unlock_irqrestore(&dd->dc8051_lock, flags);

	mutex_unlock(&dd->dc8051_lock);
	return return_code;
}

@@ -12007,6 +12008,10 @@ static void free_cntrs(struct hfi1_devdata *dd)
	dd->scntrs = NULL;
	kfree(dd->cntrnames);
	dd->cntrnames = NULL;
	if (dd->update_cntr_wq) {
		destroy_workqueue(dd->update_cntr_wq);
		dd->update_cntr_wq = NULL;
	}
}

static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
@@ -12162,7 +12167,7 @@ u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
	return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
}

static void update_synth_timer(unsigned long opaque)
static void do_update_synth_timer(struct work_struct *work)
{
	u64 cur_tx;
	u64 cur_rx;
@@ -12171,8 +12176,8 @@ static void update_synth_timer(unsigned long opaque)
	int i, j, vl;
	struct hfi1_pportdata *ppd;
	struct cntr_entry *entry;

	struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
	struct hfi1_devdata *dd = container_of(work, struct hfi1_devdata,
					       update_cntr_work);

	/*
	 * Rather than keep beating on the CSRs pick a minimal set that we can
@@ -12255,7 +12260,13 @@ static void update_synth_timer(unsigned long opaque)
	} else {
		hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
	}
}

static void update_synth_timer(unsigned long opaque)
{
	struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;

	queue_work(dd->update_cntr_wq, &dd->update_cntr_work);
	mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
}

@@ -12491,6 +12502,13 @@ static int init_cntrs(struct hfi1_devdata *dd)
	if (init_cpu_counters(dd))
		goto bail;

	dd->update_cntr_wq = alloc_ordered_workqueue("hfi1_update_cntr_%d",
						     WQ_MEM_RECLAIM, dd->unit);
	if (!dd->update_cntr_wq)
		goto bail;

	INIT_WORK(&dd->update_cntr_work, do_update_synth_timer);

	mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
	return 0;
bail:
+4 −3
Original line number Diff line number Diff line
@@ -484,7 +484,7 @@ struct rvt_sge_state;
#define HFI1_PART_ENFORCE_OUT	0x2

/* how often we check for synthetic counter wrap around */
#define SYNTH_CNT_TIME 2
#define SYNTH_CNT_TIME 3

/* Counter flags */
#define CNTR_NORMAL		0x0 /* Normal counters, just read register */
@@ -962,8 +962,9 @@ struct hfi1_devdata {
	spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */
	/* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
	spinlock_t uctxt_lock; /* rcd and user context changes */
	/* exclusive access to 8051 */
	spinlock_t dc8051_lock;
	struct mutex dc8051_lock; /* exclusive access to 8051 */
	struct workqueue_struct *update_cntr_wq;
	struct work_struct update_cntr_work;
	/* exclusive access to 8051 memory */
	spinlock_t dc8051_memlock;
	int dc8051_timed_out;	/* remember if the 8051 timed out */
+1 −1
Original line number Diff line number Diff line
@@ -1081,11 +1081,11 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
	spin_lock_init(&dd->uctxt_lock);
	spin_lock_init(&dd->hfi1_diag_trans_lock);
	spin_lock_init(&dd->sc_init_lock);
	spin_lock_init(&dd->dc8051_lock);
	spin_lock_init(&dd->dc8051_memlock);
	seqlock_init(&dd->sc2vl_lock);
	spin_lock_init(&dd->sde_map_lock);
	spin_lock_init(&dd->pio_map_lock);
	mutex_init(&dd->dc8051_lock);
	init_waitqueue_head(&dd->event_queue);

	dd->int_counter = alloc_percpu(u64);