Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6d2cb5aa authored by Chris Wilson
Browse files

drm/i915/execlists: Read the context-status buffer from the HWSP



The engine provides a mirror of the CSB in the HWSP. If we use the
cacheable reads from the HWSP, we can shave off a few mmio reads per
context-switch interrupt (which are quite frequent!). Just removing a
couple of mmio is not enough to actually reduce any latency, but a small
reduction in overall cpu usage.

Much appreciation for Ben dropping the bombshell that the CSB was in the
HWSP and for Michel in digging out the details.

v2: Don't be lazy, add the defines for the indices.
v3: Include the HWSP in debugfs/i915_engine_info
v4: Check for GVT-g, it currently depends on intercepting CSB mmio
v5: Fixup GVT-g mmio path
v6: Disable HWSP if VT-d is active as the iommu adds unpredictable
memory latency. (Mika)
v7: Also markup the CSB read with READ_ONCE() as it may still be an mmio
read and we want to stop the compiler from issuing a later (v.slow) reload.

Suggested-by: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Acked-by: Michel Thierry <michel.thierry@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20170913133534.26927-1-chris@chris-wilson.co.uk


Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
parent 34a04e5e
Loading
Loading
Loading
Loading
+5 −2
Original line number Original line Diff line number Diff line
@@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
			   upper_32_bits(addr), lower_32_bits(addr));
			   upper_32_bits(addr), lower_32_bits(addr));


		if (i915.enable_execlists) {
		if (i915.enable_execlists) {
			const u32 *hws = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
			u32 ptr, read, write;
			u32 ptr, read, write;
			unsigned int idx;
			unsigned int idx;


@@ -3337,10 +3338,12 @@ static int i915_engine_info(struct seq_file *m, void *unused)
				write += GEN8_CSB_ENTRIES;
				write += GEN8_CSB_ENTRIES;
			while (read < write) {
			while (read < write) {
				idx = ++read % GEN8_CSB_ENTRIES;
				idx = ++read % GEN8_CSB_ENTRIES;
				seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
				seq_printf(m, "\tExeclist CSB[%d]: 0x%08x [0x%08x in hwsp], context: %d [%d in hwsp]\n",
					   idx,
					   idx,
					   I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
					   I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
					   I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)));
					   hws[idx * 2],
					   I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)),
					   hws[idx * 2 + 1]);
			}
			}


			rcu_read_lock();
			rcu_read_lock();
+30 −5
Original line number Original line Diff line number Diff line
@@ -541,10 +541,17 @@ static void intel_lrc_irq_handler(unsigned long data)
	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
	while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) {
		u32 __iomem *csb_mmio =
		u32 __iomem *csb_mmio =
			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
		u32 __iomem *buf =
		/* The HWSP contains a (cacheable) mirror of the CSB */
			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
		const u32 *buf =
			&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
		unsigned int head, tail;
		unsigned int head, tail;


		/* However GVT emulation depends upon intercepting CSB mmio */
		if (unlikely(engine->csb_use_mmio)) {
			buf = (u32 * __force)
				(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)));
		}

		/* The write will be ordered by the uncached read (itself
		/* The write will be ordered by the uncached read (itself
		 * a memory barrier), so we do not need another in the form
		 * a memory barrier), so we do not need another in the form
		 * of a locked instruction. The race between the interrupt
		 * of a locked instruction. The race between the interrupt
@@ -584,13 +591,12 @@ static void intel_lrc_irq_handler(unsigned long data)
			 * status notifier.
			 * status notifier.
			 */
			 */


			status = readl(buf + 2 * head);
			status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
				continue;
				continue;


			/* Check the context/desc id for this event matches */
			/* Check the context/desc id for this event matches */
			GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
			GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
					 port->context_id);


			rq = port_unpack(port, &count);
			rq = port_unpack(port, &count);
			GEM_BUG_ON(count == 0);
			GEM_BUG_ON(count == 0);
@@ -1720,6 +1726,23 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
}
}


static bool irq_handler_force_mmio(struct drm_i915_private *i915)
{
	/*
	 * Decide whether the context-status buffer must still be read via
	 * mmio rather than the cacheable HWSP mirror.
	 *
	 * GVT emulation depends upon intercepting CSB mmio; and when the
	 * IOMMU is active, its unpredictable latency can delay the CSB
	 * write (from the GPU into the HWSP) until after the interrupt
	 * has fired (missed breadcrumb syndrome). Force mmio in either
	 * case.
	 */
	return intel_vgpu_active(i915) || intel_vtd_active();
}

static void
static void
logical_ring_setup(struct intel_engine_cs *engine)
logical_ring_setup(struct intel_engine_cs *engine)
{
{
@@ -1731,6 +1754,8 @@ logical_ring_setup(struct intel_engine_cs *engine)
	/* Intentionally left blank. */
	/* Intentionally left blank. */
	engine->buffer = NULL;
	engine->buffer = NULL;


	engine->csb_use_mmio = irq_handler_force_mmio(dev_priv);

	fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
	fw_domains = intel_uncore_forcewake_for_reg(dev_priv,
						    RING_ELSP(engine),
						    RING_ELSP(engine),
						    FW_REG_WRITE);
						    FW_REG_WRITE);
+3 −0
Original line number Original line Diff line number Diff line
@@ -391,6 +391,7 @@ struct intel_engine_cs {
	struct rb_root execlist_queue;
	struct rb_root execlist_queue;
	struct rb_node *execlist_first;
	struct rb_node *execlist_first;
	unsigned int fw_domains;
	unsigned int fw_domains;
	bool csb_use_mmio;


	/* Contexts are pinned whilst they are active on the GPU. The last
	/* Contexts are pinned whilst they are active on the GPU. The last
	 * context executed remains active whilst the GPU is idle - the
	 * context executed remains active whilst the GPU is idle - the
@@ -496,6 +497,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
#define I915_GEM_HWS_SCRATCH_INDEX	0x40
#define I915_GEM_HWS_SCRATCH_INDEX	0x40
#define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
#define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)


#define I915_HWS_CSB_BUF0_INDEX		0x10

struct intel_ring *
struct intel_ring *
intel_engine_create_ring(struct intel_engine_cs *engine, int size);
intel_engine_create_ring(struct intel_engine_cs *engine, int size);
int intel_ring_pin(struct intel_ring *ring,
int intel_ring_pin(struct intel_ring *ring,