Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 73dec95e authored by Tvrtko Ursulin's avatar Tvrtko Ursulin
Browse files

drm/i915: Emit to ringbuffer directly



This removes the usage of intel_ring_emit in favour of
directly writing to the ring buffer.

intel_ring_emit was preventing the compiler for optimising
fetch and increment of the current ring buffer pointer and
therefore generating very verbose code for every write.

It had no useful purpose since all ringbuffer operations
are started and ended with intel_ring_begin and
intel_ring_advance respectively, with no bail out in the
middle possible, so it is fine to increment the tail in
intel_ring_begin and let the code manage the pointer
itself.

Useless instruction removal amounts to approximately
two and half kilobytes of saved text on my build.

Not sure if this has any measurable performance
implications but executing a ton of useless instructions
on fast paths cannot be good.

v2:
 * Change return from intel_ring_begin to error pointer by
   popular demand.
 * Move tail increment to intel_ring_advance to enable some
   error checking.

v3:
 * Move tail advance back into intel_ring_begin.
 * Rebase and tidy.

v4:
 * Complete rebase after a few months since v3.

v5:
 * Remove unecessary cast and fix !debug compile. (Chris Wilson)

v6:
 * Make intel_ring_offset take request as well.
 * Fix recording of request postfix plus a sprinkle of asserts.
   (Chris Wilson)

v7:
 * Use intel_ring_offset to get the postfix. (Chris Wilson)
 * Convert GVT code as well.

v8:
 * Rename *out++ to *cs++.

v9:
 * Fix GVT out to cs conversion in GVT.

v10:
 * Rebase for new intel_ring_begin in selftests.

Signed-off-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Reviewed-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Acked-by: default avatarJoonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170214113242.29241-1-tvrtko.ursulin@linux.intel.com
parent d2d15016
Loading
Loading
Loading
Loading
+15 −21
Original line number Diff line number Diff line
@@ -1513,7 +1513,7 @@ static int copy_gma_to_hva(struct intel_vgpu *vgpu, struct intel_vgpu_mm *mm,
		len += copy_len;
		gma += copy_len;
	}
	return 0;
	return len;
}


@@ -1630,7 +1630,7 @@ static int perform_bb_shadow(struct parser_exec_state *s)
	ret = copy_gma_to_hva(s->vgpu, s->vgpu->gtt.ggtt_mm,
			      gma, gma + bb_size,
			      dst);
	if (ret) {
	if (ret < 0) {
		gvt_err("fail to copy guest ring buffer\n");
		goto unmap_src;
	}
@@ -2594,11 +2594,8 @@ static int scan_wa_ctx(struct intel_shadow_wa_ctx *wa_ctx)
static int shadow_workload_ring_buffer(struct intel_vgpu_workload *workload)
{
	struct intel_vgpu *vgpu = workload->vgpu;
	int ring_id = workload->ring_id;
	struct i915_gem_context *shadow_ctx = vgpu->shadow_ctx;
	struct intel_ring *ring = shadow_ctx->engine[ring_id].ring;
	unsigned long gma_head, gma_tail, gma_top, guest_rb_size;
	unsigned int copy_len = 0;
	u32 *cs;
	int ret;

	guest_rb_size = _RING_CTL_BUF_SIZE(workload->rb_ctl);
@@ -2612,36 +2609,33 @@ static int shadow_workload_ring_buffer(struct intel_vgpu_workload *workload)
	gma_top = workload->rb_start + guest_rb_size;

	/* allocate shadow ring buffer */
	ret = intel_ring_begin(workload->req, workload->rb_len / 4);
	if (ret)
		return ret;
	cs = intel_ring_begin(workload->req, workload->rb_len / sizeof(u32));
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* get shadow ring buffer va */
	workload->shadow_ring_buffer_va = ring->vaddr + ring->tail;
	workload->shadow_ring_buffer_va = cs;

	/* head > tail --> copy head <-> top */
	if (gma_head > gma_tail) {
		ret = copy_gma_to_hva(vgpu, vgpu->gtt.ggtt_mm,
				gma_head, gma_top,
				workload->shadow_ring_buffer_va);
		if (ret) {
				      gma_head, gma_top, cs);
		if (ret < 0) {
			gvt_err("fail to copy guest ring buffer\n");
			return ret;
		}
		copy_len = gma_top - gma_head;
		cs += ret / sizeof(u32);
		gma_head = workload->rb_start;
	}

	/* copy head or start <-> tail */
	ret = copy_gma_to_hva(vgpu, vgpu->gtt.ggtt_mm,
			gma_head, gma_tail,
			workload->shadow_ring_buffer_va + copy_len);
	if (ret) {
	ret = copy_gma_to_hva(vgpu, vgpu->gtt.ggtt_mm, gma_head, gma_tail, cs);
	if (ret < 0) {
		gvt_err("fail to copy guest ring buffer\n");
		return ret;
	}
	ring->tail += workload->rb_len;
	intel_ring_advance(ring);
	cs += ret / sizeof(u32);
	intel_ring_advance(workload->req, cs);
	return 0;
}

@@ -2695,7 +2689,7 @@ static int shadow_indirect_ctx(struct intel_shadow_wa_ctx *wa_ctx)
				wa_ctx->workload->vgpu->gtt.ggtt_mm,
				guest_gma, guest_gma + ctx_size,
				map);
	if (ret) {
	if (ret < 0) {
		gvt_err("fail to copy guest indirect ctx\n");
		goto unmap_src;
	}
+34 −42
Original line number Diff line number Diff line
@@ -596,10 +596,9 @@ static inline int
mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
{
	struct drm_i915_private *dev_priv = req->i915;
	struct intel_ring *ring = req->ring;
	struct intel_engine_cs *engine = req->engine;
	enum intel_engine_id id;
	u32 flags = hw_flags | MI_MM_SPACE_GTT;
	u32 *cs, flags = hw_flags | MI_MM_SPACE_GTT;
	const int num_rings =
		/* Use an extended w/a on ivb+ if signalling from other rings */
		i915.semaphores ?
@@ -629,99 +628,92 @@ mi_set_context(struct drm_i915_gem_request *req, u32 hw_flags)
	if (INTEL_GEN(dev_priv) >= 7)
		len += 2 + (num_rings ? 4*num_rings + 6 : 0);

	ret = intel_ring_begin(req, len);
	if (ret)
		return ret;
	cs = intel_ring_begin(req, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */
	if (INTEL_GEN(dev_priv) >= 7) {
		intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_DISABLE);
		*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
		if (num_rings) {
			struct intel_engine_cs *signaller;

			intel_ring_emit(ring,
					MI_LOAD_REGISTER_IMM(num_rings));
			*cs++ = MI_LOAD_REGISTER_IMM(num_rings);
			for_each_engine(signaller, dev_priv, id) {
				if (signaller == engine)
					continue;

				intel_ring_emit_reg(ring,
				*cs++ = i915_mmio_reg_offset(
					   RING_PSMI_CTL(signaller->mmio_base));
				intel_ring_emit(ring,
						_MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
				*cs++ = _MASKED_BIT_ENABLE(
						GEN6_PSMI_SLEEP_MSG_DISABLE);
			}
		}
	}

	intel_ring_emit(ring, MI_NOOP);
	intel_ring_emit(ring, MI_SET_CONTEXT);
	intel_ring_emit(ring,
			i915_ggtt_offset(req->ctx->engine[RCS].state) | flags);
	*cs++ = MI_NOOP;
	*cs++ = MI_SET_CONTEXT;
	*cs++ = i915_ggtt_offset(req->ctx->engine[RCS].state) | flags;
	/*
	 * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP
	 * WaMiSetContext_Hang:snb,ivb,vlv
	 */
	intel_ring_emit(ring, MI_NOOP);
	*cs++ = MI_NOOP;

	if (INTEL_GEN(dev_priv) >= 7) {
		if (num_rings) {
			struct intel_engine_cs *signaller;
			i915_reg_t last_reg = {}; /* keep gcc quiet */

			intel_ring_emit(ring,
					MI_LOAD_REGISTER_IMM(num_rings));
			*cs++ = MI_LOAD_REGISTER_IMM(num_rings);
			for_each_engine(signaller, dev_priv, id) {
				if (signaller == engine)
					continue;

				last_reg = RING_PSMI_CTL(signaller->mmio_base);
				intel_ring_emit_reg(ring, last_reg);
				intel_ring_emit(ring,
						_MASKED_BIT_DISABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
				*cs++ = i915_mmio_reg_offset(last_reg);
				*cs++ = _MASKED_BIT_DISABLE(
						GEN6_PSMI_SLEEP_MSG_DISABLE);
			}

			/* Insert a delay before the next switch! */
			intel_ring_emit(ring,
					MI_STORE_REGISTER_MEM |
					MI_SRM_LRM_GLOBAL_GTT);
			intel_ring_emit_reg(ring, last_reg);
			intel_ring_emit(ring,
					i915_ggtt_offset(engine->scratch));
			intel_ring_emit(ring, MI_NOOP);
			*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
			*cs++ = i915_mmio_reg_offset(last_reg);
			*cs++ = i915_ggtt_offset(engine->scratch);
			*cs++ = MI_NOOP;
		}
		intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE);
		*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	}

	intel_ring_advance(ring);
	intel_ring_advance(req, cs);

	return ret;
}

static int remap_l3(struct drm_i915_gem_request *req, int slice)
{
	u32 *remap_info = req->i915->l3_parity.remap_info[slice];
	struct intel_ring *ring = req->ring;
	int i, ret;
	u32 *cs, *remap_info = req->i915->l3_parity.remap_info[slice];
	int i;

	if (!remap_info)
		return 0;

	ret = intel_ring_begin(req, GEN7_L3LOG_SIZE/4 * 2 + 2);
	if (ret)
		return ret;
	cs = intel_ring_begin(req, GEN7_L3LOG_SIZE/4 * 2 + 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * Note: We do not worry about the concurrent register cacheline hang
	 * here because no other code should access these registers other than
	 * at initialization time.
	 */
	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4));
	*cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
	for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
		intel_ring_emit_reg(ring, GEN7_L3LOG(slice, i));
		intel_ring_emit(ring, remap_info[i]);
		*cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
		*cs++ = remap_info[i];
	}
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);
	*cs++ = MI_NOOP;
	intel_ring_advance(req, cs);

	return 0;
}
+19 −21
Original line number Diff line number Diff line
@@ -1336,25 +1336,25 @@ i915_gem_execbuffer_move_to_active(struct list_head *vmas,
static int
i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
{
	struct intel_ring *ring = req->ring;
	int ret, i;
	u32 *cs;
	int i;

	if (!IS_GEN7(req->i915) || req->engine->id != RCS) {
		DRM_DEBUG("sol reset is gen7/rcs only\n");
		return -EINVAL;
	}

	ret = intel_ring_begin(req, 4 * 3);
	if (ret)
		return ret;
	cs = intel_ring_begin(req, 4 * 3);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	for (i = 0; i < 4; i++) {
		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
		intel_ring_emit_reg(ring, GEN7_SO_WRITE_OFFSET(i));
		intel_ring_emit(ring, 0);
		*cs++ = MI_LOAD_REGISTER_IMM(1);
		*cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i));
		*cs++ = 0;
	}

	intel_ring_advance(ring);
	intel_ring_advance(req, cs);

	return 0;
}
@@ -1415,7 +1415,7 @@ execbuf_submit(struct i915_execbuffer_params *params,
	struct drm_i915_private *dev_priv = params->request->i915;
	u64 exec_start, exec_len;
	int instp_mode;
	u32 instp_mask;
	u32 instp_mask, *cs;
	int ret;

	ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas);
@@ -1461,17 +1461,15 @@ execbuf_submit(struct i915_execbuffer_params *params,

	if (params->engine->id == RCS &&
	    instp_mode != dev_priv->relative_constants_mode) {
		struct intel_ring *ring = params->request->ring;

		ret = intel_ring_begin(params->request, 4);
		if (ret)
			return ret;

		intel_ring_emit(ring, MI_NOOP);
		intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
		intel_ring_emit_reg(ring, INSTPM);
		intel_ring_emit(ring, instp_mask << 16 | instp_mode);
		intel_ring_advance(ring);
		cs = intel_ring_begin(params->request, 4);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		*cs++ = MI_NOOP;
		*cs++ = MI_LOAD_REGISTER_IMM(1);
		*cs++ = i915_mmio_reg_offset(INSTPM);
		*cs++ = instp_mask << 16 | instp_mode;
		intel_ring_advance(params->request, cs);

		dev_priv->relative_constants_mode = instp_mode;
	}
+34 −35
Original line number Diff line number Diff line
@@ -686,23 +686,22 @@ static int gen8_write_pdp(struct drm_i915_gem_request *req,
			  unsigned entry,
			  dma_addr_t addr)
{
	struct intel_ring *ring = req->ring;
	struct intel_engine_cs *engine = req->engine;
	int ret;
	u32 *cs;

	BUG_ON(entry >= 4);

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;
	cs = intel_ring_begin(req, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
	intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, entry));
	intel_ring_emit(ring, upper_32_bits(addr));
	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
	intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, entry));
	intel_ring_emit(ring, lower_32_bits(addr));
	intel_ring_advance(ring);
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, entry));
	*cs++ = upper_32_bits(addr);
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, entry));
	*cs++ = lower_32_bits(addr);
	intel_ring_advance(req, cs);

	return 0;
}
@@ -1730,8 +1729,8 @@ static uint32_t get_pd_offset(struct i915_hw_ppgtt *ppgtt)
static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
			 struct drm_i915_gem_request *req)
{
	struct intel_ring *ring = req->ring;
	struct intel_engine_cs *engine = req->engine;
	u32 *cs;
	int ret;

	/* NB: TLBs must be flushed and invalidated before a switch */
@@ -1739,17 +1738,17 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
	if (ret)
		return ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;
	cs = intel_ring_begin(req, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(2));
	intel_ring_emit_reg(ring, RING_PP_DIR_DCLV(engine));
	intel_ring_emit(ring, PP_DIR_DCLV_2G);
	intel_ring_emit_reg(ring, RING_PP_DIR_BASE(engine));
	intel_ring_emit(ring, get_pd_offset(ppgtt));
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);
	*cs++ = MI_LOAD_REGISTER_IMM(2);
	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine));
	*cs++ = PP_DIR_DCLV_2G;
	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine));
	*cs++ = get_pd_offset(ppgtt);
	*cs++ = MI_NOOP;
	intel_ring_advance(req, cs);

	return 0;
}
@@ -1757,8 +1756,8 @@ static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt,
static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
			  struct drm_i915_gem_request *req)
{
	struct intel_ring *ring = req->ring;
	struct intel_engine_cs *engine = req->engine;
	u32 *cs;
	int ret;

	/* NB: TLBs must be flushed and invalidated before a switch */
@@ -1766,17 +1765,17 @@ static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt,
	if (ret)
		return ret;

	ret = intel_ring_begin(req, 6);
	if (ret)
		return ret;

	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(2));
	intel_ring_emit_reg(ring, RING_PP_DIR_DCLV(engine));
	intel_ring_emit(ring, PP_DIR_DCLV_2G);
	intel_ring_emit_reg(ring, RING_PP_DIR_BASE(engine));
	intel_ring_emit(ring, get_pd_offset(ppgtt));
	intel_ring_emit(ring, MI_NOOP);
	intel_ring_advance(ring);
	cs = intel_ring_begin(req, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_LOAD_REGISTER_IMM(2);
	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine));
	*cs++ = PP_DIR_DCLV_2G;
	*cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine));
	*cs++ = get_pd_offset(ppgtt);
	*cs++ = MI_NOOP;
	intel_ring_advance(req, cs);

	/* XXX: RCS is the only one to auto invalidate the TLBs? */
	if (engine->id != RCS) {
+4 −4
Original line number Diff line number Diff line
@@ -824,6 +824,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
	struct intel_ring *ring = request->ring;
	struct intel_timeline *timeline = request->timeline;
	struct drm_i915_gem_request *prev;
	u32 *cs;
	int err;

	lockdep_assert_held(&request->i915->drm.struct_mutex);
@@ -862,10 +863,9 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
	 * GPU processing the request, we never over-estimate the
	 * position of the ring's HEAD.
	 */
	err = intel_ring_begin(request, engine->emit_breadcrumb_sz);
	GEM_BUG_ON(err);
	request->postfix = ring->tail;
	ring->tail += engine->emit_breadcrumb_sz * sizeof(u32);
	cs = intel_ring_begin(request, engine->emit_breadcrumb_sz);
	GEM_BUG_ON(IS_ERR(cs));
	request->postfix = intel_ring_offset(request, cs);

	/* Seal the request and mark it as pending execution. Note that
	 * we may inspect this state, without holding any locks, during
Loading