Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 90fed9c9 authored by David S. Miller's avatar David S. Miller
Browse files


Alexei Starovoitov says:

====================
pull-request: bpf-next 2018-05-24

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Björn Töpel cleans up AF_XDP (removes rebind, explicit cache alignment from uapi, etc).

2) David Ahern adds mtu checks to bpf_ipv{4,6}_fib_lookup() helpers.

3) Jesper Dangaard Brouer adds bulking support to ndo_xdp_xmit.

4) Jiong Wang adds support for indirect and arithmetic shifts to NFP

5) Martin KaFai Lau cleans up BTF uapi and makes the btf_header extensible.

6) Mathieu Xhonneux adds an End.BPF action to seg6local with BPF helpers allowing
   to edit/grow/shrink a SRH and apply on a packet generic SRv6 actions.

7) Sandipan Das adds support for bpf2bpf function calls in ppc64 JIT.

8) Yonghong Song adds BPF_TASK_FD_QUERY command for introspection of tracing events.

9) other misc fixes from Gustavo A. R. Silva, Sirio Balmelli, John Fastabend, and Magnus Karlsson
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 49a473f5 10f67868
Loading
Loading
Loading
Loading
+89 −21
Original line number Diff line number Diff line
@@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx)

static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, u64 func)
{
	unsigned int i, ctx_idx = ctx->idx;

	/* Load function address into r12 */
	PPC_LI64(12, func);

	/* For bpf-to-bpf function calls, the callee's address is unknown
	 * until the last extra pass. As seen above, we use PPC_LI64() to
	 * load the callee's address, but this may optimize the number of
	 * instructions required based on the nature of the address.
	 *
	 * Since we don't want the number of instructions emitted to change,
	 * we pad the optimized PPC_LI64() call with NOPs to guarantee that
	 * we always have a five-instruction sequence, which is the maximum
	 * that PPC_LI64() can emit.
	 */
	for (i = ctx->idx - ctx_idx; i < 5; i++)
		PPC_NOP();

#ifdef PPC64_ELF_ABI_v1
	/* func points to the function descriptor */
	PPC_LI64(b2p[TMP_REG_2], func);
	/* Load actual entry point from function descriptor */
	PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
	/* ... and move it to LR */
	PPC_MTLR(b2p[TMP_REG_1]);
	/*
	 * Load TOC from function descriptor at offset 8.
	 * We can clobber r2 since we get called through a
	 * function pointer (so caller will save/restore r2)
	 * and since we don't use a TOC ourself.
	 */
	PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
#else
	/* We can clobber r12 */
	PPC_FUNC_ADDR(12, func);
	PPC_MTLR(12);
	PPC_BPF_LL(2, 12, 8);
	/* Load actual entry point from function descriptor */
	PPC_BPF_LL(12, 12, 0);
#endif

	PPC_MTLR(12);
	PPC_BLRL();
}

@@ -256,7 +268,7 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32
/* Assemble the body code between the prologue & epilogue */
static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
			      struct codegen_context *ctx,
			      u32 *addrs)
			      u32 *addrs, bool extra_pass)
{
	const struct bpf_insn *insn = fp->insnsi;
	int flen = fp->len;
@@ -712,10 +724,24 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
			break;

		/*
		 * Call kernel helper
		 * Call kernel helper or bpf function
		 */
		case BPF_JMP | BPF_CALL:
			ctx->seen |= SEEN_FUNC;

			/* bpf function call */
			if (insn[i].src_reg == BPF_PSEUDO_CALL)
				if (!extra_pass)
					func = NULL;
				else if (fp->aux->func && off < fp->aux->func_cnt)
					/* use the subprog id from the off
					 * field to lookup the callee address
					 */
					func = (u8 *) fp->aux->func[off]->bpf_func;
				else
					return -EINVAL;
			/* kernel helper call */
			else
				func = (u8 *) __bpf_call_base + imm;

			bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -864,6 +890,14 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image,
	return 0;
}

struct powerpc64_jit_data {
	struct bpf_binary_header *header;
	u32 *addrs;
	u8 *image;
	u32 proglen;
	struct codegen_context ctx;
};

struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
{
	u32 proglen;
@@ -871,6 +905,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
	u8 *image = NULL;
	u32 *code_base;
	u32 *addrs;
	struct powerpc64_jit_data *jit_data;
	struct codegen_context cgctx;
	int pass;
	int flen;
@@ -878,6 +913,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
	struct bpf_prog *org_fp = fp;
	struct bpf_prog *tmp_fp;
	bool bpf_blinded = false;
	bool extra_pass = false;

	if (!fp->jit_requested)
		return org_fp;
@@ -891,11 +927,32 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
		fp = tmp_fp;
	}

	jit_data = fp->aux->jit_data;
	if (!jit_data) {
		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
		if (!jit_data) {
			fp = org_fp;
			goto out;
		}
		fp->aux->jit_data = jit_data;
	}

	flen = fp->len;
	addrs = jit_data->addrs;
	if (addrs) {
		cgctx = jit_data->ctx;
		image = jit_data->image;
		bpf_hdr = jit_data->header;
		proglen = jit_data->proglen;
		alloclen = proglen + FUNCTION_DESCR_SIZE;
		extra_pass = true;
		goto skip_init_ctx;
	}

	addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
	if (addrs == NULL) {
		fp = org_fp;
		goto out;
		goto out_addrs;
	}

	memset(&cgctx, 0, sizeof(struct codegen_context));
@@ -904,10 +961,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
	cgctx.stack_size = round_up(fp->aux->stack_depth, 16);

	/* Scouting faux-generate pass 0 */
	if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) {
	if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
		/* We hit something illegal or unsupported. */
		fp = org_fp;
		goto out;
		goto out_addrs;
	}

	/*
@@ -925,9 +982,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
			bpf_jit_fill_ill_insns);
	if (!bpf_hdr) {
		fp = org_fp;
		goto out;
		goto out_addrs;
	}

skip_init_ctx:
	code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);

	/* Code generation passes 1-2 */
@@ -935,7 +993,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
		/* Now build the prologue, body code & epilogue for real. */
		cgctx.idx = 0;
		bpf_jit_build_prologue(code_base, &cgctx);
		bpf_jit_build_body(fp, code_base, &cgctx, addrs);
		bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass);
		bpf_jit_build_epilogue(code_base, &cgctx);

		if (bpf_jit_enable > 1)
@@ -961,10 +1019,20 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
	fp->jited_len = alloclen;

	bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));

out:
	if (!fp->is_func || extra_pass) {
out_addrs:
		kfree(addrs);
		kfree(jit_data);
		fp->aux->jit_data = NULL;
	} else {
		jit_data->addrs = addrs;
		jit_data->ctx = cgctx;
		jit_data->proglen = proglen;
		jit_data->image = image;
		jit_data->header = bpf_hdr;
	}

out:
	if (bpf_blinded)
		bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp);

+19 −7
Original line number Diff line number Diff line
@@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 * @dev: netdev
 * @xdp: XDP buffer
 *
 * Returns Zero if sent, else an error code
 * Returns number of frames successfully sent. Frames that fail are
 * free'ed via XDP return API.
 *
 * For error cases, a negative errno code is returned and no-frames
 * are transmitted (caller must handle freeing frames).
 **/
int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
{
	struct i40e_netdev_priv *np = netdev_priv(dev);
	unsigned int queue_index = smp_processor_id();
	struct i40e_vsi *vsi = np->vsi;
	int err;
	int drops = 0;
	int i;

	if (test_bit(__I40E_VSI_DOWN, vsi->state))
		return -ENETDOWN;
@@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
		return -ENXIO;

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		int err;

		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
	if (err != I40E_XDP_TX)
		return -ENOSPC;
		if (err != I40E_XDP_TX) {
			xdp_return_frame_rx_napi(xdpf);
			drops++;
		}
	}

	return 0;
	return n - drops;
}

/**
+1 −1
Original line number Diff line number Diff line
@@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
void i40e_detect_recover_hung(struct i40e_vsi *vsi);
int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40e_chk_linearize(struct sk_buff *skb);
int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
void i40e_xdp_flush(struct net_device *dev);

/**
+15 −6
Original line number Diff line number Diff line
@@ -10022,11 +10022,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
	}
}

static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
static int ixgbe_xdp_xmit(struct net_device *dev, int n,
			  struct xdp_frame **frames)
{
	struct ixgbe_adapter *adapter = netdev_priv(dev);
	struct ixgbe_ring *ring;
	int err;
	int drops = 0;
	int i;

	if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
		return -ENETDOWN;
@@ -10038,11 +10040,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
	if (unlikely(!ring))
		return -ENXIO;

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		int err;

		err = ixgbe_xmit_xdp_ring(adapter, xdpf);
	if (err != IXGBE_XDP_TX)
		return -ENOSPC;
		if (err != IXGBE_XDP_TX) {
			xdp_return_frame_rx_napi(xdpf);
			drops++;
		}
	}

	return 0;
	return n - drops;
}

static void ixgbe_xdp_flush(struct net_device *dev)
+381 −29
Original line number Diff line number Diff line
@@ -211,6 +211,60 @@ emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
}

static void
__emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
	      bool set, bool src_lmextn)
{
	u16 addr_lo, addr_hi;
	u64 insn;

	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
	addr_hi = addr != addr_lo;

	insn = OP_BR_BIT_BASE |
		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
		FIELD_PREP(OP_BR_BIT_BV, set) |
		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);

	nfp_prog_push(nfp_prog, insn);
}

static void
emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
		 u8 defer, bool set, enum nfp_relo_type relo)
{
	struct nfp_insn_re_regs reg;
	int err;

	/* NOTE: The bit to test is specified as an rotation amount, such that
	 *	 the bit to test will be placed on the MSB of the result when
	 *	 doing a rotate right. For bit X, we need right rotate X + 1.
	 */
	bit += 1;

	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
	if (err) {
		nfp_prog->error = err;
		return;
	}

	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
		      reg.src_lmextn);

	nfp_prog->prog[nfp_prog->prog_len - 1] |=
		FIELD_PREP(OP_RELO_TYPE, relo);
}

static void
emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
{
	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
}

static void
__emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
	     enum immed_width width, bool invert,
@@ -309,6 +363,19 @@ emit_shf(struct nfp_prog *nfp_prog, swreg dst,
		   reg.dst_lmextn, reg.src_lmextn);
}

static void
emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
{
	if (sc == SHF_SC_R_ROT) {
		pr_err("indirect shift is not allowed on rotation\n");
		nfp_prog->error = -EFAULT;
		return;
	}

	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
}

static void
__emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
@@ -1629,28 +1696,144 @@ static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
	return 0;
}

/* Pseudo code:
 *   if shift_amt >= 32
 *     dst_high = dst_low << shift_amt[4:0]
 *     dst_low = 0;
 *   else
 *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
 *     dst_low = dst_low << shift_amt
 *
 * The indirect shift will use the same logic at runtime.
 */
static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
{
	if (shift_amt < 32) {
		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
			 32 - shift_amt);
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
	} else if (shift_amt == 32) {
		wrp_reg_mov(nfp_prog, dst + 1, dst);
		wrp_immed(nfp_prog, reg_both(dst), 0);
	} else if (shift_amt > 32) {
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
		wrp_immed(nfp_prog, reg_both(dst), 0);
	}

	return 0;
}

static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u8 dst = insn->dst_reg * 2;

	if (insn->imm < 32) {
		emit_shf(nfp_prog, reg_both(dst + 1),
			 reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
			 SHF_SC_R_DSHF, 32 - insn->imm);
		emit_shf(nfp_prog, reg_both(dst),
			 reg_none(), SHF_OP_NONE, reg_b(dst),
			 SHF_SC_L_SHF, insn->imm);
	} else if (insn->imm == 32) {
		wrp_reg_mov(nfp_prog, dst + 1, dst);
		wrp_immed(nfp_prog, reg_both(dst), 0);
	} else if (insn->imm > 32) {
		emit_shf(nfp_prog, reg_both(dst + 1),
			 reg_none(), SHF_OP_NONE, reg_b(dst),
			 SHF_SC_L_SHF, insn->imm - 32);
	return __shl_imm64(nfp_prog, dst, insn->imm);
}

static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
		 reg_b(src));
	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_R_DSHF);
}

/* NOTE: for indirect left shift, HIGH part should be calculated first. */
static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_L_SHF);
}

static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	shl_reg64_lt32_high(nfp_prog, dst, src);
	shl_reg64_lt32_low(nfp_prog, dst, src);
}

static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_L_SHF);
	wrp_immed(nfp_prog, reg_both(dst), 0);
}

static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin;
	umax = meta->umax;
	if (umin == umax)
		return __shl_imm64(nfp_prog, dst, umin);

	src = insn->src_reg * 2;
	if (umax < 32) {
		shl_reg64_lt32(nfp_prog, dst, src);
	} else if (umin >= 32) {
		shl_reg64_ge32(nfp_prog, dst, src);
	} else {
		/* Generate different instruction sequences depending on runtime
		 * value of shift amount.
		 */
		u16 label_ge32, label_end;

		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);

		shl_reg64_lt32_high(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* shl_reg64_lt32_low packed in delay slot. */
		shl_reg64_lt32_low(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
			return -EINVAL;
		shl_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
			return -EINVAL;
	}

	return 0;
}

/* Pseudo code:
 *   if shift_amt >= 32
 *     dst_high = 0;
 *     dst_low = dst_high >> shift_amt[4:0]
 *   else
 *     dst_high = dst_high >> shift_amt
 *     dst_low = (dst_high, dst_low) >> shift_amt
 *
 * The indirect shift will use the same logic at runtime.
 */
static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
{
	if (shift_amt < 32) {
		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
	} else if (shift_amt == 32) {
		wrp_reg_mov(nfp_prog, dst, dst + 1);
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
	} else if (shift_amt > 32) {
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
	}

	return 0;
}

@@ -1659,23 +1842,188 @@ static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
	const struct bpf_insn *insn = &meta->insn;
	u8 dst = insn->dst_reg * 2;

	if (insn->imm < 32) {
		emit_shf(nfp_prog, reg_both(dst),
			 reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
			 SHF_SC_R_DSHF, insn->imm);
		emit_shf(nfp_prog, reg_both(dst + 1),
			 reg_none(), SHF_OP_NONE, reg_b(dst + 1),
			 SHF_SC_R_SHF, insn->imm);
	} else if (insn->imm == 32) {
		wrp_reg_mov(nfp_prog, dst, dst + 1);
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
	} else if (insn->imm > 32) {
		emit_shf(nfp_prog, reg_both(dst),
			 reg_none(), SHF_OP_NONE, reg_b(dst + 1),
			 SHF_SC_R_SHF, insn->imm - 32);
	return __shr_imm64(nfp_prog, dst, insn->imm);
}

/* NOTE: for indirect right shift, LOW part should be calculated first. */
static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
		       reg_b(dst + 1), SHF_SC_R_SHF);
}

static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_R_DSHF);
}

static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	shr_reg64_lt32_low(nfp_prog, dst, src);
	shr_reg64_lt32_high(nfp_prog, dst, src);
}

static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		       reg_b(dst + 1), SHF_SC_R_SHF);
	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
}

static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin;
	umax = meta->umax;
	if (umin == umax)
		return __shr_imm64(nfp_prog, dst, umin);

	src = insn->src_reg * 2;
	if (umax < 32) {
		shr_reg64_lt32(nfp_prog, dst, src);
	} else if (umin >= 32) {
		shr_reg64_ge32(nfp_prog, dst, src);
	} else {
		/* Generate different instruction sequences depending on runtime
		 * value of shift amount.
		 */
		u16 label_ge32, label_end;

		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
		shr_reg64_lt32_low(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* shr_reg64_lt32_high packed in delay slot. */
		shr_reg64_lt32_high(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
			return -EINVAL;
		shr_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
			return -EINVAL;
	}

	return 0;
}

/* Code logic is the same as __shr_imm64 except ashr requires signedness bit
 * told through PREV_ALU result.
 */
static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
{
	if (shift_amt < 32) {
		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
		/* Set signedness bit. */
		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
			 reg_imm(0));
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
	} else if (shift_amt == 32) {
		/* NOTE: this also helps setting signedness bit. */
		wrp_reg_mov(nfp_prog, dst, dst + 1);
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
	} else if (shift_amt > 32) {
		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
			 reg_imm(0));
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
	}

	return 0;
}

static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u8 dst = insn->dst_reg * 2;

	return __ashr_imm64(nfp_prog, dst, insn->imm);
}

static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	/* NOTE: the first insn will set both indirect shift amount (source A)
	 * and signedness bit (MSB of result).
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
		       reg_b(dst + 1), SHF_SC_R_SHF);
}

static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	/* NOTE: it is the same as logic shift because we don't need to shift in
	 * signedness bit when the shift amount is less than 32.
	 */
	return shr_reg64_lt32_low(nfp_prog, dst, src);
}

static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	ashr_reg64_lt32_low(nfp_prog, dst, src);
	ashr_reg64_lt32_high(nfp_prog, dst, src);
}

static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
		       reg_b(dst + 1), SHF_SC_R_SHF);
	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
}

/* Like ashr_imm64, but need to use indirect shift. */
static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin;
	umax = meta->umax;
	if (umin == umax)
		return __ashr_imm64(nfp_prog, dst, umin);

	src = insn->src_reg * 2;
	if (umax < 32) {
		ashr_reg64_lt32(nfp_prog, dst, src);
	} else if (umin >= 32) {
		ashr_reg64_ge32(nfp_prog, dst, src);
	} else {
		u16 label_ge32, label_end;

		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
		ashr_reg64_lt32_low(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* ashr_reg64_lt32_high packed in delay slot. */
		ashr_reg64_lt32_high(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
			return -EINVAL;
		ashr_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
			return -EINVAL;
	}

	return 0;
}

@@ -2501,8 +2849,12 @@ static const instr_cb_t instr_cb[256] = {
	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
Loading