Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next (90fed9c9) · Commits · e / devices / android_kernel_fairphone_FP4

arch/powerpc/net/bpf_jit_comp64.c

+89 −21

Original line number	Diff line number	Diff line
		@@ -167,25 +167,37 @@ static void bpf_jit_build_epilogue(u32 image, struct codegen_context ctx)

		static void bpf_jit_emit_func_call(u32 image, struct codegen_context ctx, u64 func)
		{
		unsigned int i, ctx_idx = ctx->idx;

		/* Load function address into r12 */
		PPC_LI64(12, func);

		/* For bpf-to-bpf function calls, the callee's address is unknown
		* until the last extra pass. As seen above, we use PPC_LI64() to
		* load the callee's address, but this may optimize the number of
		* instructions required based on the nature of the address.
		*
		* Since we don't want the number of instructions emitted to change,
		* we pad the optimized PPC_LI64() call with NOPs to guarantee that
		* we always have a five-instruction sequence, which is the maximum
		* that PPC_LI64() can emit.
		*/
		for (i = ctx->idx - ctx_idx; i < 5; i++)
		PPC_NOP();

		#ifdef PPC64_ELF_ABI_v1
		/* func points to the function descriptor */
		PPC_LI64(b2p[TMP_REG_2], func);
		/* Load actual entry point from function descriptor */
		PPC_BPF_LL(b2p[TMP_REG_1], b2p[TMP_REG_2], 0);
		/* ... and move it to LR */
		PPC_MTLR(b2p[TMP_REG_1]);
		/*
		* Load TOC from function descriptor at offset 8.
		* We can clobber r2 since we get called through a
		* function pointer (so caller will save/restore r2)
		* and since we don't use a TOC ourself.
		*/
		PPC_BPF_LL(2, b2p[TMP_REG_2], 8);
		#else
		/* We can clobber r12 */
		PPC_FUNC_ADDR(12, func);
		PPC_MTLR(12);
		PPC_BPF_LL(2, 12, 8);
		/* Load actual entry point from function descriptor */
		PPC_BPF_LL(12, 12, 0);
		#endif

		PPC_MTLR(12);
		PPC_BLRL();
		}

		@@ -256,7 +268,7 @@ static void bpf_jit_emit_tail_call(u32 image, struct codegen_context ctx, u32
		/* Assemble the body code between the prologue & epilogue */
		static int bpf_jit_build_body(struct bpf_prog fp, u32 image,
		struct codegen_context *ctx,
		u32 *addrs)
		u32 *addrs, bool extra_pass)
		{
		const struct bpf_insn *insn = fp->insnsi;
		int flen = fp->len;
		@@ -712,10 +724,24 @@ static int bpf_jit_build_body(struct bpf_prog fp, u32 image,
		break;

		/*
		* Call kernel helper
		* Call kernel helper or bpf function
		*/
		case BPF_JMP \| BPF_CALL:
		ctx->seen \|= SEEN_FUNC;

		/* bpf function call */
		if (insn[i].src_reg == BPF_PSEUDO_CALL)
		if (!extra_pass)
		func = NULL;
		else if (fp->aux->func && off < fp->aux->func_cnt)
		/* use the subprog id from the off
		* field to lookup the callee address
		*/
		func = (u8 *) fp->aux->func[off]->bpf_func;
		else
		return -EINVAL;
		/* kernel helper call */
		else
		func = (u8 *) __bpf_call_base + imm;

		bpf_jit_emit_func_call(image, ctx, (u64)func);
		@@ -864,6 +890,14 @@ static int bpf_jit_build_body(struct bpf_prog fp, u32 image,
		return 0;
		}

		/*
		 * JIT state stashed in fp->aux->jit_data after the first compile of a
		 * bpf function so that a later invocation (the "extra pass") can pick up
		 * the already-allocated image and codegen context instead of starting
		 * from scratch.
		 */
		struct powerpc64_jit_data {
		/* binary header of the allocated image (saved from bpf_hdr) */
		struct bpf_binary_header *header;
		/* per-instruction address table; non-NULL marks "extra pass" */
		u32 *addrs;
		/* start of the JITed image (saved from image) */
		u8 *image;
		/* program length computed by the first compile (saved from proglen) */
		u32 proglen;
		/* codegen context snapshot restored at the start of the extra pass */
		struct codegen_context ctx;
		};

		struct bpf_prog bpf_int_jit_compile(struct bpf_prog fp)
		{
		u32 proglen;
		@@ -871,6 +905,7 @@ struct bpf_prog bpf_int_jit_compile(struct bpf_prog fp)
		u8 *image = NULL;
		u32 *code_base;
		u32 *addrs;
		struct powerpc64_jit_data *jit_data;
		struct codegen_context cgctx;
		int pass;
		int flen;
		@@ -878,6 +913,7 @@ struct bpf_prog bpf_int_jit_compile(struct bpf_prog fp)
		struct bpf_prog *org_fp = fp;
		struct bpf_prog *tmp_fp;
		bool bpf_blinded = false;
		bool extra_pass = false;

		if (!fp->jit_requested)
		return org_fp;
		@@ -891,11 +927,32 @@ struct bpf_prog bpf_int_jit_compile(struct bpf_prog fp)
		fp = tmp_fp;
		}

		jit_data = fp->aux->jit_data;
		if (!jit_data) {
		jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL);
		if (!jit_data) {
		fp = org_fp;
		goto out;
		}
		fp->aux->jit_data = jit_data;
		}

		flen = fp->len;
		addrs = jit_data->addrs;
		if (addrs) {
		cgctx = jit_data->ctx;
		image = jit_data->image;
		bpf_hdr = jit_data->header;
		proglen = jit_data->proglen;
		alloclen = proglen + FUNCTION_DESCR_SIZE;
		extra_pass = true;
		goto skip_init_ctx;
		}

		addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
		if (addrs == NULL) {
		fp = org_fp;
		goto out;
		goto out_addrs;
		}

		memset(&cgctx, 0, sizeof(struct codegen_context));
		@@ -904,10 +961,10 @@ struct bpf_prog bpf_int_jit_compile(struct bpf_prog fp)
		cgctx.stack_size = round_up(fp->aux->stack_depth, 16);

		/* Scouting faux-generate pass 0 */
		if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) {
		if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
		/* We hit something illegal or unsupported. */
		fp = org_fp;
		goto out;
		goto out_addrs;
		}

		/*
		@@ -925,9 +982,10 @@ struct bpf_prog bpf_int_jit_compile(struct bpf_prog fp)
		bpf_jit_fill_ill_insns);
		if (!bpf_hdr) {
		fp = org_fp;
		goto out;
		goto out_addrs;
		}

		skip_init_ctx:
		code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);

		/* Code generation passes 1-2 */
		@@ -935,7 +993,7 @@ struct bpf_prog bpf_int_jit_compile(struct bpf_prog fp)
		/* Now build the prologue, body code & epilogue for real. */
		cgctx.idx = 0;
		bpf_jit_build_prologue(code_base, &cgctx);
		bpf_jit_build_body(fp, code_base, &cgctx, addrs);
		bpf_jit_build_body(fp, code_base, &cgctx, addrs, extra_pass);
		bpf_jit_build_epilogue(code_base, &cgctx);

		if (bpf_jit_enable > 1)
		@@ -961,10 +1019,20 @@ struct bpf_prog bpf_int_jit_compile(struct bpf_prog fp)
		fp->jited_len = alloclen;

		bpf_flush_icache(bpf_hdr, (u8 )bpf_hdr + (bpf_hdr->pages PAGE_SIZE));

		out:
		if (!fp->is_func \|\| extra_pass) {
		out_addrs:
		kfree(addrs);
		kfree(jit_data);
		fp->aux->jit_data = NULL;
		} else {
		jit_data->addrs = addrs;
		jit_data->ctx = cgctx;
		jit_data->proglen = proglen;
		jit_data->image = image;
		jit_data->header = bpf_hdr;
		}

		out:
		if (bpf_blinded)
		bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp);

drivers/net/ethernet/intel/i40e/i40e_txrx.c

+19 −7

Original line number	Diff line number	Diff line
		@@ -3664,14 +3664,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff skb, struct net_device netdev)
		* @dev: netdev
		* @xdp: XDP buffer
		*
		* Returns Zero if sent, else an error code
		* Returns the number of frames successfully sent. Frames that fail are
		* freed via the XDP return API.
		*
		* On error, a negative errno code is returned and no frames are
		* transmitted (the caller must handle freeing the frames).
		**/
		int i40e_xdp_xmit(struct net_device dev, struct xdp_frame xdpf)
		int i40e_xdp_xmit(struct net_device dev, int n, struct xdp_frame *frames)
		{
		struct i40e_netdev_priv *np = netdev_priv(dev);
		unsigned int queue_index = smp_processor_id();
		struct i40e_vsi *vsi = np->vsi;
		int err;
		int drops = 0;
		int i;

		if (test_bit(__I40E_VSI_DOWN, vsi->state))
		return -ENETDOWN;
		@@ -3679,11 +3684,18 @@ int i40e_xdp_xmit(struct net_device dev, struct xdp_frame xdpf)
		if (!i40e_enabled_xdp_vsi(vsi) \|\| queue_index >= vsi->num_queue_pairs)
		return -ENXIO;

		for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		int err;

		err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
		if (err != I40E_XDP_TX)
		return -ENOSPC;
		if (err != I40E_XDP_TX) {
		xdp_return_frame_rx_napi(xdpf);
		drops++;
		}
		}

		return 0;
		return n - drops;
		}

		/**

drivers/net/ethernet/intel/i40e/i40e_txrx.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -487,7 +487,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
		void i40e_detect_recover_hung(struct i40e_vsi *vsi);
		int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
		bool __i40e_chk_linearize(struct sk_buff *skb);
		int i40e_xdp_xmit(struct net_device dev, struct xdp_frame xdpf);
		int i40e_xdp_xmit(struct net_device dev, int n, struct xdp_frame *frames);
		void i40e_xdp_flush(struct net_device *dev);

		/**

drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

+15 −6

Original line number	Diff line number	Diff line
		@@ -10022,11 +10022,13 @@ static int ixgbe_xdp(struct net_device dev, struct netdev_bpf xdp)
		}
		}

		static int ixgbe_xdp_xmit(struct net_device dev, struct xdp_frame xdpf)
		static int ixgbe_xdp_xmit(struct net_device *dev, int n,
		struct xdp_frame **frames)
		{
		struct ixgbe_adapter *adapter = netdev_priv(dev);
		struct ixgbe_ring *ring;
		int err;
		int drops = 0;
		int i;

		if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
		return -ENETDOWN;
		@@ -10038,11 +10040,18 @@ static int ixgbe_xdp_xmit(struct net_device dev, struct xdp_frame xdpf)
		if (unlikely(!ring))
		return -ENXIO;

		for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		int err;

		err = ixgbe_xmit_xdp_ring(adapter, xdpf);
		if (err != IXGBE_XDP_TX)
		return -ENOSPC;
		if (err != IXGBE_XDP_TX) {
		xdp_return_frame_rx_napi(xdpf);
		drops++;
		}
		}

		return 0;
		return n - drops;
		}

		static void ixgbe_xdp_flush(struct net_device *dev)

drivers/net/ethernet/netronome/nfp/bpf/jit.c

+381 −29

Original line number	Diff line number	Diff line
		@@ -211,6 +211,60 @@ emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
		emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
		}

		/*
		 * Assemble one branch-on-bit instruction word from its raw fields and
		 * append it to the program with nfp_prog_push().
		 *
		 * @areg, @breg:  encoded A/B source operands
		 * @addr:         branch target
		 * @defer:        value placed in the DEFBR field
		 * @set:          value placed in the BV field
		 * @src_lmextn:   value placed in the SRC_LMEXTN field
		 */
		static void
		__emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
		bool set, bool src_lmextn)
		{
		u16 addr_lo, addr_hi;
		u64 insn;

		/* Keep only as many low address bits as the ADDR_LO field can hold. */
		addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
		/* 1 when @addr did not fit in ADDR_LO, 0 otherwise. */
		addr_hi = addr != addr_lo;

		insn = OP_BR_BIT_BASE \|
		FIELD_PREP(OP_BR_BIT_A_SRC, areg) \|
		FIELD_PREP(OP_BR_BIT_B_SRC, breg) \|
		FIELD_PREP(OP_BR_BIT_BV, set) \|
		FIELD_PREP(OP_BR_BIT_DEFBR, defer) \|
		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) \|
		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) \|
		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);

		nfp_prog_push(nfp_prog, insn);
		}

		/*
		 * Emit a branch-on-bit instruction testing bit @bit of @src and tag the
		 * emitted instruction with relocation type @relo.
		 *
		 * On operand-encoding failure the error is recorded in nfp_prog->error
		 * and nothing is emitted.
		 */
		static void
		emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
		u8 defer, bool set, enum nfp_relo_type relo)
		{
		struct nfp_insn_re_regs reg;
		int err;

		/* NOTE: The bit to test is specified as a rotation amount, such that
		* the bit to test will be placed on the MSB of the result when
		* doing a rotate right. For bit X, we need right rotate X + 1.
		*/
		bit += 1;

		/* The adjusted bit index travels as an immediate operand next to @src. */
		err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
		if (err) {
		nfp_prog->error = err;
		return;
		}

		__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
		reg.src_lmextn);

		/* Patch the relocation type into the instruction just pushed. */
		nfp_prog->prog[nfp_prog->prog_len - 1] \|=
		FIELD_PREP(OP_RELO_TYPE, relo);
		}

		/*
		 * Convenience wrapper: emit a relative (RELO_BR_REL) branch-on-bit to
		 * @addr with the "set" flag asserted for bit @bit of @src.
		 */
		static void
		emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
		{
			const bool branch_when_set = true;

			emit_br_bit_relo(nfp_prog, src, bit, addr, defer, branch_when_set,
					 RELO_BR_REL);
		}

		static void
		__emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
		enum immed_width width, bool invert,
		@@ -309,6 +363,19 @@ emit_shf(struct nfp_prog *nfp_prog, swreg dst,
		reg.dst_lmextn, reg.src_lmextn);
		}

		/*
		 * Emit an indirect shift, i.e. an emit_shf() with an immediate shift
		 * amount of 0.  Rotation (SHF_SC_R_ROT) is rejected: an error is logged,
		 * nfp_prog->error is set to -EFAULT and nothing is emitted.
		 */
		static void
		emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
			       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
		{
			if (sc != SHF_SC_R_ROT) {
				emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
				return;
			}

			pr_err("indirect shift is not allowed on rotation\n");
			nfp_prog->error = -EFAULT;
		}

		static void
		__emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
		u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
		@@ -1629,28 +1696,144 @@ static int neg_reg64(struct nfp_prog nfp_prog, struct nfp_insn_meta meta)
		return 0;
		}

		/* Pseudo code:
		 *   if shift_amt >= 32
		 *     dst_high = dst_low << shift_amt[4:0]
		 *     dst_low = 0;
		 *   else
		 *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
		 *     dst_low = dst_low << shift_amt
		 *
		 * The indirect shift will use the same logic at runtime.
		 *
		 * @dst is the register holding the low word; @dst + 1 holds the high
		 * word.  Always returns 0.
		 */
		static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
		{
			/* Shifting by zero is a no-op.  It must not reach emit_shf():
			 * an immediate amount of 0 is how an indirect shift is requested
			 * (see emit_shf_indir()), and 32 - 0 is not a valid amount for
			 * the double shift below.
			 */
			if (!shift_amt)
				return 0;

			if (shift_amt < 32) {
				/* High word first: it still needs the old low word. */
				emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
					 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
					 32 - shift_amt);
				emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
					 reg_b(dst), SHF_SC_L_SHF, shift_amt);
			} else if (shift_amt == 32) {
				/* Exactly one word: low moves to high, low becomes 0. */
				wrp_reg_mov(nfp_prog, dst + 1, dst);
				wrp_immed(nfp_prog, reg_both(dst), 0);
			} else {
				/* More than one word: only the low word survives. */
				emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
					 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
				wrp_immed(nfp_prog, reg_both(dst), 0);
			}

			return 0;
		}

		static int shl_imm64(struct nfp_prog nfp_prog, struct nfp_insn_meta meta)
		{
		const struct bpf_insn *insn = &meta->insn;
		u8 dst = insn->dst_reg * 2;

		if (insn->imm < 32) {
		emit_shf(nfp_prog, reg_both(dst + 1),
		reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
		SHF_SC_R_DSHF, 32 - insn->imm);
		emit_shf(nfp_prog, reg_both(dst),
		reg_none(), SHF_OP_NONE, reg_b(dst),
		SHF_SC_L_SHF, insn->imm);
		} else if (insn->imm == 32) {
		wrp_reg_mov(nfp_prog, dst + 1, dst);
		wrp_immed(nfp_prog, reg_both(dst), 0);
		} else if (insn->imm > 32) {
		emit_shf(nfp_prog, reg_both(dst + 1),
		reg_none(), SHF_OP_NONE, reg_b(dst),
		SHF_SC_L_SHF, insn->imm - 32);
		return __shl_imm64(nfp_prog, dst, insn->imm);
		}

		/*
		 * High-word half of a 64-bit left shift by a register amount known to be
		 * below 32: dst_high = (dst_high, dst_low) >> (32 - src).
		 */
		static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		/* imm = 32 - src */
		emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
		reg_b(src));
		/* Destination-less OR with imm as source A; presumably this is what
		 * feeds the shift amount to the indirect shift that follows.
		 */
		emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
		emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
		reg_b(dst), SHF_SC_R_DSHF);
		}

		/* NOTE: for indirect left shift, HIGH part should be calculated first.
		 *
		 * Low-word half of a 64-bit left shift by a register amount known to be
		 * below 32: dst_low = dst_low << src.  This overwrites the low word, which
		 * the high-word computation reads.
		 */
		static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		/* Destination-less OR with src as source A; presumably supplies the
		 * amount for the indirect shift below.
		 */
		emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
		emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		reg_b(dst), SHF_SC_L_SHF);
		}

		/*
		 * 64-bit left shift by a register amount below 32.  The high word is
		 * produced first because it reads the low word that the second step
		 * overwrites.
		 */
		static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		shl_reg64_lt32_high(nfp_prog, dst, src);
		shl_reg64_lt32_low(nfp_prog, dst, src);
		}

		/*
		 * 64-bit left shift by a register amount known to be at least 32:
		 * the shifted low word lands in the high word and the low word is
		 * cleared.
		 */
		static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		/* Destination-less OR with src as source A; presumably supplies the
		 * amount for the indirect shift below.
		 */
		emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
		/* dst_high = dst_low << src (indirect) */
		emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
		reg_b(dst), SHF_SC_L_SHF);
		/* dst_low = 0 */
		wrp_immed(nfp_prog, reg_both(dst), 0);
		}

		static int shl_reg64(struct nfp_prog nfp_prog, struct nfp_insn_meta meta)
		{
		const struct bpf_insn *insn = &meta->insn;
		u64 umin, umax;
		u8 dst, src;

		dst = insn->dst_reg * 2;
		umin = meta->umin;
		umax = meta->umax;
		if (umin == umax)
		return __shl_imm64(nfp_prog, dst, umin);

		src = insn->src_reg * 2;
		if (umax < 32) {
		shl_reg64_lt32(nfp_prog, dst, src);
		} else if (umin >= 32) {
		shl_reg64_ge32(nfp_prog, dst, src);
		} else {
		/* Generate different instruction sequences depending on runtime
		* value of shift amount.
		*/
		u16 label_ge32, label_end;

		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);

		shl_reg64_lt32_high(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* shl_reg64_lt32_low packed in delay slot. */
		shl_reg64_lt32_low(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
		return -EINVAL;
		shl_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
		return -EINVAL;
		}

		return 0;
		}

		/* Pseudo code:
		 *   if shift_amt >= 32
		 *     dst_high = 0;
		 *     dst_low = dst_high >> shift_amt[4:0]
		 *   else
		 *     dst_high = dst_high >> shift_amt
		 *     dst_low = (dst_high, dst_low) >> shift_amt
		 *
		 * The indirect shift will use the same logic at runtime.
		 *
		 * @dst is the register holding the low word; @dst + 1 holds the high
		 * word.  Always returns 0.
		 */
		static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
		{
			/* Shifting by zero is a no-op.  It must not reach emit_shf():
			 * an immediate amount of 0 is how an indirect shift is requested
			 * (see emit_shf_indir()).
			 */
			if (!shift_amt)
				return 0;

			if (shift_amt < 32) {
				/* Low word first: it still needs the old high word. */
				emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
					 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
				emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
					 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
			} else if (shift_amt == 32) {
				/* Exactly one word: high moves to low, high becomes 0. */
				wrp_reg_mov(nfp_prog, dst, dst + 1);
				wrp_immed(nfp_prog, reg_both(dst + 1), 0);
			} else {
				/* More than one word: only the high word survives. */
				emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
					 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
				wrp_immed(nfp_prog, reg_both(dst + 1), 0);
			}

			return 0;
		}

		@@ -1659,23 +1842,188 @@ static int shr_imm64(struct nfp_prog nfp_prog, struct nfp_insn_meta meta)
		const struct bpf_insn *insn = &meta->insn;
		u8 dst = insn->dst_reg * 2;

		if (insn->imm < 32) {
		emit_shf(nfp_prog, reg_both(dst),
		reg_a(dst + 1), SHF_OP_NONE, reg_b(dst),
		SHF_SC_R_DSHF, insn->imm);
		emit_shf(nfp_prog, reg_both(dst + 1),
		reg_none(), SHF_OP_NONE, reg_b(dst + 1),
		SHF_SC_R_SHF, insn->imm);
		} else if (insn->imm == 32) {
		wrp_reg_mov(nfp_prog, dst, dst + 1);
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
		} else if (insn->imm > 32) {
		emit_shf(nfp_prog, reg_both(dst),
		reg_none(), SHF_OP_NONE, reg_b(dst + 1),
		SHF_SC_R_SHF, insn->imm - 32);
		return __shr_imm64(nfp_prog, dst, insn->imm);
		}

		/* NOTE: for indirect right shift, LOW part should be calculated first.
		 *
		 * High-word half of a 64-bit logical right shift by a register amount
		 * known to be below 32: dst_high = dst_high >> src.  This overwrites the
		 * high word, which the low-word computation reads.
		 */
		static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		/* Destination-less OR with src as source A; presumably supplies the
		 * amount for the indirect shift below.
		 */
		emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
		emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
		reg_b(dst + 1), SHF_SC_R_SHF);
		}

		/*
		 * Low-word half of a 64-bit right shift by a register amount known to be
		 * below 32: dst_low = (dst_high, dst_low) >> src.
		 */
		static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		/* Destination-less OR with src as source A; presumably supplies the
		 * amount for the indirect shift below.
		 */
		emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
		emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
		reg_b(dst), SHF_SC_R_DSHF);
		}

		/*
		 * 64-bit logical right shift by a register amount below 32.  The low
		 * word is produced first because it reads the high word that the second
		 * step overwrites.
		 */
		static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		shr_reg64_lt32_low(nfp_prog, dst, src);
		shr_reg64_lt32_high(nfp_prog, dst, src);
		}

		/*
		 * 64-bit logical right shift by a register amount known to be at least
		 * 32: the shifted high word lands in the low word and the high word is
		 * cleared.
		 */
		static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		/* Destination-less OR with src as source A; presumably supplies the
		 * amount for the indirect shift below.
		 */
		emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
		/* dst_low = dst_high >> src (indirect) */
		emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		reg_b(dst + 1), SHF_SC_R_SHF);
		/* dst_high = 0 */
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
		}

		static int shr_reg64(struct nfp_prog nfp_prog, struct nfp_insn_meta meta)
		{
		const struct bpf_insn *insn = &meta->insn;
		u64 umin, umax;
		u8 dst, src;

		dst = insn->dst_reg * 2;
		umin = meta->umin;
		umax = meta->umax;
		if (umin == umax)
		return __shr_imm64(nfp_prog, dst, umin);

		src = insn->src_reg * 2;
		if (umax < 32) {
		shr_reg64_lt32(nfp_prog, dst, src);
		} else if (umin >= 32) {
		shr_reg64_ge32(nfp_prog, dst, src);
		} else {
		/* Generate different instruction sequences depending on runtime
		* value of shift amount.
		*/
		u16 label_ge32, label_end;

		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
		shr_reg64_lt32_low(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* shr_reg64_lt32_high packed in delay slot. */
		shr_reg64_lt32_high(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
		return -EINVAL;
		shr_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
		return -EINVAL;
		}

		return 0;
		}

		/* Code logic is the same as __shr_imm64 except ashr requires signedness bit
		 * told through PREV_ALU result.
		 *
		 * @dst is the register holding the low word; @dst + 1 holds the high
		 * word.  Always returns 0.
		 */
		static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
		{
			/* Shifting by zero is a no-op.  It must not reach emit_shf():
			 * an immediate amount of 0 is how an indirect shift is requested
			 * (see emit_shf_indir()).
			 */
			if (!shift_amt)
				return 0;

			if (shift_amt < 32) {
				emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
					 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
				/* Set signedness bit. */
				emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
					 reg_imm(0));
				emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
					 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
			} else if (shift_amt == 32) {
				/* NOTE: this also helps setting signedness bit. */
				wrp_reg_mov(nfp_prog, dst, dst + 1);
				/* High word becomes the sign replicated across all bits. */
				emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
					 reg_b(dst + 1), SHF_SC_R_SHF, 31);
			} else {
				/* Set signedness bit. */
				emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
					 reg_imm(0));
				emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
					 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
				/* High word becomes the sign replicated across all bits. */
				emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
					 reg_b(dst + 1), SHF_SC_R_SHF, 31);
			}

			return 0;
		}

		static int ashr_imm64(struct nfp_prog nfp_prog, struct nfp_insn_meta meta)
		{
		const struct bpf_insn *insn = &meta->insn;
		u8 dst = insn->dst_reg * 2;

		return __ashr_imm64(nfp_prog, dst, insn->imm);
		}

		/*
		 * High-word half of a 64-bit arithmetic right shift by a register amount
		 * known to be below 32: dst_high = dst_high >> src (sign-propagating).
		 * Overwrites the high word, so the low-word half has to run before it.
		 */
		static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		/* NOTE: the first insn will set both indirect shift amount (source A)
		* and signedness bit (MSB of result).
		*/
		emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
		emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
		reg_b(dst + 1), SHF_SC_R_SHF);
		}

		/*
		 * Low-word half of a 64-bit arithmetic right shift by a register amount
		 * known to be below 32.
		 */
		static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
			/* NOTE: it is the same as logic shift because we don't need to shift in
			 * signedness bit when the shift amount is less than 32.
			 */
			shr_reg64_lt32_low(nfp_prog, dst, src);
		}

		/*
		 * 64-bit arithmetic right shift by a register amount below 32.  The low
		 * word is produced first; the high-word step then overwrites the high
		 * word.
		 */
		static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		ashr_reg64_lt32_low(nfp_prog, dst, src);
		ashr_reg64_lt32_high(nfp_prog, dst, src);
		}

		/*
		 * 64-bit arithmetic right shift by a register amount known to be at
		 * least 32: the arithmetically shifted high word lands in the low word,
		 * then the high word is shifted right by 31 with ASHR.
		 */
		static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
		{
		/* Destination-less OR of src (source A) with the high word; presumably
		 * provides both the indirect shift amount and the sign bit for the
		 * shift below.
		 */
		emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
		/* dst_low = dst_high >> src (indirect, arithmetic) */
		emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
		reg_b(dst + 1), SHF_SC_R_SHF);
		/* dst_high = dst_high >> 31 (arithmetic) */
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
		reg_b(dst + 1), SHF_SC_R_SHF, 31);
		}

		/* Like ashr_imm64, but need to use indirect shift. */
		static int ashr_reg64(struct nfp_prog nfp_prog, struct nfp_insn_meta meta)
		{
		const struct bpf_insn *insn = &meta->insn;
		u64 umin, umax;
		u8 dst, src;

		dst = insn->dst_reg * 2;
		umin = meta->umin;
		umax = meta->umax;
		if (umin == umax)
		return __ashr_imm64(nfp_prog, dst, umin);

		src = insn->src_reg * 2;
		if (umax < 32) {
		ashr_reg64_lt32(nfp_prog, dst, src);
		} else if (umin >= 32) {
		ashr_reg64_ge32(nfp_prog, dst, src);
		} else {
		u16 label_ge32, label_end;

		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
		ashr_reg64_lt32_low(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* ashr_reg64_lt32_high packed in delay slot. */
		ashr_reg64_lt32_high(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
		return -EINVAL;
		ashr_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
		return -EINVAL;
		}

		return 0;
		}

		@@ -2501,8 +2849,12 @@ static const instr_cb_t instr_cb[256] = {
		[BPF_ALU64 \| BPF_SUB \| BPF_X] = sub_reg64,
		[BPF_ALU64 \| BPF_SUB \| BPF_K] = sub_imm64,
		[BPF_ALU64 \| BPF_NEG] = neg_reg64,
		[BPF_ALU64 \| BPF_LSH \| BPF_X] = shl_reg64,
		[BPF_ALU64 \| BPF_LSH \| BPF_K] = shl_imm64,
		[BPF_ALU64 \| BPF_RSH \| BPF_X] = shr_reg64,
		[BPF_ALU64 \| BPF_RSH \| BPF_K] = shr_imm64,
		[BPF_ALU64 \| BPF_ARSH \| BPF_X] = ashr_reg64,
		[BPF_ALU64 \| BPF_ARSH \| BPF_K] = ashr_imm64,
		[BPF_ALU \| BPF_MOV \| BPF_X] = mov_reg,
		[BPF_ALU \| BPF_MOV \| BPF_K] = mov_imm,
		[BPF_ALU \| BPF_XOR \| BPF_X] = xor_reg,