Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 49a88c3c authored by Elliott Hughes's avatar Elliott Hughes Committed by Android Git Automerger
Browse files

am 410a1966: am fd382f2e: am fd7eabe4: Merge "Pixelflinger: Add AArch64 support to pixelflinger JIT."

am 410a1966: am fd382f2e: am fd7eabe4: Merge "Pixelflinger: Add AArch64 support to pixelflinger JIT."

* commit '410a1966':
  Pixelflinger: Add AArch64 support to pixelflinger JIT.
parents cec9ac9b 410a1966
Loading
Loading
Loading
Loading
+66 −1
Original line number Diff line number Diff line
@@ -457,6 +457,69 @@ inline int64_t gglMulii(int32_t x, int32_t y) {
    return u.res;
}

#elif defined(__aarch64__)

// inline AArch64 implementations

// Fixed-point multiply with rounding: returns (x * y + 2^(shift-1)) >> shift,
// truncated to 32 bits.
inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift) CONST;
inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift)
{
    GGLfixed result;
    GGLfixed round;

    // round = (1 << shift) >> 1, i.e. one half in the target fixed-point
    // format.  Done as lsl-then-lsr so that shift == 0 yields round == 0
    // (the C expression 1 << (shift - 1) would be undefined for shift == 0).
    //
    // smaddl computes the full 64-bit product of the 32-bit inputs plus the
    // 64-bit rounding term, so the intermediate cannot overflow; the final
    // lsr scales it back down by 'shift'.
    // None of mov/lsl/lsr/smaddl set condition flags, so no "cc" clobber is
    // needed.  Both outputs use "=&r" (early-clobber) because they are
    // written before the last input read.
    asm("mov    %x[round], #1                        \n"
        "lsl    %x[round], %x[round], %x[shift]      \n"
        "lsr    %x[round], %x[round], #1             \n"
        "smaddl %x[result], %w[x], %w[y],%x[round]   \n"
        "lsr    %x[result], %x[result], %x[shift]    \n"
        : [round]"=&r"(round), [result]"=&r"(result) \
        : [x]"r"(x), [y]"r"(y), [shift] "r"(shift)   \
        :
       );
    return result;
}
// Fixed-point multiply-accumulate: returns ((x * y) >> shift) + a.
// Note: unlike gglMulx there is no rounding term added before the shift.
inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST;
inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift)
{
    GGLfixed result;
    // smull produces the full 64-bit product so the multiply cannot
    // overflow; the product is shifted down in 64 bits, then 'a' is added
    // in 32 bits.  No flags are set, so no "cc" clobber is required.
    asm("smull  %x[result], %w[x], %w[y]                     \n"
        "lsr    %x[result], %x[result], %x[shift]            \n"
        "add    %w[result], %w[result], %w[a]                \n"
        : [result]"=&r"(result)                               \
        : [x]"r"(x), [y]"r"(y), [a]"r"(a), [shift] "r"(shift) \
        :
        );
    return result;
}

// Fixed-point multiply-subtract: returns ((x * y) >> shift) - a.
// Note: as with gglMulAddx there is no rounding term before the shift.
inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST;
inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift)
{
    GGLfixed result;

    // smull produces the full 64-bit product so the multiply cannot
    // overflow; the product is shifted down in 64 bits, then 'a' is
    // subtracted in 32 bits.  No flags are set, so no "cc" clobber is
    // required.
    asm("smull  %x[result], %w[x], %w[y]                     \n"
        "lsr    %x[result], %x[result], %x[shift]            \n"
        "sub    %w[result], %w[result], %w[a]                \n"
        : [result]"=&r"(result)                               \
        : [x]"r"(x), [y]"r"(y), [a]"r"(a), [shift] "r"(shift) \
        :
        );
    return result;
}
// Signed 32x32 -> 64-bit multiply: returns the full 64-bit product of x
// and y with no shifting or truncation.
inline int64_t gglMulii(int32_t x, int32_t y) CONST;
inline int64_t gglMulii(int32_t x, int32_t y)
{
    int64_t res;
    // smull writes the entire 64-bit product into one X register.
    // The "%" modifier on operand 1 tells the compiler x and y are
    // commutative, giving it more freedom in register allocation.
    asm("smull  %x0, %w1, %w2 \n"
        : "=r"(res)
        : "%r"(x), "r"(y)
        :
        );
    return res;
}

#else // ----------------------------------------------------------------------

inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) CONST;
@@ -498,7 +561,7 @@ inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c) {
inline int32_t gglClz(int32_t x) CONST;
inline int32_t gglClz(int32_t x)
{
#if (defined(__arm__) && !defined(__thumb__)) || defined(__mips__)
#if (defined(__arm__) && !defined(__thumb__)) || defined(__mips__) || defined(__aarch64__)
    return __builtin_clz(x);
#else
    if (!x) return 32;
@@ -554,6 +617,8 @@ inline GGLfixed gglClampx(GGLfixed c)
    // clamps to zero in one instruction, but gcc won't generate it and
    // replace it by a cmp + movlt (it's quite amazing actually).
    asm("bic %0, %1, %1, asr #31\n" : "=r"(c) : "r"(c));
#elif defined(__aarch64__)
    asm("bic %w0, %w1, %w1, asr #31\n" : "=r"(c) : "r"(c));
#else
    c &= ~(c>>31);
#endif
+10 −2
Original line number Diff line number Diff line
@@ -9,13 +9,11 @@ include $(CLEAR_VARS)
PIXELFLINGER_SRC_FILES:= \
    codeflinger/ARMAssemblerInterface.cpp \
    codeflinger/ARMAssemblerProxy.cpp \
    codeflinger/ARMAssembler.cpp \
    codeflinger/CodeCache.cpp \
    codeflinger/GGLAssembler.cpp \
    codeflinger/load_store.cpp \
    codeflinger/blending.cpp \
    codeflinger/texturing.cpp \
    codeflinger/disassem.c \
	codeflinger/tinyutils/SharedBuffer.cpp \
	codeflinger/tinyutils/VectorImpl.cpp \
	fixed.cpp.arm \
@@ -39,6 +37,8 @@ endif
endif

ifeq ($(TARGET_ARCH),arm)
PIXELFLINGER_SRC_FILES += codeflinger/ARMAssembler.cpp
PIXELFLINGER_SRC_FILES += codeflinger/disassem.c
# special optimization flags for pixelflinger
PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer
endif
@@ -52,6 +52,14 @@ endif

LOCAL_SHARED_LIBRARIES := libcutils liblog

ifeq ($(TARGET_ARCH),aarch64)
PIXELFLINGER_SRC_FILES += arch-aarch64/t32cb16blend.S
PIXELFLINGER_SRC_FILES += arch-aarch64/col32cb16blend.S
PIXELFLINGER_SRC_FILES += codeflinger/Aarch64Assembler.cpp
PIXELFLINGER_SRC_FILES += codeflinger/Aarch64Disassembler.cpp
PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer
endif

#
# Shared library
#
+87 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
    .text
    .align

    .global scanline_col32cb16blend_aarch64

//
// This function alpha blends a fixed color into a destination scanline, using
// the formula:
//
//     d = s + (((a + (a >> 7)) * d) >> 8)
//
// where d is the destination pixel,
//       s is the source color,
//       a is the alpha channel of the source color.
//

// x0 = destination buffer pointer
// w1 = color value
// w2 = count


scanline_col32cb16blend_aarch64:

    // ---- loop-invariant setup: decompose the 0xAABBGGRR source color ----
    // After this prologue:
    //   w5  = inverted alpha = 0x100 - (a + (a >> 7)), so a == 0xFF -> 0
    //   w10 = src red   << 5   (8-bit channel prescaled)
    //   w12 = src green << 6
    //   w4  = src blue  << 5
    //   w9  = 0x3F, the 6-bit destination-green mask
    lsr         w5, w1, #24                     // shift down alpha
    mov         w9, #0xff                       // create mask
    add         w5, w5, w5, lsr #7              // add in top bit
    mov         w4, #256                        // create #0x100
    sub         w5, w4, w5                      // invert alpha
    and         w10, w1, #0xff                  // extract red
    and         w12, w9, w1, lsr #8             // extract green
    and         w4,  w9, w1, lsr #16            // extract blue
    lsl         w10, w10, #5                    // prescale red
    lsl         w12, w12, #6                    // prescale green
    lsl         w4,  w4,  #5                    // prescale blue
    lsr         w9,  w9,  #2                    // create dest green mask

    // ---- per-pixel loop: blend one RGB565 destination pixel per pass ----
    // Implements d = s + (((0x100 - a') * d) >> 8) per channel, where the
    // prescales above line each channel up so a single >> 8 renormalizes.
1:
    ldrh        w8, [x0]                        // load dest pixel
    subs        w2, w2, #1                      // decrement loop counter
    lsr         w6, w8, #11                     // extract dest red
    and         w7, w9, w8, lsr #5              // extract dest green
    and         w8, w8, #0x1f                   // extract dest blue

    madd        w6, w6, w5, w10                 // dest red * alpha + src red
    madd        w7, w7, w5, w12                 // dest green * alpha + src green
    madd        w8, w8, w5, w4                  // dest blue * alpha + src blue

    lsr         w6, w6, #8                      // shift down red
    lsr         w7, w7, #8                      // shift down green
    lsl         w6, w6, #11                     // shift red into 565
    orr         w6, w6, w7, lsl #5              // shift green into 565
    orr         w6, w6, w8, lsr #8              // shift blue into 565

    strh        w6, [x0], #2                    // store pixel to dest, update ptr
    // b.ne consumes the flags set by the 'subs' at the top of the loop;
    // none of the intervening madd/lsr/lsl/orr/strh instructions write
    // the condition flags, so this is safe.
    b.ne        1b                              // if count != 0, loop

    // NOTE(review): w2 == 0 on entry would blend 2^32 pixels (subs wraps
    // before the first test) — presumably callers guarantee count > 0;
    // confirm at the call sites.
    ret


+213 −0
Original line number Diff line number Diff line
/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
    .text
    .align

    .global scanline_t32cb16blend_aarch64

/*
 * .macro pixel
 *
 *  This macro alpha blends RGB565 original pixel located in either
 *  top or bottom 16 bits of DREG register with SRC 32 bit pixel value
 *  and writes the result to FB register
 *
 * \DREG is a 32-bit register containing *two* original destination RGB565
 *       pixels, with the even one in the low-16 bits, and the odd one in the
 *       high 16 bits.
 *
 * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *
 * \FB is a target register that will contain the blended pixel values.
 *
 * \ODD is either 0 or 1 and indicates if we're blending the lower or
 *      upper 16-bit pixels in DREG into FB
 *
 *
 * clobbered: w6, w7, w16, w17, w18
 *
 */

// Alpha-blend one 0xAABBGGRR SRC pixel into one of the two RGB565 pixels
// packed in DREG, accumulating the blended 565 result into FB.  Each
// channel computes dst*invA (>> 8) + src and saturates to the channel's
// maximum via cmp + csel.  See the block comment above for the operand
// contract; clobbers w6, w7, w16, w17, w18.
// NOTE(review): w18 is the platform-reserved register in AAPCS64 on some
// OSes (Windows/Apple); usable on Linux/Android but worth confirming if
// this file is ever reused elsewhere.
.macro pixel,   DREG, SRC, FB, ODD

    // invA = 0x100 - (sA + (sA >> 7)), so sA == 0xFF maps to 0 and the
    // destination contribution vanishes for fully opaque sources.
    // SRC = 0xAABBGGRR
    lsr     w7, \SRC, #24               // sA
    add     w7, w7, w7, lsr #7          // sA + (sA >> 7)
    mov     w6, #0x100
    sub     w7, w6, w7                  // sA = 0x100 - (sA+(sA>>7))

// NOTE(review): this numeric label appears unused — no 1b/1f branch
// references it inside the macro; presumably left over from the ARM32
// original.  Confirm before removing.
1:

.if \ODD //Blending odd pixel present in top 16 bits of DREG register

    // Per channel below: extract dest channel from the high half of DREG,
    // multiply by invA, add the (shifted-down) src channel, then clamp to
    // the channel max by selecting between the saturated OR (w17) and the
    // computed OR (w18) on the 'hi' condition from the cmp.
    // red
    lsr     w16, \DREG, #(16 + 11)
    mul     w16, w7, w16
    lsr     w6, \SRC, #3
    and     w6, w6, #0x1F
    add     w16, w6, w16, lsr #8
    cmp     w16, #0x1F
    orr     w17, \FB, #(0x1F<<(16 + 11))
    orr     w18, \FB, w16, lsl #(16 + 11)
    csel    \FB, w17, w18, hi
        // green
        and     w6, \DREG, #(0x3F<<(16 + 5))
        lsr     w17,w6,#(16+5)
        mul     w6, w7, w17
        lsr     w16, \SRC, #(8+2)
        and     w16, w16, #0x3F
        add     w6, w16, w6, lsr #8
        cmp     w6, #0x3F
        orr     w17, \FB, #(0x3F<<(16 + 5))
        orr     w18, \FB, w6, lsl #(16 + 5)
        csel    \FB, w17, w18, hi
            // blue
            and     w16, \DREG, #(0x1F << 16)
            lsr     w17,w16,#16
            mul     w16, w7, w17
            lsr     w6, \SRC, #(8+8+3)
            and     w6, w6, #0x1F
            add     w16, w6, w16, lsr #8
            cmp     w16, #0x1F
            orr     w17, \FB, #(0x1F << 16)
            orr     w18, \FB, w16, lsl #16
            csel    \FB, w17, w18, hi

.else //Blending even pixel present in bottom 16 bits of DREG register

    // Same per-channel scheme as the odd case, but reading the low half of
    // DREG.  Note the red channel uses mov/lsl rather than orr into \FB:
    // it (re)initializes FB for this pixel pair.
    // red
    lsr     w16, \DREG, #11
    and     w16, w16, #0x1F
    mul     w16, w7, w16
    lsr     w6, \SRC, #3
    and     w6, w6, #0x1F
    add     w16, w6, w16, lsr #8
    cmp     w16, #0x1F
    mov     w17, #(0x1F<<11)
    lsl     w18, w16, #11
    csel    \FB, w17, w18, hi


        // green: the dest channel is multiplied while still shifted up by
        // 5, so the renormalizing shift is (5+8) instead of 8.
        // green
        and     w6, \DREG, #(0x3F<<5)
        mul     w6, w7, w6
        lsr     w16, \SRC, #(8+2)
        and     w16, w16, #0x3F
        add     w6, w16, w6, lsr #(5+8)
        cmp     w6, #0x3F
        orr     w17, \FB, #(0x3F<<5)
        orr     w18, \FB, w6, lsl #5
        csel    \FB, w17, w18, hi

            // blue
            and     w16, \DREG, #0x1F
            mul     w16, w7, w16
            lsr     w6, \SRC, #(8+8+3)
            and     w6, w6, #0x1F
            add     w16, w6, w16, lsr #8
            cmp     w16, #0x1F
            orr     w17, \FB, #0x1F
            orr     w18, \FB, w16
            csel    \FB, w17, w18, hi

.endif // End of blending even pixel

.endm // End of pixel macro


// x0:  dst ptr
// x1:  src ptr
// w2:  count
// w3:  d
// w4:  s0
// w5:  s1
// w6:  pixel
// w7:  pixel
// w8:  free
// w9:  free
// w10: free
// w11: free
// w12: scratch
// w14: pixel

scanline_t32cb16blend_aarch64:

    // If dst is not 32-bit aligned, blend a single leading 16-bit pixel
    // first so the main loop can use 32-bit loads/stores on dst.
    // align DST to 32 bits
    tst     x0, #0x3
    b.eq    aligned
    subs    w2, w2, #1
    b.lo    return

// Blend exactly one pixel: 32-bit src from x1, 16-bit dst from x0.
// Also reused as the tail for an odd trailing pixel; in that case w4 must
// already hold the pending src word (set up by the paths that jump to 9f).
last:
    ldr     w4, [x1], #4
    ldrh    w3, [x0]
    pixel   w3, w4, w12, 0
    strh    w12, [x0], #2

aligned:
    subs    w2, w2, #2
    b.lo    9f

    // The main loop is unrolled twice and processes 4 pixels
8:
    ldp   w4,w5, [x1], #8
    add     x0, x0, #4
    // Fast path: if both src pixels are fully transparent black
    // (0x00000000), the blend is a no-op, so skip the dst read/write.
    // it's all zero, skip this pixel
    orr     w3, w4, w5
    cbz     w3, 7f

    // load the destination
    ldr     w3, [x0, #-4]
    // stream the destination
    pixel   w3, w4, w12, 0
    pixel   w3, w5, w12, 1
    str     w12, [x0, #-4]

    // 2nd iteration of the loop, don't stream anything
    subs    w2, w2, #2
    // If fewer than 2 pixels remain, fall through to the tail with the
    // leftover src word staged in w4 for 'last'.
    // NOTE(review): 'blt' here and 'bhs' below use the undotted ARM32-style
    // spelling while the rest of the file uses 'b.cond'; presumably both
    // are accepted by the toolchain — confirm, and unify the style.
    csel    w4, w5, w4, lt
    blt     9f
    ldp     w4,w5, [x1], #8
    add     x0, x0, #4
    orr     w3, w4, w5
    cbz     w3, 7f
    ldr     w3, [x0, #-4]
    pixel   w3, w4, w12, 0
    pixel   w3, w5, w12, 1
    str     w12, [x0, #-4]

7:  subs    w2, w2, #2
    bhs     8b
    mov     w4, w5

    // Tail: w2 is now negative; +1 distinguishes one leftover pixel
    // (w2 == -1 -> blend it via 'last') from none (w2 == -2 -> return).
9:  adds    w2, w2, #1
    b.lo    return
    b       last

return:
    ret
+1 −1
Original line number Diff line number Diff line
@@ -63,7 +63,7 @@ public:
    };

    enum {
        CODEGEN_ARCH_ARM = 1, CODEGEN_ARCH_MIPS
        CODEGEN_ARCH_ARM = 1, CODEGEN_ARCH_MIPS, CODEGEN_ARCH_AARCH64
    };

    // -----------------------------------------------------------------------
Loading