Loading include/private/pixelflinger/ggl_fixed.h +66 −1 Original line number Diff line number Diff line Loading @@ -457,6 +457,69 @@ inline int64_t gglMulii(int32_t x, int32_t y) { return u.res; } #elif defined(__aarch64__) // inline AArch64 implementations inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift) CONST; inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift) { GGLfixed result; GGLfixed round; asm("mov %x[round], #1 \n" "lsl %x[round], %x[round], %x[shift] \n" "lsr %x[round], %x[round], #1 \n" "smaddl %x[result], %w[x], %w[y],%x[round] \n" "lsr %x[result], %x[result], %x[shift] \n" : [round]"=&r"(round), [result]"=&r"(result) \ : [x]"r"(x), [y]"r"(y), [shift] "r"(shift) \ : ); return result; } inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST; inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) { GGLfixed result; asm("smull %x[result], %w[x], %w[y] \n" "lsr %x[result], %x[result], %x[shift] \n" "add %w[result], %w[result], %w[a] \n" : [result]"=&r"(result) \ : [x]"r"(x), [y]"r"(y), [a]"r"(a), [shift] "r"(shift) \ : ); return result; } inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST; inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) { GGLfixed result; int rshift; asm("smull %x[result], %w[x], %w[y] \n" "lsr %x[result], %x[result], %x[shift] \n" "sub %w[result], %w[result], %w[a] \n" : [result]"=&r"(result) \ : [x]"r"(x), [y]"r"(y), [a]"r"(a), [shift] "r"(shift) \ : ); return result; } inline int64_t gglMulii(int32_t x, int32_t y) CONST; inline int64_t gglMulii(int32_t x, int32_t y) { int64_t res; asm("smull %x0, %w1, %w2 \n" : "=r"(res) : "%r"(x), "r"(y) : ); return res; } #else // ---------------------------------------------------------------------- inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) CONST; Loading Loading @@ -498,7 +561,7 @@ inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c) { inline int32_t 
gglClz(int32_t x) CONST; inline int32_t gglClz(int32_t x) { #if (defined(__arm__) && !defined(__thumb__)) || defined(__mips__) #if (defined(__arm__) && !defined(__thumb__)) || defined(__mips__) || defined(__aarch64__) return __builtin_clz(x); #else if (!x) return 32; Loading Loading @@ -554,6 +617,8 @@ inline GGLfixed gglClampx(GGLfixed c) // clamps to zero in one instruction, but gcc won't generate it and // replace it by a cmp + movlt (it's quite amazing actually). asm("bic %0, %1, %1, asr #31\n" : "=r"(c) : "r"(c)); #elif defined(__aarch64__) asm("bic %w0, %w1, %w1, asr #31\n" : "=r"(c) : "r"(c)); #else c &= ~(c>>31); #endif Loading libpixelflinger/Android.mk +10 −2 Original line number Diff line number Diff line Loading @@ -9,13 +9,11 @@ include $(CLEAR_VARS) PIXELFLINGER_SRC_FILES:= \ codeflinger/ARMAssemblerInterface.cpp \ codeflinger/ARMAssemblerProxy.cpp \ codeflinger/ARMAssembler.cpp \ codeflinger/CodeCache.cpp \ codeflinger/GGLAssembler.cpp \ codeflinger/load_store.cpp \ codeflinger/blending.cpp \ codeflinger/texturing.cpp \ codeflinger/disassem.c \ codeflinger/tinyutils/SharedBuffer.cpp \ codeflinger/tinyutils/VectorImpl.cpp \ fixed.cpp.arm \ Loading @@ -39,6 +37,8 @@ endif endif ifeq ($(TARGET_ARCH),arm) PIXELFLINGER_SRC_FILES += codeflinger/ARMAssembler.cpp PIXELFLINGER_SRC_FILES += codeflinger/disassem.c # special optimization flags for pixelflinger PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer endif Loading @@ -52,6 +52,14 @@ endif LOCAL_SHARED_LIBRARIES := libcutils liblog ifeq ($(TARGET_ARCH),aarch64) PIXELFLINGER_SRC_FILES += arch-aarch64/t32cb16blend.S PIXELFLINGER_SRC_FILES += arch-aarch64/col32cb16blend.S PIXELFLINGER_SRC_FILES += codeflinger/Aarch64Assembler.cpp PIXELFLINGER_SRC_FILES += codeflinger/Aarch64Disassembler.cpp PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer endif # # Shared library # Loading libpixelflinger/arch-aarch64/col32cb16blend.S 0 → 100644 +87 −0 Original line number Diff line number Diff line 
/* * Copyright (C) 2013 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ .text .align .global scanline_col32cb16blend_aarch64 // // This function alpha blends a fixed color into a destination scanline, using // the formula: // // d = s + (((a + (a >> 7)) * d) >> 8) // // where d is the destination pixel, // s is the source color, // a is the alpha channel of the source color. 
// // x0 = destination buffer pointer // w1 = color value // w2 = count scanline_col32cb16blend_aarch64: lsr w5, w1, #24 // shift down alpha mov w9, #0xff // create mask add w5, w5, w5, lsr #7 // add in top bit mov w4, #256 // create #0x100 sub w5, w4, w5 // invert alpha and w10, w1, #0xff // extract red and w12, w9, w1, lsr #8 // extract green and w4, w9, w1, lsr #16 // extract blue lsl w10, w10, #5 // prescale red lsl w12, w12, #6 // prescale green lsl w4, w4, #5 // prescale blue lsr w9, w9, #2 // create dest green mask 1: ldrh w8, [x0] // load dest pixel subs w2, w2, #1 // decrement loop counter lsr w6, w8, #11 // extract dest red and w7, w9, w8, lsr #5 // extract dest green and w8, w8, #0x1f // extract dest blue madd w6, w6, w5, w10 // dest red * alpha + src red madd w7, w7, w5, w12 // dest green * alpha + src green madd w8, w8, w5, w4 // dest blue * alpha + src blue lsr w6, w6, #8 // shift down red lsr w7, w7, #8 // shift down green lsl w6, w6, #11 // shift red into 565 orr w6, w6, w7, lsl #5 // shift green into 565 orr w6, w6, w8, lsr #8 // shift blue into 565 strh w6, [x0], #2 // store pixel to dest, update ptr b.ne 1b // if count != 0, loop ret libpixelflinger/arch-aarch64/t32cb16blend.S 0 → 100644 +213 −0 Original line number Diff line number Diff line /* * Copyright (C) 2013 The Android Open Source Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ .text .align .global scanline_t32cb16blend_aarch64 /* * .macro pixel * * This macro alpha blends RGB565 original pixel located in either * top or bottom 16 bits of DREG register with SRC 32 bit pixel value * and writes the result to FB register * * \DREG is a 32-bit register containing *two* original destination RGB565 * pixels, with the even one in the low-16 bits, and the odd one in the * high 16 bits. * * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors. * * \FB is a target register that will contain the blended pixel values. 
* * \ODD is either 0 or 1 and indicates if we're blending the lower or * upper 16-bit pixels in DREG into FB * * * clobbered: w6, w7, w16, w17, w18 * */ .macro pixel, DREG, SRC, FB, ODD // SRC = 0xAABBGGRR lsr w7, \SRC, #24 // sA add w7, w7, w7, lsr #7 // sA + (sA >> 7) mov w6, #0x100 sub w7, w6, w7 // sA = 0x100 - (sA+(sA>>7)) 1: .if \ODD //Blending odd pixel present in top 16 bits of DREG register // red lsr w16, \DREG, #(16 + 11) mul w16, w7, w16 lsr w6, \SRC, #3 and w6, w6, #0x1F add w16, w6, w16, lsr #8 cmp w16, #0x1F orr w17, \FB, #(0x1F<<(16 + 11)) orr w18, \FB, w16, lsl #(16 + 11) csel \FB, w17, w18, hi // green and w6, \DREG, #(0x3F<<(16 + 5)) lsr w17,w6,#(16+5) mul w6, w7, w17 lsr w16, \SRC, #(8+2) and w16, w16, #0x3F add w6, w16, w6, lsr #8 cmp w6, #0x3F orr w17, \FB, #(0x3F<<(16 + 5)) orr w18, \FB, w6, lsl #(16 + 5) csel \FB, w17, w18, hi // blue and w16, \DREG, #(0x1F << 16) lsr w17,w16,#16 mul w16, w7, w17 lsr w6, \SRC, #(8+8+3) and w6, w6, #0x1F add w16, w6, w16, lsr #8 cmp w16, #0x1F orr w17, \FB, #(0x1F << 16) orr w18, \FB, w16, lsl #16 csel \FB, w17, w18, hi .else //Blending even pixel present in bottom 16 bits of DREG register // red lsr w16, \DREG, #11 and w16, w16, #0x1F mul w16, w7, w16 lsr w6, \SRC, #3 and w6, w6, #0x1F add w16, w6, w16, lsr #8 cmp w16, #0x1F mov w17, #(0x1F<<11) lsl w18, w16, #11 csel \FB, w17, w18, hi // green and w6, \DREG, #(0x3F<<5) mul w6, w7, w6 lsr w16, \SRC, #(8+2) and w16, w16, #0x3F add w6, w16, w6, lsr #(5+8) cmp w6, #0x3F orr w17, \FB, #(0x3F<<5) orr w18, \FB, w6, lsl #5 csel \FB, w17, w18, hi // blue and w16, \DREG, #0x1F mul w16, w7, w16 lsr w6, \SRC, #(8+8+3) and w6, w6, #0x1F add w16, w6, w16, lsr #8 cmp w16, #0x1F orr w17, \FB, #0x1F orr w18, \FB, w16 csel \FB, w17, w18, hi .endif // End of blending even pixel .endm // End of pixel macro // x0: dst ptr // x1: src ptr // w2: count // w3: d // w4: s0 // w5: s1 // w6: pixel // w7: pixel // w8: free // w9: free // w10: free // w11: free // w12: scratch // w14: 
pixel scanline_t32cb16blend_aarch64: // align DST to 32 bits tst x0, #0x3 b.eq aligned subs w2, w2, #1 b.lo return last: ldr w4, [x1], #4 ldrh w3, [x0] pixel w3, w4, w12, 0 strh w12, [x0], #2 aligned: subs w2, w2, #2 b.lo 9f // The main loop is unrolled twice and processes 4 pixels 8: ldp w4,w5, [x1], #8 add x0, x0, #4 // it's all zero, skip this pixel orr w3, w4, w5 cbz w3, 7f // load the destination ldr w3, [x0, #-4] // stream the destination pixel w3, w4, w12, 0 pixel w3, w5, w12, 1 str w12, [x0, #-4] // 2nd iteration of the loop, don't stream anything subs w2, w2, #2 csel w4, w5, w4, lt blt 9f ldp w4,w5, [x1], #8 add x0, x0, #4 orr w3, w4, w5 cbz w3, 7f ldr w3, [x0, #-4] pixel w3, w4, w12, 0 pixel w3, w5, w12, 1 str w12, [x0, #-4] 7: subs w2, w2, #2 bhs 8b mov w4, w5 9: adds w2, w2, #1 b.lo return b last return: ret libpixelflinger/codeflinger/ARMAssemblerInterface.h +1 −1 Original line number Diff line number Diff line Loading @@ -63,7 +63,7 @@ public: }; enum { CODEGEN_ARCH_ARM = 1, CODEGEN_ARCH_MIPS CODEGEN_ARCH_ARM = 1, CODEGEN_ARCH_MIPS, CODEGEN_ARCH_AARCH64 }; // ----------------------------------------------------------------------- Loading Loading
include/private/pixelflinger/ggl_fixed.h +66 −1 Original line number Diff line number Diff line Loading @@ -457,6 +457,69 @@ inline int64_t gglMulii(int32_t x, int32_t y) { return u.res; } #elif defined(__aarch64__) // inline AArch64 implementations inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift) CONST; inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift) { GGLfixed result; GGLfixed round; asm("mov %x[round], #1 \n" "lsl %x[round], %x[round], %x[shift] \n" "lsr %x[round], %x[round], #1 \n" "smaddl %x[result], %w[x], %w[y],%x[round] \n" "lsr %x[result], %x[result], %x[shift] \n" : [round]"=&r"(round), [result]"=&r"(result) \ : [x]"r"(x), [y]"r"(y), [shift] "r"(shift) \ : ); return result; } inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST; inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) { GGLfixed result; asm("smull %x[result], %w[x], %w[y] \n" "lsr %x[result], %x[result], %x[shift] \n" "add %w[result], %w[result], %w[a] \n" : [result]"=&r"(result) \ : [x]"r"(x), [y]"r"(y), [a]"r"(a), [shift] "r"(shift) \ : ); return result; } inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST; inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) { GGLfixed result; int rshift; asm("smull %x[result], %w[x], %w[y] \n" "lsr %x[result], %x[result], %x[shift] \n" "sub %w[result], %w[result], %w[a] \n" : [result]"=&r"(result) \ : [x]"r"(x), [y]"r"(y), [a]"r"(a), [shift] "r"(shift) \ : ); return result; } inline int64_t gglMulii(int32_t x, int32_t y) CONST; inline int64_t gglMulii(int32_t x, int32_t y) { int64_t res; asm("smull %x0, %w1, %w2 \n" : "=r"(res) : "%r"(x), "r"(y) : ); return res; } #else // ---------------------------------------------------------------------- inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) CONST; Loading Loading @@ -498,7 +561,7 @@ inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c) { inline int32_t gglClz(int32_t x) 
CONST; inline int32_t gglClz(int32_t x) { #if (defined(__arm__) && !defined(__thumb__)) || defined(__mips__) #if (defined(__arm__) && !defined(__thumb__)) || defined(__mips__) || defined(__aarch64__) return __builtin_clz(x); #else if (!x) return 32; Loading Loading @@ -554,6 +617,8 @@ inline GGLfixed gglClampx(GGLfixed c) // clamps to zero in one instruction, but gcc won't generate it and // replace it by a cmp + movlt (it's quite amazing actually). asm("bic %0, %1, %1, asr #31\n" : "=r"(c) : "r"(c)); #elif defined(__aarch64__) asm("bic %w0, %w1, %w1, asr #31\n" : "=r"(c) : "r"(c)); #else c &= ~(c>>31); #endif Loading
libpixelflinger/Android.mk +10 −2 Original line number Diff line number Diff line Loading @@ -9,13 +9,11 @@ include $(CLEAR_VARS) PIXELFLINGER_SRC_FILES:= \ codeflinger/ARMAssemblerInterface.cpp \ codeflinger/ARMAssemblerProxy.cpp \ codeflinger/ARMAssembler.cpp \ codeflinger/CodeCache.cpp \ codeflinger/GGLAssembler.cpp \ codeflinger/load_store.cpp \ codeflinger/blending.cpp \ codeflinger/texturing.cpp \ codeflinger/disassem.c \ codeflinger/tinyutils/SharedBuffer.cpp \ codeflinger/tinyutils/VectorImpl.cpp \ fixed.cpp.arm \ Loading @@ -39,6 +37,8 @@ endif endif ifeq ($(TARGET_ARCH),arm) PIXELFLINGER_SRC_FILES += codeflinger/ARMAssembler.cpp PIXELFLINGER_SRC_FILES += codeflinger/disassem.c # special optimization flags for pixelflinger PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer endif Loading @@ -52,6 +52,14 @@ endif LOCAL_SHARED_LIBRARIES := libcutils liblog ifeq ($(TARGET_ARCH),aarch64) PIXELFLINGER_SRC_FILES += arch-aarch64/t32cb16blend.S PIXELFLINGER_SRC_FILES += arch-aarch64/col32cb16blend.S PIXELFLINGER_SRC_FILES += codeflinger/Aarch64Assembler.cpp PIXELFLINGER_SRC_FILES += codeflinger/Aarch64Disassembler.cpp PIXELFLINGER_CFLAGS += -fstrict-aliasing -fomit-frame-pointer endif # # Shared library # Loading
// libpixelflinger/arch-aarch64/col32cb16blend.S (new file, mode 100644, +87 −0)
/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

    .text
    .align
    .global scanline_col32cb16blend_aarch64

//
// This function alpha blends a fixed color into a destination scanline, using
// the formula:
//
//     d = s + (((a + (a >> 7)) * d) >> 8)
//
// where d is the destination pixel,
//       s is the source color,
//       a is the alpha channel of the source color.
//
//  x0 = destination buffer pointer (RGB565)
//  w1 = color value (0xAABBGGRR)
//  w2 = count (number of 16-bit pixels)
//

scanline_col32cb16blend_aarch64:
    // Loop-invariant setup: inverted alpha and the three source channels
    // pre-shifted so the per-pixel madd/lsr lands each channel in place.
    lsr         w5, w1, #24                     // shift down alpha
    mov         w9, #0xff                       // create mask
    add         w5, w5, w5, lsr #7              // add in top bit
    mov         w4, #256                        // create #0x100
    sub         w5, w4, w5                      // invert alpha
    and         w10, w1, #0xff                  // extract red
    and         w12, w9, w1, lsr #8             // extract green
    and         w4, w9, w1, lsr #16             // extract blue
    lsl         w10, w10, #5                    // prescale red
    lsl         w12, w12, #6                    // prescale green
    lsl         w4, w4, #5                      // prescale blue
    lsr         w9, w9, #2                      // create dest green mask

1:
    ldrh        w8, [x0]                        // load dest pixel
    subs        w2, w2, #1                      // decrement loop counter
    lsr         w6, w8, #11                     // extract dest red
    and         w7, w9, w8, lsr #5              // extract dest green
    and         w8, w8, #0x1f                   // extract dest blue

    madd        w6, w6, w5, w10                 // dest red * alpha + src red
    madd        w7, w7, w5, w12                 // dest green * alpha + src green
    madd        w8, w8, w5, w4                  // dest blue * alpha + src blue

    lsr         w6, w6, #8                      // shift down red
    lsr         w7, w7, #8                      // shift down green
    lsl         w6, w6, #11                     // shift red into 565
    orr         w6, w6, w7, lsl #5              // shift green into 565
    orr         w6, w6, w8, lsr #8              // shift blue into 565

    strh        w6, [x0], #2                    // store pixel to dest, update ptr
    // none of the instructions since "subs" set flags, so b.ne tests the
    // counter decrement above
    b.ne        1b                              // if count != 0, loop

    ret
// libpixelflinger/arch-aarch64/t32cb16blend.S (new file, mode 100644, +213 −0)
/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

    .text
    .align
    .global scanline_t32cb16blend_aarch64

/*
 * .macro pixel
 *
 * This macro alpha blends RGB565 original pixel located in either
 * top or bottom 16 bits of DREG register with SRC 32 bit pixel value
 * and writes the result to FB register
 *
 * \DREG is a 32-bit register containing *two* original destination RGB565
 *   pixels, with the even one in the low-16 bits, and the odd one in the
 *   high 16 bits.
 *
 * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *
 * \FB is a target register that will contain the blended pixel values.
 *
 * \ODD is either 0 or 1 and indicates if we're blending the lower or
 *   upper 16-bit pixels in DREG into FB
 *
 * clobbered: w6, w7, w16, w17, w18
 *
 * NOTE(review): w18 is the platform register under AAPCS64; using it as a
 * scratch here assumes the target OS does not reserve it — confirm for the
 * intended ABI before reuse.
 */

.macro pixel,   DREG, SRC, FB, ODD

    // SRC = 0xAABBGGRR
    lsr     w7, \SRC, #24               // sA
    add     w7, w7, w7, lsr #7          // sA + (sA >> 7)
    mov     w6, #0x100
    sub     w7, w6, w7                  // sA = 0x100 - (sA+(sA>>7))

1:

.if \ODD //Blending odd pixel present in top 16 bits of DREG register

    // red
    lsr     w16, \DREG, #(16 + 11)
    mul     w16, w7, w16
    lsr     w6, \SRC, #3
    and     w6, w6, #0x1F
    add     w16, w6, w16, lsr #8
    cmp     w16, #0x1F                  // saturate: select the 0x1F mask on overflow
    orr     w17, \FB, #(0x1F<<(16 + 11))
    orr     w18, \FB, w16, lsl #(16 + 11)
    csel    \FB, w17, w18, hi

    // green
    and     w6, \DREG, #(0x3F<<(16 + 5))
    lsr     w17,w6,#(16+5)
    mul     w6, w7, w17
    lsr     w16, \SRC, #(8+2)
    and     w16, w16, #0x3F
    add     w6, w16, w6, lsr #8
    cmp     w6, #0x3F
    orr     w17, \FB, #(0x3F<<(16 + 5))
    orr     w18, \FB, w6, lsl #(16 + 5)
    csel    \FB, w17, w18, hi

    // blue
    and     w16, \DREG, #(0x1F << 16)
    lsr     w17,w16,#16
    mul     w16, w7, w17
    lsr     w6, \SRC, #(8+8+3)
    and     w6, w6, #0x1F
    add     w16, w6, w16, lsr #8
    cmp     w16, #0x1F
    orr     w17, \FB, #(0x1F << 16)
    orr     w18, \FB, w16, lsl #16
    csel    \FB, w17, w18, hi

.else //Blending even pixel present in bottom 16 bits of DREG register

    // red
    lsr     w16, \DREG, #11
    and     w16, w16, #0x1F
    mul     w16, w7, w16
    lsr     w6, \SRC, #3
    and     w6, w6, #0x1F
    add     w16, w6, w16, lsr #8
    cmp     w16, #0x1F
    mov     w17, #(0x1F<<11)            // even path writes FB fresh (no orr with old FB)
    lsl     w18, w16, #11
    csel    \FB, w17, w18, hi

    // green
    and     w6, \DREG, #(0x3F<<5)
    mul     w6, w7, w6                  // field kept in place; compensated by the #(5+8) shift
    lsr     w16, \SRC, #(8+2)
    and     w16, w16, #0x3F
    add     w6, w16, w6, lsr #(5+8)
    cmp     w6, #0x3F
    orr     w17, \FB, #(0x3F<<5)
    orr     w18, \FB, w6, lsl #5
    csel    \FB, w17, w18, hi

    // blue
    and     w16, \DREG, #0x1F
    mul     w16, w7, w16
    lsr     w6, \SRC, #(8+8+3)
    and     w6, w6, #0x1F
    add     w16, w6, w16, lsr #8
    cmp     w16, #0x1F
    orr     w17, \FB, #0x1F
    orr     w18, \FB, w16
    csel    \FB, w17, w18, hi

.endif // End of blending even pixel

.endm // End of pixel macro

//  x0: dst ptr
//  x1: src ptr
//  w2: count
//  w3: d
//  w4: s0
//  w5: s1
//  w6: pixel
//  w7: pixel
//  w8: free
//  w9: free
// w10: free
// w11: free
// w12: scratch
// w14: pixel

scanline_t32cb16blend_aarch64:

    // align DST to 32 bits
    tst     x0, #0x3
    b.eq    aligned
    subs    w2, w2, #1
    b.lo    return

last:
    ldr     w4, [x1], #4
    ldrh    w3, [x0]
    pixel   w3, w4, w12, 0
    strh    w12, [x0], #2

aligned:
    subs    w2, w2, #2
    b.lo    9f

// The main loop is unrolled twice and processes 4 pixels
8:
    ldp     w4,w5, [x1], #8
    add     x0, x0, #4
    // it's all zero, skip this pixel
    orr     w3, w4, w5
    cbz     w3, 7f
    // load the destination
    ldr     w3, [x0, #-4]
    // stream the destination
    pixel   w3, w4, w12, 0
    pixel   w3, w5, w12, 1
    str     w12, [x0, #-4]
    // 2nd iteration of the loop, don't stream anything
    subs    w2, w2, #2
    csel    w4, w5, w4, lt
    blt     9f
    ldp     w4,w5, [x1], #8
    add     x0, x0, #4
    orr     w3, w4, w5
    cbz     w3, 7f
    ldr     w3, [x0, #-4]
    pixel   w3, w4, w12, 0
    pixel   w3, w5, w12, 1
    str     w12, [x0, #-4]
7:  subs    w2, w2, #2
    // NOTE(review): mixes "b.lo"/"b.eq" with "blt"/"bhs" spellings —
    // presumably both are accepted by the assembler in use; confirm and
    // normalize to one style.
    bhs     8b
    mov     w4, w5
9:  adds    w2, w2, #1
    b.lo    return
    b       last
return:
    ret
libpixelflinger/codeflinger/ARMAssemblerInterface.h +1 −1 Original line number Diff line number Diff line Loading @@ -63,7 +63,7 @@ public: }; enum { CODEGEN_ARCH_ARM = 1, CODEGEN_ARCH_MIPS CODEGEN_ARCH_ARM = 1, CODEGEN_ARCH_MIPS, CODEGEN_ARCH_AARCH64 }; // ----------------------------------------------------------------------- Loading