Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 0f2787e8 authored by Automerger Merge Worker's avatar Automerger Merge Worker
Browse files

Merge "Supported FMA Intrinsics in Audio Resampler FIR processing module" am:...

Merge "Supported FMA Intrinsics in Audio Resampler FIR processing module" am: a2fa90ba am: 865d6023 am: 85296af8

Change-Id: I6fe62e43b79e8881e7267f0c5fe3c5e11f12dfea
parents 00a61b56 85296af8
Loading
Loading
Loading
Loading
+19 −0
Original line number Original line Diff line number Diff line
@@ -19,6 +19,25 @@ cc_defaults {
        // uncomment to disable NEON on architectures that actually do support NEON, for benchmarking
        // uncomment to disable NEON on architectures that actually do support NEON, for benchmarking
        // "-DUSE_NEON=false",
        // "-DUSE_NEON=false",
    ],
    ],

    arch: {
        x86: {
            avx2: {
                cflags: [
                    "-mavx2",
                    "-mfma",
                ],
            },
        },
        x86_64: {
            avx2: {
                cflags: [
                    "-mavx2",
                    "-mfma",
                ],
            },
        },
    },
}
}


cc_library_shared {
cc_library_shared {
+8 −1
Original line number Original line Diff line number Diff line
@@ -36,13 +36,20 @@ namespace android {
#include <arm_neon.h>
#include <arm_neon.h>
#endif
#endif


#if defined(__SSSE3__)  // Should be supported in x86 ABI for both 32 & 64-bit.
#if defined(__AVX2__)  // Defined only when the compiler targets AVX2 (e.g. built with -mavx2).
#define USE_AVX2 (true)  // Use AVX2/FMA intrinsics; the FMA calls also need -mfma at build time.
#define USE_SSE (true)
#define USE_SSE (true)
#include <immintrin.h>
#elif defined(__SSSE3__)  // Should be supported in x86 ABI for both 32 & 64-bit.
#define USE_SSE (true)  // Use SSE intrinsics (mul + add, no fused multiply-add)
#define USE_AVX2 (false)
#include <tmmintrin.h>
#include <tmmintrin.h>
#else
#else
#define USE_SSE (false)
#define USE_SSE (false)
// No AVX2/FMA path on this target. The space before '(' is required: without
// it this would define a function-like macro (parameter named `false`, empty
// body) and USE_AVX2 would not expand to a usable value.
#define USE_AVX2 (false)
#endif
#endif



template<typename T, typename U>
template<typename T, typename U>
struct is_same
struct is_same
{
{
+34 −11
Original line number Original line Diff line number Diff line
@@ -80,11 +80,16 @@ static inline void ProcessSSEIntrinsic(float* out,
            posCoef1 = _mm_sub_ps(posCoef1, posCoef);
            posCoef1 = _mm_sub_ps(posCoef1, posCoef);
            negCoef = _mm_sub_ps(negCoef, negCoef1);
            negCoef = _mm_sub_ps(negCoef, negCoef1);



            #if USE_AVX2
            posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
            negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
            #else
            posCoef1 = _mm_mul_ps(posCoef1, interp);
            posCoef1 = _mm_mul_ps(posCoef1, interp);
            negCoef = _mm_mul_ps(negCoef, interp);
            negCoef = _mm_mul_ps(negCoef, interp);

            posCoef = _mm_add_ps(posCoef1, posCoef);
            posCoef = _mm_add_ps(posCoef1, posCoef);
            negCoef = _mm_add_ps(negCoef, negCoef1);
            negCoef = _mm_add_ps(negCoef, negCoef1);
            #endif //USE_AVX2
        }
        }
        switch (CHANNELS) {
        switch (CHANNELS) {
        case 1: {
        case 1: {
@@ -94,11 +99,17 @@ static inline void ProcessSSEIntrinsic(float* out,
            sN += 4;
            sN += 4;


            posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
            posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);

            #if USE_AVX2
            accL = _mm_fmadd_ps(posSamp, posCoef, accL);
            accL = _mm_fmadd_ps(negSamp, negCoef, accL);
            #else
            posSamp = _mm_mul_ps(posSamp, posCoef);
            posSamp = _mm_mul_ps(posSamp, posCoef);
            negSamp = _mm_mul_ps(negSamp, negCoef);
            negSamp = _mm_mul_ps(negSamp, negCoef);

            accL = _mm_add_ps(accL, posSamp);
            accL = _mm_add_ps(accL, posSamp);
            accL = _mm_add_ps(accL, negSamp);
            accL = _mm_add_ps(accL, negSamp);
            #endif

        } break;
        } break;
        case 2: {
        case 2: {
            __m128 posSamp0 = _mm_loadu_ps(sP);
            __m128 posSamp0 = _mm_loadu_ps(sP);
@@ -114,6 +125,12 @@ static inline void ProcessSSEIntrinsic(float* out,
            __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
            __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
            __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
            __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);


           #if USE_AVX2
           accL = _mm_fmadd_ps(posSampL, posCoef, accL);
           accR = _mm_fmadd_ps(posSampR, posCoef, accR);
           accL = _mm_fmadd_ps(negSampL, negCoef, accL);
           accR = _mm_fmadd_ps(negSampR, negCoef, accR);
           #else
           posSampL = _mm_mul_ps(posSampL, posCoef);
           posSampL = _mm_mul_ps(posSampL, posCoef);
           posSampR = _mm_mul_ps(posSampR, posCoef);
           posSampR = _mm_mul_ps(posSampR, posCoef);
           negSampL = _mm_mul_ps(negSampL, negCoef);
           negSampL = _mm_mul_ps(negSampL, negCoef);
@@ -123,6 +140,8 @@ static inline void ProcessSSEIntrinsic(float* out,
           accR = _mm_add_ps(accR, posSampR);
           accR = _mm_add_ps(accR, posSampR);
           accL = _mm_add_ps(accL, negSampL);
           accL = _mm_add_ps(accL, negSampL);
           accR = _mm_add_ps(accR, negSampR);
           accR = _mm_add_ps(accR, negSampR);
           #endif

        } break;
        } break;
        }
        }
    } while (count -= 4);
    } while (count -= 4);
@@ -144,9 +163,13 @@ static inline void ProcessSSEIntrinsic(float* out,
        outAccum = _mm_hadd_ps(accL, accR);
        outAccum = _mm_hadd_ps(accL, accR);
        outAccum = _mm_hadd_ps(outAccum, outAccum);
        outAccum = _mm_hadd_ps(outAccum, outAccum);
    }
    }

    #if USE_AVX2
    outSamp = _mm_fmadd_ps(outAccum, vLR,outSamp);
    #else
    outAccum = _mm_mul_ps(outAccum, vLR);
    outAccum = _mm_mul_ps(outAccum, vLR);
    outSamp = _mm_add_ps(outSamp, outAccum);
    outSamp = _mm_add_ps(outSamp, outAccum);
    #endif

    _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
    _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
}
}