Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a2fa90ba authored by Treehugger Robot's avatar Treehugger Robot Committed by Gerrit Code Review
Browse files

Merge "Supported FMA Intrinsics in Audio Resampler FIR processing module"

parents 124176d8 8070d8dd
Loading
Loading
Loading
Loading
+19 −0
Original line number Diff line number Diff line
@@ -19,6 +19,25 @@ cc_defaults {
        // uncomment to disable NEON on architectures that actually do support NEON, for benchmarking
        // "-DUSE_NEON=false",
    ],

    // Extra compiler flags for x86 and x86_64 avx2 variants: enable AVX2 and
    // FMA code generation. -mfma is listed explicitly because -mavx2 on its
    // own does not enable the FMA instruction set.
    // NOTE(review): presumably the avx2 sub-block is applied only when the
    // target arch variant advertises AVX2 - confirm against the Soong docs.
    arch: {
        x86: {
            avx2: {
                cflags: [
                    "-mavx2",
                    "-mfma",
                ],
            },
        },
        x86_64: {
            avx2: {
                cflags: [
                    "-mavx2",
                    "-mfma",
                ],
            },
        },
    },
}

cc_library_shared {
+8 −1
Original line number Diff line number Diff line
@@ -36,13 +36,20 @@ namespace android {
#include <arm_neon.h>
#endif

#if defined(__SSSE3__)  // Should be supported in x86 ABI for both 32 & 64-bit.
// Compile-time selection of the x86 SIMD paths. USE_AVX2 and USE_SSE are
// always defined as parenthesized boolean constants, so they are usable both
// in #if directives and in ordinary expressions.
#if defined(__AVX2__)  // Optional extension: only set when the build enables it (e.g. -mavx2).
#define USE_AVX2 (true)  // Use AVX2/FMA intrinsics
#define USE_SSE (true)
#include <immintrin.h>
#elif defined(__SSSE3__)  // Should be supported in x86 ABI for both 32 & 64-bit.
#define USE_SSE (true)  // Use SSE intrinsics
#define USE_AVX2 (false)
#include <tmmintrin.h>
#else
#define USE_SSE (false)
// The space before '(' matters: "USE_AVX2(false)" would declare a
// function-like macro with an empty body instead of the constant false.
#define USE_AVX2 (false)
#endif


template<typename T, typename U>
struct is_same
{
+34 −11
Original line number Diff line number Diff line
@@ -80,11 +80,16 @@ static inline void ProcessSSEIntrinsic(float* out,
            posCoef1 = _mm_sub_ps(posCoef1, posCoef);
            negCoef = _mm_sub_ps(negCoef, negCoef1);


            // Blend the coefficient pairs with interp. After the subtractions
            // above, posCoef1 holds (posCoef1 - posCoef) and negCoef holds
            // (negCoef - negCoef1), so both branches compute
            //   posCoef = posCoef  + interp * (posCoef1 - posCoef)
            //   negCoef = negCoef1 + interp * (negCoef  - negCoef1)
            #if USE_AVX2
            // _mm_fmadd_ps(a, b, c) evaluates a * b + c as one fused operation
            // (single rounding, so low-order bits may differ from the #else path).
            posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
            negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
            #else
            // Separate multiply then add.
            posCoef1 = _mm_mul_ps(posCoef1, interp);
            negCoef = _mm_mul_ps(negCoef, interp);

            posCoef = _mm_add_ps(posCoef1, posCoef);
            negCoef = _mm_add_ps(negCoef, negCoef1);
            #endif //USE_AVX2
        }
        switch (CHANNELS) {
        case 1: {
@@ -94,11 +99,17 @@ static inline void ProcessSSEIntrinsic(float* out,
            sN += 4;

            posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);

            // Accumulate posSamp * posCoef and negSamp * negCoef into accL.
            #if USE_AVX2
            // Fused multiply-add: accL = samp * coef + accL in one operation.
            accL = _mm_fmadd_ps(posSamp, posCoef, accL);
            accL = _mm_fmadd_ps(negSamp, negCoef, accL);
            #else
            // Multiply first (overwriting the sample registers), then add.
            posSamp = _mm_mul_ps(posSamp, posCoef);
            negSamp = _mm_mul_ps(negSamp, negCoef);

            accL = _mm_add_ps(accL, posSamp);
            accL = _mm_add_ps(accL, negSamp);
            #endif

        } break;
        case 2: {
            __m128 posSamp0 = _mm_loadu_ps(sP);
@@ -114,6 +125,12 @@ static inline void ProcessSSEIntrinsic(float* out,
            __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
            __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);

           #if USE_AVX2
           accL = _mm_fmadd_ps(posSampL, posCoef, accL);
           accR = _mm_fmadd_ps(posSampR, posCoef, accR);
           accL = _mm_fmadd_ps(negSampL, negCoef, accL);
           accR = _mm_fmadd_ps(negSampR, negCoef, accR);
           #else
           posSampL = _mm_mul_ps(posSampL, posCoef);
           posSampR = _mm_mul_ps(posSampR, posCoef);
           negSampL = _mm_mul_ps(negSampL, negCoef);
@@ -123,6 +140,8 @@ static inline void ProcessSSEIntrinsic(float* out,
           accR = _mm_add_ps(accR, posSampR);
           accL = _mm_add_ps(accL, negSampL);
           accR = _mm_add_ps(accR, negSampR);
           #endif

        } break;
        }
    } while (count -= 4);
@@ -144,9 +163,13 @@ static inline void ProcessSSEIntrinsic(float* out,
        outAccum = _mm_hadd_ps(accL, accR);
        outAccum = _mm_hadd_ps(outAccum, outAccum);
    }

    // Scale the accumulated result by vLR and add it onto outSamp:
    //   outSamp = outSamp + outAccum * vLR
    // NOTE(review): vLR presumably holds the per-channel volume - confirm.
    #if USE_AVX2
    // Single fused multiply-add.
    outSamp = _mm_fmadd_ps(outAccum, vLR,outSamp);
    #else
    // Separate multiply then add.
    outAccum = _mm_mul_ps(outAccum, vLR);
    outSamp = _mm_add_ps(outSamp, outAccum);
    #endif

    _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
}