Bug 1158741 - Implement a version of omxSP_FFTInv_CCSToR_F32_Sfs in openmax DL's FFT that is not scaled r?padenot draft
author: Dan Minor <dminor@mozilla.com>
Mon, 25 Jan 2016 06:38:29 -0500
changeset 335366 af21895753dc70365a4c7e3ee8013736817db425
parent 334552 c1e0d1890cfee9d86c8d566b0490053f21e0afc6
child 515122 6bb8da4f259a508f3a1fef4fa32ce964db84d5db
push id: 11775
push user: dminor@mozilla.com
push date: Mon, 29 Feb 2016 13:59:37 +0000
reviewers: padenot
bugs: 1158741
milestone: 47.0a1
Bug 1158741 - Implement a version of omxSP_FFTInv_CCSToR_F32_Sfs in openmax DL's FFT that is not scaled r?padenot The new routine actually multiplies by two for consistency with the other FFT routines in use. MozReview-Commit-ID: Hk2Dg3fR2cQ
dom/media/webaudio/FFTBlock.h
media/openmax_dl/README.mozilla
media/openmax_dl/dl/moz.build
media/openmax_dl/dl/sp/api/omxSP.h
media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_unscaled_s.S
--- a/dom/media/webaudio/FFTBlock.h
+++ b/dom/media/webaudio/FFTBlock.h
@@ -116,20 +116,17 @@ public:
       AudioBufferCopyWithScale(mOutputBuffer.Elements()->f, 2.0f,
                                aDataOut, mFFTSize);
       aDataOut[1] = 2.0f * mOutputBuffer[mFFTSize/2].r; // Packed Nyquist
       av_rdft_calc(mAvIRDFT, aDataOut);
     }
 #else
 #ifdef BUILD_ARM_NEON
     if (mozilla::supports_neon()) {
-      omxSP_FFTInv_CCSToR_F32_Sfs(mOutputBuffer.Elements()->f, aDataOut, mOmxIFFT);
-      // There is no function that computes de inverse FFT without scaling, so
-      // we have to scale back up here. Bug 1158741.
-      AudioBufferInPlaceScale(aDataOut, mFFTSize, mFFTSize);
+      omxSP_FFTInv_CCSToR_F32_Sfs_unscaled(mOutputBuffer.Elements()->f, aDataOut, mOmxIFFT);
     } else
 #endif
     {
       kiss_fftri(mKissIFFT, &(mOutputBuffer.Elements()->c), aDataOut);
     }
 #endif
   }
 
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/README.mozilla
@@ -0,0 +1,9 @@
+Bug 1158741 added an omxSP_FFTInv_CCSToR_F32_Sfs_unscaled function as an
+optimization which performs the same operation as
+omxSP_FFTInv_CCSToR_F32_Sfs except it doesn't scale the results by the
+length of the FFT. For consistency with other FFT routines used, it does
+multiply the results by two.
+
+The affected files are:
+media/openmax_dl/dl/sp/api/omxSP.h
+media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_unscaled_s.S
--- a/media/openmax_dl/dl/moz.build
+++ b/media/openmax_dl/dl/moz.build
@@ -63,16 +63,17 @@ if CONFIG['CPU_ARCH'] == 'arm' and CONFI
         'sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S',
         'sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S',
         'sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S',
         'sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S',
         'sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S',
         'sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S',
         'sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S',
         'sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S',
+        'sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_unscaled_s.S',
         'sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S',
         'sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S',
         'sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S',
         'sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S',
         'sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S',
     ]
 
     LOCAL_INCLUDES += [
--- a/media/openmax_dl/dl/sp/api/omxSP.h
+++ b/media/openmax_dl/dl/sp/api/omxSP.h
@@ -2593,16 +2593,28 @@ OMXResult omxSP_FFTInv_CCSToR_F32_Sfs_vf
 extern OMXResult (*omxSP_FFTInv_CCSToR_F32)(
     const OMX_F32* pSrc,
     OMX_F32* pDst,
     const OMXFFTSpec_R_F32* pFFTSpec);
 #else
 #define omxSP_FFTInv_CCSToR_F32 omxSP_FFTInv_CCSToR_F32_Sfs    
 #endif
 
+/*
+ * Same as omxSP_FFTInv_CCSToR_F32_Sfs, but the result is not scaled down by
+ * the FFT length. Instead the output is multiplied by two, for consistency
+ * with the other FFT routines in use. Takes the same three arguments.
+ */
+OMXResult omxSP_FFTInv_CCSToR_F32_Sfs_unscaled(
+    const OMX_F32* pSrc,
+    OMX_F32* pDst,
+    const OMXFFTSpec_R_F32* pFFTSpec
+);
+
+
 #ifdef __cplusplus
 }
 #endif
 
 #endif /** end of #define _OMXSP_H_ */
 
 /** EOF */
 
new file mode 100644
--- /dev/null
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_unscaled_s.S
@@ -0,0 +1,284 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Copyright 2016, Mozilla Foundation and contributors
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTInv_CCSToR_S32_Sfs_s.s
+@//  to support float instead of SC32.
+@//
+@//  It is further modified to produce an "unscaled" version, which
+@//  actually multiplies by two for consistency with the other FFT functions
+@//  in use.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+      @// Guarding implementation by the processor name
+
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+
+
+@// Input registers
+@// pSrc, pDst and pFFTSpec follow the order of the C prototype parameters.
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+@// NOTE(review): the alias for r3 above is not referenced as a register
+@// operand anywhere in this file; the C prototype has only three parameters.
+@// Output registers
+#define result          r0
+
+@// Local scratch registers
+@// Several aliases below map to the same core register (r4, for example).
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total number of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+
+#define pOut1           r2
+#define size            r7
+#define step            r8
+#define step1           r9
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@// Neon registers
+@// As with the core registers, several aliases share one D register.
+#define dX0     D0.F32
+#define dShift  D1.F32
+#define dX1     D1.F32
+#define dY0     D2.F32
+#define dY1     D3.F32
+#define dX0r    D0.F32
+#define dX0i    D1.F32
+#define dX1r    D2.F32
+#define dX1i    D3.F32
+#define dW0r    D4.F32
+#define dW0i    D5.F32
+#define dW1r    D6.F32
+#define dW1i    D7.F32
+#define dT0     D8.F32
+#define dT1     D9.F32
+#define dT2     D10.F32
+#define dT3     D11.F32
+#define qT0     d12.F32
+#define qT1     d14.F32
+#define qT2     d16.F32
+#define qT3     d18.F32
+#define dY0r    D4.F32
+#define dY0i    D5.F32
+#define dY1r    D6.F32
+#define dY1i    D7.F32
+#define dzero   D20.F32
+
+#define dY2     D4.F32
+#define dY3     D5.F32
+#define dW0     D6.F32
+#define dW1     D7.F32
+#define dW0Tmp  D10.F32
+#define dW1Neg  D11.F32
+@// NOTE(review): the S0 and S1 aliases below are not used by any instruction in this file.
+#define sN      S0.S32
+#define fN      S1.F32
+@// The single-precision alias (S4) must be lane 0 of the scale vector (D2): the constant is written through one and read through the other.
+#define dScale  D2.F32
+#define two S4.F32
+
+
+    @// Allocate stack memory required by the function
+        M_ALLOC4        complexFFTSize, 4
+
+    @// Write function header
+        M_START     omxSP_FFTInv_CCSToR_F32_Sfs_unscaled,r11,d15
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Read the FFT size from the spec (its log2 is computed further down)
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read the twiddle table and scratch buffer pointers from the spec
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        @//  Size <= 1 is treated separately: copy one value from source to destination
+        CMP     N,#1
+        BGT     sizeGreaterThanOne
+        VLD1    dX0[0],[pSrc]
+        VST1    dX0[0],[pDst]
+        @// NOTE(review): this path branches past FFTEnd, so the x2 scaling is not applied
+        B       End
+
+sizeGreaterThanOne:
+
+        @// Call the preTwiddle Radix2 stage before doing the complex IFFT
+
+
+        BL    armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe
+
+
+complexIFFT:
+
+        ASR     N,N,#1                             @// N/2 point complex IFFT
+        M_STR   N, complexFFTSize                  @ Stored, but never loaded back in this file
+        ADD     pSrc,pOut,N,LSL #3                 @// set pSrc as pOut1
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31                     @// order = 31 - CLZ(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order >= 1
+        @// order == 0: copy one D register (one complex value) and go scale it
+        VLD1    dX0,[pSrc]
+        VST1    dX0,[pDst]
+        MOV     pSrc,pDst
+        BLT     FFTEnd                              @// flags are still LT from CMP order,#1
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        @// Pass the first stage destination in RN5
+        MOVEQ   pOut,pDst
+        MOV     argTwiddle,pTwiddle
+        @// The MOVs above leave the flags from CMP order,#2 intact
+        BGE     orderGreaterthan1
+        BLLT    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        B       FFTEnd
+
+orderGreaterthan1:
+        MOV     tmpOrder,order                          @// order is r14 (LR), which BL overwrites
+        BL      armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        CMP     tmpOrder,#2                             @// assumes the callee preserved r4 - TODO confirm
+        BLGT    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe
+        BL      armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+
+orderGreaterthan3:
+specialScaleCase:
+
+        @// Set input args to fft stages
+        TST     order, #2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        @// Pass the first stage destination in RN5
+        MOVEQ   pOut,pDst
+        MOV     argTwiddle,pTwiddle
+
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though
+        @// the first BL would corrupt the flags. This is because the end of
+        @// the "grpZeroSetLoop" loop inside
+        @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
+        @// to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+        @// Loop on the CMP result: equal to 4 goes to the last stage, otherwise run a middle stage
+unscaledRadix4Loop:
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4                  @// presumably updated by the callee - TODO confirm
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+FFTEnd:                                               @// Does only the scaling
+        @ Scale inverse FFT result by 2 for consistency with other FFTs
+        VMOV    two, 2.0                   @ two = dScale[0]
+
+        @// Loop counter is subFFTSize ; data pointer is pSrc, which holds pDst
+scaleFFTData:
+        VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1
+        VMUL    dX0, dX0, dScale[0]
+        VST1    {dX0},[pSrc]!
+        @// One D register (two floats) is scaled in place per pass; flags come from SUBS
+        BGT     scaleFFTData
+
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+
+
+        .end