Bug 881587 - Use SSE2 version of AudioNodeEngine.cpp routines added in bug 815643. r?padenot draft
author Dan Minor <dminor@mozilla.com>
Thu, 14 Apr 2016 08:57:21 -0400
changeset 351946 c706a971fc370521ad10e05ba63181818db7a99c
parent 351945 7e2e4736f83f3c516a2d1d0dc8de90ee2763e2ac
child 518529 8887507e89aba4fc828d6bd0a74f1569b966be51
push id 15558
push user dminor@mozilla.com
push date Fri, 15 Apr 2016 09:23:03 +0000
reviewers padenot
bugs 881587, 815643
milestone 48.0a1
Bug 881587 - Use SSE2 version of AudioNodeEngine.cpp routines added in bug 815643. r?padenot MozReview-Commit-ID: 3cfU3oTruAC
dom/media/webaudio/AudioNodeEngine.cpp
dom/media/webaudio/AudioNodeEngineSSE2.cpp
--- a/dom/media/webaudio/AudioNodeEngine.cpp
+++ b/dom/media/webaudio/AudioNodeEngine.cpp
@@ -130,16 +130,24 @@ AudioBlockCopyChannelWithScale(const flo
 }
 
 void
 BufferComplexMultiply(const float* aInput,
                       const float* aScale,
                       float* aOutput,
                       uint32_t aSize)
 {
+
+#ifdef USE_SSE2
+  if (mozilla::supports_sse()) {
+    BufferComplexMultiply_SSE(aInput, aScale, aOutput, aSize);
+    return;
+  }
+#endif
+
   for (uint32_t i = 0; i < aSize * 2; i += 2) {
     float real1 = aInput[i];
     float imag1 = aInput[i + 1];
     float real2 = aScale[i];
     float imag2 = aScale[i + 1];
     float realResult = real1 * real2 - imag1 * imag2;
     float imagResult = real1 * imag2 + imag1 * real2;
     aOutput[i] = realResult;
@@ -308,16 +316,37 @@ AudioBlockPanStereoToStereo(const float 
     }
   }
 }
 
 float
 AudioBufferSumOfSquares(const float* aInput, uint32_t aLength)
 {
   float sum = 0.0f;
+  // Sum of squares of aLength samples; SSE2 builds vectorize the middle.
+#ifdef USE_SSE2
+  if (mozilla::supports_sse()) {
+    const float* alignedInput = ALIGNED16(aInput);
+    // Scalar head: walk up to alignedInput, never past aLength samples.
+    while (aLength && aInput != alignedInput) {
+      sum += *aInput * *aInput;
+      ++aInput;
+      --aLength;
+    }
+
+    // Vector body: largest multiple of 16 left (0 if the head used it all).
+    uint32_t vLength = (aLength >> 4) << 4;
+    sum += AudioBufferSumOfSquares_SSE(alignedInput, vLength);
+
+    // Scalar tail: skip what was vectorized, leave the rest for below.
+    aInput += vLength;
+    aLength -= vLength;
+  }
+#endif
+
   while (aLength--) {
     sum += *aInput * *aInput;
     ++aInput;
   }
   return sum;
 }
 
 void
--- a/dom/media/webaudio/AudioNodeEngineSSE2.cpp
+++ b/dom/media/webaudio/AudioNodeEngineSSE2.cpp
@@ -218,16 +218,21 @@ void BufferComplexMultiply_SSE(const flo
                                float* aOutput,
                                uint32_t aSize)
 {
   unsigned i;
   __m128 in0, in1, in2, in3,
          outreal0, outreal1, outreal2, outreal3,
          outimag0, outimag1, outimag2, outimag3;
 
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aScale);
+  ASSERT_ALIGNED16(aOutput);
+  ASSERT_MULTIPLE16(aSize);
+
   for (i = 0; i < aSize * 2; i += 16) {
     in0 = _mm_load_ps(&aInput[i]);
     in1 = _mm_load_ps(&aInput[i + 4]);
     in2 = _mm_load_ps(&aInput[i + 8]);
     in3 = _mm_load_ps(&aInput[i + 12]);
 
     outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
     outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
@@ -268,16 +273,19 @@ void BufferComplexMultiply_SSE(const flo
 float
 AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength)
 {
   unsigned i;
   __m128 in0, in1, in2, in3,
          acc0, acc1, acc2, acc3;
   float out[4];
 
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_MULTIPLE16(aLength);
+
   acc0 = _mm_setzero_ps();
   acc1 = _mm_setzero_ps();
   acc2 = _mm_setzero_ps();
   acc3 = _mm_setzero_ps();
 
   for (i = 0; i < aLength; i+=16) {
     in0 = _mm_load_ps(&aInput[i]);
     in1 = _mm_load_ps(&aInput[i + 4]);