Bug 881587 - Use SSE2 version of AudioNodeEngine.cpp routines added in
bug 815643. r?padenot
MozReview-Commit-ID: 3cfU3oTruAC
--- a/dom/media/webaudio/AudioNodeEngine.cpp
+++ b/dom/media/webaudio/AudioNodeEngine.cpp
@@ -130,16 +130,24 @@ AudioBlockCopyChannelWithScale(const flo
}
void
BufferComplexMultiply(const float* aInput,
const float* aScale,
float* aOutput,
uint32_t aSize)
{
+
+#ifdef USE_SSE2
+ if (mozilla::supports_sse()) {
+ BufferComplexMultiply_SSE(aInput, aScale, aOutput, aSize);
+ return;
+ }
+#endif
+
for (uint32_t i = 0; i < aSize * 2; i += 2) {
float real1 = aInput[i];
float imag1 = aInput[i + 1];
float real2 = aScale[i];
float imag2 = aScale[i + 1];
float realResult = real1 * real2 - imag1 * imag2;
float imagResult = real1 * imag2 + imag1 * real2;
aOutput[i] = realResult;
@@ -308,16 +316,37 @@ AudioBlockPanStereoToStereo(const float
}
}
}
float
AudioBufferSumOfSquares(const float* aInput, uint32_t aLength)
{
float sum = 0.0f;
+
+#ifdef USE_SSE2
+  if (mozilla::supports_sse()) {
+    const float* alignedInput = ALIGNED16(aInput);
+    // Scalar-sum the unaligned head, consuming aLength so we never run past
+    // the end of a buffer that is shorter than the distance to alignment.
+    while (aLength && aInput != alignedInput) {
+      sum += *aInput * *aInput;
+      ++aInput;
+      --aLength;
+    }
+
+    // Vectorize whole groups of 16 floats; the scalar loop below takes the
+    // tail. alignedInput == aInput here unless the buffer is already spent.
+    const uint32_t vLength = aLength & ~15u;
+    sum += AudioBufferSumOfSquares_SSE(alignedInput, vLength);
+    aInput += vLength;
+    aLength -= vLength;
+  }
+#endif
+
while (aLength--) {
sum += *aInput * *aInput;
++aInput;
}
return sum;
}
void
--- a/dom/media/webaudio/AudioNodeEngineSSE2.cpp
+++ b/dom/media/webaudio/AudioNodeEngineSSE2.cpp
@@ -218,16 +218,21 @@ void BufferComplexMultiply_SSE(const flo
float* aOutput,
uint32_t aSize)
{
unsigned i;
__m128 in0, in1, in2, in3,
outreal0, outreal1, outreal2, outreal3,
outimag0, outimag1, outimag2, outimag3;
+ ASSERT_ALIGNED16(aInput);
+ ASSERT_ALIGNED16(aScale);
+ ASSERT_ALIGNED16(aOutput);
+ ASSERT_MULTIPLE16(aSize);
+
for (i = 0; i < aSize * 2; i += 16) {
in0 = _mm_load_ps(&aInput[i]);
in1 = _mm_load_ps(&aInput[i + 4]);
in2 = _mm_load_ps(&aInput[i + 8]);
in3 = _mm_load_ps(&aInput[i + 12]);
outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
@@ -268,16 +273,19 @@ void BufferComplexMultiply_SSE(const flo
float
AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength)
{
unsigned i;
__m128 in0, in1, in2, in3,
acc0, acc1, acc2, acc3;
float out[4];
+ ASSERT_ALIGNED16(aInput);
+ ASSERT_MULTIPLE16(aLength);
+
acc0 = _mm_setzero_ps();
acc1 = _mm_setzero_ps();
acc2 = _mm_setzero_ps();
acc3 = _mm_setzero_ps();
for (i = 0; i < aLength; i+=16) {
in0 = _mm_load_ps(&aInput[i]);
in1 = _mm_load_ps(&aInput[i + 4]);