Bug 1397793 - Move to APM - Part 2 - Actual processing. r?pehrsons draft
authorPaul Adenot <paul@paul.cx>
Mon, 04 Dec 2017 13:34:14 +0100
changeset 707564 d2245037e8ee7145af7eef528dcee50817b69d83
parent 707563 7044c2d1695cdf0d6a69b4faa19349e3261ef204
child 707565 3fdab4238aeda44a12383d64941d0470e4c709ad
push id 92158
push user paul@paul.cx
push date Tue, 05 Dec 2017 14:38:23 +0000
reviewers pehrsons
bugs 1397793
milestone 59.0a1
Bug 1397793 - Move to APM - Part 2 - Actual processing. r?pehrsons

This is also long, but simple.

First, we switch to floats everywhere. This lets us work at any rate, is more flexible with channel layouts, and is a stable API (see audio_processing.h in webrtc.org).

Then, 10ms worth of audio (already at the graph rate) is popped from the lock-free queue (fed on the other end by the MSG mixer), and the following is done:
- Downmix to stereo (if needed)
- Deinterleave into planar buffers
- Prepare the input and output configs
- Actually make the API call
- Free the data

Ideally we would use a ring buffer and never free any data, and we would drop the lock-free queue and process the reverse stream synchronously, but this is enough code already.

Then, the actual mic data processing:
- Pop a packet from the packetizer (this gives us 10ms worth of audio; note that we switch from int16_t to float, i.e. we no longer do that conversion).
- Deinterleave into planar buffers
- Prepare the input and output configs
- Allocate a SharedBuffer of the right size
- Process the data with the processing algorithm selected in UpdateSingleSource
- Append to a MediaSegment, and append that to the right MediaStreamTrack of the correct SourceMediaStream (the data is already planar, so no further conversion is needed)

MozReview-Commit-ID: 2IjgHP0GAmw
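For readers following the diff below, here is a minimal, self-contained sketch of the call order the description above refers to, against the webrtc::AudioProcessing float API (StreamConfig, ProcessReverseStream, set_stream_delay_ms, ProcessStream). The helper name Process10msPacket and the std::vector buffers are illustrative assumptions, not part of this patch; the real code uses the packetizer, AlignedFloatBuffer and SharedBuffer, and also handles downmixing and int16-to-float conversion, which this sketch omits.

// Sketch only: process one 10ms packet of far-end (speaker) and near-end (mic)
// audio through the APM, using planar float buffers as the API requires.
#include <vector>
#include "webrtc/modules/audio_processing/include/audio_processing.h"

void Process10msPacket(webrtc::AudioProcessing* apm,
                       const float* interleavedMic,    // 10ms of mic data
                       const float* interleavedFarEnd, // 10ms of speaker data
                       int rate, size_t channels)
{
  const size_t frames = rate / 100; // 10ms worth of frames

  // Deinterleave into planar buffers: the float API wants one pointer per channel.
  std::vector<float> planarMic(frames * channels);
  std::vector<float> planarFar(frames * channels);
  std::vector<float*> micPtrs(channels);
  std::vector<float*> farPtrs(channels);
  for (size_t c = 0; c < channels; ++c) {
    micPtrs[c] = planarMic.data() + c * frames;
    farPtrs[c] = planarFar.data() + c * frames;
    for (size_t i = 0; i < frames; ++i) {
      micPtrs[c][i] = interleavedMic[i * channels + c];
      farPtrs[c][i] = interleavedFarEnd[i * channels + c];
    }
  }

  // Same config for input and output: no resampling or remixing inside the APM.
  webrtc::StreamConfig config(rate, channels, false /* has_keyboard */);

  // 1. Feed the far end (what the speakers played) so the AEC can cancel it.
  //    Passing the same pointers as input and output avoids a copy.
  apm->ProcessReverseStream(farPtrs.data(), config, config, farPtrs.data());

  // 2. Report the render/capture delay; the patch uses 0 for now (bug 1404965).
  apm->set_stream_delay_ms(0);

  // 3. Process the near end (mic) in place; the result is what gets appended
  //    to the SourceMediaStream as planar float audio.
  apm->ProcessStream(micPtrs.data(), config, config, micPtrs.data());
}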
dom/media/webrtc/AudioOutputObserver.h
dom/media/webrtc/MediaEngineWebRTC.h
dom/media/webrtc/MediaEngineWebRTCAudio.cpp
--- a/dom/media/webrtc/AudioOutputObserver.h
+++ b/dom/media/webrtc/AudioOutputObserver.h
@@ -12,23 +12,23 @@
 
 namespace webrtc {
 class SingleRwFifo;
 }
 
 namespace mozilla {
 
 typedef struct FarEndAudioChunk_ {
-  uint16_t mSamples;
+  size_t mSamples;
   bool mOverrun;
-  int16_t mData[1]; // variable-length
+  AudioDataValue mData[1]; // variable-length
 } FarEndAudioChunk;
 
 // This class is used to packetize and send the mixed audio from an MSG, in
-// int16, to the AEC module of WebRTC.org.
+// float, to the AEC module of WebRTC.org.
 class AudioOutputObserver
 {
 public:
   AudioOutputObserver();
 
   NS_INLINE_DECL_THREADSAFE_REFCOUNTING(AudioOutputObserver);
 
   void Clear();
--- a/dom/media/webrtc/MediaEngineWebRTC.h
+++ b/dom/media/webrtc/MediaEngineWebRTC.h
@@ -511,17 +511,17 @@ private:
   RefPtr<AudioOutputObserver> mAudioOutputObserver;
 
   // Note: shared across all microphone sources
   static int sChannelsOpen;
 
   const UniquePtr<webrtc::AudioProcessing> mAudioProcessing;
 
   // accessed from the GraphDriver thread except for deletion
-  nsAutoPtr<AudioPacketizer<AudioDataValue, AudioDataValue>> mPacketizer;
+  nsAutoPtr<AudioPacketizer<AudioDataValue, float>> mPacketizer;
   ScopedCustomReleasePtr<webrtc::VoEExternalMedia> mVoERenderListener;
 
   // mMonitor protects mSources[] and mPrinicpalIds[] access/changes, and
   // transitions of mState from kStarted to kStopped (which are combined with
   // EndTrack()). mSources[] and mPrincipalHandles[] are accessed from webrtc
   // threads.
   Monitor mMonitor;
   nsTArray<RefPtr<SourceMediaStream>> mSources;
@@ -544,17 +544,18 @@ private:
   // because of prefs or constraints. This allows simply copying the audio into
   // the MSG, skipping resampling and the whole webrtc.org code.
   bool mSkipProcessing;
 
   // To only update microphone when needed, we keep track of previous settings.
   MediaEnginePrefs mLastPrefs;
 
   AlignedFloatBuffer mInputBuffer;
-  AlignedFloatBuffer mInputDownmixBuffer;
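+  // Deinterleaved (planar) copy of the current 10ms microphone packet.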
+  AlignedFloatBuffer mDeinterleavedBuffer;
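+  // Interleaved scratch buffer for downmixing the far-end audio to stereo.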
+  AlignedAudioBuffer mInputDownmixBuffer;
 };
 
 class MediaEngineWebRTC : public MediaEngine
 {
   typedef MediaEngine Super;
 public:
   explicit MediaEngineWebRTC(MediaEnginePrefs& aPrefs);
 
--- a/dom/media/webrtc/MediaEngineWebRTCAudio.cpp
+++ b/dom/media/webrtc/MediaEngineWebRTCAudio.cpp
@@ -17,23 +17,16 @@
 #endif
 #include "webrtc/modules/audio_device/opensl/single_rw_fifo.h"
 #include "webrtc/voice_engine/voice_engine_defines.h"
 #include "webrtc/modules/audio_processing/include/audio_processing.h"
 #include "webrtc/common_audio/include/audio_util.h"
 
 using namespace webrtc;
 
-#define CHANNELS 1
-#define ENCODING "L16"
-#define DEFAULT_PORT 5555
-
-#define SAMPLE_RATE(freq) ((freq)*2*8) // bps, 16-bit samples
-#define SAMPLE_LENGTH(freq) (((freq)*10)/1000)
-
 // These are restrictions from the webrtc.org code
 #define MAX_CHANNELS 2
 #define MAX_SAMPLING_FREQ 48000 // Hz - multiple of 100
 
 #define MAX_AEC_FIFO_DEPTH 200 // ms - multiple of 10
 static_assert(!(MAX_AEC_FIFO_DEPTH % 10), "Invalid MAX_AEC_FIFO_DEPTH");
 
 namespace mozilla {
@@ -139,38 +132,38 @@ AudioOutputObserver::InsertFarEnd(const 
     aOverran = false;
   }
   // Rechunk to 10ms.
   // The AnalyzeReverseStream() and WebRtcAec_BufferFarend() functions insist on 10ms
   // samples per call.  Annoying...
   while (aFrames) {
     if (!mSaved) {
       mSaved = (FarEndAudioChunk *) moz_xmalloc(sizeof(FarEndAudioChunk) +
-                                                (mChunkSize * channels - 1)*sizeof(int16_t));
+                                                (mChunkSize * channels - 1)*sizeof(AudioDataValue));
       mSaved->mSamples = mChunkSize;
       mSaved->mOverrun = aOverran;
       aOverran = false;
     }
     uint32_t to_copy = mChunkSize - mSamplesSaved;
     if (to_copy > aFrames) {
       to_copy = aFrames;
     }
 
-    int16_t* dest = &(mSaved->mData[mSamplesSaved * channels]);
+    AudioDataValue* dest = &(mSaved->mData[mSamplesSaved * channels]);
     if (aChannels > MAX_CHANNELS) {
       AudioConverter converter(AudioConfig(aChannels, 0), AudioConfig(channels, 0));
       converter.Process(mDownmixBuffer, aBuffer, to_copy);
       ConvertAudioSamples(mDownmixBuffer.Data(), dest, to_copy * channels);
     } else {
       ConvertAudioSamples(aBuffer, dest, to_copy * channels);
     }
 
 #ifdef LOG_FAREND_INSERTION
     if (fp) {
-      fwrite(&(mSaved->mData[mSamplesSaved * aChannels]), to_copy * aChannels, sizeof(int16_t), fp);
+      fwrite(&(mSaved->mData[mSamplesSaved * channels]), to_copy * channels, sizeof(AudioDataValue), fp);
     }
 #endif
     aFrames -= to_copy;
     mSamplesSaved += to_copy;
     aBuffer += to_copy * aChannels;
 
     if (mSamplesSaved >= mChunkSize) {
       int free_slots = mPlayoutFifo->capacity() - mPlayoutFifo->size();
@@ -198,17 +191,17 @@ MediaEngineWebRTCMicrophoneSource::Media
   , mAudioInput(aAudioInput)
   , mAudioProcessing(AudioProcessing::Create())
   , mMonitor("WebRTCMic.Monitor")
   , mCapIndex(aIndex)
   , mDelayAgnostic(aDelayAgnostic)
   , mExtendedFilter(aExtendedFilter)
   , mTrackID(TRACK_NONE)
   , mStarted(false)
-  , mSampleFrequency(MediaEngine::DEFAULT_SAMPLE_RATE)
+  , mSampleFrequency(MediaEngine::USE_GRAPH_RATE)
   , mTotalFrames(0)
   , mLastLogFrames(0)
   , mSkipProcessing(false)
   , mInputDownmixBuffer(MAX_SAMPLING_FREQ * MAX_CHANNELS / 100)
 {
   MOZ_ASSERT(aAudioInput);
   mDeviceName.Assign(NS_ConvertUTF8toUTF16(name));
   mDeviceUUID.Assign(uuid);
@@ -449,22 +442,34 @@ MediaEngineWebRTCMicrophoneSource::Updat
   switch (mState) {
     case kReleased:
       MOZ_ASSERT(aHandle);
       if (sChannelsOpen != 0) {
         // Until we fix (or wallpaper) support for multiple mic input
         // (Bug 1238038) fail allocation for a second device
         return NS_ERROR_FAILURE;
       }
+      if (mAudioInput->SetRecordingDevice(mCapIndex)) {
+        return NS_ERROR_FAILURE;
+      }
       mAudioInput->SetUserChannelCount(prefs.mChannels);
       if (!AllocChannel()) {
         FreeChannel();
         LOG(("Audio device is not initalized"));
         return NS_ERROR_FAILURE;
       }
+      LOG(("Audio device %d allocated", mCapIndex));
+      {
+        // Update with the actual applied channelCount in order
+        // to store it in settings.
+        uint32_t channelCount = 0;
+        mAudioInput->GetChannelCount(channelCount);
+        MOZ_ASSERT(channelCount > 0);
+        prefs.mChannels = channelCount;
+      }
       break;
 
     case kStarted:
       if (prefs == mLastPrefs) {
         return NS_OK;
       }
 
       if (prefs.mChannels != mLastPrefs.mChannels) {
@@ -671,64 +676,222 @@ MediaEngineWebRTCMicrophoneSource::Notif
                                                     uint32_t aChannels)
 {
   if (mAudioOutputObserver) {
     mAudioOutputObserver->InsertFarEnd(aBuffer, aFrames, false,
                                   aRate, aChannels);
   }
 }
 
+// Only called if we're not in passthrough mode
 void
 MediaEngineWebRTCMicrophoneSource::PacketizeAndProcess(MediaStreamGraph* aGraph,
                                                        const AudioDataValue* aBuffer,
                                                        size_t aFrames,
                                                        TrackRate aRate,
                                                        uint32_t aChannels)
 {
   MOZ_ASSERT(!PassThrough(), "This should be bypassed when in PassThrough mode.");
   size_t offset = 0;
 
   if (!mPacketizer ||
       mPacketizer->PacketSize() != aRate/100u ||
       mPacketizer->Channels() != aChannels) {
     // It's ok to drop the audio still in the packetizer here.
     mPacketizer =
-      new AudioPacketizer<AudioDataValue, AudioDataValue>(aRate/100, aChannels);
+      new AudioPacketizer<AudioDataValue, float>(aRate/100, aChannels);
   }
 
   // On initial capture, throw away all far-end data except the most recent sample
   // since it's already irrelevant and we want to keep avoid confusing the AEC far-end
   // input code with "old" audio.
   if (!mStarted) {
     mStarted  = true;
     while (mAudioOutputObserver->Size() > 1) {
       free(mAudioOutputObserver->Pop()); // only call if size() > 0
     }
   }
 
+  // Feed the far-end audio data (speakers) to the feedback input of the AEC.
+  while (mAudioOutputObserver->Size() > 0) {
+    // Bug 1414837: This will call `free()`, and we should remove it.
+    // Pop gives ownership.
+    UniquePtr<FarEndAudioChunk> buffer(mAudioOutputObserver->Pop()); // only call if size() > 0
+    if (!buffer) {
+      continue;
+    }
+    AudioDataValue* packetDataPointer = buffer->mData;
+    AutoTArray<AudioDataValue*, MAX_CHANNELS> deinterleavedPacketDataChannelPointers;
+    AudioDataValue* interleavedFarend = nullptr;
+    uint32_t channelCountFarend = 0;
+    uint32_t framesPerPacketFarend = 0;
+
+    // Downmix the far-end (playout) audio to MAX_CHANNELS if needed
+    if (mAudioOutputObserver->PlayoutChannels() > MAX_CHANNELS) {
+      AudioConverter converter(AudioConfig(mAudioOutputObserver->PlayoutChannels(), 0, AudioConfig::FORMAT_DEFAULT),
+                               AudioConfig(MAX_CHANNELS, 0, AudioConfig::FORMAT_DEFAULT));
+      framesPerPacketFarend = buffer->mSamples;
+      framesPerPacketFarend =
+        converter.Process(mInputDownmixBuffer, packetDataPointer,
+                          framesPerPacketFarend);
+      interleavedFarend = mInputDownmixBuffer.Data();
+      channelCountFarend = MAX_CHANNELS;
+      deinterleavedPacketDataChannelPointers.SetLength(MAX_CHANNELS);
+    } else {
+      uint32_t outputChannels = mAudioOutputObserver->PlayoutChannels();
+      interleavedFarend = packetDataPointer;
+      channelCountFarend = outputChannels;
+      framesPerPacketFarend = buffer->mSamples;
+      deinterleavedPacketDataChannelPointers.SetLength(outputChannels);
+    }
+
+    MOZ_ASSERT(interleavedFarend &&
+               (channelCountFarend == 1 || channelCountFarend == 2) &&
+               framesPerPacketFarend);
+
+    offset = 0;
+    for (size_t i = 0; i < deinterleavedPacketDataChannelPointers.Length(); ++i) {
+      deinterleavedPacketDataChannelPointers[i] = packetDataPointer + offset;
+      offset += framesPerPacketFarend;
+    }
+
+    // deinterleave back into the FarEndAudioChunk buffer to save an alloc.
+    // There is enough room because either there is the same number of
+    // channels/frames or we've just downmixed.
+    Deinterleave(interleavedFarend,
+                 framesPerPacketFarend,
+                 channelCountFarend,
+                 deinterleavedPacketDataChannelPointers.Elements());
+
+    // Having the same config for input and output means we potentially save
+    // some CPU. We don't need the output here, but the API forces us to pass a
+    // valid pointer with enough space.
+    StreamConfig inputConfig(mAudioOutputObserver->PlayoutFrequency(),
+                             channelCountFarend,
+                             false /* we don't use typing detection */);
+    StreamConfig outputConfig = inputConfig;
+
+    // Prepare an array of channel pointers, with enough storage for the
+    // frames.
+    //
+    // If this is a platform that uses int16_t for audio input and output,
+    // convert to floats first; the APM API we use only accepts floats.
+
+    float* inputData = nullptr;
+#ifdef MOZ_SAMPLE_TYPE_S16
+    // Convert to floats, use mInputBuffer for this.
+    size_t sampleCount = framesPerPacketFarend * channelCountFarend;
+    if (mInputBuffer.Length() < sampleCount) {
+      mInputBuffer.SetLength(sampleCount);
+    }
+    ConvertAudioSamples(buffer->mData, mInputBuffer.Data(), sampleCount);
+    inputData = mInputBuffer.Data();
+#else // MOZ_SAMPLE_TYPE_F32
+    inputData = buffer->mData;
+#endif
+
+    AutoTArray<float*, MAX_CHANNELS> channelsPointers;
+    channelsPointers.SetLength(channelCountFarend);
+    offset = 0;
+    for (size_t i = 0; i < channelsPointers.Length(); ++i) {
+      channelsPointers[i] = inputData + offset;
+      offset += framesPerPacketFarend;
+    }
+
+    // Passing the same pointers here saves a copy inside this function.
+    int err =
+      mAudioProcessing->ProcessReverseStream(channelsPointers.Elements(),
+                                             inputConfig,
+                                             outputConfig,
+                                             channelsPointers.Elements());
+
+    if (err) {
+      MOZ_LOG(GetMediaManagerLog(), LogLevel::Error,
+          ("error in audio ProcessReverseStream(): %d", err));
+      return;
+    }
+  }
+
+  // Packetize our input data into 10ms chunks, deinterleave into planar channel
+  // buffers, process, and append to the right MediaStreamTrack.
   mPacketizer->Input(aBuffer, static_cast<uint32_t>(aFrames));
 
   while (mPacketizer->PacketsAvailable()) {
     uint32_t samplesPerPacket = mPacketizer->PacketSize() *
       mPacketizer->Channels();
     if (mInputBuffer.Length() < samplesPerPacket) {
       mInputBuffer.SetLength(samplesPerPacket);
+      mDeinterleavedBuffer.SetLength(samplesPerPacket);
     }
-    int16_t* packet = mInputBuffer.Elements();
+    float* packet = mInputBuffer.Data();
     mPacketizer->Output(packet);
 
-    if (aChannels > MAX_CHANNELS) {
-      AudioConverter converter(AudioConfig(aChannels, 0, AudioConfig::FORMAT_S16),
-                               AudioConfig(MAX_CHANNELS, 0, AudioConfig::FORMAT_S16));
-      converter.Process(mInputDownmixBuffer, packet, mPacketizer->PacketSize());
-      mVoERender->ExternalRecordingInsertData(mInputDownmixBuffer.Data(),
-                                              mPacketizer->PacketSize() * MAX_CHANNELS,
-                                              aRate, 0);
-    } else {
-      mVoERender->ExternalRecordingInsertData(packet, samplesPerPacket, aRate, 0);
+    // Deinterleave the input data: prepare an array of pointers to the planar
+    // channel buffers in mDeinterleavedBuffer.
+    AutoTArray<float*, 8> deinterleavedPacketizedInputDataChannelPointers;
+    deinterleavedPacketizedInputDataChannelPointers.SetLength(aChannels);
+    offset = 0;
+    for (size_t i = 0; i < deinterleavedPacketizedInputDataChannelPointers.Length(); ++i) {
+      deinterleavedPacketizedInputDataChannelPointers[i] = mDeinterleavedBuffer.Data() + offset;
+      offset += mPacketizer->PacketSize();
+    }
+
+    // Deinterleave the packet into mDeinterleavedBuffer, via the channel
+    // pointers prepared above.
+    Deinterleave(packet, mPacketizer->PacketSize(), aChannels,
+        deinterleavedPacketizedInputDataChannelPointers.Elements());
+
+    StreamConfig inputConfig(aRate,
+                             aChannels,
+                             false /* we don't use typing detection */);
+    StreamConfig outputConfig = inputConfig;
+
+    // Bug 1404965: Get the right delay here, it saves some work down the line.
+    mAudioProcessing->set_stream_delay_ms(0);
+
+    // Bug 1414837: find a way to not allocate here.
+    RefPtr<SharedBuffer> buffer =
+      SharedBuffer::Create(mPacketizer->PacketSize() * aChannels * sizeof(float));
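+    // The buffer is planar: channel i occupies PacketSize() consecutive floats
+    // starting at offset i * PacketSize().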
+    AudioSegment segment;
+
+    // Prepare channel pointers to the SharedBuffer created above.
+    AutoTArray<float*, 8> processedOutputChannelPointers;
+    AutoTArray<const float*, 8> processedOutputChannelPointersConst;
+    processedOutputChannelPointers.SetLength(aChannels);
+    processedOutputChannelPointersConst.SetLength(aChannels);
+
+    offset = 0;
+    for (size_t i = 0; i < processedOutputChannelPointers.Length(); ++i) {
+      processedOutputChannelPointers[i] = static_cast<float*>(buffer->Data()) + offset;
+      processedOutputChannelPointersConst[i] = static_cast<float*>(buffer->Data()) + offset;
+      offset += mPacketizer->PacketSize();
+    }
+
+    mAudioProcessing->ProcessStream(deinterleavedPacketizedInputDataChannelPointers.Elements(),
+                                    inputConfig,
+                                    outputConfig,
+                                    processedOutputChannelPointers.Elements());
+    MonitorAutoLock lock(mMonitor);
+    if (mState != kStarted) {
+      return;
+    }
+
+    for (size_t i = 0; i < mSources.Length(); ++i) {
+      if (!mSources[i]) { // why ?!
+        continue;
+      }
+
+      // We already have planar audio data of the right format. Insert into the
+      // MSG.
+      MOZ_ASSERT(processedOutputChannelPointers.Length() == aChannels);
+      // Take a per-source reference: AppendFrames() consumes it and there can
+      // be more than one source to append to.
+      RefPtr<SharedBuffer> other = buffer;
+      segment.AppendFrames(other.forget(),
+                           processedOutputChannelPointersConst,
+                           mPacketizer->PacketSize(),
+                           mPrincipalHandles[i]);
+      mSources[i]->AppendToTrack(mTrackID, &segment);
     }
   }
 }
 
 template<typename T>
 void
 MediaEngineWebRTCMicrophoneSource::InsertInGraph(const T* aBuffer,
                                                  size_t aFrames,
@@ -745,33 +908,33 @@ MediaEngineWebRTCMicrophoneSource::Inser
       MOZ_LOG(AudioLogModule(), LogLevel::Debug,
               ("%p: Inserting %zu samples into graph, total frames = %" PRIu64,
                (void*)this, aFrames, mTotalFrames));
       mLastLogFrames = mTotalFrames;
     }
   }
 
   size_t len = mSources.Length();
-  for (size_t i = 0; i < len; i++) {
+  for (size_t i = 0; i < len; ++i) {
     if (!mSources[i]) {
       continue;
     }
 
     TimeStamp insertTime;
     // Make sure we include the stream and the track.
     // The 0:1 is a flag to note when we've done the final insert for a given input block.
     LogTime(AsyncLatencyLogger::AudioTrackInsertion,
             LATENCY_STREAM_ID(mSources[i].get(), mTrackID),
             (i+1 < len) ? 0 : 1, insertTime);
 
     // Bug 971528 - Support stereo capture in gUM
     MOZ_ASSERT(aChannels >= 1 && aChannels <= 8,
                "Support up to 8 channels");
 
-    nsAutoPtr<AudioSegment> segment(new AudioSegment());
+    AudioSegment segment;
     RefPtr<SharedBuffer> buffer =
       SharedBuffer::Create(aFrames * aChannels * sizeof(T));
     AutoTArray<const T*, 8> channels;
     if (aChannels == 1) {
       PodCopy(static_cast<T*>(buffer->Data()), aBuffer, aFrames);
       channels.AppendElement(static_cast<T*>(buffer->Data()));
     } else {
       channels.SetLength(aChannels);
@@ -787,21 +950,21 @@ MediaEngineWebRTCMicrophoneSource::Inser
 
       DeinterleaveAndConvertBuffer(aBuffer,
                                    aFrames,
                                    aChannels,
                                    write_channels.Elements());
     }
 
     MOZ_ASSERT(aChannels == channels.Length());
-    segment->AppendFrames(buffer.forget(), channels, aFrames,
+    segment.AppendFrames(buffer.forget(), channels, aFrames,
                          mPrincipalHandles[i]);
-    segment->GetStartTime(insertTime);
+    segment.GetStartTime(insertTime);
 
-    mSources[i]->AppendToTrack(mTrackID, segment);
+    mSources[i]->AppendToTrack(mTrackID, &segment);
   }
 }
 
 // Called back on GraphDriver thread!
 // Note this can be called back after ::Shutdown()
 void
 MediaEngineWebRTCMicrophoneSource::NotifyInputData(MediaStreamGraph* aGraph,
                                                    const AudioDataValue* aBuffer,