Bug 1369932 - Backport win32 build fix for aom. draft
author Ralph Giles <giles@mozilla.com>
Thu, 22 Jun 2017 17:11:42 -0700
changeset 599330 3e49ea2d21713420e3f84c2265cbbfc0a68a1766
parent 599255 b1b9129838ade91684574f42219b2010928d7db4
child 599331 f8656cef1bfc3be3468d3d69fdc66fbca2868166
child 599334 f98c71e0bc412176bd7f9a8d66b7800975867c69
push id 65484
push user bmo:giles@thaumas.net
push date Fri, 23 Jun 2017 00:22:15 +0000
bugs 1369932
milestone 56.0a1
Bug 1369932 - Backport win32 build fix for aom. Backport patch by David Barker converting the by-value __m128i arguments to pointers. The win32 ABI spills to the stack after the first three arguments, but doesn't guarantee correct alignment for SIMD data on the stack. This is fixed upstream, but I don't want to bump our vendored aom commit just yet since it forces a re-encode of active demo streams. Patch from https://bugs.chromium.org/p/aomedia/issues/detail?id=587 MozReview-Commit-ID: 1pQaU8H1dXO
third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -207,25 +207,25 @@ static void bilinear_filter(const uint8_
         _mm_storeu_si128((__m128i *)&dst[j], res);
       }
 
       dst += w;
     }
   }
 }
 
-static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0,
-                                         const __m128i a1, const __m128i b1,
-                                         const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi8(a0, b0);
-  v0 = _mm_maddubs_epi16(v0, filter);
+static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0,
+                                         const __m128i *a1, const __m128i *b1,
+                                         const __m128i *filter) {
+  __m128i v0 = _mm_unpacklo_epi8(*a0, *b0);
+  v0 = _mm_maddubs_epi16(v0, *filter);
   v0 = xx_roundn_epu16(v0, FILTER_BITS);
 
-  __m128i v1 = _mm_unpacklo_epi8(a1, b1);
-  v1 = _mm_maddubs_epi16(v1, filter);
+  __m128i v1 = _mm_unpacklo_epi8(*a1, *b1);
+  v1 = _mm_maddubs_epi16(v1, *filter);
   v1 = xx_roundn_epu16(v1, FILTER_BITS);
 
   return _mm_packus_epi16(v0, v1);
 }
 
 static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
                                int yoffset, uint8_t *dst, int h) {
   int i;
@@ -251,17 +251,17 @@ static void bilinear_filter8xh(const uin
     uint8_t *b = dst;
     const uint8_t *hfilter = bilinear_filters_2t[xoffset];
     const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
     for (i = 0; i < h; i += 2) {
       const __m128i x0 = _mm_loadu_si128((__m128i *)src);
       const __m128i z0 = _mm_srli_si128(x0, 1);
       const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
       const __m128i z1 = _mm_srli_si128(x1, 1);
-      const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
+      const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
       _mm_storeu_si128((__m128i *)b, res);
 
       src += src_stride * 2;
       b += 16;
     }
     // Handle i = h separately
     const __m128i x0 = _mm_loadu_si128((__m128i *)src);
     const __m128i z0 = _mm_srli_si128(x0, 1);
@@ -285,17 +285,17 @@ static void bilinear_filter8xh(const uin
     }
   } else {
     const uint8_t *vfilter = bilinear_filters_2t[yoffset];
     const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
     for (i = 0; i < h; i += 2) {
       const __m128i x = _mm_loadl_epi64((__m128i *)dst);
       const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
       const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
-      const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec);
+      const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
       _mm_storeu_si128((__m128i *)dst, res);
 
       dst += 16;
     }
   }
 }
 
 static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
@@ -332,17 +332,17 @@ static void bilinear_filter4xh(const uin
       const __m128i z2 = _mm_srli_si128(x2, 1);
       const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
       const __m128i z3 = _mm_srli_si128(x3, 1);
 
       const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
       const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
       const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
       const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
-      const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec);
+      const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec);
       _mm_storeu_si128((__m128i *)b, res);
 
       src += src_stride * 4;
       b += 16;
     }
     // Handle i = h separately
     const __m128i x = _mm_loadl_epi64((__m128i *)src);
     const __m128i z = _mm_srli_si128(x, 1);
@@ -373,47 +373,47 @@ static void bilinear_filter4xh(const uin
       const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
       const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
       const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
 
       const __m128i a0 = _mm_unpacklo_epi32(a, b);
       const __m128i b0 = _mm_unpacklo_epi32(b, c);
       const __m128i a1 = _mm_unpacklo_epi32(c, d);
       const __m128i b1 = _mm_unpacklo_epi32(d, e);
-      const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec);
+      const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec);
       _mm_storeu_si128((__m128i *)dst, res);
 
       dst += 16;
     }
   }
 }
 
-static INLINE void accumulate_block(const __m128i src, const __m128i a,
-                                    const __m128i b, const __m128i m,
+static INLINE void accumulate_block(const __m128i *src, const __m128i *a,
+                                    const __m128i *b, const __m128i *m,
                                     __m128i *sum, __m128i *sum_sq) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i one = _mm_set1_epi16(1);
   const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+  const __m128i m_inv = _mm_sub_epi8(mask_max, *m);
 
   // Calculate 16 predicted pixels.
   // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
   // is 64 * 255, so we have plenty of space to add rounding constants.
-  const __m128i data_l = _mm_unpacklo_epi8(a, b);
-  const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+  const __m128i data_l = _mm_unpacklo_epi8(*a, *b);
+  const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv);
   __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
   pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
 
-  const __m128i data_r = _mm_unpackhi_epi8(a, b);
-  const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+  const __m128i data_r = _mm_unpackhi_epi8(*a, *b);
+  const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv);
   __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
   pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
 
-  const __m128i src_l = _mm_unpacklo_epi8(src, zero);
-  const __m128i src_r = _mm_unpackhi_epi8(src, zero);
+  const __m128i src_l = _mm_unpacklo_epi8(*src, zero);
+  const __m128i src_r = _mm_unpackhi_epi8(*src, zero);
   const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
   const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
 
   // Update partial sums and partial sums of squares
   *sum =
       _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
   *sum_sq =
       _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
@@ -429,17 +429,17 @@ static void masked_variance(const uint8_
   __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
 
   for (y = 0; y < height; y++) {
     for (x = 0; x < width; x += 16) {
       const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
       const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
       const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
       const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
-      accumulate_block(src, a, b, m, &sum, &sum_sq);
+      accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
     }
 
     src_ptr += src_stride;
     a_ptr += a_stride;
     b_ptr += b_stride;
     m_ptr += m_stride;
   }
   // Reduce down to a single sum and sum of squares
@@ -460,17 +460,17 @@ static void masked_variance8xh(const uin
     __m128i src = _mm_unpacklo_epi64(
         _mm_loadl_epi64((const __m128i *)src_ptr),
         _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
     const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
     const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
     const __m128i m =
         _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                            _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
-    accumulate_block(src, a, b, m, &sum, &sum_sq);
+    accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
 
     src_ptr += src_stride * 2;
     a_ptr += 16;
     b_ptr += 16;
     m_ptr += m_stride * 2;
   }
   // Reduce down to a single sum and sum of squares
   sum = _mm_hadd_epi32(sum, sum_sq);
@@ -492,17 +492,17 @@ static void masked_variance4xh(const uin
         _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
                        *(uint32_t *)&src_ptr[src_stride * 2],
                        *(uint32_t *)&src_ptr[src_stride * 3]);
     const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
     const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
     const __m128i m = _mm_setr_epi32(
         *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
         *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
-    accumulate_block(src, a, b, m, &sum, &sum_sq);
+    accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
 
     src_ptr += src_stride * 4;
     a_ptr += 16;
     b_ptr += 16;
     m_ptr += m_stride * 4;
   }
   // Reduce down to a single sum and sum of squares
   sum = _mm_hadd_epi32(sum, sum_sq);
@@ -775,27 +775,27 @@ static void highbd_bilinear_filter(const
         _mm_storeu_si128((__m128i *)&dst[j], res);
       }
 
       dst += w;
     }
   }
 }
 
-static INLINE __m128i highbd_filter_block_2rows(const __m128i a0,
-                                                const __m128i b0,
-                                                const __m128i a1,
-                                                const __m128i b1,
-                                                const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi16(a0, b0);
-  v0 = _mm_madd_epi16(v0, filter);
+static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0,
+                                                const __m128i *b0,
+                                                const __m128i *a1,
+                                                const __m128i *b1,
+                                                const __m128i *filter) {
+  __m128i v0 = _mm_unpacklo_epi16(*a0, *b0);
+  v0 = _mm_madd_epi16(v0, *filter);
   v0 = xx_roundn_epu32(v0, FILTER_BITS);
 
-  __m128i v1 = _mm_unpacklo_epi16(a1, b1);
-  v1 = _mm_madd_epi16(v1, filter);
+  __m128i v1 = _mm_unpacklo_epi16(*a1, *b1);
+  v1 = _mm_madd_epi16(v1, *filter);
   v1 = xx_roundn_epu32(v1, FILTER_BITS);
 
   return _mm_packs_epi32(v0, v1);
 }
 
 static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
                                       int xoffset, int yoffset, uint16_t *dst,
                                       int h) {
@@ -823,17 +823,17 @@ static void highbd_bilinear_filter4xh(co
     const uint8_t *hfilter = bilinear_filters_2t[xoffset];
     const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
     for (i = 0; i < h; i += 2) {
       const __m128i x0 = _mm_loadu_si128((__m128i *)src);
       const __m128i z0 = _mm_srli_si128(x0, 2);
       const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
       const __m128i z1 = _mm_srli_si128(x1, 2);
       const __m128i res =
-          highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
+          highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
       _mm_storeu_si128((__m128i *)b, res);
 
       src += src_stride * 2;
       b += 8;
     }
     // Process i = h separately
     __m128i x = _mm_loadu_si128((__m128i *)src);
     __m128i z = _mm_srli_si128(x, 2);
@@ -857,17 +857,18 @@ static void highbd_bilinear_filter4xh(co
     }
   } else {
     const uint8_t *vfilter = bilinear_filters_2t[yoffset];
     const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
     for (i = 0; i < h; i += 2) {
       const __m128i x = _mm_loadl_epi64((__m128i *)dst);
       const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
       const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
-      const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec);
+      const __m128i res =
+          highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
       _mm_storeu_si128((__m128i *)dst, res);
 
       dst += 8;
     }
   }
 }
 
 static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,