--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -207,25 +207,25 @@ static void bilinear_filter(const uint8_
_mm_storeu_si128((__m128i *)&dst[j], res);
}
dst += w;
}
}
}
-static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0,
- const __m128i a1, const __m128i b1,
- const __m128i filter) {
- __m128i v0 = _mm_unpacklo_epi8(a0, b0);
- v0 = _mm_maddubs_epi16(v0, filter);
+static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0,
+ const __m128i *a1, const __m128i *b1,
+ const __m128i *filter) {
+ __m128i v0 = _mm_unpacklo_epi8(*a0, *b0);
+ v0 = _mm_maddubs_epi16(v0, *filter);
v0 = xx_roundn_epu16(v0, FILTER_BITS);
- __m128i v1 = _mm_unpacklo_epi8(a1, b1);
- v1 = _mm_maddubs_epi16(v1, filter);
+ __m128i v1 = _mm_unpacklo_epi8(*a1, *b1);
+ v1 = _mm_maddubs_epi16(v1, *filter);
v1 = xx_roundn_epu16(v1, FILTER_BITS);
return _mm_packus_epi16(v0, v1);
}
static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
int yoffset, uint8_t *dst, int h) {
int i;
@@ -251,17 +251,17 @@ static void bilinear_filter8xh(const uin
uint8_t *b = dst;
const uint8_t *hfilter = bilinear_filters_2t[xoffset];
const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
for (i = 0; i < h; i += 2) {
const __m128i x0 = _mm_loadu_si128((__m128i *)src);
const __m128i z0 = _mm_srli_si128(x0, 1);
const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
const __m128i z1 = _mm_srli_si128(x1, 1);
- const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
+ const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
_mm_storeu_si128((__m128i *)b, res);
src += src_stride * 2;
b += 16;
}
// Handle i = h separately
const __m128i x0 = _mm_loadu_si128((__m128i *)src);
const __m128i z0 = _mm_srli_si128(x0, 1);
@@ -285,17 +285,17 @@ static void bilinear_filter8xh(const uin
}
} else {
const uint8_t *vfilter = bilinear_filters_2t[yoffset];
const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
for (i = 0; i < h; i += 2) {
const __m128i x = _mm_loadl_epi64((__m128i *)dst);
const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
- const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec);
+ const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
_mm_storeu_si128((__m128i *)dst, res);
dst += 16;
}
}
}
static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
@@ -332,17 +332,17 @@ static void bilinear_filter4xh(const uin
const __m128i z2 = _mm_srli_si128(x2, 1);
const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
const __m128i z3 = _mm_srli_si128(x3, 1);
const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
- const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec);
+ const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec);
_mm_storeu_si128((__m128i *)b, res);
src += src_stride * 4;
b += 16;
}
// Handle i = h separately
const __m128i x = _mm_loadl_epi64((__m128i *)src);
const __m128i z = _mm_srli_si128(x, 1);
@@ -373,47 +373,47 @@ static void bilinear_filter4xh(const uin
const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
const __m128i a0 = _mm_unpacklo_epi32(a, b);
const __m128i b0 = _mm_unpacklo_epi32(b, c);
const __m128i a1 = _mm_unpacklo_epi32(c, d);
const __m128i b1 = _mm_unpacklo_epi32(d, e);
- const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec);
+ const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec);
_mm_storeu_si128((__m128i *)dst, res);
dst += 16;
}
}
}
-static INLINE void accumulate_block(const __m128i src, const __m128i a,
- const __m128i b, const __m128i m,
+static INLINE void accumulate_block(const __m128i *src, const __m128i *a,
+ const __m128i *b, const __m128i *m,
__m128i *sum, __m128i *sum_sq) {
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+ const __m128i m_inv = _mm_sub_epi8(mask_max, *m);
// Calculate 16 predicted pixels.
// Note that the maximum value of any entry of 'pred_l' or 'pred_r'
// is 64 * 255, so we have plenty of space to add rounding constants.
- const __m128i data_l = _mm_unpacklo_epi8(a, b);
- const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+ const __m128i data_l = _mm_unpacklo_epi8(*a, *b);
+ const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv);
__m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
- const __m128i data_r = _mm_unpackhi_epi8(a, b);
- const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+ const __m128i data_r = _mm_unpackhi_epi8(*a, *b);
+ const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv);
__m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
- const __m128i src_l = _mm_unpacklo_epi8(src, zero);
- const __m128i src_r = _mm_unpackhi_epi8(src, zero);
+ const __m128i src_l = _mm_unpacklo_epi8(*src, zero);
+ const __m128i src_r = _mm_unpackhi_epi8(*src, zero);
const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
// Update partial sums and partial sums of squares
*sum =
_mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
*sum_sq =
_mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
@@ -429,17 +429,17 @@ static void masked_variance(const uint8_
__m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
for (y = 0; y < height; y++) {
for (x = 0; x < width; x += 16) {
const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
- accumulate_block(src, a, b, m, &sum, &sum_sq);
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
}
src_ptr += src_stride;
a_ptr += a_stride;
b_ptr += b_stride;
m_ptr += m_stride;
}
// Reduce down to a single sum and sum of squares
@@ -460,17 +460,17 @@ static void masked_variance8xh(const uin
__m128i src = _mm_unpacklo_epi64(
_mm_loadl_epi64((const __m128i *)src_ptr),
_mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
const __m128i m =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
_mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
- accumulate_block(src, a, b, m, &sum, &sum_sq);
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
src_ptr += src_stride * 2;
a_ptr += 16;
b_ptr += 16;
m_ptr += m_stride * 2;
}
// Reduce down to a single sum and sum of squares
sum = _mm_hadd_epi32(sum, sum_sq);
@@ -492,17 +492,17 @@ static void masked_variance4xh(const uin
_mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
*(uint32_t *)&src_ptr[src_stride * 2],
*(uint32_t *)&src_ptr[src_stride * 3]);
const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
const __m128i m = _mm_setr_epi32(
*(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
*(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
- accumulate_block(src, a, b, m, &sum, &sum_sq);
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
src_ptr += src_stride * 4;
a_ptr += 16;
b_ptr += 16;
m_ptr += m_stride * 4;
}
// Reduce down to a single sum and sum of squares
sum = _mm_hadd_epi32(sum, sum_sq);
@@ -775,27 +775,27 @@ static void highbd_bilinear_filter(const
_mm_storeu_si128((__m128i *)&dst[j], res);
}
dst += w;
}
}
}
-static INLINE __m128i highbd_filter_block_2rows(const __m128i a0,
- const __m128i b0,
- const __m128i a1,
- const __m128i b1,
- const __m128i filter) {
- __m128i v0 = _mm_unpacklo_epi16(a0, b0);
- v0 = _mm_madd_epi16(v0, filter);
+static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0,
+ const __m128i *b0,
+ const __m128i *a1,
+ const __m128i *b1,
+ const __m128i *filter) {
+ __m128i v0 = _mm_unpacklo_epi16(*a0, *b0);
+ v0 = _mm_madd_epi16(v0, *filter);
v0 = xx_roundn_epu32(v0, FILTER_BITS);
- __m128i v1 = _mm_unpacklo_epi16(a1, b1);
- v1 = _mm_madd_epi16(v1, filter);
+ __m128i v1 = _mm_unpacklo_epi16(*a1, *b1);
+ v1 = _mm_madd_epi16(v1, *filter);
v1 = xx_roundn_epu32(v1, FILTER_BITS);
return _mm_packs_epi32(v0, v1);
}
static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
int xoffset, int yoffset, uint16_t *dst,
int h) {
@@ -823,17 +823,17 @@ static void highbd_bilinear_filter4xh(co
const uint8_t *hfilter = bilinear_filters_2t[xoffset];
const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
for (i = 0; i < h; i += 2) {
const __m128i x0 = _mm_loadu_si128((__m128i *)src);
const __m128i z0 = _mm_srli_si128(x0, 2);
const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
const __m128i z1 = _mm_srli_si128(x1, 2);
const __m128i res =
- highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
+ highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
_mm_storeu_si128((__m128i *)b, res);
src += src_stride * 2;
b += 8;
}
// Process i = h separately
__m128i x = _mm_loadu_si128((__m128i *)src);
__m128i z = _mm_srli_si128(x, 2);
@@ -857,17 +857,18 @@ static void highbd_bilinear_filter4xh(co
}
} else {
const uint8_t *vfilter = bilinear_filters_2t[yoffset];
const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
for (i = 0; i < h; i += 2) {
const __m128i x = _mm_loadl_epi64((__m128i *)dst);
const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
- const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec);
+ const __m128i res =
+ highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
_mm_storeu_si128((__m128i *)dst, res);
dst += 8;
}
}
}
static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,