6#include <private/qdrawhelper_x86_p.h>
8#if defined(QT_COMPILER_SUPPORTS_SSSE3)
10#include <private/qdrawingprimitive_sse2_p.h>
15
16
/* Blends 4 ARGB32-premultiplied pixels per iteration:
       result = src + dst * (1 - src.alpha)
   with shortcuts when all four source pixels are fully opaque (plain copy)
   or fully transparent (leave dst untouched).

   palignr takes an immediate shift count, so the code for each possible
   src misalignment (4, 8, 12 bytes) has to be instantiated separately;
   checking the alignment inside the loop would be too slow. Expects x,
   minusOffsetToAlignSrcOn16Bytes, src, dst, srcVectorPrevLoaded and the
   constant vectors (alphaMask, nullVector, one, colorMask, half,
   alphaShuffleMask) to be in scope at the expansion site. */
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }
48static inline void Q_DECL_VECTORCALL
49BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst,
const quint32 *src,
int length,
50 __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
55 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
56 blend_pixel(dst[x], src[x]);
59 const int minusOffsetToAlignSrcOn16Bytes = (
reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;
61 if (!minusOffsetToAlignSrcOn16Bytes) {
63
64 const __m128i alphaShuffleMask = _mm_set_epi8(
char(0xff),15,
char(0xff),15,
char(0xff),11,
char(0xff),11,
char(0xff),7,
char(0xff),7,
char(0xff),3,
char(0xff),3);
65 for (; x < length-3; x += 4) {
66 const __m128i srcVector = _mm_load_si128((
const __m128i *)&src[x]);
67 const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
68 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
69 _mm_store_si128((__m128i *)&dst[x], srcVector);
70 }
else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
71 __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
72 alphaChannel = _mm_sub_epi16(one, alphaChannel);
73 const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
74 __m128i destMultipliedByOneMinusAlpha;
75 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
76 const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
77 _mm_store_si128((__m128i *)&dst[x], result);
80 }
else if ((length - x) >= 8) {
82 __m128i srcVectorPrevLoaded = _mm_load_si128((
const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
83 const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;
85 const __m128i alphaShuffleMask = _mm_set_epi8(
char(0xff),15,
char(0xff),15,
char(0xff),11,
char(0xff),11,
char(0xff),7,
char(0xff),7,
char(0xff),3,
char(0xff),3);
86 switch (palignrOffset) {
88 BLENDING_LOOP(4, length)
91 BLENDING_LOOP(8, length)
94 BLENDING_LOOP(12, length)
98 for (; x < length; ++x)
99 blend_pixel(dst[x], src[x]);
102void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels,
int dbpl,
103 const uchar *srcPixels,
int sbpl,
107 const quint32 *src = (
const quint32 *) srcPixels;
108 quint32 *dst = (quint32 *) destPixels;
109 if (const_alpha == 256) {
110 const __m128i alphaMask = _mm_set1_epi32(0xff000000);
111 const __m128i nullVector = _mm_setzero_si128();
112 const __m128i half = _mm_set1_epi16(0x80);
113 const __m128i one = _mm_set1_epi16(0xff);
114 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
116 for (
int y = 0; y < h; ++y) {
117 BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
118 dst = (quint32 *)(((uchar *) dst) + dbpl);
119 src = (
const quint32 *)(((
const uchar *) src) + sbpl);
121 }
else if (const_alpha != 0) {
125 const_alpha = (const_alpha * 255) >> 8;
126 const __m128i nullVector = _mm_setzero_si128();
127 const __m128i half = _mm_set1_epi16(0x80);
128 const __m128i one = _mm_set1_epi16(0xff);
129 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
130 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
131 for (
int y = 0; y < h; ++y) {
132 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
133 dst = (quint32 *)(((uchar *) dst) + dbpl);
134 src = (
const quint32 *)(((
const uchar *) src) + sbpl);
139const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer,
const uchar *src,
int index,
int count)
141 const quint24 *s =
reinterpret_cast<
const quint24 *>(src);
142 for (
int i = 0; i < count; ++i)
143 buffer[i] = s[index + i];
147extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst,
const uchar *src,
int len);
149const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer,
const Operator *,
const QSpanData *data,
150 int y,
int x,
int length)
152 const uchar *line = data->texture.scanLine(y) + x * 3;
153 qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length);
157void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
161 __m128i m = _mm_cvtsi32_si128(v);
162 quint24 *end = dest + count;
164 constexpr uchar x = 2, y = 1, z = 0;
165 alignas(__m128i)
static const uchar
166 shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };
168 __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(
reinterpret_cast<
const __m128i *>(shuffleMask)));
169 __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(shuffleMask + 1)));
170 __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);
172 for ( ; dest + 16 <= end; dest += 16) {
175 __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
176 mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
177 _mm256_storeu_ps(
reinterpret_cast<
float *>(dest), mval12);
179 _mm_storeu_si128(
reinterpret_cast<__m128i *>(dest) + 0, mval1);
180 _mm_storeu_si128(
reinterpret_cast<__m128i *>(dest) + 1, mval2);
182 _mm_storeu_si128(
reinterpret_cast<__m128i *>(dest) + 2, mval3);
194 uchar *ptr =
reinterpret_cast<uchar *>(dest);
195 uchar *ptr_end =
reinterpret_cast<uchar *>(end);
196 qptrdiff left = ptr_end - ptr;
199 _mm_storeu_si128(
reinterpret_cast<__m128i *>(ptr) + 0, mval1);
200 _mm_storel_epi64(
reinterpret_cast<__m128i *>(ptr) + 1, mval2);
209 _mm_storeu_si128(
reinterpret_cast<__m128i *>(ptr) , mval1);
210 }
else if (left >= 8) {
212 _mm_storel_epi64(
reinterpret_cast<__m128i *>(ptr), mval1);
218 _mm_storel_epi64(
reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
222void QT_FASTCALL rbSwap_888_ssse3(uchar *dst,
const uchar *src,
int count)
226 const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 15);
227 const static __m128i shuffleMask2 = _mm_setr_epi8(0, 1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, 14, 15);
228 const static __m128i shuffleMask3 = _mm_setr_epi8(0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
230 for (; i + 15 < count; i += 16) {
231 __m128i s1 = _mm_loadu_si128((
const __m128i *)src);
232 __m128i s2 = _mm_loadu_si128((
const __m128i *)(src + 16));
233 __m128i s3 = _mm_loadu_si128((
const __m128i *)(src + 32));
234 s1 = _mm_shuffle_epi8(s1, shuffleMask1);
235 s2 = _mm_shuffle_epi8(s2, shuffleMask2);
236 s3 = _mm_shuffle_epi8(s3, shuffleMask3);
237 _mm_storeu_si128((__m128i *)dst, s1);
238 _mm_storeu_si128((__m128i *)(dst + 16), s2);
239 _mm_storeu_si128((__m128i *)(dst + 32), s3);
242 std::swap(dst[15], dst[17]);
243 std::swap(dst[30], dst[32]);
250 SIMD_EPILOGUE(i, count, 15) {
258 SIMD_EPILOGUE(i, count, 15) {
259 std::swap(dst[0], dst[2]);