#include <private/qdrawhelper_x86_p.h>

#if defined(QT_COMPILER_SUPPORTS_SSSE3)

#include <private/qdrawingprimitive_sse2_p.h>

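/* The palignr instruction takes its shift as an immediate operand, so the code
   for the different shifts (4, 8, 12) has to be generated separately; checking
   the source alignment inside the loop would be far too slow.
 */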
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x - minusOffsetToAlignSrcOn16Bytes < length - 7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]); \
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            /* all four pixels fully opaque: plain copy */ \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            /* not fully transparent: result = src + dst * (1 - src_alpha) */ \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded; \
    }
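
// Blend a scanline of ARGB32 (premultiplied) src over dst:
//     result = src + dst * (1 - src_alpha)
// with shortcuts when a whole vector of pixels is fully opaque or fully
// transparent. nullVector, half, one, colorMask and alphaMask are constant
// across the whole image and are set up by the caller below as:
//     nullVector = _mm_setzero_si128()
//     half       = _mm_set1_epi16(0x80)
//     one        = _mm_set1_epi16(0xff)
//     colorMask  = _mm_set1_epi32(0x00ff00ff)
//     alphaMask  = _mm_set1_epi32(0xff000000)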
static inline void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
{
    int x = 0;

    // First, blend pixel-by-pixel until dst is 16-byte aligned.
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

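    // dst is now 16-byte aligned, but src may still be off by 0..3 pixels
    // relative to a 16-byte boundary; compute that misalignment in pixels.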
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;

    if (!minusOffsetToAlignSrcOn16Bytes) {
        // src is aligned as well: use aligned loads on both buffers.
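        // alphaShuffleMask broadcasts each pixel's alpha byte (bytes 3, 7, 11, 15)
        // into both 16-bit lanes of that pixel, zero-extended to 16 bits
        // (the 0xff control bytes zero the upper byte of each lane).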
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                // all four pixels fully opaque: plain copy
                _mm_store_si128((__m128i *)&dst[x], srcVector);
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                // not fully transparent: result = src + dst * (1 - src_alpha)
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                __m128i destMultipliedByOneMinusAlpha;
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                _mm_store_si128((__m128i *)&dst[x], result);
            }
        }
    } else if ((length - x) >= 8) {
        // src is misaligned: keep two aligned loads live (previous and current)
        // and recombine them with palignr at the required byte offset.
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;

        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        switch (palignrOffset) {
        case 4:
            BLENDING_LOOP(4, length)
            break;
        case 8:
            BLENDING_LOOP(8, length)
            break;
        case 12:
            BLENDING_LOOP(12, length)
            break;
        }
    }

    // Blend whatever is left pixel-by-pixel.
    for (; x < length; ++x)
        blend_pixel(dst[x], src[x]);
}
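
// Blend an ARGB32_Premultiplied buffer over another one, scanline by scanline.
// const_alpha is in the 0..256 range, 256 meaning fully opaque.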
void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
                                     const uchar *srcPixels, int sbpl,
                                     int w, int h,
                                     int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
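        // Fold the constant alpha into the source first
        // (sa = source alpha, sia = 1 - sa, ca = const alpha, cia = 1 - ca):
        //     dest = (s + d * sia) * ca + d * cia
        //          = s * ca + d * (sia * ca + cia)
        //          = s * ca + d * (1 - sa * ca)
        // so the per-pixel work stays a single source-over blend.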
        const_alpha = (const_alpha * 255) >> 8;
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}

const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count)
{
    const quint24 *s = reinterpret_cast<const quint24 *>(src);
    for (int i = 0; i < count; ++i)
        buffer[i] = s[index + i];
    return buffer;
}

extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len);

const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *,
                                                         const QSpanData *data,
                                                         int y, int x, int length)
{
    const uchar *line = data->texture.scanLine(y) + x * 3;
    qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length);
    return buffer;
}
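
// Fill a buffer of 24-bit pixels with a single color. The 3-byte pixel and the
// 16-byte vector line up every 48 bytes (16 pixels), so three vector values
// (mval1/mval2/mval3) describe one full period of the repeating byte pattern.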
void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
{
    quint32 v = color;
    __m128i m = _mm_cvtsi32_si128(v);
    quint24 *end = dest + count;

    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };

    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);
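    // mval1, mval2 and mval3 hold bytes 0-15, 16-31 and 32-47 of the infinitely
    // repeating 3-byte pattern of 'color' (shuffleMask and shuffleMask + 1 give
    // phases 0 and 1; phase 2 comes from palignr), so storing them back-to-back
    // fills exactly 16 pixels.
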
    for ( ; dest + 16 <= end; dest += 16) {
#ifdef __AVX__
        // Store the first 32 bytes with a single AVX store.
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
#else
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
#endif
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
    }

    if (count < 3) {
        // Too small for the 8-byte tail store below; fill the last pixels directly.
        if (count > 1)
            end[-2] = color;
        if (count)
            end[-1] = color;
        return;
    }
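
    // Fewer than 16 pixels (48 bytes) left: finish with 24-, 16- or 8-byte
    // stores, plus one 8-byte store that ends exactly at the right edge (it may
    // overlap bytes already written with the same values, which is harmless).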
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;
    if (left >= 24) {
        // 8 px (24 bytes) or more left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
        ptr += 24;
        left -= 24;
    }

    // fewer than 8 px (24 bytes) left
    if (left >= 16) {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), mval1);
    } else if (left >= 8) {
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
    }

    if (left) {
        // store 8 bytes ending exactly at the right edge
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
    }
}
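
// Swap the R and B channels of 24-bit pixel data (RGB888 <-> BGR888),
// 16 pixels (48 bytes) per iteration. dst and src may be the same buffer.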
void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 15);
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, 1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, 14, 15);
    const static __m128i shuffleMask3 = _mm_setr_epi8(0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
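    // Each mask reverses the 3-byte groups that fall entirely inside its
    // 16-byte vector; the two groups that straddle a vector boundary
    // (bytes 15-17 and 30-32 of each 48-byte block) keep their outer bytes in
    // place and are fixed up with the two std::swap calls after the stores.
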
    for (; i + 15 < count; i += 16) {
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
        _mm_storeu_si128((__m128i *)dst, s1);
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
        _mm_storeu_si128((__m128i *)(dst + 32), s3);

        // fix the two groups that straddle the vector boundaries
        std::swap(dst[15], dst[17]);
        std::swap(dst[30], dst[32]);

        src += 48;
        dst += 48;
    }
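
    // Handle the remaining (up to 15) pixels one at a time; the in-place case
    // (src == dst) can use std::swap directly.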
    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}

#endif // QT_COMPILER_SUPPORTS_SSSE3