Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qdrawhelper_ssse3.cpp
Go to the documentation of this file.
1// Copyright (C) 2018 The Qt Company Ltd.
2// Copyright (C) 2018 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4// Qt-Security score:significant reason:default
5
6#include <private/qdrawhelper_x86_p.h>
7
8#if defined(QT_COMPILER_SUPPORTS_SSSE3)
9
10#include <private/qdrawingprimitive_sse2_p.h>
11
12QT_BEGIN_NAMESPACE
13
/* The instruction palignr uses immediate (compile-time) arguments, so we have to generate the code
   for the different shifts (4, 8, 12). Checking the alignment inside the loop is unfortunately way
   too slow.
 */
// NOTE: every line of this macro must end in a backslash continuation — it is
// expanded once per palignr immediate (4, 8, 12) in the switch below.
// Blends 4 pixels per iteration: src is read 16-byte-aligned via two loads
// (srcVectorPrevLoaded / srcVectorLastLoaded) realigned with palignr.
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            /* all four source pixels fully opaque: plain copy */ \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            /* not all fully transparent: result = src + dst * (1 - alpha) */ \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded; \
    }
35
36
// Basically blend src over dst assuming a fully opaque constant alpha (the
// const-alpha variant is BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2, used further below).
// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
static inline void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

    // Number of pixels (0..3) by which src trails a 16-byte boundary now that
    // dst is 16-byte aligned; both pointers are 4-byte aligned, so this is the
    // only misalignment possible between them.
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;

    if (!minusOffsetToAlignSrcOn16Bytes) {
        /* src is aligned, usual algorithm but with aligned operations.
           See the SSE2 version for more documentation on the algorithm itself. */
        // Shuffle mask copying each pixel's alpha byte (byte 3 of its lane)
        // into the low byte of both 16-bit words of that lane; high bytes
        // become zero (0xff shuffle index).
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        for (; x < length-3; x += 4) {
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                // All four pixels fully opaque: straight copy.
                _mm_store_si128((__m128i *)&dst[x], srcVector);
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                // Not all fully transparent: result = src + dst * (1 - alpha).
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                __m128i destMultipliedByOneMinusAlpha;
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                _mm_store_si128((__m128i *)&dst[x], result);
            }
            // All-transparent vectors fall through: dst unchanged.
        } /* end for() */
    } else if ((length - x) >= 8) {
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
        // palignr takes a byte offset: 4 bytes per trailing pixel.
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;

        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        // palignr requires an immediate, so dispatch to a dedicated loop per offset.
        switch (palignrOffset) {
        case 4:
            BLENDING_LOOP(4, length)
            break;
        case 8:
            BLENDING_LOOP(8, length)
            break;
        case 12:
            BLENDING_LOOP(12, length)
            break;
        }
    }
    // Scalar epilogue for the remaining (< 4, or < 8 misaligned) pixels.
    for (; x < length; ++x)
        blend_pixel(dst[x], src[x]);
}
101
102void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
103 const uchar *srcPixels, int sbpl,
104 int w, int h,
105 int const_alpha)
106{
107 const quint32 *src = (const quint32 *) srcPixels;
108 quint32 *dst = (quint32 *) destPixels;
109 if (const_alpha == 256) {
110 const __m128i alphaMask = _mm_set1_epi32(0xff000000);
111 const __m128i nullVector = _mm_setzero_si128();
112 const __m128i half = _mm_set1_epi16(0x80);
113 const __m128i one = _mm_set1_epi16(0xff);
114 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
115
116 for (int y = 0; y < h; ++y) {
117 BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
118 dst = (quint32 *)(((uchar *) dst) + dbpl);
119 src = (const quint32 *)(((const uchar *) src) + sbpl);
120 }
121 } else if (const_alpha != 0) {
122 // dest = (s + d * sia) * ca + d * cia
123 // = s * ca + d * (sia * ca + cia)
124 // = s * ca + d * (1 - sa*ca)
125 const_alpha = (const_alpha * 255) >> 8;
126 const __m128i nullVector = _mm_setzero_si128();
127 const __m128i half = _mm_set1_epi16(0x80);
128 const __m128i one = _mm_set1_epi16(0xff);
129 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
130 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
131 for (int y = 0; y < h; ++y) {
132 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
133 dst = (quint32 *)(((uchar *) dst) + dbpl);
134 src = (const quint32 *)(((const uchar *) src) + sbpl);
135 }
136 }
137}
138
139const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count)
140{
141 const quint24 *s = reinterpret_cast<const quint24 *>(src);
142 for (int i = 0; i < count; ++i)
143 buffer[i] = s[index + i];
144 return buffer;
145}
146
147extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len);
148
149const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *, const QSpanData *data,
150 int y, int x, int length)
151{
152 const uchar *line = data->texture.scanLine(y) + x * 3;
153 qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length);
154 return buffer;
155}
156
// Fill `count` 24-bit pixels at `dest` with `color`. The 3-byte pattern
// repeats every 48 bytes relative to 16-byte vectors, so the bulk loop writes
// 48 bytes (16 px) at a time using three pre-shuffled vectors.
void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    __m128i m = _mm_cvtsi32_si128(v);
    quint24 *end = dest + count;

    // 17-byte repeating B,G,R byte pattern: loading at shuffleMask gives the
    // pattern at phase 0 (mval1); loading one byte in gives phase 1 (mval2).
    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };

    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
    // mval3 = bytes 2..15 of mval1 followed by bytes 0..1 of mval2: phase 2.
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);

    // Bulk loop: 48 bytes (16 pixels) per iteration, phases 0, 1, 2 in order.
    for ( ; dest + 16 <= end; dest += 16) {
#ifdef __AVX__
        // Store using 32-byte AVX instruction
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
#else
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
#endif
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
    }

    // Tiny fills (0..2 pixels): the vector stores below would write out of
    // bounds, so store the last one or two pixels directly.
    if (count < 3) {
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;   // always a multiple of 3 (whole pixels)
    if (left >= 24) {
        // 8px/24B or more left
        // ptr is still at pattern phase 0 (the bulk loop advances by 48-byte
        // multiples), so mval1 + low half of mval2 lay down 24 in-phase bytes.
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left

    if (left >= 16) {
        // but more than 5px/15B left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) , mval1);
    } else if (left >= 8) {
        // but more than 2px/6B left
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
        // ptr_end - 8 is at phase (left - 8) mod 3 == 1 (left is a multiple
        // of 3), which is exactly mval2's phase.
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
    }
}
221
// Swap the R and B channels of `count` 24-bit pixels from `src` into `dst`.
// Works in place when dst == src. The vector loop handles 16 pixels (48
// bytes) per iteration across three 16-byte vectors.
void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    // Each mask reverses the whole 3-byte pixels inside its 16-byte vector.
    // Pixels straddling a vector boundary (the bytes marked /*!!*/ are kept
    // in place) are patched up with the std::swap calls after the stores.
    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, /*!!*/15);
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, /*!!*/1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, /*!!*/14, 15);
    const static __m128i shuffleMask3 = _mm_setr_epi8(/*!!*/0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);

    // 16 pixels = 48 bytes per iteration.
    for (; i + 15 < count; i += 16) {
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
        _mm_storeu_si128((__m128i *)dst, s1);
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
        _mm_storeu_si128((__m128i *)(dst + 32), s3);

        // Now fix the last four misplaced values
        // (the R and B bytes of the two pixels crossing vector boundaries).
        std::swap(dst[15], dst[17]);
        std::swap(dst[30], dst[32]);

        src += 48;
        dst += 48;
    }

    // Scalar epilogue for the remaining < 16 pixels; the in-place case only
    // needs one swap per pixel instead of three byte copies.
    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}
264
265QT_END_NAMESPACE
266
267#endif // QT_COMPILER_SUPPORTS_SSSE3