Qt
Internal/contributor documentation for the Qt SDK. Note: this is not the official API documentation, which is at https://doc.qt.io/
Loading...
Searching...
No Matches
qdrawhelper_sse4.cpp
Go to the documentation of this file.
1// Copyright (C) 2016 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3// Qt-Security score:significant reason:default
4
5#include <private/qdrawhelper_p.h>
6#include <private/qdrawingprimitive_sse2_p.h>
7#include <private/qpaintengine_raster_p.h>
8#include <private/qpixellayout_p.h>
9
10#if defined(QT_COMPILER_SUPPORTS_SSE4_1)
11
12QT_BEGIN_NAMESPACE
13
14#ifndef __haswell__
// Convert ARGB32 (or RGBA8888 when RGBA == true) pixels to premultiplied
// ARGB32, four pixels per iteration. 'buffer' may alias 'src' (the opaque
// fast path explicitly checks for in-place operation).
template<bool RGBA>
static void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    // Byte shuffle swapping R and B in every 32-bit pixel (RGBA <-> ARGB).
    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    // Replicates each pixel's 16-bit alpha lane across its four 16-bit lanes.
    const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15);
    const __m128i half = _mm_set1_epi16(0x0080);
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
        if (!_mm_testz_si128(srcVector, alphaMask)) {      // at least one pixel has alpha != 0
            if (!_mm_testc_si128(srcVector, alphaMask)) {  // at least one pixel has alpha != 255
                if (RGBA)
                    srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
                // Widen 8-bit channels to 16 bits, two pixels per register.
                __m128i src1 = _mm_unpacklo_epi8(srcVector, zero);
                __m128i src2 = _mm_unpackhi_epi8(srcVector, zero);
                __m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
                __m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
                src1 = _mm_mullo_epi16(src1, alpha1);
                src2 = _mm_mullo_epi16(src2, alpha2);
                // Divide channel*alpha by 255 with rounding:
                // t = x + (x >> 8); result = (t + 0x80) >> 8.
                src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8));
                src2 = _mm_add_epi16(src2, _mm_srli_epi16(src2, 8));
                src1 = _mm_add_epi16(src1, half);
                src2 = _mm_add_epi16(src2, half);
                src1 = _mm_srli_epi16(src1, 8);
                src2 = _mm_srli_epi16(src2, 8);
                // Restore the original alpha into lanes 3 and 7 (blend mask 0x88).
                src1 = _mm_blend_epi16(src1, alpha1, 0x88);
                src2 = _mm_blend_epi16(src2, alpha2, 0x88);
                srcVector = _mm_packus_epi16(src1, src2);
                _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
            } else {
                // All four pixels opaque: premultiplied form equals the source,
                // only the byte order may need fixing.
                if (RGBA)
                    _mm_storeu_si128((__m128i *)&buffer[i], _mm_shuffle_epi8(srcVector, rgbaMask));
                else if (buffer != src)
                    _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
            }
        } else {
            // All four pixels fully transparent: premultiplied value is 0.
            _mm_storeu_si128((__m128i *)&buffer[i], zero);
        }
    }

    // Scalar tail for the remaining (count % 4) pixels.
    SIMD_EPILOGUE(i, count, 3) {
        uint v = qPremultiply(src[i]);
        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
    }
}
63
// Convert ARGB32 (or RGBA8888 when RGBA == true) to premultiplied RGBA64,
// four pixels per iteration. 8-bit channels are widened to 16 bits by byte
// replication (x -> x * 257), which maps 0xff to 0xffff exactly.
template<bool RGBA>
static void convertARGBToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    // Byte shuffle swapping R and B in every 32-bit pixel.
    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    // Replicates each pixel's 16-bit alpha lane across its four 16-bit lanes.
    const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15);
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
        if (!_mm_testz_si128(srcVector, alphaMask)) {        // not all transparent
            bool cf = _mm_testc_si128(srcVector, alphaMask); // true if all four opaque

            // The 64-bit output is RGBA-ordered, so ARGB input is the one
            // that needs the swizzle here (inverse of the 32-bit paths).
            if (!RGBA)
                srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
            // Unpack with itself: widens 8 -> 16 bit by byte replication.
            const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
            const __m128i src2 = _mm_unpackhi_epi8(srcVector, srcVector);
            if (!cf) {
                __m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
                __m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
                // (x * a) >> 16 per 16-bit channel.
                __m128i dst1 = _mm_mulhi_epu16(src1, alpha1);
                __m128i dst2 = _mm_mulhi_epu16(src2, alpha2);
                // Map 0->0xfffe to 0->0xffff
                dst1 = _mm_add_epi16(dst1, _mm_srli_epi16(dst1, 15));
                dst2 = _mm_add_epi16(dst2, _mm_srli_epi16(dst2, 15));
                // correct alpha value: take it back from the widened source.
                dst1 = _mm_blend_epi16(dst1, src1, 0x88);
                dst2 = _mm_blend_epi16(dst2, src2, 0x88);
                _mm_storeu_si128((__m128i *)&buffer[i], dst1);
                _mm_storeu_si128((__m128i *)&buffer[i + 2], dst2);
            } else {
                // All opaque: widened values are already premultiplied.
                _mm_storeu_si128((__m128i *)&buffer[i], src1);
                _mm_storeu_si128((__m128i *)&buffer[i + 2], src2);
            }
        } else {
            // All transparent: premultiplied value is 0.
            _mm_storeu_si128((__m128i *)&buffer[i], zero);
            _mm_storeu_si128((__m128i *)&buffer[i + 2], zero);
        }
    }

    // Scalar tail for the remaining (count % 4) pixels.
    SIMD_EPILOGUE(i, count, 3) {
        const uint s = RGBA ? RGBA2ARGB(src[i]) : src[i];
        buffer[i] = QRgba64::fromArgb32(s).premultiplied();
    }
}
110#endif // __haswell__
111
112static inline __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(__m128 a, float mul)
113{
114 __m128 ia = _mm_rcp_ps(a); // Approximate 1/a
115 // Improve precision of ia using Newton-Raphson
116 ia = _mm_sub_ps(_mm_add_ps(ia, ia), _mm_mul_ps(ia, _mm_mul_ps(ia, a)));
117 ia = _mm_mul_ps(ia, _mm_set1_ps(mul));
118 return ia;
119}
120
// Convert premultiplied ARGB32 back to unpremultiplied ARGB32 (or RGBA8888
// when RGBA == true). RGBx forces the alpha byte to 0xff for opaque output
// formats. 'buffer' may alias 'src'.
template<bool RGBA, bool RGBx>
static inline void convertARGBFromARGB32PM_sse4(uint *buffer, const uint *src, int count)
{
    int i = 0;
    // The vector path's reciprocal can raise the invalid-operation FP flag
    // (e.g. for a zero alpha); if the application has unmasked that
    // exception, take the scalar integer path to avoid trapping.
    if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
        for (; i < count; ++i) {
            uint v = qUnpremultiply(src[i]);
            if (RGBx)
                v = 0xff000000 | v;
            if (RGBA)
                v = ARGB2RGBA(v);
            buffer[i] = v;
        }
        return;
    }
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    // Byte shuffle swapping R and B in every 32-bit pixel.
    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
        if (!_mm_testz_si128(srcVector, alphaMask)) {      // not all transparent
            if (!_mm_testc_si128(srcVector, alphaMask)) {  // not all opaque
                __m128i srcVectorAlpha = _mm_srli_epi32(srcVector, 24); // per-pixel alpha
                if (RGBA)
                    srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
                const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
                const __m128 ia = reciprocal_mul_ps(a, 255.0f); // ~255/a per pixel
                // Widen so src1..src4 hold pixels 0..3 as four 32-bit channels each.
                __m128i src1 = _mm_unpacklo_epi8(srcVector, zero);
                __m128i src3 = _mm_unpackhi_epi8(srcVector, zero);
                __m128i src2 = _mm_unpackhi_epi16(src1, zero);
                __m128i src4 = _mm_unpackhi_epi16(src3, zero);
                src1 = _mm_unpacklo_epi16(src1, zero);
                src3 = _mm_unpacklo_epi16(src3, zero);
                // Broadcast each pixel's 255/alpha factor to all four lanes.
                __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
                __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
                __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
                __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
                // channel * 255 / alpha, rounded by the float->int conversion.
                src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
                src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
                src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
                src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
                src1 = _mm_packus_epi32(src1, src2);
                src3 = _mm_packus_epi32(src3, src4);
                src1 = _mm_packus_epi16(src1, src3);
                // Handle potential alpha == 0 values: force those pixels to 0.
                __m128i srcVectorAlphaMask = _mm_cmpeq_epi32(srcVectorAlpha, zero);
                src1 = _mm_andnot_si128(srcVectorAlphaMask, src1);
                // Fixup alpha values: force opaque for RGBx, else restore source alpha.
                if (RGBx)
                    srcVector = _mm_or_si128(src1, alphaMask);
                else
                    srcVector = _mm_blendv_epi8(src1, srcVector, alphaMask);
                _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
            } else {
                // All opaque: premultiplied equals unpremultiplied.
                if (RGBA)
                    _mm_storeu_si128((__m128i *)&buffer[i], _mm_shuffle_epi8(srcVector, rgbaMask));
                else if (buffer != src)
                    _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
            }
        } else {
            // All transparent: zero, or opaque black for RGBx.
            if (RGBx)
                _mm_storeu_si128((__m128i *)&buffer[i], alphaMask);
            else
                _mm_storeu_si128((__m128i *)&buffer[i], zero);
        }
    }

    // Scalar tail for the remaining (count % 4) pixels.
    SIMD_EPILOGUE(i, count, 3) {
        uint v = qUnpremultiply_sse4(src[i]);
        if (RGBx)
            v = 0xff000000 | v;
        if (RGBA)
            v = ARGB2RGBA(v);
        buffer[i] = v;
    }
}
198
// Convert premultiplied RGBA64 pixels to unpremultiplied 32-bit ARGB32
// (or RGBA8888 when RGBA == true), four pixels per iteration. Two XMM
// registers hold two 64-bit pixels each.
template<bool RGBA>
static inline void convertARGBFromRGBA64PM_sse4(uint *buffer, const QRgba64 *src, int count)
{
    int i = 0;
    // Scalar fallback when invalid-operation FP exceptions are unmasked
    // (the vector path's reciprocal can raise that flag for zero alpha).
    if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
        for (; i < count; ++i) {
            const QRgba64 v = src[i].unpremultiplied();
            buffer[i] = RGBA ? toRgba8888(v) : toArgb32(v);
        }
        return;
    }
    const __m128i alphaMask = _mm_set1_epi64x(qint64(Q_UINT64_C(0xffff) << 48));
    const __m128i alphaMask32 = _mm_set1_epi32(0xff000000);
    // Byte shuffle swapping R and B in every 32-bit pixel.
    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        __m128i srcVector1 = _mm_loadu_si128((const __m128i *)&src[i]);
        __m128i srcVector2 = _mm_loadu_si128((const __m128i *)&src[i + 2]);
        bool transparent1 = _mm_testz_si128(srcVector1, alphaMask); // both alphas 0
        bool opaque1 = _mm_testc_si128(srcVector1, alphaMask);      // both alphas 0xffff
        bool transparent2 = _mm_testz_si128(srcVector2, alphaMask);
        bool opaque2 = _mm_testc_si128(srcVector2, alphaMask);

        if (!(transparent1 && transparent2)) {
            if (!(opaque1 && opaque2)) {
                // Gather the four 16-bit alphas into one register of 32-bit lanes.
                __m128i srcVector1Alpha = _mm_srli_epi64(srcVector1, 48);
                __m128i srcVector2Alpha = _mm_srli_epi64(srcVector2, 48);
                __m128i srcVectorAlpha = _mm_packus_epi32(srcVector1Alpha, srcVector2Alpha);
                const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
                // Convert srcVectorAlpha to the final 8-bit alpha channel:
                // rounding division by 257, then shifted into the top byte.
                srcVectorAlpha = _mm_add_epi32(srcVectorAlpha, _mm_set1_epi32(128));
                srcVectorAlpha = _mm_sub_epi32(srcVectorAlpha, _mm_srli_epi32(srcVectorAlpha, 8));
                srcVectorAlpha = _mm_srli_epi32(srcVectorAlpha, 8);
                srcVectorAlpha = _mm_slli_epi32(srcVectorAlpha, 24);
                const __m128 ia = reciprocal_mul_ps(a, 255.0f); // ~255/a per pixel
                // src1..src4 hold pixels 0..3 as four 32-bit channels each.
                __m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
                __m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
                __m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
                __m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
                // Broadcast each pixel's 255/alpha factor to all four lanes.
                __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
                __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
                __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
                __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
                // channel * 255 / alpha: unpremultiply and narrow to 8-bit range.
                src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
                src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
                src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
                src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
                src1 = _mm_packus_epi32(src1, src2);
                src3 = _mm_packus_epi32(src3, src4);
                // Handle potential alpha == 0 values: force those pixels to 0.
                __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(srcVector1Alpha, zero);
                __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(srcVector2Alpha, zero);
                src1 = _mm_andnot_si128(srcVector1AlphaMask, src1);
                src3 = _mm_andnot_si128(srcVector2AlphaMask, src3);
                src1 = _mm_packus_epi16(src1, src3);
                // Fixup alpha values: insert the precomputed 8-bit alphas.
                src1 = _mm_blendv_epi8(src1, srcVectorAlpha, alphaMask32);
                // Fix RGB order
                if (!RGBA)
                    src1 = _mm_shuffle_epi8(src1, rgbaMask);
                _mm_storeu_si128((__m128i *)&buffer[i], src1);
            } else {
                // All opaque: only narrow 16 -> 8 bit, with rounding division
                // by 257: (x + 128 - ((x + 128) >> 8)) >> 8.
                __m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
                __m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
                __m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
                __m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
                src1 = _mm_add_epi32(src1, _mm_set1_epi32(128));
                src2 = _mm_add_epi32(src2, _mm_set1_epi32(128));
                src3 = _mm_add_epi32(src3, _mm_set1_epi32(128));
                src4 = _mm_add_epi32(src4, _mm_set1_epi32(128));
                src1 = _mm_sub_epi32(src1, _mm_srli_epi32(src1, 8));
                src2 = _mm_sub_epi32(src2, _mm_srli_epi32(src2, 8));
                src3 = _mm_sub_epi32(src3, _mm_srli_epi32(src3, 8));
                src4 = _mm_sub_epi32(src4, _mm_srli_epi32(src4, 8));
                src1 = _mm_srli_epi32(src1, 8);
                src2 = _mm_srli_epi32(src2, 8);
                src3 = _mm_srli_epi32(src3, 8);
                src4 = _mm_srli_epi32(src4, 8);
                src1 = _mm_packus_epi32(src1, src2);
                src3 = _mm_packus_epi32(src3, src4);
                src1 = _mm_packus_epi16(src1, src3);
                if (!RGBA)
                    src1 = _mm_shuffle_epi8(src1, rgbaMask);
                _mm_storeu_si128((__m128i *)&buffer[i], src1);
            }
        } else {
            // All four pixels fully transparent.
            _mm_storeu_si128((__m128i *)&buffer[i], zero);
        }
    }

    // Scalar tail for the remaining (count % 4) pixels.
    SIMD_EPILOGUE(i, count, 3) {
        buffer[i] = qConvertRgba64ToRgb32_sse4<RGBA ? PixelOrderRGB : PixelOrderBGR>(src[i]);
    }
}
294
// Unpremultiply a buffer of RGBA64 pixels in 64-bit precision. When
// mask == true the alpha is additionally forced to 0xffff (RGBx64 output).
// 'buffer' may alias 'src'.
template<bool mask>
static inline void convertRGBA64FromRGBA64PM_sse4(QRgba64 *buffer, const QRgba64 *src, int count)
{
    int i = 0;
    // Scalar fallback when invalid-operation FP exceptions are unmasked
    // (the vector path's reciprocal can raise that flag for zero alpha).
    if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
        for (; i < count; ++i) {
            QRgba64 v = src[i].unpremultiplied();
            if (mask)
                v.setAlpha(65535);
            buffer[i] = v;
        }
        return;
    }
    const __m128i alphaMask = _mm_set1_epi64x(qint64(Q_UINT64_C(0xffff) << 48));
    const __m128i zero = _mm_setzero_si128();

    for (; i < count - 3; i += 4) {
        // Two registers hold two 64-bit pixels each.
        __m128i srcVector1 = _mm_loadu_si128((const __m128i *)&src[i + 0]);
        __m128i srcVector2 = _mm_loadu_si128((const __m128i *)&src[i + 2]);
        bool transparent1 = _mm_testz_si128(srcVector1, alphaMask); // both alphas 0
        bool opaque1 = _mm_testc_si128(srcVector1, alphaMask);      // both alphas 0xffff
        bool transparent2 = _mm_testz_si128(srcVector2, alphaMask);
        bool opaque2 = _mm_testc_si128(srcVector2, alphaMask);

        if (!(transparent1 && transparent2)) {
            if (!(opaque1 && opaque2)) {
                // Gather the four 16-bit alphas into one register of 32-bit lanes.
                __m128i srcVector1Alpha = _mm_srli_epi64(srcVector1, 48);
                __m128i srcVector2Alpha = _mm_srli_epi64(srcVector2, 48);
                __m128i srcVectorAlpha = _mm_packus_epi32(srcVector1Alpha, srcVector2Alpha);
                const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
                const __m128 ia = reciprocal_mul_ps(a, 65535.0f); // ~65535/a per pixel
                // src1..src4 hold pixels 0..3 as four 32-bit channels each.
                __m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
                __m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
                __m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
                __m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
                // Broadcast each pixel's 65535/alpha factor to all four lanes.
                __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
                __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
                __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
                __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
                // channel * 65535 / alpha, rounded by the float->int conversion.
                src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
                src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
                src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
                src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
                src1 = _mm_packus_epi32(src1, src2);
                src3 = _mm_packus_epi32(src3, src4);
                // Handle potential alpha == 0 values: force those pixels to 0.
                __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(srcVector1Alpha, zero);
                __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(srcVector2Alpha, zero);
                src1 = _mm_andnot_si128(srcVector1AlphaMask, src1);
                src3 = _mm_andnot_si128(srcVector2AlphaMask, src3);
                // Fixup alpha values: force opaque, or restore the source alpha.
                if (mask) {
                    src1 = _mm_or_si128(src1, alphaMask);
                    src3 = _mm_or_si128(src3, alphaMask);
                } else {
                    src1 = _mm_blendv_epi8(src1, srcVector1, alphaMask);
                    src3 = _mm_blendv_epi8(src3, srcVector2, alphaMask);
                }
                _mm_storeu_si128((__m128i *)&buffer[i + 0], src1);
                _mm_storeu_si128((__m128i *)&buffer[i + 2], src3);
            } else {
                // All opaque: values are already unpremultiplied; only the
                // alpha force (and an actual copy) may be needed.
                if (mask) {
                    srcVector1 = _mm_or_si128(srcVector1, alphaMask);
                    srcVector2 = _mm_or_si128(srcVector2, alphaMask);
                }
                if (mask || src != buffer) {
                    _mm_storeu_si128((__m128i *)&buffer[i + 0], srcVector1);
                    _mm_storeu_si128((__m128i *)&buffer[i + 2], srcVector2);
                }
            }
        } else {
            // All four pixels fully transparent.
            _mm_storeu_si128((__m128i *)&buffer[i + 0], zero);
            _mm_storeu_si128((__m128i *)&buffer[i + 2], zero);
        }
    }

    // Scalar tail for the remaining (count % 4) pixels.
    SIMD_EPILOGUE(i, count, 3) {
        QRgba64 v = src[i].unpremultiplied();
        if (mask)
            v.setAlpha(65535);
        buffer[i] = v;
    }
}
378
#ifndef __haswell__
// Thin public wrappers around the to-premultiplied conversion templates.
// NOTE(review): compiled out for __haswell__ builds — presumably replaced
// by AVX2 versions elsewhere; confirm in the AVX2 draw helpers.

// In-place ARGB32 -> premultiplied ARGB32.
void QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_sse4<false>(buffer, buffer, count);
}

// In-place RGBA8888 -> premultiplied ARGB32.
void QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_sse4<true>(buffer, buffer, count);
}

// ARGB32 -> premultiplied RGBA64 into 'buffer'.
const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
                                                         const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_sse4<false>(buffer, src, count);
    return buffer;
}

// RGBA8888 -> premultiplied RGBA64 into 'buffer'.
const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
                                                           const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_sse4<true>(buffer, src, count);
    return buffer;
}

// Fetch variants: 'src' is a raw scanline pointer, offset by 'index' pixels.
const uint *QT_FASTCALL fetchARGB32ToARGB32PM_sse4(uint *buffer, const uchar *src, int index, int count,
                                                   const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_sse4<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_sse4(uint *buffer, const uchar *src, int index, int count,
                                                     const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_sse4<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
                                                      const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_sse4<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
                                                        const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_sse4<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
#endif // __haswell__
432
// Store wrappers: unpremultiply ARGB32PM into the destination scanline.
// Template arguments are <RGBA, RGBx>.

// ARGB32PM -> RGB32 (BGR order, alpha forced opaque).
void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<false,true>(d, src, count);
}

// ARGB32PM -> ARGB32 (unpremultiplied, alpha preserved).
void QT_FASTCALL storeARGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<false,false>(d, src, count);
}

// ARGB32PM -> RGBA8888 (byte-swizzled, alpha preserved).
void QT_FASTCALL storeRGBA8888FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                                const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<true,false>(d, src, count);
}

// ARGB32PM -> RGBX8888 (byte-swizzled, alpha forced opaque).
void QT_FASTCALL storeRGBXFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                            const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_sse4<true,true>(d, src, count);
}
460
// Convert premultiplied ARGB32 to premultiplied A2RGB30 (10-bit channels,
// 2-bit alpha), one pixel at a time via the scalar SSE4 helper.
template<QtPixelOrder PixelOrder>
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                                 const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    for (int i = 0; i < count; ++i)
        d[i] = qConvertArgb32ToA2rgb30_sse4<PixelOrder>(src[i]);
}

// Explicit instantiations for both 30-bit pixel orders.
template
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4<PixelOrderBGR>(uchar *dest, const uint *src, int index, int count,
                                                                const QList<QRgb> *, QDitherInfo *);
template
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4<PixelOrderRGB>(uchar *dest, const uint *src, int index, int count,
                                                                const QList<QRgb> *, QDitherInfo *);
476
#if QT_CONFIG(raster_64bit)
// Destination stores for the 64-bit raster pipeline: unpremultiply a
// RGBA64PM span into the 32-bit scanline at (x, y).

void QT_FASTCALL destStore64ARGB32_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
{
    uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
    convertARGBFromRGBA64PM_sse4<false>(dest, buffer, length);
}

void QT_FASTCALL destStore64RGBA8888_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
{
    uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
    convertARGBFromRGBA64PM_sse4<true>(dest, buffer, length);
}
#endif
490
// Store wrappers: unpremultiply RGBA64PM into the destination scanline.

// RGBA64PM -> ARGB32 (narrowed to 8-bit channels).
void QT_FASTCALL storeARGB32FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    uint *d = (uint*)dest + index;
    convertARGBFromRGBA64PM_sse4<false>(d, src, count);
}

// RGBA64PM -> RGBA8888 (narrowed to 8-bit channels).
void QT_FASTCALL storeRGBA8888FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
                                                const QList<QRgb> *, QDitherInfo *)
{
    uint *d = (uint*)dest + index;
    convertARGBFromRGBA64PM_sse4<true>(d, src, count);
}

// RGBA64PM -> RGBA64 (alpha preserved).
void QT_FASTCALL storeRGBA64FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    QRgba64 *d = (QRgba64 *)dest + index;
    convertRGBA64FromRGBA64PM_sse4<false>(d, src, count);
}

// RGBA64PM -> RGBx64 (alpha forced to 0xffff).
void QT_FASTCALL storeRGBx64FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    QRgba64 *d = (QRgba64 *)dest + index;
    convertRGBA64FromRGBA64PM_sse4<true>(d, src, count);
}
518
519#if QT_CONFIG(raster_fp)
520const QRgbaFloat32 *QT_FASTCALL fetchRGBA32FToRGBA32F_sse4(QRgbaFloat32 *buffer, const uchar *src, int index, int count,
521 const QList<QRgb> *, QDitherInfo *)
522{
523 const QRgbaFloat32 *s = reinterpret_cast<const QRgbaFloat32 *>(src) + index;
524 for (int i = 0; i < count; ++i) {
525 __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(s + i));
526 __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
527 vsf = _mm_mul_ps(vsf, vsa);
528 vsf = _mm_insert_ps(vsf, vsa, 0x30);
529 _mm_store_ps(reinterpret_cast<float *>(buffer + i), vsf);
530 }
531 return buffer;
532}
533
534void QT_FASTCALL storeRGBX32FFromRGBA32F_sse4(uchar *dest, const QRgbaFloat32 *src, int index, int count,
535 const QList<QRgb> *, QDitherInfo *)
536{
537 QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
538 const __m128 zero = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
539 for (int i = 0; i < count; ++i) {
540 __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(src + i));
541 const __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
542 const float a = _mm_cvtss_f32(vsa);
543 if (a == 1.0f)
544 { }
545 else if (a == 0.0f)
546 vsf = zero;
547 else {
548 __m128 vsr = _mm_rcp_ps(vsa);
549 vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
550 vsf = _mm_mul_ps(vsf, vsr);
551 vsf = _mm_insert_ps(vsf, _mm_set_ss(1.0f), 0x30);
552 }
553 _mm_store_ps(reinterpret_cast<float *>(d + i), vsf);
554 }
555}
556
557void QT_FASTCALL storeRGBA32FFromRGBA32F_sse4(uchar *dest, const QRgbaFloat32 *src, int index, int count,
558 const QList<QRgb> *, QDitherInfo *)
559{
560 QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
561 const __m128 zero = _mm_set1_ps(0.0f);
562 for (int i = 0; i < count; ++i) {
563 __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(src + i));
564 const __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
565 const float a = _mm_cvtss_f32(vsa);
566 if (a == 1.0f)
567 { }
568 else if (a == 0.0f)
569 vsf = zero;
570 else {
571 __m128 vsr = _mm_rcp_ps(vsa);
572 vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
573 vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
574 vsf = _mm_mul_ps(vsf, vsr);
575 }
576 _mm_store_ps(reinterpret_cast<float *>(d + i), vsf);
577 }
578}
579#endif
580
581
582QT_END_NAMESPACE
583
584#endif