Qt
Internal and contributor documentation for the Qt SDK. Note: these are NOT the official API docs; those are at https://doc.qt.io/
qdrawhelper_lsx.cpp
1// Copyright (C) 2024 Loongson Technology Corporation Limited.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include <private/qdrawhelper_loongarch64_p.h>
5
6#ifdef QT_COMPILER_SUPPORTS_LSX
7
8#include <private/qdrawingprimitive_lsx_p.h>
9#include <private/qpaintengine_raster_p.h>
10
11QT_BEGIN_NAMESPACE
12
13void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
14 const uchar *srcPixels, int sbpl,
15 int w, int h,
16 int const_alpha)
17{
18 const quint32 *src = (const quint32 *) srcPixels;
19 quint32 *dst = (quint32 *) destPixels;
20 if (const_alpha == 256) {
21 for (int y = 0; y < h; ++y) {
22 BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, w);
23 dst = (quint32 *)(((uchar *) dst) + dbpl);
24 src = (const quint32 *)(((const uchar *) src) + sbpl);
25 }
26 } else if (const_alpha != 0) {
27 // dest = (s + d * sia) * ca + d * cia
28 // = s * ca + d * (sia * ca + cia)
29 // = s * ca + d * (1 - sa*ca)
30 const_alpha = (const_alpha * 255) >> 8;
31
32 for (int y = 0; y < h; ++y) {
33 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, w, const_alpha);
34 dst = (quint32 *)(((uchar *) dst) + dbpl);
35 src = (const quint32 *)(((const uchar *) src) + sbpl);
36 }
37 }
38}
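// A minimal scalar sketch (illustration only, not used by this file) of what the
// const_alpha branch above computes per pixel, assuming BYTE_MUL() and qAlpha()
// from <private/qdrawhelper_p.h>: after the 0..256 -> 0..255 rescale, the source
// is scaled by const_alpha and then composed with ordinary source-over, matching
// the derivation s * ca + d * (1 - sa * ca) in the comment:
//
//     quint32 s = BYTE_MUL(src[x], const_alpha);  // s * ca, per channel
//     dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));  // + d * (255 - alpha(s * ca)) / 255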
39
40// qblendfunctions.cpp
41void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
42 const uchar *srcPixels, int sbpl,
43 int w, int h,
44 int const_alpha);
45
46void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
47 const uchar *srcPixels, int sbpl,
48 int w, int h,
49 int const_alpha)
50{
51 const quint32 *src = (const quint32 *) srcPixels;
52 quint32 *dst = (quint32 *) destPixels;
53 if (const_alpha != 256) {
54 if (const_alpha != 0) {
55 const __m128i half = __lsx_vreplgr2vr_h(0x80);
56 const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
57
58 const_alpha = (const_alpha * 255) >> 8;
59 int one_minus_const_alpha = 255 - const_alpha;
60 const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
61 const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);
62 for (int y = 0; y < h; ++y) {
63 int x = 0;
64
65 // First, align dest to 16 bytes:
66 ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
67 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
68 dst[x], one_minus_const_alpha);
69 }
70
71 for (; x < w-3; x += 4) {
72 __m128i srcVector = __lsx_vld(&src[x], 0);
73 __m128i dstVector = __lsx_vld(&dst[x], 0);
74 INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
75 oneMinusConstAlpha, colorMask, half);
76 __lsx_vst(dstVector, &dst[x], 0);
77 }
78 SIMD_EPILOGUE(x, w, 3)
79 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
80 dst[x], one_minus_const_alpha);
81 dst = (quint32 *)(((uchar *) dst) + dbpl);
82 src = (const quint32 *)(((const uchar *) src) + sbpl);
83 }
84 }
85 } else {
86 qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
87 }
88}
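// For reference, INTERPOLATE_PIXEL_255(x, a, y, b) computes roughly
// (x*a + y*b) / 255 on each of the four channels, so the prologue, the LSX loop
// and the epilogue above all evaluate the same weighted average
// src*ca + dst*(255 - ca), just one versus four pixels at a time.
// One-channel example with ca = 128: src = 200, dst = 100 gives
// (200*128 + 100*127) / 255 ~= 150.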
89
90void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels,
91 int length, uint const_alpha)
92{
93 Q_ASSERT(const_alpha < 256);
94
95 const quint32 *src = (const quint32 *) srcPixels;
96 quint32 *dst = (quint32 *) destPixels;
97
98 if (const_alpha == 255) {
99 BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, length);
100 } else {
101 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, length, const_alpha);
102 }
103}
104
105void QT_FASTCALL comp_func_Plus_lsx(uint *dst, const uint *src, int length, uint const_alpha)
106{
107 int x = 0;
108
109 if (const_alpha == 255) {
110 // 1) Prologue: align destination on 16 bytes
111 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
112 dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
113
114 // 2) composition with LSX
115 for (; x < length - 3; x += 4) {
116 const __m128i srcVector = __lsx_vld(&src[x], 0);
117 const __m128i dstVector = __lsx_vld(&dst[x], 0);
118
119 const __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
120 __lsx_vst(result, &dst[x], 0);
121 }
122
123 // 3) Epilogue:
124 SIMD_EPILOGUE(x, length, 3)
125 dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
126 } else {
127 const int one_minus_const_alpha = 255 - const_alpha;
128 const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
129 const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);
130
131 // 1) Prologue: align destination on 16 bytes
132 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
133 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
134 const_alpha,
135 one_minus_const_alpha);
136
137 const __m128i half = __lsx_vreplgr2vr_h(0x80);
138 const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
139 // 2) composition with LSX
140 for (; x < length - 3; x += 4) {
141 const __m128i srcVector = __lsx_vld(&src[x], 0);
142 __m128i dstVector = __lsx_vld(&dst[x], 0);
143 __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
144 INTERPOLATE_PIXEL_255_LSX(result, dstVector, constAlphaVector,
145 oneMinusConstAlpha, colorMask, half);
146 __lsx_vst(dstVector, &dst[x], 0);
147 }
148
149 // 3) Epilogue:
150 SIMD_EPILOGUE(x, length, 3)
151 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
152 const_alpha, one_minus_const_alpha);
153 }
154}
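// The Plus operator above is a per-channel saturating add: __lsx_vsadd_bu clamps
// each of the 16 byte lanes at 255, e.g. min(255, 200 + 100) = 255, which is what
// comp_func_Plus_one_pixel does one pixel at a time. With a partial const_alpha
// the clamped sum is then interpolated back toward the original destination by
// INTERPOLATE_PIXEL_255_LSX.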
155
156void QT_FASTCALL comp_func_Source_lsx(uint *dst, const uint *src, int length, uint const_alpha)
157{
158 if (const_alpha == 255) {
159 ::memcpy(dst, src, length * sizeof(uint));
160 } else {
161 const int ialpha = 255 - const_alpha;
162
163 int x = 0;
164
165 // 1) prologue, align on 16 bytes
166 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
167 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
168
169 // 2) interpolate pixels with LSX
170 const __m128i half = __lsx_vreplgr2vr_h(0x80);
171 const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
172
173 const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
174 const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(ialpha);
175 for (; x < length - 3; x += 4) {
176 const __m128i srcVector = __lsx_vld(&src[x], 0);
177 __m128i dstVector = __lsx_vld(&dst[x], 0);
178 INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
179 oneMinusConstAlpha, colorMask, half);
180 __lsx_vst(dstVector, &dst[x], 0);
181 }
182
183 // 3) Epilogue
184 SIMD_EPILOGUE(x, length, 3)
185 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
186 }
187}
188
189static Q_NEVER_INLINE
190void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
191{
192 __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
193 __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);
194
195 while (dst128 + 4 <= end128) {
196 __lsx_vst(value128, dst128 + 0, 0);
197 __lsx_vst(value128, dst128 + 1, 0);
198 __lsx_vst(value128, dst128 + 2, 0);
199 __lsx_vst(value128, dst128 + 3, 0);
200 dst128 += 4;
201 }
202
203 bytecount %= 4 * sizeof(__m128i);
204 switch (bytecount / sizeof(__m128i)) {
205 case 3: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
206 case 2: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
207 case 1: __lsx_vst(value128, dst128++, 0);
208 }
209}
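// qt_memfillXX_aligned only ever issues whole 16-byte stores: the switch covers
// the 16/32/48-byte remainders of the 64-byte main loop, and any leftover smaller
// than 16 bytes is deliberately not written here - the callers below take care of
// it (qt_memfill64_lsx peels an odd element, qt_memfill32_lsx writes the trailing
// 1-3 pixels up front).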
210
211void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count)
212{
213 quintptr misaligned = quintptr(dest) % sizeof(__m128i);
214 if (misaligned && count) {
215 *dest++ = value;
216 --count;
217 }
218
219 if (count % 2) {
220 dest[count - 1] = value;
221 --count;
222 }
223
224 qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_d(value), count * sizeof(quint64));
225}
226
227void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count)
228{
229 if (count < 4) {
230 // this simplifies the code below: the first switch can fall through
231 // without checking the value of count
232 switch (count) {
233 case 3: *dest++ = value; Q_FALLTHROUGH();
234 case 2: *dest++ = value; Q_FALLTHROUGH();
235 case 1: *dest = value;
236 }
237 return;
238 }
239
240 const int align = (quintptr)(dest) & 0xf;
241 switch (align) {
242 case 4: *dest++ = value; --count; Q_FALLTHROUGH();
243 case 8: *dest++ = value; --count; Q_FALLTHROUGH();
244 case 12: *dest++ = value; --count;
245 }
246
247 const int rest = count & 0x3;
248 if (rest) {
249 switch (rest) {
250 case 3: dest[count - 3] = value; Q_FALLTHROUGH();
251 case 2: dest[count - 2] = value; Q_FALLTHROUGH();
252 case 1: dest[count - 1] = value;
253 }
254 }
255
256 qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_w(value), count * sizeof(quint32));
257}
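// The first switch above relies on fall-through: a destination that is 4 bytes
// past a 16-byte boundary writes three leading pixels, 8 bytes past writes two,
// 12 bytes past writes one, leaving dest 16-byte aligned. The second switch
// writes the count & 3 trailing pixels scalar, since qt_memfillXX_aligned only
// issues whole 16-byte stores.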
258
259void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length,
260 uint color, uint const_alpha)
261{
262 if (const_alpha == 255) {
263 qt_memfill32(destPixels, color, length);
264 } else {
265 const quint32 ialpha = 255 - const_alpha;
266 color = BYTE_MUL(color, const_alpha);
267 int x = 0;
268
269 quint32 *dst = (quint32 *) destPixels;
270 const __m128i colorVector = __lsx_vreplgr2vr_w(color);
271 const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
272 const __m128i half = __lsx_vreplgr2vr_h(0x80);
273 const __m128i iAlphaVector = __lsx_vreplgr2vr_h(ialpha);
274
275 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
276 destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
277
278 for (; x < length-3; x += 4) {
279 __m128i dstVector = __lsx_vld(&dst[x], 0);
280 BYTE_MUL_LSX(dstVector, iAlphaVector, colorMask, half);
281 dstVector = __lsx_vadd_b(colorVector, dstVector);
282 __lsx_vst(dstVector, &dst[x], 0);
283 }
284 SIMD_EPILOGUE(x, length, 3)
285 destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
286 }
287}
288
289void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length,
290 uint color, uint const_alpha)
291{
292 if ((const_alpha & qAlpha(color)) == 255) {
293 qt_memfill32(destPixels, color, length);
294 } else {
295 if (const_alpha != 255)
296 color = BYTE_MUL(color, const_alpha);
297
298 const quint32 minusAlphaOfColor = qAlpha(~color);
299 int x = 0;
300
301 quint32 *dst = (quint32 *) destPixels;
302 const __m128i colorVector = __lsx_vreplgr2vr_w(color);
303 const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
304 const __m128i half = __lsx_vreplgr2vr_h(0x80);
305 const __m128i minusAlphaOfColorVector = __lsx_vreplgr2vr_h(minusAlphaOfColor);
306
307 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
308 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
309
310 for (; x < length-3; x += 4) {
311 __m128i dstVector = __lsx_vld(&dst[x], 0);
312 BYTE_MUL_LSX(dstVector, minusAlphaOfColorVector, colorMask, half);
313 dstVector = __lsx_vadd_b(colorVector, dstVector);
314 __lsx_vst(dstVector, &dst[x], 0);
315 }
316 SIMD_EPILOGUE(x, length, 3)
317 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
318 }
319}
320
321void qt_bitmapblit32_lsx_base(QRasterBuffer *rasterBuffer, int x, int y,
322 quint32 color,
323 const uchar *src, int width, int height, int stride)
324{
325 quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
326 const int destStride = rasterBuffer->stride<quint32>();
327
328 const __m128i c128 = __lsx_vreplgr2vr_w(color);
329 const __m128i maskmask1 = (__m128i)(v4u32){0x80808080, 0x40404040,
330 0x20202020, 0x10101010};
331 const __m128i maskadd1 = (__m128i)(v4i32){0x00000000, 0x40404040,
332 0x60606060, 0x70707070};
333
334 if (width > 4) {
335 const __m128i maskmask2 = (__m128i)(v4i32){0x08080808, 0x04040404,
336 0x02020202, 0x01010101};
337 const __m128i maskadd2 = (__m128i)(v4i32){0x78787878, 0x7c7c7c7c,
338 0x7e7e7e7e, 0x7f7f7f7f};
339 while (height--) {
340 for (int x = 0; x < width; x += 8) {
341 const quint8 s = src[x >> 3];
342 if (!s)
343 continue;
344 __m128i mask1 = __lsx_vreplgr2vr_b(s);
345 __m128i mask2 = mask1;
346
347 mask1 = __lsx_vand_v(mask1, maskmask1);
348 mask1 = __lsx_vadd_b(mask1, maskadd1);
349
350 __m128i destSrc1 = __lsx_vld((char*)(dest + x), 0);
351
352 mask1 = __lsx_vslti_b(mask1,0);
353 destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
354 __lsx_vst(destSrc1, (char*)(dest + x), 0);
355
356 __m128i destSrc2 = __lsx_vld((char*)(dest + x + 4), 0);
357
358 mask2 = __lsx_vand_v(mask2, maskmask2);
359 mask2 = __lsx_vadd_b(mask2, maskadd2);
360
361 mask2 = __lsx_vslti_b(mask2,0);
362 destSrc2 = __lsx_vbitsel_v(destSrc2, c128, mask2);
363 __lsx_vst(destSrc2, (char*)(dest + x + 4), 0);
364 }
365 dest += destStride;
366 src += stride;
367 }
368 } else {
369 while (height--) {
370 const quint8 s = *src;
371 if (s) {
372 __m128i mask1 = __lsx_vreplgr2vr_b(s);
373
374 __m128i destSrc1 = __lsx_vld((char*)(dest), 0);
375 mask1 = __lsx_vand_v(mask1, maskmask1);
376 mask1 = __lsx_vadd_b(mask1, maskadd1);
377
378 mask1 = __lsx_vslti_b(mask1, 0);
379 destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
380 __lsx_vst(destSrc1, (char*)(dest), 0);
381 }
382 dest += destStride;
383 src += stride;
384 }
385 }
386}
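// How the mask trick above works: the source byte s is replicated into every
// lane, ANDed with a per-lane bit selector (0x80, 0x40, 0x20, ...) and then
// offset so that a set bit always lands on 0x80, i.e. the byte's sign bit, while
// a clear bit stays below it. For bit 6 that is 0x40 + 0x40 = 0x80 when set
// versus 0x00 + 0x40 = 0x40 when clear. __lsx_vslti_b(mask, 0) turns the sign
// bit into an all-ones byte mask, and __lsx_vbitsel_v then selects the fill
// colour over the destination wherever the glyph bit was set.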
387
388void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
389 const QRgba64 &color,
390 const uchar *src, int width, int height, int stride)
391{
392 qt_bitmapblit32_lsx_base(rasterBuffer, x, y, color.toArgb32(), src, width, height, stride);
393}
394
395void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
396 const QRgba64 &color,
397 const uchar *src, int width, int height, int stride)
398{
399 qt_bitmapblit32_lsx_base(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), src, width, height, stride);
400}
401
402void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
403 const QRgba64 &color,
404 const uchar *src, int width, int height, int stride)
405{
406 const quint16 c = qConvertRgb32To16(color.toArgb32());
407 quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
408 const int destStride = rasterBuffer->stride<quint32>();
409
410 const __m128i c128 = __lsx_vreplgr2vr_h(c);
411 const __m128i maskmask = (__m128i)(v8u16){0x8080, 0x4040, 0x2020, 0x1010,
412 0x0808, 0x0404, 0x0202, 0x0101};
413
414 const __m128i maskadd = (__m128i)(v8i16){0x0000, 0x4040, 0x6060, 0x7070,
415 0x7878, 0x7c7c, 0x7e7e, 0x7f7f};
416 while (--height >= 0) {
417 for (int x = 0; x < width; x += 8) {
418 const quint8 s = src[x >> 3];
419 if (!s)
420 continue;
421 __m128i mask = __lsx_vreplgr2vr_b(s);
422 __m128i destSrc = __lsx_vld((char*)(dest + x), 0);
423 mask = __lsx_vand_v(mask, maskmask);
424 mask = __lsx_vadd_b(mask, maskadd);
425 mask = __lsx_vslti_b(mask, 0);
426 destSrc = __lsx_vbitsel_v(destSrc, c128, mask);
427 __lsx_vst(destSrc, (char*)(dest + x), 0);
428 }
429 dest += destStride;
430 src += stride;
431 }
432}
433
434class QSimdLsx
435{
436public:
437 typedef __m128i Int32x4;
438 typedef __m128 Float32x4;
439
440 union Vect_buffer_i { Int32x4 v; int i[4]; };
441 union Vect_buffer_f { Float32x4 v; float f[4]; };
442
443 static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return __lsx_vreplfr2vr_s(x); }
444 static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return __lsx_vreplfr2vr_s(x); }
445 static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return __lsx_vreplgr2vr_w(x); }
446 static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return __lsx_vreplgr2vr_w(x); }
447
448 static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return __lsx_vfadd_s(a, b); }
449 static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return __lsx_vadd_w(a, b); }
450
451 static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return __lsx_vfmax_s(a, b); }
452 static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return __lsx_vfmin_s(a, b); }
453 static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return __lsx_vmin_h(a, b); }
454
455 static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return __lsx_vand_v(a, b); }
456
457 static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return __lsx_vfsub_s(a, b); }
458 static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return __lsx_vsub_w(a, b); }
459
460 static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return __lsx_vfmul_s(a, b); }
461
462 static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return __lsx_vfsqrt_s(x); }
463
464 static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return __lsx_vftintrz_w_s(x); }
465
466 static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return __lsx_vfcmp_clt_s(b, a); }
467};
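// QSimdLsx just provides the small dup/add/sub/mul/min/max/sqrt/compare vector
// interface that the shared QRadialFetchSimd template (see qdrawhelper_p.h)
// expects, so the radial gradient fetcher below is a plain instantiation of the
// generic implementation with LSX intrinsics plugged in.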
468
469const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op,
470 const QSpanData *data,
471 int y, int x, int length)
472{
473 return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdLsx>,uint>(buffer, op, data, y, x, length);
474}
475
476void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
477 const uchar *srcPixels, int sbpl, int srch,
478 const QRectF &targetRect,
479 const QRectF &sourceRect,
480 const QRect &clip,
481 int const_alpha)
482{
483 if (const_alpha != 256) {
484 // from qblendfunctions.cpp
485 extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
486 const uchar *srcPixels, int sbpl, int srch,
487 const QRectF &targetRect,
488 const QRectF &sourceRect,
489 const QRect &clip,
490 int const_alpha);
491 return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch,
492 targetRect, sourceRect, clip, const_alpha);
493 }
494
495 qreal sx = sourceRect.width() / (qreal)targetRect.width();
496 qreal sy = sourceRect.height() / (qreal)targetRect.height();
497
498
499 const int ix = 0x00010000 * sx;
500 const int iy = 0x00010000 * sy;
501
502 QRect tr = targetRect.normalized().toRect();
503 tr = tr.intersected(clip);
504 if (tr.isEmpty())
505 return;
506 const int tx1 = tr.left();
507 const int ty1 = tr.top();
508 int h = tr.height();
509 int w = tr.width();
510
511 quint32 basex;
512 quint32 srcy;
513
514 if (sx < 0) {
515 int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1;
516 basex = quint32(sourceRect.right() * 65536) + dstx;
517 } else {
518 int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1;
519 basex = quint32(sourceRect.left() * 65536) + dstx;
520 }
521 if (sy < 0) {
522 int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1;
523 srcy = quint32(sourceRect.bottom() * 65536) + dsty;
524 } else {
525 int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1;
526 srcy = quint32(sourceRect.top() * 65536) + dsty;
527 }
528
529 quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;
530
531 const __m128i nullVector = __lsx_vreplgr2vr_w(0);
532 const __m128i half = __lsx_vreplgr2vr_h(0x80);
533 const __m128i one = __lsx_vreplgr2vr_h(0xff);
534 const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
535 const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
536 const __m128i ixVector = __lsx_vreplgr2vr_w(4*ix);
537
538    // this bounds check is required because the floating point rounding above might in some cases lead to
539    // w/h values that are one pixel too large, falling outside of the valid image area.
540 const int ystart = srcy >> 16;
541 if (ystart >= srch && iy < 0) {
542 srcy += iy;
543 --h;
544 }
545 const int xstart = basex >> 16;
546 if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < 0) {
547 basex += ix;
548 --w;
549 }
550 int yend = (srcy + iy * (h - 1)) >> 16;
551 if (yend < 0 || yend >= srch)
552 --h;
553 int xend = (basex + ix * (w - 1)) >> 16;
554 if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32)))
555 --w;
556
557 while (--h >= 0) {
558 const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
559 int srcx = basex;
560 int x = 0;
561
562 ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
563 uint s = src[srcx >> 16];
564 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
565 srcx += ix;
566 }
567
568 __m128i srcxVector = (__m128i)(v4i32){srcx + ix + ix + ix, srcx + ix + ix, srcx + ix, srcx};
569
570 for (; x < (w - 3); x += 4) {
571 const int idx0 = __lsx_vpickve2gr_h(srcxVector, 1);
572 const int idx1 = __lsx_vpickve2gr_h(srcxVector, 3);
573 const int idx2 = __lsx_vpickve2gr_h(srcxVector, 5);
574 const int idx3 = __lsx_vpickve2gr_h(srcxVector, 7);
575 srcxVector = __lsx_vadd_w(srcxVector, ixVector);
576
577 const __m128i srcVector = (__m128i)((v4u32){src[idx3], src[idx2], src[idx1], src[idx0]});
578
579 BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
580 }
581
582 SIMD_EPILOGUE(x, w, 3) {
583 uint s = src[(basex + x*ix) >> 16];
584 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
585 }
586 dst = (quint32 *)(((uchar *) dst) + dbpl);
587 srcy += iy;
588 }
589}
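// The scaling loop above walks the source image in 16.16 fixed point: ix and iy
// are the per-destination-pixel steps (0x00010000 == 1.0), and srcx >> 16 /
// srcy >> 16 recover the integer source coordinates. For a 2x downscale sx is
// 2.0, so ix == 0x00020000 and each destination pixel advances by two source
// columns. Four consecutive x positions are kept in srcxVector so the vector
// loop can fetch four source pixels per iteration; extracting the high halfword
// of each 32-bit lane is the same as the scalar >> 16.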
590
591const uint *QT_FASTCALL fetchPixelsBPP24_lsx(uint *buffer, const uchar *src, int index, int count)
592{
593 const quint24 *s = reinterpret_cast<const quint24 *>(src);
594 for (int i = 0; i < count; ++i)
595 buffer[i] = s[index + i];
596 return buffer;
597}
598
599const uint * QT_FASTCALL qt_fetchUntransformed_888_lsx(uint *buffer, const Operator *,
600 const QSpanData *data,
601 int y, int x, int length)
602{
603 const uchar *line = data->texture.scanLine(y) + x * 3;
604 // from image/qimage_lsx.cpp
605 extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len);
606 qt_convert_rgb888_to_rgb32_lsx(buffer, line, length);
607 return buffer;
608}
609
610void qt_memfill24_lsx(quint24 *dest, quint24 color, qsizetype count)
611{
612 // LCM of 12 and 16 bytes is 48 bytes (16 px)
613 quint32 v = color;
614 __m128i m = __lsx_vinsgr2vr_w(__lsx_vldi(0), v, 0);
615 quint24 *end = dest + count;
616
617 constexpr uchar x = 2, y = 1, z = 0;
618 alignas(__m128i) static const uchar
619 shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };
620 __m128i indexMask = (__m128i)(v16i8){2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
621
622 __m128i mval1 = __lsx_vshuf_b(m, m, __lsx_vld(reinterpret_cast<const __m128i *>(shuffleMask), 0));
623 __m128i mval2 = __lsx_vshuf_b(m, m, __lsx_vld(reinterpret_cast<const __m128i *>(shuffleMask + 1), 0));
624 __m128i mval3 = __lsx_vshuf_b(mval2, mval1, indexMask);
625
626 for ( ; dest + 16 <= end; dest += 16) {
627 __lsx_vst(mval1, reinterpret_cast<__m128i *>(dest) + 0, 0);
628 __lsx_vst(mval2, reinterpret_cast<__m128i *>(dest) + 1, 0);
629 __lsx_vst(mval3, reinterpret_cast<__m128i *>(dest) + 2, 0);
630 }
631
632 if (count < 3) {
633 if (count > 1)
634 end[-2] = v;
635 if (count)
636 end[-1] = v;
637 return;
638 }
639
640 // less than 16px/48B left
641 uchar *ptr = reinterpret_cast<uchar *>(dest);
642 uchar *ptr_end = reinterpret_cast<uchar *>(end);
643 qptrdiff left = ptr_end - ptr;
644 if (left >= 24) {
645 // 8px/24B or more left
646 __lsx_vst(mval1, reinterpret_cast<__m128i *>(ptr) + 0, 0);
647 __lsx_vstelm_d(mval2, reinterpret_cast<__m128i *>(ptr) + 1, 0, 0);
648 ptr += 24;
649 left -= 24;
650 }
651
652 // less than 8px/24B left
653
654 if (left >= 16) {
655 // but more than 5px/15B left
656 __lsx_vst(mval1, reinterpret_cast<__m128i *>(ptr) , 0);
657 } else if (left >= 8) {
658 // but more than 2px/6B left
659 __lsx_vstelm_d(mval1, reinterpret_cast<__m128i *>(ptr), 0, 0);
660 }
661
662 if (left) {
663 // 1 or 2px left
664 // store 8 bytes ending with the right values (will overwrite a bit)
665 __lsx_vstelm_d(mval2, reinterpret_cast<__m128i *>(ptr_end - 8), 0, 0);
666 }
667}
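// Sixteen RGB888 pixels occupy 48 bytes, i.e. exactly three 16-byte vectors,
// which is why the main loop above stores mval1/mval2/mval3 per iteration:
// shuffleMask repeats the three colour bytes, and reading it at offsets 0 and 1
// (plus combining the two via indexMask) yields the 48-byte pattern at the three
// phases needed at offsets 0, 16 and 32 within the block.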
668
669void QT_FASTCALL rbSwap_888_lsx(uchar *dst, const uchar *src, int count)
670{
671 int i = 0;
672 const static __m128i shuffleMask1 = (__m128i)(v16i8){2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 15};
673 const static __m128i shuffleMask2 = (__m128i)(v16i8){0, 1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, 14, 15};
674 const static __m128i shuffleMask3 = (__m128i)(v16i8){0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13};
675
676 for (; i + 15 < count; i += 16) {
677 __m128i s1 = __lsx_vld(src, 0);
678 __m128i s2 = __lsx_vld((src + 16), 0);
679 __m128i s3 = __lsx_vld((src + 32), 0);
680 s1 = __lsx_vshuf_b(s1, s1, shuffleMask1);
681 s2 = __lsx_vshuf_b(s2, s2, shuffleMask2);
682 s3 = __lsx_vshuf_b(s3, s3, shuffleMask3);
683 __lsx_vst(s1, dst, 0);
684 __lsx_vst(s2, (dst + 16), 0);
685 __lsx_vst(s3, (dst + 32), 0);
686
687 // Now fix the last four misplaced values
688 std::swap(dst[15], dst[17]);
689 std::swap(dst[30], dst[32]);
690
691 src += 48;
692 dst += 48;
693 }
694
695 if (src != dst) {
696 SIMD_EPILOGUE(i, count, 15) {
697 dst[0] = src[2];
698 dst[1] = src[1];
699 dst[2] = src[0];
700 dst += 3;
701 src += 3;
702 }
703 } else {
704 SIMD_EPILOGUE(i, count, 15) {
705 std::swap(dst[0], dst[2]);
706 dst += 3;
707 }
708 }
709}
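// Each shuffle above swaps R and B inside the five whole 3-byte pixels of a
// 16-byte chunk; the two pixels straddling the 16-byte boundaries (bytes 15..17
// and 30..32) cannot be fixed by an in-register byte shuffle, which is why their
// outer bytes are swapped in memory afterwards.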
710
711template<bool RGBA>
712static void convertARGBToARGB32PM_lsx(uint *buffer, const uint *src, int count)
713{
714 int i = 0;
715 const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
716 const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
717 const __m128i shuffleMask = (__m128i)(v16i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
718 const __m128i half = __lsx_vreplgr2vr_h(0x0080);
719 const __m128i zero = __lsx_vldi(0);
720
721 for (; i < count - 3; i += 4) {
722 __m128i srcVector = __lsx_vld(&src[i], 0);
723 const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
724 if (testz[0]!=0) {
725 const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
726 if (testc[0]!=0) {
727 if (RGBA)
728 srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
729 __m128i src1 = __lsx_vilvl_b(zero, srcVector);
730 __m128i src2 = __lsx_vilvh_b(zero, srcVector);
731 __m128i alpha1 = __lsx_vshuf_b(zero, src1, shuffleMask);
732 __m128i alpha2 = __lsx_vshuf_b(zero, src2, shuffleMask);
733 src1 = __lsx_vmul_h(src1, alpha1);
734 src2 = __lsx_vmul_h(src2, alpha2);
735 src1 = __lsx_vadd_h(src1, __lsx_vsrli_h(src1, 8));
736 src2 = __lsx_vadd_h(src2, __lsx_vsrli_h(src2, 8));
737 src1 = __lsx_vadd_h(src1, half);
738 src2 = __lsx_vadd_h(src2, half);
739 src1 = __lsx_vsrli_h(src1, 8);
740 src2 = __lsx_vsrli_h(src2, 8);
741 __m128i blendMask = (__m128i)(v8i16){0, 1, 2, 11, 4, 5, 6, 15};
742 src1 = __lsx_vshuf_h(blendMask, alpha1, src1);
743 src2 = __lsx_vshuf_h(blendMask, alpha2, src2);
744 src1 = __lsx_vmaxi_h(src1, 0);
745 src2 = __lsx_vmaxi_h(src2, 0);
746 srcVector = __lsx_vpickev_b(__lsx_vsat_hu(src2, 7), __lsx_vsat_hu(src1, 7));
747 __lsx_vst(srcVector, &buffer[i], 0);
748 } else {
749 if (RGBA)
750 __lsx_vst(__lsx_vshuf_b(zero, srcVector, rgbaMask), &buffer[i], 0);
751 else if (buffer != src)
752 __lsx_vst(srcVector, &buffer[i], 0);
753 }
754 } else {
755 __lsx_vst(zero, &buffer[i], 0);
756 }
757 }
758
759 SIMD_EPILOGUE(i, count, 3) {
760 uint v = qPremultiply(src[i]);
761 buffer[i] = RGBA ? RGBA2ARGB(v) : v;
762 }
763}
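// The multiply/shift sequence above is the usual approximate divide-by-255 with
// rounding: for x = channel * alpha,
//     (x + (x >> 8) + 0x80) >> 8
// is what BYTE_MUL() and qPremultiply() compute one pixel at a time in the
// epilogue; here it is done on eight 16-bit lanes at once, i.e. two pixels per
// 128-bit vector (the four pixels per iteration are split into src1/src2).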
764
765template<bool RGBA>
766static void convertARGBToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
767{
768 int i = 0;
769 const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
770 const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
771 const __m128i shuffleMask = (__m128i)(v16i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
772 const __m128i zero = __lsx_vldi(0);
773
774 for (; i < count - 3; i += 4) {
775 __m128i srcVector = __lsx_vld(&src[i], 0);
776 const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
777 if (testz[0]!=0) {
778 const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
779 if (!RGBA)
780 srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
781 const __m128i src1 = __lsx_vilvl_b(srcVector, srcVector);
782 const __m128i src2 = __lsx_vilvh_b(srcVector, srcVector);
783 if (testc[0]!=0) {
784 __m128i alpha1 = __lsx_vshuf_b(zero, src1, shuffleMask);
785 __m128i alpha2 = __lsx_vshuf_b(zero, src2, shuffleMask);
786 __m128i dst1 = __lsx_vmuh_hu(src1, alpha1);
787 __m128i dst2 = __lsx_vmuh_hu(src2, alpha2);
788 // Map 0->0xfffe to 0->0xffff
789 dst1 = __lsx_vadd_h(dst1, __lsx_vsrli_h(dst1, 15));
790 dst2 = __lsx_vadd_h(dst2, __lsx_vsrli_h(dst2, 15));
791 // correct alpha value:
792 const __m128i blendMask = (__m128i)(v8i16){0, 1, 2, 11, 4, 5, 6, 15};
793 dst1 = __lsx_vshuf_h(blendMask, src1, dst1);
794 dst2 = __lsx_vshuf_h(blendMask, src2, dst2);
795 __lsx_vst(dst1, &buffer[i], 0);
796 __lsx_vst(dst2, &buffer[i + 2], 0);
797 } else {
798 __lsx_vst(src1, &buffer[i], 0);
799 __lsx_vst(src2, &buffer[i + 2], 0);
800 }
801 } else {
802 __lsx_vst(zero, &buffer[i], 0);
803 __lsx_vst(zero, &buffer[i + 2], 0);
804 }
805 }
806
807 SIMD_EPILOGUE(i, count, 3) {
808 const uint s = RGBA ? RGBA2ARGB(src[i]) : src[i];
809 buffer[i] = QRgba64::fromArgb32(s).premultiplied();
810 }
811}
812
813template<bool RGBA, bool RGBx>
814static inline void convertARGBFromARGB32PM_lsx(uint *buffer, const uint *src, int count)
815{
816 int i = 0;
817 const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
818 const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
819 const __m128i zero = __lsx_vldi(0);
820
821 for (; i < count - 3; i += 4) {
822 __m128i srcVector = __lsx_vld(&src[i], 0);
823 const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
824 if (testz[0]!=0) {
825 const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
826 if (testc[0]!=0) {
827 __m128i srcVectorAlpha = __lsx_vsrli_w(srcVector, 24);
828 if (RGBA)
829 srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
830 const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
831 const __m128 ia = reciprocal_mul_ps(a, 255.0f);
832 __m128i src1 = __lsx_vilvl_b(zero, srcVector);
833 __m128i src3 = __lsx_vilvh_b(zero, srcVector);
834 __m128i src2 = __lsx_vilvh_h(zero, src1);
835 __m128i src4 = __lsx_vilvh_h(zero, src3);
836 src1 = __lsx_vilvl_h(zero, src1);
837 src3 = __lsx_vilvl_h(zero, src3);
838 __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
839 __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
840 __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
841 __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
842 src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
843 src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
844 src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
845 src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
846 src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
847 src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
848 src1 = __lsx_vmaxi_h(src1, 0);
849 src3 = __lsx_vmaxi_h(src3, 0);
850 src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7));
851 // Handle potential alpha == 0 values:
852 __m128i srcVectorAlphaMask = __lsx_vseq_w(srcVectorAlpha, zero);
853 src1 = __lsx_vandn_v(srcVectorAlphaMask, src1);
854 // Fixup alpha values:
855 if (RGBx)
856 srcVector = __lsx_vor_v(src1, alphaMask);
857 else
858 srcVector = __lsx_vbitsel_v(src1, srcVector, __lsx_vslti_b(alphaMask, 0));
859 __lsx_vst(srcVector, &buffer[i], 0);
860 } else {
861 if (RGBA)
862 __lsx_vst(__lsx_vshuf_b(zero, srcVector, rgbaMask), &buffer[i], 0);
863 else if (buffer != src)
864 __lsx_vst(srcVector, &buffer[i], 0);
865 }
866 } else {
867 if (RGBx)
868 __lsx_vst(alphaMask, &buffer[i], 0);
869 else
870 __lsx_vst(zero, &buffer[i], 0);
871 }
872 }
873
874 SIMD_EPILOGUE(i, count, 3) {
875 uint v = qUnpremultiply_lsx(src[i]);
876 if (RGBx)
877 v = 0xff000000 | v;
878 if (RGBA)
879 v = ARGB2RGBA(v);
880 buffer[i] = v;
881 }
882}
883
884template<bool RGBA>
885static inline void convertARGBFromRGBA64PM_lsx(uint *buffer, const QRgba64 *src, int count)
886{
887 int i = 0;
888 const __m128i alphaMask = __lsx_vreplgr2vr_d(qint64(Q_UINT64_C(0xffff) << 48));
889 const __m128i alphaMask32 = __lsx_vreplgr2vr_w(0xff000000);
890 const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
891 const __m128i zero = __lsx_vldi(0);
892
893 for (; i < count - 3; i += 4) {
894 __m128i srcVector1 = __lsx_vld(&src[i], 0);
895 __m128i srcVector2 = __lsx_vld(&src[i + 2], 0);
896 const v4i32 testz1 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector1, alphaMask));
897 bool transparent1 = testz1[0]==0;
898 const v4i32 testc1 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector1, alphaMask));
899 bool opaque1 = testc1[0]==0;
900 const v4i32 testz2 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector2, alphaMask));
901 bool transparent2 = testz2[0]==0;
902 const v4i32 testc2 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector2, alphaMask));
903 bool opaque2 = testc2[0]==0;
904
905 if (!(transparent1 && transparent2)) {
906 if (!(opaque1 && opaque2)) {
907 __m128i srcVector1Alpha = __lsx_vsrli_d(srcVector1, 48);
908 __m128i srcVector2Alpha = __lsx_vsrli_d(srcVector2, 48);
909 __m128i srcVectorAlpha = __lsx_vpickev_h(__lsx_vsat_wu(srcVector2Alpha, 15),
910 __lsx_vsat_wu(srcVector1Alpha, 15));
911 const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
912 // Convert srcVectorAlpha to final 8-bit alpha channel
913 srcVectorAlpha = __lsx_vadd_w(srcVectorAlpha, __lsx_vreplgr2vr_w(128));
914 srcVectorAlpha = __lsx_vsub_w(srcVectorAlpha, __lsx_vsrli_w(srcVectorAlpha, 8));
915 srcVectorAlpha = __lsx_vsrli_w(srcVectorAlpha, 8);
916 srcVectorAlpha = __lsx_vslli_w(srcVectorAlpha, 24);
917 const __m128 ia = reciprocal_mul_ps(a, 255.0f);
918 __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
919 __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
920 __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
921 __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
922 __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
923 __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
924 __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
925 __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
926 src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
927 src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
928 src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
929 src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
930 src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
931 src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
932 // Handle potential alpha == 0 values:
933 __m128i srcVector1AlphaMask = __lsx_vseq_d(srcVector1Alpha, zero);
934 __m128i srcVector2AlphaMask = __lsx_vseq_d(srcVector2Alpha, zero);
935 src1 = __lsx_vandn_v(srcVector1AlphaMask, src1);
936 src3 = __lsx_vandn_v(srcVector2AlphaMask, src3);
937 src1 = __lsx_vmaxi_h(src1, 0);
938 src3 = __lsx_vmaxi_h(src3, 0);
939 src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7));
940 // Fixup alpha values:
941 src1 = __lsx_vbitsel_v(src1, srcVectorAlpha, __lsx_vslti_b(alphaMask32, 0));
942 // Fix RGB order
943 if (!RGBA){
944 src1 = __lsx_vshuf_b(zero, src1, rgbaMask);}
945 __lsx_vst(src1, (__m128i *)&buffer[i], 0);
946 } else {
947 __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
948 __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
949 __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
950 __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
951 src1 = __lsx_vadd_w(src1, __lsx_vreplgr2vr_w(128));
952 src2 = __lsx_vadd_w(src2, __lsx_vreplgr2vr_w(128));
953 src3 = __lsx_vadd_w(src3, __lsx_vreplgr2vr_w(128));
954 src4 = __lsx_vadd_w(src4, __lsx_vreplgr2vr_w(128));
955 src1 = __lsx_vsub_w(src1, __lsx_vsrli_w(src1, 8));
956 src2 = __lsx_vsub_w(src2, __lsx_vsrli_w(src2, 8));
957 src3 = __lsx_vsub_w(src3, __lsx_vsrli_w(src3, 8));
958 src4 = __lsx_vsub_w(src4, __lsx_vsrli_w(src4, 8));
959 src1 = __lsx_vsrli_w(src1, 8);
960 src2 = __lsx_vsrli_w(src2, 8);
961 src3 = __lsx_vsrli_w(src3, 8);
962 src4 = __lsx_vsrli_w(src4, 8);
963 src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
964 src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
965 src1 = __lsx_vmaxi_h(src1, 0);
966 src3 = __lsx_vmaxi_h(src3, 0);
967 src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 15));
968 if (!RGBA){
969 src1 = __lsx_vshuf_b(zero, src1, rgbaMask);}
970 __lsx_vst(src1, &buffer[i], 0);
971 }
972 } else {
973 __lsx_vst(zero, &buffer[i], 0);
974 }
975 }
976
977 SIMD_EPILOGUE(i, count, 3) {
978 buffer[i] = qConvertRgba64ToRgb32_lsx<RGBA ? PixelOrderRGB : PixelOrderBGR>(src[i]);
979 }
980}
981
982template<bool mask>
983static inline void convertRGBA64FromRGBA64PM_lsx(QRgba64 *buffer, const QRgba64 *src, int count)
984{
985 int i = 0;
986 const __m128i alphaMask = __lsx_vreplgr2vr_d(qint64(Q_UINT64_C(0xffff) << 48));
987 const __m128i zero = __lsx_vldi(0);
988
989 for (; i < count - 3; i += 4) {
990 __m128i srcVector1 = __lsx_vld(&src[i + 0], 0);
991 __m128i srcVector2 = __lsx_vld(&src[i + 2], 0);
992 const v4i32 testz1 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector1, alphaMask));
993 bool transparent1 = testz1[0]==0;
994 const v4i32 testc1 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector1, alphaMask));
995 bool opaque1 = testc1[0]==0;
996 const v4i32 testz2 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector2, alphaMask));
997 bool transparent2 = testz2[0]==0;
998 const v4i32 testc2 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector2, alphaMask));
999 bool opaque2 = testc2[0]==0;
1000
1001 if (!(transparent1 && transparent2)) {
1002 if (!(opaque1 && opaque2)) {
1003 __m128i srcVector1Alpha = __lsx_vsrli_d(srcVector1, 48);
1004 __m128i srcVector2Alpha = __lsx_vsrli_d(srcVector2, 48);
1005 __m128i srcVectorAlpha = __lsx_vpickev_h(__lsx_vsat_wu(srcVector2Alpha, 15),
1006 __lsx_vsat_wu(srcVector1Alpha, 15));
1007 const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
1008 const __m128 ia = reciprocal_mul_ps(a, 65535.0f);
1009 __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
1010 __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
1011 __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
1012 __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
1013 __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
1014 __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
1015 __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
1016 __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
1017 src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
1018 src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
1019 src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
1020 src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
1021 src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
1022 src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
1023 // Handle potential alpha == 0 values:
1024 __m128i srcVector1AlphaMask = __lsx_vseq_d(srcVector1Alpha, zero);
1025 __m128i srcVector2AlphaMask = __lsx_vseq_d(srcVector2Alpha, zero);
1026 src1 = __lsx_vandn_v(srcVector1AlphaMask, src1);
1027 src3 = __lsx_vandn_v(srcVector2AlphaMask, src3);
1028 // Fixup alpha values:
1029 if (mask) {
1030 src1 = __lsx_vor_v(src1, alphaMask);
1031 src3 = __lsx_vor_v(src3, alphaMask);
1032 } else {
1033 src1 = __lsx_vbitsel_v(src1, srcVector1, __lsx_vslti_b(alphaMask, 0));
1034 src3 = __lsx_vbitsel_v(src3, srcVector2, __lsx_vslti_b(alphaMask, 0));
1035 }
1036 __lsx_vst(src1, &buffer[i + 0], 0);
1037 __lsx_vst(src3, &buffer[i + 2], 0);
1038 } else {
1039 if (mask) {
1040 srcVector1 = __lsx_vor_v(srcVector1, alphaMask);
1041 srcVector2 = __lsx_vor_v(srcVector2, alphaMask);
1042 }
1043 if (mask || src != buffer) {
1044 __lsx_vst(srcVector1, &buffer[i + 0], 0);
1045 __lsx_vst(srcVector2, &buffer[i + 2], 0);
1046 }
1047 }
1048 } else {
1049 __lsx_vst(zero, &buffer[i + 0], 0);
1050 __lsx_vst(zero, &buffer[i + 2], 0);
1051 }
1052 }
1053
1054 SIMD_EPILOGUE(i, count, 3) {
1055 QRgba64 v = src[i].unpremultiplied();
1056 if (mask)
1057 v.setAlpha(65535);
1058 buffer[i] = v;
1059 }
1060}
1061
1062void QT_FASTCALL convertARGB32ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *)
1063{
1064 convertARGBToARGB32PM_lsx<false>(buffer, buffer, count);
1065}
1066
1067void QT_FASTCALL convertRGBA8888ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *)
1068{
1069 convertARGBToARGB32PM_lsx<true>(buffer, buffer, count);
1070}
1071
1072const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count,
1073 const QList<QRgb> *, QDitherInfo *)
1074{
1075 convertARGBToRGBA64PM_lsx<false>(buffer, src, count);
1076 return buffer;
1077}
1078
1079const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count,
1080 const QList<QRgb> *, QDitherInfo *)
1081{
1082 convertARGBToRGBA64PM_lsx<true>(buffer, src, count);
1083 return buffer;
1084}
1085
1086const uint *QT_FASTCALL fetchARGB32ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count,
1087 const QList<QRgb> *, QDitherInfo *)
1088{
1089 convertARGBToARGB32PM_lsx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1090 return buffer;
1091}
1092
1093const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count,
1094 const QList<QRgb> *, QDitherInfo *)
1095{
1096 convertARGBToARGB32PM_lsx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1097 return buffer;
1098}
1099
1100const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count,
1101 const QList<QRgb> *, QDitherInfo *)
1102{
1103 convertARGBToRGBA64PM_lsx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1104 return buffer;
1105}
1106
1107const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count,
1108 const QList<QRgb> *, QDitherInfo *)
1109{
1110 convertARGBToRGBA64PM_lsx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1111 return buffer;
1112}
1113
1114void QT_FASTCALL storeRGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1115 const QList<QRgb> *, QDitherInfo *)
1116{
1117 uint *d = reinterpret_cast<uint *>(dest) + index;
1118 convertARGBFromARGB32PM_lsx<false,true>(d, src, count);
1119}
1120
1121void QT_FASTCALL storeARGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1122 const QList<QRgb> *, QDitherInfo *)
1123{
1124 uint *d = reinterpret_cast<uint *>(dest) + index;
1125 convertARGBFromARGB32PM_lsx<false,false>(d, src, count);
1126}
1127
1128void QT_FASTCALL storeRGBA8888FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1129 const QList<QRgb> *, QDitherInfo *)
1130{
1131 uint *d = reinterpret_cast<uint *>(dest) + index;
1132 convertARGBFromARGB32PM_lsx<true,false>(d, src, count);
1133}
1134
1135void QT_FASTCALL storeRGBXFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1136 const QList<QRgb> *, QDitherInfo *)
1137{
1138 uint *d = reinterpret_cast<uint *>(dest) + index;
1139 convertARGBFromARGB32PM_lsx<true,true>(d, src, count);
1140}
1141
1142template<QtPixelOrder PixelOrder>
1143void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1144 const QList<QRgb> *, QDitherInfo *)
1145{
1146 uint *d = reinterpret_cast<uint *>(dest) + index;
1147 for (int i = 0; i < count; ++i)
1148 d[i] = qConvertArgb32ToA2rgb30_lsx<PixelOrder>(src[i]);
1149}
1150
1151#if QT_CONFIG(raster_64bit)
1152void QT_FASTCALL destStore64ARGB32_lsx(QRasterBuffer *rasterBuffer, int x,
1153 int y, const QRgba64 *buffer, int length)
1154{
1155 uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
1156 convertARGBFromRGBA64PM_lsx<false>(dest, buffer, length);
1157}
1158
1159void QT_FASTCALL destStore64RGBA8888_lsx(QRasterBuffer *rasterBuffer, int x,
1160 int y, const QRgba64 *buffer, int length)
1161{
1162 uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
1163 convertARGBFromRGBA64PM_lsx<true>(dest, buffer, length);
1164}
1165#endif
1166
1167void QT_FASTCALL storeARGB32FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
1168 const QList<QRgb> *, QDitherInfo *)
1169{
1170 uint *d = (uint*)dest + index;
1171 convertARGBFromRGBA64PM_lsx<false>(d, src, count);
1172}
1173
1174void QT_FASTCALL storeRGBA8888FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
1175 const QList<QRgb> *, QDitherInfo *)
1176{
1177 uint *d = (uint*)dest + index;
1178 convertARGBFromRGBA64PM_lsx<true>(d, src, count);
1179}
1180
1181template
1182void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx<PixelOrderBGR>(uchar *dest, const uint *src, int index, int count,
1183 const QList<QRgb> *, QDitherInfo *);
1184template
1185void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx<PixelOrderRGB>(uchar *dest, const uint *src, int index, int count,
1186 const QList<QRgb> *, QDitherInfo *);
1187
1188void QT_FASTCALL storeRGBA64FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
1189 const QList<QRgb> *, QDitherInfo *)
1190{
1191 QRgba64 *d = (QRgba64 *)dest + index;
1192 convertRGBA64FromRGBA64PM_lsx<false>(d, src, count);
1193}
1194
1195void QT_FASTCALL storeRGBx64FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
1196 const QList<QRgb> *, QDitherInfo *)
1197{
1198 QRgba64 *d = (QRgba64 *)dest + index;
1199 convertRGBA64FromRGBA64PM_lsx<true>(d, src, count);
1200}
1201
1202#if QT_CONFIG(raster_fp)
1203const QRgbaFloat32 *QT_FASTCALL fetchRGBA32FToRGBA32F_lsx(QRgbaFloat32 *buffer, const uchar *src,
1204 int index, int count,
1205 const QList<QRgb> *, QDitherInfo *)
1206{
1207 const QRgbaFloat32 *s = reinterpret_cast<const QRgbaFloat32 *>(src) + index;
1208 for (int i = 0; i < count; ++i) {
1209 __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(s + i), 0);
1210 __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);
1211 vsf = __lsx_vfmul_s(vsf, vsa);
1212 vsf = (__m128)__lsx_vextrins_w(vsf, vsa, 0x30);
1213 __lsx_vst(vsf, reinterpret_cast<float *>(buffer + i), 0);
1214 }
1215 return buffer;
1216}
1217
1218void QT_FASTCALL storeRGBX32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src,
1219 int index, int count,
1220 const QList<QRgb> *, QDitherInfo *)
1221{
1222 QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
1223 const __m128 zero = (__m128)(v4f32){0.0f, 0.0f, 0.0f, 1.0f};
1224 for (int i = 0; i < count; ++i) {
1225 __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(src + i), 0);
1226 const __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);
1227 FloatInt a;
1228 a.i = __lsx_vpickve2gr_w(vsa, 0);
1229 if (a.f == 1.0f)
1230 { }
1231 else if (a.f == 0.0f)
1232 vsf = zero;
1233 else {
1234 __m128 vsr = __lsx_vfrecip_s(vsa);
1235 vsr = __lsx_vfsub_s(__lsx_vfadd_s(vsr, vsr),
1236 __lsx_vfmul_s(vsr, __lsx_vfmul_s(vsr, vsa)));
1237 vsf = __lsx_vfmul_s(vsf, vsr);
1238 FloatInt b = {.f = 1.0f};
1239 vsf = (__m128)__lsx_vinsgr2vr_w(vsf, b.i, 3);
1240 }
1241 __lsx_vst(vsf, reinterpret_cast<float *>(d + i), 0);
1242 }
1243}
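// __lsx_vfrecip_s only returns an approximate reciprocal, so the vfadd/vfsub/
// vfmul sequence above (and the same pattern in the next function) refines it
// with one Newton-Raphson step, r' = 2*r - r*r*a, before the colour channels are
// multiplied by 1/alpha to unpremultiply.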
1244
1245void QT_FASTCALL storeRGBA32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src,
1246 int index, int count,
1247 const QList<QRgb> *, QDitherInfo *)
1248{
1249 QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
1250 const __m128 zero = (__m128)__lsx_vldi(0);
1251 for (int i = 0; i < count; ++i) {
1252 __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(src + i), 0);
1253 const __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);
1254 FloatInt a;
1255 a.i = __lsx_vpickve2gr_w(vsa, 0);
1256 if (a.f == 1.0f)
1257 { }
1258 else if (a.f == 0.0f)
1259 vsf = zero;
1260 else {
1261 __m128 vsr = __lsx_vfrecip_s(vsa);
1262 vsr = __lsx_vfsub_s(__lsx_vfadd_s(vsr, vsr),
1263 __lsx_vfmul_s(vsr, __lsx_vfmul_s(vsr, vsa)));
1264 FloatInt b = {.f = 1.0f};
1265 vsr = (__m128)__lsx_vinsgr2vr_w(vsr, b.i, 3);
1266 vsf = __lsx_vfmul_s(vsf, vsr);
1267 }
1268 __lsx_vst(vsf, reinterpret_cast<float *>(d + i), 0);
1269 }
1270}
1271#endif
1272
1273QT_END_NAMESPACE
1274
1275#endif // QT_COMPILER_SUPPORTS_LSX