Qt internal/contributor documentation — source listing of qdrawhelper_lsx.cpp.
Note: these are NOT the official Qt API docs; those are found at https://doc.qt.io/
1// Copyright (C) 2024 Loongson Technology Corporation Limited.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3// Qt-Security score:significant reason:default
4
5#include <private/qdrawhelper_loongarch64_p.h>
6
7#ifdef QT_COMPILER_SUPPORTS_LSX
8
9#include <private/qdrawingprimitive_lsx_p.h>
10#include <private/qpaintengine_raster_p.h>
11
12QT_BEGIN_NAMESPACE
13
// Blends a premultiplied-ARGB32 source rectangle over a premultiplied-ARGB32
// destination using the LSX source-over helper macros.
//   destPixels/dbpl: destination base pointer and bytes per scanline
//   srcPixels/sbpl : source base pointer and bytes per scanline
//   w, h           : rectangle size in pixels
//   const_alpha    : constant opacity in 0..256 (256 = fully opaque)
void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        // Fully opaque: plain source-over, one scanline at a time.
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, w);
            // Advance to the next scanline via the byte strides.
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
        // Rescale the 0..256 opacity into 0..255 so 8-bit math applies below.
        const_alpha = (const_alpha * 255) >> 8;

        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, w, const_alpha);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
    // const_alpha == 0: fully transparent, nothing to write.
}
40
41// qblendfunctions.cpp
42void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
43 const uchar *srcPixels, int sbpl,
44 int w, int h,
45 int const_alpha);
46
// Blends an opaque RGB32 source rectangle onto an RGB32 destination with a
// constant opacity (simple interpolation, no per-pixel alpha). The fully
// opaque case (const_alpha == 256) is delegated to the generic copy in
// qblendfunctions.cpp; const_alpha == 0 writes nothing.
void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            // Constants used by INTERPOLATE_PIXEL_255_LSX: 0x80 rounding bias
            // and a mask isolating the even (red/blue) color bytes.
            const __m128i half = __lsx_vreplgr2vr_h(0x80);
            const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

            // Rescale 0..256 opacity into the 0..255 range.
            const_alpha = (const_alpha * 255) >> 8;
            int one_minus_const_alpha = 255 - const_alpha;
            const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
            const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;

                // First, align dest to 16 bytes:
                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                }

                // Main loop: interpolate four pixels per iteration.
                for (; x < w-3; x += 4) {
                    __m128i srcVector = __lsx_vld(&src[x], 0);
                    __m128i dstVector = __lsx_vld(&dst[x], 0);
                    INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                              oneMinusConstAlpha, colorMask, half);
                    __lsx_vst(dstVector, &dst[x], 0);
                }
                // Scalar tail for the last w % 4 pixels.
                SIMD_EPILOGUE(x, w, 3)
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                // Advance both pointers one scanline (byte strides).
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
    } else {
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}
90
// Source-over composition for one span of premultiplied ARGB32 pixels.
// Unlike the blend functions above, const_alpha here is 0..255 (255 means
// fully opaque); the caller guarantees it is below 256.
void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels,
                                          int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256);

    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;

    if (const_alpha == 255) {
        // Plain source-over.
        BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, length);
    } else {
        // Source-over modulated by the constant opacity.
        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, length, const_alpha);
    }
}
105
// Plus (additive) composition over one span:
//   dst = clamp(dst + src)                       when const_alpha == 255
//   dst = interpolate(plus(dst, src), dst, ca)   otherwise
// Saturating byte addition (__lsx_vsadd_bu) performs the per-channel clamp.
void QT_FASTCALL comp_func_Plus_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    int x = 0;

    if (const_alpha == 255) {
        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);

        // 2) composition with LSX
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            const __m128i dstVector = __lsx_vld(&dst[x], 0);

            // Saturating unsigned byte add == per-channel clamped plus.
            const __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }

        // 3) Epilogue:
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
    } else {
        const int one_minus_const_alpha = 255 - const_alpha;
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);

        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha,
                                                          one_minus_const_alpha);

        // Constants for INTERPOLATE_PIXEL_255_LSX (rounding bias, R/B mask).
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        // 2) composition with LSX
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            // Clamped plus, then blend the result with the original dst by ca.
            __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            INTERPOLATE_PIXEL_255_LSX(result, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue:
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha, one_minus_const_alpha);
    }
}
156
// Source composition over one span:
//   dst = src                              when const_alpha == 255
//   dst = src * ca + dst * (1 - ca)        otherwise (ca in 0..255)
void QT_FASTCALL comp_func_Source_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        // Fully opaque source: straight copy.
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        int x = 0;

        // 1) prologue, align on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);

        // 2) interpolate pixels with LSX
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(ialpha);
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
    }
}
189
// Fills `bytecount` bytes at `dest` with the repeating 16-byte pattern
// `value128`. Callers (qt_memfill32/64_lsx) arrange for `dest` to be 16-byte
// aligned and `bytecount` to be a multiple of 16.
static Q_NEVER_INLINE
void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
{
    __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);

    // Main loop, unrolled 4x: 64 bytes per iteration.
    while (dst128 + 4 <= end128) {
        __lsx_vst(value128, dst128 + 0, 0);
        __lsx_vst(value128, dst128 + 1, 0);
        __lsx_vst(value128, dst128 + 2, 0);
        __lsx_vst(value128, dst128 + 3, 0);
        dst128 += 4;
    }

    // Tail: 0..3 remaining 16-byte stores.
    bytecount %= 4 * sizeof(__m128i);
    switch (bytecount / sizeof(__m128i)) {
    case 3: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
    case 2: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
    case 1: __lsx_vst(value128, dst128++, 0);
    }
}
211
// Fills `count` quint64 slots at `dest` with `value`, using 16-byte vector
// stores for the aligned bulk of the range.
void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count)
{
    // A quint64 pointer is either 16-byte aligned or exactly 8 bytes off;
    // one scalar store fixes up the latter case.
    quintptr misaligned = quintptr(dest) % sizeof(__m128i);
    if (misaligned && count) {
        *dest++ = value;
        --count;
    }

    // Make the remaining count even so it forms whole 16-byte vectors; the
    // odd element (if any) is written at the far end now.
    if (count % 2) {
        dest[count - 1] = value;
        --count;
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_d(value), count * sizeof(quint64));
}
227
// Fills `count` quint32 slots at `dest` with `value`, using 16-byte vector
// stores for the aligned bulk of the range.
void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count)
{
    if (count < 4) {
        // this simplifies the code below: the first switch can fall through
        // without checking the value of count
        switch (count) {
        case 3: *dest++ = value; Q_FALLTHROUGH();
        case 2: *dest++ = value; Q_FALLTHROUGH();
        case 1: *dest = value;
        }
        return;
    }

    // Scalar prologue: a quint32 pointer is 0, 4, 8 or 12 bytes past a
    // 16-byte boundary; write up to 3 scalars to reach alignment (the
    // fall-through chain writes 3, 2 or 1 stores respectively; 0 = aligned).
    const int align = (quintptr)(dest) & 0xf;
    switch (align) {
    case 4: *dest++ = value; --count; Q_FALLTHROUGH();
    case 8: *dest++ = value; --count; Q_FALLTHROUGH();
    case 12: *dest++ = value; --count;
    }

    // Write the trailing count % 4 elements now so the vector fill below can
    // work on a multiple of 4 (note: count itself is not reduced here; the
    // byte count passed on is truncated by qt_memfillXX_aligned's tail logic).
    const int rest = count & 0x3;
    if (rest) {
        switch (rest) {
        case 3: dest[count - 3] = value; Q_FALLTHROUGH();
        case 2: dest[count - 2] = value; Q_FALLTHROUGH();
        case 1: dest[count - 1] = value;
        }
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_w(value), count * sizeof(quint32));
}
259
// Solid-color Source composition over one span:
//   dst = color                           when const_alpha == 255
//   dst = color * ca + dst * (1 - ca)     otherwise (ca pre-folded into color)
void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length,
                                            uint color, uint const_alpha)
{
    if (const_alpha == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        const quint32 ialpha = 255 - const_alpha;
        // Pre-scale the color by the constant alpha once.
        color = BYTE_MUL(color, const_alpha);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i iAlphaVector = __lsx_vreplgr2vr_h(ialpha);

        // Scalar prologue until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);

        // Four pixels at a time: dst = color + dst * (1 - ca).
        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, iAlphaVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }
        // Scalar tail for the last length % 4 pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
    }
}
289
// Solid-color SourceOver composition over one span of premultiplied pixels:
//   dst = color + dst * (1 - alpha(color))
// where color is first scaled by const_alpha when it is below 255. The fast
// path fills directly when both the color and the constant alpha are opaque.
void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length,
                                                uint color, uint const_alpha)
{
    if ((const_alpha & qAlpha(color)) == 255) {
        // Opaque color at full opacity: plain fill.
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        // qAlpha(~color) == 255 - alpha(color), the destination weight.
        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i minusAlphaOfColorVector = __lsx_vreplgr2vr_h(minusAlphaOfColor);

        // Scalar prologue until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

        // Four pixels at a time: dst = color + dst * (1 - sa).
        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }
        // Scalar tail.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
321
// Blits a 1-bit-per-pixel bitmap as solid `color` into a 32bpp raster buffer.
// Each source byte covers 8 pixels, most-significant bit first.
//
// Mask trick: the source byte is broadcast to all lanes, ANDed with the bit
// each lane tests (maskmask*), then an offset (maskadd*) is added so that the
// tested bit lands in the lane's sign bit. __lsx_vslti_b(.., 0) then yields an
// all-ones byte exactly where the bit was set, and __lsx_vbitsel_v selects
// `color` for those pixels while leaving the others untouched.
void qt_bitmapblit32_lsx_base(QRasterBuffer *rasterBuffer, int x, int y,
                              quint32 color,
                              const uchar *src, int width, int height, int stride)
{
    quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint32>();

    const __m128i c128 = __lsx_vreplgr2vr_w(color);
    // Bit selectors / offsets for pixels 0..3 of a source byte...
    const __m128i maskmask1 = (__m128i)(v4u32){0x80808080, 0x40404040,
                                               0x20202020, 0x10101010};
    const __m128i maskadd1 = (__m128i)(v4i32){0x00000000, 0x40404040,
                                              0x60606060, 0x70707070};

    if (width > 4) {
        // ...and for pixels 4..7, used only in the wide path.
        const __m128i maskmask2 = (__m128i)(v4i32){0x08080808, 0x04040404,
                                                   0x02020202, 0x01010101};
        const __m128i maskadd2 = (__m128i)(v4i32){0x78787878, 0x7c7c7c7c,
                                                  0x7e7e7e7e, 0x7f7f7f7f};
        while (height--) {
            // 8 destination pixels (two vectors) per source byte.
            for (int x = 0; x < width; x += 8) {
                const quint8 s = src[x >> 3];
                if (!s)
                    continue;   // no bits set: skip both stores
                __m128i mask1 = __lsx_vreplgr2vr_b(s);
                __m128i mask2 = mask1;

                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);

                __m128i destSrc1 = __lsx_vld((char*)(dest + x), 0);

                mask1 = __lsx_vslti_b(mask1,0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest + x), 0);

                __m128i destSrc2 = __lsx_vld((char*)(dest + x + 4), 0);

                mask2 = __lsx_vand_v(mask2, maskmask2);
                mask2 = __lsx_vadd_b(mask2, maskadd2);

                mask2 = __lsx_vslti_b(mask2,0);
                destSrc2 = __lsx_vbitsel_v(destSrc2, c128, mask2);
                __lsx_vst(destSrc2, (char*)(dest + x + 4), 0);
            }
            dest += destStride;
            src += stride;
        }
    } else {
        // Narrow path: at most 4 pixels per row, one vector load/store.
        // NOTE(review): this still loads/stores a full 4-pixel vector even
        // when width < 4 — presumably the raster buffer guarantees enough
        // slack per scanline; verify against QRasterBuffer's allocation.
        while (height--) {
            const quint8 s = *src;
            if (s) {
                __m128i mask1 = __lsx_vreplgr2vr_b(s);

                __m128i destSrc1 = __lsx_vld((char*)(dest), 0);
                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);

                mask1 = __lsx_vslti_b(mask1, 0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest), 0);
            }
            dest += destStride;
            src += stride;
        }
    }
}
388
389void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
390 const QRgba64 &color,
391 const uchar *src, int width, int height, int stride)
392{
393 qt_bitmapblit32_lsx_base(rasterBuffer, x, y, color.toArgb32(), src, width, height, stride);
394}
395
396void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
397 const QRgba64 &color,
398 const uchar *src, int width, int height, int stride)
399{
400 qt_bitmapblit32_lsx_base(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), src, width, height, stride);
401}
402
// Blits a 1-bit-per-pixel bitmap as a solid RGB565 color into a 16bpp raster
// buffer. Same sign-bit mask trick as qt_bitmapblit32_lsx_base, but with
// 16-bit lanes one vector covers all 8 pixels of a source byte.
void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    const quint16 c = qConvertRgb32To16(color.toArgb32());
    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
    // NOTE(review): dest is quint16* but the stride is computed in quint32
    // units — confirm this matches the raster buffer's row layout for the
    // 16bpp format (compare with the 32bpp helper above).
    const int destStride = rasterBuffer->stride<quint32>();

    const __m128i c128 = __lsx_vreplgr2vr_h(c);
    // Per-lane bit selector (MSB first) and offset moving that bit into the
    // lane's sign position.
    const __m128i maskmask = (__m128i)(v8u16){0x8080, 0x4040, 0x2020, 0x1010,
                                              0x0808, 0x0404, 0x0202, 0x0101};

    const __m128i maskadd = (__m128i)(v8i16){0x0000, 0x4040, 0x6060, 0x7070,
                                             0x7878, 0x7c7c, 0x7e7e, 0x7f7f};
    while (--height >= 0) {
        for (int x = 0; x < width; x += 8) {
            const quint8 s = src[x >> 3];
            if (!s)
                continue;   // no bits set in this byte: skip
            __m128i mask = __lsx_vreplgr2vr_b(s);
            __m128i destSrc = __lsx_vld((char*)(dest + x), 0);
            mask = __lsx_vand_v(mask, maskmask);
            mask = __lsx_vadd_b(mask, maskadd);
            // All-ones where the bitmap bit was set; select the color there.
            mask = __lsx_vslti_b(mask, 0);
            destSrc = __lsx_vbitsel_v(destSrc, c128, mask);
            __lsx_vst(destSrc, (char*)(dest + x), 0);
        }
        dest += destStride;
        src += stride;
    }
}
434
// SIMD backend used by the shared radial-gradient fetch template (see
// qt_fetch_radial_gradient_lsx below, which instantiates
// QRadialFetchSimd<QSimdLsx>). Thin static wrappers mapping the generic
// 4-lane vector operations onto LSX intrinsics.
class QSimdLsx
{
public:
    typedef __m128i Int32x4;
    typedef __m128 Float32x4;

    // Scalar views of a vector, for lane-wise access in the template.
    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };

    // Broadcast a scalar into all four lanes.
    // NOTE(review): the double overload replicates via the single-precision
    // intrinsic, narrowing the value to float — presumably intentional since
    // the whole pipeline is float; confirm against the other SIMD backends.
    static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return __lsx_vreplfr2vr_s(x); }
    static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return __lsx_vreplfr2vr_s(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return __lsx_vreplgr2vr_w(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return __lsx_vreplgr2vr_w(x); }

    // Lane-wise arithmetic.
    static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return __lsx_vfadd_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return __lsx_vadd_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return __lsx_vfmax_s(a, b); }
    static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return __lsx_vfmin_s(a, b); }
    // 16-bit lane minimum (used for clamping packed values).
    static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return __lsx_vmin_h(a, b); }

    static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return __lsx_vand_v(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return __lsx_vfsub_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return __lsx_vsub_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return __lsx_vfmul_s(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return __lsx_vfsqrt_s(x); }

    // Truncating float -> int conversion.
    static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return __lsx_vftintrz_w_s(x); }

    // a >= b, expressed as b < a; yields a per-lane all-ones/all-zeros mask.
    static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return __lsx_vfcmp_clt_s(b, a); }
};
469
// Radial-gradient span fetcher: instantiates the generic template with the
// LSX SIMD backend defined above. Fills `buffer` with `length` pixels for
// the span starting at (x, y) and returns it.
const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op,
                                                      const QSpanData *data,
                                                      int y, int x, int length)
{
    return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdLsx>,uint>(buffer, op, data, y, x, length);
}
476
// Scales a premultiplied-ARGB32 image into targetRect (clipped by `clip`)
// and source-over blends it onto an ARGB32 destination. Sampling is
// nearest-neighbor using 16.16 fixed-point stepping. Only the fully opaque
// case is vectorized; any other const_alpha falls back to the generic
// implementation in qblendfunctions.cpp.
void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                         const uchar *srcPixels, int sbpl, int srch,
                                         const QRectF &targetRect,
                                         const QRectF &sourceRect,
                                         const QRect &clip,
                                         int const_alpha)
{
    if (const_alpha != 256) {
        // from qblendfunctions.cpp
        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
                                                    const uchar *srcPixels, int sbpl, int srch,
                                                    const QRectF &targetRect,
                                                    const QRectF &sourceRect,
                                                    const QRect &clip,
                                                    int const_alpha);
        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch,
                                               targetRect, sourceRect, clip, const_alpha);
    }

    // Source pixels advanced per destination pixel (may be negative for
    // mirrored rects).
    qreal sx = sourceRect.width() / (qreal)targetRect.width();
    qreal sy = sourceRect.height() / (qreal)targetRect.height();


    // Per-pixel steps in 16.16 fixed point.
    const int ix = 0x00010000 * sx;
    const int iy = 0x00010000 * sy;

    QRect tr = targetRect.normalized().toRect();
    tr = tr.intersected(clip);
    if (tr.isEmpty())
        return;
    const int tx1 = tr.left();
    const int ty1 = tr.top();
    int h = tr.height();
    int w = tr.width();

    // Starting source coordinates (16.16) for the first column/row.
    quint32 basex;
    quint32 srcy;

    if (sx < 0) {
        // Mirrored horizontally: start from the right edge of the source.
        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1;
        basex = quint32(sourceRect.right() * 65536) + dstx;
    } else {
        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1;
        basex = quint32(sourceRect.left() * 65536) + dstx;
    }
    if (sy < 0) {
        // Mirrored vertically: start from the bottom edge of the source.
        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1;
        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
    } else {
        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1;
        srcy = quint32(sourceRect.top() * 65536) + dsty;
    }

    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;

    // Constants for the source-over helper macro.
    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    // Four pixels' worth of horizontal step, added per vector iteration.
    const __m128i ixVector = __lsx_vreplgr2vr_w(4*ix);

    // this bounds check here is required as floating point rounding above might in some cases lead to
    // w/h values that are one pixel too large, falling outside of the valid image area.
    const int ystart = srcy >> 16;
    if (ystart >= srch && iy < 0) {
        srcy += iy;
        --h;
    }
    const int xstart = basex >> 16;
    if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < 0) {
        basex += ix;
        --w;
    }
    int yend = (srcy + iy * (h - 1)) >> 16;
    if (yend < 0 || yend >= srch)
        --h;
    int xend = (basex + ix * (w - 1)) >> 16;
    if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32)))
        --w;

    while (--h >= 0) {
        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
        int srcx = basex;
        int x = 0;

        // Scalar prologue until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
            uint s = src[srcx >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            srcx += ix;
        }

        // Lane 0 holds the coordinate of pixel x+3, lane 3 of pixel x.
        __m128i srcxVector = (__m128i)(v4i32){srcx + ix + ix + ix, srcx + ix + ix, srcx + ix, srcx};

        for (; x < (w - 3); x += 4) {
            // Extract the integer part of each 16.16 coordinate: halfwords
            // 1, 3, 5, 7 are the high halves of the four 32-bit lanes.
            const int idx0 = __lsx_vpickve2gr_h(srcxVector, 1);
            const int idx1 = __lsx_vpickve2gr_h(srcxVector, 3);
            const int idx2 = __lsx_vpickve2gr_h(srcxVector, 5);
            const int idx3 = __lsx_vpickve2gr_h(srcxVector, 7);
            srcxVector = __lsx_vadd_w(srcxVector, ixVector);

            // Gather four source pixels (idx3 is pixel x, so it goes first).
            const __m128i srcVector = (__m128i)((v4u32){src[idx3], src[idx2], src[idx1], src[idx0]});

            BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
        }

        // Scalar tail for the last w % 4 pixels.
        SIMD_EPILOGUE(x, w, 3) {
            uint s = src[(basex + x*ix) >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
        }
        dst = (quint32 *)(((uchar *) dst) + dbpl);   // next destination scanline
        srcy += iy;
    }
}
591
592const uint *QT_FASTCALL fetchPixelsBPP24_lsx(uint *buffer, const uchar *src, int index, int count)
593{
594 const quint24 *s = reinterpret_cast<const quint24 *>(src);
595 for (int i = 0; i < count; ++i)
596 buffer[i] = s[index + i];
597 return buffer;
598}
599
// Fetches `length` untransformed RGB888 texels from texture scanline `y`,
// starting at pixel `x`, expanding them to RGB32 into `buffer`.
const uint * QT_FASTCALL qt_fetchUntransformed_888_lsx(uint *buffer, const Operator *,
                                                       const QSpanData *data,
                                                       int y, int x, int length)
{
    const uchar *line = data->texture.scanLine(y) + x * 3;   // 3 bytes per texel
    // from image/qimage_lsx.cpp
    extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len);
    qt_convert_rgb888_to_rgb32_lsx(buffer, line, length);
    return buffer;
}
610
// Fills `count` packed 24-bit pixels at `dest` with `color`. Since 16 pixels
// occupy exactly 48 bytes (three vectors), the fill pattern repeats every
// three 16-byte stores (mval1/mval2/mval3).
void qt_memfill24_lsx(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    // Vector with the 3 color bytes in lane 0, rest zero.
    __m128i m = __lsx_vinsgr2vr_w(__lsx_vldi(0), v, 0);
    quint24 *end = dest + count;

    // shuffleMask repeats the three color-byte indices; loading it at offset
    // 0 and at offset 1 yields the two byte-rotations needed for consecutive
    // vectors of a 48-byte period (hence the 16 + 1 array size).
    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };
    __m128i indexMask = (__m128i)(v16i8){2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};

    __m128i mval1 = __lsx_vshuf_b(m, m, __lsx_vld(reinterpret_cast<const __m128i *>(shuffleMask), 0));
    __m128i mval2 = __lsx_vshuf_b(m, m, __lsx_vld(reinterpret_cast<const __m128i *>(shuffleMask + 1), 0));
    // mval3: bytes 2..15 of mval1 followed by bytes 0..1 of mval2 (vshuf_b
    // indexes the concatenation of its two source operands).
    __m128i mval3 = __lsx_vshuf_b(mval2, mval1, indexMask);

    // Bulk fill: 16 pixels (48 bytes) per iteration.
    for ( ; dest + 16 <= end; dest += 16) {
        __lsx_vst(mval1, reinterpret_cast<__m128i *>(dest) + 0, 0);
        __lsx_vst(mval2, reinterpret_cast<__m128i *>(dest) + 1, 0);
        __lsx_vst(mval3, reinterpret_cast<__m128i *>(dest) + 2, 0);
    }

    // Tiny fills (0..2 px): scalar stores only, since the vector tail below
    // assumes at least 8 writable bytes.
    if (count < 3) {
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;
    if (left >= 24) {
        // 8px/24B or more left
        __lsx_vst(mval1, reinterpret_cast<__m128i *>(ptr) + 0, 0);
        __lsx_vstelm_d(mval2, reinterpret_cast<__m128i *>(ptr) + 1, 0, 0);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left

    if (left >= 16) {
        // but more than 5px/15B left
        __lsx_vst(mval1, reinterpret_cast<__m128i *>(ptr) , 0);
    } else if (left >= 8) {
        // but more than 2px/6B left
        __lsx_vstelm_d(mval1, reinterpret_cast<__m128i *>(ptr), 0, 0);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
        __lsx_vstelm_d(mval2, reinterpret_cast<__m128i *>(ptr_end - 8), 0, 0);
    }
}
669
// Swaps the red and blue bytes of `count` packed 24-bit RGB pixels
// (RGB888 <-> BGR888). Works for both in-place (dst == src) and
// out-of-place conversion.
void QT_FASTCALL rbSwap_888_lsx(uchar *dst, const uchar *src, int count)
{
    int i = 0;
    // 16 px = 48 bytes = three vectors. Each shuffle swaps the 1st and 3rd
    // byte of every pixel that lies fully inside its own vector; pixels
    // straddling a vector boundary are fixed up with scalar swaps below.
    const static __m128i shuffleMask1 = (__m128i)(v16i8){2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 15};
    const static __m128i shuffleMask2 = (__m128i)(v16i8){0, 1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, 14, 15};
    const static __m128i shuffleMask3 = (__m128i)(v16i8){0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13};

    for (; i + 15 < count; i += 16) {
        __m128i s1 = __lsx_vld(src, 0);
        __m128i s2 = __lsx_vld((src + 16), 0);
        __m128i s3 = __lsx_vld((src + 32), 0);
        s1 = __lsx_vshuf_b(s1, s1, shuffleMask1);
        s2 = __lsx_vshuf_b(s2, s2, shuffleMask2);
        s3 = __lsx_vshuf_b(s3, s3, shuffleMask3);
        __lsx_vst(s1, dst, 0);
        __lsx_vst(s2, (dst + 16), 0);
        __lsx_vst(s3, (dst + 32), 0);

        // Now fix the last four misplaced values
        std::swap(dst[15], dst[17]);
        std::swap(dst[30], dst[32]);

        src += 48;
        dst += 48;
    }

    if (src != dst) {
        // Out-of-place tail: copy each pixel with R and B exchanged.
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        // In-place tail: swap within the destination.
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}
711
// Converts (non-premultiplied) ARGB32 — or RGBA8888 when RGBA is true — to
// premultiplied ARGB32, four pixels per iteration. Two vector tests pick a
// fast path per group: all-transparent stores zeros, all-opaque is a plain
// copy (plus channel swizzle for RGBA); only mixed groups do the multiply.
template<bool RGBA>
static void convertARGBToARGB32PM_lsx(uint *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    // RGBA -> ARGB byte reorder (swaps bytes 0 and 2 of each pixel).
    const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    // Broadcasts each pixel's 16-bit alpha word across its four channels.
    const __m128i shuffleMask = (__m128i)(v16i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
    const __m128i half = __lsx_vreplgr2vr_h(0x0080);
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector = __lsx_vld(&src[i], 0);
        // Nonzero iff any of the four alpha bytes is nonzero.
        const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
        if (testz[0]!=0) {
            // Nonzero iff any alpha byte is below 0xff (not all opaque).
            const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
            if (testc[0]!=0) {
                if (RGBA)
                    srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
                // Widen to 16 bits per channel (two pixels per vector).
                __m128i src1 = __lsx_vilvl_b(zero, srcVector);
                __m128i src2 = __lsx_vilvh_b(zero, srcVector);
                __m128i alpha1 = __lsx_vshuf_b(zero, src1, shuffleMask);
                __m128i alpha2 = __lsx_vshuf_b(zero, src2, shuffleMask);
                // x * a / 255 approximated as (x*a + ((x*a) >> 8) + 0x80) >> 8.
                src1 = __lsx_vmul_h(src1, alpha1);
                src2 = __lsx_vmul_h(src2, alpha2);
                src1 = __lsx_vadd_h(src1, __lsx_vsrli_h(src1, 8));
                src2 = __lsx_vadd_h(src2, __lsx_vsrli_h(src2, 8));
                src1 = __lsx_vadd_h(src1, half);
                src2 = __lsx_vadd_h(src2, half);
                src1 = __lsx_vsrli_h(src1, 8);
                src2 = __lsx_vsrli_h(src2, 8);
                // Re-insert the original alpha into lanes 3 and 7.
                __m128i blendMask = (__m128i)(v8i16){0, 1, 2, 11, 4, 5, 6, 15};
                src1 = __lsx_vshuf_h(blendMask, alpha1, src1);
                src2 = __lsx_vshuf_h(blendMask, alpha2, src2);
                // Clamp and repack to 8 bits per channel.
                src1 = __lsx_vmaxi_h(src1, 0);
                src2 = __lsx_vmaxi_h(src2, 0);
                srcVector = __lsx_vpickev_b(__lsx_vsat_hu(src2, 7), __lsx_vsat_hu(src1, 7));
                __lsx_vst(srcVector, &buffer[i], 0);
            } else {
                // All four pixels opaque: premultiply is a no-op.
                if (RGBA)
                    __lsx_vst(__lsx_vshuf_b(zero, srcVector, rgbaMask), &buffer[i], 0);
                else if (buffer != src)
                    __lsx_vst(srcVector, &buffer[i], 0);
            }
        } else {
            // All four pixels fully transparent: premultiplied result is 0.
            __lsx_vst(zero, &buffer[i], 0);
        }
    }

    // Scalar tail for the last count % 4 pixels.
    SIMD_EPILOGUE(i, count, 3) {
        uint v = qPremultiply(src[i]);
        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
    }
}
765
// Converts (non-premultiplied) ARGB32 — or RGBA8888 when RGBA is true — to
// premultiplied 16-bit-per-channel RGBA64, four pixels per iteration (each
// output vector holds two QRgba64 values, hence the i / i+2 stores).
template<bool RGBA>
static void convertARGBToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    // ARGB <-> RGBA byte reorder (swaps bytes 0 and 2 of each pixel).
    const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    // Broadcasts each pixel's 16-bit alpha word across its four channels.
    const __m128i shuffleMask = (__m128i)(v16i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector = __lsx_vld(&src[i], 0);
        // Nonzero iff any of the four alpha bytes is nonzero.
        const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
        if (testz[0]!=0) {
            // Nonzero iff any alpha byte is below 0xff (not all opaque).
            const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
            if (!RGBA)
                srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
            // Self-interleave duplicates each byte: 0xAB -> 0xABAB, which is
            // the exact 8-to-16-bit widening x * 257.
            const __m128i src1 = __lsx_vilvl_b(srcVector, srcVector);
            const __m128i src2 = __lsx_vilvh_b(srcVector, srcVector);
            if (testc[0]!=0) {
                __m128i alpha1 = __lsx_vshuf_b(zero, src1, shuffleMask);
                __m128i alpha2 = __lsx_vshuf_b(zero, src2, shuffleMask);
                // High half of the 16x16 multiply approximates x * a / 0xffff.
                __m128i dst1 = __lsx_vmuh_hu(src1, alpha1);
                __m128i dst2 = __lsx_vmuh_hu(src2, alpha2);
                // Map 0->0xfffe to 0->0xffff
                dst1 = __lsx_vadd_h(dst1, __lsx_vsrli_h(dst1, 15));
                dst2 = __lsx_vadd_h(dst2, __lsx_vsrli_h(dst2, 15));
                // correct alpha value:
                const __m128i blendMask = (__m128i)(v8i16){0, 1, 2, 11, 4, 5, 6, 15};
                dst1 = __lsx_vshuf_h(blendMask, src1, dst1);
                dst2 = __lsx_vshuf_h(blendMask, src2, dst2);
                __lsx_vst(dst1, &buffer[i], 0);
                __lsx_vst(dst2, &buffer[i + 2], 0);
            } else {
                // All opaque: widened values are already premultiplied.
                __lsx_vst(src1, &buffer[i], 0);
                __lsx_vst(src2, &buffer[i + 2], 0);
            }
        } else {
            // All transparent: premultiplied result is 0.
            __lsx_vst(zero, &buffer[i], 0);
            __lsx_vst(zero, &buffer[i + 2], 0);
        }
    }

    // Scalar tail for the last count % 4 pixels.
    SIMD_EPILOGUE(i, count, 3) {
        const uint s = RGBA ? RGBA2ARGB(src[i]) : src[i];
        buffer[i] = QRgba64::fromArgb32(s).premultiplied();
    }
}
813
// Converts premultiplied ARGB32 back to non-premultiplied ARGB32 (or
// RGBA8888 when RGBA is true; RGBx forces the output alpha to 0xff).
// Four pixels per iteration; the division by alpha is done in single-
// precision float via a reciprocal multiply (see reciprocal_mul_ps).
template<bool RGBA, bool RGBx>
static inline void convertARGBFromARGB32PM_lsx(uint *buffer, const uint *src, int count)
{
    int i = 0;
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    // ARGB -> RGBA byte reorder (swaps bytes 0 and 2 of each pixel).
    const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector = __lsx_vld(&src[i], 0);
        // Nonzero iff any of the four alpha bytes is nonzero.
        const v4i32 testz = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector, alphaMask));
        if (testz[0]!=0) {
            // Nonzero iff any alpha byte is below 0xff (not all opaque).
            const v4i32 testc = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector, alphaMask));
            if (testc[0]!=0) {
                // One 0..255 alpha per 32-bit lane.
                __m128i srcVectorAlpha = __lsx_vsrli_w(srcVector, 24);
                if (RGBA)
                    srcVector = __lsx_vshuf_b(zero, srcVector, rgbaMask);
                // ia = 255 / alpha, as float, per pixel.
                const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
                const __m128 ia = reciprocal_mul_ps(a, 255.0f);
                // Widen the 16 channels to one 32-bit lane each (src1..src4
                // hold one pixel's four channels apiece).
                __m128i src1 = __lsx_vilvl_b(zero, srcVector);
                __m128i src3 = __lsx_vilvh_b(zero, srcVector);
                __m128i src2 = __lsx_vilvh_h(zero, src1);
                __m128i src4 = __lsx_vilvh_h(zero, src3);
                src1 = __lsx_vilvl_h(zero, src1);
                src3 = __lsx_vilvl_h(zero, src3);
                // Broadcast each pixel's reciprocal to all its channels.
                __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
                __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
                __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
                __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
                // channel = round(channel * 255 / alpha).
                src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
                src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
                src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
                src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
                // Saturate and repack 32 -> 16 -> 8 bits per channel.
                src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
                src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
                src1 = __lsx_vmaxi_h(src1, 0);
                src3 = __lsx_vmaxi_h(src3, 0);
                src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7));
                // Handle potential alpha == 0 values:
                __m128i srcVectorAlphaMask = __lsx_vseq_w(srcVectorAlpha, zero);
                src1 = __lsx_vandn_v(srcVectorAlphaMask, src1);
                // Fixup alpha values:
                if (RGBx)
                    srcVector = __lsx_vor_v(src1, alphaMask);
                else
                    // Keep the original alpha bytes, take RGB from src1.
                    srcVector = __lsx_vbitsel_v(src1, srcVector, __lsx_vslti_b(alphaMask, 0));
                __lsx_vst(srcVector, &buffer[i], 0);
            } else {
                // All opaque: unpremultiply is a no-op.
                if (RGBA)
                    __lsx_vst(__lsx_vshuf_b(zero, srcVector, rgbaMask), &buffer[i], 0);
                else if (buffer != src)
                    __lsx_vst(srcVector, &buffer[i], 0);
            }
        } else {
            // All transparent: result is zero (or opaque black for RGBx).
            if (RGBx)
                __lsx_vst(alphaMask, &buffer[i], 0);
            else
                __lsx_vst(zero, &buffer[i], 0);
        }
    }

    // Scalar tail for the last count % 4 pixels.
    SIMD_EPILOGUE(i, count, 3) {
        uint v = qUnpremultiply_lsx(src[i]);
        if (RGBx)
            v = 0xff000000 | v;
        if (RGBA)
            v = ARGB2RGBA(v);
        buffer[i] = v;
    }
}
884
885template<bool RGBA>
886static inline void convertARGBFromRGBA64PM_lsx(uint *buffer, const QRgba64 *src, int count)
887{
888 int i = 0;
889 const __m128i alphaMask = __lsx_vreplgr2vr_d(qint64(Q_UINT64_C(0xffff) << 48));
890 const __m128i alphaMask32 = __lsx_vreplgr2vr_w(0xff000000);
891 const __m128i rgbaMask = (__m128i)(v16i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
892 const __m128i zero = __lsx_vldi(0);
893
894 for (; i < count - 3; i += 4) {
895 __m128i srcVector1 = __lsx_vld(&src[i], 0);
896 __m128i srcVector2 = __lsx_vld(&src[i + 2], 0);
897 const v4i32 testz1 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector1, alphaMask));
898 bool transparent1 = testz1[0]==0;
899 const v4i32 testc1 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector1, alphaMask));
900 bool opaque1 = testc1[0]==0;
901 const v4i32 testz2 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector2, alphaMask));
902 bool transparent2 = testz2[0]==0;
903 const v4i32 testc2 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector2, alphaMask));
904 bool opaque2 = testc2[0]==0;
905
906 if (!(transparent1 && transparent2)) {
907 if (!(opaque1 && opaque2)) {
908 __m128i srcVector1Alpha = __lsx_vsrli_d(srcVector1, 48);
909 __m128i srcVector2Alpha = __lsx_vsrli_d(srcVector2, 48);
910 __m128i srcVectorAlpha = __lsx_vpickev_h(__lsx_vsat_wu(srcVector2Alpha, 15),
911 __lsx_vsat_wu(srcVector1Alpha, 15));
912 const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
913 // Convert srcVectorAlpha to final 8-bit alpha channel
914 srcVectorAlpha = __lsx_vadd_w(srcVectorAlpha, __lsx_vreplgr2vr_w(128));
915 srcVectorAlpha = __lsx_vsub_w(srcVectorAlpha, __lsx_vsrli_w(srcVectorAlpha, 8));
916 srcVectorAlpha = __lsx_vsrli_w(srcVectorAlpha, 8);
917 srcVectorAlpha = __lsx_vslli_w(srcVectorAlpha, 24);
918 const __m128 ia = reciprocal_mul_ps(a, 255.0f);
919 __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
920 __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
921 __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
922 __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
923 __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
924 __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
925 __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
926 __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
927 src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
928 src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
929 src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
930 src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
931 src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
932 src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
933 // Handle potential alpha == 0 values:
934 __m128i srcVector1AlphaMask = __lsx_vseq_d(srcVector1Alpha, zero);
935 __m128i srcVector2AlphaMask = __lsx_vseq_d(srcVector2Alpha, zero);
936 src1 = __lsx_vandn_v(srcVector1AlphaMask, src1);
937 src3 = __lsx_vandn_v(srcVector2AlphaMask, src3);
938 src1 = __lsx_vmaxi_h(src1, 0);
939 src3 = __lsx_vmaxi_h(src3, 0);
940 src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 7));
941 // Fixup alpha values:
942 src1 = __lsx_vbitsel_v(src1, srcVectorAlpha, __lsx_vslti_b(alphaMask32, 0));
943 // Fix RGB order
944 if (!RGBA){
945 src1 = __lsx_vshuf_b(zero, src1, rgbaMask);}
946 __lsx_vst(src1, (__m128i *)&buffer[i], 0);
947 } else {
948 __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
949 __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
950 __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
951 __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
952 src1 = __lsx_vadd_w(src1, __lsx_vreplgr2vr_w(128));
953 src2 = __lsx_vadd_w(src2, __lsx_vreplgr2vr_w(128));
954 src3 = __lsx_vadd_w(src3, __lsx_vreplgr2vr_w(128));
955 src4 = __lsx_vadd_w(src4, __lsx_vreplgr2vr_w(128));
956 src1 = __lsx_vsub_w(src1, __lsx_vsrli_w(src1, 8));
957 src2 = __lsx_vsub_w(src2, __lsx_vsrli_w(src2, 8));
958 src3 = __lsx_vsub_w(src3, __lsx_vsrli_w(src3, 8));
959 src4 = __lsx_vsub_w(src4, __lsx_vsrli_w(src4, 8));
960 src1 = __lsx_vsrli_w(src1, 8);
961 src2 = __lsx_vsrli_w(src2, 8);
962 src3 = __lsx_vsrli_w(src3, 8);
963 src4 = __lsx_vsrli_w(src4, 8);
964 src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
965 src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
966 src1 = __lsx_vmaxi_h(src1, 0);
967 src3 = __lsx_vmaxi_h(src3, 0);
968 src1 = __lsx_vpickev_b(__lsx_vsat_hu(src3, 7), __lsx_vsat_hu(src1, 15));
969 if (!RGBA){
970 src1 = __lsx_vshuf_b(zero, src1, rgbaMask);}
971 __lsx_vst(src1, &buffer[i], 0);
972 }
973 } else {
974 __lsx_vst(zero, &buffer[i], 0);
975 }
976 }
977
978 SIMD_EPILOGUE(i, count, 3) {
979 buffer[i] = qConvertRgba64ToRgb32_lsx<RGBA ? PixelOrderRGB : PixelOrderBGR>(src[i]);
980 }
981}
982
// Convert premultiplied RGBA64 pixels to unpremultiplied RGBA64.  With
// mask == true the alpha channel is forced to 0xffff (RGBx64 output);
// otherwise the original alpha is preserved.  Processes four pixels per
// iteration; SIMD_EPILOGUE handles the remaining 0-3 scalars.
template<bool mask>
static inline void convertRGBA64FromRGBA64PM_lsx(QRgba64 *buffer, const QRgba64 *src, int count)
{
    int i = 0;
    // Mask of the 16-bit alpha channel of each 64-bit pixel.
    const __m128i alphaMask = __lsx_vreplgr2vr_d(qint64(Q_UINT64_C(0xffff) << 48));
    const __m128i zero = __lsx_vldi(0);

    for (; i < count - 3; i += 4) {
        __m128i srcVector1 = __lsx_vld(&src[i + 0], 0);
        __m128i srcVector2 = __lsx_vld(&src[i + 2], 0);
        // transparentN <=> all alpha bits clear; opaqueN <=> all alpha bits set.
        const v4i32 testz1 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector1, alphaMask));
        bool transparent1 = testz1[0]==0;
        const v4i32 testc1 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector1, alphaMask));
        bool opaque1 = testc1[0]==0;
        const v4i32 testz2 = (v4i32)__lsx_vmsknz_b(__lsx_vand_v(srcVector2, alphaMask));
        bool transparent2 = testz2[0]==0;
        const v4i32 testc2 = (v4i32)__lsx_vmsknz_b(__lsx_vandn_v(srcVector2, alphaMask));
        bool opaque2 = testc2[0]==0;

        if (!(transparent1 && transparent2)) {
            if (!(opaque1 && opaque2)) {
                // General case: scale every colour channel by 65535/alpha
                // using per-pixel float reciprocals.
                __m128i srcVector1Alpha = __lsx_vsrli_d(srcVector1, 48);
                __m128i srcVector2Alpha = __lsx_vsrli_d(srcVector2, 48);
                __m128i srcVectorAlpha = __lsx_vpickev_h(__lsx_vsat_wu(srcVector2Alpha, 15),
                                                         __lsx_vsat_wu(srcVector1Alpha, 15));
                const __m128 a = __lsx_vffint_s_w(srcVectorAlpha);
                const __m128 ia = reciprocal_mul_ps(a, 65535.0f); // ia = 65535 / a
                __m128i src1 = __lsx_vilvl_h(zero, srcVector1);
                __m128i src2 = __lsx_vilvh_h(zero, srcVector1);
                __m128i src3 = __lsx_vilvl_h(zero, srcVector2);
                __m128i src4 = __lsx_vilvh_h(zero, srcVector2);
                __m128 ia1 = (__m128)__lsx_vreplvei_w(ia, 0);
                __m128 ia2 = (__m128)__lsx_vreplvei_w(ia, 1);
                __m128 ia3 = (__m128)__lsx_vreplvei_w(ia, 2);
                __m128 ia4 = (__m128)__lsx_vreplvei_w(ia, 3);
                src1 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src1), ia1));
                src2 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src2), ia2));
                src3 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src3), ia3));
                src4 = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(src4), ia4));
                src1 = __lsx_vpickev_h(__lsx_vsat_wu(src2, 15), __lsx_vsat_wu(src1, 15));
                src3 = __lsx_vpickev_h(__lsx_vsat_wu(src4, 15), __lsx_vsat_wu(src3, 15));
                // Handle potential alpha == 0 values:
                __m128i srcVector1AlphaMask = __lsx_vseq_d(srcVector1Alpha, zero);
                __m128i srcVector2AlphaMask = __lsx_vseq_d(srcVector2Alpha, zero);
                src1 = __lsx_vandn_v(srcVector1AlphaMask, src1);
                src3 = __lsx_vandn_v(srcVector2AlphaMask, src3);
                // Fixup alpha values:
                if (mask) {
                    src1 = __lsx_vor_v(src1, alphaMask);
                    src3 = __lsx_vor_v(src3, alphaMask);
                } else {
                    src1 = __lsx_vbitsel_v(src1, srcVector1, __lsx_vslti_b(alphaMask, 0));
                    src3 = __lsx_vbitsel_v(src3, srcVector2, __lsx_vslti_b(alphaMask, 0));
                }
                __lsx_vst(src1, &buffer[i + 0], 0);
                __lsx_vst(src3, &buffer[i + 2], 0);
            } else {
                // Fully opaque: premultiplied equals unpremultiplied, so the
                // pixels can be stored unchanged (alpha forced if mask).
                if (mask) {
                    srcVector1 = __lsx_vor_v(srcVector1, alphaMask);
                    srcVector2 = __lsx_vor_v(srcVector2, alphaMask);
                }
                if (mask || src != buffer) {
                    __lsx_vst(srcVector1, &buffer[i + 0], 0);
                    __lsx_vst(srcVector2, &buffer[i + 2], 0);
                }
            }
        } else {
            // Fully transparent: result is all zero.
            __lsx_vst(zero, &buffer[i + 0], 0);
            __lsx_vst(zero, &buffer[i + 2], 0);
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        QRgba64 v = src[i].unpremultiplied();
        if (mask)
            v.setAlpha(65535);
        buffer[i] = v;
    }
}
1062
// In-place conversion of ARGB32 pixels to premultiplied ARGB32 (LSX path).
void QT_FASTCALL convertARGB32ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_lsx<false>(buffer, buffer, count);
}
1067
// In-place conversion of RGBA8888 pixels to premultiplied ARGB32 (LSX path).
void QT_FASTCALL convertRGBA8888ToARGB32PM_lsx(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_lsx<true>(buffer, buffer, count);
}
1072
// Convert ARGB32 pixels to premultiplied RGBA64 into 'buffer' (LSX path).
const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count,
                                                        const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lsx<false>(buffer, src, count);
    return buffer;
}
1079
// Convert RGBA8888 pixels to premultiplied RGBA64 into 'buffer' (LSX path).
const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count,
                                                          const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lsx<true>(buffer, src, count);
    return buffer;
}
1086
// Fetch 'count' ARGB32 pixels starting at 'index' and convert them to
// premultiplied ARGB32 into 'buffer' (LSX path).
const uint *QT_FASTCALL fetchARGB32ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count,
                                                  const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_lsx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
1093
// Fetch 'count' RGBA8888 pixels starting at 'index' and convert them to
// premultiplied ARGB32 into 'buffer' (LSX path).
const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_lsx(uint *buffer, const uchar *src, int index, int count,
                                                    const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_lsx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
1100
// Fetch 'count' ARGB32 pixels starting at 'index' and convert them to
// premultiplied RGBA64 into 'buffer' (LSX path).
const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count,
                                                     const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lsx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
1107
// Fetch 'count' RGBA8888 pixels starting at 'index' and convert them to
// premultiplied RGBA64 into 'buffer' (LSX path).
const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_lsx(QRgba64 *buffer, const uchar *src, int index, int count,
                                                       const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lsx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
1114
1115void QT_FASTCALL storeRGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1116 const QList<QRgb> *, QDitherInfo *)
1117{
1118 uint *d = reinterpret_cast<uint *>(dest) + index;
1119 convertARGBFromARGB32PM_lsx<false,true>(d, src, count);
1120}
1121
1122void QT_FASTCALL storeARGB32FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1123 const QList<QRgb> *, QDitherInfo *)
1124{
1125 uint *d = reinterpret_cast<uint *>(dest) + index;
1126 convertARGBFromARGB32PM_lsx<false,false>(d, src, count);
1127}
1128
1129void QT_FASTCALL storeRGBA8888FromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1130 const QList<QRgb> *, QDitherInfo *)
1131{
1132 uint *d = reinterpret_cast<uint *>(dest) + index;
1133 convertARGBFromARGB32PM_lsx<true,false>(d, src, count);
1134}
1135
1136void QT_FASTCALL storeRGBXFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
1137 const QList<QRgb> *, QDitherInfo *)
1138{
1139 uint *d = reinterpret_cast<uint *>(dest) + index;
1140 convertARGBFromARGB32PM_lsx<true,true>(d, src, count);
1141}
1142
// Store premultiplied ARGB32 pixels as premultiplied A2RGB30 in the given
// pixel order; scalar loop over an LSX-accelerated per-pixel converter.
template<QtPixelOrder PixelOrder>
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx(uchar *dest, const uint *src, int index, int count,
                                                const QList<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    for (int i = 0; i < count; ++i)
        d[i] = qConvertArgb32ToA2rgb30_lsx<PixelOrder>(src[i]);
}
1151
1152#if QT_CONFIG(raster_64bit)
1153void QT_FASTCALL destStore64ARGB32_lsx(QRasterBuffer *rasterBuffer, int x,
1154 int y, const QRgba64 *buffer, int length)
1155{
1156 uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
1157 convertARGBFromRGBA64PM_lsx<false>(dest, buffer, length);
1158}
1159
1160void QT_FASTCALL destStore64RGBA8888_lsx(QRasterBuffer *rasterBuffer, int x,
1161 int y, const QRgba64 *buffer, int length)
1162{
1163 uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
1164 convertARGBFromRGBA64PM_lsx<true>(dest, buffer, length);
1165}
1166#endif
1167
1168void QT_FASTCALL storeARGB32FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
1169 const QList<QRgb> *, QDitherInfo *)
1170{
1171 uint *d = (uint*)dest + index;
1172 convertARGBFromRGBA64PM_lsx<false>(d, src, count);
1173}
1174
1175void QT_FASTCALL storeRGBA8888FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
1176 const QList<QRgb> *, QDitherInfo *)
1177{
1178 uint *d = (uint*)dest + index;
1179 convertARGBFromRGBA64PM_lsx<true>(d, src, count);
1180}
1181
// Explicit instantiations for both 30-bit pixel orders.
template
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx<PixelOrderBGR>(uchar *dest, const uint *src, int index, int count,
                                                               const QList<QRgb> *, QDitherInfo *);
template
void QT_FASTCALL storeA2RGB30PMFromARGB32PM_lsx<PixelOrderRGB>(uchar *dest, const uint *src, int index, int count,
                                                               const QList<QRgb> *, QDitherInfo *);
1188
1189void QT_FASTCALL storeRGBA64FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
1190 const QList<QRgb> *, QDitherInfo *)
1191{
1192 QRgba64 *d = (QRgba64 *)dest + index;
1193 convertRGBA64FromRGBA64PM_lsx<false>(d, src, count);
1194}
1195
1196void QT_FASTCALL storeRGBx64FromRGBA64PM_lsx(uchar *dest, const QRgba64 *src, int index, int count,
1197 const QList<QRgb> *, QDitherInfo *)
1198{
1199 QRgba64 *d = (QRgba64 *)dest + index;
1200 convertRGBA64FromRGBA64PM_lsx<true>(d, src, count);
1201}
1202
1203#if QT_CONFIG(raster_fp)
// Fetch QRgbaFloat32 pixels starting at 'index' and premultiply them
// (RGB *= A, alpha preserved) into 'buffer'.
const QRgbaFloat32 *QT_FASTCALL fetchRGBA32FToRGBA32F_lsx(QRgbaFloat32 *buffer, const uchar *src,
                                                          int index, int count,
                                                          const QList<QRgb> *, QDitherInfo *)
{
    const QRgbaFloat32 *s = reinterpret_cast<const QRgbaFloat32 *>(src) + index;
    for (int i = 0; i < count; ++i) {
        __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(s + i), 0);
        __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);   // splat alpha (lane 3)
        vsf = __lsx_vfmul_s(vsf, vsa);                   // premultiply all lanes
        vsf = (__m128)__lsx_vextrins_w(vsf, vsa, 0x30);  // restore original alpha in lane 3
        __lsx_vst(vsf, reinterpret_cast<float *>(buffer + i), 0);
    }
    return buffer;
}
1218
// Store premultiplied QRgbaFloat32 pixels as RGBX32F: unpremultiply the
// colour channels and force alpha to 1.0f.
void QT_FASTCALL storeRGBX32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src,
                                             int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
    // Result for fully transparent input: black with alpha 1.
    const __m128 zero = (__m128)(v4f32){0.0f, 0.0f, 0.0f, 1.0f};
    for (int i = 0; i < count; ++i) {
        __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(src + i), 0);
        const __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);  // splat alpha (lane 3)
        FloatInt a;
        a.i = __lsx_vpickve2gr_w(vsa, 0);
        if (a.f == 1.0f)
        { }  // already opaque: store unchanged
        else if (a.f == 0.0f)
            vsf = zero;
        else {
            // 1/alpha estimate refined by one Newton-Raphson step:
            // r' = 2*r - r*(r*a)
            __m128 vsr = __lsx_vfrecip_s(vsa);
            vsr = __lsx_vfsub_s(__lsx_vfadd_s(vsr, vsr),
                                __lsx_vfmul_s(vsr, __lsx_vfmul_s(vsr, vsa)));
            vsf = __lsx_vfmul_s(vsf, vsr);
            // Overwrite lane 3 with exactly 1.0f.
            FloatInt b = {.f = 1.0f};
            vsf = (__m128)__lsx_vinsgr2vr_w(vsf, b.i, 3);
        }
        __lsx_vst(vsf, reinterpret_cast<float *>(d + i), 0);
    }
}
1245
// Store premultiplied QRgbaFloat32 pixels as unpremultiplied RGBA32F:
// unpremultiply the colour channels, keeping the original alpha.
void QT_FASTCALL storeRGBA32FFromRGBA32F_lsx(uchar *dest, const QRgbaFloat32 *src,
                                             int index, int count,
                                             const QList<QRgb> *, QDitherInfo *)
{
    QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
    const __m128 zero = (__m128)__lsx_vldi(0);
    for (int i = 0; i < count; ++i) {
        __m128 vsf = (__m128)__lsx_vld(reinterpret_cast<const float *>(src + i), 0);
        const __m128 vsa = (__m128)__lsx_vreplvei_w(vsf, 3);  // splat alpha (lane 3)
        FloatInt a;
        a.i = __lsx_vpickve2gr_w(vsa, 0);
        if (a.f == 1.0f)
        { }  // already unpremultiplied: store unchanged
        else if (a.f == 0.0f)
            vsf = zero;
        else {
            // 1/alpha estimate refined by one Newton-Raphson step:
            // r' = 2*r - r*(r*a)
            __m128 vsr = __lsx_vfrecip_s(vsa);
            vsr = __lsx_vfsub_s(__lsx_vfadd_s(vsr, vsr),
                                __lsx_vfmul_s(vsr, __lsx_vfmul_s(vsr, vsa)));
            // Set the reciprocal's lane 3 to 1.0f so the multiply below
            // leaves the alpha channel unchanged.
            FloatInt b = {.f = 1.0f};
            vsr = (__m128)__lsx_vinsgr2vr_w(vsr, b.i, 3);
            vsf = __lsx_vfmul_s(vsf, vsr);
        }
        __lsx_vst(vsf, reinterpret_cast<float *>(d + i), 0);
    }
}
1272#endif
1273
1274QT_END_NAMESPACE
1275
1276#endif // QT_COMPILER_SUPPORTS_LSX