Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qdrawhelper_lasx.cpp
Go to the documentation of this file.
1// Copyright (C) 2024 Loongson Technology Corporation Limited.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3// Qt-Security score:significant reason:default
4
8#include "qrgba64_p.h"
9
10#if defined(QT_COMPILER_SUPPORTS_LASX)
11
12QT_BEGIN_NAMESPACE
13
// 16.16 fixed-point constants used by the bilinear fetch helpers below.
enum {
    FixedScale = 1 << 16,   // 1.0 in 16.16 fixed point
    HalfPoint = 1 << 15     // 0.5 in 16.16 fixed point (rounding offset)
};
18
// Register-name prefixes used by the .ifc operand matching in the inline
// assembly below. Clang prints LSX/LASX operands as $vrN/$xrN, while GCC
// prints both as $fN, so the prefixes differ per compiler.
#ifdef Q_CC_CLANG
#define VREGS_PREFIX "$vr"
#define XREGS_PREFIX "$xr"
#else // GCC
#define VREGS_PREFIX "$f"
#define XREGS_PREFIX "$f"
#endif
// All 32 vector-register indices, iterated by the assembler .irp loops.
#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
27
// Convert two __m128i to __m256i: inlo becomes the low 128 bits of the
// result, inhi the high 128 bits. There is no intrinsic for this, so the
// assembler's .irp/.ifc loops discover which physical registers the
// compiler assigned to each operand and emit instructions on exactly those
// registers (an LSX $vrN is the low half of the LASX $xrN).
static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo)
{
    __m256i out;
    __asm__ volatile (
        // Merge inlo's low 128 bits into the high half of inhi's register.
        ".irp i," __ALL_REGS "\n\t"
        " .ifc %[hi], " VREGS_PREFIX "\\i \n\t"
        " .irp j," __ALL_REGS "\n\t"
        " .ifc %[lo], " VREGS_PREFIX "\\j \n\t"
        " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
        " .endif \n\t"
        " .endr \n\t"
        " .endif \n\t"
        ".endr \n\t"
        // If [out] was not allocated to the same register as [hi],
        // copy the combined value into [out].
        ".ifnc %[out], %[hi] \n\t"
        ".irp i," __ALL_REGS "\n\t"
        " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
        " .irp j," __ALL_REGS "\n\t"
        " .ifc %[hi], " VREGS_PREFIX "\\j \n\t"
        " xvori.b $xr\\i, $xr\\j, 0 \n\t"
        " .endif \n\t"
        " .endr \n\t"
        " .endif \n\t"
        ".endr \n\t"
        ".endif \n\t"
        : [out] "=f" (out), [hi] "+f" (inhi)
        : [lo] "f" (inlo)
    );
    return out;
}
58
// Convert __m256i low part to __m128i.
// If the compiler happened to allocate [out] and [in] to the same physical
// register the value is already in place and no instruction is emitted;
// otherwise the .irp/.ifc scan locates both registers and copies the low
// 128 bits with vori.b (an OR-with-zero register move).
static inline __m128i lasx_extracti128_lo(__m256i in)
{
    __m128i out;
    __asm__ volatile (
        ".ifnc %[out], %[in] \n\t"
        ".irp i," __ALL_REGS "\n\t"
        " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
        " .irp j," __ALL_REGS "\n\t"
        " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
        " vori.b $vr\\i, $vr\\j, 0 \n\t"
        " .endif \n\t"
        " .endr \n\t"
        " .endif \n\t"
        ".endr \n\t"
        ".endif \n\t"
        : [out] "=f" (out) : [in] "f" (in)
    );
    return out;
}
79
// Convert __m256i high part to __m128i.
// xvpermi.q with immediate 0x11 moves the source's high 128 bits into the
// destination's low half; the .irp/.ifc loops resolve the physical
// registers the compiler chose for the operands.
static inline __m128i lasx_extracti128_hi(__m256i in)
{
    __m128i out;
    __asm__ volatile (
        ".irp i," __ALL_REGS "\n\t"
        " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
        " .irp j," __ALL_REGS "\n\t"
        " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
        " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t"
        " .endif \n\t"
        " .endr \n\t"
        " .endif \n\t"
        ".endr \n\t"
        : [out] "=f" (out) : [in] "f" (in)
    );
    return out;
}
98
99// Vectorized blend functions:
100
// See BYTE_MUL_LSX for details.
// In-place multiply of every 8-bit channel of 8 ARGB32 pixels by the
// 16-bit per-pixel alpha values in alphaChannel, with division by 255
// approximated as x/255 ~= (x + (x >> 8) + 0x80) >> 8.
// colorMask is 0x00ff00ff per lane, half is 0x80 per 16-bit lane.
inline static void Q_DECL_VECTORCALL
BYTE_MUL_LASX(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
{
    // Split each pixel into AG (odd bytes) and RB (even bytes) 16-bit lanes.
    __m256i pixelVectorAG = __lasx_xvsrli_h(pixelVector, 8);
    __m256i pixelVectorRB = __lasx_xvand_v(pixelVector, colorMask);

    pixelVectorAG = __lasx_xvmul_h(pixelVectorAG, alphaChannel);
    pixelVectorRB = __lasx_xvmul_h(pixelVectorRB, alphaChannel);

    // Approximate division by 255 with rounding: (x + (x >> 8) + 0x80) >> 8.
    pixelVectorRB = __lasx_xvadd_h(pixelVectorRB, __lasx_xvsrli_h(pixelVectorRB, 8));
    pixelVectorRB = __lasx_xvadd_h(pixelVectorRB, half);
    pixelVectorAG = __lasx_xvadd_h(pixelVectorAG, __lasx_xvsrli_h(pixelVectorAG, 8));
    pixelVectorAG = __lasx_xvadd_h(pixelVectorAG, half);

    // RB result moves back to the low byte of each lane; AG keeps the high
    // byte (the final >> 8 for AG is folded into masking the low byte away).
    pixelVectorRB = __lasx_xvsrli_h(pixelVectorRB, 8);
    pixelVectorAG = __lasx_xvandn_v(colorMask, pixelVectorAG);

    pixelVector = __lasx_xvor_v(pixelVectorAG, pixelVectorRB);
}
121
// 64-bit-pixel variant of BYTE_MUL_LASX: multiplies every 16-bit channel
// of 4 RGBA64 pixels by the 32-bit alpha values in alphaChannel, with
// division by 65535 approximated as x/65535 ~= (x + (x >> 16) + 0x8000) >> 16.
// colorMask is 0x0000ffff per lane, half is 0x8000 per 32-bit lane.
inline static void Q_DECL_VECTORCALL
BYTE_MUL_RGB64_LASX(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
{
    // Split each pixel into AG (high 16 bits) and RB (low 16 bits) lanes.
    __m256i pixelVectorAG = __lasx_xvsrli_w(pixelVector, 16);
    __m256i pixelVectorRB = __lasx_xvand_v(pixelVector, colorMask);

    pixelVectorAG = __lasx_xvmul_w(pixelVectorAG, alphaChannel);
    pixelVectorRB = __lasx_xvmul_w(pixelVectorRB, alphaChannel);

    // Approximate division by 65535 with rounding.
    pixelVectorRB = __lasx_xvadd_w(pixelVectorRB, __lasx_xvsrli_w(pixelVectorRB, 16));
    pixelVectorAG = __lasx_xvadd_w(pixelVectorAG, __lasx_xvsrli_w(pixelVectorAG, 16));
    pixelVectorRB = __lasx_xvadd_w(pixelVectorRB, half);
    pixelVectorAG = __lasx_xvadd_w(pixelVectorAG, half);

    // RB moves back to the low 16 bits; AG keeps the high 16 bits.
    pixelVectorRB = __lasx_xvsrli_w(pixelVectorRB, 16);
    pixelVectorAG = __lasx_xvandn_v(colorMask, pixelVectorAG);

    pixelVector = __lasx_xvor_v(pixelVectorAG, pixelVectorRB);
}
141
// See INTERPOLATE_PIXEL_255_LSX for details.
// dst = (src * alpha + dst * (255 - alpha)) / 255, computed per 8-bit
// channel for 8 ARGB32 pixels, with the same /255 rounding approximation
// as BYTE_MUL_LASX. Callers must pass oneMinusAlphaChannel == 255 - alpha.
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_255_LASX(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel,
                           __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
{
    // Split src and dst into AG and RB 16-bit lanes.
    const __m256i srcVectorAG = __lasx_xvsrli_h(srcVector, 8);
    const __m256i dstVectorAG = __lasx_xvsrli_h(dstVector, 8);
    const __m256i srcVectorRB = __lasx_xvand_v(srcVector, colorMask);
    const __m256i dstVectorRB = __lasx_xvand_v(dstVector, colorMask);
    const __m256i srcVectorAGalpha = __lasx_xvmul_h(srcVectorAG, alphaChannel);
    const __m256i srcVectorRBalpha = __lasx_xvmul_h(srcVectorRB, alphaChannel);
    const __m256i dstVectorAGoneMinusAlpha = __lasx_xvmul_h(dstVectorAG, oneMinusAlphaChannel);
    const __m256i dstVectorRBoneMinusAlpha = __lasx_xvmul_h(dstVectorRB, oneMinusAlphaChannel);
    __m256i finalAG = __lasx_xvadd_h(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
    __m256i finalRB = __lasx_xvadd_h(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
    // Divide by 255 with rounding: (x + (x >> 8) + 0x80) >> 8.
    finalAG = __lasx_xvadd_h(finalAG, __lasx_xvsrli_h(finalAG, 8));
    finalRB = __lasx_xvadd_h(finalRB, __lasx_xvsrli_h(finalRB, 8));
    finalAG = __lasx_xvadd_h(finalAG, half);
    finalRB = __lasx_xvadd_h(finalRB, half);
    // AG keeps the high byte of each lane; RB shifts back to the low byte.
    finalAG = __lasx_xvandn_v(colorMask, finalAG);
    finalRB = __lasx_xvsrli_h(finalRB, 8);

    dstVector = __lasx_xvor_v(finalAG, finalRB);
}
166
// 64-bit-pixel variant of INTERPOLATE_PIXEL_255_LASX:
// dst = (src * alpha + dst * (65535 - alpha)) / 65535 per 16-bit channel
// for 4 RGBA64 pixels. Callers pass oneMinusAlphaChannel == 65535 - alpha.
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_RGB64_LASX(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel,
                             __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
{
    // Split src and dst into AG and RB 32-bit lanes.
    const __m256i srcVectorAG = __lasx_xvsrli_w(srcVector, 16);
    const __m256i dstVectorAG = __lasx_xvsrli_w(dstVector, 16);
    const __m256i srcVectorRB = __lasx_xvand_v(srcVector, colorMask);
    const __m256i dstVectorRB = __lasx_xvand_v(dstVector, colorMask);
    const __m256i srcVectorAGalpha = __lasx_xvmul_w(srcVectorAG, alphaChannel);
    const __m256i srcVectorRBalpha = __lasx_xvmul_w(srcVectorRB, alphaChannel);
    const __m256i dstVectorAGoneMinusAlpha = __lasx_xvmul_w(dstVectorAG, oneMinusAlphaChannel);
    const __m256i dstVectorRBoneMinusAlpha = __lasx_xvmul_w(dstVectorRB, oneMinusAlphaChannel);
    __m256i finalAG = __lasx_xvadd_w(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
    __m256i finalRB = __lasx_xvadd_w(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
    // Divide by 65535 with rounding: (x + (x >> 16) + 0x8000) >> 16.
    finalAG = __lasx_xvadd_w(finalAG, __lasx_xvsrli_w(finalAG, 16));
    finalRB = __lasx_xvadd_w(finalRB, __lasx_xvsrli_w(finalRB, 16));
    finalAG = __lasx_xvadd_w(finalAG, half);
    finalRB = __lasx_xvadd_w(finalRB, half);
    finalAG = __lasx_xvandn_v(colorMask, finalAG);
    finalRB = __lasx_xvsrli_w(finalRB, 16);

    dstVector = __lasx_xvor_v(finalAG, finalRB);
}
190
// See BLEND_SOURCE_OVER_ARGB32_LSX for details.
// Source-over composition of `length` premultiplied ARGB32 pixels,
// 8 pixels per iteration. Partial vectors at the head (until dst is
// 32-byte aligned) and tail are handled with per-lane select masks
// instead of scalar loops. Two fast paths per vector: skip entirely when
// all 8 source alphas are 0, plain copy when all are 255.
inline static void Q_DECL_VECTORCALL BLEND_SOURCE_OVER_ARGB32_LASX(quint32 *dst, const quint32 *src, const int length)
{
    const __m256i half = __lasx_xvreplgr2vr_h(0x80);
    const __m256i one = __lasx_xvreplgr2vr_h(0xff);
    const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
    const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
    // Per-lane indices, forward and reversed, used to build edge masks.
    const __m256i offsetMask = (__m256i)(v8i32){0, 1, 2, 3, 4, 5, 6, 7};
    const __m256i offsetMaskr = (__m256i)(v8i32){7, 6, 5, 4, 3, 2, 1, 0};
    // Broadcasts each pixel's alpha byte into both 16-bit lanes (AG and RB
    // positions) for BYTE_MUL_LASX; 0xff selects zero from the first operand.
    const __m256i alphaShuffleMask = (__m256i)(v32u8){3, 0xff, 3, 0xff, 7, 0xff, 7, 0xff, 11, 0xff, 11, 0xff, 15, 0xff, 15, 0xff,
                                                      3, 0xff, 3, 0xff, 7, 0xff, 7, 0xff, 11, 0xff, 11, 0xff, 15, 0xff, 15, 0xff};

    // Number of pixels (0..7) before dst reaches a 32-byte boundary.
    const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> 2) & 0x7;

    int x = 0;
    // Prologue to handle all pixels until dst is 32-byte aligned in one step.
    if (minusOffsetToAlignDstOn32Bytes != 0 && x < (length - 7)) {
        // Lanes belonging to the prologue get a negative value -> mask bit set.
        const __m256i prologueMask = __lasx_xvsub_w(__lasx_xvreplgr2vr_w(minusOffsetToAlignDstOn32Bytes - 1), offsetMaskr);
        const __m256i prologueMask1 = __lasx_xvslti_w(prologueMask, 0);
        // Load 8 src pixels, zeroing the lanes outside the prologue.
        const __m256i srcVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
                                                    __lasx_xvld((const int *)&src[x], 0),
                                                    prologueMask1);
        const __m256i prologueMask2 = __lasx_xvslti_b(prologueMask, 0);
        const __m256i prologueAlphaMask = __lasx_xvbitsel_v(__lasx_xvldi(0),
                                                            alphaMask,
                                                            prologueMask2);
        // testz1: any non-zero alpha byte in the active lanes? (both halves)
        const v8i32 testz1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, prologueAlphaMask));

        if (testz1[0]!=0 || testz1[4]!=0) {
            // testc1 == 0 means every active alpha byte is 0xff (opaque).
            const v8i32 testc1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector,
                                                                         prologueAlphaMask));
            __m256i dstVector = __lasx_xvld((int *)&dst[x], 0);
            if (testc1[0]==0 && testc1[4]==0) {
                // Fully opaque: copy src into the active lanes only.
                __lasx_xvst(__lasx_xvbitsel_v(dstVector, srcVector, prologueMask1), (int *)&dst[x], 0);
            } else {
                // General case: dst = src + dst * (255 - src.alpha).
                __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0),
                                                       srcVector,
                                                       alphaShuffleMask);
                alphaChannel = __lasx_xvsub_h(one, alphaChannel);
                __m256i dstV = dstVector;
                BYTE_MUL_LASX(dstVector, alphaChannel, colorMask, half);
                dstVector = __lasx_xvadd_b(dstVector, srcVector);
                // Write back only the prologue lanes; the rest keep old dst.
                __lasx_xvst(__lasx_xvbitsel_v(dstV, dstVector, prologueMask1), (int *)&dst[x], 0);
            }
        }
        x += (8 - minusOffsetToAlignDstOn32Bytes);
    }

    // Main loop: dst is 32-byte aligned, 8 pixels per iteration.
    for (; x < (length - 7); x += 8) {
        const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
        // Skip the store entirely when all 8 alphas are zero.
        const v8i32 testz2 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
        if (testz2[0]!=0 || testz2[4]!=0) {
            const v8i32 testc2 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, alphaMask));
            if (testc2[0]==0 && testc2[4]==0) {
                // All 8 pixels opaque: straight copy.
                __lasx_xvst(srcVector, (__m256i *)&dst[x], 0);
            } else {
                __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
                alphaChannel = __lasx_xvsub_h(one, alphaChannel);
                __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
                BYTE_MUL_LASX(dstVector, alphaChannel, colorMask, half);
                dstVector = __lasx_xvadd_b(dstVector, srcVector);
                __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
            }
        }
    }

    // Epilogue to handle all remaining pixels in one step.
    if (x < length) {
        // Lanes with index < (length - x) get a negative value -> active.
        const __m256i epilogueMask = __lasx_xvadd_w(offsetMask, __lasx_xvreplgr2vr_w(x - length));
        const __m256i epilogueMask1 = __lasx_xvslti_w(epilogueMask, 0);
        // NOTE(review): this loads a full 32 bytes from src/dst even though
        // fewer pixels remain; presumably safe via Qt's buffer padding —
        // same pattern as the LSX/AVX2 variants.
        const __m256i srcVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
                                                    __lasx_xvld((const int *)&src[x], 0),
                                                    epilogueMask1);
        const __m256i epilogueMask2 = __lasx_xvslti_b(epilogueMask,0);
        const __m256i epilogueAlphaMask = __lasx_xvbitsel_v(__lasx_xvldi(0),
                                                            alphaMask,
                                                            epilogueMask2);
        const v8i32 testz3 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, epilogueAlphaMask));

        if (testz3[0]!=0 || testz3[4]!=0) {
            const v8i32 testc3 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector,
                                                                         epilogueAlphaMask));
            if (testc3[0]==0 && testc3[4]==0) {
                // Opaque tail: merge src into the active lanes.
                __m256i srcV = __lasx_xvld((int *)&dst[x], 0);
                __lasx_xvst(__lasx_xvbitsel_v(srcV, srcVector, epilogueMask1), (int *)&dst[x], 0);
            } else {
                __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
                alphaChannel = __lasx_xvsub_h(one, alphaChannel);
                __m256i dstVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
                                                      __lasx_xvld((int *)&dst[x], 0),
                                                      epilogueMask1);
                BYTE_MUL_LASX(dstVector, alphaChannel, colorMask, half);
                dstVector = __lasx_xvadd_b(dstVector, srcVector);
                __m256i dstV = __lasx_xvld((int *)&dst[x], 0);
                __lasx_xvst(__lasx_xvbitsel_v(dstV, dstVector, epilogueMask1), (int *)&dst[x], 0);
            }
        }
    }
}
290
// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX for details.
// Source-over with an extra constant opacity (const_alpha in [0..255]):
// src is first scaled by const_alpha, then composited over dst.
// Head/tail pixels are handled scalar via Qt's alignment/epilogue macros.
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LASX(quint32 *dst, const quint32 *src, const int length, const int const_alpha)
{
    int x = 0;

    // Scalar prologue until dst is 32-byte aligned.
    ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
        blend_pixel(dst[x], src[x], const_alpha);

    const __m256i half = __lasx_xvreplgr2vr_h(0x80);
    const __m256i one = __lasx_xvreplgr2vr_h(0xff);
    const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
    const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
    // Broadcasts each pixel's alpha byte into both 16-bit lanes; 0xff zeroes.
    const __m256i alphaShuffleMask = (__m256i)(v32i8){3,char(0xff),3,char(0xff),7,char(0xff),7,char(0xff),11,char(0xff),11,char(0xff),15,char(0xff),15,char(0xff),
                                                      3,char(0xff),3,char(0xff),7,char(0xff),7,char(0xff),11,char(0xff),11,char(0xff),15,char(0xff),15,char(0xff)};
    const __m256i constAlphaVector = __lasx_xvreplgr2vr_h(const_alpha);
    for (; x < (length - 7); x += 8) {
        __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
        // Skip the whole vector when all 8 source alphas are zero.
        const v8i32 testz = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
        if (testz[0]!=0 || testz[4]!=0) {
            // src *= const_alpha, then dst = src + dst * (255 - src.alpha).
            BYTE_MUL_LASX(srcVector, constAlphaVector, colorMask, half);

            __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
            alphaChannel = __lasx_xvsub_h(one, alphaChannel);
            __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
            BYTE_MUL_LASX(dstVector, alphaChannel, colorMask, half);
            dstVector = __lasx_xvadd_b(dstVector, srcVector);
            __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
        }
    }
    // Scalar epilogue for the last (length % 8) pixels.
    SIMD_EPILOGUE(x, length, 7)
        blend_pixel(dst[x], src[x], const_alpha);
}
324
325void qt_blend_argb32_on_argb32_lasx(uchar *destPixels, int dbpl,
326 const uchar *srcPixels, int sbpl,
327 int w, int h,
328 int const_alpha)
329{
330 if (const_alpha == 256) {
331 for (int y = 0; y < h; ++y) {
332 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
333 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
334 BLEND_SOURCE_OVER_ARGB32_LASX(dst, src, w);
335 destPixels += dbpl;
336 srcPixels += sbpl;
337 }
338 } else if (const_alpha != 0) {
339 const_alpha = (const_alpha * 255) >> 8;
340 for (int y = 0; y < h; ++y) {
341 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
342 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
343 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LASX(dst, src, w, const_alpha);
344 destPixels += dbpl;
345 srcPixels += sbpl;
346 }
347 }
348}
349
// Blend an opaque RGB32 image onto an RGB32 buffer with constant opacity.
// const_alpha == 256 degenerates to a row-wise memcpy; otherwise each
// pixel is dst = src * alpha + dst * (255 - alpha), 8 pixels at a time.
void qt_blend_rgb32_on_rgb32_lasx(uchar *destPixels, int dbpl,
                                  const uchar *srcPixels, int sbpl,
                                  int w, int h,
                                  int const_alpha)
{
    if (const_alpha == 256) {
        // Fully opaque: plain copy per scanline.
        for (int y = 0; y < h; ++y) {
            const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
            quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
            ::memcpy(dst, src, w * sizeof(uint));
            srcPixels += sbpl;
            destPixels += dbpl;
        }
        return;
    }
    if (const_alpha == 0)
        return; // fully transparent: nothing to do

    const __m256i half = __lasx_xvreplgr2vr_h(0x80);
    const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);

    // Rescale const_alpha from [0..256] to [0..255].
    const_alpha = (const_alpha * 255) >> 8;
    int one_minus_const_alpha = 255 - const_alpha;
    const __m256i constAlphaVector = __lasx_xvreplgr2vr_h(const_alpha);
    const __m256i oneMinusConstAlpha = __lasx_xvreplgr2vr_h(one_minus_const_alpha);
    for (int y = 0; y < h; ++y) {
        const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
        quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
        int x = 0;

        // First, align dest to 32 bytes:
        ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);

        // 2) interpolate pixels with LASX
        for (; x < (w - 7); x += 8) {
            const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
            __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
            INTERPOLATE_PIXEL_255_LASX(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
            __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
        }

        // 3) Epilogue: last (w % 8) pixels, scalar.
        SIMD_EPILOGUE(x, w, 7)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);

        srcPixels += sbpl;
        destPixels += dbpl;
    }
}
400
// Fill `bytes` bytes at dest with the repeating 256-bit pattern value256.
// Unrolled 128-byte main loop, then a cascade of power-of-two epilogues.
// NOTE(review): the tail handles down to 8-byte granularity only; both
// callers below (qt_memfill32/64_lasx) guarantee bytes is a multiple of 8.
static Q_NEVER_INLINE
void Q_DECL_VECTORCALL qt_memfillXX_lasx(uchar *dest, __m256i value256, qsizetype bytes)
{
    // Low 128 bits of the pattern, for the 16-byte epilogue store.
    __m128i value128 = *(__m128i*)(&value256);

    // main body: 4 x 32-byte stores per iteration while >= 128 bytes remain
    __m256i *dst256 = reinterpret_cast<__m256i *>(dest);
    uchar *end = dest + bytes;
    while (reinterpret_cast<uchar *>(dst256 + 4) <= end) {
        __lasx_xvst(value256, dst256 + 0, 0);
        __lasx_xvst(value256, dst256 + 1, 0);
        __lasx_xvst(value256, dst256 + 2, 0);
        __lasx_xvst(value256, dst256 + 3, 0);
        dst256 += 4;
    }

    // first epilogue: fewer than 128 bytes / 32 entries
    bytes = end - reinterpret_cast<uchar *>(dst256);
    switch (bytes / sizeof(value256)) {
    case 3: __lasx_xvst(value256, dst256++, 0); Q_FALLTHROUGH();
    case 2: __lasx_xvst(value256, dst256++, 0); Q_FALLTHROUGH();
    case 1: __lasx_xvst(value256, dst256++, 0);
    }

    // second epilogue: fewer than 32 bytes.
    // bit 4 of `bytes` is unchanged by the 32-byte stores above, so the
    // bitwise test is still valid without recomputing the remainder.
    __m128i *dst128 = reinterpret_cast<__m128i *>(dst256);
    if (bytes & sizeof(value128))
        __lsx_vst(value128, dst128++, 0);

    // third epilogue: fewer than 16 bytes — store the last 8 bytes at
    // end - 8 (may overlap bytes already written, which is harmless
    // since every store writes the same repeating pattern).
    if (bytes & 8)
        __lasx_xvstelm_d(value256, reinterpret_cast<__m128i *>(end - 8), 0, 0);
}
434
435void qt_memfill64_lasx(quint64 *dest, quint64 value, qsizetype count)
436{
437 __m256i value256 = __lasx_xvreplgr2vr_d(value);
438
439 qt_memfillXX_lasx(reinterpret_cast<uchar *>(dest), value256, count * sizeof(quint64));
440}
441
442void qt_memfill32_lasx(quint32 *dest, quint32 value, qsizetype count)
443{
444 if (count % 2) {
445 // odd number of pixels, round to even
446 *dest++ = value;
447 --count;
448 }
449 qt_memfillXX_lasx(reinterpret_cast<uchar *>(dest), __lasx_xvreplgr2vr_w(value), count * sizeof(quint32));
450}
451
452void QT_FASTCALL comp_func_SourceOver_lasx(uint *destPixels, const uint *srcPixels,
453 int length, uint const_alpha)
454{
455 Q_ASSERT(const_alpha < 256);
456
457 const quint32 *src = (const quint32 *) srcPixels;
458 quint32 *dst = (quint32 *) destPixels;
459
460 if (const_alpha == 255)
461 BLEND_SOURCE_OVER_ARGB32_LASX(dst, src, length);
462 else
463 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LASX(dst, src, length, const_alpha);
464}
465
466#if QT_CONFIG(raster_64bit)
// SourceOver composition for RGBA64 pixels, 4 pixels per 256-bit vector.
// const_alpha is in [0..255] and is widened to 16 bits per channel.
void QT_FASTCALL comp_func_SourceOver_rgb64_lasx(QRgba64 *dst, const QRgba64 *src,
                                                 int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
    const __m256i half = __lasx_xvreplgr2vr_w(0x8000);
    const __m256i one = __lasx_xvreplgr2vr_w(0xffff);
    const __m256i colorMask = __lasx_xvreplgr2vr_w(0x0000ffff);
    // Build a mask covering the 16-bit alpha channel of each 64-bit pixel.
    __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
    alphaMask = __lasx_xvilvl_b(alphaMask, alphaMask);
    // Broadcasts each pixel's 16-bit alpha into both 32-bit lanes (AG/RB).
    const __m256i alphaShuffleMask = (__m256i)(v32i8){6,7,char(0xff),char(0xff),6,7,char(0xff),char(0xff),14,15,char(0xff),char(0xff),14,15,char(0xff),char(0xff),
                                                      6,7,char(0xff),char(0xff),6,7,char(0xff),char(0xff),14,15,char(0xff),char(0xff),14,15,char(0xff),char(0xff)};

    if (const_alpha == 255) {
        // Full-opacity path: dst = src + dst * (65535 - src.alpha).
        int x = 0;
        // Scalar prologue until dst is 32-byte aligned.
        for (; x < length && (quintptr(dst + x) & 31); ++x)
            blend_pixel(dst[x], src[x]);
        for (; x < length - 3; x += 4) {
            const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
            // Skip the vector entirely when all 4 alphas are zero.
            const v8i32 testz1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
            if (testz1[0]!=0 || testz1[4]!=0){
                // testc1 == 0 means all 4 pixels are fully opaque.
                const v8i32 testc1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, alphaMask));
                if (testc1[0]==0 && testc1[4]==0){
                    __lasx_xvst(srcVector, &dst[x], 0);
                } else {
                    __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
                    alphaChannel = __lasx_xvsub_w(one, alphaChannel);
                    __m256i dstVector = __lasx_xvld(&dst[x], 0);
                    BYTE_MUL_RGB64_LASX(dstVector, alphaChannel, colorMask, half);
                    dstVector = __lasx_xvadd_h(dstVector, srcVector);
                    __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
                }
            }
        }
        // Scalar epilogue for the last (length % 4) pixels.
        SIMD_EPILOGUE(x, length, 3)
            blend_pixel(dst[x], src[x]);
    } else {
        // Constant-opacity path: widen alpha to 16 bits (a | a << 8).
        const __m256i constAlphaVector = __lasx_xvreplgr2vr_w(const_alpha | (const_alpha << 8));
        int x = 0;
        for (; x < length && (quintptr(dst + x) & 31); ++x)
            blend_pixel(dst[x], src[x], const_alpha);
        for (; x < length - 3; x += 4) {
            __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
            const v8i32 testz = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
            if (testz[0]!=0 || testz[4]!=0){
                // Not all transparent
                BYTE_MUL_RGB64_LASX(srcVector, constAlphaVector, colorMask, half);
                __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
                alphaChannel = __lasx_xvsub_w(one, alphaChannel);
                __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
                BYTE_MUL_RGB64_LASX(dstVector, alphaChannel, colorMask, half);
                dstVector = __lasx_xvadd_h(dstVector, srcVector);
                __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
            }
        }
        SIMD_EPILOGUE(x, length, 3)
            blend_pixel(dst[x], src[x], const_alpha);
    }
}
525#endif
526
// Source composition for ARGB32: dst = src when fully opaque, otherwise
// dst = src * alpha + dst * (255 - alpha), 8 pixels per vector.
void QT_FASTCALL comp_func_Source_lasx(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        // Full opacity degenerates to a straight copy.
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        int x = 0;

        // 1) prologue, align on 32 bytes
        ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);

        // 2) interpolate pixels with LASX
        const __m256i half = __lasx_xvreplgr2vr_h(0x80);
        const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
        const __m256i constAlphaVector = __lasx_xvreplgr2vr_h(const_alpha);
        const __m256i oneMinusConstAlpha = __lasx_xvreplgr2vr_h(ialpha);
        for (; x < length - 7; x += 8) {
            const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
            __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
            INTERPOLATE_PIXEL_255_LASX(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
            __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
        }

        // 3) Epilogue: remaining (length % 8) pixels, scalar.
        SIMD_EPILOGUE(x, length, 7)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
    }
}
557
558#if QT_CONFIG(raster_64bit)
// Source composition for RGBA64: straight copy at full opacity, otherwise
// per-channel interpolation with the alpha widened to [0..65535].
void QT_FASTCALL comp_func_Source_rgb64_lasx(QRgba64 *dst, const QRgba64 *src,
                                             int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
    if (const_alpha == 255) {
        ::memcpy(dst, src, length * sizeof(QRgba64));
    } else {
        const uint ca = const_alpha | (const_alpha << 8); // adjust to [0-65535]
        const uint cia = 65535 - ca;

        int x = 0;

        // 1) prologue, align on 32 bytes
        for (; x < length && (quintptr(dst + x) & 31); ++x)
            dst[x] = interpolate65535(src[x], ca, dst[x], cia);

        // 2) interpolate pixels with LASX, 4 RGBA64 pixels per vector
        const __m256i half = __lasx_xvreplgr2vr_w(0x8000);
        const __m256i colorMask = __lasx_xvreplgr2vr_w(0x0000ffff);
        const __m256i constAlphaVector = __lasx_xvreplgr2vr_w(ca);
        const __m256i oneMinusConstAlpha = __lasx_xvreplgr2vr_w(cia);
        for (; x < length - 3; x += 4) {
            const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
            __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
            INTERPOLATE_PIXEL_RGB64_LASX(srcVector, dstVector, constAlphaVector,
                                         oneMinusConstAlpha, colorMask, half);
            __lasx_xvst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: remaining (length % 4) pixels, scalar.
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = interpolate65535(src[x], ca, dst[x], cia);
    }
}
593#endif
594
// SourceOver with a solid color: dst = color + dst * (255 - color.alpha).
// Degenerates to a fill when the (alpha-scaled) color is fully opaque.
void QT_FASTCALL comp_func_solid_SourceOver_lasx(uint *destPixels, int length,
                                                 uint color, uint const_alpha)
{
    if ((const_alpha & qAlpha(color)) == 255) {
        // Both const_alpha and the color's own alpha are 255: plain fill.
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        // qAlpha(~color) == 255 - qAlpha(color).
        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m256i colorVector = __lasx_xvreplgr2vr_w(color);
        const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
        const __m256i half = __lasx_xvreplgr2vr_h(0x80);
        const __m256i minusAlphaOfColorVector = __lasx_xvreplgr2vr_h(minusAlphaOfColor);

        // Scalar prologue until dst is 32-byte aligned.
        ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

        for (; x < length - 7; x += 8) {
            __m256i dstVector = __lasx_xvld(&dst[x], 0);
            BYTE_MUL_LASX(dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = __lasx_xvadd_b(colorVector, dstVector);
            __lasx_xvst(dstVector, &dst[x], 0);
        }
        // Scalar epilogue for the last (length % 8) pixels.
        SIMD_EPILOGUE(x, length, 7)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
626
627#if QT_CONFIG(raster_64bit)
// RGBA64 variant of solid SourceOver:
// dst = color + dst * (65535 - color.alpha), 4 pixels per vector.
void QT_FASTCALL comp_func_solid_SourceOver_rgb64_lasx(QRgba64 *destPixels, int length,
                                                       QRgba64 color, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
    if (const_alpha == 255 && color.isOpaque()) {
        // Opaque solid color: plain 64-bit fill.
        qt_memfill64((quint64*)destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = multiplyAlpha255(color, const_alpha);

        const uint minusAlphaOfColor = 65535 - color.alpha();
        int x = 0;
        quint64 *dst = (quint64 *) destPixels;
        const __m256i colorVector = __lasx_xvreplgr2vr_d(color);
        const __m256i colorMask = __lasx_xvreplgr2vr_w(0x0000ffff);
        const __m256i half = __lasx_xvreplgr2vr_w(0x8000);
        const __m256i minusAlphaOfColorVector = __lasx_xvreplgr2vr_w(minusAlphaOfColor);

        // Scalar prologue until dst is 32-byte aligned.
        for (; x < length && (quintptr(dst + x) & 31); ++x)
            destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);

        for (; x < length - 3; x += 4) {
            __m256i dstVector = __lasx_xvld(&dst[x], 0);
            BYTE_MUL_RGB64_LASX(dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = __lasx_xvadd_h(colorVector, dstVector);
            __lasx_xvst(dstVector, &dst[x], 0);
        }
        // Scalar epilogue for the last (length % 4) pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
    }
}
659#endif
660
// Bilinear interpolation of 8 output pixels from their 4 neighbours each.
// tlr1/tlr2 hold interleaved top-left/top-right pixels, blr1/blr2 the
// bottom pairs; distx/disty are 4-bit (0..16) fractional distances.
// Weights (from the arithmetic below): tl=(16-dx)(16-dy), tr=dx(16-dy),
// bl=(16-dx)dy, br=dx*dy — summing to 256, hence the final >> 8.
static inline void interpolate_4_pixels_16_lasx(const __m256i tlr1, const __m256i tlr2, const __m256i blr1,
                                                const __m256i blr2, __m256i distx, __m256i disty, uint *b)
{
    const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
    const __m256i v_256 = __lasx_xvreplgr2vr_h(256);

    /* Correct for later unpack */
    const __m256i vdistx = __lasx_xvpermi_d(distx, 0b11011000);
    const __m256i vdisty = __lasx_xvpermi_d(disty, 0b11011000);

    // dxdy = dx*dy; idxidy = 256 - 16dx - 16dy + dxdy = (16-dx)(16-dy);
    // dxidy = 16dx - dxdy = dx(16-dy); idxdy = 16dy - dxdy = (16-dx)dy.
    __m256i dxdy = __lasx_xvmul_h(vdistx, vdisty);
    const __m256i distx_ = __lasx_xvslli_h(vdistx, 4);
    const __m256i disty_ = __lasx_xvslli_h(vdisty, 4);
    __m256i idxidy = __lasx_xvadd_h(dxdy, __lasx_xvsub_h(v_256, __lasx_xvadd_h(distx_, disty_)));
    __m256i dxidy = __lasx_xvsub_h(distx_, dxdy);
    __m256i idxdy = __lasx_xvsub_h(disty_, dxdy);

    // Split every source pixel into AG/RB 16-bit lanes.
    __m256i tlr1AG = __lasx_xvsrli_h(tlr1, 8);
    __m256i tlr1RB = __lasx_xvand_v(tlr1, colorMask);
    __m256i tlr2AG = __lasx_xvsrli_h(tlr2, 8);
    __m256i tlr2RB = __lasx_xvand_v(tlr2, colorMask);
    __m256i blr1AG = __lasx_xvsrli_h(blr1, 8);
    __m256i blr1RB = __lasx_xvand_v(blr1, colorMask);
    __m256i blr2AG = __lasx_xvsrli_h(blr2, 8);
    __m256i blr2RB = __lasx_xvand_v(blr2, colorMask);

    // Interleave the left/right weights to line up with the tl/tr pairs.
    __m256i odxidy1 = __lasx_xvilvl_w(dxidy, idxidy);
    __m256i odxidy2 = __lasx_xvilvh_w(dxidy, idxidy);
    tlr1AG = __lasx_xvmul_h(tlr1AG, odxidy1);
    tlr1RB = __lasx_xvmul_h(tlr1RB, odxidy1);
    tlr2AG = __lasx_xvmul_h(tlr2AG, odxidy2);
    tlr2RB = __lasx_xvmul_h(tlr2RB, odxidy2);
    __m256i odxdy1 = __lasx_xvilvl_w(dxdy, idxdy);
    __m256i odxdy2 = __lasx_xvilvh_w(dxdy, idxdy);
    blr1AG = __lasx_xvmul_h(blr1AG, odxdy1);
    blr1RB = __lasx_xvmul_h(blr1RB, odxdy1);
    blr2AG = __lasx_xvmul_h(blr2AG, odxdy2);
    blr2RB = __lasx_xvmul_h(blr2RB, odxdy2);

    /* Add the values, and shift to only keep 8 significant bits per colors */
    // The xvbsrl_v(.., 4) + xvpermi_w pairs emulate a horizontal pairwise
    // add across the left/right contributions of each output pixel.
    tlr1AG = __lasx_xvadd_w(tlr1AG, __lasx_xvbsrl_v(tlr1AG, 0b100));
    tlr2AG = __lasx_xvadd_w(tlr2AG, __lasx_xvbsrl_v(tlr2AG, 0b100));
    __m256i topAG = __lasx_xvpermi_w(tlr2AG, tlr1AG, 0b10001000);
    tlr1RB = __lasx_xvadd_w(tlr1RB, __lasx_xvbsrl_v(tlr1RB, 0b100));
    tlr2RB = __lasx_xvadd_w(tlr2RB, __lasx_xvbsrl_v(tlr2RB, 0b100));
    __m256i topRB = __lasx_xvpermi_w(tlr2RB, tlr1RB, 0b10001000);
    blr1AG = __lasx_xvadd_w(blr1AG, __lasx_xvbsrl_v(blr1AG, 0b100));
    blr2AG = __lasx_xvadd_w(blr2AG, __lasx_xvbsrl_v(blr2AG, 0b100));
    __m256i botAG = __lasx_xvpermi_w(blr2AG, blr1AG, 0b10001000);
    blr1RB = __lasx_xvadd_w(blr1RB, __lasx_xvbsrl_v(blr1RB, 0b100));
    blr2RB = __lasx_xvadd_w(blr2RB, __lasx_xvbsrl_v(blr2RB, 0b100));
    __m256i botRB = __lasx_xvpermi_w(blr2RB, blr1RB, 0b10001000);
    // Combine top and bottom rows; total weight is 256, so drop 8 bits.
    __m256i rAG = __lasx_xvadd_h(topAG, botAG);
    __m256i rRB = __lasx_xvadd_h(topRB, botRB);
    rRB = __lasx_xvsrli_h(rRB, 8);
    /* Correct for hadd */
    rAG = __lasx_xvpermi_d(rAG, 0b11011000);
    rRB = __lasx_xvpermi_d(rRB, 0b11011000);
    // Re-merge AG (high bytes) and RB (low bytes) and store 8 pixels.
    __m256i colorMask1 = __lasx_xvslti_b(colorMask, 0);
    __lasx_xvst(__lasx_xvbitsel_v(rAG, rRB, colorMask1), b, 0);
}
722
723inline void fetchTransformedBilinear_pixelBounds(int, int l1, int l2, int &v1, int &v2)
724{
725 if (v1 < l1)
726 v2 = v1 = l1;
727 else if (v1 >= l2)
728 v2 = v1 = l2;
729 else
730 v2 = v1 + 1;
731 Q_ASSERT(v1 >= l1 && v1 <= l2);
732 Q_ASSERT(v2 >= l1 && v2 <= l2);
733}
734
735void QT_FASTCALL intermediate_adder_lasx(uint *b, uint *end,
736 const IntermediateBuffer &intermediate,
737 int offset, int &fx, int fdx);
738
// Bilinear fetch for a scale-only transform (fdy == 0): first vertically
// interpolates the two source scanlines into an intermediate AG/RB buffer
// (8 pixels per iteration), then lets intermediate_adder_lasx do the
// horizontal interpolation into b..end. fx/fy are 16.16 fixed point.
void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper_lasx(uint *b, uint *end, const QTextureData &image,
                                                                           int &fx, int &fy, int fdx, int /*fdy*/)
{
    // Clamp the two source rows to the image bounds.
    int y1 = (fy >> 16);
    int y2;
    fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
    const uint *s1 = (const uint *)image.scanLine(y1);
    const uint *s2 = (const uint *)image.scanLine(y2);

    // Vertical weights: disty in [0..255], from the fractional part of fy.
    const int disty = (fy & 0x0000ffff) >> 8;
    const int idisty = 256 - disty;
    const int length = end - b;

    // The intermediate buffer is generated in the positive direction
    const int adjust = (fdx < 0) ? fdx * length : 0;
    const int offset = (fx + adjust) >> 16;
    int x = offset;

    IntermediateBuffer intermediate;
    // count is the size used in the intermediate_buffer.
    int count = (qint64(length) * qAbs(fdx) + FixedScale - 1) / FixedScale + 2;
    // length is supposed to be <= BufferSize either because data->m11 < 1 or
    // data->m11 < 2, and any larger buffers split
    Q_ASSERT(count <= BufferSize + 2);
    int f = 0;
    int lim = qMin(count, image.x2 - x);
    if (x < image.x1) {
        // Left edge clamp: repeat the first column until x reaches x1.
        Q_ASSERT(x < image.x2);
        // NOTE(review): the local `b` below shadows the output parameter
        // `b`; here it is the bottom-row pixel, not the output pointer.
        uint t = s1[image.x1];
        uint b = s2[image.x1];
        quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
        quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
        do {
            intermediate.buffer_rb[f] = rb;
            intermediate.buffer_ag[f] = ag;
            f++;
            x++;
        } while (x < image.x1 && f < lim);
    }

    const __m256i disty_ = __lasx_xvreplgr2vr_h(disty);
    const __m256i idisty_ = __lasx_xvreplgr2vr_h(idisty);
    const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);

    lim -= 7;
    for (; f < lim; x += 8, f += 8) {
        // Load 8 pixels from s1, and split the alpha-green and red-blue component
        __m256i top = __lasx_xvld((s1+x), 0);
        __m256i topAG = __lasx_xvsrli_h(top, 8);
        __m256i topRB = __lasx_xvand_v(top, colorMask);
        // Multiplies each color component by idisty
        topAG = __lasx_xvmul_h(topAG, idisty_);
        topRB = __lasx_xvmul_h(topRB, idisty_);

        // Same for the s2 vector
        __m256i bottom = __lasx_xvld((s2+x), 0);
        __m256i bottomAG = __lasx_xvsrli_h(bottom, 8);
        __m256i bottomRB = __lasx_xvand_v(bottom, colorMask);
        bottomAG = __lasx_xvmul_h(bottomAG, disty_);
        bottomRB = __lasx_xvmul_h(bottomRB, disty_);

        // Add the values, and shift to only keep 8 significant bits per colors
        __m256i rAG = __lasx_xvadd_h(topAG, bottomAG);
        rAG = __lasx_xvsrli_h(rAG, 8);
        __lasx_xvst(rAG, (&intermediate.buffer_ag[f]), 0);
        __m256i rRB = __lasx_xvadd_h(topRB, bottomRB);
        rRB = __lasx_xvsrli_h(rRB, 8);
        __lasx_xvst(rRB, (&intermediate.buffer_rb[f]), 0);
    }

    for (; f < count; f++) { // Same as above but without simd
        // Right edge clamp: never read past the last column.
        x = qMin(x, image.x2 - 1);

        uint t = s1[x];
        uint b = s2[x];

        intermediate.buffer_rb[f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
        intermediate.buffer_ag[f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
        x++;
    }

    // Now interpolate the values from the intermediate_buffer to get the final result.
    intermediate_adder_lasx(b, end, intermediate, offset, fx, fdx);
}
823
// Horizontal pass of the simple-scale bilinear fetch: interpolates between
// adjacent columns of the vertically pre-blended rb/ag planes in
// `intermediate`, using the per-pixel 8-bit weight distx taken from fx.
// Writes final ARGB32PM pixels to [b, end) and updates fx for the caller.
void QT_FASTCALL intermediate_adder_lasx(uint *b, uint *end,
                                         const IntermediateBuffer &intermediate,
                                         int offset, int &fx, int fdx)
{
    // Rebase fx so that (fx >> 16) indexes directly into the buffer.
    fx -= offset * FixedScale;

    const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx * 4);      // 4 pixels per vector step
    const __m128i v_blend = __lsx_vreplgr2vr_w(0x00ff00ff);
    // Broadcasts byte 1 of each 32-bit fx lane (the 8-bit distx) into the low
    // byte of both 16-bit halves of that lane; 0xff lanes select zero bytes.
    const __m128i vdx_shuffle = (__m128i)(v16i8){1, char(0xff), 1, char(0xff), 5, char(0xff), 5, char(0xff),
                                                 9, char(0xff), 9, char(0xff), 13, char(0xff), 13, char(0xff)};
    __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};

    while (b < end - 3) {
        // Integer source columns for the next 4 output pixels (intentionally
        // shadows the `offset` parameter, which is no longer needed here).
        v4i32 offset = (v4i32)__lsx_vsrli_w(v_fx, 16);

        // Gather buffer[x] and buffer[x+1] pairwise as 64-bit loads.
        __m256i vrb = (__m256i)(v4i64){*(const long long *)(intermediate.buffer_rb + offset[0]),
                                       *(const long long *)(intermediate.buffer_rb + offset[1]),
                                       *(const long long *)(intermediate.buffer_rb + offset[2]),
                                       *(const long long *)(intermediate.buffer_rb + offset[3])};
        __m256i vag = (__m256i)(v4i64){*(const long long *)(intermediate.buffer_ag + offset[0]),
                                       *(const long long *)(intermediate.buffer_ag + offset[1]),
                                       *(const long long *)(intermediate.buffer_ag + offset[2]),
                                       *(const long long *)(intermediate.buffer_ag + offset[3])};

        // Build per-pixel {idistx, distx} 16-bit weight pairs.
        __m128i vdx = __lsx_vshuf_b(__lsx_vldi(0), v_fx, vdx_shuffle);
        __m128i vidx = __lsx_vsub_h(__lsx_vreplgr2vr_h(256), vdx);
        v2i64 vl = __lsx_vilvl_w(vdx, vidx);
        v2i64 vh = __lsx_vilvh_w(vdx, vidx);
        __m256i vmulx = lasx_set_q(vh, vl);

        vrb = __lasx_xvmul_h(vrb, vmulx);
        vag = __lasx_xvmul_h(vag, vmulx);
        // Sum the [x] and [x+1] contributions (byte-shift right by 4 aligns
        // the neighbour's lane on top of its partner before the add).
        vrb = __lasx_xvadd_w(vrb, __lasx_xvbsrl_v(vrb, 0b100));
        vag = __lasx_xvadd_w(vag, __lasx_xvbsrl_v(vag, 0b100));
        // Recombine the rb/ag planes back into packed ARGB32PM.
        __m256i vrbag = __lasx_xvpickev_w(vag, vrb);
        vrbag = (v4i64)__lasx_xvpermi_d(vrbag, 0b11011000);

        __m128i rb = lasx_extracti128_lo(vrbag);
        __m128i ag = lasx_extracti128_hi(vrbag);

        rb = __lsx_vsrli_h(rb, 8);
        // v_blend (0x00ff00ff) selects rb in the low bytes, ag elsewhere.
        __lsx_vst(__lsx_vbitsel_v(ag, rb, v_blend), (__m128i*)b, 0);
        b += 4;
        v_fx = __lsx_vadd_w(v_fx, v_fdx);
    }
    fx = __lsx_vpickve2gr_w(v_fx, 0);
    // Scalar tail for the remaining (< 4) pixels.
    while (b < end) {
        const int x = (fx >> 16);

        const uint distx = (fx & 0x0000ffff) >> 8;
        const uint idistx = 256 - distx;
        const uint rb = (intermediate.buffer_rb[x] * idistx + intermediate.buffer_rb[x + 1] * distx) & 0xff00ff00;
        const uint ag = (intermediate.buffer_ag[x] * idistx + intermediate.buffer_ag[x + 1] * distx) & 0xff00ff00;
        *b = (rb >> 8) | ag;
        b++;
        fx += fdx;
    }
    // Restore fx to the caller's coordinate base.
    fx += offset * FixedScale;
}
883
// Bilinear ARGB32PM fetch for a pure x-downscale (fdy == 0, so disty is
// constant for the whole span).  Three phases: a bounds-checked prologue
// while the sampled columns clamp to the texture edge, a fast 8-pixel SIMD
// middle part guaranteed to stay inside the texture, and a bounds-checked
// scalar epilogue.
void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_lasx(uint *b, uint *end, const QTextureData &image,
                                                                        int &fx, int &fy, int fdx, int /*fdy*/)
{
    int y1 = (fy >> 16);
    int y2;
    fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
    const uint *s1 = (const uint *)image.scanLine(y1);
    const uint *s2 = (const uint *)image.scanLine(y2);
    // Vertical weight at 8-bit precision, plus a rounded 4-bit version for
    // the 16-pixel SIMD interpolation helper.
    const int disty8 = (fy & 0x0000ffff) >> 8;
    const int disty4 = (disty8 + 0x08) >> 4;

    // Valid x range in 16.16 fixed point.
    const qint64 min_fx = qint64(image.x1) * FixedScale;
    const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
    // Prologue: while both sampled columns clamp to the same pixel, only the
    // vertical interpolation contributes.
    while (b < end) {
        int x1 = (fx >> 16);
        int x2;
        fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
        if (x1 != x2)
            break;
        uint top = s1[x1];
        uint bot = s2[x1];
        *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8);
        fx += fdx;
        ++b;
    }
    // Number of pixels we can produce before fx leaves the texture again.
    uint *boundedEnd = end;
    if (fdx > 0)
        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
    else if (fdx < 0)
        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);

    // A fast middle part without boundary checks
    // Shuffle broadcasting byte 0 of each 32-bit distx value into the low
    // byte of both 16-bit halves of the lane (0xff selects a zero byte).
    const __m256i vdistShuffle = (__m256i)(v32i8){0, char(0xff), 0, char(0xff), 4, char(0xff), 4, char(0xff),
                                                  8, char(0xff), 8, char(0xff), 12, char(0xff), 12, char(0xff),
                                                  0, char(0xff), 0, char(0xff), 4, char(0xff), 4, char(0xff),
                                                  8, char(0xff), 8, char(0xff), 12, char(0xff), 12, char(0xff)};
    const __m256i v_disty = __lasx_xvreplgr2vr_h(disty4);
    const __m256i v_fdx = __lasx_xvreplgr2vr_w(fdx * 8);    // 8 pixels per iteration
    const __m256i v_fx_r = __lasx_xvreplgr2vr_w(0x08);      // rounding bias for the 8->4 bit reduction
    const __m256i v_index = (__m256i)(v8i32){0, 1, 2, 3, 4, 5, 6, 7};
    __m256i v_fx = __lasx_xvreplgr2vr_w(fx);
    v_fx = __lasx_xvadd_w(v_fx, __lasx_xvmul_w(__lasx_xvreplgr2vr_w(fdx), v_index));

    while (b < boundedEnd - 7) {
        const v8i32 offset = (v8i32)__lasx_xvsrli_w(v_fx, 16);

        // Gather pixel pairs [x], [x+1] from both scanlines as 64-bit loads.
        const __m256i toplo = (__m256i)(v4i64){*(const long long *)(s1 + offset[0]), *(const long long *)(s1 + offset[1]),
                                               *(const long long *)(s1 + offset[2]), *(const long long *)(s1 + offset[3])};
        const __m256i tophi = (__m256i)(v4i64){*(const long long *)(s1 + offset[4]), *(const long long *)(s1 + offset[5]),
                                               *(const long long *)(s1 + offset[6]), *(const long long *)(s1 + offset[7])};
        const __m256i botlo = (__m256i)(v4i64){*(const long long *)(s2 + offset[0]), *(const long long *)(s2 + offset[1]),
                                               *(const long long *)(s2 + offset[2]), *(const long long *)(s2 + offset[3])};
        const __m256i bothi = (__m256i)(v4i64){*(const long long *)(s2 + offset[4]), *(const long long *)(s2 + offset[5]),
                                               *(const long long *)(s2 + offset[6]), *(const long long *)(s2 + offset[7])};

        // Reduce the 8-bit distx to rounded 4-bit weights, then broadcast.
        __m256i v_distx = __lasx_xvsrli_h(v_fx, 8);
        v_distx = __lasx_xvsrli_h(__lasx_xvadd_w(v_distx, v_fx_r), 4);
        v_distx = __lasx_xvshuf_b(__lasx_xvldi(0), v_distx, vdistShuffle);

        interpolate_4_pixels_16_lasx(toplo, tophi, botlo, bothi, v_distx, v_disty, b);
        b += 8;
        v_fx = __lasx_xvadd_w(v_fx, v_fdx);
    }
    fx = __lasx_xvpickve2gr_w(v_fx, 0);

    // Scalar remainder of the in-bounds part.
    while (b < boundedEnd) {
        int x = (fx >> 16);
        int distx8 = (fx & 0x0000ffff) >> 8;
        *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
        fx += fdx;
        ++b;
    }

    // Epilogue with full boundary clamping.
    while (b < end) {
        int x1 = (fx >> 16);
        int x2;
        fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
        uint tl = s1[x1];
        uint tr = s1[x2];
        uint bl = s2[x1];
        uint br = s2[x2];
        int distx8 = (fx & 0x0000ffff) >> 8;
        *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
        fx += fdx;
        ++b;
    }
}
971
// Bilinear ARGB32PM fetch for rotating transforms (both fdx and fdy vary).
// Same three-phase layout as the downscale helper, but the scanline offset
// must be recomputed per pixel from fy, so the SIMD middle part derives each
// source address from a vectorized y * bytesPerLine + x computation.
void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_lasx(uint *b, uint *end, const QTextureData &image,
                                                                          int &fx, int &fy, int fdx, int fdy)
{
    // Valid x/y ranges in 16.16 fixed point.
    const qint64 min_fx = qint64(image.x1) * FixedScale;
    const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
    const qint64 min_fy = qint64(image.y1) * FixedScale;
    const qint64 max_fy = qint64(image.y2 - 1) * FixedScale;
    // first handle the possibly bounded part in the beginning
    while (b < end) {
        int x1 = (fx >> 16);
        int x2;
        int y1 = (fy >> 16);
        int y2;
        fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
        fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
        if (x1 != x2 && y1 != y2)
            break;
        const uint *s1 = (const uint *)image.scanLine(y1);
        const uint *s2 = (const uint *)image.scanLine(y2);
        uint tl = s1[x1];
        uint tr = s1[x2];
        uint bl = s2[x1];
        uint br = s2[x2];
        int distx = (fx & 0x0000ffff) >> 8;
        int disty = (fy & 0x0000ffff) >> 8;
        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
        fx += fdx;
        fy += fdy;
        ++b;
    }
    // Number of pixels producible before either coordinate leaves the texture.
    uint *boundedEnd = end;
    if (fdx > 0)
        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
    else if (fdx < 0)
        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
    if (fdy > 0)
        boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy);
    else if (fdy < 0)
        boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy);

    // until boundedEnd we can now have a fast middle part without boundary checks
    // Shuffle broadcasting byte 0 of each 32-bit dist value into the low byte
    // of both 16-bit halves of the lane (0xff selects a zero byte).
    const __m256i vdistShuffle = (__m256i)(v32i8){0, char(0xff), 0, char(0xff), 4, char(0xff), 4, char(0xff), 8, char(0xff), 8, char(0xff), 12, char(0xff), 12, char(0xff),
                                                  0, char(0xff), 0, char(0xff), 4, char(0xff), 4, char(0xff), 8, char(0xff), 8, char(0xff), 12, char(0xff), 12, char(0xff)};
    const __m256i v_fdx = __lasx_xvreplgr2vr_w(fdx * 8);    // 8 pixels per iteration
    const __m256i v_fdy = __lasx_xvreplgr2vr_w(fdy * 8);
    const __m256i v_fxy_r = __lasx_xvreplgr2vr_w(0x08);     // rounding bias for the 8->4 bit reduction
    const __m256i v_index = (__m256i)(v8i32){0, 1, 2, 3, 4, 5, 6, 7};
    __m256i v_fx = __lasx_xvreplgr2vr_w(fx);
    __m256i v_fy = __lasx_xvreplgr2vr_w(fy);
    v_fx = __lasx_xvadd_w(v_fx, __lasx_xvmul_w(__lasx_xvreplgr2vr_w(fdx), v_index));
    v_fy = __lasx_xvadd_w(v_fy, __lasx_xvmul_w(__lasx_xvreplgr2vr_w(fdy), v_index));

    const uchar *textureData = image.imageData;
    const qsizetype bytesPerLine = image.bytesPerLine;
    // Pixels (not bytes) per scanline, as 16-bit lanes for the widening mul.
    const __m256i vbpl = __lasx_xvreplgr2vr_h(bytesPerLine/4);

    while (b < boundedEnd - 7) {
        // Truncate each fy to its integer row, packed into 16-bit lanes.
        const __m256i vy = __lasx_xvpickev_h(__lasx_xvldi(0),
                                             __lasx_xvsat_w(__lasx_xvsrli_w(v_fy, 16), 15));
        // 8x16bit * 8x16bit -> 8x32bit
        __m256i offset = __lasx_xvilvl_h(__lasx_xvmuh_h(vy, vbpl), __lasx_xvmul_h(vy, vbpl));
        offset = __lasx_xvadd_w(offset, __lasx_xvsrli_w(v_fx, 16));

        // s2 is one scanline below s1; the same offsets address both rows.
        const uint *s1 = (const uint *)(textureData);
        const uint *s2 = (const uint *)(textureData + bytesPerLine);
        const __m256i toplo = (__m256i)(v4i64){*(const long long *)(s1+((v8i32)offset)[0]), *(const long long *)(s1+((v8i32)offset)[1]),
                                               *(const long long *)(s1+((v8i32)offset)[2]), *(const long long *)(s1+((v8i32)offset)[3])};
        const __m256i tophi = (__m256i)(v4i64){*(const long long *)(s1+((v8i32)offset)[4]), *(const long long *)(s1+((v8i32)offset)[5]),
                                               *(const long long *)(s1+((v8i32)offset)[6]), *(const long long *)(s1+((v8i32)offset)[7])};
        const __m256i botlo = (__m256i)(v4i64){*(const long long *)(s2+((v8i32)offset)[0]), *(const long long *)(s2+((v8i32)offset)[1]),
                                               *(const long long *)(s2+((v8i32)offset)[2]), *(const long long *)(s2+((v8i32)offset)[3])};
        const __m256i bothi = (__m256i)(v4i64){*(const long long *)(s2+((v8i32)offset)[4]), *(const long long *)(s2+((v8i32)offset)[5]),
                                               *(const long long *)(s2+((v8i32)offset)[6]), *(const long long *)(s2+((v8i32)offset)[7])};

        // Reduce the 8-bit distances to rounded 4-bit weights, then broadcast.
        __m256i v_distx = __lasx_xvsrli_h(v_fx, 8);
        __m256i v_disty = __lasx_xvsrli_h(v_fy, 8);
        v_distx = __lasx_xvsrli_h(__lasx_xvadd_w(v_distx, v_fxy_r), 4);
        v_disty = __lasx_xvsrli_h(__lasx_xvadd_w(v_disty, v_fxy_r), 4);
        v_distx = __lasx_xvshuf_b(__lasx_xvldi(0), v_distx, vdistShuffle);
        v_disty = __lasx_xvshuf_b(__lasx_xvldi(0), v_disty, vdistShuffle);

        interpolate_4_pixels_16_lasx(toplo, tophi, botlo, bothi, v_distx, v_disty, b);
        b += 8;
        v_fx = __lasx_xvadd_w(v_fx, v_fdx);
        v_fy = __lasx_xvadd_w(v_fy, v_fdy);
    }
    fx = __lasx_xvpickve2gr_w(v_fx, 0);
    fy = __lasx_xvpickve2gr_w(v_fy, 0);

    // Scalar remainder of the in-bounds part.
    while (b < boundedEnd) {
        int x = (fx >> 16);
        int y = (fy >> 16);

        const uint *s1 = (const uint *)image.scanLine(y);
        const uint *s2 = (const uint *)image.scanLine(y + 1);

        int distx = (fx & 0x0000ffff) >> 8;
        int disty = (fy & 0x0000ffff) >> 8;
        *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);

        fx += fdx;
        fy += fdy;
        ++b;
    }

    // Epilogue with full boundary clamping.
    while (b < end) {
        int x1 = (fx >> 16);
        int x2;
        int y1 = (fy >> 16);
        int y2;

        fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
        fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);

        const uint *s1 = (const uint *)image.scanLine(y1);
        const uint *s2 = (const uint *)image.scanLine(y2);

        uint tl = s1[x1];
        uint tr = s1[x2];
        uint bl = s2[x1];
        uint br = s2[x2];

        int distx = (fx & 0x0000ffff) >> 8;
        int disty = (fy & 0x0000ffff) >> 8;
        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);

        fx += fdx;
        fy += fdy;
        ++b;
    }
}
1103
1104static inline __m256i epilogueMaskFromCount(qsizetype count)
1105{
1106 Q_ASSERT(count > 0);
1107 static const __m256i offsetMask = (__m256i)(v8i32){0, 1, 2, 3, 4, 5, 6, 7};
1108 return __lasx_xvadd_w(offsetMask, __lasx_xvreplgr2vr_w(-count));
1109}
1110
// Converts `count` ARGB32 (or RGBA8888 when RGBA == true) pixels to
// premultiplied ARGB32.  `buffer` may alias `src` (in-place conversion).
// The main loop handles 8 pixels per iteration with three fast paths:
// all-transparent (store zeros), all-opaque (copy/swizzle only), and the
// general premultiply.  The tail is handled with select-masked vectors.
template<bool RGBA>
static void convertARGBToARGB32PM_lasx(uint *buffer, const uint *src, qsizetype count)
{
    qsizetype i = 0;
    const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
    // Byte shuffle swapping R and B within each pixel (RGBA -> ARGB order).
    const __m256i rgbaMask = (__m256i)(v32i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
                                              2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    // Broadcasts the 16-bit alpha of each widened pixel to all 4 channels.
    const __m256i shuffleMask = (__m256i)(v32i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15,
                                                 6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
    const __m256i half = __lasx_xvreplgr2vr_h(0x0080);  // rounding for x/255 approximation
    const __m256i zero = __lasx_xvldi(0);

    for (; i < count - 7; i += 8) {
        __m256i srcVector = __lasx_xvld(reinterpret_cast<const __m256i *>(src + i), 0);
        // xvmsknz_b packs a per-byte nonzero mask into elements 0 and 4
        // (one per 128-bit half); both zero => all alphas are zero.
        const v8i32 testz = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
        if (testz[0]!=0 || testz[4]!=0){
            // cf: all alpha bytes are 0xff (fully opaque) => no premultiply needed.
            const v8i32 testc = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, alphaMask));
            bool cf = testc[0]==0 && testc[4]==0;
            if (RGBA)
                srcVector = __lasx_xvshuf_b(zero, srcVector, rgbaMask);
            if (!cf) {
                // Widen to 16 bits per channel, multiply by the broadcast
                // alpha, and divide by 255 via (x + (x >> 8) + 0x80) >> 8.
                __m256i src1 = __lasx_xvilvl_b(zero, srcVector);
                __m256i src2 = __lasx_xvilvh_b(zero, srcVector);
                __m256i alpha1 = __lasx_xvshuf_b(zero, src1, shuffleMask);
                __m256i alpha2 = __lasx_xvshuf_b(zero, src2, shuffleMask);
                // Keeps the original alpha (index 11/15 picks from alpha) and
                // the premultiplied color channels.
                __m256i blendMask = (__m256i)(v16i16){0, 1, 2, 11, 4, 5, 6, 15, 0, 1, 2, 11, 4, 5, 6, 15};
                src1 = __lasx_xvmul_h(src1, alpha1);
                src2 = __lasx_xvmul_h(src2, alpha2);
                src1 = __lasx_xvadd_h(src1, __lasx_xvsrli_h(src1, 8));
                src2 = __lasx_xvadd_h(src2, __lasx_xvsrli_h(src2, 8));
                src1 = __lasx_xvadd_h(src1, half);
                src2 = __lasx_xvadd_h(src2, half);
                src1 = __lasx_xvsrli_h(src1, 8);
                src2 = __lasx_xvsrli_h(src2, 8);
                src1 = __lasx_xvshuf_h(blendMask, alpha1, src1);
                src2 = __lasx_xvshuf_h(blendMask, alpha2, src2);
                // Clamp negatives then saturate-pack back to 8-bit channels.
                src1 = __lasx_xvmaxi_h(src1, 0);
                src2 = __lasx_xvmaxi_h(src2, 0);
                srcVector = __lasx_xvpickev_b(__lasx_xvsat_hu(src2, 7), __lasx_xvsat_hu(src1, 7));
                __lasx_xvst(srcVector, reinterpret_cast<__m256i *>(buffer + i), 0);
            } else {
                // Fully opaque: premultiply is a no-op; store only if needed.
                if (buffer != src || RGBA)
                    __lasx_xvst(srcVector, reinterpret_cast<__m256i *>(buffer + i), 0);
            }
        } else {
            // Fully transparent: premultiplied result is all zero.
            __lasx_xvst(zero, reinterpret_cast<__m256i *>(buffer + i), 0);
        }
    }

    if (i < count) {
        // Tail: lanes >= (count - i) are masked out of all tests and stores.
        // NOTE(review): the loads/stores below are full 32-byte accesses past
        // the last pixel (masking is applied only to the values) — presumably
        // the surrounding buffers guarantee readable/writable headroom; the
        // stores rewrite the out-of-range bytes with their existing values.
        const __m256i epilogueMask = epilogueMaskFromCount(count - i);
        const __m256i epilogueMask1 = __lasx_xvslti_w(epilogueMask, 0);
        __m256i srcVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
                                              __lasx_xvld(reinterpret_cast<const int *>(src + i), 0),
                                              epilogueMask1);
        const __m256i epilogueMask2 = __lasx_xvslti_b(epilogueMask, 0);
        const __m256i epilogueAlphaMask = __lasx_xvbitsel_v(__lasx_xvldi(0), alphaMask, epilogueMask2);

        // Same three-way dispatch as the main loop, on the masked vector.
        const v8i32 testz1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, epilogueAlphaMask));
        if (testz1[0]!=0 || testz1[4]!=0){
            const v8i32 testc1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, epilogueAlphaMask));
            bool cf = testc1[0]==0 && testc1[4]==0;
            if (RGBA)
                srcVector = __lasx_xvshuf_b(zero, srcVector, rgbaMask);
            if (!cf) {
                __m256i src1 = __lasx_xvilvl_b(zero, srcVector);
                __m256i src2 = __lasx_xvilvh_b(zero, srcVector);
                __m256i alpha1 = __lasx_xvshuf_b(zero, src1, shuffleMask);
                __m256i alpha2 = __lasx_xvshuf_b(zero, src2, shuffleMask);
                __m256i blendMask = (__m256i)(v16i16){0, 1, 2, 11, 4, 5, 6, 15, 0, 1, 2, 11, 4, 5, 6, 15};
                src1 = __lasx_xvmul_h(src1, alpha1);
                src2 = __lasx_xvmul_h(src2, alpha2);
                src1 = __lasx_xvadd_h(src1, __lasx_xvsrli_h(src1, 8));
                src2 = __lasx_xvadd_h(src2, __lasx_xvsrli_h(src2, 8));
                src1 = __lasx_xvadd_h(src1, half);
                src2 = __lasx_xvadd_h(src2, half);
                src1 = __lasx_xvsrli_h(src1, 8);
                src2 = __lasx_xvsrli_h(src2, 8);
                src1 = __lasx_xvshuf_h(blendMask, alpha1, src1);
                src2 = __lasx_xvshuf_h(blendMask, alpha2, src2);
                src1 = __lasx_xvmaxi_h(src1, 0);
                src2 = __lasx_xvmaxi_h(src2, 0);
                srcVector = __lasx_xvpickev_b(__lasx_xvsat_hu(src2, 7), __lasx_xvsat_hu(src1, 7));
                // Read-modify-write so lanes beyond count keep their values.
                __m256i srcV = __lasx_xvld(reinterpret_cast<int *>(buffer + i), 0);
                srcV = __lasx_xvbitsel_v(srcV, srcVector, epilogueMask1);
                __lasx_xvst(srcV, reinterpret_cast<int *>(buffer + i), 0);
            } else {
                if (buffer != src || RGBA) {
                    __m256i srcV = __lasx_xvld(reinterpret_cast<int *>(buffer + i), 0);
                    srcV = __lasx_xvbitsel_v(srcV, srcVector, epilogueMask1);
                    __lasx_xvst(srcV, reinterpret_cast<int *>(buffer + i), 0);
                }
            }
        } else {
            // Fully transparent tail: zero only the in-range lanes.
            __m256i srcV = __lasx_xvld(reinterpret_cast<int *>(buffer + i), 0);
            srcV = __lasx_xvbitsel_v(srcV, zero, epilogueMask1);
            __lasx_xvst(srcV, reinterpret_cast<int *>(buffer + i), 0);
        }
    }
}
1211
// QPixelLayout hook: in-place ARGB32 -> premultiplied ARGB32 conversion.
void QT_FASTCALL convertARGB32ToARGB32PM_lasx(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_lasx<false>(buffer, buffer, count);
}
1216
// QPixelLayout hook: in-place RGBA8888 -> premultiplied ARGB32 conversion.
void QT_FASTCALL convertRGBA8888ToARGB32PM_lasx(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_lasx<true>(buffer, buffer, count);
}
1221
// Fetch hook: converts `count` ARGB32 pixels starting at `src + index` into
// `buffer` as premultiplied ARGB32 and returns the buffer.
const uint *QT_FASTCALL fetchARGB32ToARGB32PM_lasx(uint *buffer, const uchar *src, int index,
                                                   int count, const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_lasx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
1228
// Fetch hook: converts `count` RGBA8888 pixels starting at `src + index` into
// `buffer` as premultiplied ARGB32 and returns the buffer.
const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_lasx(uint *buffer, const uchar *src, int index, int count,
                                                     const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_lasx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
1235
// Converts `count` 32-bit ARGB32/RGBA8888 pixels to premultiplied 64-bit
// RGBA64 (each 8-bit channel widened to 16 bits by replication).  Output is
// twice the width of the input, so each 8-pixel iteration produces two
// 256-bit stores.  Same three fast paths as the 32-bit converter.
template<bool RGBA>
static void convertARGBToRGBA64PM_lasx(QRgba64 *buffer, const uint *src, qsizetype count)
{
    qsizetype i = 0;
    const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
    // Byte shuffle swapping R and B within each pixel (ARGB -> RGBA order).
    const __m256i rgbaMask = (__m256i)(v32i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
                                              2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
    // Broadcasts the 16-bit alpha of each widened pixel to all 4 channels.
    const __m256i shuffleMask = (__m256i)(v32i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15,
                                                 6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
    const __m256i zero = __lasx_xvldi(0);

    for (; i < count - 7; i += 8) {
        __m256i dst1, dst2;
        __m256i srcVector = __lasx_xvld(reinterpret_cast<const __m256i *>(src + i), 0);
        // xvmsknz_b packs per-byte nonzero masks into elements 0 and 4;
        // both zero => every pixel is fully transparent.
        const v8i32 testz = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
        if (testz[0]!=0 || testz[4]!=0){
            // cf: all alpha bytes are 0xff => skip the premultiply.
            const v8i32 testc = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, alphaMask));
            bool cf = testc[0]==0 && testc[4]==0;
            if (!RGBA)
                srcVector = __lasx_xvshuf_b(zero, srcVector, rgbaMask);

            // The two unpack instructions unpack the low and upper halves of
            // each 128-bit half of the 256-bit register. Here's the tracking
            // of what's where: (p is 32-bit, P is 64-bit)
            // as loaded: [ p1, p2, p3, p4; p5, p6, p7, p8 ]
            // after xvpermi_d [ p1, p2, p5, p6; p3, p4, p7, p8 ]
            // after xvilvl/h [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
            srcVector = __lasx_xvpermi_d(srcVector, 0b11011000);
            // Interleaving a byte with itself maps 0x00..0xff to 0x0000..0xffff.
            const __m256i src1 = __lasx_xvilvl_b(srcVector, srcVector);
            const __m256i src2 = __lasx_xvilvh_b(srcVector, srcVector);
            if (!cf) {
                const __m256i alpha1 = __lasx_xvshuf_b(zero, src1, shuffleMask);
                const __m256i alpha2 = __lasx_xvshuf_b(zero, src2, shuffleMask);
                // Keeps the original 16-bit alpha (indices 11/15) alongside
                // the premultiplied color channels.
                __m256i blendMask = (__m256i)(v16i16){0, 1, 2, 11, 4, 5, 6, 15, 0, 1, 2, 11, 4, 5, 6, 15};
                // 16-bit premultiply: high half of the unsigned product plus
                // a carry correction approximates x * a / 65535.
                dst1 = __lasx_xvmuh_hu(src1, alpha1);
                dst2 = __lasx_xvmuh_hu(src2, alpha2);
                dst1 = __lasx_xvadd_h(dst1, __lasx_xvsrli_h(dst1, 15));
                dst2 = __lasx_xvadd_h(dst2, __lasx_xvsrli_h(dst2, 15));
                dst1 = __lasx_xvshuf_h(blendMask, src1, dst1);
                dst2 = __lasx_xvshuf_h(blendMask, src2, dst2);
            } else {
                dst1 = src1;
                dst2 = src2;
            }
        } else {
            dst1 = dst2 = zero;
        }
        __lasx_xvst(dst1, reinterpret_cast<__m256i *>(buffer + i), 0);
        __lasx_xvst(dst2, reinterpret_cast<__m256i *>(buffer + i) + 1, 0);
    }

    if (i < count) {
        // Tail: lanes >= (count - i) are masked out of all tests and stores.
        // NOTE(review): as in the 32-bit converter, these are full-width
        // vector loads/stores past the last element with value-level masking
        // only — presumably the buffers guarantee headroom; confirm.
        __m256i epilogueMask = epilogueMaskFromCount(count - i);
        const __m256i epilogueMask1 = __lasx_xvslti_w(epilogueMask,0);
        __m256i srcVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
                                              __lasx_xvld(reinterpret_cast<const int *>(src + i), 0),
                                              epilogueMask1);
        __m256i dst1, dst2;
        const __m256i epilogueMask2 = __lasx_xvslti_b(epilogueMask, 0);
        const __m256i epilogueAlphaMask = __lasx_xvbitsel_v(__lasx_xvldi(0),
                                                            alphaMask,
                                                            epilogueMask2);

        // Same three-way dispatch as the main loop, on the masked vector.
        const v8i32 testz1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, epilogueAlphaMask));
        if (testz1[0]!=0 || testz1[4]!=0){
            const v8i32 testc1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, epilogueAlphaMask));
            bool cf = testc1[0]==0 && testc1[4]==0;

            if (!RGBA)
                srcVector = __lasx_xvshuf_b(zero, srcVector, rgbaMask);
            srcVector = __lasx_xvpermi_d(srcVector, 0b11011000);
            const __m256i src1 = __lasx_xvilvl_b(srcVector, srcVector);
            const __m256i src2 = __lasx_xvilvh_b(srcVector, srcVector);
            if (!cf) {
                const __m256i alpha1 = __lasx_xvshuf_b(zero, src1, shuffleMask);
                const __m256i alpha2 = __lasx_xvshuf_b(zero, src2, shuffleMask);
                const __m256i blendMask = (__m256i)(v16i16){0, 1, 2, 11, 4, 5, 6, 15,
                                                            0, 1, 2, 11, 4, 5, 6, 15};
                dst1 = __lasx_xvmuh_hu(src1, alpha1);
                dst2 = __lasx_xvmuh_hu(src2, alpha2);
                dst1 = __lasx_xvadd_h(dst1, __lasx_xvsrli_h(dst1, 15));
                dst2 = __lasx_xvadd_h(dst2, __lasx_xvsrli_h(dst2, 15));
                dst1 = __lasx_xvshuf_h(blendMask, src1, dst1);
                dst2 = __lasx_xvshuf_h(blendMask, src2, dst2);
            } else {
                dst1 = src1;
                dst2 = src2;
            }
        } else {
            dst1 = dst2 = zero;
        }
        // Widen the 32-bit lane mask to 64-bit output lanes (one RGBA64 per
        // source pixel), then read-modify-write the two destination vectors.
        epilogueMask = __lasx_xvpermi_d(epilogueMask, 0b11011000);
        __m256i epilogueMaskl = __lasx_xvslti_d(__lasx_xvilvl_w(epilogueMask, epilogueMask), 0);
        __m256i epilogueMaskh = __lasx_xvslti_d(__lasx_xvilvh_w(epilogueMask, epilogueMask), 0);
        __m256i dst1V = __lasx_xvld(reinterpret_cast<qint64 *>(buffer + i), 0);
        dst1V = __lasx_xvbitsel_v(dst1V, dst1, epilogueMaskl);
        __lasx_xvst(dst1V, reinterpret_cast<qint64 *>(buffer + i), 0);
        __m256i dst2V = __lasx_xvld(reinterpret_cast<qint64 *>(buffer + i + 4), 0);
        dst2V = __lasx_xvbitsel_v(dst2V, dst2, epilogueMaskh);
        __lasx_xvst(dst2V, reinterpret_cast<qint64 *>(buffer + i + 4), 0);
    }
}
1338
// Conversion hook: ARGB32 -> premultiplied RGBA64 into `buffer`.
const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_lasx(QRgba64 *buffer, const uint *src, int count,
                                                         const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lasx<false>(buffer, src, count);
    return buffer;
}
1345
// Conversion hook: RGBA8888 -> premultiplied RGBA64 into `buffer`.
const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_lasx(QRgba64 *buffer, const uint *src, int count,
                                                           const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lasx<true>(buffer, src, count);
    return buffer;
}
1352
// Fetch hook: converts `count` ARGB32 pixels starting at `src + index` into
// `buffer` as premultiplied RGBA64 and returns the buffer.
const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_lasx(QRgba64 *buffer, const uchar *src, int index, int count,
                                                      const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lasx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
1359
// Fetch hook: converts `count` RGBA8888 pixels starting at `src + index` into
// `buffer` as premultiplied RGBA64 and returns the buffer.
const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_lasx(QRgba64 *buffer, const uchar *src, int index, int count,
                                                        const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_lasx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}
1366
1367QT_END_NAMESPACE
1368
1369#endif