Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
qdrawhelper_lasx.cpp
1// Copyright (C) 2024 Loongson Technology Corporation Limited.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
7#include "qrgba64_p.h"
8
9#if defined(QT_COMPILER_SUPPORTS_LASX)
10
11QT_BEGIN_NAMESPACE
12
13enum {
14 FixedScale = 1 << 16,
15 HalfPoint = 1 << 15
16};
17
18#ifdef Q_CC_CLANG
19#define VREGS_PREFIX "$vr"
20#define XREGS_PREFIX "$xr"
21#else // GCC
22#define VREGS_PREFIX "$f"
23#define XREGS_PREFIX "$f"
24#endif
25#define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
26
27// Convert two __m128i to __m256i
28static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo)
29{
30 __m256i out;
31 __asm__ volatile (
32 ".irp i," __ALL_REGS "\n\t"
33 " .ifc %[hi], " VREGS_PREFIX "\\i \n\t"
34 " .irp j," __ALL_REGS "\n\t"
35 " .ifc %[lo], " VREGS_PREFIX "\\j \n\t"
36 " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
37 " .endif \n\t"
38 " .endr \n\t"
39 " .endif \n\t"
40 ".endr \n\t"
41 ".ifnc %[out], %[hi] \n\t"
42 ".irp i," __ALL_REGS "\n\t"
43 " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
44 " .irp j," __ALL_REGS "\n\t"
45 " .ifc %[hi], " VREGS_PREFIX "\\j \n\t"
46 " xvori.b $xr\\i, $xr\\j, 0 \n\t"
47 " .endif \n\t"
48 " .endr \n\t"
49 " .endif \n\t"
50 ".endr \n\t"
51 ".endif \n\t"
52 : [out] "=f" (out), [hi] "+f" (inhi)
53 : [lo] "f" (inlo)
54 );
55 return out;
56}
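// The .irp/.ifc blocks above (and in the two extract helpers below) loop over
// all 32 register names at assembly time to discover which physical registers
// the compiler assigned to the operands, then emit a single xvpermi.q /
// xvori.b / vori.b on exactly those registers. Since the LSX $vr registers
// alias the low half of the LASX $xr registers, the same index works for
// both; the trick compensates for the lack of an intrinsic that combines or
// splits 128-bit and 256-bit vectors while leaving register allocation to the
// compiler.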
57
58// Convert __m256i low part to __m128i
59static inline __m128i lasx_extracti128_lo(__m256i in)
60{
61 __m128i out;
62 __asm__ volatile (
63 ".ifnc %[out], %[in] \n\t"
64 ".irp i," __ALL_REGS "\n\t"
65 " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
66 " .irp j," __ALL_REGS "\n\t"
67 " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
68 " vori.b $vr\\i, $vr\\j, 0 \n\t"
69 " .endif \n\t"
70 " .endr \n\t"
71 " .endif \n\t"
72 ".endr \n\t"
73 ".endif \n\t"
74 : [out] "=f" (out) : [in] "f" (in)
75 );
76 return out;
77}
78
79// Convert __m256i high part to __m128i
80static inline __m128i lasx_extracti128_hi(__m256i in)
81{
82 __m128i out;
83 __asm__ volatile (
84 ".irp i," __ALL_REGS "\n\t"
85 " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
86 " .irp j," __ALL_REGS "\n\t"
87 " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
88 " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t"
89 " .endif \n\t"
90 " .endr \n\t"
91 " .endif \n\t"
92 ".endr \n\t"
93 : [out] "=f" (out) : [in] "f" (in)
94 );
95 return out;
96}
97
98// Vectorized blend functions:
99
100// See BYTE_MUL_LSX for details.
101inline static void Q_DECL_VECTORCALL
102BYTE_MUL_LASX(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
103{
104 __m256i pixelVectorAG = __lasx_xvsrli_h(pixelVector, 8);
105 __m256i pixelVectorRB = __lasx_xvand_v(pixelVector, colorMask);
106
107 pixelVectorAG = __lasx_xvmul_h(pixelVectorAG, alphaChannel);
108 pixelVectorRB = __lasx_xvmul_h(pixelVectorRB, alphaChannel);
109
110 pixelVectorRB = __lasx_xvadd_h(pixelVectorRB, __lasx_xvsrli_h(pixelVectorRB, 8));
111 pixelVectorRB = __lasx_xvadd_h(pixelVectorRB, half);
112 pixelVectorAG = __lasx_xvadd_h(pixelVectorAG, __lasx_xvsrli_h(pixelVectorAG, 8));
113 pixelVectorAG = __lasx_xvadd_h(pixelVectorAG, half);
114
115 pixelVectorRB = __lasx_xvsrli_h(pixelVectorRB, 8);
116 pixelVectorAG = __lasx_xvandn_v(colorMask, pixelVectorAG);
117
118 pixelVector = __lasx_xvor_v(pixelVectorAG, pixelVectorRB);
119}
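// Per 16-bit lane the sequence above is the usual division-by-255
// approximation: multiply, fold the high byte back in, add 0x80 to round,
// shift right by 8. A minimal scalar sketch of one channel, for illustration
// only (the name is ours; the per-pixel scalar helper actually used further
// down in this file is BYTE_MUL()):
static inline uint byte_mul_channel_sketch(uint x, uint a) // x, a in [0, 255]
{
    uint t = x * a;   // at most 255 * 255 = 65025, fits in 16 bits
    t += t >> 8;      // fold the high byte back in
    t += 0x80;        // round
    return t >> 8;    // close approximation of x * a / 255
}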
120
121inline static void Q_DECL_VECTORCALL
122BYTE_MUL_RGB64_LASX(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
123{
124 __m256i pixelVectorAG = __lasx_xvsrli_w(pixelVector, 16);
125 __m256i pixelVectorRB = __lasx_xvand_v(pixelVector, colorMask);
126
127 pixelVectorAG = __lasx_xvmul_w(pixelVectorAG, alphaChannel);
128 pixelVectorRB = __lasx_xvmul_w(pixelVectorRB, alphaChannel);
129
130 pixelVectorRB = __lasx_xvadd_w(pixelVectorRB, __lasx_xvsrli_w(pixelVectorRB, 16));
131 pixelVectorAG = __lasx_xvadd_w(pixelVectorAG, __lasx_xvsrli_w(pixelVectorAG, 16));
132 pixelVectorRB = __lasx_xvadd_w(pixelVectorRB, half);
133 pixelVectorAG = __lasx_xvadd_w(pixelVectorAG, half);
134
135 pixelVectorRB = __lasx_xvsrli_w(pixelVectorRB, 16);
136 pixelVectorAG = __lasx_xvandn_v(colorMask, pixelVectorAG);
137
138 pixelVector = __lasx_xvor_v(pixelVectorAG, pixelVectorRB);
139}
140
141// See INTERPOLATE_PIXEL_255_LSX for details.
142inline static void Q_DECL_VECTORCALL
143INTERPOLATE_PIXEL_255_LASX(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel,
144 __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
145{
146 const __m256i srcVectorAG = __lasx_xvsrli_h(srcVector, 8);
147 const __m256i dstVectorAG = __lasx_xvsrli_h(dstVector, 8);
148 const __m256i srcVectorRB = __lasx_xvand_v(srcVector, colorMask);
149 const __m256i dstVectorRB = __lasx_xvand_v(dstVector, colorMask);
150 const __m256i srcVectorAGalpha = __lasx_xvmul_h(srcVectorAG, alphaChannel);
151 const __m256i srcVectorRBalpha = __lasx_xvmul_h(srcVectorRB, alphaChannel);
152 const __m256i dstVectorAGoneMinusAlpha = __lasx_xvmul_h(dstVectorAG, oneMinusAlphaChannel);
153 const __m256i dstVectorRBoneMinusAlpha = __lasx_xvmul_h(dstVectorRB, oneMinusAlphaChannel);
154 __m256i finalAG = __lasx_xvadd_h(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
155 __m256i finalRB = __lasx_xvadd_h(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
156 finalAG = __lasx_xvadd_h(finalAG, __lasx_xvsrli_h(finalAG, 8));
157 finalRB = __lasx_xvadd_h(finalRB, __lasx_xvsrli_h(finalRB, 8));
158 finalAG = __lasx_xvadd_h(finalAG, half);
159 finalRB = __lasx_xvadd_h(finalRB, half);
160 finalAG = __lasx_xvandn_v(colorMask, finalAG);
161 finalRB = __lasx_xvsrli_h(finalRB, 8);
162
163 dstVector = __lasx_xvor_v(finalAG, finalRB);
164}
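// Same rounding as BYTE_MUL_LASX, but here two sources are blended with
// weights that sum to 255, so each 16-bit lane stays at or below 255 * 255
// and cannot overflow. A scalar sketch of one channel, for illustration only
// (the name is ours; the per-pixel scalar helper used further down in this
// file is INTERPOLATE_PIXEL_255()):
static inline uint interpolate_channel_255_sketch(uint s, uint a, uint d, uint ia) // a + ia == 255
{
    uint t = s * a + d * ia;  // at most 255 * 255
    t += t >> 8;
    t += 0x80;
    return t >> 8;            // close approximation of (s*a + d*(255 - a)) / 255
}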
165
166inline static void Q_DECL_VECTORCALL
167INTERPOLATE_PIXEL_RGB64_LASX(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel,
168 __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
169{
170 const __m256i srcVectorAG = __lasx_xvsrli_w(srcVector, 16);
171 const __m256i dstVectorAG = __lasx_xvsrli_w(dstVector, 16);
172 const __m256i srcVectorRB = __lasx_xvand_v(srcVector, colorMask);
173 const __m256i dstVectorRB = __lasx_xvand_v(dstVector, colorMask);
174 const __m256i srcVectorAGalpha = __lasx_xvmul_w(srcVectorAG, alphaChannel);
175 const __m256i srcVectorRBalpha = __lasx_xvmul_w(srcVectorRB, alphaChannel);
176 const __m256i dstVectorAGoneMinusAlpha = __lasx_xvmul_w(dstVectorAG, oneMinusAlphaChannel);
177 const __m256i dstVectorRBoneMinusAlpha = __lasx_xvmul_w(dstVectorRB, oneMinusAlphaChannel);
178 __m256i finalAG = __lasx_xvadd_w(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
179 __m256i finalRB = __lasx_xvadd_w(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
180 finalAG = __lasx_xvadd_w(finalAG, __lasx_xvsrli_w(finalAG, 16));
181 finalRB = __lasx_xvadd_w(finalRB, __lasx_xvsrli_w(finalRB, 16));
182 finalAG = __lasx_xvadd_w(finalAG, half);
183 finalRB = __lasx_xvadd_w(finalRB, half);
184 finalAG = __lasx_xvandn_v(colorMask, finalAG);
185 finalRB = __lasx_xvsrli_w(finalRB, 16);
186
187 dstVector = __lasx_xvor_v(finalAG, finalRB);
188}
189
190// See BLEND_SOURCE_OVER_ARGB32_LSX for details.
191inline static void Q_DECL_VECTORCALL BLEND_SOURCE_OVER_ARGB32_LASX(quint32 *dst, const quint32 *src, const int length)
192{
193 const __m256i half = __lasx_xvreplgr2vr_h(0x80);
194 const __m256i one = __lasx_xvreplgr2vr_h(0xff);
195 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
196 const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
197 const __m256i offsetMask = (__m256i)(v8i32){0, 1, 2, 3, 4, 5, 6, 7};
198 const __m256i offsetMaskr = (__m256i)(v8i32){7, 6, 5, 4, 3, 2, 1, 0};
199 const __m256i alphaShuffleMask = (__m256i)(v32u8){3, 0xff, 3, 0xff, 7, 0xff, 7, 0xff, 11, 0xff, 11, 0xff, 15, 0xff, 15, 0xff,
200 3, 0xff, 3, 0xff, 7, 0xff, 7, 0xff, 11, 0xff, 11, 0xff, 15, 0xff, 15, 0xff};
201
202 const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> 2) & 0x7;
203
204 int x = 0;
205 // Prologue to handle all pixels until dst is 32-byte aligned in one step.
206 if (minusOffsetToAlignDstOn32Bytes != 0 && x < (length - 7)) {
207 const __m256i prologueMask = __lasx_xvsub_w(__lasx_xvreplgr2vr_w(minusOffsetToAlignDstOn32Bytes - 1), offsetMaskr);
208 const __m256i prologueMask1 = __lasx_xvslti_w(prologueMask, 0);
209 const __m256i srcVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
210 __lasx_xvld((const int *)&src[x], 0),
211 prologueMask1);
212 const __m256i prologueMask2 = __lasx_xvslti_b(prologueMask, 0);
213 const __m256i prologueAlphaMask = __lasx_xvbitsel_v(__lasx_xvldi(0),
214 alphaMask,
215 prologueMask2);
216 const v8i32 testz1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, prologueAlphaMask));
217
218 if (testz1[0]!=0 || testz1[4]!=0) {
219 const v8i32 testc1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector,
220 prologueAlphaMask));
221 __m256i dstVector = __lasx_xvld((int *)&dst[x], 0);
222 if (testc1[0]==0 && testc1[4]==0) {
223 __lasx_xvst(__lasx_xvbitsel_v(dstVector, srcVector, prologueMask1), (int *)&dst[x], 0);
224 } else {
225 __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0),
226 srcVector,
227 alphaShuffleMask);
228 alphaChannel = __lasx_xvsub_h(one, alphaChannel);
229 __m256i dstV = dstVector;
230 BYTE_MUL_LASX(dstVector, alphaChannel, colorMask, half);
231 dstVector = __lasx_xvadd_b(dstVector, srcVector);
232 __lasx_xvst(__lasx_xvbitsel_v(dstV, dstVector, prologueMask1), (int *)&dst[x], 0);
233 }
234 }
235 x += (8 - minusOffsetToAlignDstOn32Bytes);
236 }
237
238 for (; x < (length - 7); x += 8) {
239 const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
240 const v8i32 testz2 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
241 if (testz2[0]!=0 || testz2[4]!=0) {
242 const v8i32 testc2 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, alphaMask));
243 if (testc2[0]==0 && testc2[4]==0) {
244 __lasx_xvst(srcVector, (__m256i *)&dst[x], 0);
245 } else {
246 __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
247 alphaChannel = __lasx_xvsub_h(one, alphaChannel);
248 __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
249 BYTE_MUL_LASX(dstVector, alphaChannel, colorMask, half);
250 dstVector = __lasx_xvadd_b(dstVector, srcVector);
251 __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
252 }
253 }
254 }
255
256 // Epilogue to handle all remaining pixels in one step.
257 if (x < length) {
258 const __m256i epilogueMask = __lasx_xvadd_w(offsetMask, __lasx_xvreplgr2vr_w(x - length));
259 const __m256i epilogueMask1 = __lasx_xvslti_w(epilogueMask, 0);
260 const __m256i srcVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
261 __lasx_xvld((const int *)&src[x], 0),
262 epilogueMask1);
263 const __m256i epilogueMask2 = __lasx_xvslti_b(epilogueMask,0);
264 const __m256i epilogueAlphaMask = __lasx_xvbitsel_v(__lasx_xvldi(0),
265 alphaMask,
266 epilogueMask2);
267 const v8i32 testz3 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, epilogueAlphaMask));
268
269 if (testz3[0]!=0 || testz3[4]!=0) {
270 const v8i32 testc3 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector,
271 epilogueAlphaMask));
272 if (testc3[0]==0 && testc3[4]==0) {
273 __m256i srcV = __lasx_xvld((int *)&dst[x], 0);
274 __lasx_xvst(__lasx_xvbitsel_v(srcV, srcVector, epilogueMask1), (int *)&dst[x], 0);
275 } else {
276 __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
277 alphaChannel = __lasx_xvsub_h(one, alphaChannel);
278 __m256i dstVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
279 __lasx_xvld((int *)&dst[x], 0),
280 epilogueMask1);
281 BYTE_MUL_LASX(dstVector, alphaChannel, colorMask, half);
282 dstVector = __lasx_xvadd_b(dstVector, srcVector);
283 __m256i dstV = __lasx_xvld((int *)&dst[x], 0);
284 __lasx_xvst(__lasx_xvbitsel_v(dstV, dstVector, epilogueMask1), (int *)&dst[x], 0);
285 }
286 }
287 }
288}
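// Per pixel the loop above is plain source-over; the vector tests add two
// shortcuts: a vector whose source alphas are all zero is skipped, and one
// whose source alphas are all 255 is stored verbatim. The scalar helper
// blend_pixel() used by the other blend routines in this file does roughly
// the following (sketch with our own name, illustrative only):
static inline void blend_pixel_sketch(quint32 &dst, quint32 src)
{
    const uint alpha = qAlpha(src);
    if (alpha == 255)
        dst = src;                               // opaque source replaces dst
    else if (alpha != 0)
        dst = src + BYTE_MUL(dst, 255 - alpha);  // dst' = src + dst * (1 - alpha)
    // alpha == 0: a fully transparent source leaves dst untouched
}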
289
290// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX for details.
291inline static void Q_DECL_VECTORCALL
292BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LASX(quint32 *dst, const quint32 *src, const int length, const int const_alpha)
293{
294 int x = 0;
295
296 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
297 blend_pixel(dst[x], src[x], const_alpha);
298
299 const __m256i half = __lasx_xvreplgr2vr_h(0x80);
300 const __m256i one = __lasx_xvreplgr2vr_h(0xff);
301 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
302 const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
303 const __m256i alphaShuffleMask = (__m256i)(v32i8){3,char(0xff),3,char(0xff),7,char(0xff),7,char(0xff),11,char(0xff),11,char(0xff),15,char(0xff),15,char(0xff),
304 3,char(0xff),3,char(0xff),7,char(0xff),7,char(0xff),11,char(0xff),11,char(0xff),15,char(0xff),15,char(0xff)};
305 const __m256i constAlphaVector = __lasx_xvreplgr2vr_h(const_alpha);
306 for (; x < (length - 7); x += 8) {
307 __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
308 const v8i32 testz = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
309 if (testz[0]!=0 || testz[4]!=0) {
310 BYTE_MUL_LASX(srcVector, constAlphaVector, colorMask, half);
311
312 __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
313 alphaChannel = __lasx_xvsub_h(one, alphaChannel);
314 __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
315 BYTE_MUL_LASX(dstVector, alphaChannel, colorMask, half);
316 dstVector = __lasx_xvadd_b(dstVector, srcVector);
317 __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
318 }
319 }
320 SIMD_EPILOGUE(x, length, 7)
321 blend_pixel(dst[x], src[x], const_alpha);
322}
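// Same blend as the routine above, with the source first scaled by the
// constant alpha; the scalar fallback blend_pixel(dst[x], src[x], const_alpha)
// used in the prologue and epilogue amounts to (sketch with our own name,
// illustrative only):
static inline void blend_pixel_const_alpha_sketch(quint32 &dst, quint32 src, uint const_alpha)
{
    src = BYTE_MUL(src, const_alpha);         // pre-scale the source by the constant alpha
    dst = src + BYTE_MUL(dst, qAlpha(~src));  // then ordinary source-over
}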
323
324void qt_blend_argb32_on_argb32_lasx(uchar *destPixels, int dbpl,
325 const uchar *srcPixels, int sbpl,
326 int w, int h,
327 int const_alpha)
328{
329 if (const_alpha == 256) {
330 for (int y = 0; y < h; ++y) {
331 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
332 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
333 BLEND_SOURCE_OVER_ARGB32_LASX(dst, src, w);
334 destPixels += dbpl;
335 srcPixels += sbpl;
336 }
337 } else if (const_alpha != 0) {
338 const_alpha = (const_alpha * 255) >> 8;
339 for (int y = 0; y < h; ++y) {
340 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
341 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
342 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LASX(dst, src, w, const_alpha);
343 destPixels += dbpl;
344 srcPixels += sbpl;
345 }
346 }
347}
348
349void qt_blend_rgb32_on_rgb32_lasx(uchar *destPixels, int dbpl,
350 const uchar *srcPixels, int sbpl,
351 int w, int h,
352 int const_alpha)
353{
354 if (const_alpha == 256) {
355 for (int y = 0; y < h; ++y) {
356 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
357 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
358 ::memcpy(dst, src, w * sizeof(uint));
359 srcPixels += sbpl;
360 destPixels += dbpl;
361 }
362 return;
363 }
364 if (const_alpha == 0)
365 return;
366
367 const __m256i half = __lasx_xvreplgr2vr_h(0x80);
368 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
369
370 const_alpha = (const_alpha * 255) >> 8;
371 int one_minus_const_alpha = 255 - const_alpha;
372 const __m256i constAlphaVector = __lasx_xvreplgr2vr_h(const_alpha);
373 const __m256i oneMinusConstAlpha = __lasx_xvreplgr2vr_h(one_minus_const_alpha);
374 for (int y = 0; y < h; ++y) {
375 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
376 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
377 int x = 0;
378
379 // First, align dest to 32 bytes:
380 ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
381 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
382
383 // 2) interpolate pixels with LASX
384 for (; x < (w - 7); x += 8) {
385 const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
386 __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
387 INTERPOLATE_PIXEL_255_LASX(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
388 __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
389 }
390
391 // 3) Epilogue
392 SIMD_EPILOGUE(x, w, 7)
393 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
394
395 srcPixels += sbpl;
396 destPixels += dbpl;
397 }
398}
399
400static Q_NEVER_INLINE
401void Q_DECL_VECTORCALL qt_memfillXX_lasx(uchar *dest, __m256i value256, qsizetype bytes)
402{
403 __m128i value128 = *(__m128i*)(&value256);
404
405 // main body
406 __m256i *dst256 = reinterpret_cast<__m256i *>(dest);
407 uchar *end = dest + bytes;
408 while (reinterpret_cast<uchar *>(dst256 + 4) <= end) {
409 __lasx_xvst(value256, dst256 + 0, 0);
410 __lasx_xvst(value256, dst256 + 1, 0);
411 __lasx_xvst(value256, dst256 + 2, 0);
412 __lasx_xvst(value256, dst256 + 3, 0);
413 dst256 += 4;
414 }
415
416 // first epilogue: fewer than 128 bytes / 32 entries
417 bytes = end - reinterpret_cast<uchar *>(dst256);
418 switch (bytes / sizeof(value256)) {
419 case 3: __lasx_xvst(value256, dst256++, 0); Q_FALLTHROUGH();
420 case 2: __lasx_xvst(value256, dst256++, 0); Q_FALLTHROUGH();
421 case 1: __lasx_xvst(value256, dst256++, 0);
422 }
423
424 // second epilogue: fewer than 32 bytes
425 __m128i *dst128 = reinterpret_cast<__m128i *>(dst256);
426 if (bytes & sizeof(value128))
427 __lsx_vst(value128, dst128++, 0);
428
429 // third epilogue: fewer than 16 bytes
430 if (bytes & 8)
431 __lasx_xvstelm_d(value256, reinterpret_cast<__m128i *>(end - 8), 0, 0);
432}
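// Tail handling: after the 4 x 32-byte main loop fewer than 128 bytes remain.
// The switch stores the remaining whole 32-byte vectors, bit 4 of 'bytes'
// selects one extra 16-byte store, and bit 3 one final 8-byte store (the
// xvstelm_d at end - 8). Both callers pass a multiple of 8 bytes --
// qt_memfill64_lasx by construction, qt_memfill32_lasx because it peels one
// leading pixel off an odd count -- so nothing smaller than 8 bytes is left.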
433
434void qt_memfill64_lasx(quint64 *dest, quint64 value, qsizetype count)
435{
436 __m256i value256 = __lasx_xvreplgr2vr_d(value);
437
438 qt_memfillXX_lasx(reinterpret_cast<uchar *>(dest), value256, count * sizeof(quint64));
439}
440
441void qt_memfill32_lasx(quint32 *dest, quint32 value, qsizetype count)
442{
443 if (count % 2) {
444 // odd number of pixels, round to even
445 *dest++ = value;
446 --count;
447 }
448 qt_memfillXX_lasx(reinterpret_cast<uchar *>(dest), __lasx_xvreplgr2vr_w(value), count * sizeof(quint32));
449}
450
451void QT_FASTCALL comp_func_SourceOver_lasx(uint *destPixels, const uint *srcPixels,
452 int length, uint const_alpha)
453{
454 Q_ASSERT(const_alpha < 256);
455
456 const quint32 *src = (const quint32 *) srcPixels;
457 quint32 *dst = (quint32 *) destPixels;
458
459 if (const_alpha == 255)
460 BLEND_SOURCE_OVER_ARGB32_LASX(dst, src, length);
461 else
462 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LASX(dst, src, length, const_alpha);
463}
464
465#if QT_CONFIG(raster_64bit)
466void QT_FASTCALL comp_func_SourceOver_rgb64_lasx(QRgba64 *dst, const QRgba64 *src,
467 int length, uint const_alpha)
468{
469 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
470 const __m256i half = __lasx_xvreplgr2vr_w(0x8000);
471 const __m256i one = __lasx_xvreplgr2vr_w(0xffff);
472 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x0000ffff);
473 __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
474 alphaMask = __lasx_xvilvl_b(alphaMask, alphaMask);
475 const __m256i alphaShuffleMask = (__m256i)(v32i8){6,7,char(0xff),char(0xff),6,7,char(0xff),char(0xff),14,15,char(0xff),char(0xff),14,15,char(0xff),char(0xff),
476 6,7,char(0xff),char(0xff),6,7,char(0xff),char(0xff),14,15,char(0xff),char(0xff),14,15,char(0xff),char(0xff)};
477
478 if (const_alpha == 255) {
479 int x = 0;
480 for (; x < length && (quintptr(dst + x) & 31); ++x)
481 blend_pixel(dst[x], src[x]);
482 for (; x < length - 3; x += 4) {
483 const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
484 const v8i32 testz1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
485 if (testz1[0]!=0 || testz1[4]!=0){
486 const v8i32 testc1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, alphaMask));
487 if (testc1[0]==0 && testc1[4]==0){
488 __lasx_xvst(srcVector, &dst[x], 0);
489 } else {
490 __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
491 alphaChannel = __lasx_xvsub_w(one, alphaChannel);
492 __m256i dstVector = __lasx_xvld(&dst[x], 0);
493 BYTE_MUL_RGB64_LASX(dstVector, alphaChannel, colorMask, half);
494 dstVector = __lasx_xvadd_h(dstVector, srcVector);
495 __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
496 }
497 }
498 }
499 SIMD_EPILOGUE(x, length, 3)
500 blend_pixel(dst[x], src[x]);
501 } else {
502 const __m256i constAlphaVector = __lasx_xvreplgr2vr_w(const_alpha | (const_alpha << 8));
503 int x = 0;
504 for (; x < length && (quintptr(dst + x) & 31); ++x)
505 blend_pixel(dst[x], src[x], const_alpha);
506 for (; x < length - 3; x += 4) {
507 __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
508 const v8i32 testz = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
509 if (testz[0]!=0 || testz[4]!=0){
510 // Not all transparent
511 BYTE_MUL_RGB64_LASX(srcVector, constAlphaVector, colorMask, half);
512 __m256i alphaChannel = __lasx_xvshuf_b(__lasx_xvldi(0), srcVector, alphaShuffleMask);
513 alphaChannel = __lasx_xvsub_w(one, alphaChannel);
514 __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
515 BYTE_MUL_RGB64_LASX(dstVector, alphaChannel, colorMask, half);
516 dstVector = __lasx_xvadd_h(dstVector, srcVector);
517 __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
518 }
519 }
520 SIMD_EPILOGUE(x, length, 3)
521 blend_pixel(dst[x], src[x], const_alpha);
522 }
523}
524#endif
525
526void QT_FASTCALL comp_func_Source_lasx(uint *dst, const uint *src, int length, uint const_alpha)
527{
528 if (const_alpha == 255) {
529 ::memcpy(dst, src, length * sizeof(uint));
530 } else {
531 const int ialpha = 255 - const_alpha;
532
533 int x = 0;
534
535 // 1) prologue, align on 32 bytes
536 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
537 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
538
539 // 2) interpolate pixels with LASX
540 const __m256i half = __lasx_xvreplgr2vr_h(0x80);
541 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
542 const __m256i constAlphaVector = __lasx_xvreplgr2vr_h(const_alpha);
543 const __m256i oneMinusConstAlpha = __lasx_xvreplgr2vr_h(ialpha);
544 for (; x < length - 7; x += 8) {
545 const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
546 __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
547 INTERPOLATE_PIXEL_255_LASX(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
548 __lasx_xvst(dstVector, (__m256i *)&dst[x], 0);
549 }
550
551 // 3) Epilogue
552 SIMD_EPILOGUE(x, length, 7)
553 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
554 }
555}
556
557#if QT_CONFIG(raster_64bit)
558void QT_FASTCALL comp_func_Source_rgb64_lasx(QRgba64 *dst, const QRgba64 *src,
559 int length, uint const_alpha)
560{
561 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
562 if (const_alpha == 255) {
563 ::memcpy(dst, src, length * sizeof(QRgba64));
564 } else {
565 const uint ca = const_alpha | (const_alpha << 8); // adjust to [0-65535]
566 const uint cia = 65535 - ca;
567
568 int x = 0;
569
570 // 1) prologue, align on 32 bytes
571 for (; x < length && (quintptr(dst + x) & 31); ++x)
572 dst[x] = interpolate65535(src[x], ca, dst[x], cia);
573
574        // 2) interpolate pixels with LASX
575 const __m256i half = __lasx_xvreplgr2vr_w(0x8000);
576 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x0000ffff);
577 const __m256i constAlphaVector = __lasx_xvreplgr2vr_w(ca);
578 const __m256i oneMinusConstAlpha = __lasx_xvreplgr2vr_w(cia);
579 for (; x < length - 3; x += 4) {
580 const __m256i srcVector = __lasx_xvld((const __m256i *)&src[x], 0);
581 __m256i dstVector = __lasx_xvld((__m256i *)&dst[x], 0);
582 INTERPOLATE_PIXEL_RGB64_LASX(srcVector, dstVector, constAlphaVector,
583 oneMinusConstAlpha, colorMask, half);
584 __lasx_xvst(dstVector, &dst[x], 0);
585 }
586
587 // 3) Epilogue
588 SIMD_EPILOGUE(x, length, 3)
589 dst[x] = interpolate65535(src[x], ca, dst[x], cia);
590 }
591}
592#endif
593
594void QT_FASTCALL comp_func_solid_SourceOver_lasx(uint *destPixels, int length,
595 uint color, uint const_alpha)
596{
597 if ((const_alpha & qAlpha(color)) == 255) {
598 qt_memfill32(destPixels, color, length);
599 } else {
600 if (const_alpha != 255)
601 color = BYTE_MUL(color, const_alpha);
602
603 const quint32 minusAlphaOfColor = qAlpha(~color);
604 int x = 0;
605
606 quint32 *dst = (quint32 *) destPixels;
607 const __m256i colorVector = __lasx_xvreplgr2vr_w(color);
608 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
609 const __m256i half = __lasx_xvreplgr2vr_h(0x80);
610 const __m256i minusAlphaOfColorVector = __lasx_xvreplgr2vr_h(minusAlphaOfColor);
611
612 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
613 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
614
615 for (; x < length - 7; x += 8) {
616 __m256i dstVector = __lasx_xvld(&dst[x], 0);
617 BYTE_MUL_LASX(dstVector, minusAlphaOfColorVector, colorMask, half);
618 dstVector = __lasx_xvadd_b(colorVector, dstVector);
619 __lasx_xvst(dstVector, &dst[x], 0);
620 }
621 SIMD_EPILOGUE(x, length, 7)
622 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
623 }
624}
625
626#if QT_CONFIG(raster_64bit)
627void QT_FASTCALL comp_func_solid_SourceOver_rgb64_lasx(QRgba64 *destPixels, int length,
628 QRgba64 color, uint const_alpha)
629{
630 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
631 if (const_alpha == 255 && color.isOpaque()) {
632 qt_memfill64((quint64*)destPixels, color, length);
633 } else {
634 if (const_alpha != 255)
635 color = multiplyAlpha255(color, const_alpha);
636
637 const uint minusAlphaOfColor = 65535 - color.alpha();
638 int x = 0;
639 quint64 *dst = (quint64 *) destPixels;
640 const __m256i colorVector = __lasx_xvreplgr2vr_d(color);
641 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x0000ffff);
642 const __m256i half = __lasx_xvreplgr2vr_w(0x8000);
643 const __m256i minusAlphaOfColorVector = __lasx_xvreplgr2vr_w(minusAlphaOfColor);
644
645 for (; x < length && (quintptr(dst + x) & 31); ++x)
646 destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
647
648 for (; x < length - 3; x += 4) {
649 __m256i dstVector = __lasx_xvld(&dst[x], 0);
650 BYTE_MUL_RGB64_LASX(dstVector, minusAlphaOfColorVector, colorMask, half);
651 dstVector = __lasx_xvadd_h(colorVector, dstVector);
652 __lasx_xvst(dstVector, &dst[x], 0);
653 }
654 SIMD_EPILOGUE(x, length, 3)
655 destPixels[x] = color + multiplyAlpha65535(destPixels[x], minusAlphaOfColor);
656 }
657}
658#endif
659
660static inline void interpolate_4_pixels_16_lasx(const __m256i tlr1, const __m256i tlr2, const __m256i blr1,
661 const __m256i blr2, __m256i distx, __m256i disty, uint *b)
662{
663 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
664 const __m256i v_256 = __lasx_xvreplgr2vr_h(256);
665
666 /* Correct for later unpack */
667 const __m256i vdistx = __lasx_xvpermi_d(distx, 0b11011000);
668 const __m256i vdisty = __lasx_xvpermi_d(disty, 0b11011000);
669
670 __m256i dxdy = __lasx_xvmul_h(vdistx, vdisty);
671 const __m256i distx_ = __lasx_xvslli_h(vdistx, 4);
672 const __m256i disty_ = __lasx_xvslli_h(vdisty, 4);
673 __m256i idxidy = __lasx_xvadd_h(dxdy, __lasx_xvsub_h(v_256, __lasx_xvadd_h(distx_, disty_)));
674 __m256i dxidy = __lasx_xvsub_h(distx_, dxdy);
675 __m256i idxdy = __lasx_xvsub_h(disty_, dxdy);
676
677 __m256i tlr1AG = __lasx_xvsrli_h(tlr1, 8);
678 __m256i tlr1RB = __lasx_xvand_v(tlr1, colorMask);
679 __m256i tlr2AG = __lasx_xvsrli_h(tlr2, 8);
680 __m256i tlr2RB = __lasx_xvand_v(tlr2, colorMask);
681 __m256i blr1AG = __lasx_xvsrli_h(blr1, 8);
682 __m256i blr1RB = __lasx_xvand_v(blr1, colorMask);
683 __m256i blr2AG = __lasx_xvsrli_h(blr2, 8);
684 __m256i blr2RB = __lasx_xvand_v(blr2, colorMask);
685
686 __m256i odxidy1 = __lasx_xvilvl_w(dxidy, idxidy);
687 __m256i odxidy2 = __lasx_xvilvh_w(dxidy, idxidy);
688 tlr1AG = __lasx_xvmul_h(tlr1AG, odxidy1);
689 tlr1RB = __lasx_xvmul_h(tlr1RB, odxidy1);
690 tlr2AG = __lasx_xvmul_h(tlr2AG, odxidy2);
691 tlr2RB = __lasx_xvmul_h(tlr2RB, odxidy2);
692 __m256i odxdy1 = __lasx_xvilvl_w(dxdy, idxdy);
693 __m256i odxdy2 = __lasx_xvilvh_w(dxdy, idxdy);
694 blr1AG = __lasx_xvmul_h(blr1AG, odxdy1);
695 blr1RB = __lasx_xvmul_h(blr1RB, odxdy1);
696 blr2AG = __lasx_xvmul_h(blr2AG, odxdy2);
697 blr2RB = __lasx_xvmul_h(blr2RB, odxdy2);
698
699    /* Add the values, and shift to only keep 8 significant bits per color */
700 tlr1AG = __lasx_xvadd_w(tlr1AG, __lasx_xvbsrl_v(tlr1AG, 0b100));
701 tlr2AG = __lasx_xvadd_w(tlr2AG, __lasx_xvbsrl_v(tlr2AG, 0b100));
702 __m256i topAG = __lasx_xvpermi_w(tlr2AG, tlr1AG, 0b10001000);
703 tlr1RB = __lasx_xvadd_w(tlr1RB, __lasx_xvbsrl_v(tlr1RB, 0b100));
704 tlr2RB = __lasx_xvadd_w(tlr2RB, __lasx_xvbsrl_v(tlr2RB, 0b100));
705 __m256i topRB = __lasx_xvpermi_w(tlr2RB, tlr1RB, 0b10001000);
706 blr1AG = __lasx_xvadd_w(blr1AG, __lasx_xvbsrl_v(blr1AG, 0b100));
707 blr2AG = __lasx_xvadd_w(blr2AG, __lasx_xvbsrl_v(blr2AG, 0b100));
708 __m256i botAG = __lasx_xvpermi_w(blr2AG, blr1AG, 0b10001000);
709 blr1RB = __lasx_xvadd_w(blr1RB, __lasx_xvbsrl_v(blr1RB, 0b100));
710 blr2RB = __lasx_xvadd_w(blr2RB, __lasx_xvbsrl_v(blr2RB, 0b100));
711 __m256i botRB = __lasx_xvpermi_w(blr2RB, blr1RB, 0b10001000);
712 __m256i rAG = __lasx_xvadd_h(topAG, botAG);
713 __m256i rRB = __lasx_xvadd_h(topRB, botRB);
714 rRB = __lasx_xvsrli_h(rRB, 8);
715 /* Correct for hadd */
716 rAG = __lasx_xvpermi_d(rAG, 0b11011000);
717 rRB = __lasx_xvpermi_d(rRB, 0b11011000);
718 __m256i colorMask1 = __lasx_xvslti_b(colorMask, 0);
719 __lasx_xvst(__lasx_xvbitsel_v(rAG, rRB, colorMask1), b, 0);
720}
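// With distx and disty reduced to 4-bit precision (0..16), the four weights
// computed above factor as
//     idxidy = dx*dy + 256 - 16*dx - 16*dy = (16 - dx) * (16 - dy)   (top left)
//     dxidy  = 16*dx - dx*dy               = dx * (16 - dy)          (top right)
//     idxdy  = 16*dy - dx*dy               = (16 - dx) * dy          (bottom left)
//     dxdy   = dx * dy                                               (bottom right)
// They always sum to 256, so the weighted sum only needs a shift by 8: done
// explicitly for the RB halves, and implicitly for AG by keeping the high
// byte of each 16-bit lane.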
721
722inline void fetchTransformedBilinear_pixelBounds(int, int l1, int l2, int &v1, int &v2)
723{
724 if (v1 < l1)
725 v2 = v1 = l1;
726 else if (v1 >= l2)
727 v2 = v1 = l2;
728 else
729 v2 = v1 + 1;
730 Q_ASSERT(v1 >= l1 && v1 <= l2);
731 Q_ASSERT(v2 >= l1 && v2 <= l2);
732}
733
734void QT_FASTCALL intermediate_adder_lasx(uint *b, uint *end,
735 const IntermediateBuffer &intermediate,
736 int offset, int &fx, int fdx);
737
738void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper_lasx(uint *b, uint *end, const QTextureData &image,
739 int &fx, int &fy, int fdx, int /*fdy*/)
740{
741 int y1 = (fy >> 16);
742 int y2;
743 fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
744 const uint *s1 = (const uint *)image.scanLine(y1);
745 const uint *s2 = (const uint *)image.scanLine(y2);
746
747 const int disty = (fy & 0x0000ffff) >> 8;
748 const int idisty = 256 - disty;
749 const int length = end - b;
750
751 // The intermediate buffer is generated in the positive direction
752 const int adjust = (fdx < 0) ? fdx * length : 0;
753 const int offset = (fx + adjust) >> 16;
754 int x = offset;
755
756 IntermediateBuffer intermediate;
757 // count is the size used in the intermediate_buffer.
758 int count = (qint64(length) * qAbs(fdx) + FixedScale - 1) / FixedScale + 2;
759    // length is supposed to be <= BufferSize, either because data->m11 < 1 or
760    // because data->m11 < 2 and any larger buffers are split
761 Q_ASSERT(count <= BufferSize + 2);
762 int f = 0;
763 int lim = qMin(count, image.x2 - x);
764 if (x < image.x1) {
765 Q_ASSERT(x < image.x2);
766 uint t = s1[image.x1];
767 uint b = s2[image.x1];
768 quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
769 quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
770 do {
771 intermediate.buffer_rb[f] = rb;
772 intermediate.buffer_ag[f] = ag;
773 f++;
774 x++;
775 } while (x < image.x1 && f < lim);
776 }
777
778 const __m256i disty_ = __lasx_xvreplgr2vr_h(disty);
779 const __m256i idisty_ = __lasx_xvreplgr2vr_h(idisty);
780 const __m256i colorMask = __lasx_xvreplgr2vr_w(0x00ff00ff);
781
782 lim -= 7;
783 for (; f < lim; x += 8, f += 8) {
784        // Load 8 pixels from s1, and split the alpha-green and red-blue components
785 __m256i top = __lasx_xvld((s1+x), 0);
786 __m256i topAG = __lasx_xvsrli_h(top, 8);
787 __m256i topRB = __lasx_xvand_v(top, colorMask);
788 // Multiplies each color component by idisty
789 topAG = __lasx_xvmul_h(topAG, idisty_);
790 topRB = __lasx_xvmul_h(topRB, idisty_);
791
792 // Same for the s2 vector
793 __m256i bottom = __lasx_xvld((s2+x), 0);
794 __m256i bottomAG = __lasx_xvsrli_h(bottom, 8);
795 __m256i bottomRB = __lasx_xvand_v(bottom, colorMask);
796 bottomAG = __lasx_xvmul_h(bottomAG, disty_);
797 bottomRB = __lasx_xvmul_h(bottomRB, disty_);
798
799        // Add the values, and shift to only keep 8 significant bits per color
800 __m256i rAG = __lasx_xvadd_h(topAG, bottomAG);
801 rAG = __lasx_xvsrli_h(rAG, 8);
802 __lasx_xvst(rAG, (&intermediate.buffer_ag[f]), 0);
803 __m256i rRB = __lasx_xvadd_h(topRB, bottomRB);
804 rRB = __lasx_xvsrli_h(rRB, 8);
805 __lasx_xvst(rRB, (&intermediate.buffer_rb[f]), 0);
806 }
807
808    for (; f < count; f++) { // Same as above but without SIMD
809 x = qMin(x, image.x2 - 1);
810
811 uint t = s1[x];
812 uint b = s2[x];
813
814 intermediate.buffer_rb[f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
815 intermediate.buffer_ag[f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
816 x++;
817 }
818
819 // Now interpolate the values from the intermediate_buffer to get the final result.
820 intermediate_adder_lasx(b, end, intermediate, offset, fx, fdx);
821}
822
823void QT_FASTCALL intermediate_adder_lasx(uint *b, uint *end,
824 const IntermediateBuffer &intermediate,
825 int offset, int &fx, int fdx)
826{
827 fx -= offset * FixedScale;
828
829 const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx * 4);
830 const __m128i v_blend = __lsx_vreplgr2vr_w(0x00ff00ff);
831 const __m128i vdx_shuffle = (__m128i)(v16i8){1, char(0xff), 1, char(0xff), 5, char(0xff), 5, char(0xff),
832 9, char(0xff), 9, char(0xff), 13, char(0xff), 13, char(0xff)};
833 __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};
834
835 while (b < end - 3) {
836 v4i32 offset = (v4i32)__lsx_vsrli_w(v_fx, 16);
837
838 __m256i vrb = (__m256i)(v4i64){*(const long long *)(intermediate.buffer_rb + offset[0]),
839 *(const long long *)(intermediate.buffer_rb + offset[1]),
840 *(const long long *)(intermediate.buffer_rb + offset[2]),
841 *(const long long *)(intermediate.buffer_rb + offset[3])};
842 __m256i vag = (__m256i)(v4i64){*(const long long *)(intermediate.buffer_ag + offset[0]),
843 *(const long long *)(intermediate.buffer_ag + offset[1]),
844 *(const long long *)(intermediate.buffer_ag + offset[2]),
845 *(const long long *)(intermediate.buffer_ag + offset[3])};
846
847 __m128i vdx = __lsx_vshuf_b(__lsx_vldi(0), v_fx, vdx_shuffle);
848 __m128i vidx = __lsx_vsub_h(__lsx_vreplgr2vr_h(256), vdx);
849 v2i64 vl = __lsx_vilvl_w(vdx, vidx);
850 v2i64 vh = __lsx_vilvh_w(vdx, vidx);
851 __m256i vmulx = lasx_set_q(vh, vl);
852
853 vrb = __lasx_xvmul_h(vrb, vmulx);
854 vag = __lasx_xvmul_h(vag, vmulx);
855 vrb = __lasx_xvadd_w(vrb, __lasx_xvbsrl_v(vrb, 0b100));
856 vag = __lasx_xvadd_w(vag, __lasx_xvbsrl_v(vag, 0b100));
857 __m256i vrbag = __lasx_xvpickev_w(vag, vrb);
858 vrbag = (v4i64)__lasx_xvpermi_d(vrbag, 0b11011000);
859
860 __m128i rb = lasx_extracti128_lo(vrbag);
861 __m128i ag = lasx_extracti128_hi(vrbag);
862
863 rb = __lsx_vsrli_h(rb, 8);
864 __lsx_vst(__lsx_vbitsel_v(ag, rb, v_blend), (__m128i*)b, 0);
865 b += 4;
866 v_fx = __lsx_vadd_w(v_fx, v_fdx);
867 }
868 fx = __lsx_vpickve2gr_w(v_fx, 0);
869 while (b < end) {
870 const int x = (fx >> 16);
871
872 const uint distx = (fx & 0x0000ffff) >> 8;
873 const uint idistx = 256 - distx;
874 const uint rb = (intermediate.buffer_rb[x] * idistx + intermediate.buffer_rb[x + 1] * distx) & 0xff00ff00;
875 const uint ag = (intermediate.buffer_ag[x] * idistx + intermediate.buffer_ag[x + 1] * distx) & 0xff00ff00;
876 *b = (rb >> 8) | ag;
877 b++;
878 fx += fdx;
879 }
880 fx += offset * FixedScale;
881}
882
883void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_lasx(uint *b, uint *end, const QTextureData &image,
884 int &fx, int &fy, int fdx, int /*fdy*/)
885{
886 int y1 = (fy >> 16);
887 int y2;
888 fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
889 const uint *s1 = (const uint *)image.scanLine(y1);
890 const uint *s2 = (const uint *)image.scanLine(y2);
891 const int disty8 = (fy & 0x0000ffff) >> 8;
892 const int disty4 = (disty8 + 0x08) >> 4;
893
894 const qint64 min_fx = qint64(image.x1) * FixedScale;
895 const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
896 while (b < end) {
897 int x1 = (fx >> 16);
898 int x2;
899 fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
900 if (x1 != x2)
901 break;
902 uint top = s1[x1];
903 uint bot = s2[x1];
904 *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8);
905 fx += fdx;
906 ++b;
907 }
908 uint *boundedEnd = end;
909 if (fdx > 0)
910 boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
911 else if (fdx < 0)
912 boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
913
914 // A fast middle part without boundary checks
915 const __m256i vdistShuffle = (__m256i)(v32i8){0, char(0xff), 0, char(0xff), 4, char(0xff), 4, char(0xff),
916 8, char(0xff), 8, char(0xff), 12, char(0xff), 12, char(0xff),
917 0, char(0xff), 0, char(0xff), 4, char(0xff), 4, char(0xff),
918 8, char(0xff), 8, char(0xff), 12, char(0xff), 12, char(0xff)};
919 const __m256i v_disty = __lasx_xvreplgr2vr_h(disty4);
920 const __m256i v_fdx = __lasx_xvreplgr2vr_w(fdx * 8);
921 const __m256i v_fx_r = __lasx_xvreplgr2vr_w(0x08);
922 const __m256i v_index = (__m256i)(v8i32){0, 1, 2, 3, 4, 5, 6, 7};
923 __m256i v_fx = __lasx_xvreplgr2vr_w(fx);
924 v_fx = __lasx_xvadd_w(v_fx, __lasx_xvmul_w(__lasx_xvreplgr2vr_w(fdx), v_index));
925
926 while (b < boundedEnd - 7) {
927 const v8i32 offset = (v8i32)__lasx_xvsrli_w(v_fx, 16);
928
929 const __m256i toplo = (__m256i)(v4i64){*(const long long *)(s1 + offset[0]), *(const long long *)(s1 + offset[1]),
930 *(const long long *)(s1 + offset[2]), *(const long long *)(s1 + offset[3])};
931 const __m256i tophi = (__m256i)(v4i64){*(const long long *)(s1 + offset[4]), *(const long long *)(s1 + offset[5]),
932 *(const long long *)(s1 + offset[6]), *(const long long *)(s1 + offset[7])};
933 const __m256i botlo = (__m256i)(v4i64){*(const long long *)(s2 + offset[0]), *(const long long *)(s2 + offset[1]),
934 *(const long long *)(s2 + offset[2]), *(const long long *)(s2 + offset[3])};
935 const __m256i bothi = (__m256i)(v4i64){*(const long long *)(s2 + offset[4]), *(const long long *)(s2 + offset[5]),
936 *(const long long *)(s2 + offset[6]), *(const long long *)(s2 + offset[7])};
937
938 __m256i v_distx = __lasx_xvsrli_h(v_fx, 8);
939 v_distx = __lasx_xvsrli_h(__lasx_xvadd_w(v_distx, v_fx_r), 4);
940 v_distx = __lasx_xvshuf_b(__lasx_xvldi(0), v_distx, vdistShuffle);
941
942 interpolate_4_pixels_16_lasx(toplo, tophi, botlo, bothi, v_distx, v_disty, b);
943 b += 8;
944 v_fx = __lasx_xvadd_w(v_fx, v_fdx);
945 }
946 fx = __lasx_xvpickve2gr_w(v_fx, 0);
947
948 while (b < boundedEnd) {
949 int x = (fx >> 16);
950 int distx8 = (fx & 0x0000ffff) >> 8;
951 *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
952 fx += fdx;
953 ++b;
954 }
955
956 while (b < end) {
957 int x1 = (fx >> 16);
958 int x2;
959 fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
960 uint tl = s1[x1];
961 uint tr = s1[x2];
962 uint bl = s2[x1];
963 uint br = s2[x2];
964 int distx8 = (fx & 0x0000ffff) >> 8;
965 *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
966 fx += fdx;
967 ++b;
968 }
969}
970
971void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_lasx(uint *b, uint *end, const QTextureData &image,
972 int &fx, int &fy, int fdx, int fdy)
973{
974 const qint64 min_fx = qint64(image.x1) * FixedScale;
975 const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
976 const qint64 min_fy = qint64(image.y1) * FixedScale;
977 const qint64 max_fy = qint64(image.y2 - 1) * FixedScale;
978 // first handle the possibly bounded part in the beginning
979 while (b < end) {
980 int x1 = (fx >> 16);
981 int x2;
982 int y1 = (fy >> 16);
983 int y2;
984 fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
985 fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
986 if (x1 != x2 && y1 != y2)
987 break;
988 const uint *s1 = (const uint *)image.scanLine(y1);
989 const uint *s2 = (const uint *)image.scanLine(y2);
990 uint tl = s1[x1];
991 uint tr = s1[x2];
992 uint bl = s2[x1];
993 uint br = s2[x2];
994 int distx = (fx & 0x0000ffff) >> 8;
995 int disty = (fy & 0x0000ffff) >> 8;
996 *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
997 fx += fdx;
998 fy += fdy;
999 ++b;
1000 }
1001 uint *boundedEnd = end;
1002 if (fdx > 0)
1003 boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
1004 else if (fdx < 0)
1005 boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
1006 if (fdy > 0)
1007 boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy);
1008 else if (fdy < 0)
1009 boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy);
1010
1011 // until boundedEnd we can now have a fast middle part without boundary checks
1012 const __m256i vdistShuffle = (__m256i)(v32i8){0, char(0xff), 0, char(0xff), 4, char(0xff), 4, char(0xff), 8, char(0xff), 8, char(0xff), 12, char(0xff), 12, char(0xff),
1013 0, char(0xff), 0, char(0xff), 4, char(0xff), 4, char(0xff), 8, char(0xff), 8, char(0xff), 12, char(0xff), 12, char(0xff)};
1014 const __m256i v_fdx = __lasx_xvreplgr2vr_w(fdx * 8);
1015 const __m256i v_fdy = __lasx_xvreplgr2vr_w(fdy * 8);
1016 const __m256i v_fxy_r = __lasx_xvreplgr2vr_w(0x08);
1017 const __m256i v_index = (__m256i)(v8i32){0, 1, 2, 3, 4, 5, 6, 7};
1018 __m256i v_fx = __lasx_xvreplgr2vr_w(fx);
1019 __m256i v_fy = __lasx_xvreplgr2vr_w(fy);
1020 v_fx = __lasx_xvadd_w(v_fx, __lasx_xvmul_w(__lasx_xvreplgr2vr_w(fdx), v_index));
1021 v_fy = __lasx_xvadd_w(v_fy, __lasx_xvmul_w(__lasx_xvreplgr2vr_w(fdy), v_index));
1022
1023 const uchar *textureData = image.imageData;
1024 const qsizetype bytesPerLine = image.bytesPerLine;
1025 const __m256i vbpl = __lasx_xvreplgr2vr_h(bytesPerLine/4);
1026
1027 while (b < boundedEnd - 7) {
1028 const __m256i vy = __lasx_xvpickev_h(__lasx_xvldi(0),
1029 __lasx_xvsat_w(__lasx_xvsrli_w(v_fy, 16), 15));
1030 // 8x16bit * 8x16bit -> 8x32bit
1031 __m256i offset = __lasx_xvilvl_h(__lasx_xvmuh_h(vy, vbpl), __lasx_xvmul_h(vy, vbpl));
1032 offset = __lasx_xvadd_w(offset, __lasx_xvsrli_w(v_fx, 16));
1033
1034 const uint *s1 = (const uint *)(textureData);
1035 const uint *s2 = (const uint *)(textureData + bytesPerLine);
1036 const __m256i toplo = (__m256i)(v4i64){*(const long long *)(s1+((v8i32)offset)[0]), *(const long long *)(s1+((v8i32)offset)[1]),
1037 *(const long long *)(s1+((v8i32)offset)[2]), *(const long long *)(s1+((v8i32)offset)[3])};
1038 const __m256i tophi = (__m256i)(v4i64){*(const long long *)(s1+((v8i32)offset)[4]), *(const long long *)(s1+((v8i32)offset)[5]),
1039 *(const long long *)(s1+((v8i32)offset)[6]), *(const long long *)(s1+((v8i32)offset)[7])};
1040 const __m256i botlo = (__m256i)(v4i64){*(const long long *)(s2+((v8i32)offset)[0]), *(const long long *)(s2+((v8i32)offset)[1]),
1041 *(const long long *)(s2+((v8i32)offset)[2]), *(const long long *)(s2+((v8i32)offset)[3])};
1042 const __m256i bothi = (__m256i)(v4i64){*(const long long *)(s2+((v8i32)offset)[4]), *(const long long *)(s2+((v8i32)offset)[5]),
1043 *(const long long *)(s2+((v8i32)offset)[6]), *(const long long *)(s2+((v8i32)offset)[7])};
1044
1045 __m256i v_distx = __lasx_xvsrli_h(v_fx, 8);
1046 __m256i v_disty = __lasx_xvsrli_h(v_fy, 8);
1047 v_distx = __lasx_xvsrli_h(__lasx_xvadd_w(v_distx, v_fxy_r), 4);
1048 v_disty = __lasx_xvsrli_h(__lasx_xvadd_w(v_disty, v_fxy_r), 4);
1049 v_distx = __lasx_xvshuf_b(__lasx_xvldi(0), v_distx, vdistShuffle);
1050 v_disty = __lasx_xvshuf_b(__lasx_xvldi(0), v_disty, vdistShuffle);
1051
1052 interpolate_4_pixels_16_lasx(toplo, tophi, botlo, bothi, v_distx, v_disty, b);
1053 b += 8;
1054 v_fx = __lasx_xvadd_w(v_fx, v_fdx);
1055 v_fy = __lasx_xvadd_w(v_fy, v_fdy);
1056 }
1057 fx = __lasx_xvpickve2gr_w(v_fx, 0);
1058 fy = __lasx_xvpickve2gr_w(v_fy, 0);
1059
1060 while (b < boundedEnd) {
1061 int x = (fx >> 16);
1062 int y = (fy >> 16);
1063
1064 const uint *s1 = (const uint *)image.scanLine(y);
1065 const uint *s2 = (const uint *)image.scanLine(y + 1);
1066
1067 int distx = (fx & 0x0000ffff) >> 8;
1068 int disty = (fy & 0x0000ffff) >> 8;
1069 *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
1070
1071 fx += fdx;
1072 fy += fdy;
1073 ++b;
1074 }
1075
1076 while (b < end) {
1077 int x1 = (fx >> 16);
1078 int x2;
1079 int y1 = (fy >> 16);
1080 int y2;
1081
1082 fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
1083 fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
1084
1085 const uint *s1 = (const uint *)image.scanLine(y1);
1086 const uint *s2 = (const uint *)image.scanLine(y2);
1087
1088 uint tl = s1[x1];
1089 uint tr = s1[x2];
1090 uint bl = s2[x1];
1091 uint br = s2[x2];
1092
1093 int distx = (fx & 0x0000ffff) >> 8;
1094 int disty = (fy & 0x0000ffff) >> 8;
1095 *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
1096
1097 fx += fdx;
1098 fy += fdy;
1099 ++b;
1100 }
1101}
1102
1103static inline __m256i epilogueMaskFromCount(qsizetype count)
1104{
1105 Q_ASSERT(count > 0);
1106 static const __m256i offsetMask = (__m256i)(v8i32){0, 1, 2, 3, 4, 5, 6, 7};
1107 return __lasx_xvadd_w(offsetMask, __lasx_xvreplgr2vr_w(-count));
1108}
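// Example: for count == 3 this returns {-3, -2, -1, 0, 1, 2, 3, 4}. The
// xvslti_w / xvslti_b comparisons against zero turn exactly the first 'count'
// lanes into all-ones masks, and the xvbitsel_v selects in the epilogues use
// them to update only the lanes that correspond to pixels inside the buffer,
// writing the previously loaded destination contents back unchanged elsewhere.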
1109
1110template<bool RGBA>
1111static void convertARGBToARGB32PM_lasx(uint *buffer, const uint *src, qsizetype count)
1112{
1113 qsizetype i = 0;
1114 const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
1115 const __m256i rgbaMask = (__m256i)(v32i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
1116 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
1117 const __m256i shuffleMask = (__m256i)(v32i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15,
1118 6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
1119 const __m256i half = __lasx_xvreplgr2vr_h(0x0080);
1120 const __m256i zero = __lasx_xvldi(0);
1121
1122 for (; i < count - 7; i += 8) {
1123 __m256i srcVector = __lasx_xvld(reinterpret_cast<const __m256i *>(src + i), 0);
1124 const v8i32 testz = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
1125 if (testz[0]!=0 || testz[4]!=0){
1126 const v8i32 testc = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, alphaMask));
1127 bool cf = testc[0]==0 && testc[4]==0;
1128 if (RGBA)
1129 srcVector = __lasx_xvshuf_b(zero, srcVector, rgbaMask);
1130 if (!cf) {
1131 __m256i src1 = __lasx_xvilvl_b(zero, srcVector);
1132 __m256i src2 = __lasx_xvilvh_b(zero, srcVector);
1133 __m256i alpha1 = __lasx_xvshuf_b(zero, src1, shuffleMask);
1134 __m256i alpha2 = __lasx_xvshuf_b(zero, src2, shuffleMask);
1135 __m256i blendMask = (__m256i)(v16i16){0, 1, 2, 11, 4, 5, 6, 15, 0, 1, 2, 11, 4, 5, 6, 15};
1136 src1 = __lasx_xvmul_h(src1, alpha1);
1137 src2 = __lasx_xvmul_h(src2, alpha2);
1138 src1 = __lasx_xvadd_h(src1, __lasx_xvsrli_h(src1, 8));
1139 src2 = __lasx_xvadd_h(src2, __lasx_xvsrli_h(src2, 8));
1140 src1 = __lasx_xvadd_h(src1, half);
1141 src2 = __lasx_xvadd_h(src2, half);
1142 src1 = __lasx_xvsrli_h(src1, 8);
1143 src2 = __lasx_xvsrli_h(src2, 8);
1144 src1 = __lasx_xvshuf_h(blendMask, alpha1, src1);
1145 src2 = __lasx_xvshuf_h(blendMask, alpha2, src2);
1146 src1 = __lasx_xvmaxi_h(src1, 0);
1147 src2 = __lasx_xvmaxi_h(src2, 0);
1148 srcVector = __lasx_xvpickev_b(__lasx_xvsat_hu(src2, 7), __lasx_xvsat_hu(src1, 7));
1149 __lasx_xvst(srcVector, reinterpret_cast<__m256i *>(buffer + i), 0);
1150 } else {
1151 if (buffer != src || RGBA)
1152 __lasx_xvst(srcVector, reinterpret_cast<__m256i *>(buffer + i), 0);
1153 }
1154 } else {
1155 __lasx_xvst(zero, reinterpret_cast<__m256i *>(buffer + i), 0);
1156 }
1157 }
1158
1159 if (i < count) {
1160 const __m256i epilogueMask = epilogueMaskFromCount(count - i);
1161 const __m256i epilogueMask1 = __lasx_xvslti_w(epilogueMask, 0);
1162 __m256i srcVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
1163 __lasx_xvld(reinterpret_cast<const int *>(src + i), 0),
1164 epilogueMask1);
1165 const __m256i epilogueMask2 = __lasx_xvslti_b(epilogueMask, 0);
1166 const __m256i epilogueAlphaMask = __lasx_xvbitsel_v(__lasx_xvldi(0), alphaMask, epilogueMask2);
1167
1168 const v8i32 testz1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, epilogueAlphaMask));
1169 if (testz1[0]!=0 || testz1[4]!=0){
1170 const v8i32 testc1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, epilogueAlphaMask));
1171 bool cf = testc1[0]==0 && testc1[4]==0;
1172 if (RGBA)
1173 srcVector = __lasx_xvshuf_b(zero, srcVector, rgbaMask);
1174 if (!cf) {
1175 __m256i src1 = __lasx_xvilvl_b(zero, srcVector);
1176 __m256i src2 = __lasx_xvilvh_b(zero, srcVector);
1177 __m256i alpha1 = __lasx_xvshuf_b(zero, src1, shuffleMask);
1178 __m256i alpha2 = __lasx_xvshuf_b(zero, src2, shuffleMask);
1179 __m256i blendMask = (__m256i)(v16i16){0, 1, 2, 11, 4, 5, 6, 15, 0, 1, 2, 11, 4, 5, 6, 15};
1180 src1 = __lasx_xvmul_h(src1, alpha1);
1181 src2 = __lasx_xvmul_h(src2, alpha2);
1182 src1 = __lasx_xvadd_h(src1, __lasx_xvsrli_h(src1, 8));
1183 src2 = __lasx_xvadd_h(src2, __lasx_xvsrli_h(src2, 8));
1184 src1 = __lasx_xvadd_h(src1, half);
1185 src2 = __lasx_xvadd_h(src2, half);
1186 src1 = __lasx_xvsrli_h(src1, 8);
1187 src2 = __lasx_xvsrli_h(src2, 8);
1188 src1 = __lasx_xvshuf_h(blendMask, alpha1, src1);
1189 src2 = __lasx_xvshuf_h(blendMask, alpha2, src2);
1190 src1 = __lasx_xvmaxi_h(src1, 0);
1191 src2 = __lasx_xvmaxi_h(src2, 0);
1192 srcVector = __lasx_xvpickev_b(__lasx_xvsat_hu(src2, 7), __lasx_xvsat_hu(src1, 7));
1193 __m256i srcV = __lasx_xvld(reinterpret_cast<int *>(buffer + i), 0);
1194 srcV = __lasx_xvbitsel_v(srcV, srcVector, epilogueMask1);
1195 __lasx_xvst(srcV, reinterpret_cast<int *>(buffer + i), 0);
1196 } else {
1197 if (buffer != src || RGBA) {
1198 __m256i srcV = __lasx_xvld(reinterpret_cast<int *>(buffer + i), 0);
1199 srcV = __lasx_xvbitsel_v(srcV, srcVector, epilogueMask1);
1200 __lasx_xvst(srcV, reinterpret_cast<int *>(buffer + i), 0);
1201 }
1202 }
1203 } else {
1204 __m256i srcV = __lasx_xvld(reinterpret_cast<int *>(buffer + i), 0);
1205 srcV = __lasx_xvbitsel_v(srcV, zero, epilogueMask1);
1206 __lasx_xvst(srcV, reinterpret_cast<int *>(buffer + i), 0);
1207 }
1208 }
1209}
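// Per pixel this is the standard ARGB32 -> ARGB32PM premultiplication (with a
// preceding byte swizzle from RGBA order when RGBA is true). A scalar sketch,
// illustrative only and essentially what qPremultiply() from <QtGui/qrgb.h>
// does:
static inline uint premultiply_argb32_sketch(uint p)
{
    const uint a = p >> 24;
    if (a == 255)
        return p;                    // opaque pixels are already premultiplied
    if (a == 0)
        return 0;                    // fully transparent maps to 0
    uint t = (p & 0xff00ff) * a;     // red and blue, multiplied in parallel
    t = ((t + ((t >> 8) & 0xff00ff) + 0x800080) >> 8) & 0xff00ff;
    uint g = ((p >> 8) & 0xff) * a;  // green
    g = ((g + (g >> 8) + 0x80) >> 8) & 0xff;
    return (a << 24) | t | (g << 8);
}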
1210
1211void QT_FASTCALL convertARGB32ToARGB32PM_lasx(uint *buffer, int count, const QList<QRgb> *)
1212{
1213 convertARGBToARGB32PM_lasx<false>(buffer, buffer, count);
1214}
1215
1216void QT_FASTCALL convertRGBA8888ToARGB32PM_lasx(uint *buffer, int count, const QList<QRgb> *)
1217{
1218 convertARGBToARGB32PM_lasx<true>(buffer, buffer, count);
1219}
1220
1221const uint *QT_FASTCALL fetchARGB32ToARGB32PM_lasx(uint *buffer, const uchar *src, int index,
1222 int count, const QList<QRgb> *, QDitherInfo *)
1223{
1224 convertARGBToARGB32PM_lasx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1225 return buffer;
1226}
1227
1228const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_lasx(uint *buffer, const uchar *src, int index, int count,
1229 const QList<QRgb> *, QDitherInfo *)
1230{
1231 convertARGBToARGB32PM_lasx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1232 return buffer;
1233}
1234
1235template<bool RGBA>
1236static void convertARGBToRGBA64PM_lasx(QRgba64 *buffer, const uint *src, qsizetype count)
1237{
1238 qsizetype i = 0;
1239 const __m256i alphaMask = __lasx_xvreplgr2vr_w(0xff000000);
1240 const __m256i rgbaMask = (__m256i)(v32i8){2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
1241 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
1242 const __m256i shuffleMask = (__m256i)(v32i8){6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15,
1243 6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15};
1244 const __m256i zero = __lasx_xvldi(0);
1245
1246 for (; i < count - 7; i += 8) {
1247 __m256i dst1, dst2;
1248 __m256i srcVector = __lasx_xvld(reinterpret_cast<const __m256i *>(src + i), 0);
1249 const v8i32 testz = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, alphaMask));
1250 if (testz[0]!=0 || testz[4]!=0){
1251 const v8i32 testc = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, alphaMask));
1252 bool cf = testc[0]==0 && testc[4]==0;
1253 if (!RGBA)
1254 srcVector = __lasx_xvshuf_b(zero, srcVector, rgbaMask);
1255
1256 // The two unpack instructions unpack the low and upper halves of
1257 // each 128-bit half of the 256-bit register. Here's the tracking
1258 // of what's where: (p is 32-bit, P is 64-bit)
1259 // as loaded: [ p1, p2, p3, p4; p5, p6, p7, p8 ]
1260 // after xvpermi_d [ p1, p2, p5, p6; p3, p4, p7, p8 ]
1261 // after xvilvl/h [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
1262 srcVector = __lasx_xvpermi_d(srcVector, 0b11011000);
1263 const __m256i src1 = __lasx_xvilvl_b(srcVector, srcVector);
1264 const __m256i src2 = __lasx_xvilvh_b(srcVector, srcVector);
1265 if (!cf) {
1266 const __m256i alpha1 = __lasx_xvshuf_b(zero, src1, shuffleMask);
1267 const __m256i alpha2 = __lasx_xvshuf_b(zero, src2, shuffleMask);
1268 __m256i blendMask = (__m256i)(v16i16){0, 1, 2, 11, 4, 5, 6, 15, 0, 1, 2, 11, 4, 5, 6, 15};
1269 dst1 = __lasx_xvmuh_hu(src1, alpha1);
1270 dst2 = __lasx_xvmuh_hu(src2, alpha2);
1271 dst1 = __lasx_xvadd_h(dst1, __lasx_xvsrli_h(dst1, 15));
1272 dst2 = __lasx_xvadd_h(dst2, __lasx_xvsrli_h(dst2, 15));
1273 dst1 = __lasx_xvshuf_h(blendMask, src1, dst1);
1274 dst2 = __lasx_xvshuf_h(blendMask, src2, dst2);
1275 } else {
1276 dst1 = src1;
1277 dst2 = src2;
1278 }
1279 } else {
1280 dst1 = dst2 = zero;
1281 }
1282 __lasx_xvst(dst1, reinterpret_cast<__m256i *>(buffer + i), 0);
1283 __lasx_xvst(dst2, reinterpret_cast<__m256i *>(buffer + i) + 1, 0);
1284 }
1285
1286 if (i < count) {
1287 __m256i epilogueMask = epilogueMaskFromCount(count - i);
1288 const __m256i epilogueMask1 = __lasx_xvslti_w(epilogueMask,0);
1289 __m256i srcVector = __lasx_xvbitsel_v(__lasx_xvldi(0),
1290 __lasx_xvld(reinterpret_cast<const int *>(src + i), 0),
1291 epilogueMask1);
1292 __m256i dst1, dst2;
1293 const __m256i epilogueMask2 = __lasx_xvslti_b(epilogueMask, 0);
1294 const __m256i epilogueAlphaMask = __lasx_xvbitsel_v(__lasx_xvldi(0),
1295 alphaMask,
1296 epilogueMask2);
1297
1298 const v8i32 testz1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvand_v(srcVector, epilogueAlphaMask));
1299 if (testz1[0]!=0 || testz1[4]!=0){
1300 const v8i32 testc1 = (v8i32)__lasx_xvmsknz_b(__lasx_xvandn_v(srcVector, epilogueAlphaMask));
1301 bool cf = testc1[0]==0 && testc1[4]==0;
1302
1303 if (!RGBA)
1304 srcVector = __lasx_xvshuf_b(zero, srcVector, rgbaMask);
1305 srcVector = __lasx_xvpermi_d(srcVector, 0b11011000);
1306 const __m256i src1 = __lasx_xvilvl_b(srcVector, srcVector);
1307 const __m256i src2 = __lasx_xvilvh_b(srcVector, srcVector);
1308 if (!cf) {
1309 const __m256i alpha1 = __lasx_xvshuf_b(zero, src1, shuffleMask);
1310 const __m256i alpha2 = __lasx_xvshuf_b(zero, src2, shuffleMask);
1311 const __m256i blendMask = (__m256i)(v16i16){0, 1, 2, 11, 4, 5, 6, 15,
1312 0, 1, 2, 11, 4, 5, 6, 15};
1313 dst1 = __lasx_xvmuh_hu(src1, alpha1);
1314 dst2 = __lasx_xvmuh_hu(src2, alpha2);
1315 dst1 = __lasx_xvadd_h(dst1, __lasx_xvsrli_h(dst1, 15));
1316 dst2 = __lasx_xvadd_h(dst2, __lasx_xvsrli_h(dst2, 15));
1317 dst1 = __lasx_xvshuf_h(blendMask, src1, dst1);
1318 dst2 = __lasx_xvshuf_h(blendMask, src2, dst2);
1319 } else {
1320 dst1 = src1;
1321 dst2 = src2;
1322 }
1323 } else {
1324 dst1 = dst2 = zero;
1325 }
1326 epilogueMask = __lasx_xvpermi_d(epilogueMask, 0b11011000);
1327 __m256i epilogueMaskl = __lasx_xvslti_d(__lasx_xvilvl_w(epilogueMask, epilogueMask), 0);
1328 __m256i epilogueMaskh = __lasx_xvslti_d(__lasx_xvilvh_w(epilogueMask, epilogueMask), 0);
1329 __m256i dst1V = __lasx_xvld(reinterpret_cast<qint64 *>(buffer + i), 0);
1330 dst1V = __lasx_xvbitsel_v(dst1V, dst1, epilogueMaskl);
1331 __lasx_xvst(dst1V, reinterpret_cast<qint64 *>(buffer + i), 0);
1332 __m256i dst2V = __lasx_xvld(reinterpret_cast<qint64 *>(buffer + i + 4), 0);
1333 dst2V = __lasx_xvbitsel_v(dst2V, dst2, epilogueMaskh);
1334 __lasx_xvst(dst2V, reinterpret_cast<qint64 *>(buffer + i + 4), 0);
1335 }
1336}
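// The widening above relies on xvilvl_b/xvilvh_b interleaving the source with
// itself: duplicating a byte c yields the 16-bit value c * 257 (0x0101 * c),
// which is the exact mapping of 0..255 onto 0..65535. Premultiplication then
// happens in 16 bits: xvmuh_hu keeps the high half of the 32-bit product
// (a division by 65536) and the following + (x >> 15) nudges the result so it
// behaves like a division by 65535.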
1337
1338const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_lasx(QRgba64 *buffer, const uint *src, int count,
1339 const QList<QRgb> *, QDitherInfo *)
1340{
1341 convertARGBToRGBA64PM_lasx<false>(buffer, src, count);
1342 return buffer;
1343}
1344
1345const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_lasx(QRgba64 *buffer, const uint *src, int count,
1346 const QList<QRgb> *, QDitherInfo *)
1347{
1348 convertARGBToRGBA64PM_lasx<true>(buffer, src, count);
1349 return buffer;
1350}
1351
1352const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_lasx(QRgba64 *buffer, const uchar *src, int index, int count,
1353 const QList<QRgb> *, QDitherInfo *)
1354{
1355 convertARGBToRGBA64PM_lasx<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1356 return buffer;
1357}
1358
1359const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_lasx(QRgba64 *buffer, const uchar *src, int index, int count,
1360 const QList<QRgb> *, QDitherInfo *)
1361{
1362 convertARGBToRGBA64PM_lasx<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
1363 return buffer;
1364}
1365
1366QT_END_NAMESPACE
1367
1368#endif