Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
qdrawingprimitive_lsx_p.h
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#ifndef QDRAWINGPRIMITIVE_LSX_P_H
#define QDRAWINGPRIMITIVE_LSX_P_H

#include <QtGui/private/qtguiglobal_p.h>
#include <private/qsimd_p.h>
#include "qrgba64_p.h"

#ifdef __loongarch_sx

//
// W A R N I N G
// -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

QT_BEGIN_NAMESPACE

/*
 * Multiply the components of pixelVector by alphaChannel.
 * Each 32-bit component of alphaChannel must have the form 0x00AA00AA.
 * colorMask must hold 0x00ff00ff in each 32-bit component.
 * half must hold 128 (0x80) in each 16-bit component.
 */
inline static void Q_DECL_VECTORCALL
BYTE_MUL_LSX(__m128i &pixelVector, __m128i alphaChannel, __m128i colorMask, __m128i half)
{
    /* 1. split the pixel into 2 vectors so each channel is on 16 bits
       (to leave room for the multiplication by alpha);
       each 32 bits of pixelVectorAG has the form 0x00AA00GG,
       each 32 bits of pixelVectorRB has the form 0x00RR00BB */
    __m128i pixelVectorAG = __lsx_vsrli_h(pixelVector, 8);
    __m128i pixelVectorRB = __lsx_vand_v(pixelVector, colorMask);

    /* 2. multiply the vectors by the alpha channel */
    pixelVectorAG = __lsx_vmul_h(pixelVectorAG, alphaChannel);
    pixelVectorRB = __lsx_vmul_h(pixelVectorRB, alphaChannel);

    /* 3. divide by 255, that's the tricky part.
       We do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */
    /* so first (X + X/256 + rounding) */
    pixelVectorRB = __lsx_vadd_h(pixelVectorRB, __lsx_vsrli_h(pixelVectorRB, 8));
    pixelVectorRB = __lsx_vadd_h(pixelVectorRB, half);
    pixelVectorAG = __lsx_vadd_h(pixelVectorAG, __lsx_vsrli_h(pixelVectorAG, 8));
    pixelVectorAG = __lsx_vadd_h(pixelVectorAG, half);

    /* second, divide by 256 */
    pixelVectorRB = __lsx_vsrli_h(pixelVectorRB, 8);
    /* for AG, we could >> 8 to divide, followed by << 8 to put the
       bytes back in the correct position. By masking instead, we execute
       only one instruction */
    pixelVectorAG = __lsx_vandn_v(colorMask, pixelVectorAG);

    /* 4. combine the 2 pairs of colors */
    pixelVector = __lsx_vor_v(pixelVectorAG, pixelVectorRB);
}

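The divide-by-255 shortcut deserves a closer look. Below is a standalone scalar sketch (not part of this header; div255_approx is a hypothetical name) with an exhaustive check. The shifted form can land one below the exactly rounded result for a few rare products, which is acceptable for blending:

#include <cassert>
#include <cstdlib>

// p is a channel value scaled by alpha, so p <= 255 * 255.
static inline unsigned div255_approx(unsigned p)
{
    return (p + (p >> 8) + 0x80) >> 8;  // (X + X/256 + rounding) / 256
}

int main()
{
    for (unsigned c = 0; c <= 255; ++c) {
        for (unsigned a = 0; a <= 255; ++a) {
            const unsigned exact = (c * a + 127) / 255;  // round to nearest
            assert(std::abs(int(div255_approx(c * a)) - int(exact)) <= 1);
        }
    }
    return 0;
}
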
/*
 * Interpolate the pixels of srcVector and dstVector by alphaChannel:
 * result = src * alpha + dst * (255 - alpha), per channel.
 * Each 32-bit component of alphaChannel must have the form 0x00AA00AA.
 * oneMinusAlphaChannel must hold 255 - alpha in the same form.
 * colorMask must hold 0x00ff00ff in each 32-bit component.
 * half must hold 128 (0x80) in each 16-bit component.
 */
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_255_LSX(__m128i srcVector, __m128i &dstVector, __m128i alphaChannel,
                          __m128i oneMinusAlphaChannel, __m128i colorMask, __m128i half)
{
    /* interpolate AG */
    __m128i srcVectorAG = __lsx_vsrli_h(srcVector, 8);
    __m128i dstVectorAG = __lsx_vsrli_h(dstVector, 8);
    __m128i srcVectorAGalpha = __lsx_vmul_h(srcVectorAG, alphaChannel);
    __m128i dstVectorAGoneMinusAlpha = __lsx_vmul_h(dstVectorAG, oneMinusAlphaChannel);
    __m128i finalAG = __lsx_vadd_h(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
    finalAG = __lsx_vadd_h(finalAG, __lsx_vsrli_h(finalAG, 8));
    finalAG = __lsx_vadd_h(finalAG, half);
    finalAG = __lsx_vandn_v(colorMask, finalAG);

    /* interpolate RB */
    __m128i srcVectorRB = __lsx_vand_v(srcVector, colorMask);
    __m128i dstVectorRB = __lsx_vand_v(dstVector, colorMask);
    __m128i srcVectorRBalpha = __lsx_vmul_h(srcVectorRB, alphaChannel);
    __m128i dstVectorRBoneMinusAlpha = __lsx_vmul_h(dstVectorRB, oneMinusAlphaChannel);
    __m128i finalRB = __lsx_vadd_h(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
    finalRB = __lsx_vadd_h(finalRB, __lsx_vsrli_h(finalRB, 8));
    finalRB = __lsx_vadd_h(finalRB, half);
    finalRB = __lsx_vsrli_h(finalRB, 8);

    /* combine */
    dstVector = __lsx_vor_v(finalAG, finalRB);
}

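Per channel this computes (s*a + d*(255-a)) / 255. Note that s*a + d*(255-a) <= 255*a + 255*(255-a) = 255*255, so the sum always fits the 16-bit lanes. A one-channel scalar model (a sketch, not part of this header; interpolate_255 is a hypothetical name):

#include <cstdint>

static inline uint8_t interpolate_255(uint8_t s, uint8_t d, uint8_t a)
{
    unsigned x = unsigned(s) * a + unsigned(d) * (255 - a); // <= 255*255
    x = x + (x >> 8) + 0x80;   // same approximate /255 as BYTE_MUL
    return uint8_t(x >> 8);
}
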
// Same as BLEND_SOURCE_OVER_ARGB32_LSX, but for a single vector srcVector
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX_helper(quint32 *dst, int x, __m128i srcVector,
                                    __m128i nullVector, __m128i half, __m128i one,
                                    __m128i colorMask, __m128i alphaMask)
{
    const __m128i srcVectorAlpha = __lsx_vand_v(srcVector, alphaMask);
    __m128i vseq = __lsx_vseq_w(srcVectorAlpha, alphaMask);
    v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
    if (vseq_res[0] == 0x0000ffff) {
        /* all four pixels are fully opaque: plain copy */
        __lsx_vst(srcVector, &dst[x], 0);
    } else {
        __m128i vseq_n = __lsx_vseq_w(srcVectorAlpha, nullVector);
        v4i32 vseq_n_res = (v4i32)__lsx_vmsknz_b(vseq_n);
        if (vseq_n_res[0] != 0x0000ffff) {
            /* not all four pixels are fully transparent, so blend.
               Extract the alpha channel into 2 x 16 bits per pixel so
               there is room for the multiplication: each 32 bits takes
               the form 0x00AA00AA, with AA being 255 - alpha */
            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);

            /* result = s + d * (1-alpha) */
            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
        /* all fully transparent: dst is left untouched */
    }
}

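For reference, a self-contained scalar equivalent of the same per-pixel decision (a sketch; byte_mul_argb and blend_one are hypothetical names, and byte_mul_argb models the two-channels-at-a-time multiply that BYTE_MUL_LSX performs on four pixels at once):

#include <cstdint>

// Multiply all four 8-bit channels of x by a/255, two channels at a time.
static inline uint32_t byte_mul_argb(uint32_t x, uint32_t a)
{
    uint32_t t = (x & 0x00ff00ffu) * a;                      // R and B
    t = ((t + ((t >> 8) & 0x00ff00ffu) + 0x00800080u) >> 8) & 0x00ff00ffu;
    uint32_t h = ((x >> 8) & 0x00ff00ffu) * a;               // A and G
    h = (h + ((h >> 8) & 0x00ff00ffu) + 0x00800080u) & 0xff00ff00u;
    return t | h;
}

static inline void blend_one(uint32_t &dst, uint32_t src)
{
    const uint32_t a = src >> 24;
    if (a == 255)
        dst = src;                                // fully opaque: copy
    else if (a != 0)
        dst = src + byte_mul_argb(dst, 255 - a);  // s + d * (1 - alpha)
    // a == 0: a fully transparent source leaves dst unchanged
}
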
// Blend src over dst.
// nullVector, half, one, colorMask and alphaMask are constant across the
// whole image/texture and are defined as:
//     const __m128i nullVector = __lsx_vreplgr2vr_w(0);
//     const __m128i half = __lsx_vreplgr2vr_h(0x80);
//     const __m128i one = __lsx_vreplgr2vr_h(0xff);
//     const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
//     const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
//
// The computation being done is:
//     result = s + d * (1-alpha)
// with shortcuts if the pixels are fully opaque or fully transparent.
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX(quint32 *dst, const quint32 *src, int length)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

    for (; x < length - 3; x += 4) {
        const __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
    }
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x]);
    }
}

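ALIGNMENT_PROLOGUE_16BYTES and SIMD_EPILOGUE (from qsimd_p.h) give the loop its head/body/tail shape: only dst is aligned, while loads from src may stay unaligned. Written out without the macros, the structure is roughly the following (a sketch; for_each_pixel, one and four are hypothetical stand-ins):

#include <cstdint>

// Head/body/tail shape of the two macros. one() handles a single pixel,
// four() handles four pixels starting at a 16-byte-aligned dst address.
template <typename Scalar, typename Vector4>
void for_each_pixel(uint32_t *dst, const uint32_t *src, int length,
                    Scalar one, Vector4 four)
{
    int x = 0;
    // head: scalar pixels until dst is 16-byte aligned (at most 3)
    for (; x < length && (reinterpret_cast<std::uintptr_t>(dst + x) & 15); ++x)
        one(dst[x], src[x]);
    // body: 4 pixels per iteration; stores to dst are now aligned
    for (; x + 4 <= length; x += 4)
        four(dst + x, src + x);
    // tail: the 0..3 leftover pixels
    for (; x < length; ++x)
        one(dst[x], src[x]);
}
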
// Blend src over dst with an additional constant alpha (constAlphaVector).
// With sa = source alpha, ca = const alpha, sia = 255 - sa and cia = 255 - ca
// (all on a 0..255 scale), the computation being done is:
//     dest = (s + d * sia) * ca + d * cia
//          = s * ca + d * (sia * ca + cia)
//          = s * ca + d * (1 - sa*ca)
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(quint32 *dst, const quint32 *src, int length, uint const_alpha)
{
    int x = 0;

    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x], const_alpha);
    }

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);

    for (; x < length - 3; x += 4) {
        __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        __m128i vseq = __lsx_vseq_w(srcVector, nullVector);
        v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
        /* skip the store entirely if all four source pixels are zero */
        if (vseq_res[0] != 0x0000ffff) {
            BYTE_MUL_LSX(srcVector, constAlphaVector, colorMask, half);

            /* extract 255 - sa*ca into the 0x00AA00AA form */
            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            __m128i dstVector = __lsx_vld((__m128i *)&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);

            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x], const_alpha);
    }
}

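A scalar model of one pixel of this path (a sketch; blend_one_const_alpha is a hypothetical name). Scaling src by ca first also scales its alpha, so the remaining destination factor falls out as 255 - sa*ca, matching the algebra above:

#include <cstdint>

// Same two-channels-at-a-time helper as in the earlier sketch.
static inline uint32_t byte_mul_argb(uint32_t x, uint32_t a)
{
    uint32_t t = (x & 0x00ff00ffu) * a;
    t = ((t + ((t >> 8) & 0x00ff00ffu) + 0x00800080u) >> 8) & 0x00ff00ffu;
    uint32_t h = ((x >> 8) & 0x00ff00ffu) * a;
    h = (h + ((h >> 8) & 0x00ff00ffu) + 0x00800080u) & 0xff00ff00u;
    return t | h;
}

static inline void blend_one_const_alpha(uint32_t &dst, uint32_t src, uint32_t ca)
{
    if (src == 0)
        return;                                 // fully transparent source
    src = byte_mul_argb(src, ca);               // s *= ca, alpha becomes sa*ca
    const uint32_t sia = 255 - (src >> 24);     // 255 - sa*ca
    dst = src + byte_mul_argb(dst, sia);        // s*ca + d * (1 - sa*ca)
}
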
typedef union
{
    int i;
    float f;
} FloatInt;

/* Replicate a float into all four lanes of a vector (there is no native
   float-splat intrinsic, so go through the integer bit pattern) */
static __m128 __lsx_vreplfr2vr_s(float val)
{
    FloatInt fi_tmpval = {.f = val};
    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}

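The union is the classic way to reinterpret the float's bit pattern for __lsx_vreplgr2vr_w, which only takes an integer. With C++20, the same splat can be written without union type punning (a sketch in the context of this header; splat_float is a hypothetical name):

#include <bit>

static inline __m128 splat_float(float val)
{
    // std::bit_cast yields the IEEE-754 bit pattern without union punning
    return (__m128)__lsx_vreplgr2vr_w(std::bit_cast<int>(val));
}
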
Q_ALWAYS_INLINE __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(const __m128 a, float mul)
{
    __m128 ia = __lsx_vfrecip_s(a); // approximate 1/a
    // Improve the precision of ia with one Newton-Raphson step:
    // ia' = ia * (2 - a*ia) = 2*ia - a*ia^2
    ia = __lsx_vfsub_s(__lsx_vfadd_s(ia, ia), __lsx_vfmul_s(ia, __lsx_vfmul_s(ia, a)));
    ia = __lsx_vfmul_s(ia, __lsx_vreplfr2vr_s(mul));
    return ia;
}

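One Newton-Raphson step for f(x) = 1/x - a squares the relative error: with e = 1 - a*x, after x' = x*(2 - a*x) the new error is 1 - a*x' = e^2. A standalone scalar demo of that step:

#include <cstdio>

int main()
{
    const float a = 3.0f;
    float x = 0.3f;              // rough seed for 1/3, error e = 0.1
    x = 2.0f * x - a * x * x;    // one NR step: x * (2 - a*x)
    std::printf("%f\n", x);      // ~0.33: error is now e^2 = 0.01
    return 0;
}
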
inline QRgb qUnpremultiply_lsx(QRgb p)
{
    const uint alpha = qAlpha(p);
    if (alpha == 255)
        return p;
    if (alpha == 0)
        return 0;
    const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(alpha));
    __m128 via = reciprocal_mul_ps(va, 255.0f); // approximately 255/alpha
    /* spread the four 8-bit channels into the four 32-bit lanes */
    const __m128i shuffleMask = (__m128i)(v16i8){0,16,16,16,1,16,16,16,2,16,16,16,3,16,16,16};
    __m128i vl = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(p), shuffleMask);
    vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
    vl = __lsx_vmaxi_w(vl, 0);
    vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
    vl = __lsx_vinsgr2vr_h(vl, alpha, 3); // restore the original alpha
    vl = __lsx_vpickev_b(__lsx_vsat_hu(vl, 7), __lsx_vsat_hu(vl, 7));
    return __lsx_vpickve2gr_w(vl, 0);
}

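What the vector lanes compute, written as a scalar reference (a sketch; unpremultiply_scalar is a hypothetical name). std::lrintf matches the round-to-nearest-even of __lsx_vftintrne_w_s in the default rounding mode, and since the input is premultiplied each channel is <= alpha, so the results never exceed 255:

#include <cmath>
#include <cstdint>

static inline uint32_t unpremultiply_scalar(uint32_t p)
{
    const uint32_t a = p >> 24;
    if (a == 255) return p;
    if (a == 0)   return 0;
    const float via = 255.0f / a;   // the vector code approximates this
    const uint32_t r = uint32_t(std::lrintf(((p >> 16) & 0xff) * via));
    const uint32_t g = uint32_t(std::lrintf(((p >> 8) & 0xff) * via));
    const uint32_t b = uint32_t(std::lrintf((p & 0xff) * via));
    return (a << 24) | (r << 16) | (g << 8) | b;
}
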
template<enum QtPixelOrder PixelOrder>
inline uint qConvertArgb32ToA2rgb30_lsx(QRgb p)
{
    const uint alpha = qAlpha(p);
    if (alpha == 255)
        return qConvertRgb32ToRgb30<PixelOrder>(p);
    if (alpha == 0)
        return 0;
    Q_CONSTEXPR float mult = 1023.0f / (255 >> 6);
    const uint newalpha = (alpha >> 6);
    const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(alpha));
    __m128 via = reciprocal_mul_ps(va, mult * newalpha);
    const __m128i shuffleMask = (__m128i)(v16i8){0,16,16,16,1,16,16,16,2,16,16,16,3,16,16,16};
    __m128i vl = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(p), shuffleMask);
    vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
    vl = __lsx_vmaxi_w(vl, 0);
    vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
    uint rgb30 = (newalpha << 30);
    rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 1)) << 10;
    if (PixelOrder == PixelOrderRGB) {
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 2)) << 20;
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 0));
    } else {
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 0)) << 20;
        rgb30 |= ((uint)__lsx_vpickve2gr_h(vl, 2));
    }
    return rgb30;
}

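The combined factor mult * newalpha / alpha unpremultiplies by the original 8-bit alpha, re-premultiplies by the quantized 2-bit alpha, and rescales the channels to 10 bits in a single multiply. A scalar model for the RGB order (a sketch with the opaque fast path omitted; argb32_to_a2rgb30_scalar is a hypothetical name). Since channels are premultiplied, c <= alpha, so the products stay within 10 bits:

#include <cmath>
#include <cstdint>

static inline uint32_t argb32_to_a2rgb30_scalar(uint32_t p)   // RGB order
{
    const uint32_t a = p >> 24;
    if (a == 0) return 0;
    const uint32_t a2 = a >> 6;                    // quantize alpha to 2 bits
    const float f = (1023.0f / 3.0f) * a2 / a;     // un- and re-premultiply
    const uint32_t r = uint32_t(std::lrintf(((p >> 16) & 0xff) * f));
    const uint32_t g = uint32_t(std::lrintf(((p >> 8) & 0xff) * f));
    const uint32_t b = uint32_t(std::lrintf((p & 0xff) * f));
    return (a2 << 30) | (r << 20) | (g << 10) | b;
}
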
template<enum QtPixelOrder PixelOrder>
inline uint qConvertRgba64ToRgb32_lsx(QRgba64 p)
{
    if (p.isTransparent())
        return 0;
    __m128i vl = __lsx_vilvl_d(__lsx_vldi(0), __lsx_vldrepl_d(&p, 0));
    if (!p.isOpaque()) {
        const __m128 va = __lsx_vffint_s_w(__lsx_vreplgr2vr_w(p.alpha()));
        __m128 via = reciprocal_mul_ps(va, 65535.0f);
        vl = __lsx_vilvl_h(__lsx_vldi(0), vl);
        vl = __lsx_vftintrne_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(vl), via));
        vl = __lsx_vmaxi_w(vl, 0);
        vl = __lsx_vpickev_h(__lsx_vsat_wu(vl, 15), __lsx_vsat_wu(vl, 15));
        vl = __lsx_vinsgr2vr_h(vl, p.alpha(), 3);
    }
    if (PixelOrder == PixelOrderBGR) {
        const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
        vl = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vl);
    }
    vl = __lsx_vilvl_h(__lsx_vldi(0), vl);
    vl = __lsx_vadd_w(vl, __lsx_vreplgr2vr_w(128));
    vl = __lsx_vsub_w(vl, __lsx_vsrli_w(vl, 8));
    vl = __lsx_vsrli_w(vl, 8);
    vl = __lsx_vpickev_h(__lsx_vsat_w(vl, 15), __lsx_vsat_w(vl, 15));
    __m128i tmp = __lsx_vmaxi_h(vl, 0);
    vl = __lsx_vpickev_b(__lsx_vsat_hu(tmp, 7), __lsx_vsat_hu(tmp, 7));
    return __lsx_vpickve2gr_w(vl, 0);
}

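The add-128, subtract-high-byte, shift sequence near the end is the 16-to-8-bit conversion round(x / 257), which maps 65535 to 255, analogous to the /255 trick above. A standalone check that the shift form is exact (div257 is a hypothetical name):

#include <cassert>
#include <cstdint>

static inline uint8_t div257(uint16_t x)
{
    unsigned t = x + 128u;              // + rounding
    return uint8_t((t - (t >> 8)) >> 8);
}

int main()
{
    for (unsigned x = 0; x <= 0xffff; ++x)
        assert(div257(uint16_t(x)) == (x + 128) / 257);  // == round(x/257)
    return 0;
}
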
QT_END_NAMESPACE

#endif // __loongarch_sx

#endif // QDRAWINGPRIMITIVE_LSX_P_H