6#include <private/qdrawhelper_x86_p.h>
7#include <private/qsimd_p.h>
9#if defined(QT_COMPILER_SUPPORTS_SSE4_1)
13using namespace QImageScale;
15inline static __m128i Q_DECL_VECTORCALL
16qt_qimageScaleAARGBA_helper(
const unsigned int *pix,
int xyap,
int Cxy,
int step,
const __m128i vxyap,
const __m128i vCxy)
18 __m128i vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix));
19 __m128i vx = _mm_mullo_epi32(vpix, vxyap);
21 for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) {
23 vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix));
24 vx = _mm_add_epi32(vx, _mm_mullo_epi32(vpix, vCxy));
27 vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix));
28 vx = _mm_add_epi32(vx, _mm_mullo_epi32(vpix, _mm_set1_epi32(i)));
33void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi,
unsigned int *dest,
34 int dw,
int dh,
int dow,
int sow)
36 const unsigned int **ypoints = isi->ypoints;
37 const int *xpoints = isi->xpoints;
38 const int *xapoints = isi->xapoints;
39 const int *yapoints = isi->yapoints;
41 const __m128i v256 = _mm_set1_epi32(256);
44 auto scaleSection = [&] (
int yStart,
int yEnd) {
45 for (
int y = yStart; y < yEnd; ++y) {
46 const int Cy = yapoints[y] >> 16;
47 const int yap = yapoints[y] & 0xffff;
48 const __m128i vCy = _mm_set1_epi32(Cy);
49 const __m128i vyap = _mm_set1_epi32(yap);
51 unsigned int *dptr = dest + (y * dow);
52 for (
int x = 0; x < dw; x++) {
53 const unsigned int *sptr = ypoints[y] + xpoints[x];
54 __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy);
56 const int xap = xapoints[x];
58 const __m128i vxap = _mm_set1_epi32(xap);
59 const __m128i vinvxap = _mm_sub_epi32(v256, vxap);
60 __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy);
62 vx = _mm_mullo_epi32(vx, vinvxap);
63 vr = _mm_mullo_epi32(vr, vxap);
64 vx = _mm_add_epi32(vx, vr);
65 vx = _mm_srli_epi32(vx, 8);
67 vx = _mm_srli_epi32(vx, 14);
68 vx = _mm_packus_epi32(vx, vx);
69 vx = _mm_packus_epi16(vx, vx);
70 *dptr = _mm_cvtsi128_si32(vx);
77 multithread_pixels_function(isi, dh, scaleSection);
81void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo *isi,
unsigned int *dest,
82 int dw,
int dh,
int dow,
int sow)
84 const unsigned int **ypoints = isi->ypoints;
85 int *xpoints = isi->xpoints;
86 int *xapoints = isi->xapoints;
87 int *yapoints = isi->yapoints;
89 const __m128i v256 = _mm_set1_epi32(256);
92 auto scaleSection = [&] (
int yStart,
int yEnd) {
93 for (
int y = yStart; y < yEnd; ++y) {
94 unsigned int *dptr = dest + (y * dow);
95 for (
int x = 0; x < dw; x++) {
96 int Cx = xapoints[x] >> 16;
97 int xap = xapoints[x] & 0xffff;
98 const __m128i vCx = _mm_set1_epi32(Cx);
99 const __m128i vxap = _mm_set1_epi32(xap);
101 const unsigned int *sptr = ypoints[y] + xpoints[x];
102 __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
104 int yap = yapoints[y];
106 const __m128i vyap = _mm_set1_epi32(yap);
107 const __m128i vinvyap = _mm_sub_epi32(v256, vyap);
108 __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx);
110 vx = _mm_mullo_epi32(vx, vinvyap);
111 vr = _mm_mullo_epi32(vr, vyap);
112 vx = _mm_add_epi32(vx, vr);
113 vx = _mm_srli_epi32(vx, 8);
115 vx = _mm_srli_epi32(vx, 14);
116 vx = _mm_packus_epi32(vx, vx);
117 vx = _mm_packus_epi16(vx, vx);
118 *dptr = _mm_cvtsi128_si32(vx);
125 multithread_pixels_function(isi, dh, scaleSection);
129void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi,
unsigned int *dest,
130 int dw,
int dh,
int dow,
int sow)
132 const unsigned int **ypoints = isi->ypoints;
133 int *xpoints = isi->xpoints;
134 int *xapoints = isi->xapoints;
135 int *yapoints = isi->yapoints;
137 auto scaleSection = [&] (
int yStart,
int yEnd) {
138 for (
int y = yStart; y < yEnd; ++y) {
139 int Cy = yapoints[y] >> 16;
140 int yap = yapoints[y] & 0xffff;
141 const __m128i vCy = _mm_set1_epi32(Cy);
142 const __m128i vyap = _mm_set1_epi32(yap);
144 unsigned int *dptr = dest + (y * dow);
145 for (
int x = 0; x < dw; x++) {
146 const int Cx = xapoints[x] >> 16;
147 const int xap = xapoints[x] & 0xffff;
148 const __m128i vCx = _mm_set1_epi32(Cx);
149 const __m128i vxap = _mm_set1_epi32(xap);
151 const unsigned int *sptr = ypoints[y] + xpoints[x];
152 __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
153 __m128i vr = _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vyap);
156 for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
158 vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
159 vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vCy));
162 vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
163 vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), _mm_set1_epi32(j)));
165 vr = _mm_srli_epi32(vr, 24);
166 vr = _mm_packus_epi32(vr, _mm_setzero_si128());
167 vr = _mm_packus_epi16(vr, _mm_setzero_si128());
168 *dptr = _mm_cvtsi128_si32(vr);
175 multithread_pixels_function(isi, dh, scaleSection);
178template void qt_qimageScaleAARGBA_up_x_down_y_sse4<
false>(QImageScaleInfo *isi,
unsigned int *dest,
179 int dw,
int dh,
int dow,
int sow);
181template void qt_qimageScaleAARGBA_up_x_down_y_sse4<
true>(QImageScaleInfo *isi,
unsigned int *dest,
182 int dw,
int dh,
int dow,
int sow);
184template void qt_qimageScaleAARGBA_down_x_up_y_sse4<
false>(QImageScaleInfo *isi,
unsigned int *dest,
185 int dw,
int dh,
int dow,
int sow);
187template void qt_qimageScaleAARGBA_down_x_up_y_sse4<
true>(QImageScaleInfo *isi,
unsigned int *dest,
188 int dw,
int dh,
int dow,
int sow);
190template void qt_qimageScaleAARGBA_down_xy_sse4<
false>(QImageScaleInfo *isi,
unsigned int *dest,
191 int dw,
int dh,
int dow,
int sow);
193template void qt_qimageScaleAARGBA_down_xy_sse4<
true>(QImageScaleInfo *isi,
unsigned int *dest,
194 int dw,
int dh,
int dow,
int sow);