7#include <private/qdrawhelper_x86_p.h>
8#include <private/qsimd_p.h>
10#if defined(QT_COMPILER_SUPPORTS_SSE4_1)
14using namespace QImageScale;
16inline static __m128i Q_DECL_VECTORCALL
17qt_qimageScaleAARGBA_helper(
const unsigned int *pix,
int xyap,
int Cxy,
int step,
const __m128i vxyap,
const __m128i vCxy)
19 __m128i vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix));
20 __m128i vx = _mm_mullo_epi32(vpix, vxyap);
22 for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) {
24 vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix));
25 vx = _mm_add_epi32(vx, _mm_mullo_epi32(vpix, vCxy));
28 vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix));
29 vx = _mm_add_epi32(vx, _mm_mullo_epi32(vpix, _mm_set1_epi32(i)));
34void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi,
unsigned int *dest,
35 int dw,
int dh,
int dow,
int sow)
37 const unsigned int **ypoints = isi->ypoints;
38 const int *xpoints = isi->xpoints;
39 const int *xapoints = isi->xapoints;
40 const int *yapoints = isi->yapoints;
42 const __m128i v256 = _mm_set1_epi32(256);
45 auto scaleSection = [&] (
int yStart,
int yEnd) {
46 for (
int y = yStart; y < yEnd; ++y) {
47 const int Cy = yapoints[y] >> 16;
48 const int yap = yapoints[y] & 0xffff;
49 const __m128i vCy = _mm_set1_epi32(Cy);
50 const __m128i vyap = _mm_set1_epi32(yap);
52 unsigned int *dptr = dest + (y * dow);
53 for (
int x = 0; x < dw; x++) {
54 const unsigned int *sptr = ypoints[y] + xpoints[x];
55 __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy);
57 const int xap = xapoints[x];
59 const __m128i vxap = _mm_set1_epi32(xap);
60 const __m128i vinvxap = _mm_sub_epi32(v256, vxap);
61 __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy);
63 vx = _mm_mullo_epi32(vx, vinvxap);
64 vr = _mm_mullo_epi32(vr, vxap);
65 vx = _mm_add_epi32(vx, vr);
66 vx = _mm_srli_epi32(vx, 8);
68 vx = _mm_srli_epi32(vx, 14);
69 vx = _mm_packus_epi32(vx, vx);
70 vx = _mm_packus_epi16(vx, vx);
71 *dptr = _mm_cvtsi128_si32(vx);
78 multithread_pixels_function(isi, dh, scaleSection);
82void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo *isi,
unsigned int *dest,
83 int dw,
int dh,
int dow,
int sow)
85 const unsigned int **ypoints = isi->ypoints;
86 int *xpoints = isi->xpoints;
87 int *xapoints = isi->xapoints;
88 int *yapoints = isi->yapoints;
90 const __m128i v256 = _mm_set1_epi32(256);
93 auto scaleSection = [&] (
int yStart,
int yEnd) {
94 for (
int y = yStart; y < yEnd; ++y) {
95 unsigned int *dptr = dest + (y * dow);
96 for (
int x = 0; x < dw; x++) {
97 int Cx = xapoints[x] >> 16;
98 int xap = xapoints[x] & 0xffff;
99 const __m128i vCx = _mm_set1_epi32(Cx);
100 const __m128i vxap = _mm_set1_epi32(xap);
102 const unsigned int *sptr = ypoints[y] + xpoints[x];
103 __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
105 int yap = yapoints[y];
107 const __m128i vyap = _mm_set1_epi32(yap);
108 const __m128i vinvyap = _mm_sub_epi32(v256, vyap);
109 __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx);
111 vx = _mm_mullo_epi32(vx, vinvyap);
112 vr = _mm_mullo_epi32(vr, vyap);
113 vx = _mm_add_epi32(vx, vr);
114 vx = _mm_srli_epi32(vx, 8);
116 vx = _mm_srli_epi32(vx, 14);
117 vx = _mm_packus_epi32(vx, vx);
118 vx = _mm_packus_epi16(vx, vx);
119 *dptr = _mm_cvtsi128_si32(vx);
126 multithread_pixels_function(isi, dh, scaleSection);
130void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi,
unsigned int *dest,
131 int dw,
int dh,
int dow,
int sow)
133 const unsigned int **ypoints = isi->ypoints;
134 int *xpoints = isi->xpoints;
135 int *xapoints = isi->xapoints;
136 int *yapoints = isi->yapoints;
138 auto scaleSection = [&] (
int yStart,
int yEnd) {
139 for (
int y = yStart; y < yEnd; ++y) {
140 int Cy = yapoints[y] >> 16;
141 int yap = yapoints[y] & 0xffff;
142 const __m128i vCy = _mm_set1_epi32(Cy);
143 const __m128i vyap = _mm_set1_epi32(yap);
145 unsigned int *dptr = dest + (y * dow);
146 for (
int x = 0; x < dw; x++) {
147 const int Cx = xapoints[x] >> 16;
148 const int xap = xapoints[x] & 0xffff;
149 const __m128i vCx = _mm_set1_epi32(Cx);
150 const __m128i vxap = _mm_set1_epi32(xap);
152 const unsigned int *sptr = ypoints[y] + xpoints[x];
153 __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
154 __m128i vr = _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vyap);
157 for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
159 vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
160 vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vCy));
163 vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
164 vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), _mm_set1_epi32(j)));
166 vr = _mm_srli_epi32(vr, 24);
167 vr = _mm_packus_epi32(vr, _mm_setzero_si128());
168 vr = _mm_packus_epi16(vr, _mm_setzero_si128());
169 *dptr = _mm_cvtsi128_si32(vr);
176 multithread_pixels_function(isi, dh, scaleSection);
179template void qt_qimageScaleAARGBA_up_x_down_y_sse4<
false>(QImageScaleInfo *isi,
unsigned int *dest,
180 int dw,
int dh,
int dow,
int sow);
182template void qt_qimageScaleAARGBA_up_x_down_y_sse4<
true>(QImageScaleInfo *isi,
unsigned int *dest,
183 int dw,
int dh,
int dow,
int sow);
185template void qt_qimageScaleAARGBA_down_x_up_y_sse4<
false>(QImageScaleInfo *isi,
unsigned int *dest,
186 int dw,
int dh,
int dow,
int sow);
188template void qt_qimageScaleAARGBA_down_x_up_y_sse4<
true>(QImageScaleInfo *isi,
unsigned int *dest,
189 int dw,
int dh,
int dow,
int sow);
191template void qt_qimageScaleAARGBA_down_xy_sse4<
false>(QImageScaleInfo *isi,
unsigned int *dest,
192 int dw,
int dh,
int dow,
int sow);
194template void qt_qimageScaleAARGBA_down_xy_sse4<
true>(QImageScaleInfo *isi,
unsigned int *dest,
195 int dw,
int dh,
int dow,
int sow);