7#include "private/qsimd_p.h"
10#include <QtCore/qdatastream.h>
11#include <QtCore/qmetatype.h>
12#include <QtCore/qtextstream.h>
16#if QT_VERSION < QT_VERSION_CHECK(7
, 0
, 0
)
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
61
62
63
64
65
69
70
71
72
73
77
78
79
80
81
85
86
87
88
89
92
93
94
95
96
97
100
101
102
103
104
105
108
109
110
111
112
113
114
117
118
119
120
121
122
125
126
127
128
131
132
133
134
135
136
137
138
141
142
143
144
145
146
147
148
149
153
154
155
156
157
158
161
162
163
164
// Picks the floating-point class by testing in this order: infinity, NaN,
// zero (the low 15 bits of b16 are all clear), normal; anything that is
// left over is reported as subnormal.
// NOTE(review): the enclosing function header is not present in this view.
167 return isInf() ? FP_INFINITE : isNaN() ? FP_NAN
168 : !(b16 & 0x7fff) ? FP_ZERO : isNormal() ? FP_NORMAL : FP_SUBNORMAL;
172
173
174
175
176
179
180
181
182
183
186
187
188
189
190
191
192
193
194
196#if QT_COMPILER_SUPPORTS_HERE(F16C)
// x86 variant (compiled when the compiler supports F16C here): returns whatever
// qCpuHasFeature() reports for the F16C feature.
197static inline bool hasFastF16()
201 return qCpuHasFeature(F16C);
204#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
// Returns whatever qCpuHasFeature() reports for ArchSkylakeAvx512; the callers
// below use it to decide whether the masked *_tail_avx256 helpers may be used.
205static bool hasFastF16Avx256()
208 return qCpuHasFeature(ArchSkylakeAvx512);
// Converts `len` floats from `in` to 16-bit floats in `out` with a single
// masked load / convert / store sequence.
// NOTE(review): presumably only reached for tails shorter than one 256-bit
// vector (8 floats) - confirm against the callers; the guard is not visible here.
211static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
212void qFloatToFloat16_tail_avx256(quint16 *out,
const float *in, qsizetype len)
noexcept
// Lane mask: start from all ones and clear every bit from index `len` upward.
214 __mmask16 mask = _bzhi_u32(-1, len);
// Lanes not selected by the mask are zeroed by the maskz load and convert.
215 __m256 f32 = _mm256_maskz_loadu_ps(mask, in );
216 __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT);
// Only the 16-bit lanes selected by the mask are written to `out`.
217 _mm_mask_storeu_epi16(out, mask, f16);
// Converts `len` 16-bit floats from `in` to floats in `out` with a single
// masked load / convert / masked store sequence.
// NOTE(review): presumably only reached for tails shorter than one 256-bit
// vector (8 floats) - confirm against the callers; the guard is not visible here.
220static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
221void qFloatFromFloat16_tail_avx256(
float *out,
const quint16 *in, qsizetype len)
noexcept
// Lane mask: start from all ones and clear every bit from index `len` upward.
223 __mmask16 mask = _bzhi_u32(-1, len);
// Lanes not selected by the mask are loaded as zero.
224 __m128i f16 = _mm_maskz_loadu_epi16(mask, in);
// The conversion itself is unmasked; the masked store below limits what is written.
225 __m256 f32 = _mm256_cvtph_ps(f16);
226 _mm256_mask_storeu_ps(out, mask, f32);
// F16C implementation: converts `len` floats from `in` into 16-bit floats in `out`.
// NOTE(review): several lines of this function (braces, the declaration of `i`,
// the guard around the first lambda and the loop bodies) are not present in this
// view; the comments below describe only what is visible.
230QT_FUNCTION_TARGET(F16C)
231static void qFloatToFloat16_fast(quint16 *out,
const float *in, qsizetype len)
noexcept
// Step: how many floats fit in a 256-bit vector; HalfStep: in a 128-bit vector.
233 constexpr qsizetype Step =
sizeof(__m256i) /
sizeof(
float);
234 constexpr qsizetype HalfStep =
sizeof(__m128i) /
sizeof(
float);
// Full-width chunk: load Step floats at `offset`, convert, store Step 16-bit values.
238 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
239 __m256 f32 = _mm256_loadu_ps(in + offset);
240 __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
241 _mm_storeu_si128(
reinterpret_cast<__m128i *>(out + offset), f16);
// The loop condition is strict, so it always stops before the final chunk.
245 for ( ; i + Step < len; i += Step)
// The final chunk is anchored to the end of the buffer, so it may overlap
// elements that the loop above already covered.
250 return convertOneChunk(len - Step);
// When built with AVX512VL+BW support and the CPU check passes, use the masked helper.
253#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
254 if (hasFastF16Avx256())
255 return qFloatToFloat16_tail_avx256(out, in, len);
// Half-width path: the same scheme with HalfStep floats per chunk.
258 if (len >= HalfStep) {
259 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
260 __m128 f32 = _mm_loadu_ps(in + offset);
261 __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
// Only the low 64 bits (HalfStep 16-bit results) are stored.
262 _mm_storel_epi64(
reinterpret_cast<__m128i *>(out + offset), f16);
267 return convertOneChunk(len - HalfStep);
// Scalar remainder: convert one float at a time and extract the low 16-bit lane.
271 for ( ; i < len; ++i)
272 out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
// F16C implementation: converts `len` 16-bit floats from `in` into floats in `out`.
// NOTE(review): several lines of this function (braces, the declaration of `i`,
// the guard around the first lambda and the loop bodies) are not present in this
// view; the comments below describe only what is visible.
275QT_FUNCTION_TARGET(F16C)
276static void qFloatFromFloat16_fast(
float *out,
const quint16 *in, qsizetype len)
noexcept
// Step: how many floats fit in a 256-bit vector; HalfStep: in a 128-bit vector.
278 constexpr qsizetype Step =
sizeof(__m256i) /
sizeof(
float);
279 constexpr qsizetype HalfStep =
sizeof(__m128i) /
sizeof(
float);
// Full-width chunk: load 128 bits of 16-bit values at `offset`, widen, store Step floats.
283 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
284 __m128i f16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(in + offset));
285 __m256 f32 = _mm256_cvtph_ps(f16);
286 _mm256_storeu_ps(out + offset, f32);
// The loop condition is strict, so it always stops before the final chunk.
290 for ( ; i + Step < len; i += Step)
// The final chunk is anchored to the end of the buffer, so it may overlap
// elements that the loop above already covered.
295 return convertOneChunk(len - Step);
// When built with AVX512VL+BW support and the CPU check passes, use the masked helper.
298#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
299 if (hasFastF16Avx256())
300 return qFloatFromFloat16_tail_avx256(out, in, len);
// Half-width path: the same scheme with HalfStep elements per chunk.
303 if (len >= HalfStep) {
304 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
// Only 64 bits (HalfStep 16-bit values) are loaded.
305 __m128i f16 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(in + offset));
306 __m128 f32 = _mm_cvtph_ps(f16);
307 _mm_storeu_ps(out + offset, f32);
312 return convertOneChunk(len - HalfStep);
// Scalar remainder: place one 16-bit value in a vector, widen it, take the low float.
316 for ( ; i < len; ++i)
317 out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
320#elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__) && (__ARM_FP & 2
)
// Variant of hasFastF16() for the ARM branch of the surrounding #if/#elif
// (IEEE __fp16 format with NEON).
// NOTE(review): the body is not present in this view of the file.
321static inline bool hasFastF16()
// ARM NEON implementation: converts `len` floats from `in` into 16-bit floats in `out`.
// NOTE(review): the braces and the declaration of `i` are not present in this view.
326static void qFloatToFloat16_fast(quint16 *out,
const float *in, qsizetype len)
noexcept
// View the quint16 output buffer as __fp16 so the NEON store and the scalar
// assignment below can write half-precision values directly.
328 __fp16 *out_f16 =
reinterpret_cast<
__fp16 *>(out);
// Vector loop: four floats are loaded, narrowed and stored per iteration.
330 for (; i < len - 3; i += 4)
331 vst1_f16(out_f16 + i, vcvt_f16_f32(vld1q_f32(in + i)));
// Presumably SIMD_EPILOGUE iterates over the (at most 3) leftover elements,
// converting each with a scalar cast - confirm against the macro definition.
332 SIMD_EPILOGUE(i, len, 3)
333 out_f16[i] =
__fp16(in[i]);
// ARM NEON implementation: converts `len` 16-bit floats from `in` into floats in `out`.
// NOTE(review): the braces and the declaration of `i` are not present in this view.
336static void qFloatFromFloat16_fast(
float *out,
const quint16 *in, qsizetype len)
noexcept
// View the quint16 input buffer as __fp16 so the NEON load and the scalar
// read below can interpret it as half-precision values.
338 const __fp16 *in_f16 =
reinterpret_cast<
const __fp16 *>(in);
// Vector loop: four 16-bit floats are loaded, widened and stored per iteration.
340 for (; i < len - 3; i += 4)
341 vst1q_f32(out + i, vcvt_f32_f16(vld1_f16(in_f16 + i)));
// Presumably SIMD_EPILOGUE iterates over the (at most 3) leftover elements,
// converting each with a scalar cast - confirm against the macro definition.
342 SIMD_EPILOGUE(i, len, 3)
343 out[i] =
float(in_f16[i]);
362
363
364
365
366
367
368
369
370
// Fast path: hand the whole buffer to the SIMD helper, treating the qfloat16
// output array as raw quint16 storage.
// NOTE(review): the function header, the condition guarding this return and the
// body of the fallback loop are not present in this view.
374 return qFloatToFloat16_fast(
reinterpret_cast<quint16 *>(out), in, len);
// Fallback: element-by-element loop over all `len` entries.
376 for (qsizetype i = 0; i < len; ++i)
381
382
383
384
385
386
387
388
389
// Fast path: hand the whole buffer to the SIMD helper, treating the qfloat16
// input array as raw quint16 storage.
// NOTE(review): the function header and the condition guarding this return are
// not present in this view.
393 return qFloatFromFloat16_fast(out,
reinterpret_cast<
const quint16 *>(in), len);
// Fallback: convert each element individually through qfloat16's conversion to float.
395 for (qsizetype i = 0; i < len; ++i)
396 out[i] =
float(in[i]);
400
401
402
403
404
405
406
407
408
410#ifndef QT_NO_DATASTREAM
412
413
414
415
416
417
418
419
420
421
// QDataStream insertion operator for qfloat16 (compiled only when
// QT_NO_DATASTREAM is not defined).
// NOTE(review): the body is not present in this view of the file.
422QDataStream &operator<<(QDataStream &ds,
qfloat16 f)
428
429
430
431
432
433
434
435
436
437
438
// QTextStream insertion operator for qfloat16: the value is converted to float
// and written with the stream's float overload; the stream is returned.
453QTextStream &operator<<(QTextStream &ts,
qfloat16 f)
455 return ts <<
float(f);
\keyword 16-bit Floating Point Support
\inmodule QtCore
\inheaderfile QFloat16
Q_CORE_EXPORT int fpClassify() const noexcept
QTextStream & operator>>(QTextStream &ts, qfloat16 &f16)
static void qFloatToFloat16_fast(quint16 *, const float *, qsizetype) noexcept
static void qFloatFromFloat16_fast(float *, const quint16 *, qsizetype) noexcept
QDataStream & operator>>(QDataStream &ds, qfloat16 &f)
Q_CORE_EXPORT void qFloatFromFloat16(float *, const qfloat16 *, qsizetype length) noexcept
Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *, const float *, qsizetype length) noexcept