6#include "private/qsimd_p.h"
9#include <QtCore/qdatastream.h>
10#include <QtCore/qmetatype.h>
11#include <QtCore/qtextstream.h>
15#if QT_VERSION < QT_VERSION_CHECK(7
, 0
, 0
)
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
60
61
62
63
64
68
69
70
71
72
76
77
78
79
80
84
85
86
87
88
91
92
93
94
95
96
99
100
101
102
103
104
107
108
109
110
111
112
113
116
117
118
119
120
121
124
125
126
127
128
129
130
131
134
135
136
137
138
139
140
144
145
146
147
148
149
152
153
154
155
// Classification cascade, evaluated in order: FP_INFINITE if isInf(), FP_NAN if
// isNaN(), FP_ZERO when the low 15 bits of b16 (mask 0x7fff) are all clear,
// FP_NORMAL if isNormal(), and FP_SUBNORMAL otherwise.
// NOTE(review): the enclosing function header is not visible in this view.
158 return isInf() ? FP_INFINITE : isNaN() ? FP_NAN
159 : !(b16 & 0x7fff) ? FP_ZERO : isNormal() ? FP_NORMAL : FP_SUBNORMAL;
163
164
165
166
167
170
171
172
173
174
177
178
179
180
181
182
183
184
185
187#if QT_COMPILER_SUPPORTS_HERE(F16C)
// Runtime check for the x86 fast path: returns the result of
// qCpuHasFeature(F16C).
188static inline bool hasFastF16()
192 return qCpuHasFeature(F16C);
195#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
// Runtime check used to select the masked AVX-512 tail helpers: returns the
// result of qCpuHasFeature(ArchSkylakeAvx512).
196static bool hasFastF16Avx256()
199 return qCpuHasFeature(ArchSkylakeAvx512);
// Converts the first `len` floats at `in` to 16-bit half-precision values at
// `out` with a single masked load / convert / store sequence.
// NOTE(review): a 256-bit vector holds 8 floats, so this presumably expects
// len <= 8 — confirm against the callers.
202static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
203void qFloatToFloat16_tail_avx256(quint16 *out,
const float *in, qsizetype len)
noexcept
// Build a lane mask with the low `len` bits set.
205 __mmask16 mask = _bzhi_u32(-1, len);
// Masked load: lanes outside the mask are zeroed instead of being read.
206 __m256 f32 = _mm256_maskz_loadu_ps(mask, in );
// Convert the selected lanes to half precision, rounding to nearest.
207 __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT);
// Masked store: only the selected 16-bit lanes are written to `out`.
208 _mm_mask_storeu_epi16(out, mask, f16);
// Converts the first `len` 16-bit half-precision values at `in` to floats at
// `out` with a masked load, a full-width conversion and a masked store.
// NOTE(review): a 256-bit vector holds 8 floats, so this presumably expects
// len <= 8 — confirm against the callers.
211static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
212void qFloatFromFloat16_tail_avx256(
float *out,
const quint16 *in, qsizetype len)
noexcept
// Build a lane mask with the low `len` bits set.
214 __mmask16 mask = _bzhi_u32(-1, len);
// Masked load: 16-bit lanes outside the mask are zeroed instead of being read.
215 __m128i f16 = _mm_maskz_loadu_epi16(mask, in);
// The conversion itself is unmasked; masking is applied on load and store.
216 __m256 f32 = _mm256_cvtph_ps(f16);
// Masked store: only the selected float lanes are written to `out`.
217 _mm256_mask_storeu_ps(out, mask, f32);
// Converts `len` floats from `in` to 16-bit half-precision values in `out`
// using F16C intrinsics, trying the widest chunk size first and falling back
// to narrower ones.
// NOTE(review): the braces, the declaration of `i`, the loop bodies and the
// conditions guarding the early `return`s are not visible in this view;
// confirm the control flow against the full file.
221QT_FUNCTION_TARGET(F16C)
222static void qFloatToFloat16_fast(quint16 *out,
const float *in, qsizetype len)
noexcept
// Step: how many floats fit in the size of a 256-bit vector.
224 constexpr qsizetype Step =
sizeof(__m256i) /
sizeof(
float);
// HalfStep: how many floats fit in the size of a 128-bit vector.
225 constexpr qsizetype HalfStep =
sizeof(__m128i) /
sizeof(
float);
// Wide chunk: unaligned 256-bit load of floats at in + offset, conversion to
// half precision with round-to-nearest, unaligned 128-bit store at out + offset.
229 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
230 __m256 f32 = _mm256_loadu_ps(in + offset);
231 __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
232 _mm_storeu_si128(
reinterpret_cast<__m128i *>(out + offset), f16);
// Main loop: advances by Step while more than Step elements remain past i.
236 for ( ; i + Step < len; i += Step)
// Final wide chunk is positioned to end exactly at `len`; it may overlap
// elements already converted — presumably intentional (confirm).
241 return convertOneChunk(len - Step);
// When compiled with AVX512VL+AVX512BW support and the CPU reports it at
// runtime, hand the input to the masked helper.
244#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
245 if (hasFastF16Avx256())
246 return qFloatToFloat16_tail_avx256(out, in, len);
// Narrow chunk, used when at least HalfStep elements are available: 128-bit
// load, conversion, then a store of only the low 64 bits of the result.
249 if (len >= HalfStep) {
250 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
251 __m128 f32 = _mm_loadu_ps(in + offset);
252 __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
253 _mm_storel_epi64(
reinterpret_cast<__m128i *>(out + offset), f16);
// Final narrow chunk is positioned to end exactly at `len`.
258 return convertOneChunk(len - HalfStep);
// Scalar fallback: one float per iteration, taking 16-bit lane 0 of the
// converted vector. The rounding argument here is the literal 0 —
// presumably equivalent to _MM_FROUND_TO_NEAREST_INT (confirm).
262 for ( ; i < len; ++i)
263 out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
// Converts `len` 16-bit half-precision values from `in` to floats in `out`
// using F16C intrinsics, trying the widest chunk size first and falling back
// to narrower ones.
// NOTE(review): the braces, the declaration of `i`, the loop bodies and the
// conditions guarding the early `return`s are not visible in this view;
// confirm the control flow against the full file.
266QT_FUNCTION_TARGET(F16C)
267static void qFloatFromFloat16_fast(
float *out,
const quint16 *in, qsizetype len)
noexcept
// Step: how many floats fit in the size of a 256-bit vector.
269 constexpr qsizetype Step =
sizeof(__m256i) /
sizeof(
float);
// HalfStep: how many floats fit in the size of a 128-bit vector.
270 constexpr qsizetype HalfStep =
sizeof(__m128i) /
sizeof(
float);
// Wide chunk: unaligned 128-bit load of halves at in + offset, conversion to
// a 256-bit vector of floats, unaligned store at out + offset.
274 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
275 __m128i f16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(in + offset));
276 __m256 f32 = _mm256_cvtph_ps(f16);
277 _mm256_storeu_ps(out + offset, f32);
// Main loop: advances by Step while more than Step elements remain past i.
281 for ( ; i + Step < len; i += Step)
// Final wide chunk is positioned to end exactly at `len`; it may overlap
// elements already converted — presumably intentional (confirm).
286 return convertOneChunk(len - Step);
// When compiled with AVX512VL+AVX512BW support and the CPU reports it at
// runtime, hand the input to the masked helper.
289#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
290 if (hasFastF16Avx256())
291 return qFloatFromFloat16_tail_avx256(out, in, len);
// Narrow chunk, used when at least HalfStep elements are available: 64-bit
// load of halves, conversion to a 128-bit vector of floats, unaligned store.
294 if (len >= HalfStep) {
295 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
296 __m128i f16 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(in + offset));
297 __m128 f32 = _mm_cvtph_ps(f16);
298 _mm_storeu_ps(out + offset, f32);
// Final narrow chunk is positioned to end exactly at `len`.
303 return convertOneChunk(len - HalfStep);
// Scalar fallback: place one half value in the low lane of a vector, convert,
// and take the lowest float of the result.
307 for ( ; i < len; ++i)
308 out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
311#elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__) && (__ARM_FP & 2
)
// hasFastF16() for the ARM NEON / IEEE __fp16 branch of the preprocessor
// conditional. NOTE(review): the body is not visible in this view.
312static inline bool hasFastF16()
// ARM NEON variant: converts `len` floats from `in` to half-precision values
// in `out`, which is accessed through an __fp16 pointer.
// NOTE(review): the braces and the declaration of `i` are not visible in this
// view.
317static void qFloatToFloat16_fast(quint16 *out,
const float *in, qsizetype len)
noexcept
// View the quint16 output buffer as __fp16 so the NEON and scalar stores
// can write half-precision values directly.
319 __fp16 *out_f16 =
reinterpret_cast<
__fp16 *>(out);
// Vector loop: load four floats, convert them to four halves, store them.
321 for (; i < len - 3; i += 4)
322 vst1_f16(out_f16 + i, vcvt_f16_f32(vld1q_f32(in + i)));
// SIMD_EPILOGUE is a project macro; presumably it runs the statement below
// for the up-to-3 leftover elements — confirm its definition.
323 SIMD_EPILOGUE(i, len, 3)
324 out_f16[i] =
__fp16(in[i]);
// ARM NEON variant: converts `len` half-precision values from `in`, accessed
// through an __fp16 pointer, to floats in `out`.
// NOTE(review): the braces and the declaration of `i` are not visible in this
// view.
327static void qFloatFromFloat16_fast(
float *out,
const quint16 *in, qsizetype len)
noexcept
// View the quint16 input buffer as __fp16 so the NEON and scalar loads can
// read half-precision values directly.
329 const __fp16 *in_f16 =
reinterpret_cast<
const __fp16 *>(in);
// Vector loop: load four halves, convert them to four floats, store them.
331 for (; i < len - 3; i += 4)
332 vst1q_f32(out + i, vcvt_f32_f16(vld1_f16(in_f16 + i)));
// SIMD_EPILOGUE is a project macro; presumably it runs the statement below
// for the up-to-3 leftover elements — confirm its definition.
333 SIMD_EPILOGUE(i, len, 3)
334 out[i] =
float(in_f16[i]);
353
354
355
356
357
358
359
360
361
// Fast path: forward to qFloatToFloat16_fast(), passing the output buffer
// reinterpreted as quint16.
// NOTE(review): the function header and the condition guarding this return
// are not visible in this view.
365 return qFloatToFloat16_fast(
reinterpret_cast<quint16 *>(out), in, len);
// Generic element-by-element loop; its body is not visible in this view.
367 for (qsizetype i = 0; i < len; ++i)
372
373
374
375
376
377
378
379
380
// Fast path: forward to qFloatFromFloat16_fast(), passing the input buffer
// reinterpreted as const quint16.
// NOTE(review): the function header and the condition guarding this return
// are not visible in this view.
384 return qFloatFromFloat16_fast(out,
reinterpret_cast<
const quint16 *>(in), len);
// Generic path: convert each element with a float(...) conversion.
386 for (qsizetype i = 0; i < len; ++i)
387 out[i] =
float(in[i]);
391
392
393
394
395
396
397
398
399
401#ifndef QT_NO_DATASTREAM
403
404
405
406
407
408
409
410
411
412
// Stream-insertion operator taking a QDataStream and a qfloat16 by value.
// NOTE(review): the body is not visible in this view.
413QDataStream &operator<<(QDataStream &ds,
qfloat16 f)
419
420
421
422
423
424
425
426
427
428
429
// Stream-insertion operator for QTextStream: writes `f` converted to float
// and returns the result of that insertion.
444QTextStream &operator<<(QTextStream &ts,
qfloat16 f)
446 return ts <<
float(f);
\keyword 16-bit Floating Point Support\inmodule QtCore \inheaderfile QFloat16
// The lines below are bare signatures with no bodies or terminators.
// NOTE(review): this looks like an extracted index of declarations rather than
// compilable code — confirm against the full file before relying on it.
Q_CORE_EXPORT int fpClassify() const noexcept
QTextStream & operator>>(QTextStream &ts, qfloat16 &f16)
static void qFloatToFloat16_fast(quint16 *, const float *, qsizetype) noexcept
static void qFloatFromFloat16_fast(float *, const quint16 *, qsizetype) noexcept
QDataStream & operator>>(QDataStream &ds, qfloat16 &f)
Q_CORE_EXPORT void qFloatFromFloat16(float *, const qfloat16 *, qsizetype length) noexcept
Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *, const float *, qsizetype length) noexcept