6#include "private/qsimd_p.h"
9#include <QtCore/qdatastream.h>
10#include <QtCore/qmetatype.h>
11#include <QtCore/qtextstream.h>
15#if QT_VERSION < QT_VERSION_CHECK(7
, 0
, 0
)
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
60
61
62
63
64
68
69
70
71
72
76
77
78
79
80
84
85
86
87
88
91
92
93
94
95
96
99
100
101
102
103
104
107
108
109
110
111
112
113
116
117
118
119
120
121
124
125
126
127
128
129
133
134
135
136
137
138
141
142
143
144
// Classification cascade, first match wins: infinity, then NaN, then zero,
// then normal; anything left over is reported as subnormal.
// Masking b16 with 0x7fff drops the top bit before the zero test, so a value
// with only that bit set also classifies as FP_ZERO (presumably that bit is
// the sign of the binary16 pattern, i.e. negative zero -- TODO confirm against
// the class definition).
147 return isInf() ? FP_INFINITE : isNaN() ? FP_NAN
148 : !(b16 & 0x7fff) ? FP_ZERO : isNormal() ? FP_NORMAL : FP_SUBNORMAL;
152
153
154
155
156
159
160
161
162
163
166
167
168
169
170
171
172
173
174
176#if QT_COMPILER_SUPPORTS_HERE(F16C)
// Runtime gate for the F16C conversion routines: returns whatever
// qCpuHasFeature() reports for the F16C feature.
177static inline bool hasFastF16()
181 return qCpuHasFeature(F16C);
184#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
// Runtime gate for the masked tail helpers (qFloatToFloat16_tail_avx256 and
// qFloatFromFloat16_tail_avx256): returns whatever qCpuHasFeature() reports
// for the ArchSkylakeAvx512 feature set.
185static bool hasFastF16Avx256()
188 return qCpuHasFeature(ArchSkylakeAvx512);
// Converts a short run of floats at `in` to 16-bit half-precision values at
// `out` with one masked load / convert / store sequence.
// Only one 256-bit vector (8 floats) is processed, so this handles at most
// 8 elements.
// NOTE(review): presumably callers only pass len < 8 -- confirm at call sites.
191static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
192void qFloatToFloat16_tail_avx256(quint16 *out,
const float *in, qsizetype len)
noexcept
// Keep the low `len` bits of an all-ones word: one mask bit per element.
194 __mmask16 mask = _bzhi_u32(-1, len);
// Zero-masked load: lanes whose mask bit is clear come back as zero.
195 __m256 f32 = _mm256_maskz_loadu_ps(mask, in );
// Narrow float -> half using round-to-nearest.
196 __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT);
// Masked store: only the selected 16-bit lanes are written to `out`.
197 _mm_mask_storeu_epi16(out, mask, f16);
// Converts a short run of 16-bit half-precision values at `in` to floats at
// `out` with one masked load / convert / store sequence.
// Only one 256-bit result vector (8 floats) is produced, so this handles at
// most 8 elements.
// NOTE(review): presumably callers only pass len < 8 -- confirm at call sites.
200static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
201void qFloatFromFloat16_tail_avx256(
float *out,
const quint16 *in, qsizetype len)
noexcept
// Keep the low `len` bits of an all-ones word: one mask bit per element.
203 __mmask16 mask = _bzhi_u32(-1, len);
// Zero-masked load of the 16-bit inputs: unselected lanes come back as zero.
204 __m128i f16 = _mm_maskz_loadu_epi16(mask, in);
// Widen all eight half lanes to float (unselected lanes are just zeros).
205 __m256 f32 = _mm256_cvtph_ps(f16);
// Masked store: only the selected float lanes are written to `out`.
206 _mm256_mask_storeu_ps(out, mask, f32);
// F16C implementation of the float -> half array conversion.
// Reads `len` floats from `in` and writes `len` 16-bit results to `out`.
// Strategy visible here: whole 256-bit chunks first, then (where available)
// the masked AVX-512 tail helper, then 128-bit chunks, then single elements.
210QT_FUNCTION_TARGET(F16C)
211static void qFloatToFloat16_fast(quint16 *out,
const float *in, qsizetype len)
noexcept
// Step: floats per 256-bit vector (32 / 4 = 8).
213 constexpr qsizetype Step =
sizeof(__m256i) /
sizeof(
float);
// HalfStep: floats per 128-bit vector (16 / 4 = 4).
214 constexpr qsizetype HalfStep =
sizeof(__m128i) /
sizeof(
float);
// Converts Step floats starting at `offset`: unaligned 256-bit load,
// round-to-nearest narrowing, unaligned 128-bit store of the halves.
218 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
219 __m256 f32 = _mm256_loadu_ps(in + offset);
220 __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
221 _mm_storeu_si128(
reinterpret_cast<__m128i *>(out + offset), f16);
// Advance in whole chunks, stopping before the final one so that the last
// chunk is handled by the return below.
225 for ( ; i + Step < len; i += Step)
// The final chunk is anchored to the end of the buffer (len - Step), so it
// may overlap elements the loop has already converted.
230 return convertOneChunk(len - Step);
// When the compiler supports AVX512VL and AVX512BW and the CPU check passes,
// hand the input over to the masked helper.
233#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
234 if (hasFastF16Avx256())
235 return qFloatToFloat16_tail_avx256(out, in, len);
// Same chunking scheme with 128-bit vectors: 4 floats in, and only the low
// 64 bits (4 halves) of the result are stored.
238 if (len >= HalfStep) {
239 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
240 __m128 f32 = _mm_loadu_ps(in + offset);
241 __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
242 _mm_storel_epi64(
reinterpret_cast<__m128i *>(out + offset), f16);
// As above, the last 4-element chunk is anchored to the end of the buffer.
247 return convertOneChunk(len - HalfStep);
// Element-at-a-time path: put in[i] in the lowest lane, convert with
// rounding-control immediate 0 (the value of _MM_FROUND_TO_NEAREST_INT), and
// extract the lowest 16-bit lane as the result.
251 for ( ; i < len; ++i)
252 out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
// F16C implementation of the half -> float array conversion.
// Reads `len` 16-bit values from `in` and writes `len` floats to `out`.
// Strategy visible here: whole 256-bit chunks first, then (where available)
// the masked AVX-512 tail helper, then 128-bit chunks, then single elements.
255QT_FUNCTION_TARGET(F16C)
256static void qFloatFromFloat16_fast(
float *out,
const quint16 *in, qsizetype len)
noexcept
// Step: floats per 256-bit vector (32 / 4 = 8).
258 constexpr qsizetype Step =
sizeof(__m256i) /
sizeof(
float);
// HalfStep: floats per 128-bit vector (16 / 4 = 4).
259 constexpr qsizetype HalfStep =
sizeof(__m128i) /
sizeof(
float);
// Converts Step halves starting at `offset`: unaligned 128-bit load,
// widening to 8 floats, unaligned 256-bit store.
263 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
264 __m128i f16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(in + offset));
265 __m256 f32 = _mm256_cvtph_ps(f16);
266 _mm256_storeu_ps(out + offset, f32);
// Advance in whole chunks, stopping before the final one so that the last
// chunk is handled by the return below.
270 for ( ; i + Step < len; i += Step)
// The final chunk is anchored to the end of the buffer (len - Step), so it
// may overlap elements the loop has already converted.
275 return convertOneChunk(len - Step);
// When the compiler supports AVX512VL and AVX512BW and the CPU check passes,
// hand the input over to the masked helper.
278#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
279 if (hasFastF16Avx256())
280 return qFloatFromFloat16_tail_avx256(out, in, len);
// Same chunking scheme with 128-bit vectors: a 64-bit load brings in 4 halves,
// which widen to 4 floats.
283 if (len >= HalfStep) {
284 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
285 __m128i f16 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(in + offset));
286 __m128 f32 = _mm_cvtph_ps(f16);
287 _mm_storeu_ps(out + offset, f32);
// As above, the last 4-element chunk is anchored to the end of the buffer.
292 return convertOneChunk(len - HalfStep);
// Element-at-a-time path: move in[i] into the low bits of a vector, widen it,
// and read back the lowest float lane.
296 for ( ; i < len; ++i)
297 out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
300#elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__) && (__ARM_FP & 2
)
301static inline bool hasFastF16()
// ARM NEON implementation of the float -> half array conversion.
// Reads `len` floats from `in` and writes `len` 16-bit results to `out`.
306static void qFloatToFloat16_fast(quint16 *out,
const float *in, qsizetype len)
noexcept
// View the quint16 output buffer as __fp16 so the NEON store and the scalar
// assignment below can target it directly.
308 __fp16 *out_f16 =
reinterpret_cast<
__fp16 *>(out);
// Vector loop, 4 elements per iteration: load 4 floats, narrow them to half,
// store the 4 halves. Runs while at least 4 elements remain past i.
310 for (; i < len - 3; i += 4)
311 vst1_f16(out_f16 + i, vcvt_f16_f32(vld1q_f32(in + i)));
// SIMD_EPILOGUE is a macro defined outside this block; presumably it visits
// the remaining elements (at most 3) one at a time -- TODO confirm.
312 SIMD_EPILOGUE(i, len, 3)
313 out_f16[i] =
__fp16(in[i]);
// ARM NEON implementation of the half -> float array conversion.
// Reads `len` 16-bit values from `in` and writes `len` floats to `out`.
316static void qFloatFromFloat16_fast(
float *out,
const quint16 *in, qsizetype len)
noexcept
// View the quint16 input buffer as __fp16 so the NEON load and the scalar
// conversion below can read it directly.
318 const __fp16 *in_f16 =
reinterpret_cast<
const __fp16 *>(in);
// Vector loop, 4 elements per iteration: load 4 halves, widen them to float,
// store the 4 floats. Runs while at least 4 elements remain past i.
320 for (; i < len - 3; i += 4)
321 vst1q_f32(out + i, vcvt_f32_f16(vld1_f16(in_f16 + i)));
// SIMD_EPILOGUE is a macro defined outside this block; presumably it visits
// the remaining elements (at most 3) one at a time -- TODO confirm.
322 SIMD_EPILOGUE(i, len, 3)
323 out[i] =
float(in_f16[i]);
342
343
344
345
346
347
348
349
350
// Fast path: delegate to qFloatToFloat16_fast(), passing the output buffer
// reinterpreted as raw quint16 storage.
// NOTE(review): presumably guarded by hasFastF16() -- confirm in the full file.
354 return qFloatToFloat16_fast(
reinterpret_cast<quint16 *>(out), in, len);
// Fallback: walk all `len` elements one at a time.
356 for (qsizetype i = 0; i < len; ++i)
361
362
363
364
365
366
367
368
369
// Fast path: delegate to qFloatFromFloat16_fast(), passing the input buffer
// reinterpreted as raw quint16 storage.
// NOTE(review): presumably guarded by hasFastF16() -- confirm in the full file.
373 return qFloatFromFloat16_fast(out,
reinterpret_cast<
const quint16 *>(in), len);
// Fallback: convert each element through the float conversion of in[i].
375 for (qsizetype i = 0; i < len; ++i)
376 out[i] =
float(in[i]);
380
381
382
383
384
385
386
387
388
390#ifndef QT_NO_DATASTREAM
392
393
394
395
396
397
398
399
400
401
402QDataStream &operator<<(QDataStream &ds,
qfloat16 f)
408
409
410
411
412
413
414
415
416
417
418
// Text-stream output for qfloat16: converts `f` to float, writes it with the
// stream's float insertion, and returns the result of that insertion.
433QTextStream &operator<<(QTextStream &ts,
qfloat16 f)
435 return ts <<
float(f);
\keyword 16-bit Floating Point Support\inmodule QtCore \inheaderfile QFloat16
Q_CORE_EXPORT int fpClassify() const noexcept
QTextStream & operator>>(QTextStream &ts, qfloat16 &f16)
static void qFloatToFloat16_fast(quint16 *, const float *, qsizetype) noexcept
static void qFloatFromFloat16_fast(float *, const quint16 *, qsizetype) noexcept
QDataStream & operator>>(QDataStream &ds, qfloat16 &f)
Q_CORE_EXPORT void qFloatFromFloat16(float *, const qfloat16 *, qsizetype length) noexcept
Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *, const float *, qsizetype length) noexcept