Qt
Internal/Contributor docs for the Qt SDK. <b>Note:</b> These are NOT official API docs; those are found <a href='https://doc.qt.io/'>here</a>.
Loading...
Searching...
No Matches
qfloat16.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2016 by Southwest Research Institute (R)
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include "qfloat16.h"
6#include "private/qsimd_p.h"
7#include <cmath> // for fpclassify()'s return values
8
9#include <QtCore/qdatastream.h>
10#include <QtCore/qmetatype.h>
11#include <QtCore/qtextstream.h>
12
15
17
18
141int qfloat16::fpClassify() const noexcept
142{
143 return isInf() ? FP_INFINITE : isNaN() ? FP_NAN
144 : !(b16 & 0x7fff) ? FP_ZERO : isNormal() ? FP_NORMAL : FP_SUBNORMAL;
145}
146
172#if QT_COMPILER_SUPPORTS_HERE(F16C)
173static inline bool hasFastF16()
174{
175 // qsimd.cpp:detectProcessorFeatures() turns off this feature if AVX
176 // state-saving is not enabled by the OS
177 return qCpuHasFeature(F16C);
178}
179
180#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
181static bool hasFastF16Avx256()
182{
183 // 256-bit AVX512 don't have a performance penalty (see qstring.cpp for more info)
184 return qCpuHasFeature(ArchSkylakeAvx512);
185}
186
187static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
188void qFloatToFloat16_tail_avx256(quint16 *out, const float *in, qsizetype len) noexcept
189{
190 __mmask16 mask = _bzhi_u32(-1, len);
191 __m256 f32 = _mm256_maskz_loadu_ps(mask, in );
192 __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT);
193 _mm_mask_storeu_epi16(out, mask, f16);
194};
195
196static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
197void qFloatFromFloat16_tail_avx256(float *out, const quint16 *in, qsizetype len) noexcept
198{
199 __mmask16 mask = _bzhi_u32(-1, len);
200 __m128i f16 = _mm_maskz_loadu_epi16(mask, in);
201 __m256 f32 = _mm256_cvtph_ps(f16);
202 _mm256_mask_storeu_ps(out, mask, f32);
203};
204#endif
205
207static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept
208{
209 constexpr qsizetype Step = sizeof(__m256i) / sizeof(float);
210 constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float);
211 qsizetype i = 0;
212
213 if (len >= Step) {
214 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
215 __m256 f32 = _mm256_loadu_ps(in + offset);
216 __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
217 _mm_storeu_si128(reinterpret_cast<__m128i *>(out + offset), f16);
218 };
219
220 // main loop: convert Step (8) floats per iteration
221 for ( ; i + Step < len; i += Step)
222 convertOneChunk(i);
223
224 // epilogue: convert the last chunk, possibly overlapping with the last
225 // iteration of the loop
226 return convertOneChunk(len - Step);
227 }
228
229#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
230 if (hasFastF16Avx256())
231 return qFloatToFloat16_tail_avx256(out, in, len);
232#endif
233
234 if (len >= HalfStep) {
235 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
236 __m128 f32 = _mm_loadu_ps(in + offset);
237 __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
238 _mm_storel_epi64(reinterpret_cast<__m128i *>(out + offset), f16);
239 };
240
241 // two conversions, possibly overlapping
242 convertOneChunk(0);
243 return convertOneChunk(len - HalfStep);
244 }
245
246 // Inlining "qfloat16::qfloat16(float f)":
247 for ( ; i < len; ++i)
248 out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
249}
250
252static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept
253{
254 constexpr qsizetype Step = sizeof(__m256i) / sizeof(float);
255 constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float);
256 qsizetype i = 0;
257
258 if (len >= Step) {
259 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
260 __m128i f16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + offset));
261 __m256 f32 = _mm256_cvtph_ps(f16);
262 _mm256_storeu_ps(out + offset, f32);
263 };
264
265 // main loop: convert Step (8) floats per iteration
266 for ( ; i + Step < len; i += Step)
267 convertOneChunk(i);
268
269 // epilogue: convert the last chunk, possibly overlapping with the last
270 // iteration of the loop
271 return convertOneChunk(len - Step);
272 }
273
274#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
275 if (hasFastF16Avx256())
276 return qFloatFromFloat16_tail_avx256(out, in, len);
277#endif
278
279 if (len >= HalfStep) {
280 auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
281 __m128i f16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(in + offset));
282 __m128 f32 = _mm_cvtph_ps(f16);
283 _mm_storeu_ps(out + offset, f32);
284 };
285
286 // two conversions, possibly overlapping
287 convertOneChunk(0);
288 return convertOneChunk(len - HalfStep);
289 }
290
291 // Inlining "qfloat16::operator float()":
292 for ( ; i < len; ++i)
293 out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
294}
295
296#elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__) && (__ARM_FP & 2)
// In this configuration the fast conversion routines are always used: the
// answer is fixed at compile time and no runtime detection is performed.
static inline bool hasFastF16()
{
    constexpr bool fastPathAvailable = true;
    return fastPathAvailable;
}
301
302static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept
303{
304 __fp16 *out_f16 = reinterpret_cast<__fp16 *>(out);
305 qsizetype i = 0;
306 for (; i < len - 3; i += 4)
307 vst1_f16(out_f16 + i, vcvt_f16_f32(vld1q_f32(in + i)));
308 SIMD_EPILOGUE(i, len, 3)
309 out_f16[i] = __fp16(in[i]);
310}
311
312static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept
313{
314 const __fp16 *in_f16 = reinterpret_cast<const __fp16 *>(in);
315 qsizetype i = 0;
316 for (; i < len - 3; i += 4)
317 vst1q_f32(out + i, vcvt_f32_f16(vld1_f16(in_f16 + i)));
318 SIMD_EPILOGUE(i, len, 3)
319 out[i] = float(in_f16[i]);
320}
321#else
// No accelerated conversion exists in this configuration, so the entry
// points always take their element-by-element path.
static inline bool hasFastF16()
{
    constexpr bool fastPathAvailable = false;
    return fastPathAvailable;
}
326
327static void qFloatToFloat16_fast(quint16 *, const float *, qsizetype) noexcept
328{
329 Q_UNREACHABLE();
330}
331
332static void qFloatFromFloat16_fast(float *, const quint16 *, qsizetype) noexcept
333{
334 Q_UNREACHABLE();
335}
336#endif
347Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *out, const float *in, qsizetype len) noexcept
348{
349 if (hasFastF16())
350 return qFloatToFloat16_fast(reinterpret_cast<quint16 *>(out), in, len);
351
352 for (qsizetype i = 0; i < len; ++i)
353 out[i] = qfloat16(in[i]);
354}
355
366Q_CORE_EXPORT void qFloatFromFloat16(float *out, const qfloat16 *in, qsizetype len) noexcept
367{
368 if (hasFastF16())
369 return qFloatFromFloat16_fast(out, reinterpret_cast<const quint16 *>(in), len);
370
371 for (qsizetype i = 0; i < len; ++i)
372 out[i] = float(in[i]);
373}
374
388#ifndef QT_NO_DATASTREAM
401{
402 return ds << f.b16;
403}
404
418{
419 return ds >> f.b16;
420}
421#endif
422
424{
425 float f;
426 ts >> f;
427 f16 = qfloat16(f);
428 return ts;
429}
430
432{
433 return ts << float(f);
434}
435
437
438#include "qfloat16tables.cpp"
\inmodule QtCore\reentrant
Definition qdatastream.h:46
\inmodule QtCore
\keyword 16-bit Floating Point Support\inmodule QtCore \inheaderfile QFloat16
Definition qfloat16.h:47
Q_CORE_EXPORT void qFloatFromFloat16(float *out, const qfloat16 *in, qsizetype len) noexcept
Definition qfloat16.cpp:366
Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *out, const float *in, qsizetype len) noexcept
Definition qfloat16.cpp:347
Combined button and popup list for selecting options.
QDataStream & operator<<(QDataStream &ds, qfloat16 f)
Definition qfloat16.cpp:400
static bool hasFastF16()
Definition qfloat16.cpp:322
static void qFloatToFloat16_fast(quint16 *, const float *, qsizetype) noexcept
Definition qfloat16.cpp:327
static void qFloatFromFloat16_fast(float *, const quint16 *, qsizetype) noexcept
Definition qfloat16.cpp:332
QDataStream & operator>>(QDataStream &ds, qfloat16 &f)
Definition qfloat16.cpp:417
#define QT_DECL_METATYPE_EXTERN(TYPE, EXPORT)
Definition qmetatype.h:1388
#define QT_IMPL_METATYPE_EXTERN(TYPE)
Definition qmetatype.h:1390
GLfloat GLfloat f
GLenum GLuint GLintptr offset
GLint GLint GLint GLint GLint GLint GLint GLbitfield mask
GLuint in
GLenum GLsizei len
#define qCpuHasFeature(feature)
Definition qsimd_p.h:387
#define QT_FUNCTION_TARGET(x)
Definition qsimd_p.h:133
#define SIMD_EPILOGUE(i, length, max)
Definition qsimd_p.h:33
unsigned short quint16
Definition qtypes.h:48
ptrdiff_t qsizetype
Definition qtypes.h:165
QTextStream out(stdout)
[7]