Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qsimd_p.h
Go to the documentation of this file.
1// Copyright (C) 2021 The Qt Company Ltd.
2// Copyright (C) 2022 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#ifndef QSIMD_P_H
6#define QSIMD_P_H
7
8//
9// W A R N I N G
10// -------------
11//
12// This file is not part of the Qt API. It exists purely as an
13// implementation detail. This header file may change from version to
14// version without notice, or even be removed.
15//
16// We mean it.
17//
18
19#include <QtCore/private/qglobal_p.h>
20#include <QtCore/qsimd.h>
21
22QT_WARNING_PUSH
23QT_WARNING_DISABLE_CLANG("-Wundef")
24QT_WARNING_DISABLE_GCC("-Wundef")
25QT_WARNING_DISABLE_INTEL(103)
26
/*
 * Scalar prologue loop header for code that wants `ptr` 16-byte aligned.
 * The caller supplies the loop body and an already-declared index `i`.
 * The bound is the smaller of `length` and the number of 4-byte steps from
 * `ptr` to the next 16-byte boundary (the ">> 2" treats the address in units
 * of 4 bytes). NOTE(review): this only lines up if `i` starts at 0 and the
 * elements are 4 bytes wide -- confirm against callers.
 */
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
29
/*
 * Scalar prologue loop header for code that wants `ptr` 32-byte aligned.
 * The caller supplies the loop body and an already-declared index `i`.
 * The bound is the smaller of `length` and the number of 4-byte steps from
 * `ptr` to the next 32-byte boundary (the ">> 2" treats the address in units
 * of 4 bytes). NOTE(review): this only lines up if `i` starts at 0 and the
 * elements are 4 bytes wide -- confirm against callers.
 */
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
32
/*
 * Scalar epilogue loop header: runs the caller-supplied body while
 * `i < length`, but for at most `max` iterations. `i` is the caller's
 * index variable and is advanced by the loop; `_i` is a private counter.
 * Arguments are parenthesized so that expressions can be passed safely.
 */
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
35
36/*
37 * Code can use the following constructs to determine compiler support & status:
38 * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
39 * If this test passes, then the compiler is already generating code for that
40 * given sub-architecture. The intrinsics for that sub-architecture are
41 * #included and can be used without restriction or runtime check.
42 *
43 * - #if QT_COMPILER_SUPPORTS(XXX)
44 * If this test passes, then the compiler is able to generate code for that
45 * given sub-architecture in another translation unit, given the right set of
46 * flags. Use of the intrinsics is not guaranteed. This is useful with
47 * runtime detection (see below).
48 *
49 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
50 * If this test passes, then the compiler is able to generate code for that
51 * given sub-architecture in this translation unit, even if it is not doing
52 * that now (it might be). Individual functions may be tagged with
53 * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
54 * sub-arch. Only inside such functions is the use of the intrinsics
55 * guaranteed to work. This is useful with runtime detection (see below).
56 *
57 * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is
58 * historical: GCC 4.8 needed the distinction.
59 *
60 * Runtime detection of a CPU sub-architecture can be done with the
61 * qCpuHasFeature(XXX) function. There are two strategies for generating
62 * optimized code like that:
63 *
64 * 1) place the optimized code in a different translation unit (C or assembly
65 * sources) and pass the correct flags to the compiler to enable support. Those
66 * sources must not include qglobal.h, which means they cannot include this
67 * file either. The dispatcher function would look like this:
68 *
69 * void foo()
70 * {
71 * #if QT_COMPILER_SUPPORTS(XXX)
72 * if (qCpuHasFeature(XXX)) {
73 * foo_optimized_xxx();
74 * return;
75 * }
76 * #endif
77 * foo_plain();
78 * }
79 *
80 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
81 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
82 * other Qt code. The dispatcher function would look like this:
83 *
84 * void foo()
85 * {
86 * #if QT_COMPILER_SUPPORTS_HERE(XXX)
87 * if (qCpuHasFeature(XXX)) {
88 * foo_optimized_xxx();
89 * return;
90 * }
91 * #endif
92 * foo_plain();
93 * }
94 */
95
96#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
97#include <intrin.h>
98#endif
99
/*
 * For use in #if only: true when the macro QT_COMPILER_SUPPORTS_<x> is
 * defined and expands to a non-zero value.
 */
#define QT_COMPILER_SUPPORTS(x) (defined QT_COMPILER_SUPPORTS_##x && QT_COMPILER_SUPPORTS_##x)
101
/*
 * Per-architecture definitions of:
 *  - QT_COMPILER_SUPPORTS_HERE(x): for use in #if; true when code for
 *    sub-architecture x can be generated in this translation unit.
 *  - QT_FUNCTION_TARGET(x): function attribute selecting sub-architecture x
 *    (expands to nothing where function targets are not used).
 */
#if defined(Q_PROCESSOR_ARM_64)
# define QT_COMPILER_SUPPORTS_HERE(x) ((defined __ARM_FEATURE_##x && __ARM_FEATURE_##x) || (defined __##x##__ && __##x##__) || QT_COMPILER_SUPPORTS(x))
# if defined(Q_CC_GNU) || defined(Q_CC_CLANG)
   /* GCC requires attributes for a function */
#  define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
# else
#  define QT_FUNCTION_TARGET(x)
# endif
#elif defined(Q_PROCESSOR_ARM_32)
   /* We do not support runtime CPU feature switching on ARM32 */
# define QT_COMPILER_SUPPORTS_HERE(x) ((defined __ARM_FEATURE_##x && __ARM_FEATURE_##x) || (defined __##x##__ && __##x##__))
# define QT_FUNCTION_TARGET(x)
#elif defined(Q_PROCESSOR_MIPS)
# define QT_COMPILER_SUPPORTS_HERE(x) (defined __##x##__ && __##x##__)
# define QT_FUNCTION_TARGET(x)
   /*
    * Synthesize the __MIPS_DSP*__ macros from the compiler's __mips_dsp*.
    * They are defined to 1 (not empty) so that QT_COMPILER_SUPPORTS_HERE(),
    * which evaluates the macro's value, stays a valid #if expression.
    */
# if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
#  define __MIPS_DSP__ 1
# endif
# if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
#  define __MIPS_DSPR2__ 1
# endif
#elif defined(Q_PROCESSOR_LOONGARCH)
# define QT_COMPILER_SUPPORTS_HERE(x) QT_COMPILER_SUPPORTS(x)
# define QT_FUNCTION_TARGET(x)
#elif defined(Q_PROCESSOR_X86)
   /* clang-cl older than 19.0 only gets what the command line already enables */
# if defined(Q_CC_CLANG) && defined(Q_CC_MSVC) && (Q_CC_CLANG < 1900)
#  define QT_COMPILER_SUPPORTS_HERE(x) (defined __##x##__ && __##x##__)
# else
#  define QT_COMPILER_SUPPORTS_HERE(x) ((defined __##x##__ && __##x##__) || QT_COMPILER_SUPPORTS(x))
# endif
# if defined(Q_CC_GNU) || defined(Q_CC_CLANG)
   /* GCC requires attributes for a function */
#  define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
# else
#  define QT_FUNCTION_TARGET(x)
# endif
#else
# define QT_COMPILER_SUPPORTS_HERE(x) (defined __##x##__ && __##x##__)
# define QT_FUNCTION_TARGET(x)
#endif
142
#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
// The compiler says it targets SSE2, but QT_COMPILER_SUPPORTS_SSE2 is not
// defined: intrinsic support appears to be missing, so pretend none of these
// x86 features exist.
# undef __SSE__
# undef __SSE2__
# undef __SSE3__
# undef __SSSE3__
# undef __SSE4_1__
# undef __SSE4_2__
# undef __AES__
# undef __POPCNT__
# undef __AVX__
# undef __F16C__
# undef __RDRND__
# undef __AVX2__
# undef __BMI__
# undef __BMI2__
# undef __FMA__
# undef __MOVBE__
# undef __RDSEED__
# undef __AVX512F__
# undef __AVX512ER__
# undef __AVX512CD__
# undef __AVX512PF__
# undef __AVX512DQ__
# undef __AVX512BW__
# undef __AVX512VL__
# undef __AVX512IFMA__
# undef __AVX512VBMI__
# undef __SHA__
# undef __AVX512VBMI2__
# undef __AVX512BITALG__
# undef __AVX512VNNI__
# undef __AVX512VPOPCNTDQ__
# undef __GFNI__
# undef __VAES__
#endif
179
180#ifdef Q_PROCESSOR_X86
181/* -- x86 intrinsic support -- */
182
# if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX)
// The compiler for QNX is missing the intrinsic
#  undef QT_COMPILER_SUPPORTS_RDSEED
# endif
# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE2__, so do it ourselves
// NOTE(review): only __SSE__ is defined here although the comment above
// talks about __SSE2__ -- confirm that __SSE2__ is supplied elsewhere.
#  define __SSE__ 1
# endif
191
# if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
// 64-bit GCC on Windows does not support AVX, so we hack around it by forcing
// it to emit unaligned loads & stores
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
//
// Each assembler macro below shadows an aligned AVX/AVX-512 move mnemonic and
// re-emits it as the corresponding unaligned one with the same operands.
asm(
    ".macro vmovapd args:vararg\n"
    " vmovupd \\args\n"
    ".endm\n"
    ".macro vmovaps args:vararg\n"
    " vmovups \\args\n"
    ".endm\n"
    ".macro vmovdqa args:vararg\n"
    " vmovdqu \\args\n"
    ".endm\n"
    ".macro vmovdqa32 args:vararg\n"
    " vmovdqu32 \\args\n"
    ".endm\n"
    ".macro vmovdqa64 args:vararg\n"
    " vmovdqu64 \\args\n"
    ".endm\n"
);
# endif
214
215# if defined(Q_CC_GNU) && !defined(Q_OS_WASM)
216// GCC 4.4 and Clang 2.8 added a few more intrinsics there
217# include <x86intrin.h>
218# endif
219#ifdef Q_OS_WASM
220# include <immintrin.h>
221# endif
222
223# include <QtCore/private/qsimd_x86_p.h>
224
225// x86-64 sub-architecture version 3
226//
227// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
228// BMI1, BMI2, FMA, LZCNT, MOVBE. This feature set was chosen as the version 3
229// of the x86-64 ISA (x86-64-v3) and is supported by GCC and Clang. On systems
230// with the GNU libc, libraries with this feature can be installed on a
231// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the "x86_64h"
232// sub-architecture too.
233
# if defined(__AVX2__)
// List of features present with -march=x86-64-v3 and not architecturally
// implied by __AVX2__
#  define ARCH_HASWELL_MACROS \
    (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__)
// Each enabled feature macro contributes 1 to the sum; anything other than 7
// means at least one of the seven features is not enabled.
#  if ARCH_HASWELL_MACROS != 7
#   error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
#  endif
static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
#  define __haswell__ 1
#  undef ARCH_HASWELL_MACROS
# endif
246
247// x86-64 sub-architecture version 4
248//
249// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core
250// 6th generation (codename "Skylake"). AMD Zen4 is their first processor
251// with AVX512 support and it includes all of these too. The GNU libc subdir for
252// this is "glibc-hwcaps/x86-64-v4".
253//
// Each enabled AVX512 feature macro contributes 1 to the sum: 0 means none
// of them is enabled, 5 means the full x86-64-v4 set is, and anything in
// between is rejected as a partial configuration.
# define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
# if ARCH_SKX_MACROS != 0
#  if ARCH_SKX_MACROS != 5
#   error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f"
#  endif
static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
#  define __skylake_avx512__ 1
# endif
# undef ARCH_SKX_MACROS
263#endif /* Q_PROCESSOR_X86 */
264
265// NEON intrinsics
266// note: GCC (as of 4.9) does not support function targets for ARM
267#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
// Target string that QT_FUNCTION_TARGET(NEON) hands to the compiler's
// "target" function attribute.
#if defined(Q_CC_CLANG)
#define QT_FUNCTION_TARGET_STRING_NEON "neon"
#else
#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
#endif
#ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
// Defined to 1 rather than empty so that it can also be evaluated in #if
// expressions such as QT_COMPILER_SUPPORTS_HERE().
#define __ARM_NEON__ 1
#endif
277
278#ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on Aarch64
279static inline uint16_t vaddvq_u16(uint16x8_t v8)
280{
281 const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8));
282 const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2));
283 return vget_lane_u16(vreinterpret_u16_u64(v1), 0);
284}
285
286static inline uint8_t vaddv_u8(uint8x8_t v8)
287{
288 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
289 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
290}
291#endif
292
293// Missing NEON intrinsics, needed due to different type definitions:
294static inline uint16x8_t qvsetq_n_u16(uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4,
295 uint16_t v5, uint16_t v6, uint16_t v7, uint16_t v8)
296{
297#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
298 using u64 = uint64_t;
299 const uint16x8_t vmask = {
300 v1 | (v2 << 16) | (u64(v3) << 32) | (u64(v4) << 48),
301 v5 | (v6 << 16) | (u64(v7) << 32) | (u64(v8) << 48)
302 };
303#else
304 const uint16x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
305#endif
306 return vmask;
307}
308static inline uint8x8_t qvset_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
309 uint8_t v6, uint8_t v7, uint8_t v8)
310{
311#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
312 using u64 = uint64_t;
313 const uint8x8_t vmask = {
314 v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
315 (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56)
316 };
317#else
318 const uint8x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
319#endif
320 return vmask;
321}
322static inline uint8x16_t qvsetq_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
323 uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
324 uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
325 uint8_t v15, uint8_t v16)
326{
327#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
328 using u64 = uint64_t;
329 const uint8x16_t vmask = {
330 v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
331 (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56),
332 v9 | (v10 << 8) | (v11 << 16) | (v12 << 24) |
333 (u64(v13) << 32) | (u64(v14) << 40) | (u64(v15) << 48) | (u64(v16) << 56)
334 };
335#else
336 const uint8x16_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8,
337 v9, v10, v11, v12, v13, v14, v15, v16};
338#endif
339 return vmask;
340}
341static inline uint32x4_t qvsetq_n_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
342{
343#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
344 return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
345#else
346 return uint32x4_t{ a, b, c, d };
347#endif
348}
349#endif
350
// MSVC targeting ARM64 (_M_ARM64) with __ARM_ARCH >= 800: advertise the
// Crypto and CRC32 extensions through the ACLE feature macros, which are
// what the feature checks in this header test.
#if defined(_M_ARM64) && __ARM_ARCH >= 800
#define __ARM_FEATURE_CRYPTO 1
#define __ARM_FEATURE_CRC32 1
#endif
355
// Per-compiler target strings used by QT_FUNCTION_TARGET(AES|CRC32|SVE) on
// ARM. Under MSVC the macros are defined but empty.
#if defined(Q_PROCESSOR_ARM_64)
# if defined(Q_CC_CLANG)
#  define QT_FUNCTION_TARGET_STRING_AES "aes"
#  define QT_FUNCTION_TARGET_STRING_CRC32 "crc"
#  define QT_FUNCTION_TARGET_STRING_SVE "sve"
# elif defined(Q_CC_GNU)
#  define QT_FUNCTION_TARGET_STRING_AES "+crypto"
#  define QT_FUNCTION_TARGET_STRING_CRC32 "+crc"
#  define QT_FUNCTION_TARGET_STRING_SVE "+sve"
# elif defined(Q_CC_MSVC)
#  define QT_FUNCTION_TARGET_STRING_AES
#  define QT_FUNCTION_TARGET_STRING_CRC32
#  define QT_FUNCTION_TARGET_STRING_SVE
# endif
#elif defined(Q_PROCESSOR_ARM_32)
// No SVE target string on 32-bit ARM.
# if defined(Q_CC_CLANG)
#  define QT_FUNCTION_TARGET_STRING_AES "armv8-a,crypto"
#  define QT_FUNCTION_TARGET_STRING_CRC32 "armv8-a,crc"
# elif defined(Q_CC_GNU)
#  define QT_FUNCTION_TARGET_STRING_AES "arch=armv8-a+crypto"
#  define QT_FUNCTION_TARGET_STRING_CRC32 "arch=armv8-a+crc"
# endif
#endif
379
380#ifndef Q_PROCESSOR_X86
// CPU feature bits for non-x86 processors. Each enumerator is a distinct
// single bit so that values can be OR-ed into a feature mask and tested by
// qCpuHasFeature(). Numbering starts at 2; the value 1 is not assigned here.
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
    CpuFeatureNEON = 2,
    CpuFeatureARM_NEON = CpuFeatureNEON,    // alias
    CpuFeatureCRC32 = 4,
    CpuFeatureAES = 8,
    CpuFeatureARM_CRYPTO = CpuFeatureAES,   // alias
    CpuFeatureSVE = 16,
#elif defined(Q_PROCESSOR_MIPS)
    CpuFeatureDSP = 2,
    CpuFeatureDSPR2 = 4,
#elif defined(Q_PROCESSOR_LOONGARCH)
    CpuFeatureLSX = 2,
    CpuFeatureLASX = 4,
#endif
};
397
// Feature bits the compiler is already generating code for in this
// translation unit, derived from its predefined macros. qCpuHasFeature()
// tests this mask first and only then falls back to the runtime-detected one.
static const uint64_t qCompilerCpuFeatures = 0
#if defined __ARM_NEON__
        | CpuFeatureNEON
#endif
#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64))
        // Yocto Project recipes enable Crypto extension for all ARMv8 configs,
        // even for targets without the Crypto extension. That's wrong, but as
        // the compiler never generates the code for them on their own, most
        // code never notices the problem. But we would. By not setting the
        // bits here, we force a runtime detection.
#if defined __ARM_FEATURE_CRC32
        | CpuFeatureCRC32
#endif
#if defined (__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)
        | CpuFeatureAES
#endif
#endif // !(Q_OS_LINUX && Q_PROCESSOR_ARM_64)
#if defined(__ARM_FEATURE_SVE) && defined(Q_PROCESSOR_ARM_64)
        | CpuFeatureSVE
#endif
#if defined __mips_dsp
        | CpuFeatureDSP
#endif
#if defined __mips_dspr2
        | CpuFeatureDSPR2
#endif
#if defined __loongarch_sx
        | CpuFeatureLSX
#endif
#if defined __loongarch_asx
        | CpuFeatureLASX
#endif
        ;
431#endif
432
// Q_ATOMIC(T) spells "atomic T" for both languages this header is compiled
// as. The C++ branch also brings atomic_load_explicit and
// memory_order_relaxed into scope under their C names, then opens the Qt
// namespace and an extern "C" block; both are closed further down the file.
#ifdef __cplusplus
# include <atomic>
# define Q_ATOMIC(T) std::atomic<T>
QT_BEGIN_NAMESPACE
using std::atomic_load_explicit;
static constexpr auto memory_order_relaxed = std::memory_order_relaxed;
extern "C" {
#else
# include <stdatomic.h>
# define Q_ATOMIC(T) _Atomic(T)
#endif
444
445#ifdef Q_PROCESSOR_X86
446typedef uint64_t QCpuFeatureType;
447static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures;
448static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell;
449static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512;
450#else
451typedef unsigned QCpuFeatureType;
452#endif
453extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1];
454Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
455
456static inline uint64_t qCpuFeatures()
457{
458#ifdef QT_BOOTSTRAPPED
459 return qCompilerCpuFeatures; // no detection
460#else
461 quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed);
462 if (!QT_SUPPORTS_INIT_PRIORITY) {
463 if (Q_UNLIKELY(features == 0))
464 features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
465 }
466 return features;
467#endif
468}
469
// True when every bit of CpuFeature<feature> is set either in the
// compile-time mask (tested first, so the runtime lookup is skipped when the
// compiler already targets the feature) or in the runtime mask returned by
// qCpuFeatures(). Multi-bit features must be fully present in ONE of the two
// masks; bits are not combined across them.
#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
                                 || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
472
473#ifdef __cplusplus
474} // extern "C"
475
476# if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
477Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
478
479static inline bool qHasHwrng()
480{
481 return qCpuHasFeature(RDRND);
482}
483# else
484static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
485{
486 return 0;
487}
488static inline bool qHasHwrng()
489{
490 return false;
491}
492# endif
493
494QT_END_NAMESPACE
495
496#endif // __cplusplus
497
498QT_WARNING_POP
499
500#endif // QSIMD_P_H
#define assert
void qDumpCPUFeatures()
Definition qsimd.cpp:688
#define QT_FUNCTION_TARGET_BASELINE
Definition qsimd.cpp:24
QT_FUNCTION_TARGET_BASELINE uint64_t QT_MANGLE_NAMESPACE qDetectCpuFeatures()
Definition qsimd.cpp:636
static constexpr auto SimdInitialized
Definition qsimd.cpp:632
static const int features_indices[]
Definition qsimd.cpp:118
static uint detectProcessorFeatures()
Definition qsimd.cpp:623
static const char features_string[]
Definition qsimd.cpp:117
static const quint64 minFeature
Definition qsimd.cpp:630
#define QT_COMPILER_SUPPORTS_HERE(x)
Definition qsimd_p.h:139
static const uint64_t qCompilerCpuFeatures
Definition qsimd_p.h:398
static uint64_t qCpuFeatures()
Definition qsimd_p.h:456
unsigned QCpuFeatureType
Definition qsimd_p.h:451
#define Q_ATOMIC(T)
Definition qsimd_p.h:442