Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qsimd_p.h
Go to the documentation of this file.
1// Copyright (C) 2021 The Qt Company Ltd.
2// Copyright (C) 2022 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4// Qt-Security score:significant reason:default
5
6#ifndef QSIMD_P_H
7#define QSIMD_P_H
8
9//
10// W A R N I N G
11// -------------
12//
13// This file is not part of the Qt API. It exists purely as an
14// implementation detail. This header file may change from version to
15// version without notice, or even be removed.
16//
17// We mean it.
18//
19
20#include <QtCore/private/qglobal_p.h>
21#include <QtCore/qsimd.h>
22
23QT_WARNING_PUSH
24QT_WARNING_DISABLE_CLANG("-Wundef")
25QT_WARNING_DISABLE_GCC("-Wundef")
26QT_WARNING_DISABLE_INTEL(103)
27
// Loop header for the unaligned head of a buffer: advances \a i while it is
// below both \a length and the number of 4-byte steps needed to bring \a ptr
// up to the next 16-byte boundary, i.e. (4 - ((address >> 2) & 3)) & 3.
// The statement that follows the macro is the loop body; \a i is not
// initialized here, so the caller chooses the starting index.
// NOTE(review): the ">> 2" implies 4-byte elements -- confirm against callers.
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
30
// Loop header for the unaligned head of a buffer: advances \a i while it is
// below both \a length and the number of 4-byte steps needed to bring \a ptr
// up to the next 32-byte boundary, i.e. (8 - ((address >> 2) & 7)) & 7.
// The statement that follows the macro is the loop body; \a i is not
// initialized here, so the caller chooses the starting index.
// NOTE(review): the ">> 2" implies 4-byte elements -- confirm against callers.
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
33
// Loop header for the scalar tail after a vectorized loop: keeps advancing
// \a i until it reaches \a length, but runs at most \a max iterations. The
// statement that follows the macro is the loop body. \a i must be a modifiable
// lvalue and is left at its final value after the loop.
//
// Arguments are parenthesized so that expressions with low-precedence
// operators (for example "cond ? a : b" as the length) compare as a unit.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
36
37/*
38 * Code can use the following constructs to determine compiler support & status:
39 * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
40 * If this test passes, then the compiler is already generating code for that
41 * given sub-architecture. The intrinsics for that sub-architecture are
42 * #included and can be used without restriction or runtime check.
43 *
44 * - #if QT_COMPILER_SUPPORTS(XXX)
45 * If this test passes, then the compiler is able to generate code for that
46 * given sub-architecture in another translation unit, given the right set of
47 * flags. Use of the intrinsics is not guaranteed. This is useful with
48 * runtime detection (see below).
49 *
50 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
51 * If this test passes, then the compiler is able to generate code for that
52 * given sub-architecture in this translation unit, even if it is not doing
53 * that now (it might be). Individual functions may be tagged with
54 * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
55 * sub-arch. Only inside such functions is the use of the intrinsics
56 * guaranteed to work. This is useful with runtime detection (see below).
57 *
58 * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is
59 * historical: GCC 4.8 needed the distinction.
60 *
61 * Runtime detection of a CPU sub-architecture can be done with the
62 * qCpuHasFeature(XXX) function. There are two strategies for generating
63 * optimized code like that:
64 *
65 * 1) place the optimized code in a different translation unit (C or assembly
66 * sources) and pass the correct flags to the compiler to enable support. Those
67 * sources must not include qglobal.h, which means they cannot include this
68 * file either. The dispatcher function would look like this:
69 *
70 * void foo()
71 * {
72 * #if QT_COMPILER_SUPPORTS(XXX)
73 * if (qCpuHasFeature(XXX)) {
74 * foo_optimized_xxx();
75 * return;
76 * }
77 * #endif
78 * foo_plain();
79 * }
80 *
81 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
82 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
83 * other Qt code. The dispatcher function would look like this:
84 *
85 * void foo()
86 * {
87 * #if QT_COMPILER_SUPPORTS_HERE(XXX)
88 * if (qCpuHasFeature(XXX)) {
89 * foo_optimized_xxx();
90 * return;
91 * }
92 * #endif
93 * foo_plain();
94 * }
95 */
96
97#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
98#include <intrin.h>
99#endif
100
101#define QT_COMPILER_SUPPORTS(x) (defined QT_COMPILER_SUPPORTS_##x && QT_COMPILER_SUPPORTS_##x)
102
103#if defined(Q_PROCESSOR_ARM_64)
104# define QT_COMPILER_SUPPORTS_HERE(x) ((defined __ARM_FEATURE_##x && __ARM_FEATURE_##x) || (defined __##x##__ && __##x##__) || QT_COMPILER_SUPPORTS(x))
105# if defined(Q_CC_GNU) || defined(Q_CC_CLANG)
106 /* GCC requires attributes for a function */
107# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
108# else
109# define QT_FUNCTION_TARGET(x)
110# endif
111#elif defined(Q_PROCESSOR_ARM_32)
112 /* We do not support runtime CPU feature switching on ARM32 */
113# define QT_COMPILER_SUPPORTS_HERE(x) ((defined __ARM_FEATURE_##x && __ARM_FEATURE_##x) || (defined __##x##__ && __##x##__))
114# define QT_FUNCTION_TARGET(x)
115#elif defined(Q_PROCESSOR_MIPS)
116# define QT_COMPILER_SUPPORTS_HERE(x) (defined __##x##__ && __##x##__)
117# define QT_FUNCTION_TARGET(x)
118# if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
119# define __MIPS_DSP__
120# endif
121# if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
122# define __MIPS_DSPR2__
123# endif
124#elif defined(Q_PROCESSOR_LOONGARCH)
125# define QT_COMPILER_SUPPORTS_HERE(x) QT_COMPILER_SUPPORTS(x)
126# define QT_FUNCTION_TARGET(x)
127#elif defined(Q_PROCESSOR_X86)
128# if defined(Q_CC_CLANG) && defined(Q_CC_MSVC) && (Q_CC_CLANG < 1900)
129# define QT_COMPILER_SUPPORTS_HERE(x) (defined __##x##__ && __##x##__)
130# else
131# define QT_COMPILER_SUPPORTS_HERE(x) ((defined __##x##__ && __##x##__) || QT_COMPILER_SUPPORTS(x))
132# endif
133# if defined(Q_CC_GNU) || defined(Q_CC_CLANG)
134 /* GCC requires attributes for a function */
135# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
136# else
137# define QT_FUNCTION_TARGET(x)
138# endif
139#else
140# define QT_COMPILER_SUPPORTS_HERE(x) (defined __##x##__ && __##x##__)
141# define QT_FUNCTION_TARGET(x)
142#endif
143
144#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
145// Intrinsic support appears to be missing, so pretend these features don't exist
146# undef __SSE__
147# undef __SSE2__
148# undef __SSE3__
149# undef __SSSE3__
150# undef __SSE4_1__
151# undef __SSE4_2__
152# undef __AES__
153# undef __POPCNT__
154# undef __AVX__
155# undef __F16C__
156# undef __RDRND__
157# undef __AVX2__
158# undef __BMI__
159# undef __BMI2__
160# undef __FMA__
161# undef __MOVBE__
162# undef __RDSEED__
163# undef __AVX512F__
164# undef __AVX512ER__
165# undef __AVX512CD__
166# undef __AVX512PF__
167# undef __AVX512DQ__
168# undef __AVX512BW__
169# undef __AVX512VL__
170# undef __AVX512IFMA__
171# undef __AVX512VBMI__
172# undef __SHA__
173# undef __AVX512VBMI2__
174# undef __AVX512BITALG__
175# undef __AVX512VNNI__
176# undef __AVX512VPOPCNTDQ__
177# undef __GFNI__
178# undef __VAES__
179#endif
180
181#ifdef Q_PROCESSOR_X86
182/* -- x86 intrinsic support -- */
183
184# if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX)
185// The compiler for QNX is missing the intrinsic
186# undef QT_COMPILER_SUPPORTS_RDSEED
187# endif
188# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
189// MSVC doesn't define __SSE2__, so do it ourselves
190# define __SSE__ 1
191# endif
192
193# if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
194// 64-bit GCC on Windows does not support AVX, so we hack around it by forcing
195// it to emit unaligned loads & stores
196// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
197asm(
198 ".macro vmovapd args:vararg\n"
199 " vmovupd \\args\n"
200 ".endm\n"
201 ".macro vmovaps args:vararg\n"
202 " vmovups \\args\n"
203 ".endm\n"
204 ".macro vmovdqa args:vararg\n"
205 " vmovdqu \\args\n"
206 ".endm\n"
207 ".macro vmovdqa32 args:vararg\n"
208 " vmovdqu32 \\args\n"
209 ".endm\n"
210 ".macro vmovdqa64 args:vararg\n"
211 " vmovdqu64 \\args\n"
212 ".endm\n"
213);
214# endif
215
216# if defined(Q_CC_GNU) && !defined(Q_OS_WASM)
217// GCC 4.4 and Clang 2.8 added a few more intrinsics there
218# include <x86intrin.h>
219# endif
220#ifdef Q_OS_WASM
221# include <immintrin.h>
222# endif
223
224# include <QtCore/private/qsimd_x86_p.h>
225
226// x86-64 sub-architecture version 3
227//
228// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
229// BMI1, BMI2, FMA, LZCNT, MOVBE. This feature set was chosen as the version 3
230// of the x86-64 ISA (x86-64-v3) and is supported by GCC and Clang. On systems
231// with the GNU libc, libraries with this feature can be installed on a
232// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the "x86_64h"
233// sub-architecture too.
234
235# if defined(__AVX2__)
236// List of features present with -march=x86-64-v3 and not architecturally
237// implied by __AVX2__
// Sum of the seven feature macros; each is expected to expand to 1 when the
// feature is enabled, so a complete x86-64-v3 configuration yields exactly 7.
// The definition must stay on one logical line for the #if below to see it.
#    define ARCH_HASWELL_MACROS \
        (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__)
240# if ARCH_HASWELL_MACROS != 7
241# error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
242# endif
243static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
244# define __haswell__ 1
245# undef ARCH_HASWELL_MACROS
246# endif
247
248// x86-64 sub-architecture version 4
249//
250// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core
251// 6th generation (codename "Skylake"). AMD Zen4 is their first processor
252// with AVX512 support and it includes all of these too. The GNU libc subdir for
253// this is "glibc-hwcaps/x86-64-v4".
254//
255# define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
256# if ARCH_SKX_MACROS != 0
257# if ARCH_SKX_MACROS != 5
258# error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f"
259# endif
260static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
261# define __skylake_avx512__ 1
262# endif
263# undef ARCH_SKX_MACROS
264#endif /* Q_PROCESSOR_X86 */
265
266// NEON intrinsics
267// note: as of GCC 4.9, does not support function targets for ARM
268#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
269#if defined(Q_CC_CLANG)
270#define QT_FUNCTION_TARGET_STRING_NEON "neon"
271#else
272#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
273#endif
274#ifndef __ARM_NEON__
275// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
276#define __ARM_NEON__
277#endif
278
279#ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on Aarch64
280static inline uint16_t vaddvq_u16(uint16x8_t v8)
281{
282 const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8));
283 const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2));
284 return vget_lane_u16(vreinterpret_u16_u64(v1), 0);
285}
286
287static inline uint8_t vaddv_u8(uint8x8_t v8)
288{
289 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
290 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
291}
292#endif
293
294// Missing NEON intrinsics, needed due to different type definitions:
295static inline uint16x8_t qvsetq_n_u16(uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4,
296 uint16_t v5, uint16_t v6, uint16_t v7, uint16_t v8)
297{
298#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
299 using u64 = uint64_t;
300 const uint16x8_t vmask = {
301 v1 | (v2 << 16) | (u64(v3) << 32) | (u64(v4) << 48),
302 v5 | (v6 << 16) | (u64(v7) << 32) | (u64(v8) << 48)
303 };
304#else
305 const uint16x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
306#endif
307 return vmask;
308}
309static inline uint8x8_t qvset_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
310 uint8_t v6, uint8_t v7, uint8_t v8)
311{
312#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
313 using u64 = uint64_t;
314 const uint8x8_t vmask = {
315 v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
316 (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56)
317 };
318#else
319 const uint8x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
320#endif
321 return vmask;
322}
323static inline uint8x16_t qvsetq_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
324 uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
325 uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
326 uint8_t v15, uint8_t v16)
327{
328#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
329 using u64 = uint64_t;
330 const uint8x16_t vmask = {
331 v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
332 (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56),
333 v9 | (v10 << 8) | (v11 << 16) | (v12 << 24) |
334 (u64(v13) << 32) | (u64(v14) << 40) | (u64(v15) << 48) | (u64(v16) << 56)
335 };
336#else
337 const uint8x16_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8,
338 v9, v10, v11, v12, v13, v14, v15, v16};
339#endif
340 return vmask;
341}
342static inline uint32x4_t qvsetq_n_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
343{
344#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
345 return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
346#else
347 return uint32x4_t{ a, b, c, d };
348#endif
349}
350#endif
351
352#if defined(_M_ARM64) && __ARM_ARCH >= 800
353#define __ARM_FEATURE_CRYPTO 1
354#define __ARM_FEATURE_CRC32 1
355#endif
356
357#if defined(Q_PROCESSOR_ARM_64)
358#if defined(Q_CC_CLANG)
359#define QT_FUNCTION_TARGET_STRING_AES "aes"
360#define QT_FUNCTION_TARGET_STRING_CRC32 "crc"
361#define QT_FUNCTION_TARGET_STRING_SVE "sve"
362#elif defined(Q_CC_GNU)
363#define QT_FUNCTION_TARGET_STRING_AES "+crypto"
364#define QT_FUNCTION_TARGET_STRING_CRC32 "+crc"
365#define QT_FUNCTION_TARGET_STRING_SVE "+sve"
366#elif defined(Q_CC_MSVC)
367#define QT_FUNCTION_TARGET_STRING_AES
368#define QT_FUNCTION_TARGET_STRING_CRC32
369#define QT_FUNCTION_TARGET_STRING_SVE
370#endif
371#elif defined(Q_PROCESSOR_ARM_32)
372#if defined(Q_CC_CLANG)
373#define QT_FUNCTION_TARGET_STRING_AES "armv8-a,crypto"
374#define QT_FUNCTION_TARGET_STRING_CRC32 "armv8-a,crc"
375#elif defined(Q_CC_GNU)
376#define QT_FUNCTION_TARGET_STRING_AES "arch=armv8-a+crypto"
377#define QT_FUNCTION_TARGET_STRING_CRC32 "arch=armv8-a+crc"
378#endif
379#endif
380
381#ifndef Q_PROCESSOR_X86
// Bit flags naming the optional CPU features of the non-x86 architectures.
// Each architecture gets its own set; the enumerators are single bits so they
// can be OR-ed together and tested with a mask. Bit 0 (value 1) is not
// assigned to any feature here.
// NOTE(review): bit 0 is presumably reserved for the "initialized" marker used
// by qsimd.cpp -- confirm there.
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
    CpuFeatureNEON          = 1 << 1,
    CpuFeatureARM_NEON      = CpuFeatureNEON,   // alias
    CpuFeatureCRC32         = 1 << 2,
    CpuFeatureAES           = 1 << 3,
    CpuFeatureARM_CRYPTO    = CpuFeatureAES,    // alias
    CpuFeatureSVE           = 1 << 4,
#elif defined(Q_PROCESSOR_MIPS)
    CpuFeatureDSP           = 1 << 1,
    CpuFeatureDSPR2         = 1 << 2,
#elif defined(Q_PROCESSOR_LOONGARCH)
    CpuFeatureLSX           = 1 << 1,
    CpuFeatureLASX          = 1 << 2,
#endif
};
398
// Mask of the CpuFeature* bits that the compiler's predefined macros report as
// enabled for this translation unit. Starts at 0 and ORs in one bit per
// feature macro that is defined.
static const uint64_t qCompilerCpuFeatures = 0
#if defined __ARM_NEON__
        | CpuFeatureNEON
#endif
#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64))
        // Yocto Project recipes enable Crypto extension for all ARMv8 configs,
        // even for targets without the Crypto extension. That's wrong, but as
        // the compiler never generates the code for them on their own, most
        // code never notices the problem. But we would. By not setting the
        // bits here, we force a runtime detection.
#if defined __ARM_FEATURE_CRC32
        | CpuFeatureCRC32
#endif
#if defined (__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)
        | CpuFeatureAES
#endif
#endif // Q_OS_LINUX && Q_PROCESSOR_ARM64
#if defined(__ARM_FEATURE_SVE) && defined(Q_PROCESSOR_ARM_64)
        | CpuFeatureSVE
#endif
#if defined __mips_dsp
        | CpuFeatureDSP
#endif
#if defined __mips_dspr2
        | CpuFeatureDSPR2
#endif
#if defined __loongarch_sx
        | CpuFeatureLSX
#endif
#if defined __loongarch_asx
        | CpuFeatureLASX
#endif
        ;
432#endif
433
434#ifdef __cplusplus
435# include <atomic>
436# define Q_ATOMIC(T) std::atomic<T>
437QT_BEGIN_NAMESPACE
438using std::atomic_load_explicit;
439static constexpr auto memory_order_relaxed = std::memory_order_relaxed;
440extern "C" {
441#else
442# include <stdatomic.h>
443# define Q_ATOMIC(T) _Atomic(T)
444#endif
445
446#ifdef Q_PROCESSOR_X86
447typedef uint64_t QCpuFeatureType;
448static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures;
449static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell;
450static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512;
451#else
452typedef unsigned QCpuFeatureType;
453#endif
454extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1];
455Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
456
457static inline uint64_t qCpuFeatures()
458{
459#ifdef QT_BOOTSTRAPPED
460 return qCompilerCpuFeatures; // no detection
461#else
462 quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed);
463 if (!QT_SUPPORTS_INIT_PRIORITY) {
464 if (Q_UNLIKELY(features == 0))
465 features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
466 }
467 return features;
468#endif
469}
470
// True if every bit of CpuFeature<feature> is set either in the compile-time
// mask qCompilerCpuFeatures or in the mask returned by qCpuFeatures(). The
// compile-time test comes first, so qCpuFeatures() is not evaluated when the
// feature is already known at build time.
#define qCpuHasFeature(feature)     (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
                                     || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
473
474#ifdef __cplusplus
475} // extern "C"
476
477QT_END_NAMESPACE
478
479#endif // __cplusplus
480
481QT_WARNING_POP
482
483#endif // QSIMD_P_H
#define assert
void qDumpCPUFeatures()
Definition qsimd.cpp:682
#define QT_FUNCTION_TARGET_BASELINE
Definition qsimd.cpp:25
QT_FUNCTION_TARGET_BASELINE uint64_t QT_MANGLE_NAMESPACE qDetectCpuFeatures()
Definition qsimd.cpp:630
static constexpr auto SimdInitialized
Definition qsimd.cpp:626
static const int features_indices[]
Definition qsimd.cpp:119
static uint detectProcessorFeatures()
Definition qsimd.cpp:617
static const char features_string[]
Definition qsimd.cpp:118
static const quint64 minFeature
Definition qsimd.cpp:624
static const uint64_t qCompilerCpuFeatures
Definition qsimd_p.h:399
static uint64_t qCpuFeatures()
Definition qsimd_p.h:457
unsigned QCpuFeatureType
Definition qsimd_p.h:452
#define Q_ATOMIC(T)
Definition qsimd_p.h:443