Qt
Internal/Contributor docs for the Qt SDK. <b>Note:</b> These are NOT official API docs; those are found <a href='https://doc.qt.io/'>here</a>.
Loading...
Searching...
No Matches
qsimd_p.h
Go to the documentation of this file.
1// Copyright (C) 2021 The Qt Company Ltd.
2// Copyright (C) 2022 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#ifndef QSIMD_P_H
6#define QSIMD_P_H
7
8//
9// W A R N I N G
10// -------------
11//
12// This file is not part of the Qt API. It exists purely as an
13// implementation detail. This header file may change from version to
14// version without notice, or even be removed.
15//
16// We mean it.
17//
18
19#include <QtCore/private/qglobal_p.h>
20#include <QtCore/qsimd.h>
21
26
// Loop header for the scalar prologue that runs before a 16-byte-aligned SIMD
// loop; the caller supplies the loop body. It advances `i` while `i` is below
// min(length, N), where N is the number of 4-byte elements between `ptr` and
// the next 16-byte boundary: (ptr >> 2) & 0x3 is ptr's index, in 4-byte units,
// within its 16-byte group, and (4 - index) & 0x3 is the distance to the next
// boundary (0 when ptr is already aligned).
// NOTE(review): presumably `i` starts at 0 and `ptr` addresses 4-byte
// elements -- confirm against callers.
27#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
28 for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
29
// Loop header for the scalar prologue that runs before a 32-byte-aligned SIMD
// loop; the caller supplies the loop body. It advances `i` while `i` is below
// min(length, N), where N is the number of 4-byte elements between `ptr` and
// the next 32-byte boundary: (ptr >> 2) & 0x7 is ptr's index, in 4-byte units,
// within its 32-byte group, and (8 - index) & 0x7 is the distance to the next
// boundary (0 when ptr is already aligned).
// NOTE(review): presumably `i` starts at 0 and `ptr` addresses 4-byte
// elements -- confirm against callers.
30#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
31 for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
32
// Loop header for the scalar tail that follows a SIMD loop; the caller
// supplies the loop body. It keeps incrementing `i` while `i < length`, but
// runs at most `max` iterations, counted by the loop-local variable `_i`.
// `max` and `length` are expanded without parentheses, so pass simple
// expressions (an argument containing a lower-precedence operator would
// change the condition).
33#define SIMD_EPILOGUE(i, length, max) \
34 for (int _i = 0; _i < max && i < length; ++i, ++_i)
35
36/*
37 * Code can use the following constructs to determine compiler support & status:
38 * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
39 * If this test passes, then the compiler is already generating code for that
40 * given sub-architecture. The intrinsics for that sub-architecture are
41 * #included and can be used without restriction or runtime check.
42 *
43 * - #if QT_COMPILER_SUPPORTS(XXX)
44 * If this test passes, then the compiler is able to generate code for that
45 * given sub-architecture in another translation unit, given the right set of
46 * flags. Use of the intrinsics is not guaranteed. This is useful with
47 * runtime detection (see below).
48 *
49 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
50 * If this test passes, then the compiler is able to generate code for that
51 * given sub-architecture in this translation unit, even if it is not doing
52 * that now (it might be). Individual functions may be tagged with
53 * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
54 * sub-arch. Only inside such functions is the use of the intrinsics
55 * guaranteed to work. This is useful with runtime detection (see below).
56 *
57 * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is
58 * historical: GCC 4.8 needed the distinction.
59 *
60 * Runtime detection of a CPU sub-architecture can be done with the
61 * qCpuHasFeature(XXX) function. There are two strategies for generating
62 * optimized code like that:
63 *
64 * 1) place the optimized code in a different translation unit (C or assembly
65 * sources) and pass the correct flags to the compiler to enable support. Those
66 * sources must not include qglobal.h, which means they cannot include this
67 * file either. The dispatcher function would look like this:
68 *
69 * void foo()
70 * {
71 * #if QT_COMPILER_SUPPORTS(XXX)
72 * if (qCpuHasFeature(XXX)) {
73 * foo_optimized_xxx();
74 * return;
75 * }
76 * #endif
77 * foo_plain();
78 * }
79 *
80 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
81 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
82 * other Qt code. The dispatcher function would look like this:
83 *
84 * void foo()
85 * {
86 * #if QT_COMPILER_SUPPORTS_HERE(XXX)
87 * if (qCpuHasFeature(XXX)) {
88 * foo_optimized_xxx();
89 * return;
90 * }
91 * #endif
92 * foo_plain();
93 * }
94 */
95
96#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
97#include <intrin.h>
98#endif
99
100#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0)
101
102#if defined(Q_PROCESSOR_ARM)
103# define QT_COMPILER_SUPPORTS_HERE(x) ((__ARM_FEATURE_ ## x) || (__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
104# if defined(Q_CC_GNU)
105 /* GCC requires attributes for a function */
106# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
107# else
108# define QT_FUNCTION_TARGET(x)
109# endif
110#elif defined(Q_PROCESSOR_MIPS)
111# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
112# define QT_FUNCTION_TARGET(x)
113# if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
114# define __MIPS_DSP__
115# endif
116# if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
117# define __MIPS_DSPR2__
118# endif
119#elif defined(Q_PROCESSOR_X86)
120# if defined(Q_CC_CLANG) && defined(Q_CC_MSVC)
121# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
122# else
123# define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
124# endif
125# if defined(Q_CC_GNU)
126 /* GCC requires attributes for a function */
127# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
128# else
129# define QT_FUNCTION_TARGET(x)
130# endif
131#else
132# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
133# define QT_FUNCTION_TARGET(x)
134#endif
135
136#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
137// Intrinsic support appears to be missing, so pretend these features don't exist
138# undef __SSE__
139# undef __SSE2__
140# undef __SSE3__
141# undef __SSSE3__
142# undef __SSE4_1__
143# undef __SSE4_2__
144# undef __AES__
145# undef __POPCNT__
146# undef __AVX__
147# undef __F16C__
148# undef __RDRND__
149# undef __AVX2__
150# undef __BMI__
151# undef __BMI2__
152# undef __FMA__
153# undef __MOVBE__
154# undef __RDSEED__
155# undef __AVX512F__
156# undef __AVX512ER__
157# undef __AVX512CD__
158# undef __AVX512PF__
159# undef __AVX512DQ__
160# undef __AVX512BW__
161# undef __AVX512VL__
162# undef __AVX512IFMA__
163# undef __AVX512VBMI__
164# undef __SHA__
165# undef __AVX512VBMI2__
166# undef __AVX512BITALG__
167# undef __AVX512VNNI__
168# undef __AVX512VPOPCNTDQ__
169# undef __GFNI__
170# undef __VAES__
171#endif
172
173#ifdef Q_PROCESSOR_X86
174/* -- x86 intrinsic support -- */
175
176# if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX)
177// The compiler for QNX is missing the intrinsic
178# undef QT_COMPILER_SUPPORTS_RDSEED
179# endif
180# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
181// MSVC doesn't define __SSE__ (nor __SSE2__), so define __SSE__ ourselves
182# define __SSE__ 1
183# endif
184
185# if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
186// 64-bit GCC on Windows does not support AVX, so we hack around it by forcing
187// it to emit unaligned loads & stores
188// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
189asm(
190 ".macro vmovapd args:vararg\n"
191 " vmovupd \\args\n"
192 ".endm\n"
193 ".macro vmovaps args:vararg\n"
194 " vmovups \\args\n"
195 ".endm\n"
196 ".macro vmovdqa args:vararg\n"
197 " vmovdqu \\args\n"
198 ".endm\n"
199 ".macro vmovdqa32 args:vararg\n"
200 " vmovdqu32 \\args\n"
201 ".endm\n"
202 ".macro vmovdqa64 args:vararg\n"
203 " vmovdqu64 \\args\n"
204 ".endm\n"
205);
206# endif
207
208# if defined(Q_CC_GNU) && !defined(Q_OS_WASM)
209// GCC 4.4 and Clang 2.8 added a few more intrinsics there
210# include <x86intrin.h>
211# endif
212#ifdef Q_OS_WASM
213# include <immintrin.h>
214# endif
215
216# include <QtCore/private/qsimd_x86_p.h>
217
218// x86-64 sub-architecture version 3
219//
220// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
221// BMI1, BMI2, FMA, LZCNT, MOVBE. This feature set was chosen as the version 3
222// of the x86-64 ISA (x86-64-v3) and is supported by GCC and Clang. On systems
223// with the GNU libc, libraries with this feature can be installed on a
224// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the "x86_64h"
225// sub-architecture too.
226
227# if defined(__AVX2__)
228// List of features present with -march=x86-64-v3 and not architecturally
229// implied by __AVX2__
230# define ARCH_HASWELL_MACROS \
231 (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__)
232# if ARCH_HASWELL_MACROS != 7
233# error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
234# endif
235static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
236# define __haswell__ 1
237# undef ARCH_HASWELL_MACROS
238# endif
239
240// x86-64 sub-architecture version 4
241//
242// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core
243// 6th generation (codename "Skylake"). AMD Zen4 is their first processor
244// with AVX512 support and it includes all of these too. The GNU libc subdir for
245// this is "glibc-hwcaps/x86-64-v4".
246//
247# define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
248# if ARCH_SKX_MACROS != 0
249# if ARCH_SKX_MACROS != 5
250# error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f"
251# endif
252static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
253# define __skylake_avx512__ 1
254# endif
255# undef ARCH_SKX_MACROS
256#endif /* Q_PROCESSOR_X86 */
257
258// NEON intrinsics
259// note: GCC (as of 4.9) does not support function targets for ARM
260#if defined(__ARM_NEON) || defined(__ARM_NEON__)
261#if defined(Q_CC_CLANG)
262#define QT_FUNCTION_TARGET_STRING_NEON "neon"
263#else
264#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
265#endif
266#ifndef __ARM_NEON__
267// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
268#define __ARM_NEON__
269#endif
270
271#ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on Aarch64
// Horizontal add across the eight 16-bit lanes of v8. Compiled only when
// Q_PROCESSOR_ARM_64 is not defined (see the enclosing #ifndef), where the
// vaddv intrinsics are not available.
272inline uint16_t vaddvq_u16(uint16x8_t v8)
273{
// Widening pairwise adds: 8 x u16 -> 4 x u32 -> 2 x u64.
274 const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8));
// Fold the low and high 64-bit halves into a single 64-bit total.
275 const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2));
// The result is 16-bit lane 0 of that total, i.e. the sum truncated to uint16_t.
276 return vget_lane_u16(vreinterpret_u16_u64(v1), 0);
277}
278
// Horizontal add across the eight 8-bit lanes of v8. Compiled only when
// Q_PROCESSOR_ARM_64 is not defined (see the enclosing #ifndef), where the
// vaddv intrinsics are not available.
279inline uint8_t vaddv_u8(uint8x8_t v8)
280{
// Widening pairwise adds: 8 x u8 -> 4 x u16 -> 2 x u32 -> 1 x u64.
281 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
// The result is 8-bit lane 0 of that total, i.e. the sum truncated to uint8_t.
282 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
283}
284#endif
285
286#endif
287
288#if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32)
289# include <arm_acle.h>
290#endif
291
292#if defined(Q_PROCESSOR_ARM_64)
293#if defined(Q_CC_CLANG)
294#define QT_FUNCTION_TARGET_STRING_AES "crypto"
295#define QT_FUNCTION_TARGET_STRING_CRC32 "crc"
296#elif defined(Q_CC_GNU)
297#define QT_FUNCTION_TARGET_STRING_AES "+crypto"
298#define QT_FUNCTION_TARGET_STRING_CRC32 "+crc"
299#endif
300#elif defined(Q_PROCESSOR_ARM_32)
301#if defined(Q_CC_CLANG)
302#define QT_FUNCTION_TARGET_STRING_AES "armv8-a,crypto"
303#define QT_FUNCTION_TARGET_STRING_CRC32 "armv8-a,crc"
304#elif defined(Q_CC_GNU)
305#define QT_FUNCTION_TARGET_STRING_AES "arch=armv8-a+crypto"
306#define QT_FUNCTION_TARGET_STRING_CRC32 "arch=armv8-a+crc"
307#endif
308#endif
309
310#ifndef Q_PROCESSOR_X86
312#if defined(Q_PROCESSOR_ARM)
313 CpuFeatureNEON = 2,
314 CpuFeatureARM_NEON = CpuFeatureNEON,
315 CpuFeatureCRC32 = 4,
316 CpuFeatureAES = 8,
317 CpuFeatureARM_CRYPTO = CpuFeatureAES,
318#elif defined(Q_PROCESSOR_MIPS)
319 CpuFeatureDSP = 2,
320 CpuFeatureDSPR2 = 4,
321#endif
322};
323
324static const uint64_t qCompilerCpuFeatures = 0
325#if defined __ARM_NEON__
326 | CpuFeatureNEON
327#endif
328#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64))
329 // Yocto Project recipes enable Crypto extension for all ARMv8 configs,
330 // even for targets without the Crypto extension. That's wrong, but as
331 // the compiler never generates the code for them on their own, most
332 // code never notices the problem. But we would. By not setting the
333 // bits here, we force a runtime detection.
334#if defined __ARM_FEATURE_CRC32
335 | CpuFeatureCRC32
336#endif
337#if defined __ARM_FEATURE_CRYPTO
338 | CpuFeatureAES
339#endif
340#endif // Q_OS_LINUX && Q_PROCESSOR_ARM64
341#if defined __mips_dsp
342 | CpuFeatureDSP
343#endif
344#if defined __mips_dspr2
345 | CpuFeatureDSPR2
346#endif
347 ;
348#endif
349
350#ifdef __cplusplus
351# include <atomic>
352# define Q_ATOMIC(T) std::atomic<T>
354using std::atomic_load_explicit;
355static constexpr auto memory_order_relaxed = std::memory_order_relaxed;
356extern "C" {
357#else
358# include <stdatomic.h>
359# define Q_ATOMIC(T) _Atomic(T)
360#endif
361
362#ifdef Q_PROCESSOR_X86
363typedef uint64_t QCpuFeatureType;
365static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell;
366static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512;
367#else
368typedef unsigned QCpuFeatureType;
369#endif
370extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1];
371Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
372
373static inline uint64_t qCpuFeatures()
374{
375#ifdef QT_BOOTSTRAPPED
376 return qCompilerCpuFeatures; // no detection
377#else
378 quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed);
380 if (Q_UNLIKELY(features == 0))
382 }
383 return features;
384#endif
385}
386
// Evaluates to true when every bit of CpuFeature<feature> is set in
// qCompilerCpuFeatures or, failing that, in the value returned by
// qCpuFeatures(). The compile-time mask is tested first, so the ||
// short-circuits and qCpuFeatures() is not called when that test succeeds.
// `feature` is token-pasted onto "CpuFeature", so pass the bare suffix
// (e.g. RDRND), not the full constant name.
387#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
388 || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
389
390#ifdef __cplusplus
391} // extern "C"
392
// Hardware random number generator hooks. The real versions are used only on
// x86 builds where RDRND code can be generated in this translation unit and
// Qt is not being bootstrapped; every other configuration gets inert stubs.
393# if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
// Declared here, defined elsewhere in QtCore.
// NOTE(review): presumably takes a buffer and its size and returns how much
// of it was filled -- confirm against the definition.
394Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
395
// Reports whether the RDRND feature is available, per qCpuHasFeature().
396static inline bool qHasHwrng()
397{
398 return qCpuHasFeature(RDRND);
399}
400# else
// Stub: ignores its arguments and always returns 0.
401static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
402{
403 return 0;
404}
// Stub: no hardware generator is ever reported in this configuration.
405static inline bool qHasHwrng()
406{
407 return false;
408}
409# endif
410
412
413#endif // __cplusplus
414
416
417#endif // QSIMD_P_H
Combined button and popup list for selecting options.
#define Q_UNLIKELY(x)
#define QT_WARNING_DISABLE_INTEL(number)
#define QT_WARNING_POP
#define QT_WARNING_DISABLE_GCC(text)
#define QT_WARNING_PUSH
#define QT_WARNING_DISABLE_CLANG(text)
#define QT_SUPPORTS_INIT_PRIORITY
Definition qglobal_p.h:51
GLint GLfloat GLfloat GLfloat v2
GLint GLfloat GLfloat v1
#define qCpuHasFeature(feature)
Definition qsimd_p.h:387
CPUFeatures
Definition qsimd_p.h:311
Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE qDetectCpuFeatures()
Definition qsimd.cpp:561
static const uint64_t qCompilerCpuFeatures
Definition qsimd_p.h:324
static uint64_t qCpuFeatures()
Definition qsimd_p.h:373
unsigned QCpuFeatureType
Definition qsimd_p.h:368
#define Q_ATOMIC(T)
Definition qsimd_p.h:359
#define cpu_haswell
static const uint64_t _compilerCpuFeatures
#define cpu_skylake_avx512
#define QT_MANGLE_NAMESPACE(name)
unsigned long long quint64
Definition qtypes.h:61
ptrdiff_t qsizetype
Definition qtypes.h:165