Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qstringconverter.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4// Qt-Security score:critical reason:data-parser
5
6#include <qstringconverter.h>
7#include <private/qstringconverter_p.h>
8#include "qendian.h"
9
10#include "private/qsimd_p.h"
11#include "private/qstringiterator_p.h"
12#include "private/qtools_p.h"
15#include <QtCore/qbytearraylist.h>
16
17#if QT_CONFIG(icu)
18
19#include <unicode/ucnv.h>
20#include <unicode/ucnv_cb.h>
21#include <unicode/ucnv_err.h>
22#include <unicode/ustring.h>
23#define QT_USE_ICU_CODECS
24#define QT_COM_THREAD_INIT
25
26#elif QT_CONFIG(winsdkicu)
27
28#include <icu.h>
29#include <private/qfunctions_win_p.h>
30#define QT_USE_ICU_CODECS
31#define QT_COM_THREAD_INIT qt_win_ensureComInitializedOnThisThread();
32
33#endif // QT_CONFIG(icu) || QT_CONFIG(winsdkicu)
34
35#ifdef Q_OS_WIN
36#include <qt_windows.h>
37#ifndef QT_BOOTSTRAPPED
38#include <QtCore/qvarlengtharray.h>
39#include <QtCore/private/wcharhelpers_win_p.h>
40
41#include <QtCore/q20iterator.h>
42#endif // !QT_BOOTSTRAPPED
43#endif // Q_OS_WIN
44
45#include <array>
46#if __has_include(<bit>) && __cplusplus > 201703L
47#include <bit>
48#endif
49#include <string>
50#include <QtCore/q20utility.h>
51#ifndef QT_BOOTSTRAPPED
52#include <QtCore/q26numeric.h>
53#endif // !QT_BOOTSTRAPPED
54
55QT_BEGIN_NAMESPACE
56
57using namespace QtMiscUtils;
58
// Compile-time guarantees that both converter classes can be moved (constructed
// and assigned) without throwing.
59static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
60static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
61static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
62static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
63
// NOTE(review): these two constants are not used in the part of the file shown
// here; presumably they are indices into QStringConverter::State::state_data
// for converters defined further down - confirm.
64enum { Endian = 0, Data = 1 };
65
// The three bytes of the UTF-8 encoded byte order mark (U+FEFF).
66static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
67
68#if defined(__SSE2__) || defined(__ARM_NEON__)
69Q_ALWAYS_INLINE static uint qBitScanReverse(unsigned v) noexcept
70{
71#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
72 return std::bit_width(v) - 1;
73#else
74 uint result = qCountLeadingZeroBits(v);
75 // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
76 // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
77 // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
78 result ^= sizeof(unsigned) * 8 - 1;
79 return result;
80#endif
81}
82#endif
83
84#if defined(__SSE2__)
// Narrows the leading US-ASCII part of the UTF-16 range [src, end) to 8-bit
// characters stored at dst, using SSE2 and, when the Cpu template argument
// advertises them, AVX2 and AVX512VL.
//
// Protocol for the reference arguments:
//  - If every code unit up to end was converted, src == end afterwards, dst
//    has been advanced by the same amount and the function returns true.
//  - If a chunk contains a code unit outside 0x01..0x7F (NUL is reported as
//    "non-ASCII" too, see the PACKUSWB comment below), src and dst are left
//    pointing at the first such unit, nextAscii is set to one past the last
//    such unit of that chunk, and the function returns false.
//  - If the input is shorter than the smallest chunk handled here, nothing is
//    converted and the result is simply (src == end).
//
// Chunks are always stored in full, so bytes past the returned dst may have
// been written with garbage; the caller is expected to overwrite them.
85template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE static bool
86simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
87{
88 size_t sizeBytes = reinterpret_cast<const char *>(end) - reinterpret_cast<const char *>(src);
89
90 // do sixteen characters at a time
91 auto process16Chars = [](uchar *dst, const char16_t *src) {
92 __m128i data1 = _mm_loadu_si128((const __m128i*)src);
93 __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
94
95 // check if everything is ASCII
96 // the highest ASCII value is U+007F
97 // Do the packing directly:
98 // The PACKUSWB instruction packs a signed 16-bit integer to an unsigned 8-bit
99 // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
100 // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
101 // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
102 // "non-ASCII", but it's an acceptable compromise.
103 __m128i packed = _mm_packus_epi16(data1, data2);
104 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
105
106 // store, even if there are non-ASCII characters here
107 _mm_storeu_si128((__m128i*)dst, packed);
108
109 // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
110 ushort n = ~_mm_movemask_epi8(nonAscii);
111 return n;
112 };
 // Common epilogue: n has one bit per character of the chunk located at
 // src + offset. A non-zero n means the chunk had a character we cannot handle.
113 auto maybeFoundNonAscii = [&](auto n, qptrdiff offset = 0) {
114 if (n) {
115 // find the next probable ASCII character
116 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
117 // characters still coming
118 src += offset;
119 dst += offset;
120 nextAscii = src + qBitScanReverse(n) + 1;
121
122 n = qCountTrailingZeroBits(n);
123 dst += n;
124 src += n;
125 return false;
126 }
127 return src == end;
128 };
 // Moves src and dst to the end of the input. Callers then process the
 // final chunk through negative offsets from these end positions.
129 auto adjustToEnd = [&] {
130 dst += sizeBytes / sizeof(char16_t);
131 src = end;
132 };
133
134 if constexpr (Cpu & CpuFeatureAVX2) {
135 // The 256-bit VPACKUSWB[1] instruction interleaves the two input
136 // operands, so we need an extra permutation to get them back in-order.
137 // VPERMW takes 2 cycles to run while VPERMQ takes only 1.
138 // [1] https://www.felixcloutier.com/x86/PACKUSWB.html
139 constexpr size_t Step = 32;
140 auto process32Chars = [](const char16_t *src, uchar *dst) {
141 __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
142 __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src) + 1);
143 __m256i packed = _mm256_packus_epi16(data1, data2); // will be [A, B, A, B]
144 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
145 __m256i nonAscii = _mm256_cmpgt_epi8(permuted, _mm256_setzero_si256());
146
147 // store, even if there are non-ASCII characters here
148 _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), permuted);
149
150 return ~_mm256_movemask_epi8(nonAscii);
151 };
152
153 if constexpr (Cpu & CpuFeatureAVX512VL) {
154 // with AVX512/AVX10, we always process everything
155 if (sizeBytes <= Step * sizeof(char16_t)) {
 // one mask bit per input character; masked loads and stores
 // keep us from touching memory outside [src, end)
156 uint mask = _bzhi_u32(-1, uint(sizeBytes / 2));
157 __m256i data1 = _mm256_maskz_loadu_epi16(mask, src);
158 __m256i data2 = _mm256_maskz_loadu_epi16(mask >> 16, src + Step / 2);
159 __m256i packed = _mm256_packus_epi16(data1, data2);
160 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
161 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, permuted, _mm256_setzero_si256());
162
163 // store, even if there are non-ASCII characters here
164 _mm256_mask_storeu_epi8(dst, mask, permuted);
165 if (nonAscii)
166 return maybeFoundNonAscii(nonAscii);
167 adjustToEnd();
168 return true;
169 }
170 }
171
172 if (sizeBytes >= Step * sizeof(char16_t)) {
173 // do 32 characters at a time
174 qptrdiff offset = 0;
175 for ( ; (offset + Step) * sizeof(char16_t) < sizeBytes; offset += Step) {
176 if (uint n = process32Chars(src + offset, dst + offset))
177 return maybeFoundNonAscii(n, offset);
178 }
179
180 // do 32 characters again, possibly overlapping with the loop above
181 adjustToEnd();
182 uint n = process32Chars(src - Step, dst - Step);
183 return maybeFoundNonAscii(n, -int(Step));
184 }
185 }
186
187 constexpr size_t Step = 16;
188 if (sizeBytes >= Step * sizeof(char16_t)) {
189
190 qptrdiff offset = 0;
191 for ( ; (offset + Step) * sizeof(char16_t) < sizeBytes; offset += Step) {
192 ushort n = process16Chars(dst + offset, src + offset);
193 if (n)
194 return maybeFoundNonAscii(n, offset);
195 if (Cpu & CpuFeatureAVX2)
196 break; // we can only ever loop once because of the code above
197 }
198
199 // do sixteen characters again, possibly overlapping with the loop above
200 adjustToEnd();
201 ushort n = process16Chars(dst - Step, src - Step);
202 return maybeFoundNonAscii(n, -int(Step));
203 }
204
205# if !defined(__OPTIMIZE_SIZE__)
206 if (sizeBytes >= 8 * sizeof(char16_t)) {
207 // do eight characters at a time
 // data2 (the last eight characters) is loaded up front, before anything
 // is stored to dst
208 __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
209 __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(end - 8));
210 __m128i packed = _mm_packus_epi16(data, data);
211 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
212
213 // store even non-ASCII
214 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
215
216 uchar n = ~_mm_movemask_epi8(nonAscii);
217 if (n)
218 return maybeFoundNonAscii(n);
219
220 adjustToEnd();
221 packed = _mm_packus_epi16(data2, data2);
222 nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
223 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst - 8), packed);
224 n = ~_mm_movemask_epi8(nonAscii);
225 return maybeFoundNonAscii(n, -8);
226 } else if (sizeBytes >= 4 * sizeof(char16_t)) {
227 // do four characters at a time
228 __m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
229 __m128i data2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(end - 4));
230 __m128i packed = _mm_packus_epi16(data1, data1);
231 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
232
233 // store even non-ASCII
234 qToUnaligned(_mm_cvtsi128_si32(packed), dst);
235
 // only the low four mask bits correspond to input characters
236 uchar n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
237 if (n)
238 return maybeFoundNonAscii(n);
239
240 adjustToEnd();
241 packed = _mm_packus_epi16(data2, data2);
242 nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
243 qToUnaligned(_mm_cvtsi128_si32(packed), dst - 4);
244 n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
245 return maybeFoundNonAscii(n, -4);
246 }
247#endif
248
249 return src == end;
250}
251
// Widens the leading US-ASCII part of the byte range [src, end) to UTF-16
// code units stored at dst, using SSE2 and, when the Cpu template argument
// advertises them, AVX2 and AVX512VL.
//
// Protocol for the reference arguments:
//  - If every byte up to end was converted, src == end afterwards, dst has
//    been advanced by the same amount and the function returns true.
//  - If a chunk contains a byte with its high bit set, src and dst are left
//    pointing at the first such byte, nextAscii is set to one past the last
//    such byte of that chunk, and the function returns false. (The AVX512VL
//    path tests with a signed "<= 0" compare, so there a NUL byte stops the
//    conversion as well.)
//  - If the input is shorter than the smallest chunk handled here, nothing is
//    converted and the result is simply (src == end).
//
// Chunks are always stored in full ("even mojibake"), so code units past the
// returned dst may have been written; the caller is expected to overwrite them.
252template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE static bool
253simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
254{
255 // do sixteen characters at a time
256 auto process16Chars = [](char16_t *dst, const uchar *src) {
257 __m128i data = _mm_loadu_si128((const __m128i*)src);
258
259 // check if everything is ASCII
260 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
261 uint n = _mm_movemask_epi8(data);
262
263 // store everything, even mojibake
264 _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
265 _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
266 return ushort(n);
267 };
 // Common epilogue: n has one bit per byte of the chunk located at
 // src + offset. A non-zero n means the chunk had a non-ASCII byte.
268 auto maybeFoundNonAscii = [&](uint n, qptrdiff offset = 0) {
269 // find the next probable ASCII character
270 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
271 // characters still coming
272 if (n) {
273 uint c = qCountTrailingZeroBits(n);
274 src += offset;
275 dst += offset;
276 n = qBitScanReverse(n);
277 nextAscii = src + n + 1;
278 src += c;
279 dst += c;
280 }
281 return src == end;
282 };
 // Moves src and dst to the end of the input. Callers then process the
 // final chunk through negative offsets from these end positions.
283 auto adjustToEnd = [&] {
284 dst += end - src;
285 src = end;
286 };
287
288 if constexpr (Cpu & CpuFeatureAVX2) {
289 constexpr qsizetype Step = 32;
290 auto process32Chars = [](char16_t *dst, const uchar *src) {
291 __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
292 __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 1);
293
294 // the processor can execute this VPOR (dispatches 3/cycle) faster
295 // than waiting for the VPMOVMSKB (1/cycle) of both data to check
296 // their masks
297 __m128i ored = _mm_or_si128(data1, data2);
298 bool any = _mm_movemask_epi8(ored);
299
300 // store everything, even mojibake
301 __m256i extended1 = _mm256_cvtepu8_epi16(data1);
302 __m256i extended2 = _mm256_cvtepu8_epi16(data2);
303 _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), extended1);
304 _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, extended2);
305
306 uint n1 = _mm_movemask_epi8(data1);
307 uint n2 = _mm_movemask_epi8(data2);
 // Result type with two conversions: to bool ("was there any
 // non-ASCII byte?") and to uint (the combined 32-bit mask).
308 struct R {
309 uint n1, n2;
310 bool any;
311 operator bool() const { return any; }
312 operator uint() const { return n1|(n2 << 16); }
313 };
314 return R{ n1, n2, any };
315 };
316
317 if constexpr (Cpu & CpuFeatureAVX512VL) {
318 // with AVX512/AVX10, we always process everything
319 if (end - src <= Step) {
 // one mask bit per input byte; masked loads and stores keep us
 // from touching memory outside the input and output ranges
320 __mmask32 mask = _bzhi_u32(-1, uint(end - src));
321 __m256i data = _mm256_maskz_loadu_epi8(mask, src);
322 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, data, _mm256_setzero_si256());
323
324 // store everything, even mojibake
325 __m256i extended1 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data));
326 __m256i extended2 = _mm256_cvtepu8_epi16(_mm256_extracti64x2_epi64(data, 1));
327 _mm256_mask_storeu_epi16(dst, mask, extended1);
328 _mm256_mask_storeu_epi16(dst + Step/2, mask >> 16, extended2);
329 if (nonAscii)
330 return maybeFoundNonAscii(nonAscii);
331 adjustToEnd();
332 return true;
333 }
334 }
335
336 if (end - src >= Step) {
337 // do 32 characters at a time
338 qptrdiff offset = 0;
339 for ( ; offset + Step < end - src; offset += Step) {
340 auto r = process32Chars(dst + offset, src + offset);
341 if (r)
342 return maybeFoundNonAscii(r, offset);
343 }
344
345 // do 32 characters again, possibly overlapping with the loop above
346 adjustToEnd();
347 auto r = process32Chars(dst - Step, src - Step);
348 return maybeFoundNonAscii(r, -Step);
349 }
350 }
351
352 constexpr qsizetype Step = 16;
353 if (end - src >= Step) {
354 qptrdiff offset = 0;
355 for ( ; offset + Step < end - src; offset += Step) {
356 ushort n = process16Chars(dst + offset, src + offset);
357 if (n)
358 return maybeFoundNonAscii(n, offset);
359 if (Cpu & CpuFeatureAVX2)
360 break; // we can only ever loop once because of the code above
361 }
362
363 // do one chunk again, possibly overlapping with the loop above
364 adjustToEnd();
365 return maybeFoundNonAscii(process16Chars(dst - Step, src - Step), -Step);
366 }
367
368# if !defined(__OPTIMIZE_SIZE__)
369 if (end - src >= 8) {
370 __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
371 __m128i data2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(end - 8));
 // only the low eight mask bits correspond to input bytes
372 uint n = _mm_movemask_epi8(data) & 0xff;
373 // store everything, even mojibake
374 _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
375 if (n)
376 return maybeFoundNonAscii(n);
377
378 // do one chunk again, possibly overlapping the above
379 adjustToEnd();
380 n = _mm_movemask_epi8(data2) & 0xff;
381 data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
382 _mm_storeu_si128(reinterpret_cast<__m128i *>(dst - 8), data2);
383 return maybeFoundNonAscii(n, -8);
384 }
385 if (end - src >= 4) {
386 __m128i data = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src));
387 __m128i data2 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(end - 4));
 // only the low four mask bits correspond to input bytes
388 uchar n = uchar(_mm_movemask_epi8(data) & 0xf);
389 // store everything, even mojibake
390 data = _mm_unpacklo_epi8(data, _mm_setzero_si128());
391 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), data);
392 if (n)
393 return maybeFoundNonAscii(n);
394
395 // do one chunk again, possibly overlapping the above
396 adjustToEnd();
397 n = uchar(_mm_movemask_epi8(data2) & 0xf);
398 data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
399 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst - 4), data2);
400 return maybeFoundNonAscii(n, -4);
401 }
402#endif
403
404 return src == end;
405}
406
407static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
408{
409#ifdef __AVX2__
410 // do 32 characters at a time
411 // (this is similar to simdTestMask in qstring.cpp)
412 const __m256i mask = _mm256_set1_epi8(char(0x80));
413 for ( ; end - src >= 32; src += 32) {
414 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
415 if (_mm256_testz_si256(mask, data))
416 continue;
417
418 uint n = _mm256_movemask_epi8(data);
419 Q_ASSERT(n);
420
421 // find the next probable ASCII character
422 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
423 // characters still coming
424 nextAscii = src + qBitScanReverse(n) + 1;
425
426 // return the non-ASCII character
427 return src + qCountTrailingZeroBits(n);
428 }
429#endif
430
431 // do sixteen characters at a time
432 for ( ; end - src >= 16; src += 16) {
433 __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
434
435 // check if everything is ASCII
436 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
437 uint n = _mm_movemask_epi8(data);
438 if (!n)
439 continue;
440
441 // find the next probable ASCII character
442 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
443 // characters still coming
444 nextAscii = src + qBitScanReverse(n) + 1;
445
446 // return the non-ASCII character
447 return src + qCountTrailingZeroBits(n);
448 }
449
450 // do four characters at a time
451 for ( ; end - src >= 4; src += 4) {
452 quint32 data = qFromUnaligned<quint32>(src);
453 data &= 0x80808080U;
454 if (!data)
455 continue;
456
457 // We don't try to guess which of the three bytes is ASCII and which
458 // one isn't. The chance that at least two of them are non-ASCII is
459 // better than 75%.
460 nextAscii = src;
461 return src;
462 }
463 nextAscii = end;
464 return src;
465}
466
467// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
468// and advance src8 and src16 to the first character that could not be compared
//
// "Could not be compared" covers three cases: the characters differ, the 8-bit
// character is not US-ASCII, or too few characters remain for another chunk
// (all loop and branch conditions use a strict "<", so at least one character
// is always left for the caller).
//
// Throughout the function, a non-zero mask means "stop"; its lowest set bit
// identifies the offending character within the current chunk. Most masks are
// taken with a byte-wise movemask over 16-bit lanes and therefore carry two
// bits per character (bitSpacing == 1, i.e. divide the bit index by two). The
// one exception is the non-AVX2 US-ASCII check, which is taken directly from
// the 8-bit data and has one bit per character (bitSpacing == 0).
469static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
470{
471 int bitSpacing = 1;
472 qptrdiff len = qMin(end8 - src8, end16 - src16);
473 qptrdiff offset = 0;
474 uint mask = 0;
475
476 // do sixteen characters at a time
477 for ( ; offset + 16 < len; offset += 16) {
478 __m128i data8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src8 + offset));
479#ifdef __AVX2__
480 // AVX2 version, use 256-bit registers and VPMOVXZBW
481 __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
482
483 // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
484 __m256i datax8 = _mm256_cvtepu8_epi16(data8);
485 mask = _mm256_movemask_epi8(datax8);
486 if (mask)
487 break;
488
489 // compare Latin1 to UTF-16
490 __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
491 mask = ~_mm256_movemask_epi8(latin1cmp);
492 if (mask)
493 break;
494#else
495 // non-AVX2 code
496 __m128i datalo16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
497 __m128i datahi16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset) + 1);
498
499 // expand US-ASCII as if it were Latin1, we'll confirm later
500 __m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
501 __m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());
502
503 // compare Latin1 to UTF-16
504 __m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
505 __m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
506 mask = _mm_movemask_epi8(latin1cmphi) << 16;
507 mask |= ushort(_mm_movemask_epi8(latin1cmplo));
508 mask = ~mask;
509 if (mask)
510 break;
511
512 // confirm it was US-ASCII
513 mask = _mm_movemask_epi8(data8);
514 if (mask) {
 // this mask has one bit per character instead of two
515 bitSpacing = 0;
516 break;
517 }
518#endif
519 }
520
521 // helper for comparing 4 or 8 characters
 // (leaves the result in mask and advances offset by n if all n matched)
522 auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
523 // n = 4 -> sizemask = 0xff
524 // n = 8 -> sizemask = 0xffff
525 unsigned sizemask = (1U << (2 * n)) - 1;
526
527 // expand as if Latin1
528 data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
529
530 // compare and confirm it's US-ASCII
531 __m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
532 mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
533 mask |= _mm_movemask_epi8(data8);
534 if (mask == 0)
535 offset += n;
536 };
537
538 // do eight characters at a time
539 if (mask == 0 && offset + 8 < len) {
540 __m128i data8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src8 + offset));
541 __m128i data16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
542 cmp_lt_16(8, data8, data16);
543 }
544
545 // do four characters
546 if (mask == 0 && offset + 4 < len) {
547 __m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
548 __m128i data16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src16 + offset));
549 cmp_lt_16(4, data8, data16);
550 }
551
552 // correct the source pointers to point to the first character we couldn't deal with
553 if (mask)
554 offset += qCountTrailingZeroBits(mask) >> bitSpacing;
555 src8 += offset;
556 src16 += offset;
557}
558#elif defined(__ARM_NEON__)
559static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
560{
561 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
562 uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 );
563 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
564
565 // do sixteen characters at a time
566 for ( ; end - src >= 16; src += 16, dst += 16) {
567 // load 2 lanes (or: "load interleaved")
568 uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
569
570 // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
571 // add those together into a scalar, and merge the scalars.
572 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
573 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
574
575 // merge the two lanes by shifting the values of the second by 8 and inserting them
576 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
577
578 // store, even if there are non-ASCII characters here
579 vst1q_u8(dst, vreinterpretq_u8_u16(out));
580
581 if (nonAscii) {
582 // find the next probable ASCII character
583 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
584 // characters still coming
585 nextAscii = src + qBitScanReverse(nonAscii) + 1;
586
587 nonAscii = qCountTrailingZeroBits(nonAscii);
588 dst += nonAscii;
589 src += nonAscii;
590 return false;
591 }
592 }
593 return src == end;
594}
595
596static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
597{
598 // do eight characters at a time
599 uint8x8_t msb_mask = vdup_n_u8(0x80);
600 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
601 for ( ; end - src >= 8; src += 8, dst += 8) {
602 uint8x8_t c = vld1_u8(src);
603 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
604 if (!n) {
605 // store
606 vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
607 continue;
608 }
609
610 // copy the front part that is still ASCII
611 while (!(n & 1)) {
612 *dst++ = *src++;
613 n >>= 1;
614 }
615
616 // find the next probable ASCII character
617 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
618 // characters still coming
619 n = qBitScanReverse(n);
620 nextAscii = src + n + 1;
621 return false;
622
623 }
624 return src == end;
625}
626
627static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
628{
629 // The SIMD code below is untested, so just force an early return until
630 // we've had the time to verify it works.
631 nextAscii = end;
632 return src;
633
634 // do eight characters at a time
635 uint8x8_t msb_mask = vdup_n_u8(0x80);
636 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
637 for ( ; end - src >= 8; src += 8) {
638 uint8x8_t c = vld1_u8(src);
639 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
640 if (!n)
641 continue;
642
643 // find the next probable ASCII character
644 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
645 // characters still coming
646 nextAscii = src + qBitScanReverse(n) + 1;
647
648 // return the non-ASCII character
649 return src + qCountTrailingZeroBits(n);
650 }
651 nextAscii = end;
652 return src;
653}
654
655static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
656{
657}
658#else
659static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
660{
661 return false;
662}
663
664static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
665{
666 return false;
667}
668
669static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
670{
671 nextAscii = end;
672 return src;
673}
674
675static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
676{
677}
678#endif
679
// Bit stored in QStringConverter::State::internalState. The UTF-8 encoder sets
// it after writing the BOM and the UTF-8 decoder sets it after handling the
// start of the input, so that neither is done twice for one stream.
680enum { HeaderDone = 1 };
681
682template <typename OnErrorLambda> Q_ALWAYS_INLINE
683char *QUtf8::convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept
684{
685 qsizetype len = in.size();
686
687 uchar *dst = reinterpret_cast<uchar *>(out);
688 const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
689 const char16_t *const end = src + len;
690
691 while (src != end) {
692 const char16_t *nextAscii = end;
693 if (simdEncodeAscii(dst, nextAscii, src, end))
694 break;
695
696 do {
697 char16_t u = *src++;
698 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
699 if (Q_UNLIKELY(res < 0))
700 onError(dst, u, res);
701 } while (src < nextAscii);
702 }
703
704 return reinterpret_cast<char *>(dst);
705}
706
707char *QUtf8::convertFromUnicode(char *dst, QStringView in) noexcept
708{
709 return convertFromUnicode(dst, in, [](auto *dst, ...) {
710 // encoding error - append '?'
711 *dst++ = '?';
712 });
713}
714
715QByteArray QUtf8::convertFromUnicode(QStringView in)
716{
717 qsizetype len = in.size();
718
719 // create a QByteArray with the worst case scenario size
720 QByteArray result(len * 3, Qt::Uninitialized);
721 char *dst = const_cast<char *>(result.constData());
722 dst = convertFromUnicode(dst, in);
723 result.truncate(dst - result.constData());
724 return result;
725}
726
727QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverter::State *state)
728{
729 QByteArray ba(3*in.size() +3, Qt::Uninitialized);
730 char *end = convertFromUnicode(ba.data(), in, state);
731 ba.truncate(end - ba.data());
732 return ba;
733}
734
735char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
736{
737 Q_ASSERT(state);
738 qsizetype len = in.size();
739 if (!len)
740 return out;
741
742 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
743 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
744 *cursor++ = 0;
745 } else {
746 // QChar::replacement encoded in utf8
747 *cursor++ = 0xef;
748 *cursor++ = 0xbf;
749 *cursor++ = 0xbd;
750 }
751 return cursor;
752 };
753
754 uchar *cursor = reinterpret_cast<uchar *>(out);
755 const char16_t *src = in.utf16();
756 const char16_t *const end = src + len;
757
758 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
759 if (state->remainingChars) {
760 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
761 if (res < 0)
762 cursor = appendReplacementChar(cursor);
763 state->state_data[0] = 0;
764 state->remainingChars = 0;
765 } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
766 // append UTF-8 BOM
767 *cursor++ = utf8bom[0];
768 *cursor++ = utf8bom[1];
769 *cursor++ = utf8bom[2];
770 state->internalState |= HeaderDone;
771 }
772 }
773
774 out = reinterpret_cast<char *>(cursor);
775 return convertFromUnicode(out, { src, end }, [&](uchar *&cursor, char16_t uc, int res) {
776 if (res == QUtf8BaseTraits::Error) {
777 // encoding error
778 ++state->invalidChars;
779 cursor = appendReplacementChar(cursor);
780 } else if (res == QUtf8BaseTraits::EndOfString) {
781 if (state->flags & QStringConverter::Flag::Stateless) {
782 ++state->invalidChars;
783 cursor = appendReplacementChar(cursor);
784 } else {
785 state->remainingChars = 1;
786 state->state_data[0] = uc;
787 }
788 }
789 });
790}
791
792char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
793{
794 // ### SIMD-optimize:
795 for (uchar ch : in) {
796 if (ch < 128) {
797 *out++ = ch;
798 } else {
799 // as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row
800 *out++ = 0b110'0'0000u | (ch >> 6);
801 *out++ = 0b10'00'0000u | (ch & 0b0011'1111);
802 }
803 }
804 return out;
805}
806
807QString QUtf8::convertToUnicode(QByteArrayView in)
808{
809 // UTF-8 to UTF-16 always needs the exact same number of words or less:
810 // UTF-8 UTF-16
811 // 1 byte 1 word
812 // 2 bytes 1 word
813 // 3 bytes 1 word
814 // 4 bytes 2 words (one surrogate pair)
815 // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
816 // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
817 // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
818 //
819 // The table holds for invalid sequences too: we'll insert one replacement char
820 // per invalid byte.
821 QString result(in.size(), Qt::Uninitialized);
822 QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
823 const QChar *end = convertToUnicode(data, in);
824 result.truncate(end - data);
825 return result;
826}
827
828/*! \internal
829 \since 6.6
830 \overload
831
832 Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
833 QChar starting at \a dst in the destination buffer. The buffer is expected
834 to be large enough to hold the result. An upper bound for the size of the
835 buffer is \c in.size() QChars.
836
837 If, during decoding, an error occurs, a QChar::ReplacementCharacter is
838 written.
839
840 Returns a pointer to one past the last QChar written.
841
842 This function never throws.
843
844 For QChar buffers, instead of casting manually, you can use the static
845 QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
846*/
847char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
848{
849 // check if have to skip a BOM
850 auto bom = QByteArrayView::fromArray(utf8bom);
851 if (in.size() >= bom.size() && in.first(bom.size()) == bom)
852 in.slice(sizeof(utf8bom));
853
854 return convertToUnicode(dst, in, [](char16_t *&dst, ...) {
855 // decoding error
856 *dst++ = QChar::ReplacementCharacter;
857 return true; // continue decoding
858 });
859}
860
861template <typename OnErrorLambda> Q_ALWAYS_INLINE char16_t *
862QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept
863{
864 const uchar *const start = reinterpret_cast<const uchar *>(in.data());
865 const uchar *src = start;
866 const uchar *end = src + in.size();
867
868 // attempt to do a full decoding in SIMD
869 const uchar *nextAscii = end;
870 while (src < end) {
871 nextAscii = end;
872 if (simdDecodeAscii(dst, nextAscii, src, end))
873 break;
874
875 do {
876 uchar b = *src++;
877 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
878 if (Q_LIKELY(res >= 0))
879 continue;
880 // decoding error
881 if (!onError(dst, src, res))
882 return dst;
883 } while (src < nextAscii);
884 }
885
886 return dst;
887}
888
889QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
890{
891 // See above for buffer requirements for stateless decoding. However, that
892 // fails if the state is not empty. The following situations can add to the
893 // requirements:
894 // state contains chars starts with requirement
895 // 1 of 2 bytes valid continuation 0
896 // 2 of 3 bytes same 0
897 // 3 bytes of 4 same +1 (need to insert surrogate pair)
898 // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
899 // 2 of 3 bytes same +1 (same)
900 // 3 of 4 bytes same +1 (same)
901 QString result(in.size() + 1, Qt::Uninitialized);
902 QChar *end = convertToUnicode(result.data(), in, state);
903 result.truncate(end - result.constData());
904 return result;
905}
906
907char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state)
908{
909 qsizetype len = in.size();
910
911 Q_ASSERT(state);
912 if (!len)
913 return dst;
914
915
916 char16_t replacement = QChar::ReplacementCharacter;
917 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
918 replacement = QChar::Null;
919
920 qsizetype res;
921
922 const uchar *src = reinterpret_cast<const uchar *>(in.data());
923 const uchar *end = src + len;
924
925 if (!(state->flags & QStringConverter::Flag::Stateless)) {
926 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
927 if (state->remainingChars || !headerdone) {
928 // handle incoming state first
929 uchar remainingCharsData[4]; // longest UTF-8 sequence possible
930 qsizetype remainingCharsCount = state->remainingChars;
931 qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
932
933 memset(remainingCharsData, 0, sizeof(remainingCharsData));
934 memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
935 memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
936
937 const uchar *begin = &remainingCharsData[1];
938 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
939 static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
940 if (res == QUtf8BaseTraits::Error) {
941 ++state->invalidChars;
942 *dst++ = replacement;
943 ++src;
944 } else if (res == QUtf8BaseTraits::EndOfString) {
945 // if we got EndOfString again, then there were too few bytes in src;
946 // copy to our state and return
947 state->remainingChars = remainingCharsCount + newCharsToCopy;
948 memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
949 return dst;
950 } else if (!headerdone) {
951 // eat the UTF-8 BOM
952 if (dst[-1] == 0xfeff)
953 --dst;
954 }
955 state->internalState |= HeaderDone;
956
957 // adjust src now that we have maybe consumed a few chars
958 if (res >= 0) {
959 Q_ASSERT(res > remainingCharsCount);
960 src += res - remainingCharsCount;
961 }
962 }
963 } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
964 // stateless, remove initial BOM
965 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
966 // skip BOM
967 src += 3;
968 }
969
970 // main body, stateless decoding
971 res = 0;
972 dst = convertToUnicode(dst, { src, end }, [&](char16_t *&dst, const uchar *src_, int res_) {
973 res = res_;
974 src = src_;
975 if (res == QUtf8BaseTraits::Error) {
976 res = 0;
977 ++state->invalidChars;
978 *dst++ = replacement;
979 }
980 return res == 0; // continue if plain decoding error
981 });
982
983 if (res == QUtf8BaseTraits::EndOfString) {
984 // unterminated UTF sequence
985 if (state->flags & QStringConverter::Flag::Stateless) {
986 *dst++ = QChar::ReplacementCharacter;
987 ++state->invalidChars;
988 while (src++ < end) {
989 *dst++ = QChar::ReplacementCharacter;
990 ++state->invalidChars;
991 }
992 state->remainingChars = 0;
993 } else {
994 --src; // unread the byte in ch
995 state->remainingChars = end - src;
996 memcpy(&state->state_data[0], src, end - src);
997 }
998 } else {
999 state->remainingChars = 0;
1000 }
1001
1002 return dst;
1003}
1004
1006{
1007 struct NoOutput {};
1008 static void appendUtf16(const NoOutput &, char16_t) {}
1009 static void appendUcs4(const NoOutput &, char32_t) {}
1010};
1011
1012QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
1013{
1014 const uchar *src = reinterpret_cast<const uchar *>(in.data());
1015 const uchar *end = src + in.size();
1016 const uchar *nextAscii = src;
1017 bool isValidAscii = true;
1018
1019 while (src < end) {
1020 if (src >= nextAscii)
1021 src = simdFindNonAscii(src, end, nextAscii);
1022 if (src == end)
1023 break;
1024
1025 do {
1026 uchar b = *src++;
1027 if ((b & 0x80) == 0)
1028 continue;
1029
1030 isValidAscii = false;
1031 QUtf8NoOutputTraits::NoOutput output;
1032 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
1033 if (res < 0) {
1034 // decoding error
1035 return { false, false };
1036 }
1037 } while (src < nextAscii);
1038 }
1039
1040 return { true, isValidAscii };
1041}
1042
1043int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept
1044{
1045 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
1046 auto end1 = src1 + utf8.size();
1047 auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
1048 auto end2 = src2 + utf16.size();
1049
1050 do {
1051 simdCompareAscii(src1, end1, src2, end2);
1052
1053 if (src1 < end1 && src2 < end2) {
1054 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1055 char32_t uc2 = *src2++;
1056
1057 if (uc1 >= 0x80) {
1058 // Only decode the UTF-16 surrogate pair if the UTF-8 code point
1059 // wasn't US-ASCII (a surrogate cannot match US-ASCII).
1060 if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
1061 uc2 = QChar::surrogateToUcs4(uc2, *src2++);
1062 }
1063 if (cs == Qt::CaseInsensitive) {
1064 uc1 = QChar::toCaseFolded(uc1);
1065 uc2 = QChar::toCaseFolded(uc2);
1066 }
1067 if (uc1 != uc2)
1068 return int(uc1) - int(uc2);
1069 }
1070 } while (src1 < end1 && src2 < end2);
1071
1072 // the shorter string sorts first
1073 return (end1 > src1) - int(end2 > src2);
1074}
1075
1076int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
1077{
1078 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
1079 auto end1 = src1 + utf8.size();
1080 auto src2 = reinterpret_cast<const uchar *>(s.latin1());
1081 auto end2 = src2 + s.size();
1082
1083 while (src1 < end1 && src2 < end2) {
1084 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1085 char32_t uc2 = *src2++;
1086 if (cs == Qt::CaseInsensitive) {
1087 uc1 = QChar::toCaseFolded(uc1);
1088 uc2 = QChar::toCaseFolded(uc2);
1089 }
1090 if (uc1 != uc2)
1091 return int(uc1) - int(uc2);
1092 }
1093
1094 // the shorter string sorts first
1095 return (end1 > src1) - (end2 > src2);
1096}
1097
1098int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept
1099{
1100 if (lhs.isEmpty())
1101 return qt_lencmp(0, rhs.size());
1102
1103 if (rhs.isEmpty())
1104 return qt_lencmp(lhs.size(), 0);
1105
1106 if (cs == Qt::CaseSensitive) {
1107 const auto l = std::min(lhs.size(), rhs.size());
1108 int r = memcmp(lhs.data(), rhs.data(), l);
1109 return r ? r : qt_lencmp(lhs.size(), rhs.size());
1110 }
1111
1112 auto src1 = reinterpret_cast<const qchar8_t *>(lhs.data());
1113 auto end1 = src1 + lhs.size();
1114 auto src2 = reinterpret_cast<const qchar8_t *>(rhs.data());
1115 auto end2 = src2 + rhs.size();
1116
1117 while (src1 < end1 && src2 < end2) {
1118 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1119 char32_t uc2 = QUtf8Functions::nextUcs4FromUtf8(src2, end2);
1120
1121 uc1 = QChar::toCaseFolded(uc1);
1122 uc2 = QChar::toCaseFolded(uc2);
1123 if (uc1 != uc2)
1124 return int(uc1) - int(uc2);
1125 }
1126
1127 // the shorter string sorts first
1128 return (end1 > src1) - (end2 > src2);
1129}
1130
1131#ifndef QT_BOOTSTRAPPED
1132QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1133{
1134 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1135 qsizetype length = 2 * in.size();
1136 if (writeBom)
1137 length += 2;
1138
1139 QByteArray d(length, Qt::Uninitialized);
1140 char *end = convertFromUnicode(d.data(), in, state, endian);
1141 Q_ASSERT(end - d.constData() == d.size());
1142 Q_UNUSED(end);
1143 return d;
1144}
1145
1146char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
1147{
1148 Q_ASSERT(state);
1149 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1150
1151 if (endian == DetectEndianness)
1152 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1153
1154 if (writeBom) {
1155 // set them up the BOM
1156 QChar bom(QChar::ByteOrderMark);
1157 if (endian == BigEndianness)
1158 qToBigEndian(bom.unicode(), out);
1159 else
1160 qToLittleEndian(bom.unicode(), out);
1161 out += 2;
1162 }
1163 if (endian == BigEndianness)
1164 qToBigEndian<char16_t>(in.data(), in.size(), out);
1165 else
1166 qToLittleEndian<char16_t>(in.data(), in.size(), out);
1167
1168 state->remainingChars = 0;
1169 state->internalState |= HeaderDone;
1170 return out + 2*in.size();
1171}
1172
1173QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1174{
1175 QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
1176 QChar *qch = convertToUnicode(result.data(), in, state, endian);
1177 result.truncate(qch - result.constData());
1178 return result;
1179}
1180
1181QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1182{
1183 qsizetype len = in.size();
1184 const char *chars = in.data();
1185
1186 Q_ASSERT(state);
1187
1188 if (endian == DetectEndianness)
1189 endian = (DataEndianness)state->state_data[Endian];
1190
1191 const char *end = chars + len;
1192
1193 // make sure we can decode at least one char
1194 if (state->remainingChars + len < 2) {
1195 if (len) {
1196 Q_ASSERT(state->remainingChars == 0 && len == 1);
1197 state->remainingChars = 1;
1198 state->state_data[Data] = *chars;
1199 }
1200 return out;
1201 }
1202
1203 bool headerdone = state && state->internalState & HeaderDone;
1204 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1205 headerdone = true;
1206
1207 if (!headerdone || state->remainingChars) {
1208 uchar buf;
1209 if (state->remainingChars)
1210 buf = state->state_data[Data];
1211 else
1212 buf = *chars++;
1213
1214 // detect BOM, set endianness
1215 state->internalState |= HeaderDone;
1216 QChar ch(buf, *chars++);
1217 if (endian == DetectEndianness) {
1218 // someone set us up the BOM
1219 if (ch == QChar::ByteOrderSwapped) {
1220 endian = BigEndianness;
1221 } else if (ch == QChar::ByteOrderMark) {
1222 endian = LittleEndianness;
1223 } else {
1224 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1225 endian = BigEndianness;
1226 } else {
1227 endian = LittleEndianness;
1228 }
1229 }
1230 }
1231 if (endian == BigEndianness)
1232 ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
1233 if (headerdone || ch != QChar::ByteOrderMark)
1234 *out++ = ch;
1235 } else if (endian == DetectEndianness) {
1236 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1237 }
1238
1239 qsizetype nPairs = (end - chars) >> 1;
1240 if (endian == BigEndianness)
1241 qFromBigEndian<char16_t>(chars, nPairs, out);
1242 else
1243 qFromLittleEndian<char16_t>(chars, nPairs, out);
1244 out += nPairs;
1245
1246 state->state_data[Endian] = endian;
1247 state->remainingChars = 0;
1248 if ((end - chars) & 1) {
1249 if (state->flags & QStringConverter::Flag::Stateless) {
1250 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1251 } else {
1252 state->remainingChars = 1;
1253 state->state_data[Data] = *(end - 1);
1254 }
1255 } else {
1256 state->state_data[Data] = 0;
1257 }
1258
1259 return out;
1260}
1261
1262QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1263{
1264 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1265 qsizetype length = 4*in.size();
1266 if (writeBom)
1267 length += 4;
1268 QByteArray ba(length, Qt::Uninitialized);
1269 char *end = convertFromUnicode(ba.data(), in, state, endian);
1270 ba.truncate(end - ba.constData());
1271 return ba;
1272}
1273
1274char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
1275{
1276 Q_ASSERT(state);
1277
1278 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1279 if (endian == DetectEndianness)
1280 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1281
1282 if (writeBom) {
1283 // set them up the BOM
1284 if (endian == BigEndianness) {
1285 out[0] = 0;
1286 out[1] = 0;
1287 out[2] = (char)0xfe;
1288 out[3] = (char)0xff;
1289 } else {
1290 out[0] = (char)0xff;
1291 out[1] = (char)0xfe;
1292 out[2] = 0;
1293 out[3] = 0;
1294 }
1295 out += 4;
1296 state->internalState |= HeaderDone;
1297 }
1298
1299 const QChar *uc = in.data();
1300 const QChar *end = in.data() + in.size();
1301 QChar ch;
1302 char32_t ucs4;
1303 if (state->remainingChars == 1) {
1304 auto character = state->state_data[Data];
1305 Q_ASSERT(character <= 0xFFFF);
1306 ch = QChar(character);
1307 // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
1308 state->remainingChars = 0;
1309 goto decode_surrogate;
1310 }
1311
1312 while (uc < end) {
1313 ch = *uc++;
1314 if (Q_LIKELY(!ch.isSurrogate())) {
1315 ucs4 = ch.unicode();
1316 } else if (Q_LIKELY(ch.isHighSurrogate())) {
1317decode_surrogate:
1318 if (uc == end) {
1319 if (state->flags & QStringConverter::Flag::Stateless) {
1320 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1321 } else {
1322 state->remainingChars = 1;
1323 state->state_data[Data] = ch.unicode();
1324 return out;
1325 }
1326 } else if (uc->isLowSurrogate()) {
1327 ucs4 = QChar::surrogateToUcs4(ch, *uc++);
1328 } else {
1329 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1330 }
1331 } else {
1332 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1333 }
1334 if (endian == BigEndianness)
1335 qToBigEndian(ucs4, out);
1336 else
1337 qToLittleEndian(ucs4, out);
1338 out += 4;
1339 }
1340
1341 return out;
1342}
1343
1344QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1345{
1346 QString result;
1347 result.resize((in.size() + 7) >> 1); // worst case
1348 QChar *end = convertToUnicode(result.data(), in, state, endian);
1349 result.truncate(end - result.constData());
1350 return result;
1351}
1352
1353QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1354{
1355 qsizetype len = in.size();
1356 const char *chars = in.data();
1357
1358 Q_ASSERT(state);
1359 if (endian == DetectEndianness)
1360 endian = (DataEndianness)state->state_data[Endian];
1361
1362 const char *end = chars + len;
1363
1364 uchar tuple[4];
1365 memcpy(tuple, &state->state_data[Data], 4);
1366
1367 // make sure we can decode at least one char
1368 if (state->remainingChars + len < 4) {
1369 if (len) {
1370 while (chars < end) {
1371 tuple[state->remainingChars] = *chars;
1372 ++state->remainingChars;
1373 ++chars;
1374 }
1375 Q_ASSERT(state->remainingChars < 4);
1376 memcpy(&state->state_data[Data], tuple, 4);
1377 }
1378 return out;
1379 }
1380
1381 bool headerdone = state->internalState & HeaderDone;
1382 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1383 headerdone = true;
1384
1385 qsizetype num = state->remainingChars;
1386 state->remainingChars = 0;
1387
1388 if (!headerdone || endian == DetectEndianness || num) {
1389 while (num < 4)
1390 tuple[num++] = *chars++;
1391 if (endian == DetectEndianness) {
1392 // someone set us up the BOM?
1393 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1394 endian = LittleEndianness;
1395 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1396 endian = BigEndianness;
1397 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1398 endian = BigEndianness;
1399 } else {
1400 endian = LittleEndianness;
1401 }
1402 }
1403 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1404 if (headerdone || code != QChar::ByteOrderMark) {
1405 if (QChar::requiresSurrogates(code)) {
1406 *out++ = QChar(QChar::highSurrogate(code));
1407 *out++ = QChar(QChar::lowSurrogate(code));
1408 } else {
1409 *out++ = QChar(code);
1410 }
1411 }
1412 num = 0;
1413 } else if (endian == DetectEndianness) {
1414 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1415 }
1416 state->state_data[Endian] = endian;
1417 state->internalState |= HeaderDone;
1418
1419 while (chars < end) {
1420 tuple[num++] = *chars++;
1421 if (num == 4) {
1422 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1423 for (char16_t c : QChar::fromUcs4(code))
1424 *out++ = c;
1425 num = 0;
1426 }
1427 }
1428
1429 if (num) {
1430 if (state->flags & QStringDecoder::Flag::Stateless) {
1431 *out++ = QChar::ReplacementCharacter;
1432 } else {
1433 state->state_data[Endian] = endian;
1434 state->remainingChars = num;
1435 memcpy(&state->state_data[Data], tuple, 4);
1436 }
1437 }
1438
1439 return out;
1440}
1441#endif // !QT_BOOTSTRAPPED
1442
1443#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1444int QLocal8Bit::checkUtf8()
1445{
1446 return GetACP() == CP_UTF8 ? 1 : -1;
1447}
1448
1449QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1450{
1451 return convertToUnicode_sys(in, CP_ACP, state);
1452}
1453
// Decodes the bytes of `in`, encoded in the Windows code page `codePage`,
// into a QString using MultiByteToWideChar().
//
// `state` must not be null. Its flags select the replacement character
// (ConvertInvalidToNull) and whether undecodable trailing bytes may be kept
// for the next call (they are kept in state->state_data unless Stateless is
// set). state->invalidChars is increased for every replacement character
// this function writes.
//
// Returns a null QString for null or empty input, and also when the output
// buffer cannot be grown.
1454QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
1455 QStringConverter::State *state)
1456{
1457 const char *mb = in.data();
1458 qsizetype mblen = in.size();
1459
1460 Q_ASSERT(state);
1461 qsizetype &invalidChars = state->invalidChars;
1462 using Flag = QStringConverter::Flag;
1463 const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
1464 const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
1465 : QChar::ReplacementCharacter;
 // From here on a null `state` means "stateless": nothing is read from or
 // saved to it. invalidChars above still refers to the caller's counter.
1466 if (state->flags & Flag::Stateless) {
1467 Q_ASSERT(state->remainingChars == 0);
1468 state = nullptr;
1469 }
1470
1471 if (!mb || !mblen)
1472 return QString();
1473
1474 // Use a local stack-buffer at first to allow us a decently large container
1475 // to avoid a lot of resizing, without also returning an overallocated
1476 // QString to the user for small strings.
1477 // Then we can be fast for small strings and take the hit of extra resizes
1478 // and measuring how much storage is needed for large strings.
1479 std::array<wchar_t, 4096> buf;
 // out is the current write position, outlen the space available at out
1480 wchar_t *out = buf.data();
1481 qsizetype outlen = buf.size();
1482
 // Heap storage; stays empty for as long as the stack buffer is sufficient
1483 QString sp;
1484
1485 // Return a pointer to storage where we have enough space for `size`
 // (together with the space available there). When the current storage is
 // too small, sp is resized to hold what was written so far plus `size`;
 // on the first such call the contents of the stack buffer are copied over.
 // Returns {nullptr, 0} if the new size overflows.
1486 const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
1487 if (outlen >= size)
1488 return {out, outlen};
1489 const bool wasStackBuffer = sp.isEmpty();
1490 const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
1491 const qsizetype offset = qsizetype(std::distance(begin, out));
1492 qsizetype newSize = 0;
1493 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1494 Q_CHECK_PTR(false);
1495 return {nullptr, 0};
1496 }
1497 sp.resize(newSize);
1498 auto it = reinterpret_cast<wchar_t *>(sp.data());
1499 if (wasStackBuffer)
1500 it = std::copy_n(buf.data(), offset, it);
1501 else
1502 it += offset;
1503 return {it, size};
1504 };
1505
1506 // Convert the pending characters (if available)
1507 while (state && state->remainingChars && mblen) {
1508 QStringConverter::State localState;
1509 localState.flags = state->flags;
1510 // Use at most 6 characters as a guess for the longest encoded character
1511 // in any multibyte encoding.
1512 // Even with a total of 2 bytes of overhead that would leave around
1513 // 2^(4 * 8) possible characters
1514 std::array<char, 6> prev = {0};
1515 Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
 // prev = the bytes saved in the state, followed by as many new input
 // bytes as still fit
1516 qsizetype index = 0;
1517 for (; index < state->remainingChars; ++index)
1518 prev[index] = state->state_data[index];
1519 const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
1520 for (qsizetype i = 0; i < toCopy; ++i, ++index)
1521 prev[index] = mb[i];
1522 mb += toCopy;
1523 mblen -= toCopy;
1524
1525 // Recursing:
1526 // Since we are using a clean local state it will try to decode what was
1527 // stored in our state + some extra octets from input (`prev`). If some
1528 // part fails we will have those characters stored in the local state's
1529 // storage, and we can extract those. It may also output some
1530 // replacement characters, which we'll count in the invalidChars.
1531 // In the best case we only do this once, but we will loop until we have
1532 // resolved all the remaining characters or we have run out of new input
1533 // in which case we may still have remaining characters.
1534 const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
1535 &localState);
1536 std::tie(out, outlen) = growOut(tmp.size());
1537 if (!out)
1538 return {};
1539 out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
1540 outlen -= tmp.size();
 // tail >= 0: all bytes that the recursive call left undecoded are among
 // the toCopy bytes that were taken from `in` in this iteration
1541 const qsizetype tail = toCopy - localState.remainingChars;
1542 if (tail >= 0) {
1543 // Everything left to process comes from `in`, so we can stop
1544 // looping. Adjust the window for `in` and unset remainingChars to
1545 // signal that we're done.
1546 mb -= localState.remainingChars;
1547 mblen += localState.remainingChars;
1548 localState.remainingChars = 0;
1549 }
1550 state->remainingChars = localState.remainingChars;
1551 state->invalidChars += localState.invalidChars;
1552 std::copy_n(localState.state_data, state->remainingChars, state->state_data);
1553 }
1554
1555 Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
1556
1557 // Need it in this scope, since we try to decrease our window size if we
1558 // encounter an error
 // nextIn is the number of input bytes passed to the next conversion call.
 // It is clamped to int, so large inputs are converted in several rounds.
1559 int nextIn = q26::saturate_cast<int>(mblen);
1560 while (mblen > 0) {
1561 std::tie(out, outlen) = growOut(1); // Need space for at least one character
1562 if (!out)
1563 return {};
1564 const int nextOut = q26::saturate_cast<int>(outlen);
1565 int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
1566 if (len) {
 // The whole window was converted: advance input and output
1567 mb += nextIn;
1568 mblen -= nextIn;
1569 out += len;
1570 outlen -= len;
1571 } else {
1572 int r = GetLastError();
1573 if (r == ERROR_INSUFFICIENT_BUFFER) {
 // Ask for the required size, grow the output; the window is then
 // retried by the next loop iteration
1574 const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
1575 std::tie(out, outlen) = growOut(wclen);
1576 if (!out)
1577 return {};
1578 } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
1579 // Can't decode the current window, so either store the state,
1580 // reduce window size or output a replacement character.
1581
1582 // Check if we can store all remaining characters in the state
1583 // to be used next time we're called:
1584 if (state && mblen <= q20::ssize(state->state_data)) {
1585 state->remainingChars = mblen;
1586 std::copy_n(mb, mblen, state->state_data);
1587 mb += mblen;
1588 mblen = 0;
1589 break;
1590 }
1591
1592 // .. if not, try to find the last valid character in the window
1593 // and try again with a shrunken window:
1594 if (nextIn > 1) {
1595 // There may be some incomplete data at the end of our current
1596 // window, so decrease the window size and try again.
1597 // In the worst case scenario there is gigs of undecodable
1598 // garbage, but what are we supposed to do about that?
1599 const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
1600 if (it != mb)
1601 nextIn = int(it - mb);
1602 else
1603 --nextIn;
 // `continue` skips the reset of nextIn at the bottom of the
 // loop, so the shrunken window is what gets retried
1604 continue;
1605 }
1606
1607 // Finally, we are forced to output a replacement character for
1608 // the first byte in the window:
1609 std::tie(out, outlen) = growOut(1);
1610 if (!out)
1611 return {};
1612 *out = replacementCharacter;
1613 ++invalidChars;
1614 ++out;
1615 --outlen;
1616 ++mb;
1617 --mblen;
1618 } else {
1619 // Fail.
1620 qWarning("MultiByteToWideChar: Cannot convert multibyte text");
1621 break;
1622 }
1623 }
 // Start the next round with everything that is left
1624 nextIn = q26::saturate_cast<int>(mblen);
1625 }
1626
 // Produce the final string from whichever storage ended up being used
1627 if (sp.isEmpty()) {
1628 // We must have only used the stack buffer
1629 if (out != buf.data()) // else: we return null-string
1630 sp = QStringView(buf.data(), out).toString();
1631 } else{
1632 const auto begin = reinterpret_cast<wchar_t *>(sp.data());
1633 sp.truncate(std::distance(begin, out));
1634 }
1635
 // Drop one trailing null character, if the result ends in one
1636 if (sp.size() && sp.back().isNull())
1637 sp.chop(1);
1638
1639 if (!state && mblen > 0) {
1640 // We have trailing character(s) that could not be converted, and
1641 // nowhere to cache them
1642 sp.resize(sp.size() + mblen, replacementCharacter);
1643 invalidChars += mblen;
1644 }
1645 return sp;
1646}
1647
1648QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1649{
1650 return convertFromUnicode_sys(in, CP_ACP, state);
1651}
1652
// Encodes the UTF-16 text of `in` into the Windows code page `codePage`
// using WideCharToMultiByte().
//
// `state` must not be null. Unless the Stateless flag is set, a high
// surrogate at the very end of `in` is kept in state->state_data[0] and is
// combined with the first character of the input of the next call.
//
// Returns a null QByteArray for null input, an empty (non-null) one for
// empty input, and a null one when the output buffer cannot be grown.
1653QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
1654 QStringConverter::State *state)
1655{
1656 const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
1657 qsizetype uclen = in.size();
1658
1659 Q_ASSERT(state);
1660 // The Windows API has a *boolean* out-parameter that says if a replacement
1661 // character was used, but it gives us no way to know _how many_ were used.
1662 // Since we cannot simply scan the string for replacement characters
1663 // (which is potentially a question mark, and thus a valid character),
1664 // we simply do not track the number of invalid characters here.
1665 // auto &invalidChars = state->invalidChars;
1666
1667 using Flag = QStringConverter::Flag;
 // From here on a null `state` means "stateless": nothing is read from or
 // saved to it.
1668 if (state->flags & Flag::Stateless) { // temporary
1669 Q_ASSERT(state->remainingChars == 0);
1670 state = nullptr;
1671 }
1672
1673 if (!ch)
1674 return QByteArray();
1675 if (uclen == 0)
1676 return QByteArray("");
1677
1678 // Use a local stack-buffer at first to allow us a decently large container
1679 // to avoid a lot of resizing, without also returning an overallocated
1680 // QByteArray to the user for small strings.
1681 // Then we can be fast for small strings and take the hit of extra resizes
1682 // and measuring how much storage is needed for large strings.
1683 std::array<char, 4096> buf;
 // out is the current write position, outlen the space available at out
1684 char *out = buf.data();
1685 qsizetype outlen = buf.size();
 // Heap storage; stays empty for as long as the stack buffer is sufficient
1686 QByteArray mb;
1687
1688 if (state && state->remainingChars > 0) {
1689 Q_ASSERT(state->remainingChars == 1);
1690 // Let's try to decode the pending character
1691 wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
1692 // Check if the second character is a valid low surrogate,
1693 // otherwise we'll just decode the first character, for which windows
1694 // will output a replacement character.
1695 const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
1696 int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen, nullptr,
1697 nullptr);
1698 if (!len)
1699 return {}; // Cannot recover, and I refuse to believe it was a size limitation
1700 out += len;
1701 outlen -= len;
 // The first input character is consumed only if it completed the pair
1702 if (validCodePoint) {
1703 ++ch;
1704 --uclen;
1705 }
1706 state->remainingChars = 0;
1707 state->state_data[0] = 0;
1708 if (uclen == 0)
1709 return QByteArrayView(buf.data(), len).toByteArray();
1710 }
1711
1712 if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
1713 // We can handle a missing low surrogate at the end of the string,
1714 // so if there is one, exclude it now and store it in the state.
1715 state->remainingChars = 1;
1716 state->state_data[0] = ch[uclen - 1];
1717 --uclen;
 // NOTE(review): if the pending-character branch above has already
 // written bytes to buf, they are not part of the value returned here -
 // confirm whether that combination can occur in practice.
1718 if (uclen == 0)
1719 return QByteArray();
1720 }
1721
1722 Q_ASSERT(uclen > 0);
1723
1724 // Return a pointer to storage where we have enough space for `size`
 // (together with the space available there). When the current storage is
 // too small, mb is resized to hold what was written so far plus `size`;
 // on the first such call the contents of the stack buffer are copied over.
 // Returns {nullptr, 0} if the new size overflows.
1725 const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
1726 if (outlen >= size)
1727 return {out, outlen};
1728 const bool wasStackBuffer = mb.isEmpty();
1729 const auto begin = wasStackBuffer ? buf.data() : mb.data();
1730 const qsizetype offset = qsizetype(std::distance(begin, out));
1731 qsizetype newSize = 0;
1732 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1733 Q_CHECK_PTR(false);
1734 return {nullptr, 0};
1735 }
1736 mb.resize(newSize);
1737 auto it = mb.data();
1738 if (wasStackBuffer)
1739 it = std::copy_n(buf.data(), offset, it);
1740 else
1741 it += offset;
1742 return {it, size};
1743 };
1744
 // Number of UTF-16 code units to pass to the next conversion call: what
 // is left of the input, clamped to int
1745 const auto getNextWindowSize = [&]() {
1746 int nextIn = q26::saturate_cast<int>(uclen);
1747 // The Windows API has some issues if the current window ends in the
1748 // middle of a surrogate pair, so we avoid that:
1749 if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
1750 --nextIn;
1751 return nextIn;
1752 };
1753
1754 int len = 0;
1755 while (uclen > 0) {
1756 const int nextIn = getNextWindowSize();
1757 std::tie(out, outlen) = growOut(1); // We need at least one byte
1758 if (!out)
1759 return {};
1760 const int nextOut = q26::saturate_cast<int>(outlen);
1761 len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
1762 if (len > 0) {
 // The whole window was converted: advance input and output
1763 ch += nextIn;
1764 uclen -= nextIn;
1765 out += len;
1766 outlen -= len;
1767 } else {
1768 int r = GetLastError();
1769 if (r == ERROR_INSUFFICIENT_BUFFER) {
 // Ask for the required size and grow the output accordingly
1770 int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
1771 nullptr, nullptr);
1772 if (neededLength <= 0) {
1773 // Fail. Observed with UTF8 where the input window was max int and ended in an
1774 // incomplete sequence, probably a Windows bug. We try to avoid that from
1775 // happening by reducing the window size in that case. But let's keep this
1776 // branch just in case of other bugs.
1777#ifndef QT_NO_DEBUG
1778 r = GetLastError();
1779 fprintf(stderr,
1780 "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
1781#endif // !QT_NO_DEBUG
1782 break;
1783 }
1784 std::tie(out, outlen) = growOut(neededLength);
1785 if (!out)
1786 return {};
1787 // and try again...
1788 } else {
1789 // Fail. Probably can't happen in fact (dwFlags is 0).
1790#ifndef QT_NO_DEBUG
1791 // Can't use qWarning(), as it'll recurse to handle %ls
1792 fprintf(stderr,
1793 "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1794 r, qt_castToWchar(QStringView(ch, uclen).left(100).toString()));
1795#endif
1796 break;
1797 }
1798 }
1799 }
 // Produce the final array from whichever storage ended up being used
1800 if (mb.isEmpty()) {
1801 // We must have only used the stack buffer
1802 if (out != buf.data()) // else: we return null-array
1803 mb = QByteArrayView(buf.data(), out).toByteArray();
1804 } else {
1805 mb.truncate(std::distance(mb.data(), out));
1806 }
1807 return mb;
1808}
1809#endif
1810
1811void QStringConverter::State::clear() noexcept
1812{
1813 if (clearFn)
1814 clearFn(this);
1815 else
1816 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1817 remainingChars = 0;
1818 invalidChars = 0;
1819 internalState = 0;
1820}
1821
1822void QStringConverter::State::reset() noexcept
1823{
1824 if (flags & Flag::UsesIcu) {
1825#if defined(QT_USE_ICU_CODECS)
1826 QT_COM_THREAD_INIT
1827 UConverter *converter = static_cast<UConverter *>(d[0]);
1828 if (converter)
1829 ucnv_reset(converter);
1830#else
1831 Q_UNREACHABLE();
1832#endif
1833 } else {
1834 clear();
1835 }
1836}
1837
1838#ifndef QT_BOOTSTRAPPED
1839static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1840{
1841 return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
1842}
1843
1844static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
1845{
1846 return QUtf16::convertFromUnicode(out, in, state, DetectEndianness);
1847}
1848
1849static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1850{
1851 return QUtf16::convertToUnicode(out, in, state, BigEndianness);
1852}
1853
1854static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
1855{
1856 return QUtf16::convertFromUnicode(out, in, state, BigEndianness);
1857}
1858
1859static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1860{
1861 return QUtf16::convertToUnicode(out, in, state, LittleEndianness);
1862}
1863
1864static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
1865{
1866 return QUtf16::convertFromUnicode(out, in, state, LittleEndianness);
1867}
1868
1869static QChar *fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
1870{
1871 return QUtf32::convertToUnicode(out, in, state, DetectEndianness);
1872}
1873
1874static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
1875{
1876 return QUtf32::convertFromUnicode(out, in, state, DetectEndianness);
1877}
1878
1879static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1880{
1881 return QUtf32::convertToUnicode(out, in, state, BigEndianness);
1882}
1883
1884static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
1885{
1886 return QUtf32::convertFromUnicode(out, in, state, BigEndianness);
1887}
1888
1889static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1890{
1891 return QUtf32::convertToUnicode(out, in, state, LittleEndianness);
1892}
1893
1894static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
1895{
1896 return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
1897}
1898#endif // !QT_BOOTSTRAPPED
1899
1900char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
1901{
1902 Q_ASSERT(state);
1903 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1904 state = nullptr;
1905
1906 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1907 qsizetype invalid = 0;
1908 for (qsizetype i = 0; i < in.size(); ++i) {
1909 if (in[i] > QChar(0xff)) {
1910 *out = replacement;
1911 ++invalid;
1912 } else {
1913 *out = (char)in[i].cell();
1914 }
1915 ++out;
1916 }
1917 if (state)
1918 state->invalidChars += invalid;
1919 return out;
1920}
1921
1922static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
1923{
1924 QString s = QLocal8Bit::convertToUnicode(in, state);
1925 memcpy(out, s.constData(), s.size()*sizeof(QChar));
1926 return out + s.size();
1927}
1928
1929static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
1930{
1931 QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1932 memcpy(out, s.constData(), s.size());
1933 return out + s.size();
1934}
1935
1936
1937static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1938static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1939
1940#ifndef QT_BOOTSTRAPPED
1941static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1942static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1943
1944static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1945static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1946#endif
1947
1948static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1949static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
1950
1951
1952
1953/*!
1954 \class QStringConverter
1955 \inmodule QtCore
1956 \brief The QStringConverter class provides a base class for encoding and decoding text.
1957 \reentrant
1958 \ingroup i18n
1959 \ingroup string-processing
1960
1961 Qt uses UTF-16 to store, draw and manipulate strings. In many
1962 situations you may wish to deal with data that uses a different
1963 encoding. Most text data transferred over files and network connections is encoded
1964 in UTF-8.
1965
1966 The QStringConverter class is a base class for the \l {QStringEncoder} and
1967 \l {QStringDecoder} classes that help with converting between different
1968 text encodings. QStringDecoder can decode a string from an encoded representation
1969 into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
1970 operation, encoding UTF-16 encoded data (usually in the form of a QString) to
1971 the requested encoding.
1972
1973 The following encodings are always supported:
1974
1975 \list
1976 \li UTF-8
1977 \li UTF-16
1978 \li UTF-16BE
1979 \li UTF-16LE
1980 \li UTF-32
1981 \li UTF-32BE
1982 \li UTF-32LE
1983 \li ISO-8859-1 (Latin-1)
1984 \li The system encoding
1985 \endlist
1986
1987 QStringConverter may support more encodings depending on how Qt was
1988 compiled. If more codecs are supported, they can be listed using
1989 availableCodecs().
1990
1991 \l {QStringConverter}s can be used as follows to convert some encoded
1992 string to and from UTF-16.
1993
1994 Suppose you have some string encoded in UTF-8, and
1995 want to convert it to a QString. The simple way
1996 to do it is to use a \l {QStringDecoder} like this:
1997
1998 \snippet code/src_corelib_text_qstringconverter.cpp 0
1999
2000 After this, \c string holds the text in decoded form.
2001 Converting a string from Unicode to the local encoding is just as
2002 easy using the \l {QStringEncoder} class:
2003
2004 \snippet code/src_corelib_text_qstringconverter.cpp 1
2005
2006 To read or write text files in various encodings, use QTextStream and
2007 its \l{QTextStream::setEncoding()}{setEncoding()} function.
2008
2009 Some care must be taken when trying to convert the data in chunks,
2010 for example, when receiving it over a network. In such cases it is
2011 possible that a multi-byte character will be split over two
2012 chunks. At best this might result in the loss of a character and
2013 at worst cause the entire conversion to fail.
2014
2015 Both QStringEncoder and QStringDecoder make this easy, by tracking
2016 this in an internal state. So simply calling the encoder or decoder
2017 again with the next chunk of data will automatically continue encoding
2018 or decoding the data correctly:
2019
2020 \snippet code/src_corelib_text_qstringconverter.cpp 2
2021
2022 The QStringDecoder object maintains state between chunks and therefore
2023 works correctly even if a multi-byte character is split between
2024 chunks.
2025
2026 QStringConverter objects can't be copied because of their internal state, but
2027 can be moved.
2028
2029 \sa QTextStream, QStringDecoder, QStringEncoder
2030*/
2031
2032/*!
2033 \enum QStringConverter::Flag
2034
2035 \value Default Default conversion rules apply.
2036 \value ConvertInvalidToNull If this flag is set, each invalid input
2037 character is output as a null character. If it is not set,
2038 invalid input characters are represented as QChar::ReplacementCharacter
2039 if the output encoding can represent that character, otherwise as a question mark.
2040 \value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
2041 character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
2042 encodings.
2043 \value ConvertInitialBom When converting from an input encoding to a QString the QStringDecoder usually skips a
2044 leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
2045 skipped, but converted to utf-16 and inserted at the start of the created QString.
2046 \value Stateless Ignore possible converter states between different function calls
2047 to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
2048 sequence of data is encountered.
2049 \omitvalue UsesIcu
2050*/
2051
2052/*!
2053 \enum QStringConverter::Encoding
2054 \value Utf8 Create a converter to or from UTF-8
2055 \value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
2056 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
2057 be assumed.
2058 \value Utf16BE Create a converter to or from big-endian UTF-16.
2059 \value Utf16LE Create a converter to or from little-endian UTF-16.
2060 \value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
2061 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
2062 be assumed.
2063 \value Utf32BE Create a converter to or from big-endian UTF-32.
2064 \value Utf32LE Create a converter to or from little-endian UTF-32.
2065 \value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
2066 \value System Create a converter to or from the underlying encoding of the
2067 operating system's locale. This is always assumed to be UTF-8 for Unix-based
2068 systems. On Windows, this converts to and from the locale code page.
2069 \omitvalue LastEncoding
2070*/
2071
2072/*!
2073 \struct QStringConverter::Interface
2074 \internal
2075*/
2076
2077const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
2078{
2079 { "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
2080#ifndef QT_BOOTSTRAPPED
2081 { "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
2082 { "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
2083 { "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
2084 { "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
2085 { "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
2086 { "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
2087#endif
2088 { "ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len },
2089 { "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
2090};
2091
2092// match names case insensitive and skipping '-' and '_'
2093template <typename Char>
2094static bool nameMatch_impl_impl(const char *a, const Char *b, const Char *b_end)
2095{
2096 do {
2097 while (*a == '-' || *a == '_')
2098 ++a;
2099 while (b != b_end && (*b == Char{'-'} || *b == Char{'_'}))
2100 ++b;
2101 if (!*a && b == b_end) // end of both strings
2102 return true;
2103 if (char16_t(*b) > 127)
2104 return false; // non-US-ASCII cannot match US-ASCII (prevents narrowing below)
2105 } while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(char(*b++)));
2106
2107 return false;
2108}
2109
2110static bool nameMatch_impl(const char *a, QLatin1StringView b)
2111{
2112 return nameMatch_impl_impl(a, b.begin(), b.end());
2113}
2114
2115static bool nameMatch_impl(const char *a, QUtf8StringView b)
2116{
2117 return nameMatch_impl(a, QLatin1StringView{QByteArrayView{b}});
2118}
2119
2120static bool nameMatch_impl(const char *a, QStringView b)
2121{
2122 return nameMatch_impl_impl(a, b.utf16(), b.utf16() + b.size()); // uses char16_t*, not QChar*
2123}
2124
2125static bool nameMatch(const char *a, QAnyStringView b)
2126{
2127 return b.visit([a](auto b) { return nameMatch_impl(a, b); });
2128}
2129
2130
2131/*!
2132 \fn constexpr QStringConverter::QStringConverter()
2133 \internal
2134*/
2135
2136/*!
2137 \fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
2138 \internal
2139*/
2140
2141
2142#if defined(QT_USE_ICU_CODECS)
2143// only derives from QStringConverter to get access to protected types
2144struct QStringConverterICU : QStringConverter
2145{
2146 static void clear_function(QStringConverter::State *state) noexcept
2147 {
2148 QT_COM_THREAD_INIT
2149 ucnv_close(static_cast<UConverter *>(state->d[0]));
2150 state->d[0] = nullptr;
2151 }
2152
2153 static void ensureConverter(QStringConverter::State *state)
2154 {
2155 // old code might reset the state via clear instead of reset
2156 // in that case, the converter has been closed, and we have to reopen it
2157 if (state->d[0] == nullptr)
2158 state->d[0] = createConverterForName(static_cast<const char *>(state->d[1]), state);
2159 }
2160
2161 static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
2162 {
2163 QT_COM_THREAD_INIT
2164 ensureConverter(state);
2165
2166 auto icu_conv = static_cast<UConverter *>(state->d[0]);
2167 UErrorCode err = U_ZERO_ERROR;
2168 auto source = in.data();
2169 auto sourceLimit = in.data() + in.size();
2170
2171 qsizetype length = toLen(in.size());
2172
2173 UChar *target = reinterpret_cast<UChar *>(out);
2174 auto targetLimit = target + length;
2175 // We explicitly clean up anyway, so no need to set flush to true,
2176 // which would just reset the converter.
2177 UBool flush = false;
2178
2179 // If the QStringConverter was moved, the state that we used as a context is stale now.
2180 UConverterToUCallback action;
2181 const void *context;
2182 ucnv_getToUCallBack(icu_conv, &action, &context);
2183 if (context != state)
2184 ucnv_setToUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
2185
2186 ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
2187 // We did reserve enough space:
2188 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2189 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2190 if (auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
2191 ucnv_reset(icu_conv);
2192 state->invalidChars += leftOver;
2193 }
2194 }
2195 return reinterpret_cast<QChar *>(target);
2196 }
2197
2198 static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
2199 {
2200 QT_COM_THREAD_INIT
2201 ensureConverter(state);
2202 auto icu_conv = static_cast<UConverter *>(state->d[0]);
2203 UErrorCode err = U_ZERO_ERROR;
2204 auto source = reinterpret_cast<const UChar *>(in.data());
2205 auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
2206
2207 qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
2208
2209 char *target = out;
2210 char *targetLimit = out + length;
2211 UBool flush = false;
2212
2213 // If the QStringConverter was moved, the state that we used as a context is stale now.
2214 UConverterFromUCallback action;
2215 const void *context;
2216 ucnv_getFromUCallBack(icu_conv, &action, &context);
2217 if (context != state)
2218 ucnv_setFromUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
2219
2220 ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
2221 // We did reserve enough space:
2222 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2223 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2224 if (auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
2225 ucnv_reset(icu_conv);
2226 state->invalidChars += leftOver;
2227 }
2228 }
2229 return target;
2230 }
2231
2232 Q_DISABLE_COPY_MOVE(QStringConverterICU)
2233
2234 template<qsizetype X>
2235 static qsizetype fromLen(qsizetype inLength)
2236 {
2237 return X * inLength * sizeof(UChar);
2238 }
2239
2240 static qsizetype toLen(qsizetype inLength)
2241 {
2242
2243 /* Assumption: each input char might map to a different codepoint
2244 Each codepoint can take up to 4 bytes == 2 QChar
2245 We can ignore reserving space for a BOM, as only UTF encodings use one
2246 and those are not handled by the ICU converter.
2247 */
2248 return 2 * inLength;
2249 }
2250
2251 static constexpr QStringConverter::Interface forLength[] = {
2252 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
2253 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
2254 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
2255 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
2256 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
2257 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
2258 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
2259 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
2260 };
2261
2262 static UConverter *createConverterForName(const char *name, const State *state)
2263 {
2264 Q_ASSERT(name);
2265 Q_ASSERT(state);
2266 QT_COM_THREAD_INIT
2267 UErrorCode status = U_ZERO_ERROR;
2268 UConverter *conv = ucnv_open(name, &status);
2269 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
2270 ucnv_close(conv);
2271 return nullptr;
2272 }
2273
2274 if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
2275 UErrorCode error = U_ZERO_ERROR;
2276
2277 auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2278 const char *, int32_t length,
2279 UConverterCallbackReason reason, UErrorCode *err) {
2280 if (reason <= UCNV_IRREGULAR) {
2281 *err = U_ZERO_ERROR;
2282 UChar c = '\0';
2283 ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
2284 // Recover outer scope's state (which isn't const) from context:
2285 auto state = const_cast<State *>(static_cast<const State *>(context));
2286 state->invalidChars += length;
2287 }
2288 };
2289 ucnv_setToUCallBack(conv, nullToSubstituter, state, nullptr, nullptr, &error);
2290
2291 auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2292 const UChar *, int32_t length,
2293 UChar32, UConverterCallbackReason reason, UErrorCode *err) {
2294 if (reason <= UCNV_IRREGULAR) {
2295 *err = U_ZERO_ERROR;
2296 const UChar replacement[] = { 0 };
2297 const UChar *stringBegin = std::begin(replacement);
2298 ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
2299 // Recover outer scope's state (which isn't const) from context:
2300 auto state = const_cast<State *>(static_cast<const State *>(context));
2301 state->invalidChars += length;
2302 }
2303 };
2304 ucnv_setFromUCallBack(conv, nullFromSubstituter, state, nullptr, nullptr, &error);
2305 } else {
2306 UErrorCode error = U_ZERO_ERROR;
2307
2308 auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2309 const char *codeUnits,int32_t length,
2310 UConverterCallbackReason reason, UErrorCode *err) {
2311 if (reason <= UCNV_IRREGULAR) {
2312 // Recover outer scope's state (which isn't const) from context:
2313 auto state = const_cast<State *>(static_cast<const State *>(context));
2314 state->invalidChars += length;
2315 }
2316 // use existing ICU callback for logic
2317 UCNV_TO_U_CALLBACK_SUBSTITUTE(nullptr, toUArgs, codeUnits, length, reason, err);
2318
2319 };
2320 ucnv_setToUCallBack(conv, qmarkToSubstituter, state, nullptr, nullptr, &error);
2321
2322 auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2323 const UChar *codeUnits, int32_t length,
2324 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
2325 if (reason <= UCNV_IRREGULAR) {
2326 // Recover outer scope's state (which isn't const) from context:
2327 auto state = const_cast<State *>(static_cast<const State *>(context));
2328 state->invalidChars += length;
2329 }
2330 // use existing ICU callback for logic
2331 UCNV_FROM_U_CALLBACK_SUBSTITUTE(nullptr, fromUArgs, codeUnits, length,
2332 codePoint, reason, err);
2333 };
2334 ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state, nullptr, nullptr, &error);
2335 }
2336 return conv;
2337 }
2338
2339 static std::string nul_terminate_impl(QLatin1StringView name)
2340 { return name.isNull() ? std::string() : std::string{name.data(), size_t(name.size())}; }
2341
2342 static std::string nul_terminate_impl(QUtf8StringView name)
2343 { return nul_terminate_impl(QLatin1StringView{QByteArrayView{name}}); }
2344
2345 static std::string nul_terminate_impl(QStringView name)
2346 {
2347 std::string result;
2348 const auto convert = [&](char *p, size_t n) {
2349 const auto sz = QLatin1::convertFromUnicode(p, name) - p;
2350 Q_ASSERT(q20::cmp_less_equal(sz, n));
2351 return sz;
2352 };
2353#ifdef __cpp_lib_string_resize_and_overwrite
2354 result.resize_and_overwrite(size_t(name.size()), convert);
2355#else
2356 result.resize(size_t(name.size()));
2357 result.resize(convert(result.data(), result.size()));
2358#endif // __cpp_lib_string_resize_and_overwrite
2359 return result;
2360 }
2361
2362 static std::string nul_terminate(QAnyStringView name)
2363 { return name.visit([](auto name) { return nul_terminate_impl(name); }); }
2364
2365 static const QStringConverter::Interface *
2366 make_icu_converter(QStringConverter::State *state, QAnyStringView name)
2367 { return make_icu_converter(state, nul_terminate(name).data()); }
2368
2369 static const QStringConverter::Interface *make_icu_converter(
2370 QStringConverter::State *state,
2371 const char *name)
2372 {
2373 QT_COM_THREAD_INIT
2374 UErrorCode status = U_ZERO_ERROR;
2375 UConverter *conv = createConverterForName(name, state);
2376 if (!conv)
2377 return nullptr;
2378
2379 const char *icuName = ucnv_getName(conv, &status);
2380 // ucnv_getStandardName returns a name which is owned by the library
2381 // we can thus store it in the state without worrying aobut its lifetime
2382 const char *persistentName = ucnv_getStandardName(icuName, "MIME", &status);
2383 if (U_FAILURE(status) || !persistentName) {
2384 status = U_ZERO_ERROR;
2385 persistentName = ucnv_getStandardName(icuName, "IANA", &status);
2386 }
2387 state->d[1] = const_cast<char *>(persistentName);
2388 state->d[0] = conv;
2389 state->flags |= QStringConverter::Flag::UsesIcu;
2390 qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
2391 state->clearFn = QStringConverterICU::clear_function;
2392 if (maxCharSize > 8 || maxCharSize < 1) {
2393 qWarning("Encountered unexpected codec \"%s\" which requires >8x space", name);
2394 return nullptr;
2395 } else {
2396 return &forLength[maxCharSize - 1];
2397 }
2398
2399 }
2400
2401};
2402#endif
2403
2404/*!
2405 \internal
2406*/
2407QStringConverter::QStringConverter(QAnyStringView name, Flags f)
2408 : iface(nullptr), state(f)
2409{
2410 auto e = encodingForName(name);
2411 if (e)
2412 iface = encodingInterfaces + int(*e);
2413#if defined(QT_USE_ICU_CODECS)
2414 else
2415 iface = QStringConverterICU::make_icu_converter(&state, name);
2416#endif
2417}
2418
2419
2420const char *QStringConverter::name() const noexcept
2421{
2422 if (!iface)
2423 return nullptr;
2424 if (state.flags & QStringConverter::Flag::UsesIcu) {
2425#if defined(QT_USE_ICU_CODECS)
2426 return static_cast<const char*>(state.d[1]);
2427#else
2428 return nullptr;
2429#endif
2430 } else {
2431 return iface->name;
2432 }
2433}
2434
2435/*!
2436 \fn bool QStringConverter::isValid() const
2437
2438 Returns true if this is a valid string converter that can be used for encoding or
2439 decoding text.
2440
2441 Default constructed string converters or converters constructed with an unsupported
2442 name are not valid.
2443*/
2444
2445/*!
2446 \fn void QStringConverter::resetState()
2447
2448 Resets the internal state of the converter, clearing potential errors or partial
2449 conversions.
2450*/
2451
2452/*!
2453 \fn bool QStringConverter::hasError() const
2454
2455 Returns true if a conversion could not correctly convert a character. This could for example
2456 get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
2457 limitations in the target encoding.
2458*/
2459
2460/*!
2461 \fn const char *QStringConverter::name() const
2462
2463 Returns the canonical name of the encoding this QStringConverter can encode or decode.
2464 Returns a nullptr if the converter is not valid.
2465 The returned name is UTF-8 encoded.
2466
2467 \sa isValid()
2468*/
2469
2470/*!
2471 Convert \a name to the corresponding \l Encoding member, if there is one.
2472
2473 If the \a name is not the name of a codec listed in the Encoding enumeration,
2474 \c{std::nullopt} is returned. Such a name may, none the less, be accepted by
2475 the QStringConverter constructor when Qt is built with ICU, if ICU provides a
2476 converter with the given name.
2477
2478 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2479 which was expected to be UTF-8-encoded.
2480*/
2481std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(QAnyStringView name) noexcept
2482{
2483 if (name.isEmpty())
2484 return std::nullopt;
2485 for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
2486 if (nameMatch(encodingInterfaces[i].name, name))
2487 return QStringConverter::Encoding(i);
2488 }
2489 if (nameMatch("latin1", name))
2490 return QStringConverter::Latin1;
2491 return std::nullopt;
2492}
2493
2494#ifndef QT_BOOTSTRAPPED
2495namespace QtPrivate {
2496// Note: Check isValid() on the QStringConverter before calling this with its
2497// state!
2498static int partiallyParsedDataCount(QStringConverter::State *state)
2499{
2500#if QT_CONFIG(icu)
2501 if (state->flags & QStringConverter::Flag::UsesIcu) {
2502 UConverter *converter = static_cast<UConverter *>(state->d[0]);
2503 if (!converter)
2504 return 0;
2505 UErrorCode err = U_ZERO_ERROR;
2506 auto leftOver = ucnv_fromUCountPending(converter, &err);
2507 // If there is an error, leftOver is -1, so no need for an additional
2508 // check.
2509 return std::max(leftOver, 0);
2510 }
2511#endif
2512 return q26::saturate_cast<int>(state->remainingChars);
2513}
2514} // namespace QtPrivate
2515
2516/*!
2517 Returns the encoding for the content of \a data if it can be determined.
2518 \a expectedFirstCharacter can be passed as an additional hint to help determine
2519 the encoding.
2520
2521 The returned optional is empty, if the encoding is unclear.
2522 */
2523std::optional<QStringConverter::Encoding>
2524QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
2525{
2526 // someone set us up the BOM?
2527 qsizetype arraySize = data.size();
2528 if (arraySize > 3) {
2529 char32_t uc = qFromUnaligned<char32_t>(data.data());
2530 if (uc == qToBigEndian(char32_t(QChar::ByteOrderMark)))
2531 return QStringConverter::Utf32BE;
2532 if (uc == qToLittleEndian(char32_t(QChar::ByteOrderMark)))
2533 return QStringConverter::Utf32LE;
2534 if (expectedFirstCharacter) {
2535 // catch also anything starting with the expected character
2536 if (qToLittleEndian(uc) == expectedFirstCharacter)
2537 return QStringConverter::Utf32LE;
2538 else if (qToBigEndian(uc) == expectedFirstCharacter)
2539 return QStringConverter::Utf32BE;
2540 }
2541 }
2542
2543 if (arraySize > 2) {
2544 if (memcmp(data.data(), utf8bom, sizeof(utf8bom)) == 0)
2545 return QStringConverter::Utf8;
2546 }
2547
2548 if (arraySize > 1) {
2549 char16_t uc = qFromUnaligned<char16_t>(data.data());
2550 if (uc == qToBigEndian(char16_t(QChar::ByteOrderMark)))
2551 return QStringConverter::Utf16BE;
2552 if (uc == qToLittleEndian(char16_t(QChar::ByteOrderMark)))
2553 return QStringConverter::Utf16LE;
2554 if (expectedFirstCharacter) {
2555 // catch also anything starting with the expected character
2556 if (qToLittleEndian(uc) == expectedFirstCharacter)
2557 return QStringConverter::Utf16LE;
2558 else if (qToBigEndian(uc) == expectedFirstCharacter)
2559 return QStringConverter::Utf16BE;
2560 }
2561 }
2562 return std::nullopt;
2563}
2564
2565static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
2566{
2567 static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
2568 static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
2569
2570 QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
2571 qsizetype pos = metaSearcher.indexIn(header);
2572 if (pos != -1) {
2573 pos = charsetSearcher.indexIn(header, pos);
2574 if (pos != -1) {
2575 pos += qstrlen("charset=");
2576 if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
2577 ++pos;
2578
2579 qsizetype pos2 = pos;
2580 // The attribute can be closed with either """, "'", ">" or "/",
2581 // none of which are valid charset characters.
2582 while (++pos2 < header.size()) {
2583 char ch = header.at(pos2);
2584 if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
2585 QByteArray name = header.mid(pos, pos2 - pos);
2586 qsizetype colon = name.indexOf(':');
2587 if (colon > 0)
2588 name = name.left(colon);
2589 name = name.simplified();
2590 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
2591 name = QByteArrayLiteral("UTF-8");
2592 if (!name.isEmpty())
2593 return name;
2594 }
2595 }
2596 }
2597 }
2598 return QByteArray();
2599}
2600
2601/*!
2602 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2603 order marks or a charset specifier in the HTML meta tag. If the optional is empty,
2604 the encoding specified is not supported by QStringConverter. If no encoding is
2605 detected, the method returns Utf8.
2606
2607 \sa QStringDecoder::decoderForHtml()
2608*/
2609std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2610{
2611 // determine charset
2612 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2613 if (encoding)
2614 // trust the initial BOM
2615 return encoding;
2616
2617 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2618 if (!encodingTag.isEmpty())
2619 return encodingForName(encodingTag);
2620
2621 return Utf8;
2622}
2623
2625{
2626#if !defined(QT_USE_ICU_CODECS)
2627 return QStringConverter::Encoding::LastEncoding;
2628#else
2629 QT_COM_THREAD_INIT
2630 /* icu contains also the names of what Qt provides
2631 except for the special Locale one (so add one for it)
2632 */
2633 return 1 + ucnv_countAvailable();
2634#endif
2635}
2636
2637/*!
2638 Returns a list of names of supported codecs. The names returned
2639 by this function can be passed to QStringEncoder's and
2640 QStringDecoder's constructor to create a en- or decoder for
2641 the given codec.
2642
2643 This function may be used to obtain a listing of additional codecs beyond
2644 the standard ones. Support for additional codecs requires Qt be compiled
2645 with support for the ICU library.
2646
2647 \note The order of codecs is an internal implementation detail
2648 and not guaranteed to be stable.
2649 */
2650QStringList QStringConverter::availableCodecs()
2651{
2652 auto availableCodec = [](qsizetype index) -> QString
2653 {
2654 #if !defined(QT_USE_ICU_CODECS)
2655 return QString::fromLatin1(encodingInterfaces[index].name);
2656 #else
2657 if (index == 0) // "Locale", not provided by icu
2658 return QString::fromLatin1(
2659 encodingInterfaces[QStringConverter::Encoding::System].name);
2660 QT_COM_THREAD_INIT
2661 // this mirrors the setup we do to set a converters name
2662 UErrorCode status = U_ZERO_ERROR;
2663 auto icuName = ucnv_getAvailableName(int32_t(index - 1));
2664 const char *standardName = ucnv_getStandardName(icuName, "MIME", &status);
2665 if (U_FAILURE(status) || !standardName) {
2666 status = U_ZERO_ERROR;
2667 standardName = ucnv_getStandardName(icuName, "IANA", &status);
2668 }
2669 if (!standardName)
2670 standardName = icuName;
2671 return QString::fromLatin1(standardName);
2672 #endif
2673 };
2674
2675 qsizetype codecCount = availableCodecCount();
2676 QStringList result;
2677 result.reserve(codecCount);
2678 for (qsizetype i = 0; i < codecCount; ++i)
2679 result.push_back(availableCodec(i));
2680 return result;
2681}
2682
2683/*!
2684 \class QStringConverter::FinalizeResultBase
2685 \internal
2686*/
2687/*!
2688 \class QStringConverter::FinalizeResultChar
2689 \inmodule QtCore
2690 \since 6.11
2691 \reentrant
2692 \brief Holds the result of calling finalize() on QStringDecoder or
2693 QStringEncoder.
2694
2695 This class is used to relay the result of the finalize() call or the reason
2696 why the call did not succeed.
2697*/
2698/*!
2699 \enum QStringConverter::FinalizeResultBase::Error
2700 \value NoError No error.
2701 \value InvalidCharacters The converter successfully finalized, but encountered
2702 invalid characters either during finalization or some time earlier.
2703 \value NotEnoughSpace finalize() did \e{not} succeed; you must grow the
2704 buffer and call finalize() again.
2705*/
2706
2707/*!
2708 \variable QStringConverter::FinalizeResultChar::error
2709 Relays errors discovered during finalization.
2710*/
2711/*!
2712 \variable QStringConverter::FinalizeResultChar::next
2713 Points to the character position \e{following} the last-written character.
2714*/
2715/*!
2716 \variable QStringConverter::FinalizeResultChar::invalidChars
2717 The number of invalid characters that were previously counted in the state
2718 as well as any that were encountered during the call to finalize().
2719*/
2720
2721/*!
2722 \typedef QStringDecoder::FinalizeResult
2723
2724 This is an alias for QStringConverter::FinalizeResultChar<char16_t>.
2725*/
2726
2727/*!
2728 \typedef QStringDecoder::FinalizeResultQChar
2729
2730 This is an alias for QStringConverter::FinalizeResultChar<QChar>.
2731*/
2732
2733/*!
2734 \fn QStringDecoder::FinalizeResultQChar QStringDecoder::finalize(QChar *out, qsizetype maxlen)
2735 \fn QStringDecoder::FinalizeResult QStringDecoder::finalize(char16_t *out, qsizetype maxlen)
2736 \fn QStringDecoder::FinalizeResult QStringDecoder::finalize()
2737
2738 Signals to the decoder that no further data will arrive.
2739
2740 May also provide data from residual content that was pending decoding.
2741 When there is no residual data to account for, the return's \c error
2742 field will be set to \l {QStringConverter::FinalizeResultBase::Error::}
2743 {NoError}.
2744
2745 If \a out is supplied and non-null, it must have space in which up to
2746 \a maxlen characters may be written. Up to this many characters of
2747 residual output are written to this space, with the end indicated by
2748 the return-value's \c next field. Typically this residual data shall
2749 consist of one replacement character per remaining unconverted input
2750 character.
2751
2752 If all residual content has been delivered via \a out, if \a out is
2753 \nullptr, or if there is no residual data, the decoder is reset on
2754 return from finalize(). Otherwise, the remaining data can be retrieved
2755 or discarded by a further call to finalize().
2756
2757 \since 6.11
2758 \sa hasError(), appendToBuffer()
2759 */
2760auto QStringDecoder::finalize(char16_t *out, qsizetype maxlen) -> FinalizeResult
2761{
2762 int count = 0;
2763 if (isValid())
2764 count = QtPrivate::partiallyParsedDataCount(&state);
2765 using Error = FinalizeResult::Error;
2766 const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
2767 if (count == 0 || !out) {
2768 resetState();
2769 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2770 }
2771 if (maxlen < count)
2772 return { {}, out, invalidChars, Error::NotEnoughSpace };
2773
2774 const char16_t replacement = (state.flags & QStringConverter::Flag::ConvertInvalidToNull)
2775 ? QChar::Null
2776 : QChar::ReplacementCharacter;
2777 out = std::fill_n(out, count, replacement);
2778 resetState();
2779 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2780}
2781
2782/*!
2783 \typedef QStringEncoder::FinalizeResult
2784
2785 This is an alias for QStringConverter::FinalizeResultChar<char>.
2786*/
2787
2788/*!
2789 \fn QStringEncoder::FinalizeResult QStringEncoder::finalize(char *out, qsizetype maxlen)
2790 \fn QStringEncoder::FinalizeResult QStringEncoder::finalize()
2791
2792 Signals to the encoder that no further data will arrive.
2793
2794 May also provide data from residual content that was pending encoding.
2795 When there is no residual data to account for, the return's \c error
2796 field will be set to \l {QStringConverter::FinalizeResultBase::Error::}
2797 {NoError}.
2798
2799 If \a out is supplied and non-null, it must have space in which up to
2800 \a maxlen characters may be written. Up to this many characters of
2801 residual output are written to this space, with the end indicated by
2802 the return-value's \c next field. Typically this residual data shall
2803 consist of one replacement character per remaining unconverted input
2804 character. When using a stateful encoding, such as ISO-2022-JP, this may
2805 also write bytes to restore, or end, the current state in the character
2806 stream.
2807
2808 If all residual content has been delivered via \a out, if \a out is
2809 \nullptr, or if there is no residual data, the encoder is reset on
2810 return from finalize(). Otherwise, the remaining data can be retrieved
2811 or discarded by a further call to finalize().
2812
2813 \since 6.11
2814 \sa hasError(), appendToBuffer()
2815 */
2816auto QStringEncoder::finalize(char *out, qsizetype maxlen) -> QStringEncoder::FinalizeResult
2817{
2818 qsizetype count = 0;
2819 if (isValid())
2820 count = QtPrivate::partiallyParsedDataCount(&state);
2821 // For ICU we may be using a stateful codec that need to restore or finalize
2822 // some state, otherwise we have nothing to do with count == 0
2823 using Error = FinalizeResult::Error;
2824 const bool usesIcu = !!(state.flags & QStringConverter::Flag::UsesIcu) && !!state.d[0];
2825 const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
2826 if (!isValid() || (!count && !usesIcu) || !out) {
2827 resetState();
2828 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2829 }
2830
2831 if ((false)) {
2832#if defined(QT_USE_ICU_CODECS)
2833 } else if (usesIcu) {
2834 Q_ASSERT(out);
2835 auto *icu_conv = static_cast<UConverter *>(state.d[0]);
2836 Q_ASSERT(icu_conv); // bool usesIcu checks that the pointer is non-null
2837 UErrorCode err = U_ZERO_ERROR;
2838
2839 UBool flush = true;
2840
2841 // If the QStringConverter was moved, the state that we used as a context is stale now.
2842 UConverterFromUCallback action;
2843 const void *context;
2844 ucnv_getFromUCallBack(icu_conv, &action, &context);
2845 if (context != &state)
2846 ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
2847 const UChar *dummyInput = u"";
2848 const char *outEnd = out + maxlen;
2849 ucnv_fromUnicode(icu_conv, &out, outEnd, &dummyInput, dummyInput, nullptr, flush, &err);
2850 if (err == U_BUFFER_OVERFLOW_ERROR)
2851 return { {}, out, invalidChars, Error::NotEnoughSpace };
2852 resetState();
2853#endif
2854 } else if (!(state.flags & QStringConverter::Flag::ConvertInvalidToNull)) {
2855 /*
2856 We don't really know (in general) how the replacement character
2857 looks like in the target encoding. So we just encode 0xfffd, which
2858 is the Unicode replacement character.
2859 Use 4 as a best-guess for the upper-bound of how many characters
2860 would potentially be produced by the leftover UTF-16 characters in
2861 the state
2862 */
2863 constexpr QChar replacementCharacter = QChar::ReplacementCharacter;
2864 constexpr char16_t repl = replacementCharacter.unicode();
2865 constexpr std::array<char16_t, 4> replacement{ repl, repl, repl, repl };
2866 const qsizetype charactersToEncode = std::min(count, qsizetype(replacement.size()));
2867 if (maxlen < requiredSpace(charactersToEncode))
2868 return { {}, out, invalidChars, Error::NotEnoughSpace };
2869 // we don't want the incomplete data in the internal buffer; we're
2870 // flushing the buffer after all
2871 resetState();
2872 out = appendToBuffer(out, QStringView(replacement.data(), charactersToEncode));
2873 } else /* outputting Null characters for each remaining unconverted input character */ {
2874 if (maxlen < count)
2875 return { {}, out, invalidChars, Error::NotEnoughSpace };
2876 out = std::fill_n(out, count, '\0');
2877 resetState();
2878 }
2879 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2880}
2881
2882/*!
2883 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2884 order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
2885 matching the encoding. If the returned decoder is not valid,
2886 the encoding specified is not supported by QStringConverter. If no encoding is
2887 detected, the method returns a decoder for Utf8.
2888
2889 \sa isValid()
2890*/
2891QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
2892{
2893 // determine charset
2894 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2895 if (encoding)
2896 // trust the initial BOM
2897 return QStringDecoder(encoding.value());
2898
2899 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2900 if (!encodingTag.isEmpty())
2901 return QStringDecoder(encodingTag);
2902
2903 return QStringDecoder(Utf8);
2904}
2905#endif // !QT_BOOTSTRAPPED
2906
2907/*!
2908 Returns the canonical name for encoding \a e or \nullptr if \a e is an
2909 invalid value.
2910
2911 \note In Qt versions prior to 6.10, 6.9.1, 6.8.4 or 6.5.9, calling this
2912 function with an invalid argument resulted in undefined behavior. Since the
2913 above-mentioned Qt versions, it returns nullptr instead.
2914*/
2915const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) noexcept
2916{
2917 auto i = size_t(e);
2918 if (Q_UNLIKELY(i >= std::size(encodingInterfaces)))
2919 return nullptr;
2920 return encodingInterfaces[i].name;
2921}
2922
2923/*!
2924 \class QStringEncoder
2925 \inmodule QtCore
2926 \brief The QStringEncoder class provides a state-based encoder for text.
2927 \reentrant
2928 \ingroup i18n
2929 \ingroup string-processing
2930
2931 A text encoder converts text from Qt's internal representation into an encoded
2932 text format using a specific encoding.
2933
2934 Converting a string from Unicode to the local encoding can be achieved
2935 using the following code:
2936
2937 \snippet code/src_corelib_text_qstringconverter.cpp 1
2938
2939 The encoder remembers any state that is required between calls, so converting
2940 data received in chunks, for example, when receiving it over a network, is just as
2941 easy, by calling the encoder whenever new data is available:
2942
2943 \snippet code/src_corelib_text_qstringconverter.cpp 3
2944
2945 The QStringEncoder object maintains state between chunks and therefore
2946 works correctly even if a UTF-16 surrogate pair is split between
2947 chunks.
2948
2949 QStringEncoder objects can't be copied because of their internal state, but
2950 can be moved.
2951
2952 \sa QStringConverter, QStringDecoder
2953*/
2954
2955/*!
2956 \fn constexpr QStringEncoder::QStringEncoder(const Interface *i)
2957 \internal
2958*/
2959
2960/*!
2961 \fn constexpr QStringEncoder::QStringEncoder()
2962
2963 Default constructs an encoder. The default encoder is not valid,
2964 and can't be used for converting text.
2965*/
2966
2967/*!
2968 \fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
2969
2970 Creates an encoder object using \a encoding and \a flags.
2971*/
2972
2973/*!
2974 \fn QStringEncoder::QStringEncoder(QAnyStringView name, Flags flags = Flag::Default)
2975
2976 Creates an encoder object using \a name and \a flags.
2977 If \a name is not the name of a known encoding an invalid converter will get created.
2978
2979 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2980 which was expected to be UTF-8-encoded.
2981
2982 \sa isValid()
2983*/
2984
2985/*!
2986 \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in)
2987 \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in)
2988 \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in)
2989 \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in)
2990
2991 Converts \a in and returns a struct that is implicitly convertible to QByteArray.
2992
2993 \snippet code/src_corelib_text_qstringconverter.cpp 5
2994*/
2995
2996/*!
2997 \fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
2998
2999 Returns the maximum amount of characters required to be able to process
3000 \a inputLength decoded data.
3001
3002 \sa appendToBuffer()
3003*/
3004
3005/*!
3006 \fn char *QStringEncoder::appendToBuffer(char *out, QStringView in)
3007
3008 Encodes \a in and writes the encoded result into the buffer
3009 starting at \a out. Returns a pointer to the end of the data written.
3010
3011 \note \a out must be large enough to be able to hold all the encoded data. Use
3012 requiredSpace() to determine the maximum size requirement to be able to encode
3013 \a in. This function may write to any bytes between \a out and \c{out +
3014 requiredSpace()}, including those past the returned end pointer.
3015
3016 \sa requiredSpace()
3017*/
3018
3019/*!
3020 \class QStringDecoder
3021 \inmodule QtCore
3022 \brief The QStringDecoder class provides a state-based decoder for text.
3023 \reentrant
3024 \ingroup i18n
3025 \ingroup string-processing
3026
3027 A text decoder converts text from an encoded text format that uses a specific encoding
3028 into Qt's internal representation.
3029
3030 Converting encoded data into a QString can be achieved
3031 using the following code:
3032
3033 \snippet code/src_corelib_text_qstringconverter.cpp 0
3034
3035 The decoder remembers any state that is required between calls, so converting
3036 data received in chunks, for example, when receiving it over a network, is just as
3037 easy, by calling the decoder whenever new data is available:
3038
3039 \snippet code/src_corelib_text_qstringconverter.cpp 2
3040
3041 The QStringDecoder object maintains state between chunks and therefore
3042 works correctly even if chunks are split in the middle of a multi-byte character
3043 sequence.
3044
3045 QStringDecoder objects can't be copied because of their internal state, but
3046 can be moved.
3047
3048 \sa QStringConverter, QStringEncoder
3049*/
3050
3051/*!
3052 \fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
3053 \internal
3054*/
3055
3056/*!
3057 \fn constexpr QStringDecoder::QStringDecoder()
3058
3059 Default constructs a decoder. The default decoder is not valid,
3060 and can't be used for converting text.
3061*/
3062
3063/*!
3064 \fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
3065
3066 Creates a decoder object using \a encoding and \a flags.
3067*/
3068
3069/*!
3070 \fn QStringDecoder::QStringDecoder(QAnyStringView name, Flags flags = Flag::Default)
3071
3072 Creates a decoder object using \a name and \a flags.
3073 If \a name is not the name of a known encoding an invalid converter will get created.
3074
3075 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
3076 which was expected to be UTF-8-encoded.
3077
3078 \sa isValid()
3079*/
3080
3081/*!
3082 \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba)
3083 \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba)
3084 \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba)
3085 \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba)
3086
3087 Converts \a ba and returns a struct that is implicitly convertible to QString.
3088
3089
3090 \snippet code/src_corelib_text_qstringconverter.cpp 4
3091*/
3092
3093/*!
3094 \fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
3095
3096 Returns the maximum amount of UTF-16 code units required to be able to process
3097 \a inputLength encoded data.
3098
3099 \sa appendToBuffer
3100*/
3101
3102/*!
3103 \fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
3104
3105 Decodes the sequence of bytes viewed by \a in and writes the decoded result into
3106 the buffer starting at \a out. Returns a pointer to the end of data written.
3107
3108 \a out needs to be large enough to be able to hold all the decoded data. Use
3109 \l{requiredSpace} to determine the maximum size requirements to decode an encoded
3110 data buffer of \c in.size() bytes. This function may write to any bytes
3111 between \a out and \c{out + requiredSpace()}, including those past the
3112 returned end pointer.
3113
3114 \sa requiredSpace
3115*/
3116
3117/*!
3118 \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
3119 \since 6.6
3120 \overload
3121*/
3122
3123QT_END_NAMESPACE
\inmodule QtCore
static int partiallyParsedDataCount(QStringConverter::State *state)
#define __has_include(x)
static bool nameMatch(const char *a, QAnyStringView b)
static const uchar utf8bom[]
static QChar * fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
@ HeaderDone
static QChar * fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
static QChar * fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf8Len(qsizetype l)
static QChar * fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QChar * fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toLatin1Len(qsizetype l)
static bool nameMatch_impl_impl(const char *a, const Char *b, const Char *b_end)
static bool nameMatch_impl(const char *a, QLatin1StringView b)
static QChar * fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
static char * toUtf32(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf8Len(qsizetype l)
static char * toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
static qsizetype toUtf16Len(qsizetype l)
static qsizetype fromLatin1Len(qsizetype l)
static char * toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf32Len(qsizetype l)
static qsizetype availableCodecCount()
static QChar * fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf32Len(qsizetype l)
static char * toUtf16(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf16Len(qsizetype l)
static char * toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
static void appendUtf16(const NoOutput &, char16_t)
static void appendUcs4(const NoOutput &, char32_t)