Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qstringconverter.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// Copyright (C) 2020 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4// Qt-Security score:critical reason:data-parser
5
6#include <qstringconverter.h>
7#include <private/qstringconverter_p.h>
8#include "qendian.h"
9
10#include "private/qsimd_p.h"
11#include "private/qstringiterator_p.h"
12#include "private/qtools_p.h"
15#include <QtCore/qbytearraylist.h>
16
17#if QT_CONFIG(icu)
18
19#include <unicode/ucnv.h>
20#include <unicode/ucnv_cb.h>
21#include <unicode/ucnv_err.h>
22#include <unicode/ustring.h>
23#define QT_USE_ICU_CODECS
24#define QT_COM_THREAD_INIT
25
26#elif QT_CONFIG(winsdkicu)
27
28#include <icu.h>
29#include <private/qfunctions_win_p.h>
30#define QT_USE_ICU_CODECS
31#define QT_COM_THREAD_INIT qt_win_ensureComInitializedOnThisThread();
32
33#endif // QT_CONFIG(icu) || QT_CONFIG(winsdkicu)
34
35#ifdef Q_OS_WIN
36#include <qt_windows.h>
37#ifndef QT_BOOTSTRAPPED
38#include <QtCore/qvarlengtharray.h>
39#include <QtCore/private/wcharhelpers_win_p.h>
40
41#include <QtCore/q20iterator.h>
42#endif // !QT_BOOTSTRAPPED
43#endif // Q_OS_WIN
44
45#include <array>
46#if __has_include(<bit>) && __cplusplus > 201703L
47#include <bit>
48#endif
49#include <string>
50#include <QtCore/q20utility.h>
51#ifndef QT_BOOTSTRAPPED
52#include <QtCore/q26numeric.h>
53#endif // !QT_BOOTSTRAPPED
54
55QT_BEGIN_NAMESPACE
56
57using namespace QtMiscUtils;
58
// Both converter classes must be nothrow-movable; fail the build if that ever regresses.
59static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
60static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
61static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
62static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
63
// NOTE(review): not referenced in the part of the file visible here; presumably indices into
// QStringConverter::State::state_data used by other codecs - confirm against their users.
64enum { Endian = 0, Data = 1 };
65
// The three bytes of the UTF-8 byte order mark (U+FEFF encoded as UTF-8).
66static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf };
67
68#if defined(__SSE2__) || defined(__ARM_NEON__)
69Q_ALWAYS_INLINE static uint qBitScanReverse(unsigned v) noexcept
70{
71#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
72 return std::bit_width(v) - 1;
73#else
74 uint result = qCountLeadingZeroBits(v);
75 // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31
76 // and the lsb index is 0. The result for _bit_scan_reverse is expected to be the index when
77 // counting up: msb index is 0 (because it starts there), and the lsb index is 31.
78 result ^= sizeof(unsigned) * 8 - 1;
79 return result;
80#endif
81}
82#endif
83
84#if defined(__SSE2__)
// Narrows the leading US-ASCII run of the UTF-16 range [src, end) into bytes at dst using
// SSE2 and, when enabled in Cpu, AVX2 / AVX512VL instructions.
//
// dst, src and nextAscii are in/out references:
//  - returns true only when src has reached end, i.e. the whole input was consumed here;
//  - returns false when a code unit outside U+0001..U+007F was seen (NUL is reported as
//    non-ASCII too, see process16Chars): src and dst then point at that code unit and its
//    output slot, and nextAscii points just past the last such code unit of the chunk that
//    was examined;
//  - when fewer code units remain than the smallest chunk handled below, nothing is touched
//    and the result is simply src == end.
// Chunks are stored to dst before they are checked, so bytes beyond the returned dst may
// already have been overwritten.
85template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE static bool
86simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
87{
 // size of the remaining input in bytes (sizeof(char16_t) bytes per code unit)
88 size_t sizeBytes = reinterpret_cast<const char *>(end) - reinterpret_cast<const char *>(src);
89
90 // do sixteen characters at a time
91 auto process16Chars = [](uchar *dst, const char16_t *src) {
92 __m128i data1 = _mm_loadu_si128((const __m128i*)src);
93 __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src);
94
95 // check if everything is ASCII
96 // the highest ASCII value is U+007F
97 // Do the packing directly:
98 // The PACKUSWB instruction packs a signed 16-bit integer to an unsigned 8-bit
99 // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
100 // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
101 // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
102 // "non-ASCII", but it's an acceptable compromise.
103 __m128i packed = _mm_packus_epi16(data1, data2);
104 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
105
106 // store, even if there are non-ASCII characters here
107 _mm_storeu_si128((__m128i*)dst, packed);
108
109 // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
110 ushort n = ~_mm_movemask_epi8(nonAscii);
111 return n;
112 };
 // n is the non-ASCII bit mask of the chunk that starts at src + offset. If any bit is set,
 // moves src/dst to the first flagged character, sets nextAscii past the last flagged one
 // and returns false; otherwise reports whether the input is exhausted.
113 auto maybeFoundNonAscii = [&](auto n, qptrdiff offset = 0) {
114 if (n) {
115 // find the next probable ASCII character
116 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
117 // characters still coming
118 src += offset;
119 dst += offset;
120 nextAscii = src + qBitScanReverse(n) + 1;
121
122 n = qCountTrailingZeroBits(n);
123 dst += n;
124 src += n;
125 return false;
126 }
127 return src == end;
128 };
 // jumps src/dst to the end of the input/output; used before re-processing the final
 // (possibly overlapping) chunk with negative offsets
129 auto adjustToEnd = [&] {
130 dst += sizeBytes / sizeof(char16_t);
131 src = end;
132 };
133
134 if constexpr (Cpu & CpuFeatureAVX2) {
135 // The 256-bit VPACKUSWB[1] instruction interleaves the two input
136 // operands, so we need an extra permutation to get them back in-order.
137 // VPERMW takes 2 cycles to run while VPERMQ takes only 1.
138 // [1] https://www.felixcloutier.com/x86/PACKUSWB.html
139 constexpr size_t Step = 32;
 // note the parameter order: (src, dst), the reverse of process16Chars
140 auto process32Chars = [](const char16_t *src, uchar *dst) {
141 __m256i data1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
142 __m256i data2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src) + 1);
143 __m256i packed = _mm256_packus_epi16(data1, data2); // will be [A, B, A, B]
144 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
145 __m256i nonAscii = _mm256_cmpgt_epi8(permuted, _mm256_setzero_si256());
146
147 // store, even if there are non-ASCII characters here
148 _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), permuted);
149
150 return ~_mm256_movemask_epi8(nonAscii);
151 };
152
153 if constexpr (Cpu & CpuFeatureAVX512VL) {
154 // with AVX512/AVX10, we always process everything
155 if (sizeBytes <= Step * sizeof(char16_t)) {
 // one mask bit per remaining character; the masked loads and stores below
 // are limited to the characters selected by it
156 uint mask = _bzhi_u32(-1, uint(sizeBytes / 2));
157 __m256i data1 = _mm256_maskz_loadu_epi16(mask, src);
158 __m256i data2 = _mm256_maskz_loadu_epi16(mask >> 16, src + Step / 2);
159 __m256i packed = _mm256_packus_epi16(data1, data2);
160 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
161 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, permuted, _mm256_setzero_si256());
162
163 // store, even if there are non-ASCII characters here
164 _mm256_mask_storeu_epi8(dst, mask, permuted);
165 if (nonAscii)
166 return maybeFoundNonAscii(nonAscii);
167 adjustToEnd();
168 return true;
169 }
170 }
171
172 if (sizeBytes >= Step * sizeof(char16_t)) {
173 // do 32 characters at a time
174 qptrdiff offset = 0;
175 for ( ; (offset + Step) * sizeof(char16_t) < sizeBytes; offset += Step) {
176 if (uint n = process32Chars(src + offset, dst + offset))
177 return maybeFoundNonAscii(n, offset);
178 }
179
180 // do 32 characters again, possibly overlapping with the loop above
181 adjustToEnd();
182 uint n = process32Chars(src - Step, dst - Step);
183 return maybeFoundNonAscii(n, -int(Step));
184 }
185 }
186
187 constexpr size_t Step = 16;
188 if (sizeBytes >= Step * sizeof(char16_t)) {
189
190 qptrdiff offset = 0;
191 for ( ; (offset + Step) * sizeof(char16_t) < sizeBytes; offset += Step) {
192 ushort n = process16Chars(dst + offset, src + offset);
193 if (n)
194 return maybeFoundNonAscii(n, offset);
195 if (Cpu & CpuFeatureAVX2)
196 break; // we can only ever loop once because of the code above
197 }
198
199 // do sixteen characters again, possibly overlapping with the loop above
200 adjustToEnd();
201 ushort n = process16Chars(dst - Step, src - Step);
202 return maybeFoundNonAscii(n, -int(Step));
203 }
204
205# if !defined(__OPTIMIZE_SIZE__)
206 if (sizeBytes >= 8 * sizeof(char16_t)) {
207 // do eight characters at a time
 // data covers the first eight characters, data2 the last eight (they may overlap)
208 __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
209 __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(end - 8));
210 __m128i packed = _mm_packus_epi16(data, data);
211 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
212
213 // store even non-ASCII
214 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
215
 // the conversion to uchar keeps only the eight bits of the eight characters processed
216 uchar n = ~_mm_movemask_epi8(nonAscii);
217 if (n)
218 return maybeFoundNonAscii(n);
219
220 adjustToEnd();
221 packed = _mm_packus_epi16(data2, data2);
222 nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
223 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst - 8), packed);
224 n = ~_mm_movemask_epi8(nonAscii);
225 return maybeFoundNonAscii(n, -8);
226 } else if (sizeBytes >= 4 * sizeof(char16_t)) {
227 // do four characters at a time
 // data1 covers the first four characters, data2 the last four (they may overlap)
228 __m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
229 __m128i data2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(end - 4));
230 __m128i packed = _mm_packus_epi16(data1, data1);
231 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
232
233 // store even non-ASCII
234 qToUnaligned(_mm_cvtsi128_si32(packed), dst);
235
 // XOR with 0xf inverts only the four bits that belong to the four characters loaded
236 uchar n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
237 if (n)
238 return maybeFoundNonAscii(n);
239
240 adjustToEnd();
241 packed = _mm_packus_epi16(data2, data2);
242 nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
243 qToUnaligned(_mm_cvtsi128_si32(packed), dst - 4);
244 n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
245 return maybeFoundNonAscii(n, -4);
246 }
247#endif
248
 // too few characters left for any SIMD chunk: leave them to the caller
249 return src == end;
250}
251
// Widens the leading US-ASCII run of the byte range [src, end) into UTF-16 code units at
// dst using SSE2 and, when enabled in Cpu, AVX2 / AVX512VL instructions.
//
// dst, src and nextAscii are in/out references. The function returns true only when src
// has reached end. When a byte is flagged as non-ASCII (high bit set; the AVX512VL path
// additionally flags NUL because it compares "<= 0"), src and dst are moved to that byte and
// its output slot, nextAscii is set just past the last flagged byte of the examined chunk
// and the result is false. When fewer bytes remain than the smallest chunk handled below,
// nothing is touched and the result is simply src == end.
// Chunks are stored to dst before they are checked, so code units beyond the returned dst
// may already have been overwritten.
252template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE static bool
253simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
254{
255 // do sixteen characters at a time
256 auto process16Chars = [](char16_t *dst, const uchar *src) {
257 __m128i data = _mm_loadu_si128((const __m128i*)src);
258
259 // check if everything is ASCII
260 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
261 uint n = _mm_movemask_epi8(data);
262
263 // store everything, even mojibake
264 _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
265 _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
266 return ushort(n);
267 };
 // n is the non-ASCII bit mask of the chunk that starts at src + offset. If any bit is set,
 // moves src/dst to the first flagged byte and sets nextAscii past the last flagged one.
 // Always returns whether src ended up at end.
268 auto maybeFoundNonAscii = [&](uint n, qptrdiff offset = 0) {
269 // find the next probable ASCII character
270 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
271 // characters still coming
272 if (n) {
273 uint c = qCountTrailingZeroBits(n);
274 src += offset;
275 dst += offset;
276 n = qBitScanReverse(n);
277 nextAscii = src + n + 1;
278 src += c;
279 dst += c;
280 }
281 return src == end;
282 };
 // jumps src/dst to the end of the input/output; used before re-processing the final
 // (possibly overlapping) chunk with negative offsets
283 auto adjustToEnd = [&] {
284 dst += end - src;
285 src = end;
286 };
287
288 if constexpr (Cpu & CpuFeatureAVX2) {
289 constexpr qsizetype Step = 32;
290 auto process32Chars = [](char16_t *dst, const uchar *src) {
291 __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
292 __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src) + 1);
293
294 // the processor can execute this VPOR (dispatches 3/cycle) faster
295 // than waiting for the VPMOVMSKB (1/cycle) of both data to check
296 // their masks
297 __m128i ored = _mm_or_si128(data1, data2);
298 bool any = _mm_movemask_epi8(ored);
299
300 // store everything, even mojibake
301 __m256i extended1 = _mm256_cvtepu8_epi16(data1);
302 __m256i extended2 = _mm256_cvtepu8_epi16(data2);
303 _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), extended1);
304 _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, extended2);
305
306 uint n1 = _mm_movemask_epi8(data1);
307 uint n2 = _mm_movemask_epi8(data2);
 // Result wrapper: converts to bool for the quick "anything non-ASCII?" test and
 // to uint for the combined 32-bit mask handed to maybeFoundNonAscii().
308 struct R {
309 uint n1, n2;
310 bool any;
311 operator bool() const { return any; }
312 operator uint() const { return n1|(n2 << 16); }
313 };
314 return R{ n1, n2, any };
315 };
316
317 if constexpr (Cpu & CpuFeatureAVX512VL) {
318 // with AVX512/AVX10, we always process everything
319 if (end - src <= Step) {
 // one mask bit per remaining byte; the masked load and stores below are
 // limited to the bytes selected by it
320 __mmask32 mask = _bzhi_u32(-1, uint(end - src));
321 __m256i data = _mm256_maskz_loadu_epi8(mask, src);
322 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, data, _mm256_setzero_si256());
323
324 // store everything, even mojibake
325 __m256i extended1 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data));
326 __m256i extended2 = _mm256_cvtepu8_epi16(_mm256_extracti64x2_epi64(data, 1));
327 _mm256_mask_storeu_epi16(dst, mask, extended1);
328 _mm256_mask_storeu_epi16(dst + Step/2, mask >> 16, extended2);
329 if (nonAscii)
330 return maybeFoundNonAscii(nonAscii);
331 adjustToEnd();
332 return true;
333 }
334 }
335
336 if (end - src >= Step) {
337 // do 32 characters at a time
338 qptrdiff offset = 0;
339 for ( ; offset + Step < end - src; offset += Step) {
340 auto r = process32Chars(dst + offset, src + offset);
341 if (r)
342 return maybeFoundNonAscii(r, offset);
343 }
344
345 // do 32 characters again, possibly overlapping with the loop above
346 adjustToEnd();
347 auto r = process32Chars(dst - Step, src - Step);
348 return maybeFoundNonAscii(r, -Step);
349 }
350 }
351
352 constexpr qsizetype Step = 16;
353 if (end - src >= Step) {
354 qptrdiff offset = 0;
355 for ( ; offset + Step < end - src; offset += Step) {
356 ushort n = process16Chars(dst + offset, src + offset);
357 if (n)
358 return maybeFoundNonAscii(n, offset);
359 if (Cpu & CpuFeatureAVX2)
360 break; // we can only ever loop once because of the code above
361 }
362
363 // do one chunk again, possibly overlapping with the loop above
364 adjustToEnd();
365 return maybeFoundNonAscii(process16Chars(dst - Step, src - Step), -Step);
366 }
367
368# if !defined(__OPTIMIZE_SIZE__)
369 if (end - src >= 8) {
 // data covers the first eight bytes, data2 the last eight (they may overlap)
370 __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
371 __m128i data2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(end - 8));
372 uint n = _mm_movemask_epi8(data) & 0xff;
373 // store everything, even mojibake
374 _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
375 if (n)
376 return maybeFoundNonAscii(n);
377
378 // do one chunk again, possibly overlapping the above
379 adjustToEnd();
380 n = _mm_movemask_epi8(data2) & 0xff;
381 data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
382 _mm_storeu_si128(reinterpret_cast<__m128i *>(dst - 8), data2);
383 return maybeFoundNonAscii(n, -8);
384 }
385 if (end - src >= 4) {
 // data covers the first four bytes, data2 the last four (they may overlap)
386 __m128i data = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src));
387 __m128i data2 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(end - 4));
388 uchar n = uchar(_mm_movemask_epi8(data) & 0xf);
389 // store everything, even mojibake
390 data = _mm_unpacklo_epi8(data, _mm_setzero_si128());
391 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), data);
392 if (n)
393 return maybeFoundNonAscii(n);
394
395 // do one chunk again, possibly overlapping the above
396 adjustToEnd();
397 n = uchar(_mm_movemask_epi8(data2) & 0xf);
398 data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
399 _mm_storel_epi64(reinterpret_cast<__m128i *>(dst - 4), data2);
400 return maybeFoundNonAscii(n, -4);
401 }
402#endif
403
 // too few bytes left for any SIMD chunk: leave them to the caller
404 return src == end;
405}
406
407static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
408{
409#ifdef __AVX2__
410 // do 32 characters at a time
411 // (this is similar to simdTestMask in qstring.cpp)
412 const __m256i mask = _mm256_set1_epi8(char(0x80));
413 for ( ; end - src >= 32; src += 32) {
414 __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
415 if (_mm256_testz_si256(mask, data))
416 continue;
417
418 uint n = _mm256_movemask_epi8(data);
419 Q_ASSERT(n);
420
421 // find the next probable ASCII character
422 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
423 // characters still coming
424 nextAscii = src + qBitScanReverse(n) + 1;
425
426 // return the non-ASCII character
427 return src + qCountTrailingZeroBits(n);
428 }
429#endif
430
431 // do sixteen characters at a time
432 for ( ; end - src >= 16; src += 16) {
433 __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
434
435 // check if everything is ASCII
436 // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
437 uint n = _mm_movemask_epi8(data);
438 if (!n)
439 continue;
440
441 // find the next probable ASCII character
442 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
443 // characters still coming
444 nextAscii = src + qBitScanReverse(n) + 1;
445
446 // return the non-ASCII character
447 return src + qCountTrailingZeroBits(n);
448 }
449
450 // do four characters at a time
451 for ( ; end - src >= 4; src += 4) {
452 quint32 data = qFromUnaligned<quint32>(src);
453 data &= 0x80808080U;
454 if (!data)
455 continue;
456
457 // We don't try to guess which of the three bytes is ASCII and which
458 // one isn't. The chance that at least two of them are non-ASCII is
459 // better than 75%.
460 nextAscii = src;
461 return src;
462 }
463 nextAscii = end;
464 return src;
465}
466
467// Compare only the US-ASCII beginning of [src8, end8) and [src16, end16)
468// and advance src8 and src16 to the first character that could not be compared
// (either because the two inputs differ there, because the 8-bit input is not US-ASCII
// there, or because too few characters remain for the chunk sizes used below).
// Both pointers are always advanced by the same number of characters.
469static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const char16_t *&src16, const char16_t *end16)
470{
 // Number of bits to shift the trailing-zero count of `mask` by to turn it into a character
 // index: masks taken from 16-bit lanes carry two bits per character (shift by 1), masks
 // taken from the raw 8-bit data carry one bit per character (shift by 0).
471 int bitSpacing = 1;
472 qptrdiff len = qMin(end8 - src8, end16 - src16);
473 qptrdiff offset = 0;
 // non-zero once a mismatch or a non-ASCII byte has been found
474 uint mask = 0;
475
476 // do sixteen characters at a time
477 for ( ; offset + 16 < len; offset += 16) {
478 __m128i data8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src8 + offset));
479#ifdef __AVX2__
480 // AVX2 version, use 256-bit registers and VPMOVXZBW
481 __m256i data16 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src16 + offset));
482
483 // expand US-ASCII as if it were Latin1 and confirm it's US-ASCII
484 __m256i datax8 = _mm256_cvtepu8_epi16(data8);
485 mask = _mm256_movemask_epi8(datax8);
486 if (mask)
487 break;
488
489 // compare Latin1 to UTF-16
490 __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
491 mask = ~_mm256_movemask_epi8(latin1cmp);
492 if (mask)
493 break;
494#else
495 // non-AVX2 code
496 __m128i datalo16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
497 __m128i datahi16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset) + 1);
498
499 // expand US-ASCII as if it were Latin1, we'll confirm later
500 __m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
501 __m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());
502
503 // compare Latin1 to UTF-16
504 __m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
505 __m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
 // combine both halves into one 32-bit mask (two bits per character), then invert it
 // so that set bits mark mismatches
506 mask = _mm_movemask_epi8(latin1cmphi) << 16;
507 mask |= ushort(_mm_movemask_epi8(latin1cmplo));
508 mask = ~mask;
509 if (mask)
510 break;
511
512 // confirm it was US-ASCII
513 mask = _mm_movemask_epi8(data8);
514 if (mask) {
 // this mask has one bit per character, not two
515 bitSpacing = 0;
516 break;
517 }
518#endif
519 }
520
521 // helper for comparing 4 or 8 characters
 // On success (mask stays 0) it advances offset by n; otherwise it leaves offset alone
 // and stores the combined mismatch / non-ASCII bits (two per character) in mask.
522 auto cmp_lt_16 = [&mask, &offset](int n, __m128i data8, __m128i data16) {
523 // n = 4 -> sizemask = 0xff
524 // n = 8 -> sizemask = 0xffff
525 unsigned sizemask = (1U << (2 * n)) - 1;
526
527 // expand as if Latin1
528 data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
529
530 // compare and confirm it's US-ASCII
531 __m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
532 mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
533 mask |= _mm_movemask_epi8(data8);
534 if (mask == 0)
535 offset += n;
536 };
537
538 // do eight characters at a time
539 if (mask == 0 && offset + 8 < len) {
540 __m128i data8 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src8 + offset));
541 __m128i data16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src16 + offset));
542 cmp_lt_16(8, data8, data16);
543 }
544
545 // do four characters
546 if (mask == 0 && offset + 4 < len) {
547 __m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
548 __m128i data16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src16 + offset));
549 cmp_lt_16(4, data8, data16);
550 }
551
552 // correct the source pointers to point to the first character we couldn't deal with
553 if (mask)
554 offset += qCountTrailingZeroBits(mask) >> bitSpacing;
555 src8 += offset;
556 src16 += offset;
557}
558#elif defined(__ARM_NEON__)
559static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
560{
561 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
562 uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 );
563 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
564
565 // do sixteen characters at a time
566 for ( ; end - src >= 16; src += 16, dst += 16) {
567 // load 2 lanes (or: "load interleaved")
568 uint16x8x2_t in = vld2q_u16(reinterpret_cast<const uint16_t *>(src));
569
570 // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc),
571 // add those together into a scalar, and merge the scalars.
572 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
573 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
574
575 // merge the two lanes by shifting the values of the second by 8 and inserting them
576 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
577
578 // store, even if there are non-ASCII characters here
579 vst1q_u8(dst, vreinterpretq_u8_u16(out));
580
581 if (nonAscii) {
582 // find the next probable ASCII character
583 // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
584 // characters still coming
585 nextAscii = src + qBitScanReverse(nonAscii) + 1;
586
587 nonAscii = qCountTrailingZeroBits(nonAscii);
588 dst += nonAscii;
589 src += nonAscii;
590 return false;
591 }
592 }
593 return src == end;
594}
595
596static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
597{
598 // do eight characters at a time
599 uint8x8_t msb_mask = vdup_n_u8(0x80);
600 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
601 for ( ; end - src >= 8; src += 8, dst += 8) {
602 uint8x8_t c = vld1_u8(src);
603 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
604 if (!n) {
605 // store
606 vst1q_u16(reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
607 continue;
608 }
609
610 // copy the front part that is still ASCII
611 while (!(n & 1)) {
612 *dst++ = *src++;
613 n >>= 1;
614 }
615
616 // find the next probable ASCII character
617 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
618 // characters still coming
619 n = qBitScanReverse(n);
620 nextAscii = src + n + 1;
621 return false;
622
623 }
624 return src == end;
625}
626
// NEON variant of the non-ASCII search. It is currently disabled: the function returns
// src unchanged with nextAscii set to end, exactly like the generic fallback, and
// everything after that first return statement is unreachable. The disabled code is kept
// for when it has been verified.
627static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
628{
629 // The SIMD code below is untested, so just force an early return until
630 // we've had the time to verify it works.
631 nextAscii = end;
632 return src;
633
 // --- unreachable from here on ---
634 // do eight characters at a time
635 uint8x8_t msb_mask = vdup_n_u8(0x80);
636 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
637 for ( ; end - src >= 8; src += 8) {
638 uint8x8_t c = vld1_u8(src);
 // one bit per byte that is >= 0x80, summed into a scalar mask
639 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
640 if (!n)
641 continue;
642
643 // find the next probable ASCII character
644 // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
645 // characters still coming
646 nextAscii = src + qBitScanReverse(n) + 1;
647
648 // return the non-ASCII character
649 return src + qCountTrailingZeroBits(n);
650 }
651 nextAscii = end;
652 return src;
653}
654
// NEON build: no SIMD comparison is implemented. The function compares nothing and leaves
// both source pointers where they are.
655static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
656{
657}
658#else
// Generic fallback (neither __SSE2__ nor __ARM_NEON__ is defined): nothing is encoded.
// All arguments are taken by value and ignored, so the caller's pointers are untouched and
// the constant false result makes the caller handle the input itself.
659static inline bool simdEncodeAscii(uchar *, const char16_t *, const char16_t *, const char16_t *)
660{
661 return false;
662}
663
// Generic fallback (neither __SSE2__ nor __ARM_NEON__ is defined): nothing is decoded.
// All arguments are taken by value and ignored, so the caller's pointers are untouched and
// the constant false result makes the caller handle the input itself.
664static inline bool simdDecodeAscii(char16_t *, const uchar *, const uchar *, const uchar *)
665{
666 return false;
667}
668
// Generic fallback (neither __SSE2__ nor __ARM_NEON__ is defined): no bytes are examined.
// Returns src unchanged and sets nextAscii to end.
669static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, const uchar *&nextAscii)
670{
671 nextAscii = end;
672 return src;
673}
674
// Generic fallback (neither __SSE2__ nor __ARM_NEON__ is defined): compares nothing and
// leaves both source pointers where they are.
675static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t *&, const char16_t *)
676{
677}
678#endif
679
// Bit flag stored in QStringConverter::State::internalState by the UTF-8 converters below
// once the byte order mark has been dealt with (written when encoding, checked when decoding).
680enum { HeaderDone = 1 };
681
682template <typename OnErrorLambda> Q_ALWAYS_INLINE
683char *QUtf8::convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept
684{
685 qsizetype len = in.size();
686
687 uchar *dst = reinterpret_cast<uchar *>(out);
688 const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
689 const char16_t *const end = src + len;
690
691 while (src != end) {
692 const char16_t *nextAscii = end;
693 if (simdEncodeAscii(dst, nextAscii, src, end))
694 break;
695
696 do {
697 char16_t u = *src++;
698 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
699 if (Q_UNLIKELY(res < 0))
700 onError(dst, u, res);
701 } while (src < nextAscii);
702 }
703
704 return reinterpret_cast<char *>(dst);
705}
706
707char *QUtf8::convertFromUnicode(char *dst, QStringView in) noexcept
708{
709 return convertFromUnicode(dst, in, [](auto *dst, ...) {
710 // encoding error - append '?'
711 *dst++ = '?';
712 });
713}
714
715QByteArray QUtf8::convertFromUnicode(QStringView in)
716{
717 qsizetype len = in.size();
718
719 // create a QByteArray with the worst case scenario size
720 QByteArray result(len * 3, Qt::Uninitialized);
721 char *dst = const_cast<char *>(result.constData());
722 dst = convertFromUnicode(dst, in);
723 result.truncate(dst - result.constData());
724 return result;
725}
726
727QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverter::State *state)
728{
729 QByteArray ba(3*in.size() +3, Qt::Uninitialized);
730 char *end = convertFromUnicode(ba.data(), in, state);
731 ba.truncate(end - ba.data());
732 return ba;
733}
734
735char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state)
736{
737 Q_ASSERT(state);
738 qsizetype len = in.size();
739 if (!len)
740 return out;
741
742 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
743 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
744 *cursor++ = 0;
745 } else {
746 // QChar::replacement encoded in utf8
747 *cursor++ = 0xef;
748 *cursor++ = 0xbf;
749 *cursor++ = 0xbd;
750 }
751 return cursor;
752 };
753
754 uchar *cursor = reinterpret_cast<uchar *>(out);
755 const char16_t *src = in.utf16();
756 const char16_t *const end = src + len;
757
758 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
759 if (state->remainingChars) {
760 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
761 if (res < 0)
762 cursor = appendReplacementChar(cursor);
763 state->state_data[0] = 0;
764 state->remainingChars = 0;
765 } else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
766 // append UTF-8 BOM
767 *cursor++ = utf8bom[0];
768 *cursor++ = utf8bom[1];
769 *cursor++ = utf8bom[2];
770 state->internalState |= HeaderDone;
771 }
772 }
773
774 out = reinterpret_cast<char *>(cursor);
775 return convertFromUnicode(out, { src, end }, [&](uchar *&cursor, char16_t uc, int res) {
776 if (res == QUtf8BaseTraits::Error) {
777 // encoding error
778 ++state->invalidChars;
779 cursor = appendReplacementChar(cursor);
780 } else if (res == QUtf8BaseTraits::EndOfString) {
781 if (state->flags & QStringConverter::Flag::Stateless) {
782 ++state->invalidChars;
783 cursor = appendReplacementChar(cursor);
784 } else {
785 state->remainingChars = 1;
786 state->state_data[0] = uc;
787 }
788 }
789 });
790}
791
792char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
793{
794 // ### SIMD-optimize:
795 for (uchar ch : in) {
796 if (ch < 128) {
797 *out++ = ch;
798 } else {
799 // as per https://en.wikipedia.org/wiki/UTF-8#Encoding, 2nd row
800 *out++ = 0b110'0'0000u | (ch >> 6);
801 *out++ = 0b10'00'0000u | (ch & 0b0011'1111);
802 }
803 }
804 return out;
805}
806
807QString QUtf8::convertToUnicode(QByteArrayView in)
808{
809 // UTF-8 to UTF-16 always needs the exact same number of words or less:
810 // UTF-8 UTF-16
811 // 1 byte 1 word
812 // 2 bytes 1 word
813 // 3 bytes 1 word
814 // 4 bytes 2 words (one surrogate pair)
815 // That is, we'll use the full buffer if the input is US-ASCII (1-byte UTF-8),
816 // half the buffer for U+0080-U+07FF text (e.g., Greek, Cyrillic, Arabic) or
817 // non-BMP text, and one third of the buffer for U+0800-U+FFFF text (e.g, CJK).
818 //
819 // The table holds for invalid sequences too: we'll insert one replacement char
820 // per invalid byte.
821 QString result(in.size(), Qt::Uninitialized);
822 QChar *data = const_cast<QChar*>(result.constData()); // we know we're not shared
823 const QChar *end = convertToUnicode(data, in);
824 result.truncate(end - data);
825 return result;
826}
827
828/*! \internal
829 \since 6.6
830 \overload
831
832 Converts the UTF-8 sequence of bytes viewed by \a in to a sequence of
833 QChar starting at \a dst in the destination buffer. The buffer is expected
834 to be large enough to hold the result. An upper bound for the size of the
835 buffer is \c in.size() QChars.
836
837 If, during decoding, an error occurs, a QChar::ReplacementCharacter is
838 written.
839
840 Returns a pointer to one past the last QChar written.
841
842 This function never throws.
843
844 For QChar buffers, instead of casting manually, you can use the static
845 QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
846*/
847char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
848{
849 // check if have to skip a BOM
850 auto bom = QByteArrayView::fromArray(utf8bom);
851 if (in.size() >= bom.size() && in.first(bom.size()) == bom)
852 in.slice(sizeof(utf8bom));
853
854 return convertToUnicode(dst, in, [](char16_t *&dst, ...) {
855 // decoding error
856 *dst++ = QChar::ReplacementCharacter;
857 return true; // continue decoding
858 });
859}
860
861template <typename OnErrorLambda> Q_ALWAYS_INLINE char16_t *
862QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept
863{
864 const uchar *const start = reinterpret_cast<const uchar *>(in.data());
865 const uchar *src = start;
866 const uchar *end = src + in.size();
867
868 // attempt to do a full decoding in SIMD
869 const uchar *nextAscii = end;
870 while (src < end) {
871 nextAscii = end;
872 if (simdDecodeAscii(dst, nextAscii, src, end))
873 break;
874
875 do {
876 uchar b = *src++;
877 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
878 if (Q_LIKELY(res >= 0))
879 continue;
880 // decoding error
881 if (!onError(dst, src, res))
882 return dst;
883 } while (src < nextAscii);
884 }
885
886 return dst;
887}
888
889QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
890{
891 // See above for buffer requirements for stateless decoding. However, that
892 // fails if the state is not empty. The following situations can add to the
893 // requirements:
894 // state contains chars starts with requirement
895 // 1 of 2 bytes valid continuation 0
896 // 2 of 3 bytes same 0
897 // 3 bytes of 4 same +1 (need to insert surrogate pair)
898 // 1 of 2 bytes invalid continuation +1 (need to insert replacement and restart)
899 // 2 of 3 bytes same +1 (same)
900 // 3 of 4 bytes same +1 (same)
901 QString result(in.size() + 1, Qt::Uninitialized);
902 QChar *end = convertToUnicode(result.data(), in, state);
903 result.truncate(end - result.constData());
904 return result;
905}
906
907char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConverter::State *state)
908{
909 qsizetype len = in.size();
910
911 Q_ASSERT(state);
912 if (!len)
913 return dst;
914
915
916 char16_t replacement = QChar::ReplacementCharacter;
917 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
918 replacement = QChar::Null;
919
920 qsizetype res;
921
922 const uchar *src = reinterpret_cast<const uchar *>(in.data());
923 const uchar *end = src + len;
924
925 if (!(state->flags & QStringConverter::Flag::Stateless)) {
926 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
927 if (state->remainingChars || !headerdone) {
928 // handle incoming state first
929 uchar remainingCharsData[4]; // longest UTF-8 sequence possible
930 qsizetype remainingCharsCount = state->remainingChars;
931 qsizetype newCharsToCopy = qMin<qsizetype>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
932
933 memset(remainingCharsData, 0, sizeof(remainingCharsData));
934 memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
935 memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
936
937 const uchar *begin = &remainingCharsData[1];
938 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
939 static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
940 if (res == QUtf8BaseTraits::Error) {
941 ++state->invalidChars;
942 *dst++ = replacement;
943 ++src;
944 } else if (res == QUtf8BaseTraits::EndOfString) {
945 // if we got EndOfString again, then there were too few bytes in src;
946 // copy to our state and return
947 state->remainingChars = remainingCharsCount + newCharsToCopy;
948 memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
949 return dst;
950 } else if (!headerdone) {
951 // eat the UTF-8 BOM
952 if (dst[-1] == 0xfeff)
953 --dst;
954 }
955 state->internalState |= HeaderDone;
956
957 // adjust src now that we have maybe consumed a few chars
958 if (res >= 0) {
959 Q_ASSERT(res > remainingCharsCount);
960 src += res - remainingCharsCount;
961 }
962 }
963 } else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
964 // stateless, remove initial BOM
965 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
966 // skip BOM
967 src += 3;
968 }
969
970 // main body, stateless decoding
971 res = 0;
972 dst = convertToUnicode(dst, { src, end }, [&](char16_t *&dst, const uchar *src_, int res_) {
973 res = res_;
974 src = src_;
975 if (res == QUtf8BaseTraits::Error) {
976 res = 0;
977 ++state->invalidChars;
978 *dst++ = replacement;
979 }
980 return res == 0; // continue if plain decoding error
981 });
982
983 if (res == QUtf8BaseTraits::EndOfString) {
984 // unterminated UTF sequence
985 if (state->flags & QStringConverter::Flag::Stateless) {
986 *dst++ = QChar::ReplacementCharacter;
987 ++state->invalidChars;
988 while (src++ < end) {
989 *dst++ = QChar::ReplacementCharacter;
990 ++state->invalidChars;
991 }
992 state->remainingChars = 0;
993 } else {
994 --src; // unread the byte in ch
995 state->remainingChars = end - src;
996 memcpy(&state->state_data[0], src, end - src);
997 }
998 } else {
999 state->remainingChars = 0;
1000 }
1001
1002 return dst;
1003}
1004
{
    // NOTE(review): the declaration line of this struct is not visible in
    // this listing; QUtf8::isValidUtf8() below refers to it as
    // QUtf8NoOutputTraits.

    // Empty tag type passed where the decoder expects an output destination.
    struct NoOutput {};
    // Output hooks with empty bodies: whatever is decoded is discarded.
    static void appendUtf16(const NoOutput &, char16_t) {}
    static void appendUcs4(const NoOutput &, char32_t) {}
};
1011
1012QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
1013{
1014 const uchar *src = reinterpret_cast<const uchar *>(in.data());
1015 const uchar *end = src + in.size();
1016 const uchar *nextAscii = src;
1017 bool isValidAscii = true;
1018
1019 while (src < end) {
1020 if (src >= nextAscii)
1021 src = simdFindNonAscii(src, end, nextAscii);
1022 if (src == end)
1023 break;
1024
1025 do {
1026 uchar b = *src++;
1027 if ((b & 0x80) == 0)
1028 continue;
1029
1030 isValidAscii = false;
1031 QUtf8NoOutputTraits::NoOutput output;
1032 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
1033 if (res < 0) {
1034 // decoding error
1035 return { false, false };
1036 }
1037 } while (src < nextAscii);
1038 }
1039
1040 return { true, isValidAscii };
1041}
1042
1043int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs) noexcept
1044{
1045 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
1046 auto end1 = src1 + utf8.size();
1047 auto src2 = reinterpret_cast<const char16_t *>(utf16.data());
1048 auto end2 = src2 + utf16.size();
1049
1050 do {
1051 simdCompareAscii(src1, end1, src2, end2);
1052
1053 if (src1 < end1 && src2 < end2) {
1054 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1055 char32_t uc2 = *src2++;
1056
1057 if (uc1 >= 0x80) {
1058 // Only decode the UTF-16 surrogate pair if the UTF-8 code point
1059 // wasn't US-ASCII (a surrogate cannot match US-ASCII).
1060 if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
1061 uc2 = QChar::surrogateToUcs4(uc2, *src2++);
1062 }
1063 if (cs == Qt::CaseInsensitive) {
1064 uc1 = QChar::toCaseFolded(uc1);
1065 uc2 = QChar::toCaseFolded(uc2);
1066 }
1067 if (uc1 != uc2)
1068 return int(uc1) - int(uc2);
1069 }
1070 } while (src1 < end1 && src2 < end2);
1071
1072 // the shorter string sorts first
1073 return (end1 > src1) - int(end2 > src2);
1074}
1075
1076int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
1077{
1078 auto src1 = reinterpret_cast<const qchar8_t *>(utf8.data());
1079 auto end1 = src1 + utf8.size();
1080 auto src2 = reinterpret_cast<const uchar *>(s.latin1());
1081 auto end2 = src2 + s.size();
1082
1083 while (src1 < end1 && src2 < end2) {
1084 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1085 char32_t uc2 = *src2++;
1086 if (cs == Qt::CaseInsensitive) {
1087 uc1 = QChar::toCaseFolded(uc1);
1088 uc2 = QChar::toCaseFolded(uc2);
1089 }
1090 if (uc1 != uc2)
1091 return int(uc1) - int(uc2);
1092 }
1093
1094 // the shorter string sorts first
1095 return (end1 > src1) - (end2 > src2);
1096}
1097
1098int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs) noexcept
1099{
1100 if (lhs.isEmpty())
1101 return qt_lencmp(0, rhs.size());
1102
1103 if (rhs.isEmpty())
1104 return qt_lencmp(lhs.size(), 0);
1105
1106 if (cs == Qt::CaseSensitive) {
1107 const auto l = std::min(lhs.size(), rhs.size());
1108 int r = memcmp(lhs.data(), rhs.data(), l);
1109 return r ? r : qt_lencmp(lhs.size(), rhs.size());
1110 }
1111
1112 auto src1 = reinterpret_cast<const qchar8_t *>(lhs.data());
1113 auto end1 = src1 + lhs.size();
1114 auto src2 = reinterpret_cast<const qchar8_t *>(rhs.data());
1115 auto end2 = src2 + rhs.size();
1116
1117 while (src1 < end1 && src2 < end2) {
1118 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1119 char32_t uc2 = QUtf8Functions::nextUcs4FromUtf8(src2, end2);
1120
1121 uc1 = QChar::toCaseFolded(uc1);
1122 uc2 = QChar::toCaseFolded(uc2);
1123 if (uc1 != uc2)
1124 return int(uc1) - int(uc2);
1125 }
1126
1127 // the shorter string sorts first
1128 return (end1 > src1) - (end2 > src2);
1129}
1130
1131#ifndef QT_BOOTSTRAPPED
1132QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1133{
1134 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1135 qsizetype length = 2 * in.size();
1136 if (writeBom)
1137 length += 2;
1138
1139 QByteArray d(length, Qt::Uninitialized);
1140 char *end = convertFromUnicode(d.data(), in, state, endian);
1141 Q_ASSERT(end - d.constData() == d.size());
1142 Q_UNUSED(end);
1143 return d;
1144}
1145
1146char *QUtf16::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
1147{
1148 Q_ASSERT(state);
1149 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1150
1151 if (endian == DetectEndianness)
1152 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1153
1154 if (writeBom) {
1155 // set them up the BOM
1156 QChar bom(QChar::ByteOrderMark);
1157 if (endian == BigEndianness)
1158 qToBigEndian(bom.unicode(), out);
1159 else
1160 qToLittleEndian(bom.unicode(), out);
1161 out += 2;
1162 }
1163 if (endian == BigEndianness)
1164 qToBigEndian<char16_t>(in.data(), in.size(), out);
1165 else
1166 qToLittleEndian<char16_t>(in.data(), in.size(), out);
1167
1168 state->remainingChars = 0;
1169 state->internalState |= HeaderDone;
1170 return out + 2*in.size();
1171}
1172
1173QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1174{
1175 QString result((in.size() + 1) >> 1, Qt::Uninitialized); // worst case
1176 QChar *qch = convertToUnicode(result.data(), in, state, endian);
1177 result.truncate(qch - result.constData());
1178 return result;
1179}
1180
1181QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1182{
1183 qsizetype len = in.size();
1184 const char *chars = in.data();
1185
1186 Q_ASSERT(state);
1187
1188 if (endian == DetectEndianness)
1189 endian = (DataEndianness)state->state_data[Endian];
1190
1191 const char *end = chars + len;
1192
1193 // make sure we can decode at least one char
1194 if (state->remainingChars + len < 2) {
1195 if (len) {
1196 Q_ASSERT(state->remainingChars == 0 && len == 1);
1197 state->remainingChars = 1;
1198 state->state_data[Data] = *chars;
1199 }
1200 return out;
1201 }
1202
1203 bool headerdone = state && state->internalState & HeaderDone;
1204 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1205 headerdone = true;
1206
1207 if (!headerdone || state->remainingChars) {
1208 uchar buf;
1209 if (state->remainingChars)
1210 buf = state->state_data[Data];
1211 else
1212 buf = *chars++;
1213
1214 // detect BOM, set endianness
1215 state->internalState |= HeaderDone;
1216 QChar ch(buf, *chars++);
1217 if (endian == DetectEndianness) {
1218 // someone set us up the BOM
1219 if (ch == QChar::ByteOrderSwapped) {
1220 endian = BigEndianness;
1221 } else if (ch == QChar::ByteOrderMark) {
1222 endian = LittleEndianness;
1223 } else {
1224 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1225 endian = BigEndianness;
1226 } else {
1227 endian = LittleEndianness;
1228 }
1229 }
1230 }
1231 if (endian == BigEndianness)
1232 ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
1233 if (headerdone || ch != QChar::ByteOrderMark)
1234 *out++ = ch;
1235 } else if (endian == DetectEndianness) {
1236 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1237 }
1238
1239 qsizetype nPairs = (end - chars) >> 1;
1240 if (endian == BigEndianness)
1241 qFromBigEndian<char16_t>(chars, nPairs, out);
1242 else
1243 qFromLittleEndian<char16_t>(chars, nPairs, out);
1244 out += nPairs;
1245
1246 state->state_data[Endian] = endian;
1247 state->remainingChars = 0;
1248 if ((end - chars) & 1) {
1249 if (state->flags & QStringConverter::Flag::Stateless) {
1250 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1251 } else {
1252 state->remainingChars = 1;
1253 state->state_data[Data] = *(end - 1);
1254 }
1255 } else {
1256 state->state_data[Data] = 0;
1257 }
1258
1259 return out;
1260}
1261
1262QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
1263{
1264 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
1265 qsizetype length = 4*in.size();
1266 if (writeBom)
1267 length += 4;
1268 QByteArray ba(length, Qt::Uninitialized);
1269 char *end = convertFromUnicode(ba.data(), in, state, endian);
1270 ba.truncate(end - ba.constData());
1271 return ba;
1272}
1273
/*
    Writes \a in as UTF-32 to \a out and returns the end of the written data.
    \a state must not be null.

    DetectEndianness is resolved to the host byte order. A BOM is written
    first if the WriteBom flag is set and HeaderDone is not yet recorded.
    Each code point takes four bytes. A high surrogate at the very end of the
    input is stored in the state (unless Stateless is set) and combined with
    the first unit of the next call. Unpaired surrogates are written as
    U+FFFD, or as 0 when ConvertInvalidToNull is set.

    Note that a pending high surrogate that is not followed by a low
    surrogate produces a code point of its own, so \a out may receive up to
    4 * (in.size() + 1) bytes plus the BOM.
*/
char *QUtf32::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
{
    Q_ASSERT(state);

    bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
    if (endian == DetectEndianness)
        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;

    if (writeBom) {
        // set them up the BOM
        if (endian == BigEndianness) {
            out[0] = 0;
            out[1] = 0;
            out[2] = (char)0xfe;
            out[3] = (char)0xff;
        } else {
            out[0] = (char)0xff;
            out[1] = (char)0xfe;
            out[2] = 0;
            out[3] = 0;
        }
        out += 4;
        state->internalState |= HeaderDone;
    }

    const QChar *uc = in.data();
    const QChar *end = in.data() + in.size();
    QChar ch;
    char32_t ucs4;
    if (state->remainingChars == 1) {
        // resume with the high surrogate saved by the previous call
        auto character = state->state_data[Data];
        Q_ASSERT(character <= 0xFFFF);
        ch = QChar(character);
        // this is ugly, but shortcuts a whole lot of logic that would otherwise be required
        state->remainingChars = 0;
        goto decode_surrogate;
    }

    while (uc < end) {
        ch = *uc++;
        if (Q_LIKELY(!ch.isSurrogate())) {
            ucs4 = ch.unicode();
        } else if (Q_LIKELY(ch.isHighSurrogate())) {
decode_surrogate:
            // ch holds a high surrogate; uc points at the unit following it
            if (uc == end) {
                if (state->flags & QStringConverter::Flag::Stateless) {
                    ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
                } else {
                    // keep the high surrogate for the next call
                    state->remainingChars = 1;
                    state->state_data[Data] = ch.unicode();
                    return out;
                }
            } else if (uc->isLowSurrogate()) {
                ucs4 = QChar::surrogateToUcs4(ch, *uc++);
            } else {
                // high surrogate not followed by a low one; *uc is not
                // consumed and gets processed by the next iteration
                ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
            }
        } else {
            // lone low surrogate
            ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
        }
        if (endian == BigEndianness)
            qToBigEndian(ucs4, out);
        else
            qToLittleEndian(ucs4, out);
        out += 4;
    }

    return out;
}
1343
1344QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1345{
1346 QString result;
1347 result.resize((in.size() + 7) >> 1); // worst case
1348 QChar *end = convertToUnicode(result.data(), in, state, endian);
1349 result.truncate(end - result.constData());
1350 return result;
1351}
1352
1353QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1354{
1355 qsizetype len = in.size();
1356 const char *chars = in.data();
1357
1358 Q_ASSERT(state);
1359 if (endian == DetectEndianness)
1360 endian = (DataEndianness)state->state_data[Endian];
1361
1362 const char *end = chars + len;
1363
1364 uchar tuple[4];
1365 memcpy(tuple, &state->state_data[Data], 4);
1366
1367 // make sure we can decode at least one char
1368 if (state->remainingChars + len < 4) {
1369 if (len) {
1370 while (chars < end) {
1371 tuple[state->remainingChars] = *chars;
1372 ++state->remainingChars;
1373 ++chars;
1374 }
1375 Q_ASSERT(state->remainingChars < 4);
1376 memcpy(&state->state_data[Data], tuple, 4);
1377 }
1378 return out;
1379 }
1380
1381 bool headerdone = state->internalState & HeaderDone;
1382 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1383 headerdone = true;
1384
1385 qsizetype num = state->remainingChars;
1386 state->remainingChars = 0;
1387
1388 const auto writeCodeToOutput = [&](char32_t code) {
1389 if (Q_UNLIKELY(code > QChar::LastValidCodePoint)) {
1390 if (state->flags & QStringDecoder::Flag::ConvertInvalidToNull)
1391 *out++ = QChar::Null;
1392 else
1393 *out++ = QChar::ReplacementCharacter;
1394 } else {
1395 for (char16_t c : QChar::fromUcs4(code))
1396 *out++ = c;
1397 }
1398 };
1399
1400 if (!headerdone || endian == DetectEndianness || num) {
1401 while (num < 4)
1402 tuple[num++] = *chars++;
1403 if (endian == DetectEndianness) {
1404 // someone set us up the BOM?
1405 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1406 endian = LittleEndianness;
1407 } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1408 endian = BigEndianness;
1409 } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1410 endian = BigEndianness;
1411 } else {
1412 endian = LittleEndianness;
1413 }
1414 }
1415 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1416 if (headerdone || code != QChar::ByteOrderMark) {
1417 writeCodeToOutput(code);
1418 }
1419 num = 0;
1420 } else if (endian == DetectEndianness) {
1421 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1422 }
1423 state->state_data[Endian] = endian;
1424 state->internalState |= HeaderDone;
1425
1426 while (chars < end) {
1427 tuple[num++] = *chars++;
1428 if (num == 4) {
1429 char32_t code = (endian == BigEndianness) ? qFromBigEndian<char32_t>(tuple) : qFromLittleEndian<char32_t>(tuple);
1430 writeCodeToOutput(code);
1431 num = 0;
1432 }
1433 }
1434
1435 if (num) {
1436 if (state->flags & QStringDecoder::Flag::Stateless) {
1437 *out++ = QChar::ReplacementCharacter;
1438 } else {
1439 state->state_data[Endian] = endian;
1440 state->remainingChars = num;
1441 memcpy(&state->state_data[Data], tuple, 4);
1442 }
1443 }
1444
1445 return out;
1446}
1447#endif // !QT_BOOTSTRAPPED
1448
1449#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
1450int QLocal8Bit::checkUtf8()
1451{
1452 return GetACP() == CP_UTF8 ? 1 : -1;
1453}
1454
1455QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1456{
1457 return convertToUnicode_sys(in, CP_ACP, state);
1458}
1459
/*
    Decodes \a in from the Windows code page \a codePage to a QString using
    MultiByteToWideChar(). \a state must not be null.

    Unless the Stateless flag is set, bytes at the end of the input that
    cannot be decoded yet are saved in state->state_data (when they fit) and
    are prepended to the input of the next call. Bytes that cannot be decoded
    at all are output as U+FFFD, or as U+0000 when ConvertInvalidToNull is
    set, and counted in state->invalidChars.

    Returns a null QString for null or empty input, and also when growing
    the output buffer fails.
*/
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
                                         QStringConverter::State *state)
{
    const char *mb = in.data();
    qsizetype mblen = in.size();

    Q_ASSERT(state);
    // Bound before `state` may be nulled below, so the counter stays reachable
    qsizetype &invalidChars = state->invalidChars;
    using Flag = QStringConverter::Flag;
    const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
    const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
                                                                : QChar::ReplacementCharacter;
    if (state->flags & Flag::Stateless) {
        Q_ASSERT(state->remainingChars == 0);
        // From here on a null `state` means "stateless"
        state = nullptr;
    }

    if (!mb || !mblen)
        return QString();

    // Use a local stack-buffer at first to allow us a decently large container
    // to avoid a lot of resizing, without also returning an overallocated
    // QString to the user for small strings.
    // Then we can be fast for small strings and take the hit of extra resizes
    // and measuring how much storage is needed for large strings.
    std::array<wchar_t, 4096> buf;
    wchar_t *out = buf.data();
    qsizetype outlen = buf.size();

    QString sp;

    // Return a pointer to storage where we have enough space for `size`
    // Returns {nullptr, 0} if the required size overflows qsizetype.
    const auto growOut = [&](qsizetype size) -> std::tuple<wchar_t *, qsizetype> {
        if (outlen >= size)
            return {out, outlen};
        // `sp` is still empty while we are writing to the stack buffer
        const bool wasStackBuffer = sp.isEmpty();
        const auto begin = wasStackBuffer ? buf.data() : reinterpret_cast<wchar_t *>(sp.data());
        const qsizetype offset = qsizetype(std::distance(begin, out));
        qsizetype newSize = 0;
        if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
            Q_CHECK_PTR(false);
            return {nullptr, 0};
        }
        sp.resize(newSize);
        auto it = reinterpret_cast<wchar_t *>(sp.data());
        if (wasStackBuffer)
            it = std::copy_n(buf.data(), offset, it);
        else
            it += offset;
        return {it, size};
    };

    // Convert the pending characters (if available)
    while (state && state->remainingChars && mblen) {
        QStringConverter::State localState;
        localState.flags = state->flags;
        // Use at most 6 characters as a guess for the longest encoded character
        // in any multibyte encoding.
        // Even with a total of 2 bytes of overhead that would leave around
        // 2^(4 * 8) possible characters
        std::array<char, 6> prev = {0};
        Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
        qsizetype index = 0;
        for (; index < state->remainingChars; ++index)
            prev[index] = state->state_data[index];
        const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
        for (qsizetype i = 0; i < toCopy; ++i, ++index)
            prev[index] = mb[i];
        mb += toCopy;
        mblen -= toCopy;

        // Recursing:
        // Since we are using a clean local state it will try to decode what was
        // stored in our state + some extra octets from input (`prev`). If some
        // part fails we will have those characters stored in the local state's
        // storage, and we can extract those. It may also output some
        // replacement characters, which we'll count in the invalidChars.
        // In the best case we only do this once, but we will loop until we have
        // resolved all the remaining characters or we have run out of new input
        // in which case we may still have remaining characters.
        const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
                                                 &localState);
        std::tie(out, outlen) = growOut(tmp.size());
        if (!out)
            return {};
        out = std::copy_n(reinterpret_cast<const wchar_t *>(tmp.constData()), tmp.size(), out);
        outlen -= tmp.size();
        const qsizetype tail = toCopy - localState.remainingChars;
        if (tail >= 0) {
            // Everything left to process comes from `in`, so we can stop
            // looping. Adjust the window for `in` and unset remainingChars to
            // signal that we're done.
            mb -= localState.remainingChars;
            mblen += localState.remainingChars;
            localState.remainingChars = 0;
        }
        state->remainingChars = localState.remainingChars;
        state->invalidChars += localState.invalidChars;
        std::copy_n(localState.state_data, state->remainingChars, state->state_data);
    }

    Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);

    // Need it in this scope, since we try to decrease our window size if we
    // encounter an error
    int nextIn = q26::saturate_cast<int>(mblen);
    while (mblen > 0) {
        std::tie(out, outlen) = growOut(1); // Need space for at least one character
        if (!out)
            return {};
        const int nextOut = q26::saturate_cast<int>(outlen);
        int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
        if (len) {
            // The whole window was converted; advance input and output
            mb += nextIn;
            mblen -= nextIn;
            out += len;
            outlen -= len;
        } else {
            int r = GetLastError();
            if (r == ERROR_INSUFFICIENT_BUFFER) {
                // Ask how much output the window needs, grow, and retry
                const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
                std::tie(out, outlen) = growOut(wclen);
                if (!out)
                    return {};
            } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
                // Can't decode the current window, so either store the state,
                // reduce window size or output a replacement character.

                // Check if we can store all remaining characters in the state
                // to be used next time we're called:
                if (state && mblen <= q20::ssize(state->state_data)) {
                    state->remainingChars = mblen;
                    std::copy_n(mb, mblen, state->state_data);
                    mb += mblen;
                    mblen = 0;
                    break;
                }

                // .. if not, try to find the last valid character in the window
                // and try again with a shrunken window:
                if (nextIn > 1) {
                    // There may be some incomplete data at the end of our current
                    // window, so decrease the window size and try again.
                    // In the worst case scenario there is gigs of undecodable
                    // garbage, but what are we supposed to do about that?
                    const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
                    if (it != mb)
                        nextIn = int(it - mb);
                    else
                        --nextIn;
                    // `continue` skips the nextIn reset at the bottom of the loop
                    continue;
                }

                // Finally, we are forced to output a replacement character for
                // the first byte in the window:
                std::tie(out, outlen) = growOut(1);
                if (!out)
                    return {};
                *out = replacementCharacter;
                ++invalidChars;
                ++out;
                --outlen;
                ++mb;
                --mblen;
            } else {
                // Fail.
                qWarning("MultiByteToWideChar: Cannot convert multibyte text");
                break;
            }
        }
        nextIn = q26::saturate_cast<int>(mblen);
    }

    if (sp.isEmpty()) {
        // We must have only used the stack buffer
        if (out != buf.data()) // else: we return null-string
            sp = QStringView(buf.data(), out).toString();
    } else{
        const auto begin = reinterpret_cast<wchar_t *>(sp.data());
        sp.truncate(std::distance(begin, out));
    }

    // Drop a single trailing null character from the result, if present
    if (sp.size() && sp.back().isNull())
        sp.chop(1);

    if (!state && mblen > 0) {
        // We have trailing character(s) that could not be converted, and
        // nowhere to cache them
        sp.resize(sp.size() + mblen, replacementCharacter);
        invalidChars += mblen;
    }
    return sp;
}
1653
1654QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1655{
1656 return convertFromUnicode_sys(in, CP_ACP, state);
1657}
1658
/*
    Encodes \a in to the Windows code page \a codePage using
    WideCharToMultiByte(). \a state must not be null.

    Unless the Stateless flag is set, a high surrogate at the very end of the
    input is saved in state->state_data[0] and is combined with the first
    UTF-16 unit of the next call. state->invalidChars is not updated (see the
    comment below).

    Returns a null QByteArray for null input and an empty, non-null one for
    empty input.
*/
QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
                                              QStringConverter::State *state)
{
    const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
    qsizetype uclen = in.size();

    Q_ASSERT(state);
    // The Windows API has a *boolean* out-parameter that says if a replacement
    // character was used, but it gives us no way to know _how many_ were used.
    // Since we cannot simply scan the string for replacement characters
    // (which is potentially a question mark, and thus a valid character),
    // we simply do not track the number of invalid characters here.
    // auto &invalidChars = state->invalidChars;

    using Flag = QStringConverter::Flag;
    if (state->flags & Flag::Stateless) { // temporary
        Q_ASSERT(state->remainingChars == 0);
        // From here on a null `state` means "stateless"
        state = nullptr;
    }

    if (!ch)
        return QByteArray();
    if (uclen == 0)
        return QByteArray("");

    // Use a local stack-buffer at first to allow us a decently large container
    // to avoid a lot of resizing, without also returning an overallocated
    // QByteArray to the user for small strings.
    // Then we can be fast for small strings and take the hit of extra resizes
    // and measuring how much storage is needed for large strings.
    std::array<char, 4096> buf;
    char *out = buf.data();
    qsizetype outlen = buf.size();
    QByteArray mb;

    if (state && state->remainingChars > 0) {
        Q_ASSERT(state->remainingChars == 1);
        // Let's try to decode the pending character
        wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] };
        // Check if the second character is a valid low surrogate,
        // otherwise we'll just decode the first character, for which windows
        // will output a replacement character.
        const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
        int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen, nullptr,
                                      nullptr);
        if (!len)
            return {}; // Cannot recover, and I refuse to believe it was a size limitation
        out += len;
        outlen -= len;
        if (validCodePoint) {
            // The low surrogate from `in` was consumed together with the pending one
            ++ch;
            --uclen;
        }
        state->remainingChars = 0;
        state->state_data[0] = 0;
        if (uclen == 0)
            return QByteArrayView(buf.data(), len).toByteArray();
    }

    if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
        // We can handle a missing low surrogate at the end of the string,
        // so if there is one, exclude it now and store it in the state.
        state->remainingChars = 1;
        state->state_data[0] = ch[uclen - 1];
        --uclen;
        if (uclen == 0)
            return QByteArray();
    }

    Q_ASSERT(uclen > 0);

    // Return a pointer to storage where we have enough space for `size`
    // Returns {nullptr, 0} if the required size overflows qsizetype.
    const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
        if (outlen >= size)
            return {out, outlen};
        // `mb` is still empty while we are writing to the stack buffer
        const bool wasStackBuffer = mb.isEmpty();
        const auto begin = wasStackBuffer ? buf.data() : mb.data();
        const qsizetype offset = qsizetype(std::distance(begin, out));
        qsizetype newSize = 0;
        if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
            Q_CHECK_PTR(false);
            return {nullptr, 0};
        }
        mb.resize(newSize);
        auto it = mb.data();
        if (wasStackBuffer)
            it = std::copy_n(buf.data(), offset, it);
        else
            it += offset;
        return {it, size};
    };

    // Number of UTF-16 units to hand to the next WideCharToMultiByte() call
    const auto getNextWindowSize = [&]() {
        int nextIn = q26::saturate_cast<int>(uclen);
        // The Windows API has some issues if the current window ends in the
        // middle of a surrogate pair, so we avoid that:
        if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
            --nextIn;
        return nextIn;
    };

    int len = 0;
    while (uclen > 0) {
        const int nextIn = getNextWindowSize();
        std::tie(out, outlen) = growOut(1); // We need at least one byte
        if (!out)
            return {};
        const int nextOut = q26::saturate_cast<int>(outlen);
        len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
        if (len > 0) {
            // The whole window was converted; advance input and output
            ch += nextIn;
            uclen -= nextIn;
            out += len;
            outlen -= len;
        } else {
            int r = GetLastError();
            if (r == ERROR_INSUFFICIENT_BUFFER) {
                // Ask how much output the window needs
                int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
                                                       nullptr, nullptr);
                if (neededLength <= 0) {
                    // Fail. Observed with UTF8 where the input window was max int and ended in an
                    // incomplete sequence, probably a Windows bug. We try to avoid that from
                    // happening by reducing the window size in that case. But let's keep this
                    // branch just in case of other bugs.
#ifndef QT_NO_DEBUG
                    r = GetLastError();
                    fprintf(stderr,
                            "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
#endif // !QT_NO_DEBUG
                    break;
                }
                std::tie(out, outlen) = growOut(neededLength);
                if (!out)
                    return {};
                // and try again...
            } else {
                // Fail. Probably can't happen in fact (dwFlags is 0).
#ifndef QT_NO_DEBUG
                // Can't use qWarning(), as it'll recurse to handle %ls
                fprintf(stderr,
                        "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
                        r, qt_castToWchar(QStringView(ch, uclen).left(100).toString()));
#endif
                break;
            }
        }
    }
    if (mb.isEmpty()) {
        // We must have only used the stack buffer
        if (out != buf.data()) // else: we return null-array
            mb = QByteArrayView(buf.data(), out).toByteArray();
    } else {
        mb.truncate(std::distance(mb.data(), out));
    }
    return mb;
}
1815#endif
1816
1817void QStringConverter::State::clear() noexcept
1818{
1819 if (clearFn)
1820 clearFn(this);
1821 else
1822 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
1823 remainingChars = 0;
1824 invalidChars = 0;
1825 internalState = 0;
1826}
1827
1828void QStringConverter::State::reset() noexcept
1829{
1830 if (flags & Flag::UsesIcu) {
1831#if defined(QT_USE_ICU_CODECS)
1832 QT_COM_THREAD_INIT
1833 UConverter *converter = static_cast<UConverter *>(d[0]);
1834 if (converter)
1835 ucnv_reset(converter);
1836#else
1837 Q_UNREACHABLE();
1838#endif
1839 } else {
1840 clear();
1841 }
1842}
1843
1844#ifndef QT_BOOTSTRAPPED
1845static QChar *fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1846{
1847 return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
1848}
1849
1850static char *toUtf16(char *out, QStringView in, QStringConverter::State *state)
1851{
1852 return QUtf16::convertFromUnicode(out, in, state, DetectEndianness);
1853}
1854
1855static QChar *fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1856{
1857 return QUtf16::convertToUnicode(out, in, state, BigEndianness);
1858}
1859
1860static char *toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
1861{
1862 return QUtf16::convertFromUnicode(out, in, state, BigEndianness);
1863}
1864
1865static QChar *fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1866{
1867 return QUtf16::convertToUnicode(out, in, state, LittleEndianness);
1868}
1869
1870static char *toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
1871{
1872 return QUtf16::convertFromUnicode(out, in, state, LittleEndianness);
1873}
1874
1875static QChar *fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
1876{
1877 return QUtf32::convertToUnicode(out, in, state, DetectEndianness);
1878}
1879
1880static char *toUtf32(char *out, QStringView in, QStringConverter::State *state)
1881{
1882 return QUtf32::convertFromUnicode(out, in, state, DetectEndianness);
1883}
1884
1885static QChar *fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1886{
1887 return QUtf32::convertToUnicode(out, in, state, BigEndianness);
1888}
1889
1890static char *toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
1891{
1892 return QUtf32::convertFromUnicode(out, in, state, BigEndianness);
1893}
1894
1895static QChar *fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
1896{
1897 return QUtf32::convertToUnicode(out, in, state, LittleEndianness);
1898}
1899
1900static char *toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
1901{
1902 return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
1903}
1904#endif // !QT_BOOTSTRAPPED
1905
1906char *QLatin1::convertFromUnicode(char *out, QStringView in, QStringConverter::State *state) noexcept
1907{
1908 Q_ASSERT(state);
1909 if (state->flags & QStringConverter::Flag::Stateless) // temporary
1910 state = nullptr;
1911
1912 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 : '?';
1913 qsizetype invalid = 0;
1914 for (qsizetype i = 0; i < in.size(); ++i) {
1915 if (in[i] > QChar(0xff)) {
1916 *out = replacement;
1917 ++invalid;
1918 } else {
1919 *out = (char)in[i].cell();
1920 }
1921 ++out;
1922 }
1923 if (state)
1924 state->invalidChars += invalid;
1925 return out;
1926}
1927
1928static QChar *fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
1929{
1930 QString s = QLocal8Bit::convertToUnicode(in, state);
1931 memcpy(out, s.constData(), s.size()*sizeof(QChar));
1932 return out + s.size();
1933}
1934
1935static char *toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
1936{
1937 QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1938 memcpy(out, s.constData(), s.size());
1939 return out + s.size();
1940}
1941
1942
1943static qsizetype fromUtf8Len(qsizetype l) { return l + 1; }
1944static qsizetype toUtf8Len(qsizetype l) { return 3*(l + 1); }
1945
1946#ifndef QT_BOOTSTRAPPED
1947static qsizetype fromUtf16Len(qsizetype l) { return l/2 + 2; }
1948static qsizetype toUtf16Len(qsizetype l) { return 2*(l + 1); }
1949
1950static qsizetype fromUtf32Len(qsizetype l) { return l/2 + 2; }
1951static qsizetype toUtf32Len(qsizetype l) { return 4*(l + 1); }
1952#endif
1953
1954static qsizetype fromLatin1Len(qsizetype l) { return l + 1; }
1955static qsizetype toLatin1Len(qsizetype l) { return l + 1; }
1956
1957
1958
1959/*!
1960 \class QStringConverter
1961 \inmodule QtCore
1962 \brief The QStringConverter class provides a base class for encoding and decoding text.
1963 \reentrant
1964 \ingroup i18n
1965 \ingroup string-processing
1966
1967 Qt uses UTF-16 to store, draw and manipulate strings. In many
1968 situations you may wish to deal with data that uses a different
1969 encoding. Most text data transferred over files and network connections is encoded
1970 in UTF-8.
1971
1972 The QStringConverter class is a base class for the \l {QStringEncoder} and
1973 \l {QStringDecoder} classes that help with converting between different
1974 text encodings. QStringDecoder can decode a string from an encoded representation
1975 into UTF-16, the format Qt uses internally. QStringEncoder does the opposite
1976 operation, encoding UTF-16 encoded data (usually in the form of a QString) to
1977 the requested encoding.
1978
1979 The following encodings are always supported:
1980
1981 \list
1982 \li UTF-8
1983 \li UTF-16
1984 \li UTF-16BE
1985 \li UTF-16LE
1986 \li UTF-32
1987 \li UTF-32BE
1988 \li UTF-32LE
1989 \li ISO-8859-1 (Latin-1)
1990 \li The system encoding
1991 \endlist
1992
1993 QStringConverter may support more encodings depending on how Qt was
1994 compiled. If more codecs are supported, they can be listed using
1995 availableCodecs().
1996
1997 \l {QStringConverter}s can be used as follows to convert some encoded
1998 string to and from UTF-16.
1999
2000 Suppose you have some string encoded in UTF-8, and
2001 want to convert it to a QString. The simple way
2002 to do it is to use a \l {QStringDecoder} like this:
2003
2004 \snippet code/src_corelib_text_qstringconverter.cpp 0
2005
2006 After this, \c string holds the text in decoded form.
2007 Converting a string from Unicode to the local encoding is just as
2008 easy using the \l {QStringEncoder} class:
2009
2010 \snippet code/src_corelib_text_qstringconverter.cpp 1
2011
2012 To read or write text files in various encodings, use QTextStream and
2013 its \l{QTextStream::setEncoding()}{setEncoding()} function.
2014
2015 Some care must be taken when trying to convert the data in chunks,
2016 for example, when receiving it over a network. In such cases it is
2017 possible that a multi-byte character will be split over two
2018 chunks. At best this might result in the loss of a character and
2019 at worst cause the entire conversion to fail.
2020
2021 Both QStringEncoder and QStringDecoder make this easy, by tracking
2022 this in an internal state. So simply calling the encoder or decoder
2023 again with the next chunk of data will automatically continue encoding
2024 or decoding the data correctly:
2025
2026 \snippet code/src_corelib_text_qstringconverter.cpp 2
2027
2028 The QStringDecoder object maintains state between chunks and therefore
2029 works correctly even if a multi-byte character is split between
2030 chunks.
2031
2032 QStringConverter objects can't be copied because of their internal state, but
2033 can be moved.
2034
2035 \sa QTextStream, QStringDecoder, QStringEncoder
2036*/
2037
2038/*!
2039 \enum QStringConverter::Flag
2040
2041 \value Default Default conversion rules apply.
2042 \value ConvertInvalidToNull If this flag is set, each invalid input
2043 character is output as a null character. If it is not set,
2044 invalid input characters are represented as QChar::ReplacementCharacter
2045 if the output encoding can represent that character, otherwise as a question mark.
2046 \value WriteBom When converting from a QString to an output encoding, write a QChar::ByteOrderMark as the first
2047 character if the output encoding supports this. This is the case for UTF-8, UTF-16 and UTF-32
2048 encodings.
2049 \value ConvertInitialBom When converting from an input encoding to a QString the QStringDecoder usually skips a
2050 leading QChar::ByteOrderMark. When this flag is set, the byte order mark will not be
2051 skipped, but converted to UTF-16 and inserted at the start of the created QString.
2052 \value Stateless Ignore possible converter states between different function calls
2053 to encode or decode strings. This will also cause the QStringConverter to raise an error if an incomplete
2054 sequence of data is encountered.
2055 \omitvalue UsesIcu
2056*/
2057
2058/*!
2059 \enum QStringConverter::Encoding
2060 \value Utf8 Create a converter to or from UTF-8
2061 \value Utf16 Create a converter to or from UTF-16. When decoding, the byte order will get automatically
2062 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
2063 be assumed.
2064 \value Utf16BE Create a converter to or from big-endian UTF-16.
2065 \value Utf16LE Create a converter to or from little-endian UTF-16.
2066 \value Utf32 Create a converter to or from UTF-32. When decoding, the byte order will get automatically
2067 detected by a leading byte order mark. If none exists or when encoding, the system byte order will
2068 be assumed.
2069 \value Utf32BE Create a converter to or from big-endian UTF-32.
2070 \value Utf32LE Create a converter to or from little-endian UTF-32.
2071 \value Latin1 Create a converter to or from ISO-8859-1 (Latin1).
2072 \value System Create a converter to or from the underlying encoding of the
2073 operating system's locale. This is always assumed to be UTF-8 for Unix-based
2074 systems. On Windows, this converts to and from the locale code page.
2075 \omitvalue LastEncoding
2076*/
2077
2078/*!
2079 \struct QStringConverter::Interface
2080 \internal
2081*/
2082
2083const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
2084{
2085 { "UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
2086#ifndef QT_BOOTSTRAPPED
2087 { "UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
2088 { "UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
2089 { "UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
2090 { "UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
2091 { "UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
2092 { "UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
2093#endif
2094 { "ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len },
2095 { "Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
2096};
2097
2098// match names case insensitive and skipping '-' and '_'
2099template <typename Char>
2100static bool nameMatch_impl_impl(const char *a, const Char *b, const Char *b_end)
2101{
2102 do {
2103 while (*a == '-' || *a == '_')
2104 ++a;
2105 while (b != b_end && (*b == Char{'-'} || *b == Char{'_'}))
2106 ++b;
2107 if (!*a && b == b_end) // end of both strings
2108 return true;
2109 if (char16_t(*b) > 127)
2110 return false; // non-US-ASCII cannot match US-ASCII (prevents narrowing below)
2111 } while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(char(*b++)));
2112
2113 return false;
2114}
2115
2116static bool nameMatch_impl(const char *a, QLatin1StringView b)
2117{
2118 return nameMatch_impl_impl(a, b.begin(), b.end());
2119}
2120
2121static bool nameMatch_impl(const char *a, QUtf8StringView b)
2122{
2123 return nameMatch_impl(a, QLatin1StringView{QByteArrayView{b}});
2124}
2125
2126static bool nameMatch_impl(const char *a, QStringView b)
2127{
2128 return nameMatch_impl_impl(a, b.utf16(), b.utf16() + b.size()); // uses char16_t*, not QChar*
2129}
2130
2131static bool nameMatch(const char *a, QAnyStringView b)
2132{
2133 return b.visit([a](auto b) { return nameMatch_impl(a, b); });
2134}
2135
2136
2137/*!
2138 \fn constexpr QStringConverter::QStringConverter()
2139 \internal
2140*/
2141
2142/*!
2143 \fn constexpr QStringConverter::QStringConverter(Encoding, Flags)
2144 \internal
2145*/
2146
2147
2148#if defined(QT_USE_ICU_CODECS)
2149// only derives from QStringConverter to get access to protected types
2150struct QStringConverterICU : QStringConverter
2151{
2152 static void clear_function(QStringConverter::State *state) noexcept
2153 {
2154 QT_COM_THREAD_INIT
2155 ucnv_close(static_cast<UConverter *>(state->d[0]));
2156 state->d[0] = nullptr;
2157 }
2158
2159 static void ensureConverter(QStringConverter::State *state)
2160 {
2161 // old code might reset the state via clear instead of reset
2162 // in that case, the converter has been closed, and we have to reopen it
2163 if (state->d[0] == nullptr)
2164 state->d[0] = createConverterForName(static_cast<const char *>(state->d[1]), state);
2165 }
2166
2167 static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
2168 {
2169 QT_COM_THREAD_INIT
2170 ensureConverter(state);
2171
2172 auto icu_conv = static_cast<UConverter *>(state->d[0]);
2173 UErrorCode err = U_ZERO_ERROR;
2174 auto source = in.data();
2175 auto sourceLimit = in.data() + in.size();
2176
2177 qsizetype length = toLen(in.size());
2178
2179 UChar *target = reinterpret_cast<UChar *>(out);
2180 auto targetLimit = target + length;
2181 // We explicitly clean up anyway, so no need to set flush to true,
2182 // which would just reset the converter.
2183 UBool flush = false;
2184
2185 // If the QStringConverter was moved, the state that we used as a context is stale now.
2186 UConverterToUCallback action;
2187 const void *context;
2188 ucnv_getToUCallBack(icu_conv, &action, &context);
2189 if (context != state)
2190 ucnv_setToUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
2191
2192 ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
2193 // We did reserve enough space:
2194 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2195 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2196 if (auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
2197 ucnv_reset(icu_conv);
2198 state->invalidChars += leftOver;
2199 }
2200 }
2201 return reinterpret_cast<QChar *>(target);
2202 }
2203
2204 static char *fromUtf16(char *out, QStringView in, QStringConverter::State *state)
2205 {
2206 QT_COM_THREAD_INIT
2207 ensureConverter(state);
2208 auto icu_conv = static_cast<UConverter *>(state->d[0]);
2209 UErrorCode err = U_ZERO_ERROR;
2210 auto source = reinterpret_cast<const UChar *>(in.data());
2211 auto sourceLimit = reinterpret_cast<const UChar *>(in.data() + in.size());
2212
2213 qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
2214
2215 char *target = out;
2216 char *targetLimit = out + length;
2217 UBool flush = false;
2218
2219 // If the QStringConverter was moved, the state that we used as a context is stale now.
2220 UConverterFromUCallback action;
2221 const void *context;
2222 ucnv_getFromUCallBack(icu_conv, &action, &context);
2223 if (context != state)
2224 ucnv_setFromUCallBack(icu_conv, action, state, nullptr, nullptr, &err);
2225
2226 ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit, nullptr, flush, &err);
2227 // We did reserve enough space:
2228 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
2229 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2230 if (auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
2231 ucnv_reset(icu_conv);
2232 state->invalidChars += leftOver;
2233 }
2234 }
2235 return target;
2236 }
2237
2238 Q_DISABLE_COPY_MOVE(QStringConverterICU)
2239
2240 template<qsizetype X>
2241 static qsizetype fromLen(qsizetype inLength)
2242 {
2243 return X * inLength * sizeof(UChar);
2244 }
2245
2246 static qsizetype toLen(qsizetype inLength)
2247 {
2248
2249 /* Assumption: each input char might map to a different codepoint
2250 Each codepoint can take up to 4 bytes == 2 QChar
2251 We can ignore reserving space for a BOM, as only UTF encodings use one
2252 and those are not handled by the ICU converter.
2253 */
2254 return 2 * inLength;
2255 }
2256
2257 static constexpr QStringConverter::Interface forLength[] = {
2258 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
2259 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
2260 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
2261 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
2262 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
2263 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
2264 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
2265 {"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
2266 };
2267
2268 static UConverter *createConverterForName(const char *name, const State *state)
2269 {
2270 Q_ASSERT(name);
2271 Q_ASSERT(state);
2272 QT_COM_THREAD_INIT
2273 UErrorCode status = U_ZERO_ERROR;
2274 UConverter *conv = ucnv_open(name, &status);
2275 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
2276 ucnv_close(conv);
2277 return nullptr;
2278 }
2279
2280 if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
2281 UErrorCode error = U_ZERO_ERROR;
2282
2283 auto nullToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2284 const char *, int32_t length,
2285 UConverterCallbackReason reason, UErrorCode *err) {
2286 if (reason <= UCNV_IRREGULAR) {
2287 *err = U_ZERO_ERROR;
2288 UChar c = '\0';
2289 ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
2290 // Recover outer scope's state (which isn't const) from context:
2291 auto state = const_cast<State *>(static_cast<const State *>(context));
2292 state->invalidChars += length;
2293 }
2294 };
2295 ucnv_setToUCallBack(conv, nullToSubstituter, state, nullptr, nullptr, &error);
2296
2297 auto nullFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2298 const UChar *, int32_t length,
2299 UChar32, UConverterCallbackReason reason, UErrorCode *err) {
2300 if (reason <= UCNV_IRREGULAR) {
2301 *err = U_ZERO_ERROR;
2302 const UChar replacement[] = { 0 };
2303 const UChar *stringBegin = std::begin(replacement);
2304 ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
2305 // Recover outer scope's state (which isn't const) from context:
2306 auto state = const_cast<State *>(static_cast<const State *>(context));
2307 state->invalidChars += length;
2308 }
2309 };
2310 ucnv_setFromUCallBack(conv, nullFromSubstituter, state, nullptr, nullptr, &error);
2311 } else {
2312 UErrorCode error = U_ZERO_ERROR;
2313
2314 auto qmarkToSubstituter = [](const void *context, UConverterToUnicodeArgs *toUArgs,
2315 const char *codeUnits,int32_t length,
2316 UConverterCallbackReason reason, UErrorCode *err) {
2317 if (reason <= UCNV_IRREGULAR) {
2318 // Recover outer scope's state (which isn't const) from context:
2319 auto state = const_cast<State *>(static_cast<const State *>(context));
2320 state->invalidChars += length;
2321 }
2322 // use existing ICU callback for logic
2323 UCNV_TO_U_CALLBACK_SUBSTITUTE(nullptr, toUArgs, codeUnits, length, reason, err);
2324
2325 };
2326 ucnv_setToUCallBack(conv, qmarkToSubstituter, state, nullptr, nullptr, &error);
2327
2328 auto qmarkFromSubstituter = [](const void *context, UConverterFromUnicodeArgs *fromUArgs,
2329 const UChar *codeUnits, int32_t length,
2330 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
2331 if (reason <= UCNV_IRREGULAR) {
2332 // Recover outer scope's state (which isn't const) from context:
2333 auto state = const_cast<State *>(static_cast<const State *>(context));
2334 state->invalidChars += length;
2335 }
2336 // use existing ICU callback for logic
2337 UCNV_FROM_U_CALLBACK_SUBSTITUTE(nullptr, fromUArgs, codeUnits, length,
2338 codePoint, reason, err);
2339 };
2340 ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state, nullptr, nullptr, &error);
2341 }
2342 return conv;
2343 }
2344
2345 static std::string nul_terminate_impl(QLatin1StringView name)
2346 { return name.isNull() ? std::string() : std::string{name.data(), size_t(name.size())}; }
2347
2348 static std::string nul_terminate_impl(QUtf8StringView name)
2349 { return nul_terminate_impl(QLatin1StringView{QByteArrayView{name}}); }
2350
2351 static std::string nul_terminate_impl(QStringView name)
2352 {
2353 std::string result;
2354 const auto convert = [&](char *p, size_t n) {
2355 const auto sz = QLatin1::convertFromUnicode(p, name) - p;
2356 Q_ASSERT(q20::cmp_less_equal(sz, n));
2357 return sz;
2358 };
2359#ifdef __cpp_lib_string_resize_and_overwrite
2360 result.resize_and_overwrite(size_t(name.size()), convert);
2361#else
2362 result.resize(size_t(name.size()));
2363 result.resize(convert(result.data(), result.size()));
2364#endif // __cpp_lib_string_resize_and_overwrite
2365 return result;
2366 }
2367
2368 static std::string nul_terminate(QAnyStringView name)
2369 { return name.visit([](auto name) { return nul_terminate_impl(name); }); }
2370
2371 static const QStringConverter::Interface *
2372 make_icu_converter(QStringConverter::State *state, QAnyStringView name)
2373 { return make_icu_converter(state, nul_terminate(name).data()); }
2374
2375 static const QStringConverter::Interface *make_icu_converter(
2376 QStringConverter::State *state,
2377 const char *name)
2378 {
2379 QT_COM_THREAD_INIT
2380 UErrorCode status = U_ZERO_ERROR;
2381 UConverter *conv = createConverterForName(name, state);
2382 if (!conv)
2383 return nullptr;
2384
2385 const char *icuName = ucnv_getName(conv, &status);
2386 // ucnv_getStandardName returns a name which is owned by the library
2387 // we can thus store it in the state without worrying aobut its lifetime
2388 const char *persistentName = ucnv_getStandardName(icuName, "MIME", &status);
2389 if (U_FAILURE(status) || !persistentName) {
2390 status = U_ZERO_ERROR;
2391 persistentName = ucnv_getStandardName(icuName, "IANA", &status);
2392 }
2393 state->d[1] = const_cast<char *>(persistentName);
2394 state->d[0] = conv;
2395 state->flags |= QStringConverter::Flag::UsesIcu;
2396 qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
2397 state->clearFn = QStringConverterICU::clear_function;
2398 if (maxCharSize > 8 || maxCharSize < 1) {
2399 qWarning("Encountered unexpected codec \"%s\" which requires >8x space", name);
2400 return nullptr;
2401 } else {
2402 return &forLength[maxCharSize - 1];
2403 }
2404
2405 }
2406
2407};
2408#endif
2409
2410/*!
2411 \internal
2412*/
2413QStringConverter::QStringConverter(QAnyStringView name, Flags f)
2414 : iface(nullptr), state(f)
2415{
2416 auto e = encodingForName(name);
2417 if (e)
2418 iface = encodingInterfaces + int(*e);
2419#if defined(QT_USE_ICU_CODECS)
2420 else
2421 iface = QStringConverterICU::make_icu_converter(&state, name);
2422#endif
2423}
2424
2425
2426const char *QStringConverter::name() const noexcept
2427{
2428 if (!iface)
2429 return nullptr;
2430 if (state.flags & QStringConverter::Flag::UsesIcu) {
2431#if defined(QT_USE_ICU_CODECS)
2432 return static_cast<const char*>(state.d[1]);
2433#else
2434 return nullptr;
2435#endif
2436 } else {
2437 return iface->name;
2438 }
2439}
2440
2441/*!
2442 \fn bool QStringConverter::isValid() const
2443
2444 Returns true if this is a valid string converter that can be used for encoding or
2445 decoding text.
2446
2447 Default constructed string converters or converters constructed with an unsupported
2448 name are not valid.
2449*/
2450
2451/*!
2452 \fn void QStringConverter::resetState()
2453
2454 Resets the internal state of the converter, clearing potential errors or partial
2455 conversions.
2456*/
2457
2458/*!
2459 \fn bool QStringConverter::hasError() const
2460
2461 Returns true if a conversion could not correctly convert a character. This could for example
2462 get triggered by an invalid UTF-8 sequence or when a character can't get converted due to
2463 limitations in the target encoding.
2464*/
2465
2466/*!
2467 \fn const char *QStringConverter::name() const
2468
2469 Returns the canonical name of the encoding this QStringConverter can encode or decode.
2470 Returns a nullptr if the converter is not valid.
2471 The returned name is UTF-8 encoded.
2472
2473 \sa isValid()
2474*/
2475
2476/*!
2477 Convert \a name to the corresponding \l Encoding member, if there is one.
2478
2479 If the \a name is not the name of a codec listed in the Encoding enumeration,
2480 \c{std::nullopt} is returned. Such a name may, none the less, be accepted by
2481 the QStringConverter constructor when Qt is built with ICU, if ICU provides a
2482 converter with the given name.
2483
2484 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2485 which was expected to be UTF-8-encoded.
2486*/
2487std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(QAnyStringView name) noexcept
2488{
2489 if (name.isEmpty())
2490 return std::nullopt;
2491 for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
2492 if (nameMatch(encodingInterfaces[i].name, name))
2493 return QStringConverter::Encoding(i);
2494 }
2495 if (nameMatch("latin1", name))
2496 return QStringConverter::Latin1;
2497 return std::nullopt;
2498}
2499
2500#ifndef QT_BOOTSTRAPPED
2501namespace QtPrivate {
2502// Note: Check isValid() on the QStringConverter before calling this with its
2503// state!
2504static int partiallyParsedDataCount(QStringConverter::State *state)
2505{
2506#if QT_CONFIG(icu)
2507 if (state->flags & QStringConverter::Flag::UsesIcu) {
2508 UConverter *converter = static_cast<UConverter *>(state->d[0]);
2509 if (!converter)
2510 return 0;
2511 UErrorCode err = U_ZERO_ERROR;
2512 auto leftOver = ucnv_fromUCountPending(converter, &err);
2513 // If there is an error, leftOver is -1, so no need for an additional
2514 // check.
2515 return std::max(leftOver, 0);
2516 }
2517#endif
2518 return q26::saturate_cast<int>(state->remainingChars);
2519}
2520} // namespace QtPrivate
2521
2522/*!
2523 Returns the encoding for the content of \a data if it can be determined.
2524 \a expectedFirstCharacter can be passed as an additional hint to help determine
2525 the encoding.
2526
2527 The returned optional is empty, if the encoding is unclear.
2528 */
2529std::optional<QStringConverter::Encoding>
2530QStringConverter::encodingForData(QByteArrayView data, char16_t expectedFirstCharacter) noexcept
2531{
2532 // someone set us up the BOM?
2533 qsizetype arraySize = data.size();
2534 if (arraySize > 3) {
2535 char32_t uc = qFromUnaligned<char32_t>(data.data());
2536 if (uc == qToBigEndian(char32_t(QChar::ByteOrderMark)))
2537 return QStringConverter::Utf32BE;
2538 if (uc == qToLittleEndian(char32_t(QChar::ByteOrderMark)))
2539 return QStringConverter::Utf32LE;
2540 if (expectedFirstCharacter) {
2541 // catch also anything starting with the expected character
2542 if (qToLittleEndian(uc) == expectedFirstCharacter)
2543 return QStringConverter::Utf32LE;
2544 else if (qToBigEndian(uc) == expectedFirstCharacter)
2545 return QStringConverter::Utf32BE;
2546 }
2547 }
2548
2549 if (arraySize > 2) {
2550 if (memcmp(data.data(), utf8bom, sizeof(utf8bom)) == 0)
2551 return QStringConverter::Utf8;
2552 }
2553
2554 if (arraySize > 1) {
2555 char16_t uc = qFromUnaligned<char16_t>(data.data());
2556 if (uc == qToBigEndian(char16_t(QChar::ByteOrderMark)))
2557 return QStringConverter::Utf16BE;
2558 if (uc == qToLittleEndian(char16_t(QChar::ByteOrderMark)))
2559 return QStringConverter::Utf16LE;
2560 if (expectedFirstCharacter) {
2561 // catch also anything starting with the expected character
2562 if (qToLittleEndian(uc) == expectedFirstCharacter)
2563 return QStringConverter::Utf16LE;
2564 else if (qToBigEndian(uc) == expectedFirstCharacter)
2565 return QStringConverter::Utf16BE;
2566 }
2567 }
2568 return std::nullopt;
2569}
2570
2571static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
2572{
2573 static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher("meta ");
2574 static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher("charset=");
2575
2576 QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
2577 qsizetype pos = metaSearcher.indexIn(header);
2578 if (pos != -1) {
2579 pos = charsetSearcher.indexIn(header, pos);
2580 if (pos != -1) {
2581 pos += qstrlen("charset=");
2582 if (pos < header.size() && (header.at(pos) == '\"' || header.at(pos) == '\''))
2583 ++pos;
2584
2585 qsizetype pos2 = pos;
2586 // The attribute can be closed with either """, "'", ">" or "/",
2587 // none of which are valid charset characters.
2588 while (++pos2 < header.size()) {
2589 char ch = header.at(pos2);
2590 if (ch == '\"' || ch == '\'' || ch == '>' || ch == '/') {
2591 QByteArray name = header.mid(pos, pos2 - pos);
2592 qsizetype colon = name.indexOf(':');
2593 if (colon > 0)
2594 name = name.left(colon);
2595 name = name.simplified();
2596 if (name == "unicode") // QTBUG-41998, ICU will return UTF-16.
2597 name = QByteArrayLiteral("UTF-8");
2598 if (!name.isEmpty())
2599 return name;
2600 }
2601 }
2602 }
2603 }
2604 return QByteArray();
2605}
2606
2607/*!
2608 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2609 order marks or a charset specifier in the HTML meta tag. If the optional is empty,
2610 the encoding specified is not supported by QStringConverter. If no encoding is
2611 detected, the method returns Utf8.
2612
2613 \sa QStringDecoder::decoderForHtml()
2614*/
2615std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2616{
2617 // determine charset
2618 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2619 if (encoding)
2620 // trust the initial BOM
2621 return encoding;
2622
2623 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2624 if (!encodingTag.isEmpty())
2625 return encodingForName(encodingTag);
2626
2627 return Utf8;
2628}
2629
2631{
2632#if !defined(QT_USE_ICU_CODECS)
2633 return QStringConverter::Encoding::LastEncoding;
2634#else
2635 QT_COM_THREAD_INIT
2636 /* icu contains also the names of what Qt provides
2637 except for the special Locale one (so add one for it)
2638 */
2639 return 1 + ucnv_countAvailable();
2640#endif
2641}
2642
2643/*!
2644 Returns a list of names of supported codecs. The names returned
2645 by this function can be passed to QStringEncoder's and
2646 QStringDecoder's constructor to create an encoder or decoder for
2647 the given codec.
2648
2649 This function may be used to obtain a listing of additional codecs beyond
2650 the standard ones. Support for additional codecs requires Qt be compiled
2651 with support for the ICU library.
2652
2653 \note The order of codecs is an internal implementation detail
2654 and not guaranteed to be stable.
2655 */
2656QStringList QStringConverter::availableCodecs()
2657{
2658 auto availableCodec = [](qsizetype index) -> QString
2659 {
2660 #if !defined(QT_USE_ICU_CODECS)
2661 return QString::fromLatin1(encodingInterfaces[index].name);
2662 #else
2663 if (index == 0) // "Locale", not provided by icu
2664 return QString::fromLatin1(
2665 encodingInterfaces[QStringConverter::Encoding::System].name);
2666 QT_COM_THREAD_INIT
2667 // this mirrors the setup we do to set a converters name
2668 UErrorCode status = U_ZERO_ERROR;
2669 auto icuName = ucnv_getAvailableName(int32_t(index - 1));
2670 const char *standardName = ucnv_getStandardName(icuName, "MIME", &status);
2671 if (U_FAILURE(status) || !standardName) {
2672 status = U_ZERO_ERROR;
2673 standardName = ucnv_getStandardName(icuName, "IANA", &status);
2674 }
2675 if (!standardName)
2676 standardName = icuName;
2677 return QString::fromLatin1(standardName);
2678 #endif
2679 };
2680
2681 qsizetype codecCount = availableCodecCount();
2682 QStringList result;
2683 result.reserve(codecCount);
2684 for (qsizetype i = 0; i < codecCount; ++i)
2685 result.push_back(availableCodec(i));
2686 return result;
2687}
2688
2689/*!
2690 \class QStringConverter::FinalizeResultBase
2691 \internal
2692*/
2693/*!
2694 \class QStringConverter::FinalizeResultChar
2695 \inmodule QtCore
2696 \since 6.11
2697 \reentrant
2698 \brief Holds the result of calling finalize() on QStringDecoder or
2699 QStringEncoder.
2700
2701 FinalizeResultChar<Char> is a template class where \a Char specifies
2702 the character type (typically \c char or \c char16_t).
2703
2704 This class is used to relay the result of the finalize() call or the reason
2705 why the call did not succeed.
2706*/
2707/*!
2708 \enum QStringConverter::FinalizeResultBase::Error
2709 \value NoError No error.
2710 \value InvalidCharacters The encoder successfully finalized, but encountered
2711 invalid characters either during finalization or some time earlier.
2712 \value NotEnoughSpace finalize() did \e{not} succeed, you must grow the
2713 buffer and call finalize() again.
2714*/
2715
2716/*!
2717 \variable QStringConverter::FinalizeResultChar::error
2718 Relays errors discovered during finalization.
2719*/
2720/*!
2721 \variable QStringConverter::FinalizeResultChar::next
2722 Points to the character position \e{following} the last-written character.
2723*/
2724/*!
2725 \variable QStringConverter::FinalizeResultChar::invalidChars
2726 The number of invalid characters that were previously counted in the state
2727 as well as any that were encountered during the call to finalize().
2728*/
2729
2730/*!
2731 \typedef QStringDecoder::FinalizeResult
2732
2733 This is an alias for QStringConverter::FinalizeResultChar<char16_t>.
2734*/
2735
2736/*!
2737 \typedef QStringDecoder::FinalizeResultQChar
2738
2739 This is an alias for QStringConverter::FinalizeResultChar<QChar>.
2740*/
2741
2742/*!
2743 \fn QStringDecoder::FinalizeResultQChar QStringDecoder::finalize(QChar *out, qsizetype maxlen)
2744 \fn QStringDecoder::FinalizeResult QStringDecoder::finalize(char16_t *out, qsizetype maxlen)
2745 \fn QStringDecoder::FinalizeResult QStringDecoder::finalize()
2746
2747 Signals to the decoder that no further data will arrive.
2748
2749 May also provide data from residual content that was pending decoding.
2750 When there is no residual data to account for, the return's \c error
2751 field will be set to \l {QStringConverter::FinalizeResultChar::error}
2752 {NoError}.
2753
2754 If \a out is supplied and non-null, it must have space in which up to
2755 \a maxlen characters may be written. Up to this many characters of
2756 residual output are written to this space, with the end indicated by
2757 the return-value's \c next field. Typically this residual data shall
2758 consist of one replacement character per remaining unconverted input
2759 character.
2760
2761 If all residual content has been delivered via \a out, if \a out is
2762 \nullptr, or if there is no residual data, the decoder is reset on
2763 return from finalize(). Otherwise, the remaining data can be retrieved
2764 or discarded by a further call to finalize().
2765
2766 \since 6.11
2767 \sa hasError(), appendToBuffer()
2768 */
2769auto QStringDecoder::finalize(char16_t *out, qsizetype maxlen) -> FinalizeResult
2770{
2771 int count = 0;
2772 if (isValid())
2773 count = QtPrivate::partiallyParsedDataCount(&state);
2774 using Error = FinalizeResult::Error;
2775 const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
2776 if (count == 0 || !out) {
2777 resetState();
2778 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2779 }
2780 if (maxlen < count)
2781 return { {}, out, invalidChars, Error::NotEnoughSpace };
2782
2783 const char16_t replacement = (state.flags & QStringConverter::Flag::ConvertInvalidToNull)
2784 ? QChar::Null
2785 : QChar::ReplacementCharacter;
2786 out = std::fill_n(out, count, replacement);
2787 resetState();
2788 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2789}
2790
2791/*!
2792 \typedef QStringEncoder::FinalizeResult
2793
2794 This is an alias for QStringConverter::FinalizeResultChar<char>.
2795*/
2796
2797/*!
2798 \fn QStringEncoder::FinalizeResult QStringEncoder::finalize(char *out, qsizetype maxlen)
2799 \fn QStringEncoder::FinalizeResult QStringEncoder::finalize()
2800
2801 Signals to the encoder that no further data will arrive.
2802
2803 May also provide data from residual content that was pending encoding.
2804 When there is no residual data to account for, the return's \c error
2805 field will be set to \l {QStringConverter::FinalizeResultChar::error}
2806 {NoError}.
2807
2808 If \a out is supplied and non-null, it must have space in which up to
2809 \a maxlen characters may be written. Up to this many characters of
2810 residual output are written to this space, with the end indicated by
2811 the return-value's \c next field. Typically this residual data shall
2812 consist of one replacement character per remaining unconverted input
2813 character. When using a stateful encoding, such as ISO-2022-JP, this may
2814 also write bytes to restore, or end, the current state in the character
2815 stream.
2816
2817 If all residual content has been delivered via \a out, if \a out is
2818 \nullptr, or if there is no residual data, the encoder is reset on
2819 return from finalize(). Otherwise, the remaining data can be retrieved
2820 or discarded by a further call to finalize().
2821
2822 \since 6.11
2823 \sa hasError(), appendToBuffer()
2824 */
2825auto QStringEncoder::finalize(char *out, qsizetype maxlen) -> QStringEncoder::FinalizeResult
2826{
2827 qsizetype count = 0;
2828 if (isValid())
2829 count = QtPrivate::partiallyParsedDataCount(&state);
2830 // For ICU we may be using a stateful codec that need to restore or finalize
2831 // some state, otherwise we have nothing to do with count == 0
2832 using Error = FinalizeResult::Error;
2833 const bool usesIcu = !!(state.flags & QStringConverter::Flag::UsesIcu) && !!state.d[0];
2834 const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
2835 if (!isValid() || (!count && !usesIcu) || !out) {
2836 resetState();
2837 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2838 }
2839
2840 if ((false)) {
2841#if defined(QT_USE_ICU_CODECS)
2842 } else if (usesIcu) {
2843 Q_ASSERT(out);
2844 auto *icu_conv = static_cast<UConverter *>(state.d[0]);
2845 Q_ASSERT(icu_conv); // bool usesIcu checks that the pointer is non-null
2846 UErrorCode err = U_ZERO_ERROR;
2847
2848 UBool flush = true;
2849
2850 // If the QStringConverter was moved, the state that we used as a context is stale now.
2851 UConverterFromUCallback action;
2852 const void *context;
2853 ucnv_getFromUCallBack(icu_conv, &action, &context);
2854 if (context != &state)
2855 ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
2856 const UChar *dummyInput = u"";
2857 const char *outEnd = out + maxlen;
2858 ucnv_fromUnicode(icu_conv, &out, outEnd, &dummyInput, dummyInput, nullptr, flush, &err);
2859 if (err == U_BUFFER_OVERFLOW_ERROR)
2860 return { {}, out, invalidChars, Error::NotEnoughSpace };
2861 resetState();
2862#endif
2863 } else if (!(state.flags & QStringConverter::Flag::ConvertInvalidToNull)) {
2864 /*
2865 We don't really know (in general) how the replacement character
2866 looks like in the target encoding. So we just encode 0xfffd, which
2867 is the Unicode replacement character.
2868 Use 4 as a best-guess for the upper-bound of how many characters
2869 would potentially be produced by the leftover UTF-16 characters in
2870 the state
2871 */
2872 constexpr QChar replacementCharacter = QChar::ReplacementCharacter;
2873 constexpr char16_t repl = replacementCharacter.unicode();
2874 constexpr std::array<char16_t, 4> replacement{ repl, repl, repl, repl };
2875 const qsizetype charactersToEncode = std::min(count, qsizetype(replacement.size()));
2876 if (maxlen < requiredSpace(charactersToEncode))
2877 return { {}, out, invalidChars, Error::NotEnoughSpace };
2878 // we don't want the incomplete data in the internal buffer; we're
2879 // flushing the buffer after all
2880 resetState();
2881 out = appendToBuffer(out, QStringView(replacement.data(), charactersToEncode));
2882 } else /* outputting Null characters for each remaining unconverted input character */ {
2883 if (maxlen < count)
2884 return { {}, out, invalidChars, Error::NotEnoughSpace };
2885 out = std::fill_n(out, count, '\0');
2886 resetState();
2887 }
2888 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2889}
2890
2891/*!
2892 Tries to determine the encoding of the HTML in \a data by looking at leading byte
2893 order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder
2894 matching the encoding. If the returned decoder is not valid,
2895 the encoding specified is not supported by QStringConverter. If no encoding is
2896 detected, the method returns a decoder for Utf8.
2897
2898 \sa isValid()
2899*/
2900QStringDecoder QStringDecoder::decoderForHtml(QByteArrayView data)
2901{
2902 // determine charset
2903 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2904 if (encoding)
2905 // trust the initial BOM
2906 return QStringDecoder(encoding.value());
2907
2908 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2909 if (!encodingTag.isEmpty())
2910 return QStringDecoder(encodingTag);
2911
2912 return QStringDecoder(Utf8);
2913}
2914#endif // !QT_BOOTSTRAPPED
2915
2916/*!
2917 Returns the canonical name for encoding \a e or \nullptr if \a e is an
2918 invalid value.
2919
2920 \note In Qt versions prior to 6.10, 6.9.1, 6.8.4 or 6.5.9, calling this
2921 function with an invalid argument resulted in undefined behavior. Since the
2922 above-mentioned Qt versions, it returns nullptr instead.
2923*/
2924const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e) noexcept
2925{
2926 auto i = size_t(e);
2927 if (Q_UNLIKELY(i >= std::size(encodingInterfaces)))
2928 return nullptr;
2929 return encodingInterfaces[i].name;
2930}
2931
2932/*!
2933 \class QStringEncoder
2934 \inmodule QtCore
2935 \brief The QStringEncoder class provides a state-based encoder for text.
2936 \reentrant
2937 \ingroup i18n
2938 \ingroup string-processing
2939
2940 A text encoder converts text from Qt's internal representation into an encoded
2941 text format using a specific encoding.
2942
2943 Converting a string from Unicode to the local encoding can be achieved
2944 using the following code:
2945
2946 \snippet code/src_corelib_text_qstringconverter.cpp 1
2947
2948 The encoder remembers any state that is required between calls, so converting
2949 data received in chunks, for example, when receiving it over a network, is just as
2950 easy, by calling the encoder whenever new data is available:
2951
2952 \snippet code/src_corelib_text_qstringconverter.cpp 3
2953
2954 The QStringEncoder object maintains state between chunks and therefore
2955 works correctly even if a UTF-16 surrogate character is split between
2956 chunks.
2957
2958 QStringEncoder objects can't be copied because of their internal state, but
2959 can be moved.
2960
2961 \sa QStringConverter, QStringDecoder
2962*/
2963
2964/*!
2965 \fn constexpr QStringEncoder::QStringEncoder(const Interface *i)
2966 \internal
2967*/
2968
2969/*!
2970 \fn constexpr QStringEncoder::QStringEncoder()
2971
2972 Default constructs an encoder. The default encoder is not valid,
2973 and can't be used for converting text.
2974*/
2975
2976/*!
2977 \fn constexpr QStringEncoder::QStringEncoder(Encoding encoding, Flags flags = Flag::Default)
2978
2979 Creates an encoder object using \a encoding and \a flags.
2980*/
2981
2982/*!
2983 \fn QStringEncoder::QStringEncoder(QAnyStringView name, Flags flags = Flag::Default)
2984
2985 Creates an encoder object using \a name and \a flags.
2986 If \a name is not the name of a known encoding an invalid converter will get created.
2987
2988 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
2989 which was expected to be UTF-8-encoded.
2990
2991 \sa isValid()
2992*/
2993
2994/*!
2995 \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::encode(const QString &in)
2996 \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::encode(QStringView in)
2997 \fn QStringEncoder::DecodedData<const QString &> QStringEncoder::operator()(const QString &in)
2998 \fn QStringEncoder::DecodedData<QStringView> QStringEncoder::operator()(QStringView in)
2999
3000 Converts \a in and returns a struct that is implicitly convertible to QByteArray.
3001
3002 \snippet code/src_corelib_text_qstringconverter.cpp 5
3003*/
3004
3005/*!
3006 \fn qsizetype QStringEncoder::requiredSpace(qsizetype inputLength) const
3007
3008 Returns the maximum amount of characters required to be able to process
3009 \a inputLength decoded data.
3010
3011 \sa appendToBuffer()
3012*/
3013
3014/*!
3015 \fn char *QStringEncoder::appendToBuffer(char *out, QStringView in)
3016
3017 Encodes \a in and writes the encoded result into the buffer
3018 starting at \a out. Returns a pointer to the end of the data written.
3019
3020 \note \a out must be large enough to be able to hold all the encoded data. Use
3021 requiredSpace() to determine the maximum size requirement to be able to encode
3022 \a in. This function may write to any bytes between \a out and \c{out +
3023 requiredSpace()}, including those past the returned end pointer.
3024
3025 \sa requiredSpace()
3026*/
3027
3028/*!
3029 \class QStringDecoder
3030 \inmodule QtCore
3031 \brief The QStringDecoder class provides a state-based decoder for text.
3032 \reentrant
3033 \ingroup i18n
3034 \ingroup string-processing
3035
3036 A text decoder converts text from an encoded text format that uses a specific encoding
3037 into Qt's internal representation.
3038
3039 Converting encoded data into a QString can be achieved
3040 using the following code:
3041
3042 \snippet code/src_corelib_text_qstringconverter.cpp 0
3043
3044 The decoder remembers any state that is required between calls, so converting
3045 data received in chunks, for example, when receiving it over a network, is just as
3046 easy, by calling the decoder whenever new data is available:
3047
3048 \snippet code/src_corelib_text_qstringconverter.cpp 2
3049
3050 The QStringDecoder object maintains state between chunks and therefore
3051 works correctly even if chunks are split in the middle of a multi-byte character
3052 sequence.
3053
3054 QStringDecoder objects can't be copied because of their internal state, but
3055 can be moved.
3056
3057 \sa QStringConverter, QStringEncoder
3058*/
3059
3060/*!
3061 \fn constexpr QStringDecoder::QStringDecoder(const Interface *i)
3062 \internal
3063*/
3064
3065/*!
3066 \fn constexpr QStringDecoder::QStringDecoder()
3067
3068 Default constructs a decoder. The default decoder is not valid,
3069 and can't be used for converting text.
3070*/
3071
3072/*!
3073 \fn constexpr QStringDecoder::QStringDecoder(Encoding encoding, Flags flags = Flag::Default)
3074
3075 Creates a decoder object using \a encoding and \a flags.
3076*/
3077
3078/*!
3079 \fn QStringDecoder::QStringDecoder(QAnyStringView name, Flags flags = Flag::Default)
3080
3081 Creates a decoder object using \a name and \a flags.
3082 If \a name is not the name of a known encoding an invalid converter will get created.
3083
3084 \note In Qt versions prior to 6.8, this function took only a \c{const char *},
3085 which was expected to be UTF-8-encoded.
3086
3087 \sa isValid()
3088*/
3089
3090/*!
3091 \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::operator()(const QByteArray &ba)
3092 \fn QStringDecoder::EncodedData<const QByteArray &> QStringDecoder::decode(const QByteArray &ba)
3093 \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::operator()(QByteArrayView ba)
3094 \fn QStringDecoder::EncodedData<QByteArrayView> QStringDecoder::decode(QByteArrayView ba)
3095
3096 Converts \a ba and returns a struct that is implicitly convertible to QString.
3097
3098
3099 \snippet code/src_corelib_text_qstringconverter.cpp 4
3100*/
3101
3102/*!
3103 \fn qsizetype QStringDecoder::requiredSpace(qsizetype inputLength) const
3104
3105 Returns the maximum amount of UTF-16 code units required to be able to process
3106 \a inputLength encoded data.
3107
3108 \sa appendToBuffer
3109*/
3110
3111/*!
3112 \fn QChar *QStringDecoder::appendToBuffer(QChar *out, QByteArrayView in)
3113
3114 Decodes the sequence of bytes viewed by \a in and writes the decoded result into
3115 the buffer starting at \a out. Returns a pointer to the end of data written.
3116
3117 \a out needs to be large enough to be able to hold all the decoded data. Use
3118 \l{requiredSpace} to determine the maximum size requirements to decode an encoded
3119 data buffer of \c in.size() bytes. This function may write to any bytes
3120 between \a out and \c{out + requiredSpace()}, including those past the
3121 returned end pointer.
3122
3123 \sa requiredSpace
3124*/
3125
3126/*!
3127 \fn char16_t *QStringDecoder::appendToBuffer(char16_t *out, QByteArrayView in)
3128 \since 6.6
3129 \overload
3130*/
3131
3132QT_END_NAMESPACE
\inmodule QtCore
static int partiallyParsedDataCount(QStringConverter::State *state)
#define __has_include(x)
static bool nameMatch(const char *a, QAnyStringView b)
static const uchar utf8bom[]
static QChar * fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
@ HeaderDone
static QChar * fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
static QChar * fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf8Len(qsizetype l)
static QChar * fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QChar * fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toLatin1Len(qsizetype l)
static bool nameMatch_impl_impl(const char *a, const Char *b, const Char *b_end)
static bool nameMatch_impl(const char *a, QLatin1StringView b)
static QChar * fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
static char * toUtf32(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf8Len(qsizetype l)
static char * toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
static qsizetype toUtf16Len(qsizetype l)
static qsizetype fromLatin1Len(qsizetype l)
static char * toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf32Len(qsizetype l)
static qsizetype availableCodecCount()
static QChar * fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf32Len(qsizetype l)
static char * toUtf16(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf16Len(qsizetype l)
static char * toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
static void appendUtf16(const NoOutput &, char16_t)
static void appendUcs4(const NoOutput &, char32_t)