6#include <qstringconverter.h>
7#include <private/qstringconverter_p.h>
10#include "private/qsimd_p.h"
11#include "private/qstringiterator_p.h"
12#include "private/qtools_p.h"
15#include <QtCore/qbytearraylist.h>
19#include <unicode/ucnv.h>
20#include <unicode/ucnv_cb.h>
21#include <unicode/ucnv_err.h>
22#include <unicode/ustring.h>
23#define QT_USE_ICU_CODECS
24#define QT_COM_THREAD_INIT
26#elif QT_CONFIG(winsdkicu)
29#include <private/qfunctions_win_p.h>
30#define QT_USE_ICU_CODECS
31#define QT_COM_THREAD_INIT qt_win_ensureComInitializedOnThisThread();
36#include <qt_windows.h>
37#ifndef QT_BOOTSTRAPPED
38#include <QtCore/qvarlengtharray.h>
39#include <QtCore/private/wcharhelpers_win_p.h>
41#include <QtCore/q20iterator.h>
50#include <QtCore/q20utility.h>
51#ifndef QT_BOOTSTRAPPED
52#include <QtCore/q26numeric.h>
// Make the QtMiscUtils helpers available unqualified in the code below.
57using namespace QtMiscUtils;
// Compile-time checks: QStringEncoder and QStringDecoder must each be
// nothrow move-constructible and nothrow move-assignable.
59static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
60static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
61static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
62static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
68#if defined(__SSE2__
) || defined(__ARM_NEON__)
// Computes the zero-based index of the most significant set bit of v.
// Two implementations are visible: std::bit_width(v) - 1 when the standard
// library advertises __cpp_lib_int_pow2 >= 202002L, and one derived from
// qCountLeadingZeroBits(v).
// NOTE(review): v == 0 is not handled specially in the visible code; callers
// in this file only pass masks they have tested to be non-zero - confirm.
69Q_ALWAYS_INLINE
static uint qBitScanReverse(
unsigned v)
noexcept
71#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
72 return std::bit_width(v) - 1;
// Alternative path: start from the number of leading zero bits.
74 uint result = qCountLeadingZeroBits(v);
// For a power-of-two bit width W, (count ^ (W - 1)) == (W - 1) - count, which
// turns "zeros above the top bit" into "index of the top bit".
78 result ^=
sizeof(
unsigned) * 8 - 1;
// x86 SIMD helper: narrows UTF-16 code units from [src, end) to single bytes at dst.
// The instruction set is selected at compile time from Cpu (AVX-512VL and AVX2
// branches are visible, plus SSE2-width code). dst, src and nextAscii are taken by
// reference and are updated through the helper lambdas.
// NOTE(review): several guards and every "return n/true/false" line are missing from
// this listing; the bool result is presumably "all processed units were ASCII" - confirm.
85template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE
static bool
86simdEncodeAscii(uchar *&dst,
const char16_t *&nextAscii,
const char16_t *&src,
const char16_t *end)
// Remaining input measured in bytes (two per UTF-16 code unit).
88 size_t sizeBytes =
reinterpret_cast<
const char *>(end) -
reinterpret_cast<
const char *>(src);
// Converts 16 code units: two unaligned 128-bit loads, pack to 16 bytes, store.
91 auto process16Chars = [](uchar *dst,
const char16_t *src) {
92 __m128i data1 = _mm_loadu_si128((
const __m128i*)src);
93 __m128i data2 = _mm_loadu_si128(1+(
const __m128i*)src);
// packus saturates signed 16-bit lanes into unsigned bytes; the signed "> 0" compare
// is then true only for bytes 0x01..0x7f.
103 __m128i packed = _mm_packus_epi16(data1, data2);
104 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
// The store is issued before the mask is inspected.
107 _mm_storeu_si128((__m128i*)dst, packed);
// After inversion, a set bit marks a lane that is NOT in 0x01..0x7f (this includes NUL).
110 ushort n = ~_mm_movemask_epi8(nonAscii);
// Interprets a lane mask n; offset is a caller-supplied adjustment (use not visible here).
113 auto maybeFoundNonAscii = [&](
auto n, qptrdiff offset = 0) {
// nextAscii: one past the highest flagged lane.
120 nextAscii = src + qBitScanReverse(n) + 1;
// n becomes the index of the lowest flagged lane.
122 n = qCountTrailingZeroBits(n);
// Moves dst forward by the number of code units in the input.
129 auto adjustToEnd = [&] {
130 dst += sizeBytes /
sizeof(
char16_t);
134 if constexpr (Cpu & CpuFeatureAVX2) {
139 constexpr size_t Step = 32;
// 32 code units per call. packus works within each 128-bit half, so the 64-bit
// quarters are permuted (order 0,2,1,3) to restore source order before storing.
140 auto process32Chars = [](
const char16_t *src, uchar *dst) {
141 __m256i data1 = _mm256_loadu_si256(
reinterpret_cast<
const __m256i *>(src));
142 __m256i data2 = _mm256_loadu_si256(
reinterpret_cast<
const __m256i *>(src) + 1);
143 __m256i packed = _mm256_packus_epi16(data1, data2);
144 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
145 __m256i nonAscii = _mm256_cmpgt_epi8(permuted, _mm256_setzero_si256());
148 _mm256_storeu_si256(
reinterpret_cast<__m256i *>(dst), permuted);
// Inverted mask: set bits mark lanes outside 0x01..0x7f.
150 return ~_mm256_movemask_epi8(nonAscii);
153 if constexpr (Cpu & CpuFeatureAVX512VL) {
// Input of at most one Step: handled in one pass with masked loads and a masked store.
155 if (sizeBytes <= Step *
sizeof(
char16_t)) {
// mask has one low bit set per code unit present.
156 uint mask = _bzhi_u32(-1, uint(sizeBytes / 2));
157 __m256i data1 = _mm256_maskz_loadu_epi16(mask, src);
158 __m256i data2 = _mm256_maskz_loadu_epi16(mask >> 16, src + Step / 2);
159 __m256i packed = _mm256_packus_epi16(data1, data2);
160 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
// Signed "<= 0" under the mask flags the same lanes as the inverted "> 0" test above.
161 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, permuted, _mm256_setzero_si256());
164 _mm256_mask_storeu_epi8(dst, mask, permuted);
166 return maybeFoundNonAscii(nonAscii);
172 if (sizeBytes >= Step *
sizeof(
char16_t)) {
// Full 32-unit chunks; the strict '<' leaves the last chunk for the call after the loop.
175 for ( ; (offset + Step) *
sizeof(
char16_t) < sizeBytes; offset += Step) {
176 if (uint n = process32Chars(src + offset, dst + offset))
177 return maybeFoundNonAscii(n, offset);
// Final chunk addressed backwards by Step from src/dst.
// NOTE(review): presumably src/dst were moved to the end just before this (adjustToEnd
// call not visible), making this chunk overlap the loop's output - confirm.
182 uint n = process32Chars(src - Step, dst - Step);
183 return maybeFoundNonAscii(n, -
int(Step));
// 16-unit (128-bit) variant of the same loop-plus-final-chunk scheme.
187 constexpr size_t Step = 16;
188 if (sizeBytes >= Step *
sizeof(
char16_t)) {
191 for ( ; (offset + Step) *
sizeof(
char16_t) < sizeBytes; offset += Step) {
192 ushort n = process16Chars(dst + offset, src + offset);
// NOTE(review): the condition guarding this return is not visible in this listing.
194 return maybeFoundNonAscii(n, offset);
195 if (Cpu & CpuFeatureAVX2)
201 ushort n = process16Chars(dst - Step, src - Step);
202 return maybeFoundNonAscii(n, -
int(Step));
// Short-input paths, compiled only when not optimizing for size.
205# if !defined(__OPTIMIZE_SIZE__)
206 if (sizeBytes >= 8 *
sizeof(
char16_t)) {
// Two 8-unit windows: one at the start, one ending exactly at 'end' (they may overlap).
208 __m128i data = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src));
209 __m128i data2 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(end - 8));
210 __m128i packed = _mm_packus_epi16(data, data);
211 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
// Only the low 8 bytes are stored.
214 _mm_storel_epi64(
reinterpret_cast<__m128i *>(dst), packed);
// Truncation to uchar keeps the 8 lane bits of the inverted mask.
216 uchar n = ~_mm_movemask_epi8(nonAscii);
218 return maybeFoundNonAscii(n);
// Second window, written 8 bytes before dst.
221 packed = _mm_packus_epi16(data2, data2);
222 nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
223 _mm_storel_epi64(
reinterpret_cast<__m128i *>(dst - 8), packed);
224 n = ~_mm_movemask_epi8(nonAscii);
225 return maybeFoundNonAscii(n, -8);
226 }
else if (sizeBytes >= 4 *
sizeof(
char16_t)) {
// Same idea with two 4-unit windows (64-bit loads, 32-bit stores).
228 __m128i data1 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(src));
229 __m128i data2 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(end - 4));
230 __m128i packed = _mm_packus_epi16(data1, data1);
231 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
234 qToUnaligned(_mm_cvtsi128_si32(packed), dst);
// XOR with 0xf inverts just the four lane bits that carry data.
236 uchar n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
238 return maybeFoundNonAscii(n);
241 packed = _mm_packus_epi16(data2, data2);
242 nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
243 qToUnaligned(_mm_cvtsi128_si32(packed), dst - 4);
244 n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
245 return maybeFoundNonAscii(n, -4);
// x86 SIMD helper: widens bytes from [src, end) to UTF-16 code units at dst and
// detects bytes with the top bit set. Mirrors the encoder helper: AVX-512VL and AVX2
// branches selected from Cpu, plus SSE2-width code. dst, src and nextAscii are
// references updated through the helper lambdas.
// NOTE(review): guards and most return statements are missing from this listing;
// the bool result is presumably "all processed bytes were ASCII" - confirm.
252template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE
static bool
253simdDecodeAscii(
char16_t *&dst,
const uchar *&nextAscii,
const uchar *&src,
const uchar *end)
// Widens 16 bytes to 16 code units.
256 auto process16Chars = [](
char16_t *dst,
const uchar *src) {
257 __m128i data = _mm_loadu_si128((
const __m128i*)src);
// movemask collects the top bit of each byte: non-zero means a byte >= 0x80 is present.
261 uint n = _mm_movemask_epi8(data);
// Interleaving with zero zero-extends each byte to 16 bits; both halves are stored.
264 _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
265 _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
// Interprets a lane mask n; offset is a caller-supplied adjustment (use not visible here).
268 auto maybeFoundNonAscii = [&](uint n, qptrdiff offset = 0) {
// c: index of the lowest flagged byte.
273 uint c = qCountTrailingZeroBits(n);
// nextAscii: one past the highest flagged byte.
276 n = qBitScanReverse(n);
277 nextAscii = src + n + 1;
283 auto adjustToEnd = [&] {
288 if constexpr (Cpu & CpuFeatureAVX2) {
289 constexpr qsizetype Step = 32;
// Widens 32 bytes per call. The result object converts to bool (any top bit seen)
// and to uint (32-bit per-byte mask, second half in the upper 16 bits).
290 auto process32Chars = [](
char16_t *dst,
const uchar *src) {
291 __m128i data1 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src));
292 __m128i data2 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src) + 1);
// OR-ing both halves first gives a single "anything non-ASCII?" test.
297 __m128i ored = _mm_or_si128(data1, data2);
298 bool any = _mm_movemask_epi8(ored);
301 __m256i extended1 = _mm256_cvtepu8_epi16(data1);
302 __m256i extended2 = _mm256_cvtepu8_epi16(data2);
303 _mm256_storeu_si256(
reinterpret_cast<__m256i *>(dst), extended1);
304 _mm256_storeu_si256(
reinterpret_cast<__m256i *>(dst) + 1, extended2);
306 uint n1 = _mm_movemask_epi8(data1);
307 uint n2 = _mm_movemask_epi8(data2);
311 operator
bool()
const {
return any; }
312 operator uint()
const {
return n1|(n2 << 16); }
314 return R{ n1, n2, any };
317 if constexpr (Cpu & CpuFeatureAVX512VL) {
// Input of at most one Step: a single masked load / masked store pass.
319 if (end - src <= Step) {
// One low mask bit per input byte.
320 __mmask32 mask = _bzhi_u32(-1, uint(end - src));
321 __m256i data = _mm256_maskz_loadu_epi8(mask, src);
// Signed "<= 0" under the mask flags bytes >= 0x80 and also NUL bytes.
322 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, data, _mm256_setzero_si256());
// Widen the low and high 16 bytes separately, then store only the lanes present.
325 __m256i extended1 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data));
326 __m256i extended2 = _mm256_cvtepu8_epi16(_mm256_extracti64x2_epi64(data, 1));
327 _mm256_mask_storeu_epi16(dst, mask, extended1);
328 _mm256_mask_storeu_epi16(dst + Step/2, mask >> 16, extended2);
330 return maybeFoundNonAscii(nonAscii);
336 if (end - src >= Step) {
// Full 32-byte chunks; the strict '<' leaves the last chunk for the call after the loop.
339 for ( ; offset + Step < end - src; offset += Step) {
340 auto r = process32Chars(dst + offset, src + offset);
// NOTE(review): the condition guarding this return is not visible in this listing.
342 return maybeFoundNonAscii(r, offset);
// Final chunk addressed backwards by Step from src/dst.
// NOTE(review): presumably preceded by an adjustToEnd() call that is not visible - confirm.
347 auto r = process32Chars(dst - Step, src - Step);
348 return maybeFoundNonAscii(r, -Step);
// 16-byte (128-bit) variant of the same scheme.
352 constexpr qsizetype Step = 16;
353 if (end - src >= Step) {
355 for ( ; offset + Step < end - src; offset += Step) {
356 ushort n = process16Chars(dst + offset, src + offset);
358 return maybeFoundNonAscii(n, offset);
359 if (Cpu & CpuFeatureAVX2)
365 return maybeFoundNonAscii(process16Chars(dst - Step, src - Step), -Step);
// Short-input paths, compiled only when not optimizing for size.
368# if !defined(__OPTIMIZE_SIZE__)
369 if (end - src >= 8) {
// Two 8-byte windows: one at the start, one ending exactly at 'end' (they may overlap).
370 __m128i data = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(src));
371 __m128i data2 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(end - 8));
// Only the low 8 mask bits correspond to loaded bytes.
372 uint n = _mm_movemask_epi8(data) & 0xff;
374 _mm_storeu_si128(
reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
376 return maybeFoundNonAscii(n);
// Second window, written 8 code units before dst.
380 n = _mm_movemask_epi8(data2) & 0xff;
381 data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
382 _mm_storeu_si128(
reinterpret_cast<__m128i *>(dst - 8), data2);
383 return maybeFoundNonAscii(n, -8);
385 if (end - src >= 4) {
// Same idea with two 4-byte windows read through unaligned 32-bit loads.
386 __m128i data = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src));
387 __m128i data2 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(end - 4));
388 uchar n = uchar(_mm_movemask_epi8(data) & 0xf);
390 data = _mm_unpacklo_epi8(data, _mm_setzero_si128());
391 _mm_storel_epi64(
reinterpret_cast<__m128i *>(dst), data);
393 return maybeFoundNonAscii(n);
397 n = uchar(_mm_movemask_epi8(data2) & 0xf);
398 data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
399 _mm_storel_epi64(
reinterpret_cast<__m128i *>(dst - 4), data2);
400 return maybeFoundNonAscii(n, -4);
// x86 SIMD scan of [src, end) for a byte with the top bit set. On a hit it returns a
// pointer to the first such byte of the current chunk and sets nextAscii to one past
// the last such byte in that chunk.
// NOTE(review): the "nothing found" return and the preprocessor guard around the
// 256-bit loop are not visible in this listing.
407static inline const uchar *simdFindNonAscii(
const uchar *src,
const uchar *end,
const uchar *&nextAscii)
// 0x80 in every byte: selects the top bit of each lane.
412 const __m256i mask = _mm256_set1_epi8(
char(0x80));
// 32 bytes per iteration.
413 for ( ; end - src >= 32; src += 32) {
414 __m256i data = _mm256_loadu_si256(
reinterpret_cast<
const __m256i *>(src));
// testz is true when (mask & data) is all zero, i.e. every byte is below 0x80.
415 if (_mm256_testz_si256(mask, data))
// One mask bit per byte with its top bit set.
418 uint n = _mm256_movemask_epi8(data);
424 nextAscii = src + qBitScanReverse(n) + 1;
427 return src + qCountTrailingZeroBits(n);
// 16 bytes per iteration.
432 for ( ; end - src >= 16; src += 16) {
433 __m128i data = _mm_loadu_si128(
reinterpret_cast<
const __m128i*>(src));
437 uint n = _mm_movemask_epi8(data);
444 nextAscii = src + qBitScanReverse(n) + 1;
447 return src + qCountTrailingZeroBits(n);
// 4 bytes per iteration through an unaligned 32-bit load (the test on 'data' is not visible).
451 for ( ; end - src >= 4; src += 4) {
452 quint32 data = qFromUnaligned<quint32>(src);
// x86 SIMD prefix comparison of an 8-bit string against a UTF-16 string. src8 and
// src16 are references; the visible code computes how far the two inputs match
// (offset) while the 8-bit side stays below 0x80.
// NOTE(review): the declarations of offset, mask and bitSpacing, the "if (mask) break"
// guards and the final pointer updates are not visible in this listing.
469static void simdCompareAscii(
const qchar8_t *&src8,
const qchar8_t *end8,
const char16_t *&src16,
const char16_t *end16)
// Only the common length of both inputs is examined.
472 qptrdiff len = qMin(end8 - src8, end16 - src16);
// 16 characters per iteration.
477 for ( ; offset + 16 < len; offset += 16) {
478 __m128i data8 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src8 + offset));
// 256-bit variant: one load covers all 16 UTF-16 units.
481 __m256i data16 = _mm256_loadu_si256(
reinterpret_cast<
const __m256i *>(src16 + offset));
// Zero-extend the bytes; a non-zero movemask then means some byte had its top bit set.
484 __m256i datax8 = _mm256_cvtepu8_epi16(data8);
485 mask = _mm256_movemask_epi8(datax8);
// Lane-wise equality; after inversion, set bits mark mismatching lanes.
490 __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
491 mask = ~_mm256_movemask_epi8(latin1cmp);
// 128-bit variant: the 16 UTF-16 units are loaded as two halves.
496 __m128i datalo16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src16 + offset));
497 __m128i datahi16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src16 + offset) + 1);
500 __m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
501 __m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());
504 __m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
505 __m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
// Combine both equality masks into one 32-bit value (high half in the upper 16 bits).
506 mask = _mm_movemask_epi8(latin1cmphi) << 16;
507 mask |= ushort(_mm_movemask_epi8(latin1cmplo));
// Top bits of the raw bytes: non-zero if any byte is >= 0x80.
513 mask = _mm_movemask_epi8(data8);
// Compares n (< 16) characters already loaded into data8/data16 and leaves the result in mask.
522 auto cmp_lt_16 = [&mask, &offset](
int n, __m128i data8, __m128i data16) {
// Two movemask bits per 16-bit lane, so n lanes occupy the low 2*n bits.
525 unsigned sizemask = (1U << (2 * n)) - 1;
528 data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
531 __m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
// Mismatches within the n lanes ...
532 mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
// ... plus lanes whose (zero-extended) byte has its top bit set.
533 mask |= _mm_movemask_epi8(data8);
// 8-character step, taken only while no difference has been recorded.
539 if (mask == 0 && offset + 8 < len) {
540 __m128i data8 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(src8 + offset));
541 __m128i data16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src16 + offset));
542 cmp_lt_16(8, data8, data16);
// 4-character step.
546 if (mask == 0 && offset + 4 < len) {
547 __m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
548 __m128i data16 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(src16 + offset));
549 cmp_lt_16(4, data8, data16);
// Convert the lowest set mask bit to a character count; bitSpacing scales mask bits to lanes.
554 offset += qCountTrailingZeroBits(mask) >> bitSpacing;
558#elif defined(__ARM_NEON__)
// ARM NEON variant: narrows UTF-16 code units to bytes, 16 units per iteration, and
// builds a bitmask of the units above 0x7f.
// NOTE(review): the guard on nonAscii, the pointer updates after a hit and the return
// statements are not visible in this listing.
559static inline bool simdEncodeAscii(uchar *&dst,
const char16_t *&nextAscii,
const char16_t *&src,
const char16_t *end)
561 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
// Per-lane bit weights: mask1 holds the even bit positions, mask2 (shifted by one) the odd ones.
562 uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 );
563 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
566 for ( ; end - src >= 16; src += 16, dst += 16) {
// De-interleaving load: val[0] receives the even-indexed units, val[1] the odd-indexed ones.
568 uint16x8x2_t in = vld2q_u16(
reinterpret_cast<
const uint16_t *>(src));
// Horizontal add of the weighted compare results gives one bit per input unit, in input order.
572 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
573 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
// Shift-left-insert places each odd unit's low byte above the matching even unit's low byte.
576 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
579 vst1q_u8(dst, vreinterpretq_u8_u16(out));
// nextAscii: one past the highest flagged unit.
585 nextAscii = src + qBitScanReverse(nonAscii) + 1;
// nonAscii becomes the index of the lowest flagged unit.
587 nonAscii = qCountTrailingZeroBits(nonAscii);
// ARM NEON variant: widens bytes to UTF-16 code units, 8 bytes per iteration, and
// builds a bitmask of the bytes >= 0x80.
// NOTE(review): the guard on n and the return statements are not visible in this listing.
596static inline bool simdDecodeAscii(
char16_t *&dst,
const uchar *&nextAscii,
const uchar *&src,
const uchar *end)
599 uint8x8_t msb_mask = vdup_n_u8(0x80);
// Bit weight for each of the eight lanes.
600 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
601 for ( ; end - src >= 8; src += 8, dst += 8) {
602 uint8x8_t c = vld1_u8(src);
// One bit per byte that is >= 0x80.
603 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
// Zero-extend the eight bytes to 16 bits and store them.
606 vst1q_u16(
reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
// nextAscii: one past the highest flagged byte.
619 n = qBitScanReverse(n);
620 nextAscii = src + n + 1;
// ARM NEON variant: scans [src, end) eight bytes at a time for a byte >= 0x80.
// On a hit, returns a pointer to the first such byte of the chunk and sets nextAscii
// to one past the last such byte in that chunk.
// NOTE(review): the guard on n and the "nothing found" return are not visible here.
627static inline const uchar *simdFindNonAscii(
const uchar *src,
const uchar *end,
const uchar *&nextAscii)
635 uint8x8_t msb_mask = vdup_n_u8(0x80);
// Bit weight for each of the eight lanes.
636 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
637 for ( ; end - src >= 8; src += 8) {
638 uint8x8_t c = vld1_u8(src);
// One bit per byte that is >= 0x80.
639 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
646 nextAscii = src + qBitScanReverse(n) + 1;
649 return src + qCountTrailingZeroBits(n);
// simdCompareAscii for the ARM NEON section. All four parameters are unnamed.
// NOTE(review): the body is not visible in this listing; presumably a no-op - confirm.
655static void simdCompareAscii(
const qchar8_t *&,
const qchar8_t *,
const char16_t *&,
const char16_t *)
// Third definition of simdEncodeAscii in this file; here every parameter is unnamed and
// taken by value rather than by reference.
// NOTE(review): presumably the non-SIMD fallback (its preprocessor branch and body are
// not visible in this listing) - confirm.
659static inline bool simdEncodeAscii(uchar *,
const char16_t *,
const char16_t *,
const char16_t *)
// Third definition of simdDecodeAscii in this file; here every parameter is unnamed and
// taken by value rather than by reference.
// NOTE(review): presumably the non-SIMD fallback (its preprocessor branch and body are
// not visible in this listing) - confirm.
664static inline bool simdDecodeAscii(
char16_t *,
const uchar *,
const uchar *,
const uchar *)
// Third definition of simdFindNonAscii in this file, with the same signature as the
// SIMD variants.
// NOTE(review): presumably the non-SIMD fallback (its preprocessor branch and body are
// not visible in this listing) - confirm.
669static inline const uchar *simdFindNonAscii(
const uchar *src,
const uchar *end,
const uchar *&nextAscii)
// Third definition of simdCompareAscii in this file; all parameters are unnamed.
// NOTE(review): presumably the non-SIMD fallback (its preprocessor branch and body are
// not visible in this listing) - confirm.
675static void simdCompareAscii(
const qchar8_t *&,
const qchar8_t *,
const char16_t *&,
const char16_t *)
// Core UTF-16 -> UTF-8 encoder shared by the public overloads.
// Encodes `in` into the buffer at `out` and returns a pointer one past the last byte
// written. onError(dst, u, res) is called when QUtf8Functions::toUtf8 reports res < 0.
// NOTE(review): the outer loop header, the read of `u` and the opening "do" are not
// visible in this listing.
682template <
typename OnErrorLambda> Q_ALWAYS_INLINE
683char *QUtf8::convertFromUnicode(
char *out, QStringView in, OnErrorLambda &&onError)
noexcept
685 qsizetype len = in.size();
687 uchar *dst =
reinterpret_cast<uchar *>(out);
688 const char16_t *src =
reinterpret_cast<
const char16_t *>(in.data());
689 const char16_t *
const end = src + len;
// nextAscii starts at end; the SIMD helper may move it to where ASCII is expected to resume.
692 const char16_t *nextAscii = end;
// SIMD path for runs of ASCII; it advances dst and src itself.
693 if (simdEncodeAscii(dst, nextAscii, src, end))
// Scalar path: encode one code unit `u`; toUtf8 may consume further units from src.
698 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
699 if (Q_UNLIKELY(res < 0))
700 onError(dst, u, res);
// Stay in the scalar loop until src reaches the point recorded by the SIMD helper.
701 }
while (src < nextAscii);
704 return reinterpret_cast<
char *>(dst);
// Stateless overload: forwards to the templated encoder with a generic error callback
// that receives the output cursor and ignores its remaining arguments.
// NOTE(review): the callback body is not visible in this listing.
707char *QUtf8::convertFromUnicode(
char *dst, QStringView in)
noexcept
709 return convertFromUnicode(dst, in, [](
auto *dst, ...) {
// Encodes `in` as UTF-8 into a new QByteArray.
// NOTE(review): the return statement is not visible in this listing.
715QByteArray QUtf8::convertFromUnicode(QStringView in)
717 qsizetype len = in.size();
// Allocate three bytes per UTF-16 code unit up front, uninitialized.
720 QByteArray result(len * 3, Qt::Uninitialized);
// Write through the constData() pointer (const cast away) instead of calling data().
721 char *dst =
const_cast<
char *>(result.constData());
722 dst = convertFromUnicode(dst, in);
// Shrink to the number of bytes actually produced.
723 result.truncate(dst - result.constData());
// Stateful convenience overload: encodes `in` as UTF-8 into a new QByteArray using `state`.
// NOTE(review): the return statement is not visible in this listing.
727QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverter::State *state)
// Three bytes per code unit plus three extra bytes (the BOM written by the callee is 3 bytes).
729 QByteArray ba(3*in.size() +3, Qt::Uninitialized);
730 char *end = convertFromUnicode(ba.data(), in, state);
// Shrink to the number of bytes actually produced.
731 ba.truncate(end - ba.data());
// Stateful UTF-16 -> UTF-8 encoder writing into a caller-provided buffer.
// Uses `state` to carry a pending code unit between calls, to emit the UTF-8 BOM once,
// and to count invalid input. Returns the result of the templated encoder.
// NOTE(review): several guards (e.g. the empty-input check and the test on `res`) are
// not visible in this listing.
735char *QUtf8::convertFromUnicode(
char *out, QStringView in, QStringConverter::State *state)
738 qsizetype len = in.size();
// Writes the substitute for an unencodable unit at `cursor` and returns the new cursor;
// the ConvertInvalidToNull flag selects which substitute (the bytes themselves are not visible).
742 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
743 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
754 uchar *cursor =
reinterpret_cast<uchar *>(out);
755 const char16_t *src = in.utf16();
756 const char16_t *
const end = src + len;
// Carry-over handling applies only when the Stateless flag is not set.
758 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
759 if (state->remainingChars) {
// A code unit saved by a previous call is encoded together with the new input.
760 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
// NOTE(review): presumably guarded by a check of `res` that is not visible - confirm.
762 cursor = appendReplacementChar(cursor);
// The pending unit has been consumed.
763 state->state_data[0] = 0;
764 state->remainingChars = 0;
765 }
else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
// Emit the three-byte UTF-8 BOM once and remember that it was written.
767 *cursor++ = utf8bom[0];
768 *cursor++ = utf8bom[1];
769 *cursor++ = utf8bom[2];
770 state->internalState |= HeaderDone;
774 out =
reinterpret_cast<
char *>(cursor);
// Encode the rest of the input; the callback decides what to do with failures.
775 return convertFromUnicode(out, { src, end }, [&](uchar *&cursor,
char16_t uc,
int res) {
776 if (res == QUtf8BaseTraits::Error) {
// Invalid unit: count it and write the substitute.
778 ++state->invalidChars;
779 cursor = appendReplacementChar(cursor);
780 }
else if (res == QUtf8BaseTraits::EndOfString) {
// Input ended before the unit could be completed.
781 if (state->flags & QStringConverter::Flag::Stateless) {
// Stateless: nothing can be carried over, so treat it as invalid.
782 ++state->invalidChars;
783 cursor = appendReplacementChar(cursor);
// Otherwise save the unit in the state for the next call.
785 state->remainingChars = 1;
786 state->state_data[0] = uc;
// Encodes a Latin-1 string as UTF-8 into the buffer at `out`, one input byte at a time.
// NOTE(review): the branch for bytes below 0x80 and the return statement are not
// visible in this listing; the two stores below are presumably the non-ASCII case.
792char *QUtf8::convertFromLatin1(
char *out, QLatin1StringView in)
795 for (uchar ch : in) {
// Two-byte UTF-8 sequence: lead byte 110xxxxx carries the top two bits of ch ...
800 *out++ = 0b110'0'0000u | (ch >> 6);
// ... and continuation byte 10xxxxxx carries the low six bits.
801 *out++ = 0b10'00'0000u | (ch & 0b0011'1111);
// Decodes UTF-8 input into a new QString.
// NOTE(review): the return statement is not visible in this listing.
807QString QUtf8::convertToUnicode(QByteArrayView in)
// Allocate one QChar per input byte up front, uninitialized.
821 QString result(in.size(), Qt::Uninitialized);
// Write through the constData() pointer (const cast away) instead of calling data().
822 QChar *data =
const_cast<QChar*>(result.constData());
823 const QChar *end = convertToUnicode(data, in);
// Shrink to the number of code units actually produced.
824 result.truncate(end - data);
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
// Stateless UTF-8 -> UTF-16 decoder writing into a caller-provided buffer.
// A leading UTF-8 BOM is skipped; the error callback passed to the templated decoder
// writes QChar::ReplacementCharacter.
// NOTE(review): the rest of the callback (e.g. its return value) is not visible here.
847char16_t *QUtf8::convertToUnicode(
char16_t *dst, QByteArrayView in)
noexcept
850 auto bom = QByteArrayView::fromArray(utf8bom);
// Drop the BOM when the input starts with it.
851 if (in.size() >= bom.size() && in.first(bom.size()) == bom)
852 in.slice(
sizeof(utf8bom));
854 return convertToUnicode(dst, in, [](
char16_t *&dst, ...) {
856 *dst++ = QChar::ReplacementCharacter;
// Core UTF-8 -> UTF-16 decoder shared by the public overloads.
// Decodes `in` into `dst`. onError(dst, src, res) is consulted when
// QUtf8Functions::fromUtf8 does not report success (res < 0).
// NOTE(review): the outer loop header, the read of byte `b`, the statements after each
// `if`, and the final return are not visible in this listing.
861template <
typename OnErrorLambda> Q_ALWAYS_INLINE
char16_t *
862QUtf8::convertToUnicode(
char16_t *dst, QByteArrayView in, OnErrorLambda &&onError)
noexcept
864 const uchar *
const start =
reinterpret_cast<
const uchar *>(in.data());
865 const uchar *src = start;
866 const uchar *end = src + in.size();
// nextAscii starts at end; the SIMD helper may move it to where ASCII is expected to resume.
869 const uchar *nextAscii = end;
// SIMD path for runs of ASCII; it advances dst and src itself.
872 if (simdDecodeAscii(dst, nextAscii, src, end))
// Scalar path: decode the sequence that starts with byte `b`.
877 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
878 if (Q_LIKELY(res >= 0))
// On failure the callback's boolean result is tested (the consequence is not visible here).
881 if (!onError(dst, src, res))
// Stay in the scalar loop until src reaches the point recorded by the SIMD helper.
883 }
while (src < nextAscii);
// Stateful convenience overload: decodes UTF-8 input into a new QString using `state`.
// NOTE(review): lines between the signature and the allocation, and the return
// statement, are not visible in this listing.
889QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
// One QChar per input byte plus one extra, uninitialized.
901 QString result(in.size() + 1, Qt::Uninitialized);
902 QChar *end = convertToUnicode(result.data(), in, state);
// Shrink to the number of code units actually produced.
903 result.truncate(end - result.constData());
// Stateful UTF-8 -> UTF-16 decoder writing into a caller-provided buffer.
// `state` carries the bytes of an incomplete sequence between calls, records whether
// the start of the stream has been handled (HeaderDone), and counts invalid input.
// NOTE(review): many guards, the declaration of `res`, and the return statement are not
// visible in this listing; comments describe the visible statements only.
907char16_t *QUtf8::convertToUnicode(
char16_t *dst, QByteArrayView in, QStringConverter::State *state)
909 qsizetype len = in.size();
// Substitute for invalid input: U+FFFD, or NUL when ConvertInvalidToNull is set.
916 char16_t replacement = QChar::ReplacementCharacter;
917 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
918 replacement = QChar::Null;
922 const uchar *src =
reinterpret_cast<
const uchar *>(in.data());
923 const uchar *end = src + len;
925 if (!(state->flags & QStringConverter::Flag::Stateless)) {
// The header counts as done if already recorded, or if ConvertInitialBom is set.
926 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
927 if (state->remainingChars || !headerdone) {
// Build a buffer of up to four bytes: saved bytes first, then bytes from the new input.
929 uchar remainingCharsData[4];
930 qsizetype remainingCharsCount = state->remainingChars;
931 qsizetype newCharsToCopy = qMin<qsizetype>(
sizeof(remainingCharsData) - remainingCharsCount, end - src);
933 memset(remainingCharsData, 0,
sizeof(remainingCharsData));
934 memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
935 memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
// Decode one sequence from the buffer: first byte passed separately, the rest via begin.
937 const uchar *begin = &remainingCharsData[1];
938 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
939 static_cast<
const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
940 if (res == QUtf8BaseTraits::Error) {
// Invalid sequence: count it and write the substitute.
941 ++state->invalidChars;
942 *dst++ = replacement;
944 }
else if (res == QUtf8BaseTraits::EndOfString) {
// Still incomplete: save everything gathered so far for the next call.
947 state->remainingChars = remainingCharsCount + newCharsToCopy;
948 memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
950 }
else if (!headerdone) {
// First decoded unit of the stream: test whether it is the BOM (U+FEFF).
// NOTE(review): the action taken on a match is not visible in this listing.
952 if (dst[-1] == 0xfeff)
955 state->internalState |= HeaderDone;
// Advance src by the bytes of the decoded sequence that came from the new input.
959 Q_ASSERT(res > remainingCharsCount);
960 src += res - remainingCharsCount;
963 }
else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
// Check for a literal three-byte UTF-8 BOM at the start (consequence not visible here).
965 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
// Decode the remaining input; the callback receives the failure position and code.
972 dst = convertToUnicode(dst, { src, end }, [&](
char16_t *&dst,
const uchar *src_,
int res_) {
975 if (res == QUtf8BaseTraits::Error) {
977 ++state->invalidChars;
978 *dst++ = replacement;
// Input ended in the middle of a sequence.
983 if (res == QUtf8BaseTraits::EndOfString) {
985 if (state->flags & QStringConverter::Flag::Stateless) {
// Stateless: emit one U+FFFD for the failed sequence, then one more per iteration of
// the loop below, counting each as invalid.
986 *dst++ = QChar::ReplacementCharacter;
987 ++state->invalidChars;
988 while (src++ < end) {
989 *dst++ = QChar::ReplacementCharacter;
990 ++state->invalidChars;
992 state->remainingChars = 0;
// Otherwise keep the unfinished bytes in the state for the next call.
995 state->remainingChars = end - src;
996 memcpy(&state->state_data[0], src, end - src);
999 state->remainingChars = 0;
// Validates `in` as UTF-8 without producing output.
// Returns a pair of flags: { valid UTF-8, only ASCII }. The second flag is cleared
// once a byte with the top bit set is processed.
// NOTE(review): the loop headers, the read of byte `b` and the test on `res` are not
// visible in this listing.
1012QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
1014 const uchar *src =
reinterpret_cast<
const uchar *>(in.data());
1015 const uchar *end = src + in.size();
// nextAscii starts at src, so the SIMD scan below runs on the first pass.
1016 const uchar *nextAscii = src;
1017 bool isValidAscii =
true;
// Skip ahead over ASCII once src has reached the point recorded by the last scan.
1020 if (src >= nextAscii)
1021 src = simdFindNonAscii(src, end, nextAscii);
// Test whether byte `b` is plain ASCII (top bit clear).
1027 if ((b & 0x80) == 0)
1030 isValidAscii =
false;
// Decode with the no-output traits: only validity matters, nothing is stored.
1031 QUtf8NoOutputTraits::NoOutput output;
1032 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
// Failure result: not valid UTF-8, and therefore not ASCII either.
1035 return {
false,
false };
1037 }
while (src < nextAscii);
1040 return {
true, isValidAscii };
// Compares UTF-8 data against a UTF-16 string by code point, optionally case-folded.
// A differing pair yields the difference of the two code points; the final return
// compares which side still has input left.
// NOTE(review): the opening "do" and the inequality test before the early return are
// not visible in this listing.
1043int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs)
noexcept
1045 auto src1 =
reinterpret_cast<
const qchar8_t *>(utf8.data());
1046 auto end1 = src1 + utf8.size();
1047 auto src2 =
reinterpret_cast<
const char16_t *>(utf16.data());
1048 auto end2 = src2 + utf16.size();
// SIMD helper first; it takes both cursors by reference.
1051 simdCompareAscii(src1, end1, src2, end2);
1053 if (src1 < end1 && src2 < end2) {
// Decode one code point from each side.
1054 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1055 char32_t uc2 = *src2++;
// Combine a high surrogate with a following low surrogate into one code point.
1060 if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
1061 uc2 = QChar::surrogateToUcs4(uc2, *src2++);
1063 if (cs == Qt::CaseInsensitive) {
1064 uc1 = QChar::toCaseFolded(uc1);
1065 uc2 = QChar::toCaseFolded(uc2);
1068 return int(uc1) -
int(uc2);
1070 }
while (src1 < end1 && src2 < end2);
// No difference in the common part: 1 if only utf8 has input left, -1 if only utf16, else 0.
1073 return (end1 > src1) -
int(end2 > src2);
// Compares UTF-8 data against a Latin-1 string by code point, optionally case-folded.
// A differing pair yields the difference of the two code points; the final return
// compares which side still has input left.
// NOTE(review): the inequality test before the early return is not visible here.
1076int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
1078 auto src1 =
reinterpret_cast<
const qchar8_t *>(utf8.data());
1079 auto end1 = src1 + utf8.size();
1080 auto src2 =
reinterpret_cast<
const uchar *>(s.latin1());
1081 auto end2 = src2 + s.size();
1083 while (src1 < end1 && src2 < end2) {
// One code point from the UTF-8 side, one byte (read as its code point) from the Latin-1 side.
1084 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1085 char32_t uc2 = *src2++;
1086 if (cs == Qt::CaseInsensitive) {
1087 uc1 = QChar::toCaseFolded(uc1);
1088 uc2 = QChar::toCaseFolded(uc2);
1091 return int(uc1) -
int(uc2);
// No difference in the common part: 1 if only utf8 has input left, -1 if only s, else 0.
1095 return (end1 > src1) - (end2 > src2);
// Compares two UTF-8 byte sequences. Case-sensitive comparison is a byte-wise memcmp
// with a length tie-break; otherwise both sides are decoded and case-folded code point
// by code point.
// NOTE(review): the conditions guarding the first two returns and the inequality test
// in the loop are not visible in this listing.
1098int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs)
noexcept
// Length-only results; the first passes 0 as the left size, the second 0 as the right.
1101 return qt_lencmp(0, rhs.size());
1104 return qt_lencmp(lhs.size(), 0);
1106 if (cs == Qt::CaseSensitive) {
// Compare the common prefix as raw bytes; on a tie the sizes decide.
1107 const auto l = std::min(lhs.size(), rhs.size());
1108 int r = memcmp(lhs.data(), rhs.data(), l);
1109 return r ? r : qt_lencmp(lhs.size(), rhs.size());
1112 auto src1 =
reinterpret_cast<
const qchar8_t *>(lhs.data());
1113 auto end1 = src1 + lhs.size();
1114 auto src2 =
reinterpret_cast<
const qchar8_t *>(rhs.data());
1115 auto end2 = src2 + rhs.size();
1117 while (src1 < end1 && src2 < end2) {
// Decode and case-fold one code point from each side.
1118 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1119 char32_t uc2 = QUtf8Functions::nextUcs4FromUtf8(src2, end2);
1121 uc1 = QChar::toCaseFolded(uc1);
1122 uc2 = QChar::toCaseFolded(uc2);
1124 return int(uc1) -
int(uc2);
// No difference in the common part: 1 if only lhs has input left, -1 if only rhs, else 0.
1128 return (end1 > src1) - (end2 > src2);
1131#ifndef QT_BOOTSTRAPPED
// Encodes `in` as UTF-16 bytes in a new QByteArray, using `state` and the requested
// endianness.
// NOTE(review): the use of writeBom to adjust `length` and the return statement are not
// visible in this listing.
1132QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
// A BOM is wanted only if the header has not been written yet and WriteBom is set.
1134 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
// Two bytes per UTF-16 code unit.
1135 qsizetype length = 2 * in.size();
1139 QByteArray d(length, Qt::Uninitialized);
1140 char *end = convertFromUnicode(d.data(), in, state, endian);
// The callee must have filled the buffer exactly.
1141 Q_ASSERT(end - d.constData() == d.size());
// Writes `in` as UTF-16 bytes into the buffer at `out` in the requested byte order and
// returns a pointer past the payload.
// NOTE(review): the guard on writeBom, the "else" lines and the advance of `out` past
// the BOM are not visible in this listing.
1146char *QUtf16::convertFromUnicode(
char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
// A BOM is wanted only if the header has not been written yet and WriteBom is set.
1149 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
// "Detect" makes no sense when writing: fall back to the host byte order.
1151 if (endian == DetectEndianness)
1152 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
// Byte order mark, stored in the selected byte order.
1156 QChar bom(QChar::ByteOrderMark);
1157 if (endian == BigEndianness)
1158 qToBigEndian(bom.unicode(), out);
1160 qToLittleEndian(bom.unicode(), out);
// Payload: all code units copied with the selected byte order.
1163 if (endian == BigEndianness)
1164 qToBigEndian<
char16_t>(in.data(), in.size(), out);
1166 qToLittleEndian<
char16_t>(in.data(), in.size(), out);
// Nothing is carried over, and the header is now considered written.
1168 state->remainingChars = 0;
1169 state->internalState |= HeaderDone;
1170 return out + 2*in.size();
// Decodes UTF-16 bytes into a new QString, using `state` and the requested endianness.
// NOTE(review): the return statement is not visible in this listing.
1173QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
// Half the byte count, rounded up, in code units.
1175 QString result((in.size() + 1) >> 1, Qt::Uninitialized);
1176 QChar *qch = convertToUnicode(result.data(), in, state, endian);
// Shrink to the number of code units actually produced.
1177 result.truncate(qch - result.constData());
// Stateful UTF-16 byte decoder writing into a caller-provided QChar buffer.
// `state` remembers the detected byte order (state_data[Endian]), one pending odd byte
// (state_data[Data] / remainingChars) and whether the start of the stream was handled.
// NOTE(review): several guards, "else" lines, the output writes after the BOM check and
// the return statements are not visible in this listing.
1181QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1183 qsizetype len = in.size();
1184 const char *chars = in.data();
// With auto-detection requested, start from the byte order stored by an earlier call.
1188 if (endian == DetectEndianness)
1189 endian = (DataEndianness)state->state_data[Endian];
1191 const char *end = chars + len;
// Not enough bytes (pending + new) to form one code unit: stash the single byte.
1194 if (state->remainingChars + len < 2) {
1196 Q_ASSERT(state->remainingChars == 0 && len == 1);
1197 state->remainingChars = 1;
1198 state->state_data[Data] = *chars;
1203 bool headerdone = state && state->internalState & HeaderDone;
1204 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
// The first code unit needs special handling: for BOM detection and/or a pending byte.
1207 if (!headerdone || state->remainingChars) {
// First byte of the unit: the pending byte when there is one.
1209 if (state->remainingChars)
1210 buf = state->state_data[Data];
1215 state->internalState |= HeaderDone;
// Assemble the unit from `buf` and the next input byte.
1216 QChar ch(buf, *chars++);
1217 if (endian == DetectEndianness) {
// Decide the byte order from the first unit: a byte-swapped BOM selects big endian and
// a regular BOM little endian; the host byte order is also consulted.
1219 if (ch == QChar::ByteOrderSwapped) {
1220 endian = BigEndianness;
1221 }
else if (ch == QChar::ByteOrderMark) {
1222 endian = LittleEndianness;
1224 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1225 endian = BigEndianness;
1227 endian = LittleEndianness;
// Swap the two bytes of the unit for big-endian data.
1231 if (endian == BigEndianness)
1232 ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
// Condition is true unless this is a BOM at the start of the stream.
1233 if (headerdone || ch != QChar::ByteOrderMark)
1235 }
else if (endian == DetectEndianness) {
1236 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
// Bulk conversion of all complete byte pairs that remain.
1239 qsizetype nPairs = (end - chars) >> 1;
1240 if (endian == BigEndianness)
1241 qFromBigEndian<
char16_t>(chars, nPairs, out);
1243 qFromLittleEndian<
char16_t>(chars, nPairs, out);
// Remember the byte order for the next call.
1246 state->state_data[Endian] = endian;
1247 state->remainingChars = 0;
// A trailing odd byte cannot form a code unit.
1248 if ((end - chars) & 1) {
1249 if (state->flags & QStringConverter::Flag::Stateless) {
// Stateless: emit the substitute (NUL or U+FFFD depending on ConvertInvalidToNull).
1250 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
// Otherwise keep the byte for the next call.
1252 state->remainingChars = 1;
1253 state->state_data[Data] = *(end - 1);
1256 state->state_data[Data] = 0;
// Encodes `in` as UTF-32 bytes in a new QByteArray, using `state` and the requested
// endianness.
// NOTE(review): the use of writeBom to adjust `length` and the return statement are not
// visible in this listing.
1262QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
// A BOM is wanted only if the header has not been written yet and WriteBom is set.
1264 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
// Four bytes per UTF-16 code unit.
1265 qsizetype length = 4*in.size();
1268 QByteArray ba(length, Qt::Uninitialized);
1269 char *end = convertFromUnicode(ba.data(), in, state, endian);
// Shrink to the number of bytes actually produced.
1270 ba.truncate(end - ba.constData());
// Writes `in` as UTF-32 into the buffer at `out` in the requested byte order.
// `state` carries a pending high surrogate between calls (state_data[Data] with
// remainingChars == 1).
// NOTE(review): the guard on writeBom, the loop header, the decode_surrogate label, the
// advance of `out` and the return statement are not visible in this listing.
1274char *QUtf32::convertFromUnicode(
char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
// A BOM is wanted only if the header has not been written yet and WriteBom is set.
1278 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
// "Detect" makes no sense when writing: fall back to the host byte order.
1279 if (endian == DetectEndianness)
1280 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
// BOM bytes: 0xfe 0xff go into out[2..3] for big endian, 0xff 0xfe into out[0..1]
// otherwise (the zero bytes are not visible in this listing).
1284 if (endian == BigEndianness) {
1287 out[2] = (
char)0xfe;
1288 out[3] = (
char)0xff;
1290 out[0] = (
char)0xff;
1291 out[1] = (
char)0xfe;
1296 state->internalState |= HeaderDone;
1299 const QChar *uc = in.data();
1300 const QChar *end = in.data() + in.size();
// Resume with a high surrogate saved by the previous call.
1303 if (state->remainingChars == 1) {
1304 auto character = state->state_data[Data];
1305 Q_ASSERT(character <= 0xFFFF);
1306 ch = QChar(character);
1308 state->remainingChars = 0;
1309 goto decode_surrogate;
// Non-surrogate code units map directly to one code point.
1314 if (Q_LIKELY(!ch.isSurrogate())) {
1315 ucs4 = ch.unicode();
1316 }
else if (Q_LIKELY(ch.isHighSurrogate())) {
// High surrogate with no following unit available (that test is not visible here):
// Stateless emits the substitute; otherwise the surrogate is saved in the state.
1319 if (state->flags & QStringConverter::Flag::Stateless) {
1320 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1322 state->remainingChars = 1;
1323 state->state_data[Data] = ch.unicode();
1326 }
else if (uc->isLowSurrogate()) {
// Valid pair: combine into one code point and consume the low surrogate.
1327 ucs4 = QChar::surrogateToUcs4(ch, *uc++);
// Remaining invalid cases get the substitute (0 or U+FFFD depending on ConvertInvalidToNull).
1329 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1332 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
// Store the code point in the selected byte order.
1334 if (endian == BigEndianness)
1335 qToBigEndian(ucs4, out);
1337 qToLittleEndian(ucs4, out);
// Decodes UTF-32 bytes in `in` and returns the result as a QString, delegating
// the actual decoding to the QChar* overload.
1344QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
// Reserve (size + 7) / 2 UTF-16 code units up front.
// NOTE(review): presumably worst case of two code units per 4-byte tuple plus
// bytes pending in `state` — confirm.
1347 result.resize((in.size() + 7) >> 1);
1348 QChar *end = convertToUnicode(result.data(), in, state, endian);
// Shrink to the number of code units the helper produced.
1349 result.truncate(end - result.constData());
// Decodes UTF-32 bytes from `in` into UTF-16 code units written at `out`.
// Partial 4-byte tuples and the detected byte order are kept in `state` so that
// input may be fed in arbitrary slices.
1353QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1355 qsizetype len = in.size();
1356 const char *chars = in.data();
// Reuse the byte order remembered from an earlier call, if any.
1359 if (endian == DetectEndianness)
1360 endian = (DataEndianness)state->state_data[Endian];
1362 const char *end = chars + len;
// Restore bytes of an incomplete tuple left over from the previous call.
1365 memcpy(tuple, &state->state_data[Data], 4);
// Still fewer than four bytes in total: stash everything in the state.
1368 if (state->remainingChars + len < 4) {
1370 while (chars < end) {
1371 tuple[state->remainingChars] = *chars;
1372 ++state->remainingChars;
1375 Q_ASSERT(state->remainingChars < 4);
1376 memcpy(&state->state_data[Data], tuple, 4);
1381 bool headerdone = state->internalState & HeaderDone;
1382 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
// `num` counts how many bytes of the current tuple are already filled.
1385 qsizetype num = state->remainingChars;
1386 state->remainingChars = 0;
// Emits one code point: values above the last valid code point are replaced
// by NUL or U+FFFD, everything else is expanded via QChar::fromUcs4().
1388 const auto writeCodeToOutput = [&](
char32_t code) {
1389 if (Q_UNLIKELY(code > QChar::LastValidCodePoint)) {
1390 if (state->flags & QStringDecoder::Flag::ConvertInvalidToNull)
1391 *out++ = QChar::Null;
1393 *out++ = QChar::ReplacementCharacter;
1395 for (
char16_t c : QChar::fromUcs4(code))
// Slow path for the first tuple: header handling, endianness detection or a
// tuple that was started in an earlier call.
1400 if (!headerdone || endian == DetectEndianness || num) {
1402 tuple[num++] = *chars++;
1403 if (endian == DetectEndianness) {
// FF FE 00 00 is a little-endian BOM, 00 00 FE FF a big-endian one;
// without a BOM the host byte order is assumed.
1405 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1406 endian = LittleEndianness;
1407 }
else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1408 endian = BigEndianness;
1409 }
else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1410 endian = BigEndianness;
1412 endian = LittleEndianness;
1415 char32_t code = (endian == BigEndianness) ? qFromBigEndian<
char32_t>(tuple) : qFromLittleEndian<
char32_t>(tuple);
// A leading BOM is swallowed unless the header was already done.
1416 if (headerdone || code != QChar::ByteOrderMark) {
1417 writeCodeToOutput(code);
1420 }
else if (endian == DetectEndianness) {
1421 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
// Remember the decision so later calls skip detection.
1423 state->state_data[Endian] = endian;
1424 state->internalState |= HeaderDone;
// Main loop: gather bytes into `tuple` and decode each completed tuple.
1426 while (chars < end) {
1427 tuple[num++] = *chars++;
1429 char32_t code = (endian == BigEndianness) ? qFromBigEndian<
char32_t>(tuple) : qFromLittleEndian<
char32_t>(tuple);
1430 writeCodeToOutput(code);
// Trailing partial tuple: stateless mode substitutes, stateful mode saves it.
// NOTE(review): this writes U+FFFD even when ConvertInvalidToNull is set,
// unlike writeCodeToOutput above — confirm that is intended.
1436 if (state->flags & QStringDecoder::Flag::Stateless) {
1437 *out++ = QChar::ReplacementCharacter;
1439 state->state_data[Endian] = endian;
1440 state->remainingChars = num;
1441 memcpy(&state->state_data[Data], tuple, 4);
1449#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
// Returns 1 when the process' active ANSI code page is UTF-8, otherwise -1.
1450int QLocal8Bit::checkUtf8()
1452 return GetACP() == CP_UTF8 ? 1 : -1;
// Convenience overload: decodes `in` using the system ANSI code page (CP_ACP).
1455QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1457 return convertToUnicode_sys(in, CP_ACP, state);
// Decodes multi-byte text in `in` to UTF-16 with the Win32 MultiByteToWideChar
// API for the given `codePage`. Output goes to a stack buffer first and moves
// to the QString `sp` once it no longer fits. Incomplete trailing sequences are
// carried over in `state` between calls.
1460QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
1461 QStringConverter::State *state)
1463 const char *mb = in.data();
1464 qsizetype mblen = in.size();
1467 qsizetype &invalidChars = state->invalidChars;
1468 using Flag = QStringConverter::Flag;
// Undecodable input becomes NUL or U+FFFD depending on ConvertInvalidToNull.
1469 const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
1470 const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
1471 : QChar::ReplacementCharacter;
// A stateless conversion must not have pending bytes.
1472 if (state->flags & Flag::Stateless) {
1473 Q_ASSERT(state->remainingChars == 0);
// Fixed-size scratch buffer used before any heap-backed storage is needed.
1485 std::array<
wchar_t, 4096> buf;
1486 wchar_t *out = buf.data();
1487 qsizetype outlen = buf.size();
// growOut(size): returns an (out, outlen) pair with room for `size` more
// units. When it has to grow, it computes the current write offset, guards
// the addition against overflow (returning {nullptr, 0}) and copies the
// stack buffer's contents into `sp` the first time.
1492 const auto growOut = [&](qsizetype size) -> std::tuple<
wchar_t *, qsizetype> {
1494 return {out, outlen};
1495 const bool wasStackBuffer = sp.isEmpty();
1496 const auto begin = wasStackBuffer ? buf.data() :
reinterpret_cast<
wchar_t *>(sp.data());
1497 const qsizetype offset = qsizetype(std::distance(begin, out));
1498 qsizetype newSize = 0;
1499 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1501 return {
nullptr, 0};
1504 auto it =
reinterpret_cast<
wchar_t *>(sp.data());
1506 it = std::copy_n(buf.data(), offset, it);
// Finish a sequence left incomplete by the previous call: glue the saved
// bytes and the start of the new input into `prev` and decode that
// separately with a scratch state.
1513 while (state && state->remainingChars && mblen) {
1514 QStringConverter::State localState;
1515 localState.flags = state->flags;
1520 std::array<
char, 6> prev = {0};
1521 Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
1522 qsizetype index = 0;
1523 for (; index < state->remainingChars; ++index)
1524 prev[index] = state->state_data[index];
1525 const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
1526 for (qsizetype i = 0; i < toCopy; ++i, ++index)
1527 prev[index] = mb[i];
1540 const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
1542 std::tie(out, outlen) = growOut(tmp.size());
1545 out = std::copy_n(
reinterpret_cast<
const wchar_t *>(tmp.constData()), tmp.size(), out);
1546 outlen -= tmp.size();
// `tail`: how many of the newly copied input bytes the nested call consumed.
1547 const qsizetype tail = toCopy - localState.remainingChars;
// Give back bytes the nested call left unconsumed so the main loop sees them.
1552 mb -= localState.remainingChars;
1553 mblen += localState.remainingChars;
1554 localState.remainingChars = 0;
// Fold the scratch state back into the caller's state.
1556 state->remainingChars = localState.remainingChars;
1557 state->invalidChars += localState.invalidChars;
1558 std::copy_n(localState.state_data, state->remainingChars, state->state_data);
1561 Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
// The Win32 API takes int lengths, so the input is processed in windows of
// at most INT_MAX bytes.
1565 int nextIn = q26::saturate_cast<
int>(mblen);
1567 std::tie(out, outlen) = growOut(1);
1570 const int nextOut = q26::saturate_cast<
int>(outlen);
// MB_ERR_INVALID_CHARS is passed so invalid input is reported through
// GetLastError() and handled below.
1571 int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
1578 int r = GetLastError();
1579 if (r == ERROR_INSUFFICIENT_BUFFER) {
// Ask for the required size (output pointer 0, size 0) and grow.
1580 const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
1581 std::tie(out, outlen) = growOut(wclen);
1584 }
else if (r == ERROR_NO_UNICODE_TRANSLATION) {
// If the remaining bytes fit into state_data, they are saved there for
// the next call.
1590 if (state && mblen <= q20::ssize(state->state_data)) {
1591 state->remainingChars = mblen;
1592 std::copy_n(mb, mblen, state->state_data);
// Shrink the window to end at the previous character boundary.
1605 const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
1607 nextIn =
int(it - mb);
// Emit one replacement character for the undecodable input.
1615 std::tie(out, outlen) = growOut(1);
1618 *out = replacementCharacter;
1626 qWarning(
"MultiByteToWideChar: Cannot convert multibyte text");
1630 nextIn = q26::saturate_cast<
int>(mblen);
// Produce the final string: copy out of the stack buffer, or trim `sp` to
// the number of units actually written.
1635 if (out != buf.data())
1636 sp = QStringView(buf.data(), out).toString();
1638 const auto begin =
reinterpret_cast<
wchar_t *>(sp.data());
1639 sp.truncate(std::distance(begin, out));
1642 if (sp.size() && sp.back().isNull())
// Without a state, leftover bytes cannot be deferred: each becomes a
// replacement character and is counted as invalid.
1645 if (!state && mblen > 0) {
1648 sp.resize(sp.size() + mblen, replacementCharacter);
1649 invalidChars += mblen;
// Convenience overload: encodes `in` using the system ANSI code page (CP_ACP).
1654QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1656 return convertFromUnicode_sys(in, CP_ACP, state);
// Encodes UTF-16 text in `in` to the multi-byte encoding of `codePage` with the
// Win32 WideCharToMultiByte API. Output goes to a stack buffer first and moves
// to the QByteArray `mb` when it no longer fits. A dangling high surrogate is
// kept in `state` between calls.
1659QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
1660 QStringConverter::State *state)
1662 const wchar_t *ch =
reinterpret_cast<
const wchar_t *>(in.data());
1663 qsizetype uclen = in.size();
1673 using Flag = QStringConverter::Flag;
// A stateless conversion must not have a pending surrogate.
1674 if (state->flags & Flag::Stateless) {
1675 Q_ASSERT(state->remainingChars == 0);
// Two early exits: a default-constructed QByteArray and one built from "".
// NOTE(review): presumably null input vs. empty input — the guarding
// conditions are not visible here.
1680 return QByteArray();
1682 return QByteArray(
"");
// Fixed-size scratch buffer used before any heap-backed storage is needed.
1689 std::array<
char, 4096> buf;
1690 char *out = buf.data();
1691 qsizetype outlen = buf.size();
// A high surrogate was saved by the previous call: pair it with the first
// unit of this input and encode both (or only the saved unit when the
// first input unit is not a low surrogate).
1694 if (state && state->remainingChars > 0) {
1695 Q_ASSERT(state->remainingChars == 1);
1697 wchar_t wc[2] = {
wchar_t(state->state_data[0]), ch[0] };
1701 const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
1702 int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen,
nullptr,
1708 if (validCodePoint) {
// The saved surrogate has been handled; clear it from the state.
1712 state->remainingChars = 0;
1713 state->state_data[0] = 0;
1715 return QByteArrayView(buf.data(), len).toByteArray();
// Input ends on a high surrogate: keep it for the next call.
1718 if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
1721 state->remainingChars = 1;
1722 state->state_data[0] = ch[uclen - 1];
1725 return QByteArray();
1728 Q_ASSERT(uclen > 0);
// growOut(size): returns an (out, outlen) pair with room for `size` more
// bytes. When it has to grow, it computes the current write offset, guards
// the addition against overflow (returning {nullptr, 0}) and copies the
// stack buffer's contents into `mb` the first time.
1731 const auto growOut = [&](qsizetype size) -> std::tuple<
char *, qsizetype> {
1733 return {out, outlen};
1734 const bool wasStackBuffer = mb.isEmpty();
1735 const auto begin = wasStackBuffer ? buf.data() : mb.data();
1736 const qsizetype offset = qsizetype(std::distance(begin, out));
1737 qsizetype newSize = 0;
1738 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1740 return {
nullptr, 0};
1743 auto it = mb.data();
1745 it = std::copy_n(buf.data(), offset, it);
// Window size for the next API call: at most INT_MAX units, with a check
// for a window that would end on a high surrogate.
1751 const auto getNextWindowSize = [&]() {
1752 int nextIn = q26::saturate_cast<
int>(uclen);
1755 if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
1762 const int nextIn = getNextWindowSize();
1763 std::tie(out, outlen) = growOut(1);
1766 const int nextOut = q26::saturate_cast<
int>(outlen);
1767 len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut,
nullptr,
nullptr);
1774 int r = GetLastError();
1775 if (r == ERROR_INSUFFICIENT_BUFFER) {
// Query the required size (null output, size 0) and grow to it.
1776 int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn,
nullptr, 0,
1778 if (neededLength <= 0) {
1786 "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
1790 std::tie(out, outlen) = growOut(neededLength);
// The diagnostic includes at most the first 100 code units of the input.
1799 "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1800 r, qt_castToWchar(QStringView(ch, uclen).left(100).toString()));
// Produce the final array: copy out of the stack buffer, or trim `mb` to
// the number of bytes actually written.
1808 if (out != buf.data())
1809 mb = QByteArrayView(buf.data(), out).toByteArray();
1811 mb.truncate(std::distance(mb.data(), out));
// Clears the converter state; the visible part zeroes the four state_data words.
1817void QStringConverter::State::clear()
noexcept
1822 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
// Resets the state for reuse. For ICU-backed converters the UConverter kept in
// d[0] is reset in place via ucnv_reset().
1828void QStringConverter::State::reset()
noexcept
1830 if (flags & Flag::UsesIcu) {
1831#if defined(QT_USE_ICU_CODECS)
1833 UConverter *converter =
static_cast<UConverter *>(d[0]);
1835 ucnv_reset(converter);
1844#ifndef QT_BOOTSTRAPPED
// File-local thunks for UTF-16 and UTF-32. Each one forwards to the matching
// QUtf16/QUtf32 routine with a fixed byte order: DetectEndianness for the
// generic names, BigEndianness for *BE and LittleEndianness for *LE.
// NOTE(review): the signatures of several decoding thunks are not visible
// here; the bare return statements below presumably belong to
// fromUtf16BE/LE and fromUtf32BE/LE, as referenced by the interface table.
1845static QChar *
fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1847 return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
1850static char *
toUtf16(
char *out, QStringView in, QStringConverter::State *state)
1852 return QUtf16::convertFromUnicode(out, in, state, DetectEndianness);
// UTF-16, big-endian.
1857 return QUtf16::convertToUnicode(out, in, state, BigEndianness);
1860static char *
toUtf16BE(
char *out, QStringView in, QStringConverter::State *state)
1862 return QUtf16::convertFromUnicode(out, in, state, BigEndianness);
// UTF-16, little-endian.
1867 return QUtf16::convertToUnicode(out, in, state, LittleEndianness);
1870static char *
toUtf16LE(
char *out, QStringView in, QStringConverter::State *state)
1872 return QUtf16::convertFromUnicode(out, in, state, LittleEndianness);
// UTF-32, byte order detected on decode.
1875static QChar *
fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
1877 return QUtf32::convertToUnicode(out, in, state, DetectEndianness);
1880static char *
toUtf32(
char *out, QStringView in, QStringConverter::State *state)
1882 return QUtf32::convertFromUnicode(out, in, state, DetectEndianness);
// UTF-32, big-endian.
1887 return QUtf32::convertToUnicode(out, in, state, BigEndianness);
1890static char *
toUtf32BE(
char *out, QStringView in, QStringConverter::State *state)
1892 return QUtf32::convertFromUnicode(out, in, state, BigEndianness);
// UTF-32, little-endian.
1897 return QUtf32::convertToUnicode(out, in, state, LittleEndianness);
1900static char *
toUtf32LE(
char *out, QStringView in, QStringConverter::State *state)
1902 return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
// Encodes UTF-16 `in` as Latin-1 into `out`, one byte per code unit. Code units
// above 0xff cannot be represented and are counted in `invalid`.
1906char *QLatin1::convertFromUnicode(
char *out, QStringView in, QStringConverter::State *state)
noexcept
1909 if (state->flags & QStringConverter::Flag::Stateless)
// Replacement byte is NUL when a state with ConvertInvalidToNull is in effect,
// otherwise '?'. `state` is null-checked here, so it may be null at this point.
1912 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 :
'?';
1913 qsizetype invalid = 0;
1914 for (qsizetype i = 0; i < in.size(); ++i) {
1915 if (in[i] > QChar(0xff)) {
// Representable: the low byte (cell) is the Latin-1 value.
1919 *out = (
char)in[i].cell();
// Report how many code units were not representable.
1924 state->invalidChars += invalid;
// Local-8-bit adapters for the interface table: convert through the
// QString/QByteArray-returning QLocal8Bit functions, copy the result into the
// caller's buffer and return the advanced output pointer.
// NOTE(review): the signature of the decoding adapter is not visible here;
// the first three lines presumably form the body of fromLocal8Bit.
1930 QString s = QLocal8Bit::convertToUnicode(in, state);
// The copy length is in bytes, hence the sizeof(QChar) factor.
1931 memcpy(out, s.constData(), s.size()*
sizeof(QChar));
1932 return out + s.size();
1935static char *
toLocal8Bit(
char *out, QStringView in, QStringConverter::State *state)
1937 QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1938 memcpy(out, s.constData(), s.size());
1939 return out + s.size();
1946#ifndef QT_BOOTSTRAPPED
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2079
2080
2081
// Table of built-in converters, indexed by QStringConverter::Encoding. Each row
// holds: name, decode function, decode length estimator, encode function,
// encode length estimator. The UTF-16 and UTF-32 rows exist only in
// non-bootstrapped builds.
2083const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
2085 {
"UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
2086#ifndef QT_BOOTSTRAPPED
2087 {
"UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
2088 {
"UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
2089 {
"UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
2090 {
"UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
2091 {
"UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
2092 {
"UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
2094 {
"ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len },
// The locale entry reuses the UTF-8 length estimators.
2095 {
"Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
// Encoding-name comparison helpers. The core routine compares a NUL-terminated
// name `a` with the range [b, b_end), skipping '-' and '_' on both sides and
// comparing ASCII case-insensitively. The overloads below adapt Latin-1,
// UTF-8, UTF-16 and QAnyStringView inputs to it.
2099template <
typename Char>
// Skip separators in the reference name...
2103 while (*a ==
'-' || *a ==
'_')
// ...and in the candidate.
2105 while (b != b_end && (*b == Char{
'-'} || *b == Char{
'_'}))
// Both exhausted at the same time.
2107 if (!*a && b == b_end)
// Check for candidate characters outside the ASCII range.
2109 if (
char16_t(*b) > 127)
2111 }
while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(
char(*b++)));
// Latin-1 view: compare its bytes directly.
2118 return nameMatch_impl_impl(a, b.begin(), b.end());
// UTF-8 view: its bytes are handed to the Latin-1 overload unchanged.
2123 return nameMatch_impl(a, QLatin1StringView{QByteArrayView{b}});
// UTF-16 view: compare its code units.
2128 return nameMatch_impl_impl(a, b.utf16(), b.utf16() + b.size());
// QAnyStringView: dispatch on the underlying view type.
2133 return b.visit([a](
auto b) {
return nameMatch_impl(a, b); });
2138
2139
2140
2143
2144
2145
2148#if defined(QT_USE_ICU_CODECS)
// ICU-backed converter support. The UConverter handle lives in state->d[0] and
// the codec name in state->d[1]. All members are static; the struct only groups
// the helpers and the Interface table used for ICU codecs.
2150struct QStringConverterICU : QStringConverter
// Closes the ICU converter held in d[0] and clears the slot.
2152 static void clear_function(QStringConverter::State *state)
noexcept
2155 ucnv_close(
static_cast<UConverter *>(state->d[0]));
2156 state->d[0] =
nullptr;
// Creates the converter from the name stored in d[1] if d[0] is still empty.
2159 static void ensureConverter(QStringConverter::State *state)
2163 if (state->d[0] ==
nullptr)
2164 state->d[0] = createConverterForName(
static_cast<
const char *>(state->d[1]), state);
// Decodes `in` to UTF-16 at `out` using ucnv_toUnicode(). The output limit is
// derived from toLen(in.size()).
2167 static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
2170 ensureConverter(state);
2172 auto icu_conv =
static_cast<UConverter *>(state->d[0]);
2173 UErrorCode err = U_ZERO_ERROR;
2174 auto source = in.data();
2175 auto sourceLimit = in.data() + in.size();
2177 qsizetype length = toLen(in.size());
2179 UChar *target =
reinterpret_cast<UChar *>(out);
2180 auto targetLimit = target + length;
2183 UBool flush =
false;
// The error callback's context must point at this State (it updates
// invalidChars); re-install the callback when the context differs.
2186 UConverterToUCallback action;
2187 const void *context;
2188 ucnv_getToUCallBack(icu_conv, &action, &context);
2189 if (context != state)
2190 ucnv_setToUCallBack(icu_conv, action, state,
nullptr,
nullptr, &err);
2192 ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit,
nullptr, flush, &err);
2194 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
// Stateless mode: input still pending inside ICU is dropped and counted
// as invalid.
2195 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2196 if (
auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
2197 ucnv_reset(icu_conv);
2198 state->invalidChars += leftOver;
2201 return reinterpret_cast<QChar *>(target);
// Encodes UTF-16 `in` into `out` using ucnv_fromUnicode(); mirrors toUtf16().
2204 static char *fromUtf16(
char *out, QStringView in, QStringConverter::State *state)
2207 ensureConverter(state);
2208 auto icu_conv =
static_cast<UConverter *>(state->d[0]);
2209 UErrorCode err = U_ZERO_ERROR;
2210 auto source =
reinterpret_cast<
const UChar *>(in.data());
2211 auto sourceLimit =
reinterpret_cast<
const UChar *>(in.data() + in.size());
// Upper bound on output bytes for this input, per ICU's helper macro.
2213 qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
2216 char *targetLimit = out + length;
2217 UBool flush =
false;
// Same context fix-up as in toUtf16(), for the from-Unicode callback.
2220 UConverterFromUCallback action;
2221 const void *context;
2222 ucnv_getFromUCallBack(icu_conv, &action, &context);
2223 if (context != state)
2224 ucnv_setFromUCallBack(icu_conv, action, state,
nullptr,
nullptr, &err);
2226 ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit,
nullptr, flush, &err);
2228 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
// Stateless mode: input still pending inside ICU is dropped and counted
// as invalid.
2229 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2230 if (
auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
2231 ucnv_reset(icu_conv);
2232 state->invalidChars += leftOver;
2238 Q_DISABLE_COPY_MOVE(QStringConverterICU)
// Length estimator for encoding: X * inLength * sizeof(UChar), where X is the
// template argument selected per codec through the forLength table.
2240 template<qsizetype X>
2241 static qsizetype fromLen(qsizetype inLength)
2243 return X * inLength *
sizeof(UChar);
// Length estimator for decoding: two UTF-16 code units per input byte.
2246 static qsizetype toLen(qsizetype inLength)
2250
2251
2252
2253
2254 return 2 * inLength;
// One Interface per supported maximum character size (1..8); the rows differ
// only in the fromLen<N> estimator. make_icu_converter() indexes this table
// with maxCharSize - 1.
2257 static constexpr QStringConverter::Interface forLength[] = {
2258 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
2259 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
2260 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
2261 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
2262 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
2263 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
2264 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
2265 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
// Opens an ICU converter for `name` and installs error callbacks that count
// invalid input in the State and substitute either NUL (ConvertInvalidToNull)
// or ICU's default substitution.
2268 static UConverter *createConverterForName(
const char *name,
const State *state)
2273 UErrorCode status = U_ZERO_ERROR;
2274 UConverter *conv = ucnv_open(name, &status);
// An ambiguous-alias warning is tolerated; any other status takes this branch.
2275 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
2280 if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
2281 UErrorCode error = U_ZERO_ERROR;
// Decoding callback: act only on reasons up to UCNV_IRREGULAR, clear
// the error, write a substitute and count the offending bytes.
2283 auto nullToSubstituter = [](
const void *context, UConverterToUnicodeArgs *toUArgs,
2284 const char *, int32_t length,
2285 UConverterCallbackReason reason, UErrorCode *err) {
2286 if (reason <= UCNV_IRREGULAR) {
2287 *err = U_ZERO_ERROR;
2289 ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
2291 auto state =
const_cast<State *>(
static_cast<
const State *>(context));
2292 state->invalidChars += length;
2295 ucnv_setToUCallBack(conv, nullToSubstituter, state,
nullptr,
nullptr, &error);
// Encoding callback: same policy, writing a single NUL code unit.
2297 auto nullFromSubstituter = [](
const void *context, UConverterFromUnicodeArgs *fromUArgs,
2298 const UChar *, int32_t length,
2299 UChar32, UConverterCallbackReason reason, UErrorCode *err) {
2300 if (reason <= UCNV_IRREGULAR) {
2301 *err = U_ZERO_ERROR;
2302 const UChar replacement[] = { 0 };
2303 const UChar *stringBegin = std::begin(replacement);
2304 ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
2306 auto state =
const_cast<State *>(
static_cast<
const State *>(context));
2307 state->invalidChars += length;
2310 ucnv_setFromUCallBack(conv, nullFromSubstituter, state,
nullptr,
nullptr, &error);
2312 UErrorCode error = U_ZERO_ERROR;
// Default policy: count the invalid units, then let ICU's stock
// substitution callback write the replacement.
2314 auto qmarkToSubstituter = [](
const void *context, UConverterToUnicodeArgs *toUArgs,
2315 const char *codeUnits,int32_t length,
2316 UConverterCallbackReason reason, UErrorCode *err) {
2317 if (reason <= UCNV_IRREGULAR) {
2319 auto state =
const_cast<State *>(
static_cast<
const State *>(context));
2320 state->invalidChars += length;
2323 UCNV_TO_U_CALLBACK_SUBSTITUTE(
nullptr, toUArgs, codeUnits, length, reason, err);
2326 ucnv_setToUCallBack(conv, qmarkToSubstituter, state,
nullptr,
nullptr, &error);
2328 auto qmarkFromSubstituter = [](
const void *context, UConverterFromUnicodeArgs *fromUArgs,
2329 const UChar *codeUnits, int32_t length,
2330 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
2331 if (reason <= UCNV_IRREGULAR) {
2333 auto state =
const_cast<State *>(
static_cast<
const State *>(context));
2334 state->invalidChars += length;
2337 UCNV_FROM_U_CALLBACK_SUBSTITUTE(
nullptr, fromUArgs, codeUnits, length,
2338 codePoint, reason, err);
2340 ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state,
nullptr,
nullptr, &error);
// nul_terminate_impl(): copies a codec name into a std::string so that a
// NUL-terminated char pointer can be handed to ICU. A null Latin-1 view
// yields an empty string.
2345 static std::string nul_terminate_impl(QLatin1StringView name)
2346 {
return name.isNull() ? std::string() : std::string{name.data(), size_t(name.size())}; }
// UTF-8 names are passed through the Latin-1 overload byte for byte.
2348 static std::string nul_terminate_impl(QUtf8StringView name)
2349 {
return nul_terminate_impl(QLatin1StringView{QByteArrayView{name}}); }
// UTF-16 names are converted with the Latin-1 encoder into the string's
// storage; the assert checks the encoder did not exceed the buffer.
2351 static std::string nul_terminate_impl(QStringView name)
2354 const auto convert = [&](
char *p, size_t n) {
2355 const auto sz = QLatin1::convertFromUnicode(p, name) - p;
2356 Q_ASSERT(q20::cmp_less_equal(sz, n));
// Uses resize_and_overwrite where the standard library provides it;
// otherwise resize, convert, and resize to what convert() returns.
2359#ifdef __cpp_lib_string_resize_and_overwrite
2360 result.resize_and_overwrite(size_t(name.size()), convert);
2362 result.resize(size_t(name.size()));
2363 result.resize(convert(result.data(), result.size()));
// Dispatches on the underlying view type of `name`.
2368 static std::string nul_terminate(QAnyStringView name)
2369 {
return name.visit([](
auto name) {
return nul_terminate_impl(name); }); }
// Overload for arbitrary string views: NUL-terminate the name, then
// delegate to the const char* overload.
2371 static const QStringConverter::Interface *
2372 make_icu_converter(QStringConverter::State *state, QAnyStringView name)
2373 {
return make_icu_converter(state, nul_terminate(name).data()); }
// Creates the ICU converter for `name`, records it in `state` and returns the
// Interface row matching the codec's maximum character size.
2375 static const QStringConverter::Interface *make_icu_converter(
2376 QStringConverter::State *state,
2380 UErrorCode status = U_ZERO_ERROR;
2381 UConverter *conv = createConverterForName(name, state);
2385 const char *icuName = ucnv_getName(conv, &status);
// Look up the MIME name first, falling back to the IANA name. The result
// is stored in d[1].
2388 const char *persistentName = ucnv_getStandardName(icuName,
"MIME", &status);
2389 if (U_FAILURE(status) || !persistentName) {
2390 status = U_ZERO_ERROR;
2391 persistentName = ucnv_getStandardName(icuName,
"IANA", &status);
2393 state->d[1] =
const_cast<
char *>(persistentName);
2395 state->flags |= QStringConverter::Flag::UsesIcu;
2396 qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
2397 state->clearFn = QStringConverterICU::clear_function;
// forLength only covers sizes 1..8; anything else is reported.
2398 if (maxCharSize > 8 || maxCharSize < 1) {
2399 qWarning(
"Encountered unexpected codec \"%s\" which requires >8x space", name);
2402 return &forLength[maxCharSize - 1];
2411
2412
// Constructs a converter for the encoding called `name` with flags `f`.
// Built-in encodings are resolved through encodingForName(); when ICU codecs
// are compiled in, make_icu_converter() is used as well.
// NOTE(review): presumably ICU is only the fallback for names that are not
// built in — the branch structure is not visible here.
2413QStringConverter::QStringConverter(QAnyStringView name, Flags f)
2414 : iface(
nullptr), state(f)
2416 auto e = encodingForName(name);
// Built-in encoding: point at its row in the interface table.
2418 iface = encodingInterfaces +
int(*e);
2419#if defined(QT_USE_ICU_CODECS)
2421 iface = QStringConverterICU::make_icu_converter(&state, name);
// Returns the name of the encoding in use. For ICU-backed converters this is
// the string stored in state.d[1].
2426const char *QStringConverter::name()
const noexcept
2430 if (state.flags & QStringConverter::Flag::UsesIcu) {
2431#if defined(QT_USE_ICU_CODECS)
2432 return static_cast<
const char*>(state.d[1]);
2442
2443
2444
2445
2446
2447
2448
2449
2452
2453
2454
2455
2456
2459
2460
2461
2462
2463
2464
2467
2468
2469
2470
2471
2472
2473
2474
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
// Maps an encoding name to one of the built-in Encoding values. Names are
// compared with nameMatch() against the interface table; "latin1" is accepted
// as an additional alias for Latin1. Returns std::nullopt when nothing matches.
2487std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(QAnyStringView name)
noexcept
2490 return std::nullopt;
// Linear scan over all built-in encodings.
2491 for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
2492 if (nameMatch(encodingInterfaces[i].name, name))
2493 return QStringConverter::Encoding(i);
2495 if (nameMatch(
"latin1", name))
2496 return QStringConverter::Latin1;
2497 return std::nullopt;
2500#ifndef QT_BOOTSTRAPPED
// Reports how much input is still pending in `state`.
// NOTE(review): the signature is not visible here; callers in this file refer
// to it as QtPrivate::partiallyParsedDataCount(&state).
// ICU-backed converters are asked for the pending count directly; a negative
// answer is clamped to 0.
2507 if (state->flags & QStringConverter::Flag::UsesIcu) {
2508 UConverter *converter =
static_cast<UConverter *>(state->d[0]);
2511 UErrorCode err = U_ZERO_ERROR;
2512 auto leftOver = ucnv_fromUCountPending(converter, &err);
2515 return std::max(leftOver, 0);
// Built-in converters track pending input in remainingChars (saturated to int).
2518 return q26::saturate_cast<
int>(state->remainingChars);
2523
2524
2525
2526
2527
2528
// Guesses the encoding of `data` from a leading byte order mark. When
// `expectedFirstCharacter` is non-zero it is also tried as the first UTF-32 or
// UTF-16 character. Returns std::nullopt when nothing matches.
2529std::optional<QStringConverter::Encoding>
2530QStringConverter::encodingForData(QByteArrayView data,
char16_t expectedFirstCharacter)
noexcept
2533 qsizetype arraySize = data.size();
// UTF-32 is checked first and needs at least four bytes.
2534 if (arraySize > 3) {
2535 char32_t uc = qFromUnaligned<
char32_t>(data.data());
2536 if (uc == qToBigEndian(
char32_t(QChar::ByteOrderMark)))
2537 return QStringConverter::Utf32BE;
2538 if (uc == qToLittleEndian(
char32_t(QChar::ByteOrderMark)))
2539 return QStringConverter::Utf32LE;
// No BOM: see whether the first tuple is the expected character.
2540 if (expectedFirstCharacter) {
2542 if (qToLittleEndian(uc) == expectedFirstCharacter)
2543 return QStringConverter::Utf32LE;
2544 else if (qToBigEndian(uc) == expectedFirstCharacter)
2545 return QStringConverter::Utf32BE;
// UTF-8 BOM needs at least three bytes.
2549 if (arraySize > 2) {
2550 if (memcmp(data.data(), utf8bom,
sizeof(utf8bom)) == 0)
2551 return QStringConverter::Utf8;
// UTF-16 needs at least two bytes.
2554 if (arraySize > 1) {
2555 char16_t uc = qFromUnaligned<
char16_t>(data.data());
2556 if (uc == qToBigEndian(
char16_t(QChar::ByteOrderMark)))
2557 return QStringConverter::Utf16BE;
2558 if (uc == qToLittleEndian(
char16_t(QChar::ByteOrderMark)))
2559 return QStringConverter::Utf16LE;
2560 if (expectedFirstCharacter) {
2562 if (qToLittleEndian(uc) == expectedFirstCharacter)
2563 return QStringConverter::Utf16LE;
2564 else if (qToBigEndian(uc) == expectedFirstCharacter)
2565 return QStringConverter::Utf16BE;
2568 return std::nullopt;
// Extracts the charset name from an HTML <meta ... charset=...> declaration.
// NOTE(review): the signature is not visible here; encodingForHtml() calls
// this as parseHtmlMetaForEncoding(data) and uses the result as a QByteArray.
2573 static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher(
"meta ");
2574 static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher(
"charset=");
// Only the first 1024 bytes are inspected, lower-cased for matching.
2576 QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
2577 qsizetype pos = metaSearcher.indexIn(header);
// Look for "charset=" starting from the "meta " match.
2579 pos = charsetSearcher.indexIn(header, pos);
2581 pos += qstrlen(
"charset=");
// Check for a quote character right after "charset=".
2582 if (pos < header.size() && (header.at(pos) ==
'\"' || header.at(pos) ==
'\''))
2585 qsizetype pos2 = pos;
// The name ends at the next quote, '>' or '/'.
2588 while (++pos2 < header.size()) {
2589 char ch = header.at(pos2);
2590 if (ch ==
'\"' || ch ==
'\'' || ch ==
'>' || ch ==
'/') {
2591 QByteArray name = header.mid(pos, pos2 - pos);
// Cut the name at a ':' if present, then normalise whitespace.
2592 qsizetype colon = name.indexOf(
':');
2594 name = name.left(colon);
2595 name = name.simplified();
// "unicode" is mapped to UTF-8.
2596 if (name ==
"unicode")
2597 name = QByteArrayLiteral(
"UTF-8");
2598 if (!name.isEmpty())
// Nothing usable found.
2604 return QByteArray();
2608
2609
2610
2611
2612
2613
2614
// Determines the encoding of an HTML document: a BOM-based guess via
// encodingForData() comes first, then the charset named in a <meta> tag is
// resolved through encodingForName().
2615std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2618 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2623 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2624 if (!encodingTag.isEmpty())
2625 return encodingForName(encodingTag);
// Number of codecs that availableCodecs() will list.
// NOTE(review): the signature is not visible here; availableCodecs() calls
// this as availableCodecCount().
// Without ICU the count is the value of Encoding::LastEncoding; with ICU it is
// ICU's converter count plus one extra entry.
2632#if !defined(QT_USE_ICU_CODECS)
2633 return QStringConverter::Encoding::LastEncoding;
2637
2638
2639 return 1 + ucnv_countAvailable();
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
// Returns the names of all codecs that can be requested by name.
2656QStringList QStringConverter::availableCodecs()
// Maps a codec index to its display name.
2658 auto availableCodec = [](qsizetype index) -> QString
2660 #if !defined(QT_USE_ICU_CODECS)
// Without ICU: the name comes straight from the built-in table.
2661 return QString::fromLatin1(encodingInterfaces[index].name);
// With ICU: one entry is the name of the System encoding...
2664 return QString::fromLatin1(
2665 encodingInterfaces[QStringConverter::Encoding::System].name);
// ...and the remaining indices map to ICU converters, shifted by one.
// Preferred spelling is the MIME name, then IANA, then ICU's own name.
2668 UErrorCode status = U_ZERO_ERROR;
2669 auto icuName = ucnv_getAvailableName(int32_t(index - 1));
2670 const char *standardName = ucnv_getStandardName(icuName,
"MIME", &status);
2671 if (U_FAILURE(status) || !standardName) {
2672 status = U_ZERO_ERROR;
2673 standardName = ucnv_getStandardName(icuName,
"IANA", &status);
2676 standardName = icuName;
2677 return QString::fromLatin1(standardName);
// Collect the name of every index in [0, codecCount).
2681 qsizetype codecCount = availableCodecCount();
2683 result.reserve(codecCount);
2684 for (qsizetype i = 0; i < codecCount; ++i)
2685 result.push_back(availableCodec(i));
2690
2691
2692
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2708
2709
2710
2711
2712
2713
2714
2717
2718
2719
2721
2722
2723
2725
2726
2727
2728
2731
2732
2733
2734
2737
2738
2739
2740
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
// Flushes a decoder: input still pending in the state is reported as invalid
// and replaced in the output.
// NOTE(review): the signature is not visible here; presumably this is the
// decoder-side counterpart of QStringEncoder::finalize() — confirm.
2773 count = QtPrivate::partiallyParsedDataCount(&state);
2774 using Error = FinalizeResult::Error;
// Pending input counts as invalid; the total is saturated to qint16.
2775 const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
// Nothing pending, or no output buffer: only report the error status.
2776 if (count == 0 || !out) {
2778 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
// Result returned when the output buffer is too small.
2781 return { {}, out, invalidChars, Error::NotEnoughSpace };
// Write one replacement (NUL or U+FFFD) per pending unit.
2783 const char16_t replacement = (state.flags & QStringConverter::Flag::ConvertInvalidToNull)
2785 : QChar::ReplacementCharacter;
2786 out =
std::fill_n(out, count, replacement);
2788 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2792
2793
2794
2795
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
// Finalizes an encoding run. Writes into `out` (capacity `maxlen` bytes) and
// returns a FinalizeResult built from the advanced `out` pointer, the
// invalid-character tally and an Error code.
// NOTE(review): this span is a lossy extract; braces, the declaration of
// `flush` and some branch conditions are not present, so the comments below
// describe only the visible statements.
2825auto QStringEncoder::finalize(
char *out, qsizetype maxlen) -> QStringEncoder::FinalizeResult
2827 qsizetype count = 0;
// Query the converter state for its count of partially parsed data.
2829 count = QtPrivate::partiallyParsedDataCount(&state);
2832 using Error = FinalizeResult::Error;
// ICU is in use only if the UsesIcu flag is set and state.d[0] is non-null.
2833 const bool usesIcu = !!(state.flags & QStringConverter::Flag::UsesIcu) && !!state.d[0];
// The pending count is added to the state's invalid-character tally and the
// sum is saturate-cast to qint16.
2834 const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
// Early exit: invalid encoder, nothing pending on a non-ICU encoder, or no
// output buffer. The error is InvalidCharacters exactly when the tally is non-zero.
2835 if (!isValid() || (!count && !usesIcu) || !out) {
2837 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
// ICU branch, compiled only when QT_USE_ICU_CODECS is defined.
2841#if defined(QT_USE_ICU_CODECS)
2842 }
else if (usesIcu) {
// state.d[0] is used as the ICU converter handle.
2844 auto *icu_conv =
static_cast<UConverter *>(state.d[0]);
2846 UErrorCode err = U_ZERO_ERROR;
// Read the converter's current from-Unicode callback; if its context is not
// this object's state, re-register the same action with &state as context.
2851 UConverterFromUCallback action;
2852 const void *context;
2853 ucnv_getFromUCallBack(icu_conv, &action, &context);
2854 if (context != &state)
2855 ucnv_setFromUCallBack(icu_conv, action, &state,
nullptr,
nullptr, &err);
// Call the converter with an empty input range, writing into [out, outEnd).
// `flush` is declared on a line that is not present in this extract.
2856 const UChar *dummyInput = u"";
2857 const char *outEnd = out + maxlen;
2858 ucnv_fromUnicode(icu_conv, &out, outEnd, &dummyInput, dummyInput,
nullptr, flush, &err);
// ICU reported that the output did not fit.
2859 if (err == U_BUFFER_OVERFLOW_ERROR)
2860 return { {}, out, invalidChars, Error::NotEnoughSpace };
// Non-ICU branch taken when ConvertInvalidToNull is NOT set.
2863 }
else if (!(state.flags & QStringConverter::Flag::ConvertInvalidToNull)) {
2865
2866
2867
2868
2869
2870
2871
// A fixed array of four replacement characters; at most four are encoded.
2872 constexpr QChar replacementCharacter = QChar::ReplacementCharacter;
2873 constexpr char16_t repl = replacementCharacter.unicode();
2874 constexpr std::array<
char16_t, 4> replacement{ repl, repl, repl, repl };
2875 const qsizetype charactersToEncode =
std::min(count, qsizetype(replacement.size()));
// Refuse to write if the buffer is smaller than requiredSpace() reports for
// that many characters.
2876 if (maxlen < requiredSpace(charactersToEncode))
2877 return { {}, out, invalidChars, Error::NotEnoughSpace };
// Encode the replacement characters into `out` and advance it.
2881 out = appendToBuffer(out, QStringView(replacement.data(), charactersToEncode));
// Reports NotEnoughSpace; the condition guarding this return is not visible here.
2884 return { {}, out, invalidChars, Error::NotEnoughSpace };
// Write `count` NUL bytes and advance `out` past them.
2885 out =
std::fill_n(out, count,
'\0');
2888 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2892
2893
2894
2895
2896
2897
2898
2899
// NOTE(review): body fragment, presumably of the HTML decoder factory; the
// signature line and the guarding conditions are not present in this extract.
// Try to determine an encoding from the data itself.
2903 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
// Returns a decoder for the detected encoding (the check that one was found is
// not visible here).
2906 return QStringDecoder(encoding.value());
// Otherwise look for an encoding tag in the HTML meta information.
2908 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
// A non-empty tag is handled by a statement that is not visible in this extract.
2909 if (!encodingTag.isEmpty())
// Final fallback: a UTF-8 decoder.
2912 return QStringDecoder(Utf8);
2917
2918
2919
2920
2921
2922
2923
// Returns the name stored in the encodingInterfaces table for encoding `e`.
// NOTE(review): the declaration of `i` and the statement executed when the
// bounds check fails are not present in this extract.
2924const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
noexcept
// Bounds check against the table size; marked unlikely.
2927 if (Q_UNLIKELY(i >= std::size(encodingInterfaces)))
2929 return encodingInterfaces[i].name;
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2965
2966
2967
2970
2971
2972
2973
2974
2977
2978
2979
2980
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2995
2996
2997
2998
2999
3000
3001
3002
3003
3006
3007
3008
3009
3010
3011
3012
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3061
3062
3063
3066
3067
3068
3069
3070
3073
3074
3075
3076
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3103
3104
3105
3106
3107
3108
3109
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3127
3128
3129
3130
// Signature index: static helper signatures listed without bodies or
// terminators in this span. The list covers name matching, encoding-length
// helpers, and the to*/from* conversion routines that take a
// QStringConverter::State pointer.
static int partiallyParsedDataCount(QStringConverter::State *state)
static bool nameMatch(const char *a, QAnyStringView b)
static const uchar utf8bom[]
static QChar * fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QChar * fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
static QChar * fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf8Len(qsizetype l)
static QChar * fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QChar * fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toLatin1Len(qsizetype l)
static bool nameMatch_impl_impl(const char *a, const Char *b, const Char *b_end)
static bool nameMatch_impl(const char *a, QLatin1StringView b)
static QChar * fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
static char * toUtf32(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf8Len(qsizetype l)
static char * toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
static qsizetype toUtf16Len(qsizetype l)
static qsizetype fromLatin1Len(qsizetype l)
static char * toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf32Len(qsizetype l)
static qsizetype availableCodecCount()
static QChar * fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf32Len(qsizetype l)
static char * toUtf16(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf16Len(qsizetype l)
static char * toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
static void appendUtf16(const NoOutput &, char16_t)
static void appendUcs4(const NoOutput &, char32_t)