6#include <qstringconverter.h>
7#include <private/qstringconverter_p.h>
10#include "private/qsimd_p.h"
11#include "private/qstringiterator_p.h"
12#include "private/qtools_p.h"
15#include <QtCore/qbytearraylist.h>
19#include <unicode/ucnv.h>
20#include <unicode/ucnv_cb.h>
21#include <unicode/ucnv_err.h>
22#include <unicode/ustring.h>
23#define QT_USE_ICU_CODECS
24#define QT_COM_THREAD_INIT
26#elif QT_CONFIG(winsdkicu)
29#include <private/qfunctions_win_p.h>
30#define QT_USE_ICU_CODECS
31#define QT_COM_THREAD_INIT qt_win_ensureComInitializedOnThisThread();
36#include <qt_windows.h>
37#ifndef QT_BOOTSTRAPPED
38#include <QtCore/qvarlengtharray.h>
39#include <QtCore/private/wcharhelpers_win_p.h>
41#include <QtCore/q20iterator.h>
50#include <QtCore/q20utility.h>
51#ifndef QT_BOOTSTRAPPED
52#include <QtCore/q26numeric.h>
57using namespace QtMiscUtils;
// Compile-time guarantees: QStringEncoder and QStringDecoder must both be
// nothrow move-constructible and nothrow move-assignable.
59static_assert(std::is_nothrow_move_constructible_v<QStringEncoder>);
60static_assert(std::is_nothrow_move_assignable_v<QStringEncoder>);
61static_assert(std::is_nothrow_move_constructible_v<QStringDecoder>);
62static_assert(std::is_nothrow_move_assignable_v<QStringDecoder>);
// qBitScanReverse: helper that is only compiled when SSE2 or NEON is available.
// Two implementations are visible: with C++20 <bit> support it is
// std::bit_width(v) - 1, i.e. the index of the highest set bit of v; otherwise it
// derives the same index from the leading-zero count of v.
68#if defined(__SSE2__
) || defined(__ARM_NEON__)
69Q_ALWAYS_INLINE
static uint qBitScanReverse(
unsigned v)
noexcept
71#if defined(__cpp_lib_int_pow2) && __cpp_lib_int_pow2 >= 202002L
72 return std::bit_width(v) - 1;
74 uint result = qCountLeadingZeroBits(v);
// The leading-zero count runs down from the most significant bit. XOR with
// (number of bits in unsigned - 1) turns it into an index counted up from bit 0.
78 result ^=
sizeof(
unsigned) * 8 - 1;
// simdEncodeAscii (x86): narrows UTF-16 code units from [src, end) to bytes at dst
// with SSE2 / AVX2 / AVX512VL intrinsics; the variant is selected at compile time
// from the Cpu template argument. dst, src and nextAscii are references that the
// helper lambdas update. Every return statement visible here forwards the result of
// maybeFoundNonAscii().
85template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE
static bool
86simdEncodeAscii(uchar *&dst,
const char16_t *&nextAscii,
const char16_t *&src,
const char16_t *end)
// Input length in bytes, obtained by subtracting the pointers as char pointers.
88 size_t sizeBytes =
reinterpret_cast<
const char *>(end) -
reinterpret_cast<
const char *>(src);
// Converts one group of 16 UTF-16 units (two unaligned 128-bit loads) to 16 bytes.
91 auto process16Chars = [](uchar *dst,
const char16_t *src) {
92 __m128i data1 = _mm_loadu_si128((
const __m128i*)src);
93 __m128i data2 = _mm_loadu_si128(1+(
const __m128i*)src);
// _mm_packus_epi16 narrows signed 16-bit lanes with unsigned saturation, so units
// 0x0100-0x7fff become 0xff and units 0x8000-0xffff become 0x00. The signed "> 0"
// comparison is therefore true only for bytes 0x01-0x7f.
103 __m128i packed = _mm_packus_epi16(data1, data2);
104 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
// The packed bytes are written to dst.
107 _mm_storeu_si128((__m128i*)dst, packed);
// Inverted mask: one bit per unit that is outside 0x01-0x7f (this includes NUL).
110 ushort n = ~_mm_movemask_epi8(nonAscii);
// Interprets a bit mask of flagged units: the highest set bit gives nextAscii (one
// past the last flagged unit of the group), the trailing-zero count gives the index
// of the first flagged unit.
113 auto maybeFoundNonAscii = [&](
auto n, qptrdiff offset = 0) {
120 nextAscii = src + qBitScanReverse(n) + 1;
122 n = qCountTrailingZeroBits(n);
// Advances dst by the number of UTF-16 units in the whole input.
129 auto adjustToEnd = [&] {
130 dst += sizeBytes /
sizeof(
char16_t);
// AVX2 variant: 32 units per step.
134 if constexpr (Cpu & CpuFeatureAVX2) {
139 constexpr size_t Step = 32;
// Note the argument order (src, dst), the reverse of process16Chars.
140 auto process32Chars = [](
const char16_t *src, uchar *dst) {
141 __m256i data1 = _mm256_loadu_si256(
reinterpret_cast<
const __m256i *>(src));
142 __m256i data2 = _mm256_loadu_si256(
reinterpret_cast<
const __m256i *>(src) + 1);
// The 256-bit pack works within each 128-bit lane, so the four 64-bit quarters hold
// source chunks 0, 2, 1, 3; the permute puts them back into source order.
143 __m256i packed = _mm256_packus_epi16(data1, data2);
144 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
145 __m256i nonAscii = _mm256_cmpgt_epi8(permuted, _mm256_setzero_si256());
148 _mm256_storeu_si256(
reinterpret_cast<__m256i *>(dst), permuted);
// One bit per unit that is outside 0x01-0x7f.
150 return ~_mm256_movemask_epi8(nonAscii);
// AVX512VL: an input of at most Step units is handled by a single masked
// load / pack / store sequence.
153 if constexpr (Cpu & CpuFeatureAVX512VL) {
155 if (sizeBytes <= Step *
sizeof(
char16_t)) {
// One mask bit per UTF-16 unit: the low (sizeBytes / 2) bits are set.
156 uint mask = _bzhi_u32(-1, uint(sizeBytes / 2));
157 __m256i data1 = _mm256_maskz_loadu_epi16(mask, src);
158 __m256i data2 = _mm256_maskz_loadu_epi16(mask >> 16, src + Step / 2);
159 __m256i packed = _mm256_packus_epi16(data1, data2);
160 __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
// Within the mask, flags bytes that compare signed <= 0 (0x00 or 0x80-0xff).
161 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, permuted, _mm256_setzero_si256());
164 _mm256_mask_storeu_epi8(dst, mask, permuted);
166 return maybeFoundNonAscii(nonAscii);
172 if (sizeBytes >= Step *
sizeof(
char16_t)) {
// The strict '<' leaves the final group (full or partial) to the step after the loop.
175 for ( ; (offset + Step) *
sizeof(
char16_t) < sizeBytes; offset += Step) {
176 if (uint n = process32Chars(src + offset, dst + offset))
177 return maybeFoundNonAscii(n, offset);
// Final group, addressed backwards from src/dst; presumably both were moved to the
// end of the input first, so this group may overlap what the loop already handled.
182 uint n = process32Chars(src - Step, dst - Step);
183 return maybeFoundNonAscii(n, -
int(Step));
// SSE2 variant: 16 units per step, same loop shape as above.
187 constexpr size_t Step = 16;
188 if (sizeBytes >= Step *
sizeof(
char16_t)) {
191 for ( ; (offset + Step) *
sizeof(
char16_t) < sizeBytes; offset += Step) {
192 ushort n = process16Chars(dst + offset, src + offset);
194 return maybeFoundNonAscii(n, offset);
195 if (Cpu & CpuFeatureAVX2)
201 ushort n = process16Chars(dst - Step, src - Step);
202 return maybeFoundNonAscii(n, -
int(Step));
// Shorter inputs, only in builds that are not optimized for size.
205# if !defined(__OPTIMIZE_SIZE__)
// 8 or more units: the first 8 and the last 8 (loaded from end - 8) are converted;
// the two groups may overlap.
206 if (sizeBytes >= 8 *
sizeof(
char16_t)) {
208 __m128i data = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src));
209 __m128i data2 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(end - 8));
210 __m128i packed = _mm_packus_epi16(data, data);
211 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
// Only the low 64 bits (8 bytes) are stored.
214 _mm_storel_epi64(
reinterpret_cast<__m128i *>(dst), packed);
// Conversion to uchar keeps the eight low mask bits, one per unit.
216 uchar n = ~_mm_movemask_epi8(nonAscii);
218 return maybeFoundNonAscii(n);
221 packed = _mm_packus_epi16(data2, data2);
222 nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
223 _mm_storel_epi64(
reinterpret_cast<__m128i *>(dst - 8), packed);
224 n = ~_mm_movemask_epi8(nonAscii);
225 return maybeFoundNonAscii(n, -8);
226 }
// 4 or more units: same scheme with 64-bit loads and 32-bit stores.
else if (sizeBytes >= 4 *
sizeof(
char16_t)) {
228 __m128i data1 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(src));
229 __m128i data2 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(end - 4));
230 __m128i packed = _mm_packus_epi16(data1, data1);
231 __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
234 qToUnaligned(_mm_cvtsi128_si32(packed), dst);
// XOR with 0xf inverts the four low mask bits, one per unit.
236 uchar n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
238 return maybeFoundNonAscii(n);
241 packed = _mm_packus_epi16(data2, data2);
242 nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
243 qToUnaligned(_mm_cvtsi128_si32(packed), dst - 4);
244 n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
245 return maybeFoundNonAscii(n, -4);
// simdDecodeAscii (x86): widens bytes from [src, end) to UTF-16 code units at dst
// with SSE2 / AVX2 / AVX512VL intrinsics; the variant is selected at compile time
// from the Cpu template argument. dst, src and nextAscii are references that the
// helper lambdas update. Every return statement visible here forwards the result of
// maybeFoundNonAscii().
252template <QCpuFeatureType Cpu = _compilerCpuFeatures> Q_ALWAYS_INLINE
static bool
253simdDecodeAscii(
char16_t *&dst,
const uchar *&nextAscii,
const uchar *&src,
const uchar *end)
// Widens one group of 16 bytes to 16 UTF-16 units.
256 auto process16Chars = [](
char16_t *dst,
const uchar *src) {
257 __m128i data = _mm_loadu_si128((
const __m128i*)src);
// movemask gathers the top bit of every byte: a set bit marks a byte >= 0x80.
261 uint n = _mm_movemask_epi8(data);
// Interleaving with zero zero-extends the low and the high eight bytes.
264 _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
265 _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
// Interprets a bit mask of flagged bytes: the trailing-zero count is the index of the
// first flagged byte, the highest set bit gives nextAscii (one past the last one).
268 auto maybeFoundNonAscii = [&](uint n, qptrdiff offset = 0) {
273 uint c = qCountTrailingZeroBits(n);
276 n = qBitScanReverse(n);
277 nextAscii = src + n + 1;
283 auto adjustToEnd = [&] {
// AVX2 variant: 32 bytes per step.
288 if constexpr (Cpu & CpuFeatureAVX2) {
289 constexpr qsizetype Step = 32;
290 auto process32Chars = [](
char16_t *dst,
const uchar *src) {
291 __m128i data1 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src));
292 __m128i data2 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src) + 1);
// OR of both halves: its movemask is non-zero if any of the 32 bytes is >= 0x80.
297 __m128i ored = _mm_or_si128(data1, data2);
298 bool any = _mm_movemask_epi8(ored);
// Each 16-byte half is zero-extended to 16 UTF-16 units; all 32 units are stored.
301 __m256i extended1 = _mm256_cvtepu8_epi16(data1);
302 __m256i extended2 = _mm256_cvtepu8_epi16(data2);
303 _mm256_storeu_si256(
reinterpret_cast<__m256i *>(dst), extended1);
304 _mm256_storeu_si256(
reinterpret_cast<__m256i *>(dst) + 1, extended2);
306 uint n1 = _mm_movemask_epi8(data1);
307 uint n2 = _mm_movemask_epi8(data2);
// The result object converts to bool (any) and to uint (n1 in the low 16 bits,
// n2 in the high 16 bits).
311 operator
bool()
const {
return any; }
312 operator uint()
const {
return n1|(n2 << 16); }
314 return R{ n1, n2, any };
// AVX512VL: an input of at most Step bytes is handled by a single masked sequence.
317 if constexpr (Cpu & CpuFeatureAVX512VL) {
319 if (end - src <= Step) {
// One mask bit per input byte: the low (end - src) bits are set.
320 __mmask32 mask = _bzhi_u32(-1, uint(end - src));
321 __m256i data = _mm256_maskz_loadu_epi8(mask, src);
// Within the mask, flags bytes that compare signed <= 0 (0x00 or 0x80-0xff).
322 __mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, data, _mm256_setzero_si256());
// Lower and upper 128-bit halves are widened separately and stored under the low
// and the high 16 bits of the mask.
325 __m256i extended1 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data));
326 __m256i extended2 = _mm256_cvtepu8_epi16(_mm256_extracti64x2_epi64(data, 1));
327 _mm256_mask_storeu_epi16(dst, mask, extended1);
328 _mm256_mask_storeu_epi16(dst + Step/2, mask >> 16, extended2);
330 return maybeFoundNonAscii(nonAscii);
336 if (end - src >= Step) {
// The strict '<' leaves the final group (full or partial) to the step after the loop.
339 for ( ; offset + Step < end - src; offset += Step) {
340 auto r = process32Chars(dst + offset, src + offset);
342 return maybeFoundNonAscii(r, offset);
// Final group, addressed backwards from dst/src; presumably both were moved to the
// end of the data first, so this group may overlap what the loop already handled.
347 auto r = process32Chars(dst - Step, src - Step);
348 return maybeFoundNonAscii(r, -Step);
// SSE2 variant: 16 bytes per step, same loop shape as above.
352 constexpr qsizetype Step = 16;
353 if (end - src >= Step) {
355 for ( ; offset + Step < end - src; offset += Step) {
356 ushort n = process16Chars(dst + offset, src + offset);
358 return maybeFoundNonAscii(n, offset);
359 if (Cpu & CpuFeatureAVX2)
365 return maybeFoundNonAscii(process16Chars(dst - Step, src - Step), -Step);
// Shorter inputs, only in builds that are not optimized for size.
368# if !defined(__OPTIMIZE_SIZE__)
// 8 or more bytes: the first 8 and the last 8 (loaded from end - 8) are widened;
// the two groups may overlap.
369 if (end - src >= 8) {
370 __m128i data = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(src));
371 __m128i data2 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(end - 8));
// Only the eight low mask bits correspond to loaded bytes.
372 uint n = _mm_movemask_epi8(data) & 0xff;
374 _mm_storeu_si128(
reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
376 return maybeFoundNonAscii(n);
380 n = _mm_movemask_epi8(data2) & 0xff;
381 data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
382 _mm_storeu_si128(
reinterpret_cast<__m128i *>(dst - 8), data2);
383 return maybeFoundNonAscii(n, -8);
// 4 or more bytes: same scheme with unaligned 32-bit loads and 64-bit stores.
385 if (end - src >= 4) {
386 __m128i data = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src));
387 __m128i data2 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(end - 4));
// Only the four low mask bits correspond to loaded bytes.
388 uchar n = uchar(_mm_movemask_epi8(data) & 0xf);
390 data = _mm_unpacklo_epi8(data, _mm_setzero_si128());
391 _mm_storel_epi64(
reinterpret_cast<__m128i *>(dst), data);
393 return maybeFoundNonAscii(n);
397 n = uchar(_mm_movemask_epi8(data2) & 0xf);
398 data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
399 _mm_storel_epi64(
reinterpret_cast<__m128i *>(dst - 4), data2);
400 return maybeFoundNonAscii(n, -4);
// simdFindNonAscii (x86): scans [src, end) in groups of 32, 16 and 4 bytes. In the
// vector loops a group containing a byte >= 0x80 sets nextAscii to one past the last
// such byte of the group and returns a pointer to the first such byte.
407static inline const uchar *simdFindNonAscii(
const uchar *src,
const uchar *end,
const uchar *&nextAscii)
// 32-byte groups: testz(mask, data) is non-zero when no byte has bit 0x80 set.
412 const __m256i mask = _mm256_set1_epi8(
char(0x80));
413 for ( ; end - src >= 32; src += 32) {
414 __m256i data = _mm256_loadu_si256(
reinterpret_cast<
const __m256i *>(src));
415 if (_mm256_testz_si256(mask, data))
// One bit per byte, set where the byte is >= 0x80.
418 uint n = _mm256_movemask_epi8(data);
424 nextAscii = src + qBitScanReverse(n) + 1;
427 return src + qCountTrailingZeroBits(n);
// 16-byte groups, same mask logic.
432 for ( ; end - src >= 16; src += 16) {
433 __m128i data = _mm_loadu_si128(
reinterpret_cast<
const __m128i*>(src));
437 uint n = _mm_movemask_epi8(data);
444 nextAscii = src + qBitScanReverse(n) + 1;
447 return src + qCountTrailingZeroBits(n);
// 4-byte groups, read as one unaligned 32-bit word.
451 for ( ; end - src >= 4; src += 4) {
452 quint32 data = qFromUnaligned<quint32>(src);
// simdCompareAscii (x86): walks an 8-bit string (src8) and a UTF-16 string (src16)
// in parallel over their common length. The visible code builds a bit mask that
// flags positions where the two differ or where the 8-bit side has its top bit set,
// and finally advances offset by the position of the first flagged character.
469static void simdCompareAscii(
const qchar8_t *&src8,
const qchar8_t *end8,
const char16_t *&src16,
const char16_t *end16)
// Only the common prefix length of both inputs is examined.
472 qptrdiff len = qMin(end8 - src8, end16 - src16);
// Main loop: 16 characters per iteration.
477 for ( ; offset + 16 < len; offset += 16) {
478 __m128i data8 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src8 + offset));
// 256-bit path: the 16 bytes are zero-extended and compared with 16 UTF-16 units.
481 __m256i data16 = _mm256_loadu_si256(
reinterpret_cast<
const __m256i *>(src16 + offset));
// movemask of the zero-extended data has a bit set where an 8-bit character is >= 0x80.
484 __m256i datax8 = _mm256_cvtepu8_epi16(data8);
485 mask = _mm256_movemask_epi8(datax8);
// Inverted equality mask: two bits per 16-bit lane, set where the lanes differ.
490 __m256i latin1cmp = _mm256_cmpeq_epi16(datax8, data16);
491 mask = ~_mm256_movemask_epi8(latin1cmp);
// 128-bit path: the same comparison done in a low and a high half.
496 __m128i datalo16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src16 + offset));
497 __m128i datahi16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src16 + offset) + 1);
500 __m128i datalo8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
501 __m128i datahi8 = _mm_unpackhi_epi8(data8, _mm_setzero_si128());
504 __m128i latin1cmplo = _mm_cmpeq_epi16(datalo8, datalo16);
505 __m128i latin1cmphi = _mm_cmpeq_epi16(datahi8, datahi16);
// Equality bits of the high half go to bits 16-31, those of the low half to bits 0-15.
506 mask = _mm_movemask_epi8(latin1cmphi) << 16;
507 mask |= ushort(_mm_movemask_epi8(latin1cmplo));
// One bit per 8-bit character, set where it is >= 0x80.
513 mask = _mm_movemask_epi8(data8);
// Helper for groups of n < 16 characters held in 128-bit registers.
522 auto cmp_lt_16 = [&mask, &offset](
int n, __m128i data8, __m128i data16) {
// movemask yields two bits per 16-bit lane, so n characters occupy the low 2 * n bits.
525 unsigned sizemask = (1U << (2 * n)) - 1;
528 data8 = _mm_unpacklo_epi8(data8, _mm_setzero_si128());
531 __m128i latin1cmp = _mm_cmpeq_epi16(data8, data16);
// Flags lanes that differ, plus lanes whose 8-bit character is >= 0x80.
532 mask = ~_mm_movemask_epi8(latin1cmp) & sizemask;
533 mask |= _mm_movemask_epi8(data8);
// Groups of 8 and then 4 characters, only while nothing has been flagged.
539 if (mask == 0 && offset + 8 < len) {
540 __m128i data8 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(src8 + offset));
541 __m128i data16 = _mm_loadu_si128(
reinterpret_cast<
const __m128i *>(src16 + offset));
542 cmp_lt_16(8, data8, data16);
546 if (mask == 0 && offset + 4 < len) {
547 __m128i data8 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src8 + offset));
548 __m128i data16 = _mm_loadl_epi64(
reinterpret_cast<
const __m128i *>(src16 + offset));
549 cmp_lt_16(4, data8, data16);
// Turns the index of the first set mask bit into a character count. bitSpacing is
// declared outside the visible lines; presumably it is 1 when the mask carries two
// bits per character and 0 when it carries one - TODO confirm.
554 offset += qCountTrailingZeroBits(mask) >> bitSpacing;
558#elif defined(__ARM_NEON__)
// simdEncodeAscii (NEON): narrows UTF-16 code units to bytes, 16 units per
// iteration, and computes a 16-bit mask of units above 0x7f for each group.
559static inline bool simdEncodeAscii(uchar *&dst,
const char16_t *&nextAscii,
const char16_t *&src,
const char16_t *end)
561 uint16x8_t maxAscii = vdupq_n_u16(0x7f);
// mask1 selects the even bit positions (0, 2, ..., 14); mask2 the odd ones.
562 uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 );
563 uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
566 for ( ; end - src >= 16; src += 16, dst += 16) {
// De-interleaving load: val[0] holds the even-indexed units, val[1] the odd-indexed ones.
568 uint16x8x2_t in = vld2q_u16(
reinterpret_cast<
const uint16_t *>(src));
// Bit i of nonAscii is set when unit i of the group is greater than 0x7f.
572 uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1))
573 | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2));
// Shift-left-insert by 8 puts each odd unit's low byte above the even unit's low
// byte, so every 16-bit lane holds two consecutive output bytes.
576 uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8);
579 vst1q_u8(dst, vreinterpretq_u8_u16(out));
// nextAscii: one past the last flagged unit; the trailing-zero count is the index
// of the first flagged unit.
585 nextAscii = src + qBitScanReverse(nonAscii) + 1;
587 nonAscii = qCountTrailingZeroBits(nonAscii);
// simdDecodeAscii (NEON): widens bytes to UTF-16 code units, 8 bytes per iteration,
// and computes an 8-bit mask of bytes >= 0x80 for each group.
596static inline bool simdDecodeAscii(
char16_t *&dst,
const uchar *&nextAscii,
const uchar *&src,
const uchar *end)
599 uint8x8_t msb_mask = vdup_n_u8(0x80);
// Lane i contributes bit i to the horizontal sum below.
600 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
601 for ( ; end - src >= 8; src += 8, dst += 8) {
602 uint8x8_t c = vld1_u8(src);
// Bit i of n is set when byte i of the group is >= 0x80.
603 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
// Zero-extend the 8 bytes to 8 UTF-16 units and store them.
606 vst1q_u16(
reinterpret_cast<uint16_t *>(dst), vmovl_u8(c));
// nextAscii: one past the last flagged byte of the group.
619 n = qBitScanReverse(n);
620 nextAscii = src + n + 1;
// simdFindNonAscii (NEON): scans [src, end) 8 bytes at a time. For a group
// containing a byte >= 0x80 it sets nextAscii to one past the last such byte and
// returns a pointer to the first such byte.
627static inline const uchar *simdFindNonAscii(
const uchar *src,
const uchar *end,
const uchar *&nextAscii)
635 uint8x8_t msb_mask = vdup_n_u8(0x80);
// Lane i contributes bit i to the horizontal sum below.
636 uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
637 for ( ; end - src >= 8; src += 8) {
638 uint8x8_t c = vld1_u8(src);
// Bit i of n is set when byte i of the group is >= 0x80.
639 uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
646 nextAscii = src + qBitScanReverse(n) + 1;
649 return src + qCountTrailingZeroBits(n);
// simdCompareAscii variant whose four parameters are all unnamed, i.e. unused by the
// body. NOTE(review): presumably a no-op for the NEON build - confirm against the
// enclosing preprocessor branch.
655static void simdCompareAscii(
const qchar8_t *&,
const qchar8_t *,
const char16_t *&,
const char16_t *)
// simdEncodeAscii variant that takes every argument by value and leaves them all
// unnamed, so it cannot modify the caller's pointers. NOTE(review): presumably the
// fallback for builds without SSE2 or NEON - confirm.
659static inline bool simdEncodeAscii(uchar *,
const char16_t *,
const char16_t *,
const char16_t *)
// simdDecodeAscii variant that takes every argument by value and leaves them all
// unnamed, so it cannot modify the caller's pointers. NOTE(review): presumably the
// fallback for builds without SSE2 or NEON - confirm.
664static inline bool simdDecodeAscii(
char16_t *,
const uchar *,
const uchar *,
const uchar *)
// simdFindNonAscii variant with the same signature as the vectorized versions.
// NOTE(review): presumably the fallback for builds without SSE2 or NEON - confirm.
669static inline const uchar *simdFindNonAscii(
const uchar *src,
const uchar *end,
const uchar *&nextAscii)
// simdCompareAscii variant whose four parameters are all unnamed, i.e. unused by the
// body. NOTE(review): presumably the fallback for builds without SSE2 or NEON - confirm.
675static void simdCompareAscii(
const qchar8_t *&,
const qchar8_t *,
const char16_t *&,
const char16_t *)
// Core UTF-16 -> UTF-8 encoder. Writes the encoding of `in` to `out` and returns the
// pointer just past the last byte written. onError is invoked as
// onError(dst, u, res) whenever encoding a code unit yields a negative result.
682template <
typename OnErrorLambda> Q_ALWAYS_INLINE
683char *QUtf8::convertFromUnicode(
char *out, QStringView in, OnErrorLambda &&onError)
noexcept
685 qsizetype len = in.size();
687 uchar *dst =
reinterpret_cast<uchar *>(out);
688 const char16_t *src =
reinterpret_cast<
const char16_t *>(in.data());
689 const char16_t *
const end = src + len;
// nextAscii bounds the scalar loop below; the SIMD helper may move it.
692 const char16_t *nextAscii = end;
693 if (simdEncodeAscii(dst, nextAscii, src, end))
// Scalar path: encode one code unit u; a negative result is reported to onError.
698 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
699 if (Q_UNLIKELY(res < 0))
700 onError(dst, u, res);
701 }
while (src < nextAscii);
704 return reinterpret_cast<
char *>(dst);
// Stateless UTF-16 -> UTF-8 conversion into a caller-provided buffer. Delegates to
// the templated overload with an error handler that only uses the output cursor
// (its remaining arguments are swallowed by the C-style variadic "...").
707char *QUtf8::convertFromUnicode(
char *dst, QStringView in)
noexcept
709 return convertFromUnicode(dst, in, [](
auto *dst, ...) {
// Stateless UTF-16 -> UTF-8 conversion returning a new QByteArray.
715QByteArray QUtf8::convertFromUnicode(QStringView in)
717 qsizetype len = in.size();
// Allocate three bytes per UTF-16 code unit up front; the array is cut down to the
// number of bytes actually written afterwards.
720 QByteArray result(len * 3, Qt::Uninitialized);
721 char *dst =
const_cast<
char *>(result.constData());
722 dst = convertFromUnicode(dst, in);
723 result.truncate(dst - result.constData());
// Stateful UTF-16 -> UTF-8 conversion returning a new QByteArray.
727QByteArray QUtf8::convertFromUnicode(QStringView in, QStringConverter::State *state)
// Three bytes per code unit plus three spare bytes, presumably for output that does
// not stem from `in` itself (e.g. a UTF-8 byte order mark).
729 QByteArray ba(3*in.size() +3, Qt::Uninitialized);
730 char *end = convertFromUnicode(ba.data(), in, state);
// Shrink to the number of bytes actually produced.
731 ba.truncate(end - ba.data());
// Stateful UTF-16 -> UTF-8 conversion into a caller-provided buffer. Uses `state`
// to carry a pending code unit between calls, to write a byte order mark once, and
// to count invalid input.
735char *QUtf8::convertFromUnicode(
char *out, QStringView in, QStringConverter::State *state)
738 qsizetype len = in.size();
// Emits the substitute for invalid input; what is written depends on the
// ConvertInvalidToNull flag.
742 auto appendReplacementChar = [state](uchar *cursor) -> uchar * {
743 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull) {
754 uchar *cursor =
reinterpret_cast<uchar *>(out);
755 const char16_t *src = in.utf16();
756 const char16_t *
const end = src + len;
758 if (!(state->flags & QStringDecoder::Flag::Stateless)) {
// A code unit saved in state_data[0] by an earlier call is encoded first, together
// with the start of the new input; the saved state is then cleared.
759 if (state->remainingChars) {
760 int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(state->state_data[0], cursor, src, end);
762 cursor = appendReplacementChar(cursor);
763 state->state_data[0] = 0;
764 state->remainingChars = 0;
765 }
// Otherwise, write the three-byte BOM once if requested and remember that it is done.
else if (!(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom) {
767 *cursor++ = utf8bom[0];
768 *cursor++ = utf8bom[1];
769 *cursor++ = utf8bom[2];
770 state->internalState |= HeaderDone;
774 out =
reinterpret_cast<
char *>(cursor);
// Encode the rest through the templated overload; the handler below deals with
// encoding errors.
775 return convertFromUnicode(out, { src, end }, [&](uchar *&cursor,
char16_t uc,
int res) {
// Invalid input: count it and emit the replacement.
776 if (res == QUtf8BaseTraits::Error) {
778 ++state->invalidChars;
779 cursor = appendReplacementChar(cursor);
780 }
// Input ended in the middle of a character: in stateless mode that is an error too,
// otherwise the code unit is saved in the state.
else if (res == QUtf8BaseTraits::EndOfString) {
781 if (state->flags & QStringConverter::Flag::Stateless) {
782 ++state->invalidChars;
783 cursor = appendReplacementChar(cursor);
785 state->remainingChars = 1;
786 state->state_data[0] = uc;
// Latin-1 -> UTF-8 conversion into a caller-provided buffer.
792char *QUtf8::convertFromLatin1(
char *out, QLatin1StringView in)
795 for (uchar ch : in) {
// Two-byte UTF-8 form: lead byte 0b110xxxxx carries the bits above the low six
// (ch >> 6), continuation byte 0b10xxxxxx carries the low six bits.
800 *out++ = 0b110'0'0000u | (ch >> 6);
801 *out++ = 0b10'00'0000u | (ch & 0b0011'1111);
// Stateless UTF-8 -> UTF-16 conversion returning a new QString.
807QString QUtf8::convertToUnicode(QByteArrayView in)
// Allocate one UTF-16 code unit per input byte; the string is cut down to the
// number of code units actually written afterwards.
821 QString result(in.size(), Qt::Uninitialized);
// The buffer is written through a const_cast of constData().
822 QChar *data =
const_cast<QChar*>(result.constData());
823 const QChar *end = convertToUnicode(data, in);
824 result.truncate(end - data);
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
// Stateless UTF-8 -> UTF-16 conversion into a caller-provided buffer.
847char16_t *QUtf8::convertToUnicode(
char16_t *dst, QByteArrayView in)
noexcept
// A UTF-8 byte order mark at the very start of the input is skipped.
850 auto bom = QByteArrayView::fromArray(utf8bom);
851 if (in.size() >= bom.size() && in.first(bom.size()) == bom)
852 in.slice(
sizeof(utf8bom));
// Decode through the templated overload; the error handler writes U+FFFD
// (QChar::ReplacementCharacter) to the output.
854 return convertToUnicode(dst, in, [](
char16_t *&dst, ...) {
856 *dst++ = QChar::ReplacementCharacter;
// Core UTF-8 -> UTF-16 decoder. Decodes `in` into dst; onError is invoked as
// onError(dst, src, res) when decoding a sequence does not succeed, and its boolean
// result is checked by the loop.
861template <
typename OnErrorLambda> Q_ALWAYS_INLINE
char16_t *
862QUtf8::convertToUnicode(
char16_t *dst, QByteArrayView in, OnErrorLambda &&onError)
noexcept
864 const uchar *
const start =
reinterpret_cast<
const uchar *>(in.data());
865 const uchar *src = start;
866 const uchar *end = src + in.size();
// nextAscii bounds the scalar loop below; the SIMD helper may move it.
869 const uchar *nextAscii = end;
872 if (simdDecodeAscii(dst, nextAscii, src, end))
// Scalar path: decode one sequence starting with byte b; non-negative results are
// the expected case.
877 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
878 if (Q_LIKELY(res >= 0))
881 if (!onError(dst, src, res))
883 }
while (src < nextAscii);
// Stateful UTF-8 -> UTF-16 conversion returning a new QString.
889QString QUtf8::convertToUnicode(QByteArrayView in, QStringConverter::State *state)
// One UTF-16 code unit per input byte plus one spare unit, presumably for output
// stemming from bytes carried over in `state`.
901 QString result(in.size() + 1, Qt::Uninitialized);
902 QChar *end = convertToUnicode(result.data(), in, state);
// Shrink to the number of code units actually produced.
903 result.truncate(end - result.constData());
// Stateful UTF-8 -> UTF-16 conversion into a caller-provided buffer. Uses `state`
// to carry the bytes of an incomplete sequence between calls, to track whether the
// start of the stream (BOM handling) is done, and to count invalid input.
907char16_t *QUtf8::convertToUnicode(
char16_t *dst, QByteArrayView in, QStringConverter::State *state)
909 qsizetype len = in.size();
// Invalid input is replaced by U+FFFD, or by NUL when ConvertInvalidToNull is set.
916 char16_t replacement = QChar::ReplacementCharacter;
917 if (state->flags & QStringConverter::Flag::ConvertInvalidToNull)
918 replacement = QChar::Null;
922 const uchar *src =
reinterpret_cast<
const uchar *>(in.data());
923 const uchar *end = src + len;
925 if (!(state->flags & QStringConverter::Flag::Stateless)) {
// The header counts as done if it was already processed or if an initial BOM is to
// be converted like any other character.
926 bool headerdone = state->internalState & HeaderDone || state->flags & QStringConverter::Flag::ConvertInitialBom;
927 if (state->remainingChars || !headerdone) {
// Build a scratch buffer of at most four bytes: the bytes saved in the state
// followed by as many new input bytes as fit, then decode one character from it.
929 uchar remainingCharsData[4];
930 qsizetype remainingCharsCount = state->remainingChars;
931 qsizetype newCharsToCopy = qMin<qsizetype>(
sizeof(remainingCharsData) - remainingCharsCount, end - src);
933 memset(remainingCharsData, 0,
sizeof(remainingCharsData));
934 memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
935 memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
937 const uchar *begin = &remainingCharsData[1];
938 res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
939 static_cast<
const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
// Invalid sequence: count it and emit the replacement.
940 if (res == QUtf8BaseTraits::Error) {
941 ++state->invalidChars;
942 *dst++ = replacement;
944 }
// Still incomplete: save everything gathered so far back into the state.
else if (res == QUtf8BaseTraits::EndOfString) {
947 state->remainingChars = remainingCharsCount + newCharsToCopy;
948 memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
950 }
// First character of the stream decoded: check whether it is U+FEFF (a BOM).
else if (!headerdone) {
952 if (dst[-1] == 0xfeff)
955 state->internalState |= HeaderDone;
// res counts bytes of the scratch buffer; only the part beyond the saved bytes was
// taken from the new input.
959 Q_ASSERT(res > remainingCharsCount);
960 src += res - remainingCharsCount;
963 }
// BOM check on the raw input when an initial BOM is not to be converted
// (presumably the stateless branch - TODO confirm nesting).
else if (!(state->flags & QStringConverter::Flag::ConvertInitialBom)) {
965 if (len > 2 && src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])
// Decode the rest through the templated overload; the handler below deals with
// decoding errors.
972 dst = convertToUnicode(dst, { src, end }, [&](
char16_t *&dst,
const uchar *src_,
int res_) {
975 if (res == QUtf8BaseTraits::Error) {
977 ++state->invalidChars;
978 *dst++ = replacement;
// Input ended in the middle of a sequence.
983 if (res == QUtf8BaseTraits::EndOfString) {
// Stateless: emit U+FFFD and count an invalid character, once up front and once per
// loop iteration while src is advanced to end.
985 if (state->flags & QStringConverter::Flag::Stateless) {
986 *dst++ = QChar::ReplacementCharacter;
987 ++state->invalidChars;
988 while (src++ < end) {
989 *dst++ = QChar::ReplacementCharacter;
990 ++state->invalidChars;
992 state->remainingChars = 0;
// Stateful: keep the unconsumed bytes for the next call.
995 state->remainingChars = end - src;
996 memcpy(&state->state_data[0], src, end - src);
999 state->remainingChars = 0;
// Checks whether `in` is well-formed UTF-8 without producing output. The result
// pair is { false, false } when decoding fails and { true, isValidAscii } otherwise;
// isValidAscii starts out true and is cleared once a byte with the top bit set is
// handed to the decoder.
1012QUtf8::ValidUtf8Result QUtf8::isValidUtf8(QByteArrayView in)
1014 const uchar *src =
reinterpret_cast<
const uchar *>(in.data());
1015 const uchar *end = src + in.size();
1016 const uchar *nextAscii = src;
1017 bool isValidAscii =
true;
// Use the SIMD scan only once src has reached nextAscii.
1020 if (src >= nextAscii)
1021 src = simdFindNonAscii(src, end, nextAscii);
// Top bit clear: plain ASCII byte.
1027 if ((b & 0x80) == 0)
1030 isValidAscii =
false;
// Decode with traits that discard the output.
1031 QUtf8NoOutputTraits::NoOutput output;
1032 const qsizetype res = QUtf8Functions::fromUtf8<QUtf8NoOutputTraits>(b, output, src, end);
1035 return {
false,
false };
1037 }
while (src < nextAscii);
1040 return {
true, isValidAscii };
// Compares a UTF-8 byte sequence with a UTF-16 string by code point. Returns the
// difference of the first pair of code points that is returned from the loop, or
// otherwise a value derived from which input still has data left.
1043int QUtf8::compareUtf8(QByteArrayView utf8, QStringView utf16, Qt::CaseSensitivity cs)
noexcept
1045 auto src1 =
reinterpret_cast<
const qchar8_t *>(utf8.data());
1046 auto end1 = src1 + utf8.size();
1047 auto src2 =
reinterpret_cast<
const char16_t *>(utf16.data());
1048 auto end2 = src2 + utf16.size();
// Vectorized pass over the leading part; it takes src1 and src2 by reference.
1051 simdCompareAscii(src1, end1, src2, end2);
1053 if (src1 < end1 && src2 < end2) {
// One code point from each side per iteration.
1054 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1055 char32_t uc2 = *src2++;
// Combine a high surrogate with a following low surrogate into one code point.
1060 if (QChar::isHighSurrogate(uc2) && src2 < end2 && QChar::isLowSurrogate(*src2))
1061 uc2 = QChar::surrogateToUcs4(uc2, *src2++);
1063 if (cs == Qt::CaseInsensitive) {
1064 uc1 = QChar::toCaseFolded(uc1);
1065 uc2 = QChar::toCaseFolded(uc2);
1068 return int(uc1) -
int(uc2);
1070 }
while (src1 < end1 && src2 < end2);
// Both prefixes matched: +1 if only the UTF-8 side has data left, -1 if only the
// UTF-16 side has, 0 if both are exhausted.
1073 return (end1 > src1) -
int(end2 > src2);
// Compares a UTF-8 byte sequence with a Latin-1 string by code point. Returns the
// difference of the first pair of code points that is returned from the loop, or
// otherwise a value derived from which input still has data left.
1076int QUtf8::compareUtf8(QByteArrayView utf8, QLatin1StringView s, Qt::CaseSensitivity cs)
1078 auto src1 =
reinterpret_cast<
const qchar8_t *>(utf8.data());
1079 auto end1 = src1 + utf8.size();
1080 auto src2 =
reinterpret_cast<
const uchar *>(s.latin1());
1081 auto end2 = src2 + s.size();
1083 while (src1 < end1 && src2 < end2) {
// One decoded code point from the UTF-8 side against one Latin-1 byte.
1084 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1085 char32_t uc2 = *src2++;
1086 if (cs == Qt::CaseInsensitive) {
1087 uc1 = QChar::toCaseFolded(uc1);
1088 uc2 = QChar::toCaseFolded(uc2);
1091 return int(uc1) -
int(uc2);
// +1 if only the UTF-8 side has data left, -1 if only the Latin-1 side has, else 0.
1095 return (end1 > src1) - (end2 > src2);
// Compares two UTF-8 byte sequences. The case-sensitive comparison is byte-wise;
// otherwise both sides are decoded and compared by case-folded code point.
1098int QUtf8::compareUtf8(QByteArrayView lhs, QByteArrayView rhs, Qt::CaseSensitivity cs)
noexcept
// Two early returns that compare a length of 0 against the other side's length.
1101 return qt_lencmp(0, rhs.size());
1104 return qt_lencmp(lhs.size(), 0);
// Case-sensitive: memcmp over the common length, ties decided by length.
1106 if (cs == Qt::CaseSensitive) {
1107 const auto l = std::min(lhs.size(), rhs.size());
1108 int r = memcmp(lhs.data(), rhs.data(), l);
1109 return r ? r : qt_lencmp(lhs.size(), rhs.size());
1112 auto src1 =
reinterpret_cast<
const qchar8_t *>(lhs.data());
1113 auto end1 = src1 + lhs.size();
1114 auto src2 =
reinterpret_cast<
const qchar8_t *>(rhs.data());
1115 auto end2 = src2 + rhs.size();
1117 while (src1 < end1 && src2 < end2) {
// One decoded, case-folded code point from each side per iteration.
1118 char32_t uc1 = QUtf8Functions::nextUcs4FromUtf8(src1, end1);
1119 char32_t uc2 = QUtf8Functions::nextUcs4FromUtf8(src2, end2);
1121 uc1 = QChar::toCaseFolded(uc1);
1122 uc2 = QChar::toCaseFolded(uc2);
1124 return int(uc1) -
int(uc2);
// +1 if only lhs has data left, -1 if only rhs has, 0 if both are exhausted.
1128 return (end1 > src1) - (end2 > src2);
1131#ifndef QT_BOOTSTRAPPED
// UTF-16 -> UTF-16 byte stream conversion returning a new QByteArray.
1132QByteArray QUtf16::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
// A BOM is due only if none was written yet and the WriteBom flag is set.
1134 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
// Two output bytes per UTF-16 code unit.
1135 qsizetype length = 2 * in.size();
1139 QByteArray d(length, Qt::Uninitialized);
1140 char *end = convertFromUnicode(d.data(), in, state, endian);
// The buffer is expected to be filled exactly.
1141 Q_ASSERT(end - d.constData() == d.size());
// UTF-16 -> UTF-16 byte stream conversion into a caller-provided buffer, in the
// requested byte order. Returns out + 2 * in.size() as computed on the last line.
1146char *QUtf16::convertFromUnicode(
char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
// A BOM is due only if none was written yet and the WriteBom flag is set.
1149 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
// With no explicit byte order, the host byte order is used.
1151 if (endian == DetectEndianness)
1152 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
// Byte order mark, written in the selected byte order.
1156 QChar bom(QChar::ByteOrderMark);
1157 if (endian == BigEndianness)
1158 qToBigEndian(bom.unicode(), out);
1160 qToLittleEndian(bom.unicode(), out);
// Payload: every code unit copied in the selected byte order.
1163 if (endian == BigEndianness)
1164 qToBigEndian<
char16_t>(in.data(), in.size(), out);
1166 qToLittleEndian<
char16_t>(in.data(), in.size(), out);
1168 state->remainingChars = 0;
1169 state->internalState |= HeaderDone;
1170 return out + 2*in.size();
// UTF-16 byte stream -> QString conversion.
1173QString QUtf16::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
// Half the byte count, rounded up, is allocated; the string is then cut down to the
// number of code units actually written.
1175 QString result((in.size() + 1) >> 1, Qt::Uninitialized);
1176 QChar *qch = convertToUnicode(result.data(), in, state, endian);
1177 result.truncate(qch - result.constData());
// UTF-16 byte stream -> UTF-16 code units in a caller-provided buffer. Uses `state`
// to remember the detected byte order, whether the header (BOM) was handled, and a
// single leftover byte between calls.
1181QChar *QUtf16::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1183 qsizetype len = in.size();
1184 const char *chars = in.data();
// With no explicit byte order, continue with the one remembered in the state.
1188 if (endian == DetectEndianness)
1189 endian = (DataEndianness)state->state_data[Endian];
1191 const char *end = chars + len;
// Less than one full code unit available in total: stash the single byte.
1194 if (state->remainingChars + len < 2) {
1196 Q_ASSERT(state->remainingChars == 0 && len == 1);
1197 state->remainingChars = 1;
1198 state->state_data[Data] = *chars;
1203 bool headerdone = state && state->internalState & HeaderDone;
1204 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
// The first code unit is assembled by hand when the header is still pending or when
// a byte was left over from the previous call.
1207 if (!headerdone || state->remainingChars) {
1209 if (state->remainingChars)
1210 buf = state->state_data[Data];
1215 state->internalState |= HeaderDone;
// QChar(cell, row): buf is the low byte, the next input byte the high byte, i.e.
// the pair is first read as little-endian.
1216 QChar ch(buf, *chars++);
1217 if (endian == DetectEndianness) {
// Read as little-endian, a byte-swapped BOM means big-endian data and a regular BOM
// little-endian data; without a BOM the host byte order is used.
1219 if (ch == QChar::ByteOrderSwapped) {
1220 endian = BigEndianness;
1221 }
else if (ch == QChar::ByteOrderMark) {
1222 endian = LittleEndianness;
1224 if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1225 endian = BigEndianness;
1227 endian = LittleEndianness;
// Swap the two bytes if the data is big-endian.
1231 if (endian == BigEndianness)
1232 ch = QChar::fromUcs2((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
// A BOM encountered while the header was still pending fails this test.
1233 if (headerdone || ch != QChar::ByteOrderMark)
1235 }
else if (endian == DetectEndianness) {
1236 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
// Bulk conversion of all complete two-byte units that remain.
1239 qsizetype nPairs = (end - chars) >> 1;
1240 if (endian == BigEndianness)
1241 qFromBigEndian<
char16_t>(chars, nPairs, out);
1243 qFromLittleEndian<
char16_t>(chars, nPairs, out);
1246 state->state_data[Endian] = endian;
1247 state->remainingChars = 0;
// Odd number of bytes: in stateless mode the dangling byte becomes a replacement
// character (or NUL); otherwise it is kept for the next call.
1248 if ((end - chars) & 1) {
1249 if (state->flags & QStringConverter::Flag::Stateless) {
1250 *out++ = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? QChar::Null : QChar::ReplacementCharacter;
1252 state->remainingChars = 1;
1253 state->state_data[Data] = *(end - 1);
1256 state->state_data[Data] = 0;
// UTF-16 -> UTF-32 byte stream conversion returning a new QByteArray.
1262QByteArray QUtf32::convertFromUnicode(QStringView in, QStringConverter::State *state, DataEndianness endian)
// A BOM is due only if none was written yet and the WriteBom flag is set.
1264 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
// Four output bytes per UTF-16 code unit are reserved.
1265 qsizetype length = 4*in.size();
1268 QByteArray ba(length, Qt::Uninitialized);
1269 char *end = convertFromUnicode(ba.data(), in, state, endian);
// Shrink to the number of bytes actually produced.
1270 ba.truncate(end - ba.constData());
// UTF-16 -> UTF-32 byte stream conversion into a caller-provided buffer, in the
// requested byte order. Uses `state` to carry a pending high surrogate between
// calls and to track whether the BOM has been written.
1274char *QUtf32::convertFromUnicode(
char *out, QStringView in, QStringConverter::State *state, DataEndianness endian)
// A BOM is due only if none was written yet and the WriteBom flag is set.
1278 bool writeBom = !(state->internalState & HeaderDone) && state->flags & QStringConverter::Flag::WriteBom;
// With no explicit byte order, the host byte order is used.
1279 if (endian == DetectEndianness)
1280 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
// BOM bytes: 0xfe 0xff in the last two positions for big-endian, 0xff 0xfe in the
// first two positions for little-endian.
1284 if (endian == BigEndianness) {
1287 out[2] = (
char)0xfe;
1288 out[3] = (
char)0xff;
1290 out[0] = (
char)0xff;
1291 out[1] = (
char)0xfe;
1296 state->internalState |= HeaderDone;
1299 const QChar *uc = in.data();
1300 const QChar *end = in.data() + in.size();
// A code unit saved by the previous call is restored and handled by the surrogate
// decoding code below.
1303 if (state->remainingChars == 1) {
1304 auto character = state->state_data[Data];
1305 Q_ASSERT(character <= 0xFFFF);
1306 ch = QChar(character);
1308 state->remainingChars = 0;
1309 goto decode_surrogate;
// Non-surrogate code units map to themselves.
1314 if (Q_LIKELY(!ch.isSurrogate())) {
1315 ucs4 = ch.unicode();
1316 }
// High surrogate: in stateless mode it may turn into a replacement character (or
// NUL); otherwise it can be saved in the state.
else if (Q_LIKELY(ch.isHighSurrogate())) {
1319 if (state->flags & QStringConverter::Flag::Stateless) {
1320 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1322 state->remainingChars = 1;
1323 state->state_data[Data] = ch.unicode();
1326 }
// A following low surrogate completes the pair.
else if (uc->isLowSurrogate()) {
1327 ucs4 = QChar::surrogateToUcs4(ch, *uc++);
// Remaining cases yield the replacement character (or NUL).
1329 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
1332 ucs4 = state->flags & QStringConverter::Flag::ConvertInvalidToNull ? 0 : QChar::ReplacementCharacter;
// Emit the code point in the selected byte order.
1334 if (endian == BigEndianness)
1335 qToBigEndian(ucs4, out);
1337 qToLittleEndian(ucs4, out);
// Decodes UTF-32 bytes `in` into a QString, using the QChar* overload for the
// actual conversion and truncating the result to the units written.
1344QString QUtf32::convertToUnicode(QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
// Reserve (size + 7) / 2 UTF-16 units for the output.
// NOTE(review): presumably the +7 covers bytes carried over in `state` - confirm.
1347 result.resize((in.size() + 7) >> 1);
1348 QChar *end = convertToUnicode(result.data(), in, state, endian);
1349 result.truncate(end - result.constData());
// Decodes UTF-32 bytes `in` into UTF-16 code units written through `out`.
// `state` keeps the detected endianness, the HeaderDone bit and up to three
// bytes of an incomplete 4-byte tuple between calls.
// NOTE(review): several statements of this function are not visible here.
1353QChar *QUtf32::convertToUnicode(QChar *out, QByteArrayView in, QStringConverter::State *state, DataEndianness endian)
1355 qsizetype len = in.size();
1356 const char *chars = in.data();
// When asked to detect, first reuse whatever endianness the state remembers.
1359 if (endian == DetectEndianness)
1360 endian = (DataEndianness)state->state_data[Endian];
1362 const char *end = chars + len;
// Restore the partial tuple saved by a previous call.
1365 memcpy(tuple, &state->state_data[Data], 4);
// Not enough bytes for one code point even with the carried-over ones:
// append the input to the tuple and save it back into the state.
1368 if (state->remainingChars + len < 4) {
1370 while (chars < end) {
1371 tuple[state->remainingChars] = *chars;
1372 ++state->remainingChars;
1375 Q_ASSERT(state->remainingChars < 4);
1376 memcpy(&state->state_data[Data], tuple, 4);
1381 bool headerdone = state->internalState & HeaderDone;
1382 if (state->flags & QStringConverter::Flag::ConvertInitialBom)
1385 qsizetype num = state->remainingChars;
1386 state->remainingChars = 0;
// The first tuple gets special handling: BOM sniffing and BOM skipping.
1388 if (!headerdone || endian == DetectEndianness || num) {
1390 tuple[num++] = *chars++;
1391 if (endian == DetectEndianness) {
// FF FE 00 00 selects little endian, 00 00 FE FF big endian;
// anything else falls back to the host byte order.
1393 if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
1394 endian = LittleEndianness;
1395 }
else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
1396 endian = BigEndianness;
1397 }
else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
1398 endian = BigEndianness;
1400 endian = LittleEndianness;
1403 char32_t code = (endian == BigEndianness) ? qFromBigEndian<
char32_t>(tuple) : qFromLittleEndian<
char32_t>(tuple);
// A leading BOM is dropped unless the header counts as already done.
1404 if (headerdone || code != QChar::ByteOrderMark) {
// Code points that need a surrogate pair are written as two units.
1405 if (QChar::requiresSurrogates(code)) {
1406 *out++ = QChar(QChar::highSurrogate(code));
1407 *out++ = QChar(QChar::lowSurrogate(code));
1409 *out++ = QChar(code);
1413 }
else if (endian == DetectEndianness) {
1414 endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
1416 state->state_data[Endian] = endian;
1417 state->internalState |= HeaderDone;
// Main loop: gather bytes into the tuple and decode each code point.
1419 while (chars < end) {
1420 tuple[num++] = *chars++;
1422 char32_t code = (endian == BigEndianness) ? qFromBigEndian<
char32_t>(tuple) : qFromLittleEndian<
char32_t>(tuple);
1423 for (
char16_t c : QChar::fromUcs4(code))
// Trailing bytes: in stateless mode they become U+FFFD, otherwise they are
// saved in the state together with the endianness.
// NOTE(review): the guarding condition is elided; presumably "num != 0".
1430 if (state->flags & QStringDecoder::Flag::Stateless) {
1431 *out++ = QChar::ReplacementCharacter;
1433 state->state_data[Endian] = endian;
1434 state->remainingChars = num;
1435 memcpy(&state->state_data[Data], tuple, 4);
1443#if defined(Q_OS_WIN) && !defined(QT_BOOTSTRAPPED)
// Returns 1 when the active ANSI code page reported by GetACP() is CP_UTF8,
// and -1 otherwise.
1444int QLocal8Bit::checkUtf8()
1446 return GetACP() == CP_UTF8 ? 1 : -1;
// Convenience overload: decodes `in` using the ANSI code page (CP_ACP).
1449QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state)
1451 return convertToUnicode_sys(in, CP_ACP, state);
// Decodes the multi-byte text `in`, encoded in the Windows code page
// `codePage`, into a QString using MultiByteToWideChar().
// Bytes of an incomplete trailing sequence can be kept in state->state_data
// (state->remainingChars of them) and are fed back in on the next call.
// Undecodable input is replaced by U+FFFD, or by NUL when
// Flag::ConvertInvalidToNull is set.
// NOTE(review): many statements of this function are not visible here; the
// comments below describe the visible lines only.
1454QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
1455 QStringConverter::State *state)
1457 const char *mb = in.data();
1458 qsizetype mblen = in.size();
1461 qsizetype &invalidChars = state->invalidChars;
1462 using Flag = QStringConverter::Flag;
1463 const bool useNullForReplacement = !!(state->flags & Flag::ConvertInvalidToNull);
1464 const char16_t replacementCharacter = useNullForReplacement ? QChar::Null
1465 : QChar::ReplacementCharacter;
1466 if (state->flags & Flag::Stateless) {
1467 Q_ASSERT(state->remainingChars == 0);
// Output starts in a fixed 4096-element local buffer.
1479 std::array<
wchar_t, 4096> buf;
1480 wchar_t *out = buf.data();
1481 qsizetype outlen = buf.size();
// growOut(size): returns the (possibly relocated) write cursor and remaining
// capacity, or {nullptr, 0} when offset + size overflows. The content of the
// local buffer is copied into `sp` when storage moves there.
// NOTE(review): the early-return condition and the resize of `sp` are elided.
1486 const auto growOut = [&](qsizetype size) -> std::tuple<
wchar_t *, qsizetype> {
1488 return {out, outlen};
1489 const bool wasStackBuffer = sp.isEmpty();
1490 const auto begin = wasStackBuffer ? buf.data() :
reinterpret_cast<
wchar_t *>(sp.data());
1491 const qsizetype offset = qsizetype(std::distance(begin, out));
1492 qsizetype newSize = 0;
1493 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1495 return {
nullptr, 0};
1498 auto it =
reinterpret_cast<
wchar_t *>(sp.data());
1500 it = std::copy_n(buf.data(), offset, it);
// Resume a sequence left incomplete by the previous call: the saved bytes plus
// as many new bytes as fit into a 6-byte scratch array are converted with a
// temporary local state.
1507 while (state && state->remainingChars && mblen) {
1508 QStringConverter::State localState;
1509 localState.flags = state->flags;
1514 std::array<
char, 6> prev = {0};
1515 Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data));
1516 qsizetype index = 0;
1517 for (; index < state->remainingChars; ++index)
1518 prev[index] = state->state_data[index];
1519 const qsizetype toCopy = std::min(q20::ssize(prev) - index, mblen);
1520 for (qsizetype i = 0; i < toCopy; ++i, ++index)
1521 prev[index] = mb[i];
1534 const QString tmp = convertToUnicode_sys(QByteArrayView(prev.data(), index), codePage,
1536 std::tie(out, outlen) = growOut(tmp.size());
1539 out = std::copy_n(
reinterpret_cast<
const wchar_t *>(tmp.constData()), tmp.size(), out);
1540 outlen -= tmp.size();
1541 const qsizetype tail = toCopy - localState.remainingChars;
// Bytes the nested call left unconsumed are handed back to the main input.
// NOTE(review): the branch structure around these lines is elided.
1546 mb -= localState.remainingChars;
1547 mblen += localState.remainingChars;
1548 localState.remainingChars = 0;
// Propagate the nested call's leftover bytes and invalid count to `state`.
1550 state->remainingChars = localState.remainingChars;
1551 state->invalidChars += localState.invalidChars;
1552 std::copy_n(localState.state_data, state->remainingChars, state->state_data);
1555 Q_ASSERT(!state || state->remainingChars == 0 || mblen == 0);
// The API takes int lengths, so the input is processed in int-sized windows.
1559 int nextIn = q26::saturate_cast<
int>(mblen);
1561 std::tie(out, outlen) = growOut(1);
1564 const int nextOut = q26::saturate_cast<
int>(outlen);
// MB_ERR_INVALID_CHARS is passed so invalid input is reported, not converted.
1565 int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, nextIn, out, nextOut);
1572 int r = GetLastError();
1573 if (r == ERROR_INSUFFICIENT_BUFFER) {
// Ask for the required length (zero-sized output) and grow accordingly.
1574 const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
1575 std::tie(out, outlen) = growOut(wclen);
1578 }
else if (r == ERROR_NO_UNICODE_TRANSLATION) {
// If the remaining input fits into state_data, keep it for the next call.
1584 if (state && mblen <= q20::ssize(state->state_data)) {
1585 state->remainingChars = mblen;
1586 std::copy_n(mb, mblen, state->state_data);
// Step the window end back to the previous character start and retry.
1599 const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
1601 nextIn =
int(it - mb);
// Emit one replacement character for input that cannot be decoded.
1609 std::tie(out, outlen) = growOut(1);
1612 *out = replacementCharacter;
1620 qWarning(
"MultiByteToWideChar: Cannot convert multibyte text");
1624 nextIn = q26::saturate_cast<
int>(mblen);
// Build the result: copy from the local buffer, or truncate `sp` to the
// number of units actually written.
1629 if (out != buf.data())
1630 sp = QStringView(buf.data(), out).toString();
1632 const auto begin =
reinterpret_cast<
wchar_t *>(sp.data());
1633 sp.truncate(std::distance(begin, out));
1636 if (sp.size() && sp.back().isNull())
// No state to carry leftovers in: append one replacement character per
// remaining byte and count them as invalid.
1639 if (!state && mblen > 0) {
1642 sp.resize(sp.size() + mblen, replacementCharacter);
1643 invalidChars += mblen;
// Convenience overload: encodes `in` using the ANSI code page (CP_ACP).
1648QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state)
1650 return convertFromUnicode_sys(in, CP_ACP, state);
// Encodes the UTF-16 text `in` into the Windows code page `codePage` using
// WideCharToMultiByte(). A high surrogate at the end of the input can be kept
// in state->state_data[0] and paired with the first unit of the next call.
// NOTE(review): many statements of this function are not visible here; the
// comments below describe the visible lines only.
1653QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
1654 QStringConverter::State *state)
1656 const wchar_t *ch =
reinterpret_cast<
const wchar_t *>(in.data());
1657 qsizetype uclen = in.size();
1667 using Flag = QStringConverter::Flag;
1668 if (state->flags & Flag::Stateless) {
1669 Q_ASSERT(state->remainingChars == 0);
// Two distinct empty results: a default-constructed array and one built from "".
// NOTE(review): the conditions are elided; presumably null vs. empty input.
1674 return QByteArray();
1676 return QByteArray(
"");
// Output starts in a fixed 4096-byte local buffer.
1683 std::array<
char, 4096> buf;
1684 char *out = buf.data();
1685 qsizetype outlen = buf.size();
// Finish a surrogate pair begun in the previous call: the saved unit plus the
// first new unit are converted together if the new unit is a low surrogate,
// otherwise the saved unit is converted alone.
1688 if (state && state->remainingChars > 0) {
1689 Q_ASSERT(state->remainingChars == 1);
1691 wchar_t wc[2] = {
wchar_t(state->state_data[0]), ch[0] };
1695 const bool validCodePoint = QChar::isLowSurrogate(wc[1]);
1696 int len = WideCharToMultiByte(codePage, 0, wc, validCodePoint ? 2 : 1, out, outlen,
nullptr,
1702 if (validCodePoint) {
1706 state->remainingChars = 0;
1707 state->state_data[0] = 0;
1709 return QByteArrayView(buf.data(), len).toByteArray();
// A trailing high surrogate is stored in the state for the next call.
1712 if (state && QChar::isHighSurrogate(ch[uclen - 1])) {
1715 state->remainingChars = 1;
1716 state->state_data[0] = ch[uclen - 1];
1719 return QByteArray();
1722 Q_ASSERT(uclen > 0);
// growOut(size): returns the (possibly relocated) write cursor and remaining
// capacity, or {nullptr, 0} when offset + size overflows. The content of the
// local buffer is copied into `mb` when storage moves there.
// NOTE(review): the early-return condition and the resize of `mb` are elided.
1725 const auto growOut = [&](qsizetype size) -> std::tuple<
char *, qsizetype> {
1727 return {out, outlen};
1728 const bool wasStackBuffer = mb.isEmpty();
1729 const auto begin = wasStackBuffer ? buf.data() : mb.data();
1730 const qsizetype offset = qsizetype(std::distance(begin, out));
1731 qsizetype newSize = 0;
1732 if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
1734 return {
nullptr, 0};
1737 auto it = mb.data();
1739 it = std::copy_n(buf.data(), offset, it);
// Size of the next input window, saturated to int.
// NOTE(review): the body of the trailing-high-surrogate branch is elided;
// presumably it shortens the window so a pair is not split - confirm.
1745 const auto getNextWindowSize = [&]() {
1746 int nextIn = q26::saturate_cast<
int>(uclen);
1749 if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
1756 const int nextIn = getNextWindowSize();
1757 std::tie(out, outlen) = growOut(1);
1760 const int nextOut = q26::saturate_cast<
int>(outlen);
1761 len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut,
nullptr,
nullptr);
1768 int r = GetLastError();
1769 if (r == ERROR_INSUFFICIENT_BUFFER) {
// Ask for the required length (zero-sized output) and grow accordingly.
1770 int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn,
nullptr, 0,
1772 if (neededLength <= 0) {
1780 "WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
1784 std::tie(out, outlen) = growOut(neededLength);
// Any other failure is reported with the first 100 units of remaining input.
1793 "WideCharToMultiByte: Cannot convert multibyte text (error %d): %ls\n",
1794 r, qt_castToWchar(QStringView(ch, uclen).left(100).toString()));
// Build the result: copy from the local buffer, or truncate `mb` to the
// number of bytes actually written.
1802 if (out != buf.data())
1803 mb = QByteArrayView(buf.data(), out).toByteArray();
1805 mb.truncate(std::distance(mb.data(), out));
// Clears the converter state.
// NOTE(review): only the zeroing of state_data[0..3] is visible here; other
// statements of this function are elided.
1811void QStringConverter::State::clear()
noexcept
1816 state_data[0] = state_data[1] = state_data[2] = state_data[3] = 0;
// Resets the state for reuse. For ICU-backed converters (Flag::UsesIcu) the
// UConverter stored in d[0] is reset via ucnv_reset().
// NOTE(review): the non-ICU branch is not visible here.
1822void QStringConverter::State::reset()
noexcept
1824 if (flags & Flag::UsesIcu) {
1825#if defined(QT_USE_ICU_CODECS)
1827 UConverter *converter =
static_cast<UConverter *>(d[0]);
1829 ucnv_reset(converter);
1838#ifndef QT_BOOTSTRAPPED
// Interface thunk: decodes UTF-16 with byte-order detection.
1839static QChar *
fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
1841 return QUtf16::convertToUnicode(out, in, state, DetectEndianness);
// Interface thunk: encodes to UTF-16, passing DetectEndianness to the encoder.
1844static char *
toUtf16(
char *out, QStringView in, QStringConverter::State *state)
1846 return QUtf16::convertFromUnicode(out, in, state, DetectEndianness);
1851 return QUtf16::convertToUnicode(out, in, state, BigEndianness);
// Interface thunk: encodes to big-endian UTF-16.
1854static char *
toUtf16BE(
char *out, QStringView in, QStringConverter::State *state)
1856 return QUtf16::convertFromUnicode(out, in, state, BigEndianness);
1861 return QUtf16::convertToUnicode(out, in, state, LittleEndianness);
// Interface thunk: encodes to little-endian UTF-16.
1864static char *
toUtf16LE(
char *out, QStringView in, QStringConverter::State *state)
1866 return QUtf16::convertFromUnicode(out, in, state, LittleEndianness);
// Interface thunk: decodes UTF-32 with byte-order detection.
1869static QChar *
fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
1871 return QUtf32::convertToUnicode(out, in, state, DetectEndianness);
// Interface thunk: encodes to UTF-32; DetectEndianness resolves to the host
// byte order inside QUtf32::convertFromUnicode().
1874static char *
toUtf32(
char *out, QStringView in, QStringConverter::State *state)
1876 return QUtf32::convertFromUnicode(out, in, state, DetectEndianness);
1881 return QUtf32::convertToUnicode(out, in, state, BigEndianness);
// Interface thunk: encodes to big-endian UTF-32.
1884static char *
toUtf32BE(
char *out, QStringView in, QStringConverter::State *state)
1886 return QUtf32::convertFromUnicode(out, in, state, BigEndianness);
1891 return QUtf32::convertToUnicode(out, in, state, LittleEndianness);
// Interface thunk: encodes to little-endian UTF-32.
1894static char *
toUtf32LE(
char *out, QStringView in, QStringConverter::State *state)
1896 return QUtf32::convertFromUnicode(out, in, state, LittleEndianness);
// Encodes the UTF-16 text `in` as Latin-1 into `out`, one byte per code unit.
// Units above 0xFF cannot be represented; they are counted in `invalid`, which
// is added to state->invalidChars.
// NOTE(review): some statements (including the store of `replacement` and the
// body of the Stateless check) are not visible here.
1900char *QLatin1::convertFromUnicode(
char *out, QStringView in, QStringConverter::State *state)
noexcept
1903 if (state->flags & QStringConverter::Flag::Stateless)
// Replacement byte: NUL when ConvertInvalidToNull is set, otherwise '?'.
1906 const char replacement = (state && state->flags & QStringConverter::Flag::ConvertInvalidToNull) ? 0 :
'?';
1907 qsizetype invalid = 0;
1908 for (qsizetype i = 0; i < in.size(); ++i) {
1909 if (in[i] > QChar(0xff)) {
// Representable unit: its low byte is the Latin-1 value.
1913 *out = (
char)in[i].cell();
1918 state->invalidChars += invalid;
1924 QString s = QLocal8Bit::convertToUnicode(in, state);
1925 memcpy(out, s.constData(), s.size()*
sizeof(QChar));
1926 return out + s.size();
// Interface thunk: encodes `in` to the local 8-bit encoding through a
// temporary QByteArray, copies the bytes to `out`, and returns the position
// just past the last byte written.
1929static char *
toLocal8Bit(
char *out, QStringView in, QStringConverter::State *state)
1931 QByteArray s = QLocal8Bit::convertFromUnicode(in, state);
1932 memcpy(out, s.constData(), s.size());
1933 return out + s.size();
1940#ifndef QT_BOOTSTRAPPED
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2073
2074
2075
// Table of built-in encodings. Each entry lists the encoding name followed by
// four function pointers; going by the function names these are: decode to
// UTF-16, decoded-length estimate, encode from UTF-16, encoded-length estimate.
// encodingForName() converts a matching index in this table directly to an
// Encoding value.
2077const QStringConverter::Interface QStringConverter::encodingInterfaces[QStringConverter::LastEncoding + 1] =
2079 {
"UTF-8", QUtf8::convertToUnicode, fromUtf8Len, QUtf8::convertFromUnicode, toUtf8Len },
2080#ifndef QT_BOOTSTRAPPED
2081 {
"UTF-16", fromUtf16, fromUtf16Len, toUtf16, toUtf16Len },
2082 {
"UTF-16LE", fromUtf16LE, fromUtf16Len, toUtf16LE, toUtf16Len },
2083 {
"UTF-16BE", fromUtf16BE, fromUtf16Len, toUtf16BE, toUtf16Len },
2084 {
"UTF-32", fromUtf32, fromUtf32Len, toUtf32, toUtf32Len },
2085 {
"UTF-32LE", fromUtf32LE, fromUtf32Len, toUtf32LE, toUtf32Len },
2086 {
"UTF-32BE", fromUtf32BE, fromUtf32Len, toUtf32BE, toUtf32Len },
2088 {
"ISO-8859-1", QLatin1::convertToUnicode, fromLatin1Len, QLatin1::convertFromUnicode, toLatin1Len },
// The locale entry reuses the UTF-8 length estimators.
2089 {
"Locale", fromLocal8Bit, fromUtf8Len, toLocal8Bit, toUtf8Len }
// Compares a NUL-terminated name `a` with the range [b, b_end) of `Char`
// units, ASCII-case-insensitively, skipping '-' and '_' on both sides.
// NOTE(review): the signature and the return statements are not visible here;
// the name is taken from the call sites below this template.
2093template <
typename Char>
// Skip separators in `a`.
2097 while (*a ==
'-' || *a ==
'_')
// Skip separators in `b`.
2099 while (b != b_end && (*b == Char{
'-'} || *b == Char{
'_'}))
// Both inputs exhausted at the same time.
2101 if (!*a && b == b_end)
// Non-ASCII units in `b` are checked before the narrowing cast below.
2103 if (
char16_t(*b) > 127)
2105 }
while (QtMiscUtils::toAsciiLower(*a++) == QtMiscUtils::toAsciiLower(
char(*b++)));
2112 return nameMatch_impl_impl(a, b.begin(), b.end());
2117 return nameMatch_impl(a, QLatin1StringView{QByteArrayView{b}});
2122 return nameMatch_impl_impl(a, b.utf16(), b.utf16() + b.size());
2127 return b.visit([a](
auto b) {
return nameMatch_impl(a, b); });
2132
2133
2134
2137
2138
2139
2142#if defined(QT_USE_ICU_CODECS)
// ICU-backed converter support. The UConverter lives in state->d[0] and the
// persistent codec name in state->d[1].
// NOTE(review): many statements of this struct are not visible here; the
// comments describe the visible lines only.
2144struct QStringConverterICU : QStringConverter
// State clear hook: closes the ICU converter and nulls the slot.
2146 static void clear_function(QStringConverter::State *state)
noexcept
2149 ucnv_close(
static_cast<UConverter *>(state->d[0]));
2150 state->d[0] =
nullptr;
// Creates the converter from the stored name (d[1]) if d[0] is null.
2153 static void ensureConverter(QStringConverter::State *state)
2157 if (state->d[0] ==
nullptr)
2158 state->d[0] = createConverterForName(
static_cast<
const char *>(state->d[1]), state);
// Decodes `in` to UTF-16 at `out` with the ICU converter; returns the position
// just past the last unit written.
2161 static QChar *toUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
2164 ensureConverter(state);
2166 auto icu_conv =
static_cast<UConverter *>(state->d[0]);
2167 UErrorCode err = U_ZERO_ERROR;
2168 auto source = in.data();
2169 auto sourceLimit = in.data() + in.size();
2171 qsizetype length = toLen(in.size());
2173 UChar *target =
reinterpret_cast<UChar *>(out);
2174 auto targetLimit = target + length;
2177 UBool flush =
false;
// The callback context must point at this state object; re-install the
// current callback with `state` as context if it points elsewhere.
2180 UConverterToUCallback action;
2181 const void *context;
2182 ucnv_getToUCallBack(icu_conv, &action, &context);
2183 if (context != state)
2184 ucnv_setToUCallBack(icu_conv, action, state,
nullptr,
nullptr, &err);
2186 ucnv_toUnicode(icu_conv, &target, targetLimit, &source, sourceLimit,
nullptr, flush, &err);
2188 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
// Stateless mode: pending input is dropped by resetting the converter and is
// counted as invalid.
2189 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2190 if (
auto leftOver = ucnv_toUCountPending(icu_conv, &err)) {
2191 ucnv_reset(icu_conv);
2192 state->invalidChars += leftOver;
2195 return reinterpret_cast<QChar *>(target);
// Encodes the UTF-16 text `in` to bytes at `out` with the ICU converter.
2198 static char *fromUtf16(
char *out, QStringView in, QStringConverter::State *state)
2201 ensureConverter(state);
2202 auto icu_conv =
static_cast<UConverter *>(state->d[0]);
2203 UErrorCode err = U_ZERO_ERROR;
2204 auto source =
reinterpret_cast<
const UChar *>(in.data());
2205 auto sourceLimit =
reinterpret_cast<
const UChar *>(in.data() + in.size());
2207 qsizetype length = UCNV_GET_MAX_BYTES_FOR_STRING(in.size(), ucnv_getMaxCharSize(icu_conv));
2210 char *targetLimit = out + length;
2211 UBool flush =
false;
// Same context fix-up as in toUtf16(), for the from-Unicode callback.
2214 UConverterFromUCallback action;
2215 const void *context;
2216 ucnv_getFromUCallBack(icu_conv, &action, &context);
2217 if (context != state)
2218 ucnv_setFromUCallBack(icu_conv, action, state,
nullptr,
nullptr, &err);
2220 ucnv_fromUnicode(icu_conv, &target, targetLimit, &source, sourceLimit,
nullptr, flush, &err);
2222 Q_ASSERT(err != U_BUFFER_OVERFLOW_ERROR);
// Stateless mode: pending input is dropped and counted as invalid.
2223 if (state->flags.testFlag(QStringConverter::Flag::Stateless)) {
2224 if (
auto leftOver = ucnv_fromUCountPending(icu_conv, &err)) {
2225 ucnv_reset(icu_conv);
2226 state->invalidChars += leftOver;
2232 Q_DISABLE_COPY_MOVE(QStringConverterICU)
// Length estimator returning X * inLength * sizeof(UChar); it fills the
// last slot of each forLength entry.
2234 template<qsizetype X>
2235 static qsizetype fromLen(qsizetype inLength)
2237 return X * inLength *
sizeof(UChar);
// Length estimator returning 2 * inLength; used by toUtf16() to bound its output.
2240 static qsizetype toLen(qsizetype inLength)
2244
2245
2246
2247
2248 return 2 * inLength;
// One Interface per maximum character size 1..8; make_icu_converter() selects
// entry [maxCharSize - 1].
2251 static constexpr QStringConverter::Interface forLength[] = {
2252 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<1>},
2253 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<2>},
2254 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<3>},
2255 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<4>},
2256 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<5>},
2257 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<6>},
2258 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<7>},
2259 {
"icu, recompile if you see this", QStringConverterICU::toUtf16, QStringConverterICU::toLen, QStringConverterICU::fromUtf16, QStringConverterICU::fromLen<8>}
// Opens an ICU converter for `name` and installs error callbacks that count
// invalid input in state->invalidChars. With ConvertInvalidToNull the
// callbacks write a NUL; otherwise they defer to ICU's SUBSTITUTE callbacks.
2262 static UConverter *createConverterForName(
const char *name,
const State *state)
2267 UErrorCode status = U_ZERO_ERROR;
2268 UConverter *conv = ucnv_open(name, &status);
// U_AMBIGUOUS_ALIAS_WARNING is tolerated; any other non-zero status is not.
2269 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
2274 if (state->flags.testFlag(Flag::ConvertInvalidToNull)) {
2275 UErrorCode error = U_ZERO_ERROR;
// Decoding side: clear the error, emit a substitute unit, count the bytes.
2277 auto nullToSubstituter = [](
const void *context, UConverterToUnicodeArgs *toUArgs,
2278 const char *, int32_t length,
2279 UConverterCallbackReason reason, UErrorCode *err) {
2280 if (reason <= UCNV_IRREGULAR) {
2281 *err = U_ZERO_ERROR;
2283 ucnv_cbToUWriteUChars(toUArgs, &c, 1, 0, err);
2285 auto state =
const_cast<State *>(
static_cast<
const State *>(context));
2286 state->invalidChars += length;
2289 ucnv_setToUCallBack(conv, nullToSubstituter, state,
nullptr,
nullptr, &error);
// Encoding side: clear the error, emit a NUL unit, count the units.
2291 auto nullFromSubstituter = [](
const void *context, UConverterFromUnicodeArgs *fromUArgs,
2292 const UChar *, int32_t length,
2293 UChar32, UConverterCallbackReason reason, UErrorCode *err) {
2294 if (reason <= UCNV_IRREGULAR) {
2295 *err = U_ZERO_ERROR;
2296 const UChar replacement[] = { 0 };
2297 const UChar *stringBegin = std::begin(replacement);
2298 ucnv_cbFromUWriteUChars(fromUArgs, &stringBegin, std::end(replacement), 0, err);
2300 auto state =
const_cast<State *>(
static_cast<
const State *>(context));
2301 state->invalidChars += length;
2304 ucnv_setFromUCallBack(conv, nullFromSubstituter, state,
nullptr,
nullptr, &error);
2306 UErrorCode error = U_ZERO_ERROR;
// Default mode, decoding side: count the bytes, then let ICU substitute.
2308 auto qmarkToSubstituter = [](
const void *context, UConverterToUnicodeArgs *toUArgs,
2309 const char *codeUnits,int32_t length,
2310 UConverterCallbackReason reason, UErrorCode *err) {
2311 if (reason <= UCNV_IRREGULAR) {
2313 auto state =
const_cast<State *>(
static_cast<
const State *>(context));
2314 state->invalidChars += length;
2317 UCNV_TO_U_CALLBACK_SUBSTITUTE(
nullptr, toUArgs, codeUnits, length, reason, err);
2320 ucnv_setToUCallBack(conv, qmarkToSubstituter, state,
nullptr,
nullptr, &error);
// Default mode, encoding side: count the units, then let ICU substitute.
2322 auto qmarkFromSubstituter = [](
const void *context, UConverterFromUnicodeArgs *fromUArgs,
2323 const UChar *codeUnits, int32_t length,
2324 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode *err) {
2325 if (reason <= UCNV_IRREGULAR) {
2327 auto state =
const_cast<State *>(
static_cast<
const State *>(context));
2328 state->invalidChars += length;
2331 UCNV_FROM_U_CALLBACK_SUBSTITUTE(
nullptr, fromUArgs, codeUnits, length,
2332 codePoint, reason, err);
2334 ucnv_setFromUCallBack(conv, qmarkFromSubstituter, state,
nullptr,
nullptr, &error);
// nul_terminate_impl overloads: copy a name into a std::string so its data()
// can be handed to ICU as a C string. A null Latin-1 view gives an empty string.
2339 static std::string nul_terminate_impl(QLatin1StringView name)
2340 {
return name.isNull() ? std::string() : std::string{name.data(), size_t(name.size())}; }
// UTF-8 names are reinterpreted byte-for-byte as Latin-1.
2342 static std::string nul_terminate_impl(QUtf8StringView name)
2343 {
return nul_terminate_impl(QLatin1StringView{QByteArrayView{name}}); }
// UTF-16 names are converted to Latin-1 directly into the string's storage.
2345 static std::string nul_terminate_impl(QStringView name)
2348 const auto convert = [&](
char *p, size_t n) {
2349 const auto sz = QLatin1::convertFromUnicode(p, name) - p;
2350 Q_ASSERT(q20::cmp_less_equal(sz, n));
2353#ifdef __cpp_lib_string_resize_and_overwrite
2354 result.resize_and_overwrite(size_t(name.size()), convert);
2356 result.resize(size_t(name.size()));
2357 result.resize(convert(result.data(), result.size()));
// Dispatches on the concrete string-view type held by `name`.
2362 static std::string nul_terminate(QAnyStringView name)
2363 {
return name.visit([](
auto name) {
return nul_terminate_impl(name); }); }
// Overload taking any string view: converts the name, then forwards.
2365 static const QStringConverter::Interface *
2366 make_icu_converter(QStringConverter::State *state, QAnyStringView name)
2367 {
return make_icu_converter(state, nul_terminate(name).data()); }
// Creates an ICU converter for `name`, records the converter's standard name
// in state->d[1] (MIME first, IANA as fallback), marks the state as
// ICU-backed, installs the clear hook, and returns the Interface matching the
// converter's maximum character size.
2369 static const QStringConverter::Interface *make_icu_converter(
2370 QStringConverter::State *state,
2374 UErrorCode status = U_ZERO_ERROR;
2375 UConverter *conv = createConverterForName(name, state);
2379 const char *icuName = ucnv_getName(conv, &status);
2382 const char *persistentName = ucnv_getStandardName(icuName,
"MIME", &status);
2383 if (U_FAILURE(status) || !persistentName) {
2384 status = U_ZERO_ERROR;
2385 persistentName = ucnv_getStandardName(icuName,
"IANA", &status);
2387 state->d[1] =
const_cast<
char *>(persistentName);
2389 state->flags |= QStringConverter::Flag::UsesIcu;
2390 qsizetype maxCharSize = ucnv_getMaxCharSize(conv);
2391 state->clearFn = QStringConverterICU::clear_function;
// forLength has entries for sizes 1..8 only; anything else is rejected.
2392 if (maxCharSize > 8 || maxCharSize < 1) {
2393 qWarning(
"Encountered unexpected codec \"%s\" which requires >8x space", name);
2396 return &forLength[maxCharSize - 1];
2405
2406
// Constructs a converter for the encoding called `name` with flags `f`.
// A built-in encoding is looked up first. When ICU codecs are compiled in,
// make_icu_converter() is also available.
// NOTE(review): the branch conditions are elided; presumably ICU is the
// fallback when `name` is not a built-in encoding - confirm.
2407QStringConverter::QStringConverter(QAnyStringView name, Flags f)
2408 : iface(
nullptr), state(f)
2410 auto e = encodingForName(name);
2412 iface = encodingInterfaces +
int(*e);
2413#if defined(QT_USE_ICU_CODECS)
2415 iface = QStringConverterICU::make_icu_converter(&state, name);
// Returns the encoding name. For ICU-backed converters this is the name
// stored in state.d[1].
// NOTE(review): the non-ICU return path is not visible here.
2420const char *QStringConverter::name()
const noexcept
2424 if (state.flags & QStringConverter::Flag::UsesIcu) {
2425#if defined(QT_USE_ICU_CODECS)
2426 return static_cast<
const char*>(state.d[1]);
2436
2437
2438
2439
2440
2441
2442
2443
2446
2447
2448
2449
2450
2453
2454
2455
2456
2457
2458
2461
2462
2463
2464
2465
2466
2467
2468
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
// Maps an encoding name to a built-in Encoding value, or std::nullopt if no
// built-in encoding matches. Names are compared with nameMatch() against the
// encodingInterfaces table; "latin1" is accepted as an extra alias for Latin1.
// NOTE(review): the condition of the first early return is elided.
2481std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(QAnyStringView name)
noexcept
2484 return std::nullopt;
2485 for (qsizetype i = 0; i < LastEncoding + 1; ++i) {
2486 if (nameMatch(encodingInterfaces[i].name, name))
2487 return QStringConverter::Encoding(i);
2489 if (nameMatch(
"latin1", name))
2490 return QStringConverter::Latin1;
2491 return std::nullopt;
2494#ifndef QT_BOOTSTRAPPED
2501 if (state->flags & QStringConverter::Flag::UsesIcu) {
2502 UConverter *converter =
static_cast<UConverter *>(state->d[0]);
2505 UErrorCode err = U_ZERO_ERROR;
2506 auto leftOver = ucnv_fromUCountPending(converter, &err);
2509 return std::max(leftOver, 0);
2512 return q26::saturate_cast<
int>(state->remainingChars);
2517
2518
2519
2520
2521
2522
// Guesses the encoding of `data` from its first bytes. A byte order mark is
// checked first (UTF-32, then UTF-8, then UTF-16). If `expectedFirstCharacter`
// is non-zero, the first UTF-32 / UTF-16 unit is also compared against it in
// both byte orders. Returns std::nullopt when nothing matches.
2523std::optional<QStringConverter::Encoding>
2524QStringConverter::encodingForData(QByteArrayView data,
char16_t expectedFirstCharacter)
noexcept
2527 qsizetype arraySize = data.size();
// UTF-32 needs at least four bytes.
2528 if (arraySize > 3) {
2529 char32_t uc = qFromUnaligned<
char32_t>(data.data());
2530 if (uc == qToBigEndian(
char32_t(QChar::ByteOrderMark)))
2531 return QStringConverter::Utf32BE;
2532 if (uc == qToLittleEndian(
char32_t(QChar::ByteOrderMark)))
2533 return QStringConverter::Utf32LE;
2534 if (expectedFirstCharacter) {
2536 if (qToLittleEndian(uc) == expectedFirstCharacter)
2537 return QStringConverter::Utf32LE;
2538 else if (qToBigEndian(uc) == expectedFirstCharacter)
2539 return QStringConverter::Utf32BE;
// UTF-8 BOM: compared against the `utf8bom` array.
2543 if (arraySize > 2) {
2544 if (memcmp(data.data(), utf8bom,
sizeof(utf8bom)) == 0)
2545 return QStringConverter::Utf8;
// UTF-16 needs at least two bytes.
2548 if (arraySize > 1) {
2549 char16_t uc = qFromUnaligned<
char16_t>(data.data());
2550 if (uc == qToBigEndian(
char16_t(QChar::ByteOrderMark)))
2551 return QStringConverter::Utf16BE;
2552 if (uc == qToLittleEndian(
char16_t(QChar::ByteOrderMark)))
2553 return QStringConverter::Utf16LE;
2554 if (expectedFirstCharacter) {
2556 if (qToLittleEndian(uc) == expectedFirstCharacter)
2557 return QStringConverter::Utf16LE;
2558 else if (qToBigEndian(uc) == expectedFirstCharacter)
2559 return QStringConverter::Utf16BE;
2562 return std::nullopt;
2567 static constexpr auto metaSearcher = qMakeStaticByteArrayMatcher(
"meta ");
2568 static constexpr auto charsetSearcher = qMakeStaticByteArrayMatcher(
"charset=");
2570 QByteArray header = data.first(qMin(data.size(), qsizetype(1024))).toByteArray().toLower();
2571 qsizetype pos = metaSearcher.indexIn(header);
2573 pos = charsetSearcher.indexIn(header, pos);
2575 pos += qstrlen(
"charset=");
2576 if (pos < header.size() && (header.at(pos) ==
'\"' || header.at(pos) ==
'\''))
2579 qsizetype pos2 = pos;
2582 while (++pos2 < header.size()) {
2583 char ch = header.at(pos2);
2584 if (ch ==
'\"' || ch ==
'\'' || ch ==
'>' || ch ==
'/') {
2585 QByteArray name = header.mid(pos, pos2 - pos);
2586 qsizetype colon = name.indexOf(
':');
2588 name = name.left(colon);
2589 name = name.simplified();
2590 if (name ==
"unicode")
2591 name = QByteArrayLiteral(
"UTF-8");
2592 if (!name.isEmpty())
2598 return QByteArray();
2602
2603
2604
2605
2606
2607
2608
// Determines the encoding of HTML `data`. encodingForData() is run first; the
// charset name found by parseHtmlMetaForEncoding(), if any, is resolved with
// encodingForName().
// NOTE(review): the handling of a successful encodingForData() result and the
// final fallback return are not visible here.
2609std::optional<QStringConverter::Encoding> QStringConverter::encodingForHtml(QByteArrayView data)
2612 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
2617 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
2618 if (!encodingTag.isEmpty())
2619 return encodingForName(encodingTag);
2626#if !defined(QT_USE_ICU_CODECS)
2627 return QStringConverter::Encoding::LastEncoding;
2631
2632
2633 return 1 + ucnv_countAvailable();
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
// Returns the names of all available codecs, one entry per index in
// [0, availableCodecCount()).
2650QStringList QStringConverter::availableCodecs()
// Maps a codec index to its display name.
2652 auto availableCodec = [](qsizetype index) -> QString
// Without ICU the index refers directly to the built-in table.
2654 #if !defined(QT_USE_ICU_CODECS)
2655 return QString::fromLatin1(encodingInterfaces[index].name);
// With ICU, one slot is the System encoding and ICU names are fetched with
// index - 1.
// NOTE(review): the condition selecting the System slot is elided;
// presumably index == 0 - confirm.
2658 return QString::fromLatin1(
2659 encodingInterfaces[QStringConverter::Encoding::System].name);
2662 UErrorCode status = U_ZERO_ERROR;
2663 auto icuName = ucnv_getAvailableName(int32_t(index - 1));
// Prefer the MIME name, then the IANA name, then ICU's own name.
2664 const char *standardName = ucnv_getStandardName(icuName,
"MIME", &status);
2665 if (U_FAILURE(status) || !standardName) {
2666 status = U_ZERO_ERROR;
2667 standardName = ucnv_getStandardName(icuName,
"IANA", &status);
2670 standardName = icuName;
2671 return QString::fromLatin1(standardName);
2675 qsizetype codecCount = availableCodecCount();
2677 result.reserve(codecCount);
2678 for (qsizetype i = 0; i < codecCount; ++i)
2679 result.push_back(availableCodec(i));
2684
2685
2686
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2699
2700
2701
2702
2703
2704
2705
2708
2709
2710
2712
2713
2714
2716
2717
2718
2719
2722
2723
2724
2725
2728
2729
2730
2731
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2764 count = QtPrivate::partiallyParsedDataCount(&state);
2765 using Error = FinalizeResult::Error;
2766 const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
2767 if (count == 0 || !out) {
2769 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2772 return { {}, out, invalidChars, Error::NotEnoughSpace };
2774 const char16_t replacement = (state.flags & QStringConverter::Flag::ConvertInvalidToNull)
2776 : QChar::ReplacementCharacter;
2777 out =
std::fill_n(out, count, replacement);
2779 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2783
2784
2785
2786
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
// Flushes whatever the encoder still holds into `out`, which has room for `maxlen` bytes.
// Returns a FinalizeResult carrying the (possibly advanced) output pointer, the
// invalid-character tally and an error code: NoError, InvalidCharacters or NotEnoughSpace.
2816auto QStringEncoder::finalize(
char *out, qsizetype maxlen) -> QStringEncoder::FinalizeResult
2818 qsizetype count = 0;
// Amount of partially parsed input recorded in the converter state.
2820 count = QtPrivate::partiallyParsedDataCount(&state);
2823 using Error = FinalizeResult::Error;
// The ICU path is taken only when the UsesIcu flag is set and state.d[0] is non-null.
2824 const bool usesIcu = !!(state.flags & QStringConverter::Flag::UsesIcu) && !!state.d[0];
// Pending input is added to the invalid-character tally; the sum is saturated to qint16.
2825 const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
// Early exit: invalid encoder, nothing pending on a non-ICU encoder, or no output buffer.
2826 if (!isValid() || (!count && !usesIcu) || !out) {
2828 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
// ICU-backed encoders: only compiled when ICU codecs are enabled.
2832#if defined(QT_USE_ICU_CODECS)
2833 }
else if (usesIcu) {
// state.d[0] holds the ICU converter handle for this encoder.
2835 auto *icu_conv =
static_cast<UConverter *>(state.d[0]);
2837 UErrorCode err = U_ZERO_ERROR;
// Fetch the from-Unicode callback and re-register it with &state as context
// whenever the stored context no longer points at this object.
2842 UConverterFromUCallback action;
2843 const void *context;
2844 ucnv_getFromUCallBack(icu_conv, &action, &context);
2845 if (context != &state)
2846 ucnv_setFromUCallBack(icu_conv, action, &state,
nullptr,
nullptr, &err);
// Run the converter over an empty input range (start == end), passing the flush
// argument, so only data ICU still has buffered can be written to `out`.
2847 const UChar *dummyInput = u"";
2848 const char *outEnd = out + maxlen;
2849 ucnv_fromUnicode(icu_conv, &out, outEnd, &dummyInput, dummyInput,
nullptr, flush, &err);
// ICU ran out of room in [out, outEnd).
2850 if (err == U_BUFFER_OVERFLOW_ERROR)
2851 return { {}, out, invalidChars, Error::NotEnoughSpace };
2854 }
else if (!(state.flags & QStringConverter::Flag::ConvertInvalidToNull)) {
2856
2857
2858
2859
2860
2861
2862
// Replacement path: encode U+FFFD (QChar::ReplacementCharacter) for the pending input,
// capped at four code units, which is the size of the `replacement` array.
2863 constexpr QChar replacementCharacter = QChar::ReplacementCharacter;
2864 constexpr char16_t repl = replacementCharacter.unicode();
2865 constexpr std::array<
char16_t, 4> replacement{ repl, repl, repl, repl };
2866 const qsizetype charactersToEncode =
std::min(count, qsizetype(replacement.size()));
// Refuse to write when the encoded replacements would need more than maxlen bytes.
2867 if (maxlen < requiredSpace(charactersToEncode))
2868 return { {}, out, invalidChars, Error::NotEnoughSpace };
// Encode the replacement characters through the regular appendToBuffer() path.
2872 out = appendToBuffer(out, QStringView(replacement.data(), charactersToEncode));
// Remaining path: either report that the buffer is too small, or write one
// null byte per pending unit and move `out` past them.
2875 return { {}, out, invalidChars, Error::NotEnoughSpace };
2876 out =
std::fill_n(out, count,
'\0');
// Final status: InvalidCharacters if the tally is non-zero, NoError otherwise.
2879 return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
2883
2884
2885
2886
2887
2888
2889
2890
// First attempt: let encodingForData() determine the encoding from the data itself.
2894 std::optional<QStringConverter::Encoding> encoding = encodingForData(data);
// A detected encoding is used directly to build the decoder.
2897 return QStringDecoder(encoding.value());
// Second attempt: extract an encoding tag with parseHtmlMetaForEncoding().
2899 QByteArray encodingTag = parseHtmlMetaForEncoding(data);
// Only a non-empty tag is acted upon.
2900 if (!encodingTag.isEmpty())
// Produces a decoder for UTF-8.
2903 return QStringDecoder(Utf8);
2908
2909
2910
2911
2912
2913
2914
// Returns the name stored in the encodingInterfaces table for encoding `e`.
// Declared noexcept.
2915const char *QStringConverter::nameForEncoding(QStringConverter::Encoding e)
noexcept
// Guard against an index past the end of encodingInterfaces; marked Q_UNLIKELY
// because this is not the expected case.
2918 if (Q_UNLIKELY(i >= std::size(encodingInterfaces)))
// In-range index: hand back the name of the matching table entry.
2920 return encodingInterfaces[i].name;
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2956
2957
2958
2961
2962
2963
2964
2965
2968
2969
2970
2971
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2986
2987
2988
2989
2990
2991
2992
2993
2994
2997
2998
2999
3000
3001
3002
3003
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3052
3053
3054
3057
3058
3059
3060
3061
3064
3065
3066
3067
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3094
3095
3096
3097
3098
3099
3100
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3118
3119
3120
3121
static int partiallyParsedDataCount(QStringConverter::State *state)
static bool nameMatch(const char *a, QAnyStringView b)
static const uchar utf8bom[]
static QChar * fromUtf32LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QChar * fromUtf16LE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QByteArray parseHtmlMetaForEncoding(QByteArrayView data)
static QChar * fromUtf32BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf8Len(qsizetype l)
static QChar * fromLocal8Bit(QChar *out, QByteArrayView in, QStringConverter::State *state)
static QChar * fromUtf16(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toLatin1Len(qsizetype l)
static bool nameMatch_impl_impl(const char *a, const Char *b, const Char *b_end)
static bool nameMatch_impl(const char *a, QLatin1StringView b)
static QChar * fromUtf32(QChar *out, QByteArrayView in, QStringConverter::State *state)
static char * toUtf32(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf16LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf8Len(qsizetype l)
static char * toLocal8Bit(char *out, QStringView in, QStringConverter::State *state)
static qsizetype toUtf16Len(qsizetype l)
static qsizetype fromLatin1Len(qsizetype l)
static char * toUtf16BE(char *out, QStringView in, QStringConverter::State *state)
static char * toUtf32LE(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf32Len(qsizetype l)
static qsizetype availableCodecCount()
static QChar * fromUtf16BE(QChar *out, QByteArrayView in, QStringConverter::State *state)
static qsizetype toUtf32Len(qsizetype l)
static char * toUtf16(char *out, QStringView in, QStringConverter::State *state)
static qsizetype fromUtf16Len(qsizetype l)
static char * toUtf32BE(char *out, QStringView in, QStringConverter::State *state)
static void appendUtf16(const NoOutput &, char16_t)
static void appendUcs4(const NoOutput &, char32_t)