Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qurlidna.cpp
Go to the documentation of this file.
1// Copyright (C) 2016 The Qt Company Ltd.
2// Copyright (C) 2016 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4// Qt-Security score:critical reason:data-parser
5
6#include "qurl_p.h"
7
8#include <QtCore/qstringlist.h>
9#include <QtCore/private/qnumeric_p.h>
10#include <QtCore/private/qoffsetstringarray_p.h>
11#include <QtCore/private/qstringiterator_p.h>
12#include <QtCore/private/qunicodetables_p.h>
13
14#include <algorithm>
15
16QT_BEGIN_NAMESPACE
17
18using namespace Qt::StringLiterals;
19
20// needed by the punycode encoder/decoder
21static const uint base = 36;
22static const uint tmin = 1;
23static const uint tmax = 26;
24static const uint skew = 38;
25static const uint damp = 700;
26static const uint initial_bias = 72;
27static const uint initial_n = 128;
28
29static constexpr qsizetype MaxDomainLabelLength = 63;
30
// Maps a Punycode digit value to the code point that represents it:
// 0..25 become 'a'..'z' (97..122), 26..35 become '0'..'9' (48..57).
static inline uint encodeDigit(uint digit)
{
    if (digit < 26)
        return digit + 97;
    return digit + 22;
}
35
36static inline uint adapt(uint delta, uint numpoints, bool firsttime)
37{
38 delta /= (firsttime ? damp : 2);
39 delta += (delta / numpoints);
40
41 uint k = 0;
42 for (; delta > ((base - tmin) * tmax) / 2; k += base)
43 delta /= (base - tmin);
44
45 return k + (((base - tmin + 1) * delta) / (delta + skew));
46}
47
48static inline void appendEncode(QString *output, uint delta, uint bias)
49{
50 uint qq;
51 uint k;
52 uint t;
53
54 // insert the variable length delta integer.
55 for (qq = delta, k = base;; k += base) {
56 // stop generating digits when the threshold is
57 // detected.
58 t = (k <= bias) ? tmin : (k >= bias + tmax) ? tmax : k - bias;
59 if (qq < t) break;
60
61 *output += QChar(encodeDigit(t + (qq - t) % (base - t)));
62 qq = (qq - t) / (base - t);
63 }
64
65 *output += QChar(encodeDigit(qq));
66}
67
68Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
69{
70 uint n = initial_n;
71 uint delta = 0;
72 uint bias = initial_bias;
73
74 // Do not try to encode strings that certainly will result in output
75 // that is longer than allowable domain name label length. Note that
76 // non-BMP codepoints are encoded as two QChars.
77 if (in.size() > MaxDomainLabelLength * 2)
78 return;
79
80 int outLen = output->size();
81 output->resize(outLen + in.size());
82
83 QChar *d = output->data() + outLen;
84 bool skipped = false;
85 // copy all basic code points verbatim to output.
86 for (QChar c : in) {
87 if (c.unicode() < 0x80)
88 *d++ = c;
89 else
90 skipped = true;
91 }
92
93 // if there were only basic code points, just return them
94 // directly; don't do any encoding.
95 if (!skipped)
96 return;
97
98 output->truncate(d - output->constData());
99 int copied = output->size() - outLen;
100
101 // h and b now contain the number of basic code points in input.
102 uint b = copied;
103 uint h = copied;
104
105 // if basic code points were copied, add the delimiter character.
106 if (h > 0)
107 *output += u'-';
108
109 // compute the input length in Unicode code points.
110 uint inputLength = 0;
111 for (QStringIterator iter(in); iter.hasNext();) {
112 inputLength++;
113
114 if (iter.next(char32_t(-1)) == char32_t(-1)) {
115 output->truncate(outLen);
116 return; // invalid surrogate pair
117 }
118 }
119
120 // while there are still unprocessed non-basic code points left in
121 // the input string...
122 while (h < inputLength) {
123 // find the character in the input string with the lowest unprocessed value.
124 uint m = std::numeric_limits<uint>::max();
125 for (QStringIterator iter(in); iter.hasNext();) {
126 auto c = iter.nextUnchecked();
127 static_assert(std::numeric_limits<decltype(m)>::max()
128 >= std::numeric_limits<decltype(c)>::max(),
129 "Punycode uint should be able to cover all codepoints");
130 if (c >= n && c < m)
131 m = c;
132 }
133
134 // delta = delta + (m - n) * (h + 1), fail on overflow
135 uint tmp;
136 if (qMulOverflow<uint>(m - n, h + 1, &tmp) || qAddOverflow<uint>(delta, tmp, &delta)) {
137 output->truncate(outLen);
138 return; // punycode_overflow
139 }
140 n = m;
141
142 for (QStringIterator iter(in); iter.hasNext();) {
143 auto c = iter.nextUnchecked();
144
145 // increase delta until we reach the character processed in this iteration;
146 // fail if delta overflows.
147 if (c < n) {
148 if (qAddOverflow<uint>(delta, 1, &delta)) {
149 output->truncate(outLen);
150 return; // punycode_overflow
151 }
152 }
153
154 if (c == n) {
155 appendEncode(output, delta, bias);
156
157 bias = adapt(delta, h + 1, h == b);
158 delta = 0;
159 ++h;
160 }
161 }
162
163 ++delta;
164 ++n;
165 }
166
167 // prepend ACE prefix
168 output->insert(outLen, "xn--"_L1);
169 return;
170}
171
172Q_AUTOTEST_EXPORT QString qt_punycodeDecoder(const QString &pc)
173{
174 uint n = initial_n;
175 uint i = 0;
176 uint bias = initial_bias;
177
178 // Do not try to decode strings longer than allowable for a domain label.
179 // Non-ASCII strings are not allowed here anyway, so there is no need
180 // to account for surrogates.
181 if (pc.size() > MaxDomainLabelLength)
182 return QString();
183
184 // strip any ACE prefix
185 int start = pc.startsWith("xn--"_L1) ? 4 : 0;
186 if (!start)
187 return pc;
188
189 // find the last delimiter character '-' in the input array. copy
190 // all data before this delimiter directly to the output array.
191 int delimiterPos = pc.lastIndexOf(u'-');
192 auto output = delimiterPos < 4 ? std::u32string()
193 : pc.mid(start, delimiterPos - start).toStdU32String();
194
195 // if a delimiter was found, skip to the position after it;
196 // otherwise start at the front of the input string. everything
197 // before the delimiter is assumed to be basic code points.
198 uint cnt = delimiterPos + 1;
199
200 // loop through the rest of the input string, inserting non-basic
201 // characters into output as we go.
202 while (cnt < (uint) pc.size()) {
203 uint oldi = i;
204 uint w = 1;
205
206 // find the next index for inserting a non-basic character.
207 for (uint k = base; cnt < (uint) pc.size(); k += base) {
208 // grab a character from the punycode input and find its
209 // delta digit (each digit code is part of the
210 // variable-length integer delta)
211 uint digit = pc.at(cnt++).unicode();
212 if (digit - 48 < 10) digit -= 22;
213 else if (digit - 65 < 26) digit -= 65;
214 else if (digit - 97 < 26) digit -= 97;
215 else digit = base;
216
217 // Fail if the code point has no digit value
218 if (digit >= base)
219 return QString();
220
221 // i = i + digit * w, fail on overflow
222 uint tmp;
223 if (qMulOverflow<uint>(digit, w, &tmp) || qAddOverflow<uint>(i, tmp, &i))
224 return QString();
225
226 // detect threshold to stop reading delta digits
227 uint t;
228 if (k <= bias) t = tmin;
229 else if (k >= bias + tmax) t = tmax;
230 else t = k - bias;
231
232 if (digit < t) break;
233
234 // w = w * (base - t), fail on overflow
235 if (qMulOverflow<uint>(w, base - t, &w))
236 return QString();
237 }
238
239 // find new bias and calculate the next non-basic code
240 // character.
241 uint outputLength = static_cast<uint>(output.length());
242 bias = adapt(i - oldi, outputLength + 1, oldi == 0);
243
244 // n = n + i div (length(output) + 1), fail on overflow
245 if (qAddOverflow<uint>(n, i / (outputLength + 1), &n))
246 return QString();
247
248 // allow the deltas to wrap around
249 i %= (outputLength + 1);
250
251 // if n is a basic code point then fail; this should not happen with
252 // correct implementation of Punycode, but check just n case.
253 if (n < initial_n) {
254 // Don't use Q_ASSERT() to avoid possibility of DoS
255 qWarning("Attempt to insert a basic codepoint. Unhandled overflow?");
256 return QString();
257 }
258
259 // Surrogates should normally be rejected later by other IDNA code.
260 // But because of Qt's use of UTF-16 to represent strings the
261 // IDNA code is not able to distinguish characters represented as pairs
262 // of surrogates from normal code points. This is why surrogates are
263 // not allowed here.
264 //
265 // Allowing surrogates would lead to non-unique (after normalization)
266 // encoding of strings with non-BMP characters.
267 //
268 // Punycode that encodes characters outside the Unicode range is also
269 // invalid and is rejected here.
270 if (QChar::isSurrogate(n) || n > QChar::LastValidCodePoint)
271 return QString();
272
273 // insert the character n at position i
274 output.insert(i, 1, static_cast<char32_t>(n));
275 ++i;
276 }
277
278 return QString::fromStdU32String(output);
279}
280
281static constexpr auto idn_whitelist = qOffsetStringArray(
282 "ac", "ar", "asia", "at",
283 "biz", "br",
284 "cat", "ch", "cl", "cn", "com",
285 "de", "dk",
286 "es",
287 "fi",
288 "gr",
289 "hu",
290 "il", "info", "io", "ir", "is",
291 "jp",
292 "kr",
293 "li", "lt", "lu", "lv",
294 "museum",
295 "name", "net", "no", "nu", "nz",
296 "org",
297 "pl", "pr",
298 "se", "sh",
299 "tel", "th", "tm", "tw",
300 "ua",
301 "vn",
302 "xn--fiqs8s", // China
303 "xn--fiqz9s", // China
304 "xn--fzc2c9e2c", // Sri Lanka
305 "xn--j6w193g", // Hong Kong
306 "xn--kprw13d", // Taiwan
307 "xn--kpry57d", // Taiwan
308 "xn--mgba3a4f16a", // Iran
309 "xn--mgba3a4fra", // Iran
310 "xn--mgbaam7a8h", // UAE
311 "xn--mgbayh7gpa", // Jordan
312 "xn--mgberp4a5d4ar", // Saudi Arabia
313 "xn--ogbpf8fl", // Syria
314 "xn--p1ai", // Russian Federation
315 "xn--wgbh1c", // Egypt
316 "xn--wgbl6a", // Qatar
317 "xn--xkc2al3hye2a" // Sri Lanka
318);
319
320Q_CONSTINIT static QStringList *user_idn_whitelist = nullptr;
321
322static bool lessThan(const QChar *a, int l, const char *c)
323{
324 const auto *uc = reinterpret_cast<const char16_t *>(a);
325 const char16_t *e = uc + l;
326
327 if (!c || *c == 0)
328 return false;
329
330 while (*c) {
331 if (uc == e || *uc != static_cast<unsigned char>(*c))
332 break;
333 ++uc;
334 ++c;
335 }
336 return uc == e ? *c : (*uc < static_cast<unsigned char>(*c));
337}
338
339static bool equal(const QChar *a, int l, const char *b)
340{
341 while (l && a->unicode() && *b) {
342 if (*a != QLatin1Char(*b))
343 return false;
344 ++a;
345 ++b;
346 --l;
347 }
348 return l == 0;
349}
350
351static bool qt_is_idn_enabled(QStringView aceDomain)
352{
353 auto idx = aceDomain.lastIndexOf(u'.');
354 if (idx == -1)
355 return false;
356
357 auto tldString = aceDomain.mid(idx + 1);
358 const auto len = tldString.size();
359
360 const QChar *tld = tldString.constData();
361
362 if (user_idn_whitelist)
363 return user_idn_whitelist->contains(tldString);
364
365 int l = 0;
366 int r = idn_whitelist.count() - 1;
367 int i = (l + r + 1) / 2;
368
369 while (r != l) {
370 if (lessThan(tld, len, idn_whitelist.at(i)))
371 r = i - 1;
372 else
373 l = i;
374 i = (l + r + 1) / 2;
375 }
376 return equal(tld, len, idn_whitelist.at(i));
377}
378
// True for the characters accepted inside a normalized ASCII label:
// lower-case letters, digits, '-' and '_'.
template<typename C>
static inline bool isValidInNormalizedAsciiLabel(C c)
{
    if (c >= u'a' && c <= u'z')
        return true;
    if (c >= u'0' && c <= u'9')
        return true;
    return c == u'-' || c == u'_';
}
384
// True for the characters accepted inside a normalized ASCII domain name:
// the label separator '.' plus everything allowed inside a label.
template<typename C>
static inline bool isValidInNormalizedAsciiName(C c)
{
    if (c == u'.')
        return true;
    return isValidInNormalizedAsciiLabel(c);
}
390
391/*
392 Map domain name according to algorithm in UTS #46, 4.1
393
394 Returns empty string if there are disallowed characters in the input.
395
396 Sets resultIsAscii if the result is known for sure to be all ASCII.
397*/
398static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options,
399 bool *resultIsAscii)
400{
401 *resultIsAscii = true;
402
403 // Check if the input is already normalized ASCII first and can be returned as is.
404 int i = 0;
405 for (auto c : in) {
406 if (c.unicode() >= 0x80 || !isValidInNormalizedAsciiName(c))
407 break;
408 i++;
409 }
410
411 if (i == in.size())
412 return in;
413
414 QString result;
415 result.reserve(in.size());
416 result.append(in.constData(), i);
417 bool allAscii = true;
418
419 for (QStringIterator iter(QStringView(in).sliced(i)); iter.hasNext();) {
420 char32_t uc = iter.next();
421
422 // Fast path for ASCII-only inputs
423 if (Q_LIKELY(uc < 0x80)) {
424 if (uc >= U'A' && uc <= U'Z')
425 uc |= 0x20; // lower-case it
426
427 if (isValidInNormalizedAsciiName(uc)) {
428 result.append(static_cast<char16_t>(uc));
429 continue;
430 }
431 }
432
433 allAscii = false;
434
435 // Capital sharp S is a special case since UTR #46 revision 31 (Unicode 15.1)
436 if (uc == 0x1E9E && options.testFlag(QUrl::AceTransitionalProcessing)) {
437 result.append(u"ss"_s);
438 continue;
439 }
440
441 QUnicodeTables::IdnaStatus status = QUnicodeTables::idnaStatus(uc);
442
443 if (status == QUnicodeTables::IdnaStatus::Deviation)
444 status = options.testFlag(QUrl::AceTransitionalProcessing)
445 ? QUnicodeTables::IdnaStatus::Mapped
446 : QUnicodeTables::IdnaStatus::Valid;
447
448 switch (status) {
449 case QUnicodeTables::IdnaStatus::Ignored:
450 continue;
451 case QUnicodeTables::IdnaStatus::Valid:
452 case QUnicodeTables::IdnaStatus::Disallowed:
453 for (auto c : QChar::fromUcs4(uc))
454 result.append(c);
455 break;
456 case QUnicodeTables::IdnaStatus::Mapped:
457 result.append(QUnicodeTables::idnaMapping(uc));
458 break;
459 default:
460 Q_UNREACHABLE();
461 }
462 }
463
464 *resultIsAscii = allAscii;
465 return result;
466}
467
468/*
469 Check the rules for an ASCII label.
470
471 Check the size restriction and that the label does not start or end with dashes.
472
473 The label should be nonempty.
474*/
475static bool validateAsciiLabel(QStringView label)
476{
477 if (label.size() > MaxDomainLabelLength)
478 return false;
479
480 if (label.first() == u'-' || label.last() == u'-')
481 return false;
482
483 return std::all_of(label.begin(), label.end(), isValidInNormalizedAsciiLabel<QChar>);
484}
485
486namespace {
487
488class DomainValidityChecker
489{
490 bool domainNameIsBidi = false;
491 bool hadBidiErrors = false;
492 bool ignoreBidiErrors;
493
494 static constexpr char32_t ZWNJ = U'\u200C';
495 static constexpr char32_t ZWJ = U'\u200D';
496
497public:
498 DomainValidityChecker(bool ignoreBidiErrors = false) : ignoreBidiErrors(ignoreBidiErrors) { }
499 bool checkLabel(const QString &label, QUrl::AceProcessingOptions options);
500
501private:
502 static bool checkContextJRules(QStringView label);
503 static bool checkBidiRules(QStringView label);
504};
505
506} // anonymous namespace
507
508/*
509 Check CONTEXTJ rules according to RFC 5892, appendix A.1 & A.2.
510
511 Rule Set for U+200C (ZWNJ):
512
513 False;
514
515 If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
516
517 If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
518
519 (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
520
521 Rule Set for U+200D (ZWJ):
522
523 False;
524
525 If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
526
527*/
528bool DomainValidityChecker::checkContextJRules(QStringView label)
529{
530 constexpr unsigned char CombiningClassVirama = 9;
531
532 enum class State {
533 Initial,
534 LD_T, // L,D with possible following T*
535 ZWNJ_T, // ZWNJ with possible following T*
536 };
537 State regexpState = State::Initial;
538 bool previousIsVirama = false;
539
540 for (QStringIterator iter(label); iter.hasNext();) {
541 auto ch = iter.next();
542
543 if (ch == ZWJ) {
544 if (!previousIsVirama)
545 return false;
546 regexpState = State::Initial;
547 } else if (ch == ZWNJ) {
548 if (!previousIsVirama && regexpState != State::LD_T)
549 return false;
550 regexpState = previousIsVirama ? State::Initial : State::ZWNJ_T;
551 } else {
552 switch (QChar::joiningType(ch)) {
553 case QChar::Joining_Left:
554 if (regexpState == State::ZWNJ_T)
555 return false;
556 regexpState = State::LD_T;
557 break;
558 case QChar::Joining_Right:
559 regexpState = State::Initial;
560 break;
561 case QChar::Joining_Dual:
562 regexpState = State::LD_T;
563 break;
564 case QChar::Joining_Transparent:
565 break;
566 default:
567 regexpState = State::Initial;
568 break;
569 }
570 }
571
572 previousIsVirama = QChar::combiningClass(ch) == CombiningClassVirama;
573 }
574
575 return regexpState != State::ZWNJ_T;
576}
577
578/*
579 Check if the label conforms to BiDi rule of RFC 5893.
580
581 1. The first character must be a character with Bidi property L, R,
582 or AL. If it has the R or AL property, it is an RTL label; if it
583 has the L property, it is an LTR label.
584
585 2. In an RTL label, only characters with the Bidi properties R, AL,
586 AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
587
588 3. In an RTL label, the end of the label must be a character with
589 Bidi property R, AL, EN, or AN, followed by zero or more
590 characters with Bidi property NSM.
591
592 4. In an RTL label, if an EN is present, no AN may be present, and
593 vice versa.
594
595 5. In an LTR label, only characters with the Bidi properties L, EN,
596 ES, CS, ET, ON, BN, or NSM are allowed.
597
598 6. In an LTR label, the end of the label must be a character with
599 Bidi property L or EN, followed by zero or more characters with
600 Bidi property NSM.
601*/
602bool DomainValidityChecker::checkBidiRules(QStringView label)
603{
604 if (label.isEmpty())
605 return true;
606
607 QStringIterator iter(label);
608 Q_ASSERT(iter.hasNext());
609
610 char32_t ch = iter.next();
611 bool labelIsRTL = false;
612
613 switch (QChar::direction(ch)) {
614 case QChar::DirL:
615 break;
616 case QChar::DirR:
617 case QChar::DirAL:
618 labelIsRTL = true;
619 break;
620 default:
621 return false;
622 }
623
624 bool tailOk = true;
625 bool labelHasEN = false;
626 bool labelHasAN = false;
627
628 while (iter.hasNext()) {
629 ch = iter.next();
630
631 switch (QChar::direction(ch)) {
632 case QChar::DirR:
633 case QChar::DirAL:
634 if (!labelIsRTL)
635 return false;
636 tailOk = true;
637 break;
638
639 case QChar::DirL:
640 if (labelIsRTL)
641 return false;
642 tailOk = true;
643 break;
644
645 case QChar::DirES:
646 case QChar::DirCS:
647 case QChar::DirET:
648 case QChar::DirON:
649 case QChar::DirBN:
650 tailOk = false;
651 break;
652
653 case QChar::DirNSM:
654 break;
655
656 case QChar::DirAN:
657 if (labelIsRTL) {
658 if (labelHasEN)
659 return false;
660 labelHasAN = true;
661 tailOk = true;
662 } else {
663 return false;
664 }
665 break;
666
667 case QChar::DirEN:
668 if (labelIsRTL) {
669 if (labelHasAN)
670 return false;
671 labelHasEN = true;
672 }
673 tailOk = true;
674 break;
675
676 default:
677 return false;
678 }
679 }
680
681 return tailOk;
682}
683
684/*
685 Check if the given label is valid according to UTS #46 validity criteria.
686
687 NFC check can be skipped if the label was transformed to NFC before calling
688 this function (as optimization).
689
690 The domain name is considered invalid if this function returns false at least
691 once.
692
693 1. The label must be in Unicode Normalization Form NFC.
694 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
695 in both the third and fourth positions.
696 3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character.
697 4. The label must not contain a U+002E ( . ) FULL STOP.
698 5. The label must not begin with a combining mark, that is: General_Category=Mark.
699 6. Each code point in the label must only have certain status values according to Section 5,
700 IDNA Mapping Table:
701 1. For Transitional Processing, each value must be valid.
702 2. For Nontransitional Processing, each value must be either valid or deviation.
703 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in The Unicode
704 Code Points and Internationalized Domain Names for Applications (IDNA).
705 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy
706 all six of the numbered conditions in RFC 5893, Section 2.
707
708 NOTE: Don't use QStringView for label, so that call to QString::normalized() can avoid
709 memory allocation when there is nothing to normalize.
710*/
711bool DomainValidityChecker::checkLabel(const QString &label, QUrl::AceProcessingOptions options)
712{
713 if (label.isEmpty())
714 return true;
715
716 if (label != label.normalized(QString::NormalizationForm_C))
717 return false;
718
719 if (label.size() >= 4) {
720 // This assumes that the first two characters are in BMP, but that's ok
721 // because non-BMP characters are unlikely to be used for specifying
722 // future extensions.
723 if (label[2] == u'-' && label[3] == u'-')
724 return ignoreBidiErrors && label.startsWith(u"xn") && validateAsciiLabel(label);
725 }
726
727 if (label.startsWith(u'-') || label.endsWith(u'-'))
728 return false;
729
730 if (label.contains(u'.'))
731 return false;
732
733 QStringIterator iter(label);
734 auto c = iter.next();
735
736 if (QChar::isMark(c))
737 return false;
738
739 // As optimization, CONTEXTJ rules check can be skipped if no
740 // ZWJ/ZWNJ characters were found during the first pass.
741 bool hasJoiners = false;
742
743 for (;;) {
744 hasJoiners = hasJoiners || c == ZWNJ || c == ZWJ;
745
746 if (!ignoreBidiErrors && !domainNameIsBidi) {
747 switch (QChar::direction(c)) {
748 case QChar::DirR:
749 case QChar::DirAL:
750 case QChar::DirAN:
751 domainNameIsBidi = true;
752 if (hadBidiErrors)
753 return false;
754 break;
755 default:
756 break;
757 }
758 }
759
760 switch (QUnicodeTables::idnaStatus(c)) {
761 case QUnicodeTables::IdnaStatus::Valid:
762 break;
763 case QUnicodeTables::IdnaStatus::Deviation:
764 if (options.testFlag(QUrl::AceTransitionalProcessing))
765 return false;
766 break;
767 default:
768 return false;
769 }
770
771 if (!iter.hasNext())
772 break;
773 c = iter.next();
774 }
775
776 if (hasJoiners && !checkContextJRules(label))
777 return false;
778
779 hadBidiErrors = hadBidiErrors || !checkBidiRules(label);
780
781 if (domainNameIsBidi && hadBidiErrors)
782 return false;
783
784 return true;
785}
786
787static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot)
788{
789 qsizetype lastIdx = 0;
790 QString aceForm; // this variable is here for caching
791 QString aceResult;
792
793 while (true) {
794 qsizetype idx = normalizedDomain.indexOf(u'.', lastIdx);
795 if (idx == -1)
796 idx = normalizedDomain.size();
797
798 const qsizetype labelLength = idx - lastIdx;
799 if (labelLength) {
800 const auto label = normalizedDomain.sliced(lastIdx, labelLength);
801 aceForm.clear();
802 qt_punycodeEncoder(label, &aceForm);
803 if (aceForm.isEmpty())
804 return {};
805
806 aceResult.append(aceForm);
807 }
808
809 if (idx == normalizedDomain.size())
810 break;
811
812 if (labelLength == 0 && (dot == ForbidLeadingDot || idx > 0))
813 return {}; // two delimiters in a row -- empty label not allowed
814
815 lastIdx = idx + 1;
816 aceResult += u'.';
817 }
818
819 return aceResult;
820}
821
822static bool checkAsciiDomainName(QStringView normalizedDomain, AceLeadingDot dot,
823 bool *usesPunycode)
824{
825 qsizetype lastIdx = 0;
826 bool hasPunycode = false;
827 *usesPunycode = false;
828
829 while (lastIdx < normalizedDomain.size()) {
830 auto idx = normalizedDomain.indexOf(u'.', lastIdx);
831 if (idx == -1)
832 idx = normalizedDomain.size();
833
834 const auto labelLength = idx - lastIdx;
835 if (labelLength == 0) {
836 if (idx == normalizedDomain.size())
837 break;
838 if (dot == ForbidLeadingDot || idx > 0)
839 return false; // two delimiters in a row -- empty label not allowed
840 } else {
841 const auto label = normalizedDomain.sliced(lastIdx, labelLength);
842 if (!validateAsciiLabel(label))
843 return false;
844
845 hasPunycode = hasPunycode || label.startsWith("xn--"_L1);
846 }
847
848 lastIdx = idx + 1;
849 }
850
851 *usesPunycode = hasPunycode;
852 return true;
853}
854
855static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options)
856{
857 QString result;
858 result.reserve(asciiDomain.size());
859 qsizetype lastIdx = 0;
860
861 DomainValidityChecker checker;
862
863 while (true) {
864 auto idx = asciiDomain.indexOf(u'.', lastIdx);
865 if (idx == -1)
866 idx = asciiDomain.size();
867
868 const auto labelLength = idx - lastIdx;
869 if (labelLength == 0) {
870 if (idx == asciiDomain.size())
871 break;
872 } else {
873 const auto label = asciiDomain.sliced(lastIdx, labelLength);
874 const auto unicodeLabel = qt_punycodeDecoder(label);
875
876 if (unicodeLabel.isEmpty())
877 return asciiDomain;
878
879 if (!checker.checkLabel(unicodeLabel, options))
880 return asciiDomain;
881
882 result.append(unicodeLabel);
883 }
884
885 if (idx == asciiDomain.size())
886 break;
887
888 lastIdx = idx + 1;
889 result += u'.';
890 }
891 return result;
892}
893
894static bool checkUnicodeName(const QString &domainName, QUrl::AceProcessingOptions options)
895{
896 qsizetype lastIdx = 0;
897
898 DomainValidityChecker checker(true);
899
900 while (true) {
901 qsizetype idx = domainName.indexOf(u'.', lastIdx);
902 if (idx == -1)
903 idx = domainName.size();
904
905 const qsizetype labelLength = idx - lastIdx;
906 if (labelLength) {
907 const auto label = domainName.sliced(lastIdx, labelLength);
908
909 if (!checker.checkLabel(label, options))
910 return false;
911 }
912
913 if (idx == domainName.size())
914 break;
915
916 lastIdx = idx + 1;
917 }
918 return true;
919}
920
921QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot,
922 QUrl::AceProcessingOptions options)
923{
924 if (domain.isEmpty())
925 return {};
926
927 bool mappedToAscii;
928 const QString mapped = mapDomainName(domain, options, &mappedToAscii);
929 const QString normalized =
930 mappedToAscii ? mapped : mapped.normalized(QString::NormalizationForm_C);
931
932 if (normalized.isEmpty())
933 return {};
934
935 if (!mappedToAscii && !checkUnicodeName(normalized, options))
936 return {};
937
938 bool needsConversionToUnicode;
939 const QString aceResult = mappedToAscii ? normalized : convertToAscii(normalized, dot);
940 if (aceResult.isEmpty() || !checkAsciiDomainName(aceResult, dot, &needsConversionToUnicode))
941 return {};
942
943 if (op == ToAceOnly || !needsConversionToUnicode
944 || (!options.testFlag(QUrl::IgnoreIDNWhitelist) && !qt_is_idn_enabled(aceResult))) {
945 return aceResult;
946 }
947
948 return convertToUnicode(aceResult, options);
949}
950
951/*!
952 \since 4.2
953
954 Returns the current whitelist of top-level domains that are allowed
955 to have non-ASCII characters in their compositions.
956
957 See setIdnWhitelist() for the rationale of this list.
958
959 \sa AceProcessingOption
960*/
961QStringList QUrl::idnWhitelist()
962{
963 if (user_idn_whitelist)
964 return *user_idn_whitelist;
965 static const QStringList list = [] {
966 QStringList list;
967 list.reserve(idn_whitelist.count());
968 int i = 0;
969 while (i < idn_whitelist.count()) {
970 list << QLatin1StringView(idn_whitelist.at(i));
971 ++i;
972 }
973 return list;
974 }();
975 return list;
976}
977
978/*!
979 \since 4.2
980
981 Sets the whitelist of Top-Level Domains (TLDs) that are allowed to have
982 non-ASCII characters in domains to the value of \a list.
983
984 Note that if you call this function, you need to do so \e before
985 you start any threads that might access idnWhitelist().
986
987 Qt comes with a default list that contains the Internet top-level domains
988 that have published support for Internationalized Domain Names (IDNs)
989 and rules to guarantee that no deception can happen between similarly-looking
990 characters (such as the Latin lowercase letter \c 'a' and the Cyrillic
991 equivalent, which in most fonts are visually identical).
992
993 This list is periodically maintained, as registrars publish new rules.
994
995 This function is provided for those who need to manipulate the list, in
996 order to add or remove a TLD. It is not recommended to change its value
997 for purposes other than testing, as it may expose users to security risks.
998*/
999void QUrl::setIdnWhitelist(const QStringList &list)
1000{
1001 if (!user_idn_whitelist)
1002 user_idn_whitelist = new QStringList;
1003 *user_idn_whitelist = list;
1004}
1005
1006QT_END_NAMESPACE
AceLeadingDot
Definition qurl_p.h:33
@ ForbidLeadingDot
Definition qurl_p.h:33
AceOperation
Definition qurl_p.h:34
static const uint initial_bias
Definition qurlidna.cpp:26
static constexpr auto idn_whitelist
Definition qurlidna.cpp:281
static constexpr qsizetype MaxDomainLabelLength
Definition qurlidna.cpp:29
static QString convertToAscii(QStringView normalizedDomain, AceLeadingDot dot)
Definition qurlidna.cpp:787
static bool checkAsciiDomainName(QStringView normalizedDomain, AceLeadingDot dot, bool *usesPunycode)
Definition qurlidna.cpp:822
QString qt_ACE_do(const QString &domain, AceOperation op, AceLeadingDot dot, QUrl::AceProcessingOptions options)
Definition qurlidna.cpp:921
static bool checkUnicodeName(const QString &domainName, QUrl::AceProcessingOptions options)
Definition qurlidna.cpp:894
static bool isValidInNormalizedAsciiName(C c)
Definition qurlidna.cpp:386
static const uint tmin
Definition qurlidna.cpp:22
static const uint damp
Definition qurlidna.cpp:25
static bool qt_is_idn_enabled(QStringView aceDomain)
Definition qurlidna.cpp:351
static const uint initial_n
Definition qurlidna.cpp:27
static bool validateAsciiLabel(QStringView label)
Definition qurlidna.cpp:475
static bool isValidInNormalizedAsciiLabel(C c)
Definition qurlidna.cpp:380
static void appendEncode(QString *output, uint delta, uint bias)
Definition qurlidna.cpp:48
static const uint base
Definition qurlidna.cpp:21
static bool lessThan(const QChar *a, int l, const char *c)
Definition qurlidna.cpp:322
static QString mapDomainName(const QString &in, QUrl::AceProcessingOptions options, bool *resultIsAscii)
Definition qurlidna.cpp:398
static QString convertToUnicode(const QString &asciiDomain, QUrl::AceProcessingOptions options)
Definition qurlidna.cpp:855
static const uint skew
Definition qurlidna.cpp:24
static const uint tmax
Definition qurlidna.cpp:23
Q_AUTOTEST_EXPORT void qt_punycodeEncoder(QStringView in, QString *output)
Definition qurlidna.cpp:68
static bool equal(const QChar *a, int l, const char *b)
Definition qurlidna.cpp:339
static uint encodeDigit(uint digit)
Definition qurlidna.cpp:31
static uint adapt(uint delta, uint numpoints, bool firsttime)
Definition qurlidna.cpp:36