Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qunicodetools.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3// Qt-Security score:critical reason:data-parser
4
6
7#include <QtCore/private/qstringiterator_p.h>
10#if QT_CONFIG(library)
11#include "qlibrary.h"
12#endif
13
14#include <limits.h>
15
// FLAG(x): an int value with only bit number x set; used in this file to
// build bit masks and to test individual bits of a mask.
16#define FLAG(x) (1 << (x))
17
19
20using namespace Qt::StringLiterals;
21
22#ifdef QT_BUILD_INTERNAL
23Q_CONSTINIT Q_AUTOTEST_EXPORT
24#else
25constexpr
26#endif
28
29namespace QUnicodeTools {
30
31// -----------------------------------------------------------------------------------------------------
32//
33// The text boundaries determination algorithm.
34// See https://www.unicode.org/reports/tr29/tr29-37.html
35//
36// -----------------------------------------------------------------------------------------------------
37
38namespace GB {
39
40// This table is indexed by the grapheme break classes of two
41// (adjacent) code points.
42// The class of the first code point selects an entry.
43// If the entry's bit at position second_cp_class is set
44// (in other words: if entry & (1u << second_cp_class) is non-zero)
45// then there is NO grapheme break between the two code points.
46
48
49// Check that we have enough bits in the table (in case
50// NumGraphemeBreakClasses grows too much).
52 "Internal error: increase the size in bits of GBTableEntryType");
53
54// GB9, GB9a
59
// An entry with no bit set: no following class is marked as "no break",
// so a break is allowed after this class whatever comes next.
60static const GBTableEntryType HardBreak = 0u;
61
// One entry per class of the first code point; the trailing comment on
// each entry names that class.
63 Extend_SpacingMark_ZWJ, // Any
65 HardBreak, // LF
66 HardBreak, // Control
67 Extend_SpacingMark_ZWJ, // Extend
68 Extend_SpacingMark_ZWJ, // ZWJ
69 Extend_SpacingMark_ZWJ, // RegionalIndicator
80 ), // Prepend
81 Extend_SpacingMark_ZWJ, // SpacingMark
87 ), // L
91 ), // V
94 ), // T
98 ), // LV
101 ), // LVT
102 Extend_SpacingMark_ZWJ // Extended_Pictographic
103};
104
// shouldBreakBetweenClasses(first, second): true when the table entry for
// `first` does not have the bit for `second` set, i.e. the table alone
// does not forbid a break between the two classes.
107{
108 return (breakTable[first] & FLAG(second)) == 0;
109}
110
111// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
112// so we need to store some local state.
113enum class State : uchar {
115 GB11_ExtPicExt, // saw an Extend after an Extended_Pictographic
116 GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
117 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
118};
119
120} // namespace GB
121
// Walks `string` (len UTF-16 code units) one code point at a time and sets
// attributes[pos].graphemeBoundary wherever a grapheme cluster break is
// found before the code point starting at pos.
// attributes[len] is written unconditionally at the end (GB2), so the
// attributes array must hold at least len + 1 entries.
122static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
123{
125 GB::State state = GB::State::Normal;
126 QStringIterator it(QStringView{string, len});
127 while (it.hasNext()) {
// pos is read before the iterator advances; it is the index used below
// to mark a boundary for the code point fetched on the next line.
128 const qsizetype pos = it.index();
129 const char32_t ucs4 = it.nextOrRawCodeUnit();
130
131 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
132 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
133
// First verdict comes from the pair table alone (previous class lcls,
// current class cls); the state machine below may override it.
134 bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
// Set to true when the state-specific code below has fully dealt with
// this code point, so the generic handling after the switch is skipped.
135 bool handled = false;
136
137 switch (state) {
139 break; // will deal with it below
140
142 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
144 // keep going in the current state
145 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
146 handled = true;
147 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
149 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
150 handled = true;
151 } else {
152 state = GB::State::Normal;
153 }
154 break;
155
157 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
// Override the table's verdict: no break at this position.
159 shouldBreak = false;
160 handled = true;
161 }
162
163 state = GB::State::Normal;
164 break;
165
// Override the table's verdict: no break at this position.
169 shouldBreak = false;
170 handled = true;
171 }
172
173 state = GB::State::Normal;
174 break;
175 }
176
// Generic handling: only reached in the Normal state (asserted below);
// this is where a new special state may be entered.
177 if (!handled) {
178 Q_ASSERT(state == GB::State::Normal);
182 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
183 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
185 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
186 }
187 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
188 state = GB::State::GB12_13_RI;
189 }
190 }
191
192 if (shouldBreak)
193 attributes[pos].graphemeBoundary = true;
194
// The current class becomes the "previous" class of the next iteration.
195 lcls = cls;
196 }
197
198 attributes[len].graphemeBoundary = true; // GB2
199}
200
201
202namespace WB {
203
210
// Word break pair table. getWordBreaks() reads it as breakTable[cls][ncls]:
// the row is the class of the previous code point (named in the trailing
// comment of each row), the column is the class of the current code point
// (named in the header line below). Each cell is the action to take
// between the two.
212// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
213 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
214 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
215 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
216 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
217 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
218 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
219 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
220 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
221 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
224 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
225 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
226 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
227 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
228 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
231 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
232};
233
234} // namespace WB
235
// Walks `string` (len UTF-16 code units) one code point at a time and sets
// attributes[pos].wordBreak at every position where a word break is found.
// At such a position wordEnd is set if a word was open, and wordStart is
// set when the new class opens an alphanumeric or Hiragana/Katakana word.
// attributes[len] is written unconditionally at the end (WB2), so the
// attributes array must hold at least len + 1 entries.
236static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
237{
// Kind of word currently open, if any; decides whether wordEnd is set at
// the next break.
238 enum WordType {
239 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
240 } currentWordType = WordTypeNone;
241
243 auto real_cls = cls; // Unaffected by WB4
244
245 QStringIterator it(QStringView{string, len});
246 while (it.hasNext()) {
// pos is read before the iterator advances; it is the index whose
// attributes are updated for the code point fetched on the next line.
247 const qsizetype pos = it.index();
248 const char32_t ucs4 = it.nextOrRawCodeUnit();
249
250 const auto prop = QUnicodeTables::properties(ucs4);
251 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
253 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
254 // which caused "hi.there" to be treated as if it were a single word;
255 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
256 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
257 if (ucs4 == 0x002E) // FULL STOP
259 else if (ucs4 == 0x003A) // COLON
261 }
262
// Pair-table lookup: row = previous class (cls), column = current class (ncls).
263 uchar action = WB::breakTable[cls][ncls];
264 switch (action) {
265 case WB::Break:
266 if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
267 && prop->graphemeBreakClass
269 // WB3c: ZWJ × \p{Extended_Pictographic}
270 action = WB::NoBreak;
271 }
272 break;
273 case WB::NoBreak:
275 // WB4: X(Extend|Format)* -> X
// Only real_cls is updated here; cls keeps the class of X for the
// next table lookup.
276 real_cls = ncls;
277 continue;
278 }
279 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
280 // WB15/WB16: break between pairs of Regional indicator
282 }
283 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
284 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
285 // WB3d should not be affected by WB4
286 action = WB::Break;
287 }
288 break;
289 case WB::Lookup:
290 case WB::LookupW:
// Scan ahead on a copy of the iterator; the main iterator is only moved
// (it = lookahead) when the lookahead confirms that there is no break.
291 for (auto lookahead = it; lookahead.hasNext(); /**/) {
292 const char32_t ucs4 = lookahead.nextOrRawCodeUnit();
293
294 const auto prop = QUnicodeTables::properties(ucs4);
295 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
296
298 // WB4: X(Extend|Format)* -> X
299 continue;
300 }
301
302 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
303 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
304 it = lookahead;
305 ncls = tcls;
306 action = WB::NoBreak;
307 }
// Only the first code point not skipped by WB4 is examined.
308 break;
309 }
310 if (action != WB::NoBreak) {
311 action = WB::Break;
313 action = WB::NoBreak; // WB7a
314 }
315 break;
316 }
317
// From here on both class trackers refer to the current code point.
318 cls = ncls;
319 real_cls = ncls;
320
321 if (action == WB::Break) {
322 attributes[pos].wordBreak = true;
// A word was open, so it ends at this break.
323 if (currentWordType != WordTypeNone)
324 attributes[pos].wordEnd = true;
// Decide from the new class whether a word starts at this break.
325 switch (cls) {
327 currentWordType = WordTypeHiraganaKatakana;
328 attributes[pos].wordStart = true;
329 break;
333 currentWordType = WordTypeAlphaNumeric;
334 attributes[pos].wordStart = true;
335 break;
336 default:
337 currentWordType = WordTypeNone;
338 break;
339 }
340 }
341 }
342
// Close a word that is still open at the end of the text.
343 if (currentWordType != WordTypeNone)
344 attributes[len].wordEnd = true;
345 attributes[len].wordBreak = true; // WB2
346}
347
348
349namespace SB {
350
367
// State-transition table of the sentence break state machine.
// getSentenceBreaks() evaluates state = breakTable[state][ncls]: the row is
// the current state (named in the trailing comment of each row), the column
// is the class of the current code point (named in the header line below),
// and the cell is the resulting state.
369// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
373

374 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
375 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
376 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
377 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
378

379 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
380 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
381 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
382 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
383 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
384};
385
386} // namespace SB
387
// Walks `string` (len UTF-16 code units) one code point at a time, drives
// the SB state machine and sets attributes[pos].sentenceBoundary wherever
// the machine ends up in the Break state.
// attributes[len] is written unconditionally at the end (SB2), so the
// attributes array must hold at least len + 1 entries.
388static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
389{
390 uchar state = SB::BAfter; // to meet SB1
391
392 QStringIterator it(QStringView{string, len});
393 while (it.hasNext()) {
// pos is read before the iterator advances; it is the index whose
// attributes are updated for the code point fetched on the next line.
394 const qsizetype pos = it.index();
395 const char32_t ucs4 = it.nextOrRawCodeUnit();
396
397 const auto prop = QUnicodeTables::properties(ucs4);
398 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
399
// Guards the row index used for the table lookup on the next line.
400 Q_ASSERT(state <= SB::BAfter);
401 state = SB::breakTable[state][ncls];
402 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
// Assume a break unless the lookahead below switches back to Initial.
403 state = SB::Break;
// Scan ahead on a copy of the iterator; the main iterator is only moved
// (it = lookahead) in the branch that cancels the break.
404 for (auto lookahead = it; lookahead.hasNext(); /**/) {
405 const char32_t ucs4 = lookahead.nextOrRawCodeUnit();
406
407 const auto prop = QUnicodeTables::properties(ucs4);
408 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
409 switch (tcls) {
416 continue;
418 it = lookahead;
419 state = SB::Initial;
420 break;
421 default:
422 break;
423 }
// Any class that does not `continue` above ends the lookahead.
424 break;
425 }
426 }
427 if (Q_UNLIKELY(state == SB::Break)) {
428 attributes[pos].sentenceBoundary = true;
// Restart the state machine from Initial with the current class.
429 state = SB::breakTable[SB::Initial][ncls];
430 }
431 }
432
433 attributes[len].sentenceBoundary = true; // SB2
434}
435
436
437// -----------------------------------------------------------------------------------------------------
438//
439// The line breaking algorithm.
440// See http://www.unicode.org/reports/tr14/tr14-39.html
441//
442// -----------------------------------------------------------------------------------------------------
443
444namespace LB {
445
446namespace NS { // Number Sequence
447
448// This namespace is used to implement LB25 which, as of Unicode 16, has this
449// definition:
450// NU ( SY | IS )* CL × PO
451// NU ( SY | IS )* CP × PO
452// NU ( SY | IS )* CL × PR
453// NU ( SY | IS )* CP × PR
454// NU ( SY | IS )* × PO
455// NU ( SY | IS )* × PR
456// PO × OP NU
457// PO × OP IS NU
458// PO × NU
459// PR × OP NU
460// PR × OP IS NU
461// PR × NU
462// HY × NU
463// IS × NU
464// NU ( SY | IS )* × NU
465
// Actions produced while tracking a number sequence.
466enum Action {
471 NeedOPNU, // Like Start, but must be followed by sequence `(OP (IS)?)? NU`
472 // These are 'synthetic' actions and are not used in the table but are
473 // tracked otherwise in the code for LB25, to track the state of specific
474 // sequences:
475 CNeedNU, // Like Continue, but must be followed by NU
476 CNeedISNU, // Like Continue, but must be followed by IS? NU
477};
478
489
// Square table over the number-sequence classes XX..CLCP.
// getLineBreaks() reads it as actionTable[nelast][necur]: the row is the
// class of the previous character (named in the trailing comment of each
// row), the column is the class of the current one (named in the header).
490static const uchar actionTable[CLCP + 1][CLCP + 1] = {
491// XX PRPO OP HY NU SY IS CLCP
492 { None , NeedOPNU, Start , None , Start , None , None , None }, // XX
493 { None , NeedOPNU, Continue, Break , Start , None , None , None }, // PRPO
494 { None , Start , Start , Break , Continue, None , Continue, None }, // OP
495 { None , None , None , Start , Continue, None , None , None }, // HY
499 { Break , Continue, Break , Break , Break , Break , Break , Break }, // CLCP
500};
501
// toClass(lbc): maps a line break class onto one of the number-sequence
// classes used to index actionTable; every class without an explicit
// mapping ends up as XX.
503{
504 switch (lbc) {
506 return PRPO;
508 return OP;
510 return HY;
512 return NU;
514 return SY;
516 return IS;
518 return CLCP;
519 default:
520 break;
521 }
522 return XX;
523}
524
525} // namespace NS
526
527namespace BRS { // Brahmic Sequence, used to implement LB28a
// U+25CC DOTTED CIRCLE, written as [◌] in the LB28a patterns quoted below.
528 constexpr char32_t DottedCircle = U'\u25CC';
529
530 // The LB28a_{n} value maps to the 'regex' on the nth line in LB28a
531 // The only special case is LB28a_2VI which is a direct match to the 2nd
532 // line, but it also leads to LB28a_3VIAK, the 3rd line.
533 enum State {
535 Start, // => Have: `(AK | [◌] | AS)`
536 LB28a_2VF, // => Have: `(AK | [◌] | AS) VF`
537 LB28a_2VI, // => Have: `(AK | [◌] | AS) VI` May find: `(AK | [◌])`
538 LB28a_3VIAK, // => Have: `(AK | [◌] | AS) VI (AK | [◌])`
539 LB28a_4, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS)` May find: `VF`
540 LB28a_4VF, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS) VF`
542 };
// updateState(state, lb): returns the follow-up state for the current
// `state` given the next character `lb` (its line break class lb.lbc and
// code point lb.ucs4). Must not be called with one of the terminal states
// (LB28a_2VF, LB28a_4VF, LB28a_3VIAK, Restart); those hit Q_UNREACHABLE().
552 {
553 using LBC = QUnicodeTables::LineBreakClass;
// A CM character never changes the state.
554 if (lb.lbc == LBC::LineBreak_CM)
555 return state;
556
557 switch (state) {
558 case Start:
559 if (lb.lbc == LBC::LineBreak_VF)
560 return LB28a_2VF;
561 if (lb.lbc == LBC::LineBreak_VI)
562 return LB28a_2VI;
563 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
564 || lb.lbc == LBC::LineBreak_AS)
565 return LB28a_4;
566 break;
567 case LB28a_2VI:
568 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK)
569 return LB28a_3VIAK;
570 break;
571 case LB28a_4:
572 if (lb.lbc == LBC::LineBreak_VF)
573 return LB28a_4VF;
574 // Had (AK | [◌] | AS) (AK | [◌] | AS), which could mean the 2nd capture is the start
575 // of a new sequence, so we need to check if it makes sense.
576 return Restart;
577 case None:
578 if (Q_UNLIKELY(lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
579 || lb.lbc == LBC::LineBreak_AS)) {
580 return Start;
581 }
582 break;
583 case LB28a_2VF:
584 case LB28a_4VF:
585 case LB28a_3VIAK:
586 case Restart:
587 // These are all terminal states, so no need to update
588 Q_UNREACHABLE();
589 }
// No transition matched: any sequence in progress is abandoned.
590 return None;
591 }
592}
593
// Actions used as cell values of the line break pair table, each with a
// two-letter alias so that the table rows stay compact.
// NOTE(review): the pair table also uses the aliases HH, IN and DN; their
// enumerators are not visible in this listing - confirm against the full source.
594enum Action { // => Given sequence 'BA' (see legacy https://www.unicode.org/reports/tr14/tr14-37.html#ExampleTable):
595 ProhibitedBreak, PB = ProhibitedBreak, // => Do not break between B and A, regardless of spaces
596 DirectBreak, DB = DirectBreak, // => Always a valid break position
597 IndirectBreak, IB = IndirectBreak, // => Only break after B if followed by spaces
598 CombiningIndirectBreak, CI = CombiningIndirectBreak, // => Break after B if A is CM, and B is followed by spaces
599 CombiningProhibitedBreak, CP = CombiningProhibitedBreak, // => Do not break after B if A is CM
603};
604
605// See https://www.unicode.org/reports/tr14/tr14-37.html for information
606// about the table. The table was removed from later versions of the standard.
608/* 1↓ 2→ OP CL CP QU +Pi +Pf +19 GL NS EX SY IS PR PO NU AL HL ID IN HY +WS HH BA +WS HYBA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM AK AP AS VI VF*/
609/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
610/* CL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
611/* CP */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
612/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
613/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
614/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
615/* +19*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
616/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
617/* NS */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
618/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
619/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
620/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DN, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
621/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, DB, DB, DB },
622/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
623/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
624/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
625/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, CI, CI, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
626/* ID */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
627/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
628/* HY */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
629/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, IB, IB, HH, IB, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
630/* HH */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
631/* BA */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
632/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, IB, IB, HH, IB, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
633/*HYBA*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, DB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
634/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB },
635/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
636/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
637/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
638/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
639/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
640/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
641/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
642/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
643/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
644/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB },
645/* CB */ { DB, PB, PB, IB, IB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
646/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB },
647/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
648/* AK */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
649/* AP */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, IB, DB, DB },
650/* AS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
651/* VI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
652/* VF */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
653};
654
655// The following line break classes are not treated by the pair table
656// and must be resolved outside:
657// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX, ZWJ
658
659} // namespace LB
660
661static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
662{
663 qsizetype nestart = 0;
664 LB::NS::Class nelast = LB::NS::XX;
665 LB::NS::Action neactlast = LB::NS::None;
666
667 LB::BRS::ParseState brsState;
668
670 QUnicodeTables::LineBreakClass cls = lcls;
671 const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(U'\n');
672
673 constexpr static auto isEastAsian = [](QUnicodeTables::EastAsianWidth eaw) {
674 using EAW = QUnicodeTables::EastAsianWidth;
675 return eaw == EAW::W || eaw == EAW::F || eaw == EAW::H;
676 };
677
678 for (qsizetype i = 0; i != len; ++i) {
679 qsizetype pos = i;
680 char32_t ucs4 = string[i];
681 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
682 ushort low = string[i + 1];
683 if (QChar::isLowSurrogate(low)) {
684 ucs4 = QChar::surrogateToUcs4(ucs4, low);
685 ++i;
686 }
687 }
688
689 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
690 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
692
694 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
696 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
697 ) {
698 // LB27: use SPACE for line breaking
699 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
700 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
701 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
703 } else {
704 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
705 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
706 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
707 if (FLAG(prop->category) & test)
709 }
710 }
711 }
712
713 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
714 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
715 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
716 if (FLAG(prop->category) & test)
718 }
719
720 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
721 if (prop->category == QChar::Punctuation_InitialQuote) {
722 // LB15a: Do not break after an unresolved initial punctuation
723 // that lies at the start of the line, after a space, after
724 // opening punctuation, or after an unresolved quotation mark,
725 // even after spaces.
726 // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
727 // [\p{Pi}&QU] SP* ×
728 // Note: sot is treated as LF here due to initial loop setup.
729 constexpr QUnicodeTables::LineBreakClass lb15a[] = {
735 if (std::any_of(std::begin(lb15a), std::end(lb15a),
736 [lcls](auto x) { return x == lcls; })) {
738 }
739 } else if (prop->category == QChar::Punctuation_FinalQuote) {
740 // LB15b: Do not break before an unresolved final punctuation
741 // that lies at the end of the line, before a space, before
742 // a prohibited break, or before an unresolved quotation mark,
743 // even after spaces.
744 // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
745 // | SY | BK | CR | LF | NL | ZW | eot)
746 const auto nncls = [&] {
747 if (i + 1 >= len)
749 char32_t c = string[i + 1];
750 if (QChar::isHighSurrogate(c) && i + 2 < len) {
751 ushort low = string[i + 2];
752 if (QChar::isLowSurrogate(low))
753 c = QChar::surrogateToUcs4(c, low);
754 else
755 return QUnicodeTables::LineBreak_SG; // all surrogates
756 }
757 return QUnicodeTables::lineBreakClass(c);
758 }();
759
760 constexpr QUnicodeTables::LineBreakClass lb15b[] = {
769 if (std::any_of(std::begin(lb15b), std::end(lb15b),
770 [nncls](auto x) { return x == nncls; })) {
772 }
773 }
774 }
775
776 if (Q_UNLIKELY((lcls >= QUnicodeTables::LineBreak_SP || lcls == QUnicodeTables::LineBreak_ZW
780 || ncls == QUnicodeTables::LineBreak_HH))) {
781 // LB20a: Do not break after a word-initial hyphen.
782 // ( sot | BK | CR | LF | NL | SP | ZW | CB | GL ) ( HY | HH ) × ( AL | HL )
783
784 // Remap to the synthetic class WS_* (whitespace+*), which is just
785 // like the current respective linebreak class but with an IB action
786 // if the next class is AL or HL.
787 // The synthetic class tied to HH is named WS_BA because it was previously a subset of
788 // BA. With Unicode 17.0.0, the HH line-break class was introduced, split out from BA.
791 else
793 }
794
795 if (Q_UNLIKELY(cls == QUnicodeTables::LineBreak_AP && ucs4 == LB::BRS::DottedCircle)
796 && lcls != QUnicodeTables::LineBreak_SP) {
797 // LB28a: Do not break inside the orthographic syllables of Brahmic scripts
798 // AP × (AK | [◌] | AS)
799 // @note: AP × (AK | AS) is checked by the breakTable
800 goto next;
801 }
802 while (true) { // May need to recheck once.
803 // LB28a cont'd
804 LB::BRS::State oldState = brsState.state;
805 brsState.state = LB::BRS::updateState(brsState.state, {ncls, ucs4});
806 if (Q_LIKELY(brsState.state == oldState))
807 break;
808 switch (brsState.state) {
809 case LB::BRS::Start:
810 brsState.start = i;
811 break;
812 case LB::BRS::LB28a_2VI: // Wait for more characters, but also valid sequence
813 // We may get another character, but this is already a complete
814 // sequence that should not have any breaks:
815 for (qsizetype j = brsState.start + 1; j < i; ++j)
816 attributes[j].lineBreak = false;
817 // No need to mark this sequence again later, so move 'start'
818 // up to the current position:
819 brsState.start = i;
820 goto next;
821 case LB::BRS::Restart:
822 // The previous character was possibly the start of a new sequence
823 brsState.state = LB::BRS::Start;
824 brsState.start = pos - 1;
825 continue; // Doing the loop again!
829 for (qsizetype j = brsState.start + 1; j < i; ++j)
830 attributes[j].lineBreak = false;
831 if (brsState.state == LB::BRS::LB28a_3VIAK) {
832 // This might be the start of a new sequence
833 brsState.state = LB::BRS::Start;
834 brsState.start = i;
835 } else {
836 brsState.state = LB::BRS::None;
837 }
838 goto next;
839 case LB::BRS::LB28a_4: // Wait for more characters
840 Q_LIKELY_BRANCH
841 case LB::BRS::None: // Nothing to do
842 break;
843 }
844 break;
845 }
846
847 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_IS)) {
848 // LB15c Break before a decimal mark that follows a space, for instance, in
849 // ‘subtract .5’.
850 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_SP)) {
851 if (i + 1 < len) {
852 constexpr char32_t Invalid = ~U'\0';
853 char32_t ch = string[i + 1];
854 if (QChar::isHighSurrogate(ch) && i + 2 < len) {
855 ushort low = string[i + 2];
856 if (QChar::isLowSurrogate(low))
857 ch = QChar::surrogateToUcs4(ch, low);
858 else
859 ch = Invalid;
860 }
861 if (ch != Invalid // surrogates won't match (ensured by util/unicode)
862 && QUnicodeTables::lineBreakClass(ch) == QUnicodeTables::LineBreak_NU) {
863 attributes[pos].lineBreak = true;
864 goto next;
865 }
866 }
867 }
868 }
869
870 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_HL)) {
871 // LB21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
872 // HL (HY | HH) × [^HL]
874 // Remap to synthetic HYBA class which handles the next
875 // character. Generally (LB21) there are no breaks before
876 // HY or HH, so we can skip ahead to the next character.
877 // The synthetic class is named such because prior to Unicode 17 this
878 // rule applied to a subset of BA. Unicode 17 added the HH class,
879 // which occupies the subset of BA that was previously used.
881 goto next;
882 }
883 }
884
885 // LB25: do not break lines inside numbers
886 {
887 LB::NS::Class necur = LB::NS::toClass(ncls);
888 LB::NS::Action neact = LB::NS::Action(LB::NS::actionTable[nelast][necur]);
889 if (Q_UNLIKELY(neactlast == LB::NS::CNeedNU && necur != LB::NS::NU)) {
890 neact = LB::NS::None;
891 } else if (Q_UNLIKELY(neactlast == LB::NS::NeedOPNU)) {
892 if (necur == LB::NS::OP)
893 neact = LB::NS::CNeedISNU;
894 else if (necur == LB::NS::NU)
895 neact = LB::NS::Continue;
896 else // Anything else and we ignore the sequence
897 neact = LB::NS::None;
898 } else if (Q_UNLIKELY(neactlast == LB::NS::CNeedISNU)) {
899 if (necur == LB::NS::IS)
900 neact = LB::NS::CNeedNU;
901 else if (necur == LB::NS::NU)
902 neact = LB::NS::Continue;
903 else // Anything else and we ignore the sequence
904 neact = LB::NS::None;
905 }
906 switch (neact) {
907 case LB::NS::Break:
908 // do not change breaks before and after the expression
909 for (qsizetype j = nestart + 1; j < pos; ++j)
910 attributes[j].lineBreak = false;
911 Q_FALLTHROUGH();
912 Q_LIKELY_BRANCH
913 case LB::NS::None:
914 nelast = LB::NS::XX; // reset state
915 break;
916 case LB::NS::NeedOPNU:
917 case LB::NS::Start:
918 if (neactlast == LB::NS::Start || neactlast == LB::NS::Continue) {
919 // Apply the linebreaks for the previous stretch; we need to start a new one
920 for (qsizetype j = nestart + 1; j < pos; ++j)
921 attributes[j].lineBreak = false;
922 }
923 nestart = i;
924 Q_FALLTHROUGH();
925 case LB::NS::CNeedNU:
927 case LB::NS::Continue:
928 nelast = necur;
929 break;
930 }
931 neactlast = neact;
932 }
933
934 // LB19a Unless surrounded by East Asian characters, do not break either side of any
935 // unresolved quotation marks
936 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU
938 && lcls != QUnicodeTables::LineBreak_ZW)) {
939 using EAW = QUnicodeTables::EastAsianWidth;
940 constexpr static auto nextCharNonEastAsian = [](const char16_t *string, qsizetype len) {
941 if (len > 0) {
942 char32_t nch = string[0];
943 if (QChar::isHighSurrogate(nch) && len > 1) {
944 char16_t low = string[1];
945 if (QChar::isLowSurrogate(low))
946 nch = QChar::surrogateToUcs4(char16_t(nch), low);
947 }
948 const auto *nextProp = QUnicodeTables::properties(nch);
950 nextProp->lineBreakClass);
951 QUnicodeTables::EastAsianWidth neaw = EAW(nextProp->eastAsianWidth);
952 return nncls != QUnicodeTables::LineBreak_CM
953 && nncls <= QUnicodeTables::LineBreak_SP
954 && !isEastAsian(neaw);
955 }
956 return true; // end-of-text counts as non-East-Asian
957 };
958 if (Q_UNLIKELY(!isEastAsian(EAW(lastProp->eastAsianWidth))
959 || nextCharNonEastAsian(string + i + 1, len - i - 1))) {
960 // Remap to the synthetic QU_19 class which has indirect breaks
961 // for most following classes.
963 }
964 }
965
966 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
967 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
968 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
969 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
970 goto next;
971 }
972
973 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
975 goto next; // LB6: x(BK|CR|LF|NL)
976 goto next_no_cls_update; // LB7: xSP
977 }
978
979 // LB19 - do not break before non-initial unresolved quotation marks, or after non-final
980 // unresolved quotation marks
981 if (Q_UNLIKELY(((ncls == QUnicodeTables::LineBreak_QU
982 || ncls == QUnicodeTables::LineBreak_QU_19)
983 && prop->category != QChar::Punctuation_InitialQuote)
984 || (cls == QUnicodeTables::LineBreak_QU
985 && lastProp->category != QChar::Punctuation_FinalQuote))) {
986 // Make sure the previous character is not one that we have to break after.
987 // Also skip if ncls is CM so it can be treated as lcls (LB9)
989 && ncls != QUnicodeTables::LineBreak_CM) {
990 goto next;
991 }
992 }
993
994 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
995 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
997 // don't update anything
998 goto next_no_cls_update;
999 }
1000
1001 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
1002 // LB8a: ZWJ x
1003 goto next;
1004 }
1005
1006 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
1007 // LB30a
1009 goto next;
1010 }
1011
1012 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
1013 && lastProp->category == QChar::Other_NotAssigned
1014 && lastProp->graphemeBreakClass
1015 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)
1016 && lcls != QUnicodeTables::LineBreak_SP) {
1017 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
1018 goto next;
1019 }
1020
1021 // for South East Asian chars that require a complex analysis, the Unicode
1022 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
1023 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
1025
1026 tcls = cls;
1027
1028 constexpr static auto remapToAL = [](QUnicodeTables::LineBreakClass &c, auto &property) {
1029 if (Q_UNLIKELY(c == QUnicodeTables::LineBreak_CM
1032 property = QUnicodeTables::properties(U'\u0041');
1033 }
1034 };
1035 // LB10 Treat any remaining combining mark or ZWJ as AL,
1036 // as if it had the properties of U+0041 A LATIN CAPITAL LETTER
1037 remapToAL(tcls, lastProp);
1038 remapToAL(ncls, prop);
1039
1041 case LB::DirectBreak:
1042 attributes[pos].lineBreak = true;
1043 break;
1044 case LB::IndirectBreak:
1045 if (lcls == QUnicodeTables::LineBreak_SP)
1046 attributes[pos].lineBreak = true;
1047 break;
1049 if (lcls != QUnicodeTables::LineBreak_SP)
1050 goto next_no_cls_update;
1051 attributes[pos].lineBreak = true;
1052 break;
1054 if (lcls != QUnicodeTables::LineBreak_SP)
1055 goto next_no_cls_update;
1056 break;
1057 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
1058 if (lcls != QUnicodeTables::LineBreak_HL)
1059 attributes[pos].lineBreak = true;
1060 break;
1062 using EAW = QUnicodeTables::EastAsianWidth;
1063 switch (EAW(prop->eastAsianWidth)) {
1064 default:
1065 if (lcls != QUnicodeTables::LineBreak_SP)
1066 break;
1067 Q_FALLTHROUGH();
1068 case QUnicodeTables::EastAsianWidth::F:
1069 case QUnicodeTables::EastAsianWidth::W:
1070 case QUnicodeTables::EastAsianWidth::H:
1071 attributes[pos].lineBreak = true;
1072 break;
1073 }
1074 break;
1075 case LB::DirectBreakOutsideNumericSequence:
1076 if (neactlast == LB::NS::None || neactlast > LB::NS::Break)
1077 attributes[pos].lineBreak = true;
1078 break;
1080 // nothing to do
1081 default:
1082 break;
1083 }
1084
1085 next:
1087 cls = ncls;
1088 lastProp = prop;
1089 }
1090 next_no_cls_update:
1091 lcls = ncls;
1092 }
1093
1094 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
1095 // LB25: do not break lines inside numbers
1096 for (qsizetype j = nestart + 1; j < len; ++j)
1097 attributes[j].lineBreak = false;
1098 }
1099
1100 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
1101 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
1102}
1103
1104
1105static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1106{
1107 QStringIterator it(QStringView{string, len});
1108 while (it.hasNext()) {
1109 const auto pos = it.index();
1110 if (Q_UNLIKELY(QChar::isSpace(it.nextOrRawCodeUnit())))
1111 attributes[pos].whiteSpace = true;
1112 }
1113}
1114
1115namespace Tailored {
1116
// Pointer type for a script-specific attribute routine. It takes the script, the text,
// the start offset and length of the run to process, and the attribute array to update.
// The *Attributes() functions below (indic, thai, tibetan, myanmar) have this signature.
1117using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
1118
1119
1134
1135static const unsigned char indicForms[0xe00-0x900] = {
1136 // Devanagari
1141
1146
1151
1156
1161
1166
1171
1176
1177 // Bengali
1182
1187
1192
1197
1202
1207
1212
1217
1218 // Gurmukhi
1223
1228
1233
1238
1243
1248
1253
1258
1259 // Gujarati
1264
1269
1274
1279
1284
1289
1294
1299
1300 // Oriya
1305
1310
1315
1320
1325
1330
1335
1340
1341 //Tamil
1346
1351
1356
1361
1366
1371
1376
1381
1382 // Telugu
1387
1392
1397
1402
1407
1412
1417
1422
1423 // Kannada
1428
1433
1438
1443
1448
1453
1458
1463
1464 // Malayalam
1469
1474
1479
1484
1489
1494
1499
1504
1505 // Sinhala
1510
1515
1520
1525
1530
1535
1540
1545};
1546
1547static inline Form form(unsigned short uc) {
1548 if (uc < 0x900 || uc > 0xdff) {
1549 if (uc == 0x25cc)
1550 return Consonant;
1551 if (uc == 0x200c || uc == 0x200d)
1552 return Control;
1553 return Other;
1554 }
1555 return (Form)indicForms[uc-0x900];
1556}
1557
// Debug tracing switch for the Indic syllable scanner.
// With INDIC_DEBUG defined, IDEBUG(...) is a plain qDebug(...) call. Otherwise it expands to
// "if constexpr (1) ; else qDebug", which places the qDebug call in the else branch of an
// always-true if constexpr, so the arguments are still compiled but never evaluated.
1558// #define INDIC_DEBUG
1559#ifdef INDIC_DEBUG
1560#define IDEBUG qDebug
1561#else
1562#define IDEBUG if constexpr (1) ; else qDebug
1563#endif
1564
1565/* syllables are of the form:
1566
1567 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1568 (Consonant Nukta? Halant)* Consonant Halant
1569 IndependentVowel VowelMark? StressMark?
1570
1571 We return syllable boundaries on invalid combinations as well
1572*/
// Finds the end of the Indic syllable that begins at s[start].
//
// script  - script of the run; enables the Bengali, Kannada, Sinhala and Malayalam
//           special cases below.
// s       - UTF-16 text.
// start   - index of the first code unit of the syllable. s[start] is read
//           unconditionally, so the caller must ensure start < end.
// end     - index one past the last code unit that may be examined.
// invalid - out parameter; set to true only when the first code unit is neither a
//           Consonant, an IndependentVowel nor Other (i.e. the syllable starts with a
//           dependent sign), false otherwise.
//
// Returns the index (relative to s, not to start) of the code unit following the
// syllable. The result is always at least start + 1.
1573static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1574{
1575 *invalid = false;
1576 IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1577 const char16_t *uc = s+start;
1578
// pos is relative to start; 'state' is the form of the most recently accepted code unit.
1579 qsizetype pos = 0;
1580 Form state = form(uc[pos]);
1581 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1582 pos++;
1583
// A syllable must open with a consonant or an independent vowel; anything else is
// returned as a one-unit syllable, flagged invalid unless it is a non-Indic character.
1584 if (state != Consonant && state != IndependentVowel) {
1585 if (state != Other)
1586 *invalid = true;
1587 goto finish;
1588 }
1589
// In the switch below, 'break' accepts uc[pos] into the syllable (state is updated after
// the switch), while 'goto finish' ends the syllable before uc[pos].
1590 while (pos < end - start) {
1591 Form newState = form(uc[pos]);
1592 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1593 switch (newState) {
1594 case Control:
// A control character never changes the state, so keep the previous one.
1595 newState = state;
1596 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1597 break;
1598 // the control character should be the last char in the item
1599 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1600 break;
1601 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1602 break;
1603 // Bengali and Kannada have a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
// Any other control character is included and terminates the syllable.
1604 ++pos;
1605 goto finish;
1606 case Consonant:
// A consonant continues the syllable only after a halant (for Sinhala: only after
// halant + ZWJ).
1607 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1608 break;
1609 goto finish;
1610 case Halant:
1611 if (state == Nukta || state == Consonant)
1612 break;
1613 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1614 if (script == QChar::Script_Bengali && pos == 1 &&
1615 (uc[0] == 0x0985 || uc[0] == 0x098f))
1616 break;
1617 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1618 if (script == QChar::Script_Sinhala && state == Matra) {
1619 ++pos;
1620 continue;
1621 }
// Malayalam: a halant directly after U+0D41 is accepted as well, again without
// leaving the Matra state.
1622 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1623 ++pos;
1624 continue;
1625 }
1626 goto finish;
1627 case Nukta:
1628 if (state == Consonant)
1629 break;
1630 goto finish;
// StressMark, VowelMark and Matra deliberately fall through into each other: each
// case first tests its own permitted predecessors and then inherits those of the
// next case.
1631 case StressMark:
1632 if (state == VowelMark)
1633 break;
1634 Q_FALLTHROUGH();
1635 case VowelMark:
1636 if (state == Matra || state == LengthMark || state == IndependentVowel)
1637 break;
1638 Q_FALLTHROUGH();
1639 case Matra:
1640 if (state == Consonant || state == Nukta)
1641 break;
1642 if (state == Matra) {
1643 // ### needs proper testing for correct two/three part matras
1644 break;
1645 }
1646 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1647 // it work for all Indic languages?
1648 // the combination Independent_A + Vowel Sign AA is allowed.
1649 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1650 break;
// NOTE(review): state == Matra has already taken the 'break' a few lines up, so this
// Tamil two-part-matra check looks unreachable — confirm.
1651 if (script == QChar::Script_Tamil && state == Matra) {
1652 if (uc[pos-1] == 0x0bc6 &&
1653 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1654 break;
1655 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1656 break;
1657 }
1658 goto finish;
1659
1660 case LengthMark:
1661 if (state == Matra) {
1662 // ### needs proper testing for correct two/three part matras
1663 break;
1664 }
1665 Q_FALLTHROUGH();
// These forms always start a new syllable.
1666 case IndependentVowel:
1667 case Invalid:
1668 case Other:
1669 goto finish;
1670 }
1671 state = newState;
1672 pos++;
1673 }
1674 finish:
// Convert the relative position back into an index into s.
1675 return pos+start;
1676}
1677
1678static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1679{
1680 qsizetype end = from + len;
1681 attributes += from;
1682 qsizetype i = 0;
1683 while (i < len) {
1684 bool invalid;
1685 qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1686 attributes[i].graphemeBoundary = true;
1687
1688 if (boundary > len-1) boundary = len;
1689 i++;
1690 while (i < boundary) {
1691 attributes[i].graphemeBoundary = false;
1692 ++i;
1693 }
1694 assert(i == boundary);
1695 }
1696
1697
1698}
1699
1700#if QT_CONFIG(library)
1701
1702#define LIBTHAI_MAJOR 0
1703
1704/*
1705 * if libthai changed please update these codes too.
1706 */
// Local declaration of the cell structure that th_next_cell() writes into, so that no
// libthai header is needed here. Presumably it has to stay layout-compatible with
// libthai's own definition — verify against the libthai headers when upgrading.
1707struct thcell_t {
1708 unsigned char base; /**< base character */
1709 unsigned char hilo; /**< upper/lower vowel/diacritic */
1710 unsigned char top; /**< top-level mark */
1711};
1712
1713using ThBrk = struct _ThBrk;
1714
1715namespace {
1716
1717class LibThai final
1718{
1720
1721 using th_brk_new_def = ThBrk *(*)(const char *);
1722 using th_brk_delete_def = void (*)(ThBrk *);
1723 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1724 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1725
1726public:
1727 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1728 {
1730 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks"));
1731 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell"));
1732
1733 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new"));
1734 if (th_brk_new) {
1735 m_state = th_brk_new(nullptr);
1737 reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete"));
1738 }
1739 }
1740
1741 ~LibThai()
1742 {
1743 if (m_state && m_th_brk_delete)
1745 m_library.unload();
1746 }
1747
1748 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1749
1750 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1751 {
1755 }
1756
1757 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1758 {
1761 }
1762
1763private:
1765
1766 // Global state for th_brk_find_breaks().
1767 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1768 // state is read-only, and so it is safe to use it from multiple threads after
1769 // initialization. This is also stated in the libthai documentation.
1770 ThBrk *m_state = nullptr;
1771
1775};
1776
1777} // unnamed namespace
1778
1780
// Converts len UTF-16 code units to TIS-620 bytes and appends a NUL terminator,
// so cstr must have room for len + 1 bytes.
//  - code units up to U+00A0 are copied unchanged,
//  - the Thai range U+0E01..U+0E5B maps to 0xA1..0xFB,
//  - anything else becomes 0xFF, the same encoding libthai uses for invalid chars.
static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
{
    auto *out = reinterpret_cast<unsigned char *>(cstr);

    qsizetype idx = 0;
    while (idx < len) {
        const char16_t unit = string[idx];
        unsigned char encoded = static_cast<unsigned char>(~0);
        if (unit <= 0xa0)
            encoded = static_cast<unsigned char>(unit);
        else if (unit >= 0xe01 && unit <= 0xe5b)
            encoded = static_cast<unsigned char>(unit - 0xe00 + 0xa0);
        out[idx++] = encoded;
    }

    out[len] = 0;
}
1797
1798/*
1799 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1800 */
1801static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1802{
1803 constexpr qsizetype Prealloc = 128;
1804 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1807 struct thcell_t tis_cell;
1808
1810 if (!libThai || !libThai->isInitialized())
1811 return;
1812
1813 to_tis620(string, len, s.data());
1814
1815 for (i = 0; i < len; ++i) {
1816 attributes[i].wordBreak = false;
1817 attributes[i].wordStart = false;
1818 attributes[i].wordEnd = false;
1819 attributes[i].lineBreak = false;
1820 }
1821
1822 attributes[0].wordBreak = true;
1823 attributes[0].wordStart = true;
1824 attributes[0].wordEnd = false;
1825 numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()),
1827 static_cast<size_t>(break_positions.size()));
1828 for (i = 0; i < numbreaks; ++i) {
1833 }
1834 if (numbreaks > 0)
1836
1837 /* manage grapheme boundaries */
1838 i = 0;
1839 while (i < len) {
1841 libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i,
1842 size_t(len - i), &tis_cell, true);
1843
1845 for (size_t j = 1; j < cell_length; ++j)
1846 attributes[i + j].graphemeBoundary = false;
1847
1848 i += cell_length;
1849 }
1850}
1851
1852#endif // QT_CONFIG(library)
1853
1854static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1855{
1856 assert(script == QChar::Script_Thai);
1857#if QT_CONFIG(library)
1858 const char16_t *uc = text + from;
1859 attributes += from;
1860 Q_UNUSED(script);
1861 thaiAssignAttributes(uc, len, attributes);
1862#else
1863 Q_UNUSED(script);
1864 Q_UNUSED(text);
1865 Q_UNUSED(from);
1866 Q_UNUSED(len);
1867 Q_UNUSED(attributes);
1868#endif
1869}
1870
1871/*
1872 tibetan syllables are of the form:
1873 head position consonant
1874 first sub-joined consonant
1875 ....intermediate sub-joined consonants (if any)
1876 last sub-joined consonant
1877 sub-joined vowel (a-chung U+0F71)
1878 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1879*/
1880
1888
1889/* this table starts at U+0f40 */
1890static const unsigned char tibetanForm[0x80] = {
1895
1900
1905
1910
1915
1920
1925
1930};
1931
/* Classifies a UTF-16 code unit: values in U+0F40..U+0FBF are looked up in
   tibetanForm (which starts at U+0F40); everything else is TibetanOther.
   The argument is evaluated more than once, so it must be free of side effects.
   The trailing backslash is required: without it the macro expands to nothing. */
#define tibetan_form(c) \
    ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1934
1935static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1936{
1937 const char16_t *uc = s + start;
1938
1939 qsizetype pos = 0;
1940 TibetanForm state = tibetan_form(*uc);
1941
1942/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1943 pos++;
1944
1945 if (state != TibetanHeadConsonant) {
1946 if (state != TibetanOther)
1947 *invalid = true;
1948 goto finish;
1949 }
1950
1951 while (pos < end - start) {
1952 TibetanForm newState = tibetan_form(uc[pos]);
1953 switch (newState) {
1956 if (state != TibetanHeadConsonant &&
1958 goto finish;
1959 state = newState;
1960 break;
1961 case TibetanVowel:
1962 if (state != TibetanHeadConsonant &&
1963 state != TibetanSubjoinedConsonant &&
1964 state != TibetanSubjoinedVowel)
1965 goto finish;
1966 break;
1967 case TibetanOther:
1969 goto finish;
1970 }
1971 pos++;
1972 }
1973
1974finish:
1975 *invalid = false;
1976 return start+pos;
1977}
1978
1979static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1980{
1981 qsizetype end = from + len;
1982 qsizetype i = 0;
1983 Q_UNUSED(script);
1984 attributes += from;
1985 while (i < len) {
1986 bool invalid;
1987 qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1988
1989 attributes[i].graphemeBoundary = true;
1990
1991 if (boundary > len-1) boundary = len;
1992 i++;
1993 while (i < boundary) {
1994 attributes[i].graphemeBoundary = false;
1995 ++i;
1996 }
1997 assert(i == boundary);
1998 }
1999}
2000
2003 Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
2004 Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
2005 Mymr_CC_NGA = 3, /* Consonant NGA */
2006 Mymr_CC_YA = 4, /* Consonant YA */
2007 Mymr_CC_RA = 5, /* Consonant RA */
2008 Mymr_CC_WA = 6, /* Consonant WA */
2009 Mymr_CC_HA = 7, /* Consonant HA */
2010 Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
2011 Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
2012 Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
2013 Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */
2014 Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, prebase (Vowel u, uu) */
2015 Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, prebase (Vowel i, ii, ai) */
2016 Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, prebase (Vowel aa) */
2020 Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
2021 Mymr_CC_COUNT = 19 /* This is the number of character classes */
2022};
2023
2026
2027 Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2028 Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
2029 Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
2030 Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
2031 Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
2032 first in a syllable */
2033 Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */
2034
2035 /* position flags */
2037 Mymr_CF_POS_BELOW = 0x00040000,
2038 Mymr_CF_POS_ABOVE = 0x00020000,
2039 Mymr_CF_POS_AFTER = 0x00010000,
2040 Mymr_CF_POS_MASK = 0x000f0000,
2041
2043};
2044
2045Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
2046
2047/* Characters that get referred to by name */
2049{
2053 Mymr_C_RA = 0x101B,
2054 Mymr_C_YA = 0x101A,
2055 Mymr_C_NGA = 0x1004,
2058};
2059
2060enum
2061{
2079};
2080
2081
// A Myanmar character class value combined with its flag bits, stored as a plain int.
using MymrCharClass = int;
2083
2084
2100
2101static MymrCharClass
2103{
2104 if (ch == Mymr_C_SIGN_ZWJ)
2106
2107 if (ch == Mymr_C_SIGN_ZWNJ)
2109
2110 if (ch < 0x1000 || ch > 0x105f)
2111 return Mymr_CC_RESERVED;
2112
2113 return mymrCharClasses[ch - 0x1000];
2114}
2115
// State-transition table used by myanmar_nextSyllableBoundary().
// The row is the current state (0 is the start state); the column is the character
// class of the next code unit (the class value masked with Mymr_CF_CLASS_MASK); the
// entry is the next state. A negative entry stops the scan: on -1 the syllable ends
// before the code unit just examined, on -2 the scan additionally steps back one
// position before stopping.
2116static const signed char mymrStateTable[][Mymr_CC_COUNT] =
2117{
2118/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
2119 { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
2120 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
2121 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
2122 {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
2123 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
2124 {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
2125 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
2126 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
2127 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
2128 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
2129 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
2130 {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
2131 {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
2132 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
2133 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
2134 {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
2135 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
2136 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
2137 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
2138 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
2139 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
2140 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
2141 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
2142 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
2143 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
2144 {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
2145 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
2146 {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
2147/* exit state -2 is for invalid order of medials and combination of invalids
2148 with virama where virama should treat as start of next syllable
2149 */
2150};
2151
/* Debug tracing switch for the Myanmar syllable scanner.
   With MYANMAR_DEBUG defined, MMDEBUG(...) is a qDebug(...) call. Otherwise it expands
   to "if (0) printf", so the arguments are still compiled but never evaluated.
   The line continuations are required: without them MMDEBUG is empty and the
   "if (0)" / "printf" lines end up as stray tokens at file scope. */
/*#define MYANMAR_DEBUG */
#ifdef MYANMAR_DEBUG
#define MMDEBUG qDebug
#else
#  define MMDEBUG \
    if (0) \
        printf
#endif
2160
2161/*
2162// Given an input string of characters and a location in which to start looking
2163// calculate, using the state table, which one is the last character of the syllable
2164// that starts in the starting position.
2165*/
2166static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2167{
2168 const char16_t *uc = s + start;
2169 int state = 0;
2170 qsizetype pos = start;
2171 *invalid = false;
2172
2173 while (pos < end) {
2174 MymrCharClass charClass = getMyanmarCharClass(*uc);
2175 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
2176 if (pos == start)
2177 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
2178
2179 MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
2180
2181 if (state < 0) {
2182 if (state < -1)
2183 --pos;
2184 break;
2185 }
2186 ++uc;
2187 ++pos;
2188 }
2189 return pos;
2190}
2191
2192static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2193{
2194 qsizetype end = from + len;
2195 qsizetype i = 0;
2196 Q_UNUSED(script);
2197 attributes += from;
2198 while (i < len) {
2199 bool invalid;
2200 qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
2201
2202 attributes[i].graphemeBoundary = true;
2203 attributes[i].lineBreak = true;
2204
2205 if (boundary > len-1)
2206 boundary = len;
2207 i++;
2208 while (i < boundary) {
2209 attributes[i].graphemeBoundary = false;
2210 ++i;
2211 }
2212 assert(i == boundary);
2213 }
2214}
2215
2216/*
2217// Vocabulary
2218// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
2219// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
2220// split vowels, signs... but there is only one base in a syllable, it has to be coded as
2221// the first character of the syllable.
2222// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
2223// Khmer language has five of them. Khmer split vowels either have one part before the
2224// base and one after the base or they have a part before the base and a part above the base.
2225// The first part of all Khmer split vowels is the same character, identical to
2226// the glyph of Khmer dependent vowel SRA EI
2227// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
2228// Differently than indian languages, the coeng modifies the consonant that follows it,
2229// not the one preceding it Each consonant has two forms, the base form and the subscript form
2230// the base form is the normal one (using the consonants code-point), the subscript form is
2231// displayed when the combination coeng + consonant is encountered.
2232// Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
2233// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
2234// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
2235// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
2236// if it is attached to a consonant of the first series or a consonant of the second series
2237// Most consonants have an equivalent in the other series, but some of them exist only in
2238// one series (for example SA). If we want to use the consonant SA with a vowel sound that
2239// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
2240// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
2241// (U+17C9 and U+17CA). TRIISAP changes a first series consonant to second series sound and
2242// MUSIKATOAN a second series consonant to have a first series vowel sound.
2243// Consonant shifters are both normally superscript marks, but, when they are followed by a
2244// superscript, they change shape and take the form of subscript dependent vowel SRA U.
2245// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
2246// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
2247// be placed after the coeng consonant.
2248// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2249// Each vowel has its own position. Only one vowel per syllable is allowed.
2250// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2251// Allowed in a syllable.
2252//
2253//
2254// order is important here! This order must be the same that is found in each horizontal
2255// line in the statetable for Khmer (see khmerStateTable) .
2256*/
2259 CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
2260 CC_CONSONANT2 = 2, /* Consonant of type 2 */
2261 CC_CONSONANT3 = 3, /* Consonant of type 3 */
2262 CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
2264 CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
2265 CC_COENG = 7, /* Subscript consonant combining character */
2269 CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
2270 CC_COUNT = 12 /* This is the number of character classes */
2271};
2272
2273
2275 CF_CLASS_MASK = 0x0000FFFF,
2276
2277 CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2278 CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
2279 CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
2280 CF_COENG = 0x08000000, /* flag to speed up comparing */
2281 CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
2282 CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */
2283
2284 /* position flags */
2285 CF_POS_BEFORE = 0x00080000,
2286 CF_POS_BELOW = 0x00040000,
2287 CF_POS_ABOVE = 0x00020000,
2288 CF_POS_AFTER = 0x00010000,
2289 CF_POS_MASK = 0x000f0000
2290};
2291
2292Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
2293
2294/* Characters that get referred to by name */
2296 C_SIGN_ZWNJ = 0x200C,
2297 C_SIGN_ZWJ = 0x200D,
2298 C_RO = 0x179A,
2299 C_VOWEL_AA = 0x17B6,
2301 C_VOWEL_E = 0x17C1,
2302 C_COENG = 0x17D2
2303};
2304
2305
2306/*
2307// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2308// they are also used to know where a character should be placed (location in reference to the base character)
2309// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2310// indicate error in syllable construction
2311*/
2312enum {
2326
2327 /* split vowel */
2330};
2331
2332
/*
// KhmerCharClass holds one character-class value combined (bitwise OR)
// with any number of character-class flags.
*/
using KhmerCharClass = unsigned long;
2338
2339
2340/*
2341// Character class tables
2342// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
2343// _sa Sign placed above the base
2344// _sp Sign placed after the base
2345// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2346// _c2 Consonant of type 2 (only RO)
2347// _c3 Consonant of type 3
2348// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2349// _cd Consonant-shifter
2350// _dl Dependent vowel placed before the base (left of the base)
2351// _db Dependent vowel placed below the base
2352// _da Dependent vowel placed above the base
2353// _dr Dependent vowel placed behind the base (right of the base)
2354// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2355// it to create a subscript consonant or independent vowel
2356// _va Khmer split vowel in which the first part is before the base and the second one above the base
2357// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2358*/
2360 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2361 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2362 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2363 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2364 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2365 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2366};
2367
2368/* this enum must reflect the range of khmerCharClasses */
2373
2374/*
2375// Below we define how a character in the input string is either in the khmerCharClasses table
2376// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2377// within the syllable, but are not in the table) we also get their type back, or an unknown object
2378// in which case we get _xx (CC_RESERVED) back
2379*/
2381{
2382 if (uc == C_SIGN_ZWJ) {
2383 return CC_ZERO_WIDTH_J_MARK;
2384 }
2385
2386 if (uc == C_SIGN_ZWNJ) {
2387 return CC_ZERO_WIDTH_NJ_MARK;
2388 }
2389
2390 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2391 return CC_RESERVED;
2392 }
2393
2394 return khmerCharClasses[uc - KhmerFirstChar];
2395}
2396
2397
2398/*
2399// The stateTable is used to calculate the end (the length) of a well
2400// formed Khmer Syllable.
2401//
2402// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2403// CharClassValues. This coincidence of values allows the follow up of the table.
2404//
2405// Each line corresponds to a state, which does not necessarily need to be a type
2406// of component... for example, state 2 is a base, which is always a first character
2407// in the syllable, but the state could be produced by a consonant of any type when
2408// it is the first character that is analysed (in ground state).
2409//
2410// Differentiating 3 types of consonants is necessary in order to
2411// forbid the use of certain combinations, such as having a second
2412// coeng after a coeng RO,
2413// The inexistent possibility of having a type 3 after another type 3 is permitted,
2414// eliminating it would very much complicate the table, and it does not create typing
2415// problems, as the case above.
2416//
2417// The table is quite complex, in order to limit the number of coeng consonants
2418// to 2 (by means of the table).
2419//
2420// There is a peculiarity, as far as Unicode is concerned:
2421// - The consonant-shifter is considered in two possible different
2422// locations, the one considered in Unicode 3.0 and the one considered in
2423// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2424//
2425//
2426// xx independent character, such as a number, punctuation sign or non-khmer char
2427//
2428// c1 Khmer consonant of type 1 or an independent vowel
2429// that is, a letter in which the subscript form is only under the
2430// base, not taking any space to the right or to the left
2431//
2432// c2 Khmer consonant of type 2, the coeng form takes space under
2433// and to the left of the base (only RO is of this type)
2434//
2435// c3 Khmer consonant of type 3. Its subscript form takes space under
2436// and to the right of the base.
2437//
2438// cs Khmer consonant shifter
2439//
2440// rb Khmer robat
2441//
2442// co coeng character (u17D2)
2443//
2444// dv dependent vowel (including split vowels, they are treated in the same way).
2445// even if dv is not defined above, the component that is really tested for is
2446// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2447//
2448// zwj Zero Width joiner
2449//
2450// zwnj Zero width non joiner
2451//
2452// sa above sign
2453//
2454// sp post sign
2455//
2456// there are lines with equal content but for an easier understanding
2457// (and maybe change in the future) we did not join them
2458*/
/*
// khmerStateTable[state][class]: the row is the current state, the column is the
// character class value (columns are labelled in the comment on the first row).
// Each entry is the next state. khmer_nextSyllableBoundary() starts in state 0
// and stops scanning as soon as it reads a negative entry.
*/
2459static const signed char khmerStateTable[][CC_COUNT] =
2460{
2461 /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2462 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2463 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2464 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2465 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter. It can only be followed by a shifter or a vowel */
2466 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2467 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2468 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2469 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2470 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2471 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
2472 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2473 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2474 {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2475 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2476 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2477 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2478 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2479 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2480 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2481 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2482 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2483};
2484
2485
/* #define KHMER_DEBUG */
/*
// KHDEBUG(fmt, ...) takes printf-style arguments.
// With KHMER_DEBUG defined it forwards to qDebug. Otherwise it expands to
// "if (0) printf", so the call is still compiled but its arguments are never
// evaluated. Because the expansion is an unbraced "if", use KHDEBUG only as a
// complete statement that is not directly followed by an "else".
*/
#ifdef KHMER_DEBUG
#  define KHDEBUG qDebug
#else
#  define KHDEBUG \
    if (0) \
        printf
#endif
2494
2495/*
2496// Given an input string of characters and a location in which to start looking
2497// calculate, using the state table, which one is the last character of the syllable
2498// that starts in the starting position.
2499*/
2500static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2501{
2502 const char16_t *uc = s + start;
2503 int state = 0;
2504 qsizetype pos = start;
2505 *invalid = false;
2506
2507 while (pos < end) {
2508 KhmerCharClass charClass = getKhmerCharClass(*uc);
2509 if (pos == start) {
2510 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2511 }
2512 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2513
2514 KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2515 charClass, *uc );
2516
2517 if (state < 0) {
2518 break;
2519 }
2520 ++uc;
2521 ++pos;
2522 }
2523 return pos;
2524}
2525
2526static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2527{
2528 qsizetype end = from + len;
2529 qsizetype i = 0;
2530 Q_UNUSED(script);
2531 attributes += from;
2532 while ( i < len ) {
2533 bool invalid;
2534 qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2535
2536 attributes[i].graphemeBoundary = true;
2537
2538 if ( boundary > len-1 ) boundary = len;
2539 i++;
2540 while ( i < boundary ) {
2541 attributes[i].graphemeBoundary = false;
2542 ++i;
2543 }
2544 assert( i == boundary );
2545 }
2546}
2547
2548
2550{
2551 switch (script) {
2552 case QChar::Script_Unknown:
2553 case QChar::Script_Inherited:
2554 case QChar::Script_Common:
2555 case QChar::Script_Latin:
2556 case QChar::Script_Greek:
2557 case QChar::Script_Cyrillic:
2558 case QChar::Script_Armenian:
2559 case QChar::Script_Hebrew:
2560 case QChar::Script_Arabic:
2561 case QChar::Script_Syriac:
2562 case QChar::Script_Thaana:
2563 return nullptr;
2564 case QChar::Script_Devanagari:
2565 case QChar::Script_Bengali:
2566 case QChar::Script_Gurmukhi:
2567 case QChar::Script_Gujarati:
2568 case QChar::Script_Oriya:
2569 case QChar::Script_Tamil:
2570 case QChar::Script_Telugu:
2571 case QChar::Script_Kannada:
2572 case QChar::Script_Malayalam:
2573 case QChar::Script_Sinhala:
2574 return &indicAttributes;
2575 case QChar::Script_Thai:
2576 return &thaiAttributes;
2577 case QChar::Script_Lao:
2578 return nullptr;
2579 case QChar::Script_Tibetan:
2580 return &tibetanAttributes;
2581 case QChar::Script_Myanmar:
2582 return &myanmarAttributes;
2583 case QChar::Script_Georgian:
2584 case QChar::Script_Hangul:
2585 case QChar::Script_Ethiopic:
2586 case QChar::Script_Cherokee:
2587 case QChar::Script_CanadianAboriginal:
2588 case QChar::Script_Ogham:
2589 case QChar::Script_Runic:
2590 return nullptr;
2591 case QChar::Script_Khmer:
2592 return &khmerAttributes;
2593 case QChar::Script_Mongolian:
2594 case QChar::Script_Hiragana:
2595 case QChar::Script_Katakana:
2596 case QChar::Script_Bopomofo:
2597 case QChar::Script_Han:
2598 case QChar::Script_Yi:
2599 case QChar::Script_OldItalic:
2600 case QChar::Script_Gothic:
2601 case QChar::Script_Deseret:
2602 case QChar::Script_Tagalog:
2603 case QChar::Script_Hanunoo:
2604 case QChar::Script_Buhid:
2605 case QChar::Script_Tagbanwa:
2606 case QChar::Script_Coptic:
2607 case QChar::Script_Limbu:
2608 case QChar::Script_TaiLe:
2609 case QChar::Script_LinearB:
2610 case QChar::Script_Ugaritic:
2611 case QChar::Script_Shavian:
2612 case QChar::Script_Osmanya:
2613 case QChar::Script_Cypriot:
2614 case QChar::Script_Braille:
2615 case QChar::Script_Buginese:
2616 case QChar::Script_NewTaiLue:
2617 case QChar::Script_Glagolitic:
2618 case QChar::Script_Tifinagh:
2619 case QChar::Script_SylotiNagri:
2620 case QChar::Script_OldPersian:
2621 case QChar::Script_Kharoshthi:
2622 case QChar::Script_Balinese:
2623 case QChar::Script_Cuneiform:
2624 case QChar::Script_Phoenician:
2625 case QChar::Script_PhagsPa:
2626 case QChar::Script_Nko:
2627 case QChar::Script_Sundanese:
2628 case QChar::Script_Lepcha:
2629 case QChar::Script_OlChiki:
2630 case QChar::Script_Vai:
2631 case QChar::Script_Saurashtra:
2632 case QChar::Script_KayahLi:
2633 case QChar::Script_Rejang:
2634 case QChar::Script_Lycian:
2635 case QChar::Script_Carian:
2636 case QChar::Script_Lydian:
2637 case QChar::Script_Cham:
2638 case QChar::Script_TaiTham:
2639 case QChar::Script_TaiViet:
2640 case QChar::Script_Avestan:
2641 case QChar::Script_EgyptianHieroglyphs:
2642 case QChar::Script_Samaritan:
2643 case QChar::Script_Lisu:
2644 case QChar::Script_Bamum:
2645 case QChar::Script_Javanese:
2646 case QChar::Script_MeeteiMayek:
2647 case QChar::Script_ImperialAramaic:
2648 case QChar::Script_OldSouthArabian:
2649 case QChar::Script_InscriptionalParthian:
2650 case QChar::Script_InscriptionalPahlavi:
2651 case QChar::Script_OldTurkic:
2652 case QChar::Script_Kaithi:
2653 case QChar::Script_Batak:
2654 case QChar::Script_Brahmi:
2655 case QChar::Script_Mandaic:
2656 case QChar::Script_Chakma:
2657 case QChar::Script_MeroiticCursive:
2658 case QChar::Script_MeroiticHieroglyphs:
2659 case QChar::Script_Miao:
2660 case QChar::Script_Sharada:
2661 case QChar::Script_SoraSompeng:
2662 case QChar::Script_Takri:
2663 case QChar::Script_CaucasianAlbanian:
2664 case QChar::Script_BassaVah:
2665 case QChar::Script_Duployan:
2666 case QChar::Script_Elbasan:
2667 case QChar::Script_Grantha:
2668 case QChar::Script_PahawhHmong:
2669 case QChar::Script_Khojki:
2670 case QChar::Script_LinearA:
2671 case QChar::Script_Mahajani:
2672 case QChar::Script_Manichaean:
2673 case QChar::Script_MendeKikakui:
2674 case QChar::Script_Modi:
2675 case QChar::Script_Mro:
2676 case QChar::Script_OldNorthArabian:
2677 case QChar::Script_Nabataean:
2678 case QChar::Script_Palmyrene:
2679 case QChar::Script_PauCinHau:
2680 case QChar::Script_OldPermic:
2681 case QChar::Script_PsalterPahlavi:
2682 case QChar::Script_Siddham:
2683 case QChar::Script_Khudawadi:
2684 case QChar::Script_Tirhuta:
2685 case QChar::Script_WarangCiti:
2686 case QChar::Script_Ahom:
2687 case QChar::Script_AnatolianHieroglyphs:
2688 case QChar::Script_Hatran:
2689 case QChar::Script_Multani:
2690 case QChar::Script_OldHungarian:
2691 case QChar::Script_SignWriting:
2692 case QChar::Script_Adlam:
2693 case QChar::Script_Bhaiksuki:
2694 case QChar::Script_Marchen:
2695 case QChar::Script_Newa:
2696 case QChar::Script_Osage:
2697 case QChar::Script_Tangut:
2698 case QChar::Script_MasaramGondi:
2699 case QChar::Script_Nushu:
2700 case QChar::Script_Soyombo:
2701 case QChar::Script_ZanabazarSquare:
2702 case QChar::Script_Dogra:
2703 case QChar::Script_GunjalaGondi:
2704 case QChar::Script_HanifiRohingya:
2705 case QChar::Script_Makasar:
2706 case QChar::Script_Medefaidrin:
2707 case QChar::Script_OldSogdian:
2708 case QChar::Script_Sogdian:
2709 case QChar::Script_Elymaic:
2710 case QChar::Script_Nandinagari:
2711 case QChar::Script_NyiakengPuachueHmong:
2712 case QChar::Script_Wancho:
2713 case QChar::Script_Chorasmian:
2714 case QChar::Script_DivesAkuru:
2715 case QChar::Script_KhitanSmallScript:
2716 case QChar::Script_Yezidi:
2717 case QChar::Script_CyproMinoan:
2718 case QChar::Script_OldUyghur:
2719 case QChar::Script_Tangsa:
2720 case QChar::Script_Toto:
2721 case QChar::Script_Vithkuqi:
2722 case QChar::Script_Kawi:
2723 case QChar::Script_NagMundari:
2724 case QChar::Script_Garay:
2725 case QChar::Script_GurungKhema:
2726 case QChar::Script_KiratRai:
2727 case QChar::Script_OlOnal:
2728 case QChar::Script_Sunuwar:
2729 case QChar::Script_Todhri:
2730 case QChar::Script_TuluTigalari:
2731 case QChar::Script_Sidetic:
2732 case QChar::Script_TaiYo:
2733 case QChar::Script_TolongSiki:
2734 case QChar::Script_BeriaErfe:
2735 return nullptr;
2736 case QChar::ScriptCount:
2737 // Don't Q_UNREACHABLE here; this might be a newer value in later Qt versions
2738 // (incl. patch releases)
2739 ;
2740 }
2741 return nullptr;
2742};
2743
2744static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2745 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2746 QCharAttributes *attributes)
2747{
2748 if (stringLength == 0)
2749 return;
2750 for (qsizetype i = 0; i < numItems; ++i) {
2751 QChar::Script script = items[i].script;
2752 CharAttributeFunction attributeFunction = charAttributeFunction(script);
2753 if (!attributeFunction)
2754 continue;
2755 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2756 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2757 }
2758}
2759
2760}
2761
2762Q_CORE_EXPORT void initCharAttributes(QStringView string,
2763 const ScriptItem *items, qsizetype numItems,
2764 QCharAttributes *attributes, CharAttributeOptions options)
2765{
2766 if (string.size() <= 0)
2767 return;
2768
2769 if (!(options & DontClearAttributes))
2770 ::memset(attributes, 0, (string.size() + 1) * sizeof(QCharAttributes));
2771
2772 if (options & GraphemeBreaks)
2773 getGraphemeBreaks(string.utf16(), string.size(), attributes);
2774 if (options & WordBreaks)
2775 getWordBreaks(string.utf16(), string.size(), attributes);
2776 if (options & SentenceBreaks)
2777 getSentenceBreaks(string.utf16(), string.size(), attributes);
2778 if (options & LineBreaks)
2779 getLineBreaks(string.utf16(), string.size(), attributes, options);
2780 if (options & WhiteSpaces)
2781 getWhiteSpaces(string.utf16(), string.size(), attributes);
2782
2784 if (!items || numItems <= 0)
2785 return;
2786
2787 Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2788 }
2789}
2790
2791
2792// ----------------------------------------------------------------------------
2793//
2794// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2795//
2796// ----------------------------------------------------------------------------
2797
2798Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2799{
2800 qsizetype sor = 0;
2801 QChar::Script script = QChar::Script_Common;
2802
2803 QStringIterator it(string);
2804 while (it.hasNext()) {
2805 const auto eor = it.index();
2806 const char32_t ucs4 = it.nextOrRawCodeUnit();
2807
2808 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2809
2810 QChar::Script nscript = QChar::Script(prop->script);
2811
2812 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2813 continue;
2814
2815 // inherit preceding Common-s
2816 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2817 // also covers a case where the base character of Common script followed
2818 // by one or more combining marks of non-Inherited, non-Common script
2819 script = nscript;
2820 continue;
2821 }
2822
2823 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2824 // Thus, a combining mark - whatever its script property value is - should inherit
2825 // the script property value of its base character.
2826 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2827 if (Q_UNLIKELY(FLAG(prop->category) & test))
2828 continue;
2829
2830 Q_ASSERT(script > QChar::Script_Common);
2831 Q_ASSERT(sor < eor);
2832 scripts->append(ScriptItem{sor, script});
2833 sor = eor;
2834
2835 script = nscript;
2836 }
2837
2838 Q_ASSERT(script >= QChar::Script_Common);
2839 scripts->append(ScriptItem{sor, script});
2840}
2841
2842} // namespace QUnicodeTools
2843
2844QT_END_NAMESPACE
Combined button and popup list for selecting options.
static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first, QUnicodeTables::GraphemeBreakClass second)
static const GBTableEntryType Extend_SpacingMark_ZWJ
static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses]
static const GBTableEntryType HardBreak
State updateState(State state, LinebreakUnit lb)
constexpr char32_t DottedCircle
Class toClass(QUnicodeTables::LineBreakClass lbc)
static const uchar actionTable[CLCP+1][CLCP+1]
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ]
static const uchar breakTable[BAfter+1][QUnicodeTables::NumSentenceBreakClasses]
static const KhmerCharClass khmerCharClasses[]
static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static CharAttributeFunction charAttributeFunction(QChar::Script script)
static MymrCharClass getMyanmarCharClass(ushort ch)
static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const signed char mymrStateTable[][Mymr_CC_COUNT]
static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static const MymrCharClass mymrCharClasses[]
static Form form(unsigned short uc)
static const unsigned char indicForms[0xe00-0x900]
static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
void(*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) CharAttributeFunction
static const signed char khmerStateTable[][CC_COUNT]
static void getCharAttributes(const char16_t *string, qsizetype stringLength, const QUnicodeTools::ScriptItem *items, qsizetype numItems, QCharAttributes *attributes)
static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const unsigned char tibetanForm[0x80]
static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static KhmerCharClass getKhmerCharClass(ushort uc)
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses]
static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initCharAttributes(QStringView string, const ScriptItem *items, qsizetype numItems, QCharAttributes *attributes, CharAttributeOptions options)
static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
#define FLAG(x)
Definition qchar.cpp:14
#define KHDEBUG
#define IDEBUG
constexpr int qt_initcharattributes_default_algorithm_only
#define tibetan_form(c)
#define MMDEBUG
QUnicodeTables::LineBreakClass lbc