Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qunicodetools.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3// Qt-Security score:critical reason:data-parser
4
6
9#if QT_CONFIG(library)
10#include "qlibrary.h"
11#endif
12
13#include <limits.h>
14
15#define FLAG(x) (1 << (x))
16
18
19using namespace Qt::StringLiterals;
20
21#ifdef QT_BUILD_INTERNAL
22Q_CONSTINIT Q_AUTOTEST_EXPORT
23#else
24constexpr
25#endif
27
28namespace QUnicodeTools {
29
30// -----------------------------------------------------------------------------------------------------
31//
32// The text boundaries determination algorithm.
33// See https://www.unicode.org/reports/tr29/tr29-37.html
34//
35// -----------------------------------------------------------------------------------------------------
36
37namespace GB {
38
39// This table is indexed by the grapheme break classes of two
40// (adjacent) code points.
41// The class of the first code point selects an entry.
42// If the entry's bit at position second_cp_class is set
43// (in other words: if entry & (1u << second_cp_class) is non-zero)
44// then there is NO grapheme break between the two code points.
45
47
48// Check that we have enough bits in the table (in case
49// NumGraphemeBreakClasses grows too much).
51 "Internal error: increase the size in bits of GBTableEntryType");
52
53// GB9, GB9a
58
// An entry with no bits set: a break is reported before every class.
59static const GBTableEntryType HardBreak = 0u;
60
62 Extend_SpacingMark_ZWJ, // Any
64 HardBreak, // LF
65 HardBreak, // Control
66 Extend_SpacingMark_ZWJ, // Extend
67 Extend_SpacingMark_ZWJ, // ZWJ
68 Extend_SpacingMark_ZWJ, // RegionalIndicator
79 ), // Prepend
80 Extend_SpacingMark_ZWJ, // SpacingMark
86 ), // L
90 ), // V
93 ), // T
97 ), // LV
100 ), // LVT
101 Extend_SpacingMark_ZWJ // Extended_Pictographic
102};
103
106{
    // Break unless the bit for 'second' is set in the entry selected by 'first'.
107 return (breakTable[first] & FLAG(second)) == 0;
108}
109
110// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
111// so we need to store some local state.
112enum class State : uchar {
114 GB11_ExtPicExt, // saw an Extend after an Extended_Pictographic
115 GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
116 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
117};
118
119} // namespace GB
120
// Marks grapheme cluster boundaries in the UTF-16 text string[0..len).
// For each code point that starts a new cluster, graphemeBoundary is set on
// the attribute at the index of its first code unit. attributes[len] is
// always written (GB2), so the array needs at least len + 1 entries.
121static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
122{
124 GB::State state = GB::State::Normal;
125 for (qsizetype i = 0; i != len; ++i) {
    // pos stays on the first code unit even if a surrogate pair advances i.
126 qsizetype pos = i;
127 char32_t ucs4 = string[i];
    // Combine a high/low surrogate pair into one code point; an unpaired
    // high surrogate is looked up as-is.
128 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
129 ushort low = string[i + 1];
130 if (QChar::isLowSurrogate(low)) {
131 ucs4 = QChar::surrogateToUcs4(ucs4, low);
132 ++i;
133 }
134 }
135
136 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
137 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
138
    // Table verdict for (previous class, current class); the state machine
    // below may override it to "no break".
139 bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
140 bool handled = false;
141
142 switch (state) {
144 break; // will deal with it below
145
147 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
149 // keep going in the current state
150 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
151 handled = true;
152 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
154 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
155 handled = true;
156 } else {
157 state = GB::State::Normal;
158 }
159 break;
160
162 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
164 shouldBreak = false;
165 handled = true;
166 }
167
168 state = GB::State::Normal;
169 break;
170
174 shouldBreak = false;
175 handled = true;
176 }
177
178 state = GB::State::Normal;
179 break;
180 }
181
    // Not consumed by a multi-character rule above: we are in the Normal
    // state and may enter a new one based on the current class.
182 if (!handled) {
183 Q_ASSERT(state == GB::State::Normal);
187 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
188 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
190 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
191 }
192 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
193 state = GB::State::GB12_13_RI;
194 }
195 }
196
197 if (shouldBreak)
198 attributes[pos].graphemeBoundary = true;
199
    // The current class becomes the "previous" class for the next iteration.
200 lcls = cls;
201 }
202
203 attributes[len].graphemeBoundary = true; // GB2
204}
205
206
207namespace WB {
208
215
// Rows are selected by the class carried over from the preceding text,
// columns by the class of the current code point (getWordBreaks reads this
// table as breakTable[cls][ncls]).
217// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
218 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
219 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
220 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
221 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
222 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
223 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
224 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
225 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
226 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
229 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
230 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
231 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
232 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
233 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
236 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
237};
238
239} // namespace WB
240
// Marks word boundaries in the UTF-16 text string[0..len).
// At each break, wordBreak is set on the attribute at the index of the first
// code unit after the break. wordStart/wordEnd are set as well when a word of
// a tracked type (alphanumeric or Hiragana/Katakana) begins or ends there.
// attributes[len] is always written (WB2), so the array needs at least
// len + 1 entries.
241static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
242{
243 enum WordType {
244 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
245 } currentWordType = WordTypeNone;
246
248 auto real_cls = cls; // Unaffected by WB4
249
250 for (qsizetype i = 0; i != len; ++i) {
    // pos stays on the first code unit even if i is advanced below.
251 qsizetype pos = i;
252 char32_t ucs4 = string[i];
253 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
254 ushort low = string[i + 1];
255 if (QChar::isLowSurrogate(low)) {
256 ucs4 = QChar::surrogateToUcs4(ucs4, low);
257 ++i;
258 }
259 }
260
261 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
262 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
264 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
265 // which caused "hi.there" to be treated as if it were just a single word;
266 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
267 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
268 if (ucs4 == 0x002E) // FULL STOP
270 else if (ucs4 == 0x003A) // COLON
272 }
273
274 uchar action = WB::breakTable[cls][ncls];
275 switch (action) {
276 case WB::Break:
277 if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
278 && prop->graphemeBreakClass
280 // WB3c: ZWJ × \p{Extended_Pictographic}
281 action = WB::NoBreak;
282 }
283 break;
284 case WB::NoBreak:
286 // WB4: X(Extend|Format)* -> X
    // Only real_cls is updated; cls keeps the class of X.
287 real_cls = ncls;
288 continue;
289 }
290 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
291 // WB15/WB16: break between pairs of Regional Indicators
293 }
294 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
295 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
296 // WB3d should not be affected by WB4
297 action = WB::Break;
298 }
299 break;
300 case WB::Lookup:
301 case WB::LookupW:
    // Scan ahead for the code point that decides whether the break is
    // suppressed.
302 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
303 ucs4 = string[lookahead];
304 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
305 ushort low = string[lookahead + 1];
306 if (QChar::isLowSurrogate(low)) {
307 ucs4 = QChar::surrogateToUcs4(ucs4, low);
308 ++lookahead;
309 }
310 }
311
312 prop = QUnicodeTables::properties(ucs4);
313 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
314
316 // WB4: X(Extend|Format)* -> X
317 continue;
318 }
319
    // On a match, consume the text up to the matched code point and
    // suppress the break.
320 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
321 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
322 i = lookahead;
323 ncls = tcls;
324 action = WB::NoBreak;
325 }
326 break;
327 }
328 if (action != WB::NoBreak) {
329 action = WB::Break;
331 action = WB::NoBreak; // WB7a
332 }
333 break;
334 }
335
336 cls = ncls;
337 real_cls = ncls;
338
339 if (action == WB::Break) {
340 attributes[pos].wordBreak = true;
    // A break also ends whatever tracked word was open.
341 if (currentWordType != WordTypeNone)
342 attributes[pos].wordEnd = true;
343 switch (cls) {
345 currentWordType = WordTypeHiraganaKatakana;
346 attributes[pos].wordStart = true;
347 break;
351 currentWordType = WordTypeAlphaNumeric;
352 attributes[pos].wordStart = true;
353 break;
354 default:
355 currentWordType = WordTypeNone;
356 break;
357 }
358 }
359 }
360
    // Close a word that is still open at the end of the text.
361 if (currentWordType != WordTypeNone)
362 attributes[len].wordEnd = true;
363 attributes[len].wordBreak = true; // WB2
364}
365
366
367namespace SB {
368
385
// Rows are selected by the current state, columns by the sentence break class
// of the current code point; each cell is the value assigned to the state
// next (getSentenceBreaks reads this table as breakTable[state][ncls]).
387// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
391
392 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
393 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
394 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
395 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
396
397 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm
398 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
399 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
400 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
401 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
402};
403
404} // namespace SB
405
// Marks sentence boundaries in the UTF-16 text string[0..len).
// At each break, sentenceBoundary is set on the attribute at the index of the
// first code unit after the break. attributes[len] is always written (SB2),
// so the array needs at least len + 1 entries.
406static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
407{
408 uchar state = SB::BAfter; // to meet SB1
409 for (qsizetype i = 0; i != len; ++i) {
    // pos stays on the first code unit even if i is advanced below.
410 qsizetype pos = i;
411 char32_t ucs4 = string[i];
412 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
413 ushort low = string[i + 1];
414 if (QChar::isLowSurrogate(low)) {
415 ucs4 = QChar::surrogateToUcs4(ucs4, low);
416 ++i;
417 }
418 }
419
420 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
421 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
422
    // state indexes a table row, so it must be one of the row states here.
423 Q_ASSERT(state <= SB::BAfter);
424 state = SB::breakTable[state][ncls];
425 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
    // Assume a break; the lookahead below cancels it by resuming in the
    // Initial state when it finds a matching class.
426 state = SB::Break;
427 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
428 ucs4 = string[lookahead];
429 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
430 ushort low = string[lookahead + 1];
431 if (QChar::isLowSurrogate(low)) {
432 ucs4 = QChar::surrogateToUcs4(ucs4, low);
433 ++lookahead;
434 }
435 }
436
437 prop = QUnicodeTables::properties(ucs4);
438 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
439 switch (tcls) {
446 continue;
    // Match: consume the text up to the matched code point.
448 i = lookahead;
449 state = SB::Initial;
450 break;
451 default:
452 break;
453 }
454 break;
455 }
456 }
    // On a break, mark the boundary and restart the state machine from
    // Initial using the class of the current code point.
457 if (Q_UNLIKELY(state == SB::Break)) {
458 attributes[pos].sentenceBoundary = true;
459 state = SB::breakTable[SB::Initial][ncls];
460 }
461 }
462
463 attributes[len].sentenceBoundary = true; // SB2
464}
465
466
467// -----------------------------------------------------------------------------------------------------
468//
469// The line breaking algorithm.
470// See http://www.unicode.org/reports/tr14/tr14-39.html
471//
472// -----------------------------------------------------------------------------------------------------
473
474namespace LB {
475
476namespace NS { // Number Sequence
477
478// This namespace is used to implement LB25 which, as of Unicode 16, has this
479// definition:
480// NU ( SY | IS )* CL × PO
481// NU ( SY | IS )* CP × PO
482// NU ( SY | IS )* CL × PR
483// NU ( SY | IS )* CP × PR
484// NU ( SY | IS )* × PO
485// NU ( SY | IS )* × PR
486// PO × OP NU
487// PO × OP IS NU
488// PO × NU
489// PR × OP NU
490// PR × OP IS NU
491// PR × NU
492// HY × NU
493// IS × NU
494// NU ( SY | IS )* × NU
495
496enum Action {
501 NeedOPNU, // Like Start, but must be followed by sequence `(OP (IS)?)? NU`
502 // These are 'synthetic' actions and are not used in the table but are
503 // tracked otherwise in the code for LB25, to track the state of specific
504 // sequences:
505 CNeedNU, // Like Continue, but must be followed by NU
506 CNeedISNU, // Like Continue, but must be followed by IS? NU
507};
508
519
// Read by getLineBreaks as actionTable[nelast][necur]: the column is the
// number-sequence class of the current code point; the row is presumably the
// class of the previous one (NOTE(review): confirm where nelast is updated).
520static const uchar actionTable[CLCP + 1][CLCP + 1] = {
521// XX PRPO OP HY NU SY IS CLCP
522 { None , NeedOPNU, Start , None , Start , None , None , None }, // XX
523 { None , NeedOPNU, Continue, Break , Start , None , None , None }, // PRPO
524 { None , Start , Start , Break , Continue, None , Continue, None }, // OP
525 { None , None , None , Start , Continue, None , None , None }, // HY
529 { Break , Continue, Break , Break , Break , Break , Break , Break }, // CLCP
530};
531
533{
    // Reduce a line break class to the smaller set of classes used by the
    // LB25 table; anything not handled by a case maps to XX.
534 switch (lbc) {
536 return PRPO;
538 return OP;
540 return HY;
542 return NU;
544 return SY;
546 return IS;
548 return CLCP;
549 default:
550 break;
551 }
552 return XX;
553}
554
555} // namespace NS
556
557namespace BRS { // Brahmic Sequence, used to implement LB28a
    // U+25CC; LB28a treats it like AK, independent of its line break class.
558 constexpr char32_t DottedCircle = U'\u25CC';
559
560 // The LB28a_{n} value maps to the 'regex' on the nth line in LB28a
561 // The only special case is LB28a_2VI which is a direct match to the 2nd
562 // line, but it also leads to LB28a_3VIAK, the 3rd line.
563 enum State {
565 Start, // => Have: `(AK | [◌] | AS)`
566 LB28a_2VF, // => Have: `(AK | [◌] | AS) VF`
567 LB28a_2VI, // => Have: `(AK | [◌] | AS) VI` May find: `(AK | [◌])`
568 LB28a_3VIAK, // => Have: `(AK | [◌] | AS) VI (AK | [◌])`
569 LB28a_4, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS)` May find: `VF`
570 LB28a_4VF, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS) VF`
572 };
582 {
    // Computes the next LB28a state from the current 'state' and the line
    // break class / code point carried by 'lb'.
583 using LBC = QUnicodeTables::LineBreakClass;
    // CM leaves the state unchanged.
584 if (lb.lbc == LBC::LineBreak_CM)
585 return state;
586
587 switch (state) {
588 case Start:
589 if (lb.lbc == LBC::LineBreak_VF)
590 return LB28a_2VF;
591 if (lb.lbc == LBC::LineBreak_VI)
592 return LB28a_2VI;
593 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
594 || lb.lbc == LBC::LineBreak_AS)
595 return LB28a_4;
596 break;
597 case LB28a_2VI:
598 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK)
599 return LB28a_3VIAK;
600 break;
601 case LB28a_4:
602 if (lb.lbc == LBC::LineBreak_VF)
603 return LB28a_4VF;
604 // Had (AK | [◌] | AS) (AK | [◌] | AS), which could mean the 2nd capture is the start
605 // of a new sequence, so we need to check if it makes sense.
606 return Restart;
607 case None:
608 if (Q_UNLIKELY(lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
609 || lb.lbc == LBC::LineBreak_AS)) {
610 return Start;
611 }
612 break;
613 case LB28a_2VF:
614 case LB28a_4VF:
615 case LB28a_3VIAK:
616 case Restart:
617 // These are all terminal states, so no need to update
618 Q_UNREACHABLE();
619 }
    // Every path that breaks out of the switch ends the sequence.
620 return None;
621 }
622} // namespace BRS
623
634
635// See https://www.unicode.org/reports/tr14/tr14-37.html for information
636// about the pair table. The table was removed from later versions of the standard.
638/* 1↓ 2→ OP CL CP QU +Pi +Pf +19 GL NS EX SY IS PR PO NU AL HL ID IN HY +WS BA +WS HYBA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM AK AP AS VI VF*/
639/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
640/* CL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
641/* CP */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
642/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
643/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
644/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
645/* +19*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
646/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
647/* NS */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
648/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
649/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
650/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DN, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
651/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, DB, DB, DB },
652/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
653/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
654/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
655/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, CI, CI, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
656/* ID */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
657/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
658/* HY */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
659/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
660/* BA */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
661/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
662/*HYBA*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, DB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
663/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB },
664/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
665/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
666/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
667/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
668/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
669/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
670/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
671/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
672/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
673/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB },
674/* CB */ { DB, PB, PB, IB, IB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
675/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB },
676/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
677/* AK */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
678/* AP */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, IB, DB, DB },
679/* AS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
680/* VI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
681/* VF */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
682};
683
684// The following line break classes are not treated by the pair table
685// and must be resolved outside:
686// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX, ZWJ
687
688} // namespace LB
689
690static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
691{
692 qsizetype nestart = 0;
693 LB::NS::Class nelast = LB::NS::XX;
694 LB::NS::Action neactlast = LB::NS::None;
695
696 LB::BRS::ParseState brsState;
697
699 QUnicodeTables::LineBreakClass cls = lcls;
700 const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(U'\n');
701
702 constexpr static auto isEastAsian = [](QUnicodeTables::EastAsianWidth eaw) {
703 using EAW = QUnicodeTables::EastAsianWidth;
704 return eaw == EAW::W || eaw == EAW::F || eaw == EAW::H;
705 };
706
707 for (qsizetype i = 0; i != len; ++i) {
708 qsizetype pos = i;
709 char32_t ucs4 = string[i];
710 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
711 ushort low = string[i + 1];
712 if (QChar::isLowSurrogate(low)) {
713 ucs4 = QChar::surrogateToUcs4(ucs4, low);
714 ++i;
715 }
716 }
717
718 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
719 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
721
723 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
725 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
726 ) {
727 // LB27: use SPACE for line breaking
728 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
729 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
730 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
732 } else {
733 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
734 // LB1: resolve SA to AL, except for those with category Mn or Mc, which are resolved to CM
735 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
736 if (FLAG(prop->category) & test)
738 }
739 }
740 }
741
742 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
743 // LB1: resolve SA to AL, except for those with category Mn or Mc, which are resolved to CM
744 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
745 if (FLAG(prop->category) & test)
747 }
748
749 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
750 if (prop->category == QChar::Punctuation_InitialQuote) {
751 // LB15a: Do not break after an unresolved initial punctuation
752 // that lies at the start of the line, after a space, after
753 // opening punctuation, or after an unresolved quotation mark,
754 // even after spaces.
755 // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
756 // [\p{Pi}&QU] SP* ×
757 // Note: sot is treated as LF here due to initial loop setup.
758 constexpr QUnicodeTables::LineBreakClass lb15a[] = {
764 if (std::any_of(std::begin(lb15a), std::end(lb15a),
765 [lcls](auto x) { return x == lcls; })) {
767 }
768 } else if (prop->category == QChar::Punctuation_FinalQuote) {
769 // LB15b: Do not break before an unresolved final punctuation
770 // that lies at the end of the line, before a space, before
771 // a prohibited break, or before an unresolved quotation mark,
772 // even after spaces.
773 // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
774 // | SY | BK | CR | LF | NL | ZW | eot)
775 auto nncls = QUnicodeTables::LineBreak_LF;
776
777 if (i + 1 < len) {
778 char32_t c = string[i + 1];
779 if (QChar::isHighSurrogate(c) && i + 2 < len) {
780 ushort low = string[i + 2];
781 if (QChar::isLowSurrogate(low))
782 c = QChar::surrogateToUcs4(c, low);
783 }
784 nncls = QUnicodeTables::LineBreakClass(
785 QUnicodeTables::properties(c)->lineBreakClass);
786 }
787
788 constexpr QUnicodeTables::LineBreakClass lb15b[] = {
797 if (std::any_of(std::begin(lb15b), std::end(lb15b),
798 [nncls](auto x) { return x == nncls; })) {
800 }
801 }
802 }
803
804 if (Q_UNLIKELY((lcls >= QUnicodeTables::LineBreak_SP || lcls == QUnicodeTables::LineBreak_ZW
807 && (ncls == QUnicodeTables::LineBreak_HY || ucs4 == u'\u2010'))) {
808 // LB20a: Do not break after a word-initial hyphen.
809 // ( sot | BK | CR | LF | NL | SP | ZW | CB | GL ) ( HY | [\u2010] ) × AL
810
811 // Remap to the synthetic class WS_* (whitespace+*), which is just
812 // like the current respective linebreak class but with an IB action
813 // if the next class is AL.
814 if (ucs4 == u'\u2010')
816 else
818 }
819
820 if (Q_UNLIKELY(cls == QUnicodeTables::LineBreak_AP && ucs4 == LB::BRS::DottedCircle)) {
821 // LB28a: Do not break inside the orthographic syllables of Brahmic scripts
822 // AP × (AK | [◌] | AS)
823 // @note: AP × (AK | AS) is checked by the breakTable
824 goto next;
825 }
826 while (true) { // May need to recheck once.
827 // LB28a cont'd
828 LB::BRS::State oldState = brsState.state;
829 brsState.state = LB::BRS::updateState(brsState.state, {ncls, ucs4});
830 if (Q_LIKELY(brsState.state == oldState))
831 break;
832 switch (brsState.state) {
833 case LB::BRS::Start:
834 brsState.start = i;
835 break;
836 case LB::BRS::LB28a_2VI: // Wait for more characters, but also valid sequence
837 // We may get another character, but this is already a complete
838 // sequence that should not have any breaks:
839 for (qsizetype j = brsState.start + 1; j < i; ++j)
840 attributes[j].lineBreak = false;
841 // No need to mark this sequence again later, so move 'start'
842 // up to the current position:
843 brsState.start = i;
844 goto next;
845 case LB::BRS::Restart:
846 // The previous character was possibly the start of a new sequence
847 brsState.state = LB::BRS::Start;
848 brsState.start = pos - 1;
849 continue; // Doing the loop again!
853 for (qsizetype j = brsState.start + 1; j < i; ++j)
854 attributes[j].lineBreak = false;
855 if (brsState.state == LB::BRS::LB28a_3VIAK) {
856 // This might be the start of a new sequence
857 brsState.state = LB::BRS::Start;
858 brsState.start = i;
859 } else {
860 brsState.state = LB::BRS::None;
861 }
862 goto next;
863 case LB::BRS::LB28a_4: // Wait for more characters
864 Q_LIKELY_BRANCH
865 case LB::BRS::None: // Nothing to do
866 break;
867 }
868 break;
869 }
870
871 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_IS)) {
872 // LB15c Break before a decimal mark that follows a space, for instance, in
873 // ‘subtract .5’.
874 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_SP)) {
875 if (i + 1 < len) {
876 char32_t ch = string[i + 1];
877 if (QChar::isHighSurrogate(ch) && i + 2 < len) {
878 ushort low = string[i + 2];
879 if (QChar::isLowSurrogate(low))
880 ch = QChar::surrogateToUcs4(ch, low);
881 }
882 if (QUnicodeTables::properties(ch)->lineBreakClass
883 == QUnicodeTables::LineBreak_NU) {
884 attributes[pos].lineBreak = true;
885 goto next;
886 }
887 }
888 }
889 }
890
891 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_HL)) {
892 // LB21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
893 // HL (HY | [ BA - $EastAsian ]) × [^HL]
894 auto eaw = QUnicodeTables::EastAsianWidth(prop->eastAsianWidth);
895 const bool isNonEaBA = ncls == QUnicodeTables::LineBreak_BA && !isEastAsian(eaw);
896 if (isNonEaBA || ncls == QUnicodeTables::LineBreak_HY) {
897 // Remap to synthetic HYBA class which handles the next
898 // character. Generally (LB21) there are no breaks before
899 // HY or BA, so we can skip ahead to the next character.
901 goto next;
902 }
903 }
904
905 // LB25: do not break lines inside numbers
906 {
907 LB::NS::Class necur = LB::NS::toClass(ncls);
908 LB::NS::Action neact = LB::NS::Action(LB::NS::actionTable[nelast][necur]);
909 if (Q_UNLIKELY(neactlast == LB::NS::CNeedNU && necur != LB::NS::NU)) {
910 neact = LB::NS::None;
911 } else if (Q_UNLIKELY(neactlast == LB::NS::NeedOPNU)) {
912 if (necur == LB::NS::OP)
913 neact = LB::NS::CNeedISNU;
914 else if (necur == LB::NS::NU)
915 neact = LB::NS::Continue;
916 else // Anything else and we ignore the sequence
917 neact = LB::NS::None;
918 } else if (Q_UNLIKELY(neactlast == LB::NS::CNeedISNU)) {
919 if (necur == LB::NS::IS)
920 neact = LB::NS::CNeedNU;
921 else if (necur == LB::NS::NU)
922 neact = LB::NS::Continue;
923 else // Anything else and we ignore the sequence
924 neact = LB::NS::None;
925 }
926 switch (neact) {
927 case LB::NS::Break:
928 // do not change breaks before and after the expression
929 for (qsizetype j = nestart + 1; j < pos; ++j)
930 attributes[j].lineBreak = false;
931 Q_FALLTHROUGH();
932 Q_LIKELY_BRANCH
933 case LB::NS::None:
934 nelast = LB::NS::XX; // reset state
935 break;
936 case LB::NS::NeedOPNU:
937 case LB::NS::Start:
938 if (neactlast == LB::NS::Start || neactlast == LB::NS::Continue) {
939 // Apply the linebreaks for the previous stretch; we need to start a new one
940 for (qsizetype j = nestart + 1; j < pos; ++j)
941 attributes[j].lineBreak = false;
942 }
943 nestart = i;
944 Q_FALLTHROUGH();
945 case LB::NS::CNeedNU:
947 case LB::NS::Continue:
948 nelast = necur;
949 break;
950 }
951 neactlast = neact;
952 }
953
954 // LB19a Unless surrounded by East Asian characters, do not break either side of any
955 // unresolved quotation marks
956 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU
958 && lcls != QUnicodeTables::LineBreak_ZW)) {
959 using EAW = QUnicodeTables::EastAsianWidth;
960 constexpr static auto nextCharNonEastAsian = [](const char16_t *string, qsizetype len) {
961 if (len > 0) {
962 char32_t nch = string[0];
963 if (QChar::isHighSurrogate(nch) && len > 1) {
964 char16_t low = string[1];
965 if (QChar::isLowSurrogate(low))
966 nch = QChar::surrogateToUcs4(char16_t(nch), low);
967 }
968 const auto *nextProp = QUnicodeTables::properties(nch);
970 nextProp->lineBreakClass);
971 QUnicodeTables::EastAsianWidth neaw = EAW(nextProp->eastAsianWidth);
972 return nncls != QUnicodeTables::LineBreak_CM
973 && nncls <= QUnicodeTables::LineBreak_SP
974 && !isEastAsian(neaw);
975 }
976 return true; // end-of-text counts as non-East-Asian
977 };
978 if (Q_UNLIKELY(!isEastAsian(EAW(lastProp->eastAsianWidth))
979 || nextCharNonEastAsian(string + i + 1, len - i - 1))) {
980 // Remap to the synthetic QU_19 class which has indirect breaks
981 // for most following classes.
983 }
984 }
985
986 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
987 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
988 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
989 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
990 goto next;
991 }
992
993 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
995 goto next; // LB6: x(BK|CR|LF|NL)
996 goto next_no_cls_update; // LB7: xSP
997 }
998
999 // LB19 - do not break before non-initial unresolved quotation marks, or after non-final
1000 // unresolved quotation marks
1001 if (Q_UNLIKELY(((ncls == QUnicodeTables::LineBreak_QU
1002 || ncls == QUnicodeTables::LineBreak_QU_19)
1003 && prop->category != QChar::Punctuation_InitialQuote)
1004 || (cls == QUnicodeTables::LineBreak_QU
1005 && lastProp->category != QChar::Punctuation_FinalQuote))) {
1006 // Make sure the previous character is not one that we have to break after.
1007 // Also skip if ncls is CM so it can be treated as lcls (LB9)
1009 && ncls != QUnicodeTables::LineBreak_CM) {
1010 goto next;
1011 }
1012 }
1013
1014 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
1015 // LB9: treat CM that doesn't follow SP, BK, CR, LF, NL, or ZW as X
1017 // don't update anything
1018 goto next_no_cls_update;
1019 }
1020
1021 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
1022 // LB8a: ZWJ x
1023 goto next;
1024 }
1025
1026 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
1027 // LB30a
1029 goto next;
1030 }
1031
1032 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
1033 && lastProp->category == QChar::Other_NotAssigned
1034 && lastProp->graphemeBreakClass
1035 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
1036 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
1037 goto next;
1038 }
1039
1040 // for South East Asian chars that require a complex analysis, the Unicode
1041 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
1042 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
1044
1045 tcls = cls;
1046
1047 constexpr static auto remapToAL = [](QUnicodeTables::LineBreakClass &c, auto &property) {
1048 if (Q_UNLIKELY(c == QUnicodeTables::LineBreak_CM
1051 property = QUnicodeTables::properties(U'\u0041');
1052 }
1053 };
1054 // LB10 Treat any remaining combining mark or ZWJ as AL,
1055 // as if it had the properties of U+0041 A LATIN CAPITAL LETTER
1056 remapToAL(tcls, lastProp);
1057 remapToAL(ncls, prop);
1058
1060 case LB::DirectBreak:
1061 attributes[pos].lineBreak = true;
1062 break;
1063 case LB::IndirectBreak:
1064 if (lcls == QUnicodeTables::LineBreak_SP)
1065 attributes[pos].lineBreak = true;
1066 break;
1068 if (lcls != QUnicodeTables::LineBreak_SP)
1069 goto next_no_cls_update;
1070 attributes[pos].lineBreak = true;
1071 break;
1073 if (lcls != QUnicodeTables::LineBreak_SP)
1074 goto next_no_cls_update;
1075 break;
1076 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
1077 if (lcls != QUnicodeTables::LineBreak_HL)
1078 attributes[pos].lineBreak = true;
1079 break;
1081 using EAW = QUnicodeTables::EastAsianWidth;
1082 switch (EAW(prop->eastAsianWidth)) {
1083 default:
1084 if (lcls != QUnicodeTables::LineBreak_SP)
1085 break;
1086 Q_FALLTHROUGH();
1087 case QUnicodeTables::EastAsianWidth::F:
1088 case QUnicodeTables::EastAsianWidth::W:
1089 case QUnicodeTables::EastAsianWidth::H:
1090 attributes[pos].lineBreak = true;
1091 break;
1092 }
1093 break;
1094 case LB::DirectBreakOutsideNumericSequence:
1095 if (neactlast == LB::NS::None || neactlast > LB::NS::Break)
1096 attributes[pos].lineBreak = true;
1097 break;
1099 // nothing to do
1100 default:
1101 break;
1102 }
1103
1104 next:
1106 cls = ncls;
1107 lastProp = prop;
1108 }
1109 next_no_cls_update:
1110 lcls = ncls;
1111 }
1112
1113 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
1114 // LB25: do not break lines inside numbers
1115 for (qsizetype j = nestart + 1; j < len; ++j)
1116 attributes[j].lineBreak = false;
1117 }
1118
1119 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
1120 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
1121}
1122
1123
1124static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1125{
1126 for (qsizetype i = 0; i != len; ++i) {
1127 uint ucs4 = string[i];
1128 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
1129 ushort low = string[i + 1];
1130 if (QChar::isLowSurrogate(low)) {
1131 ucs4 = QChar::surrogateToUcs4(ucs4, low);
1132 ++i;
1133 }
1134 }
1135
1136 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
1137 attributes[i].whiteSpace = true;
1138 }
1139}
1140
1141namespace Tailored {
1142
// Common signature of the per-script attribute functions defined below
// (indicAttributes, thaiAttributes, tibetanAttributes, myanmarAttributes):
// each receives the script, the full text, the [from, from + len) range to
// process, and the attribute array to update.
1143using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
1144
1145
1160
1161static const unsigned char indicForms[0xe00-0x900] = {
1162 // Devanagari
1167
1172
1177
1182
1187
1192
1197
1202
1203 // Bengali
1208
1213
1218
1223
1228
1233
1238
1243
1244 // Gurmukhi
1249
1254
1259
1264
1269
1274
1279
1284
1285 // Gujarati
1290
1295
1300
1305
1310
1315
1320
1325
1326 // Oriya
1331
1336
1341
1346
1351
1356
1361
1366
1367 //Tamil
1372
1377
1382
1387
1392
1397
1402
1407
1408 // Telugu
1413
1418
1423
1428
1433
1438
1443
1448
1449 // Kannada
1454
1459
1464
1469
1474
1479
1484
1489
1490 // Malayalam
1495
1500
1505
1510
1515
1520
1525
1530
1531 // Sinhala
1536
1541
1546
1551
1556
1561
1566
1571};
1572
1573static inline Form form(unsigned short uc) {
1574 if (uc < 0x900 || uc > 0xdff) {
1575 if (uc == 0x25cc)
1576 return Consonant;
1577 if (uc == 0x200c || uc == 0x200d)
1578 return Control;
1579 return Other;
1580 }
1581 return (Form)indicForms[uc-0x900];
1582}
1583
// Debug tracing for the Indic syllable scanner.
// With INDIC_DEBUG defined, IDEBUG(...) is a plain qDebug(...) call.
// Otherwise the call is placed in the else branch of `if constexpr (1)`,
// which is never taken, so the statement stays well-formed but does nothing.
1584// #define INDIC_DEBUG
1585#ifdef INDIC_DEBUG
1586#define IDEBUG qDebug
1587#else
1588#define IDEBUG if constexpr (1) ; else qDebug
1589#endif
1590
1591/* syllables are of the form:
1592
1593 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1594 (Consonant Nukta? Halant)* Consonant Halant
1595 IndependentVowel VowelMark? StressMark?
1596
1597 We return syllable boundaries on invalid combinations as well
1598*/
// Finds the end of the Indic syllable that begins at s[start].
//
// script  - script of the run; enables the Bengali, Kannada, Sinhala,
//           Malayalam and Tamil special cases below.
// s       - the text; only indices in [start, end) are examined. The first
//           unit s[start] is read without a bounds check.
// invalid - out parameter; set to true only when the first unit is neither a
//           Consonant, an IndependentVowel nor Other, otherwise false.
//
// Returns the index into s just past the syllable (always at least start + 1).
1599static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1600{
1601 *invalid = false;
1602 IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1603 const char16_t *uc = s+start;
1604
// pos is relative to start; state is the form of the last accepted unit.
1605 qsizetype pos = 0;
1606 Form state = form(uc[pos]);
1607 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1608 pos++;
1609
// A syllable must start with a consonant or an independent vowel; anything
// else is a one-unit syllable of its own.
1610 if (state != Consonant && state != IndependentVowel) {
1611 if (state != Other)
1612 *invalid = true;
1613 goto finish;
1614 }
1615
// In the switch below, `break` accepts the unit (state is updated after the
// switch), `goto finish` ends the syllable before the unit, and `continue`
// accepts the unit while keeping the current state.
1616 while (pos < end - start) {
1617 Form newState = form(uc[pos]);
1618 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1619 switch (newState) {
1620 case Control:
// Control characters never change the tracked state.
1621 newState = state;
1622 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1623 break;
1624 // the control character should be the last char in the item
1625 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1626 break;
1627 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1628 break;
1629 // Bengali and Kannada have a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
// Any other control character is included and then terminates the syllable.
1630 ++pos;
1631 goto finish;
1632 case Consonant:
// A further consonant is only accepted after a halant; for Sinhala the
// preceding unit must additionally be ZWJ.
1633 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1634 break;
1635 goto finish;
1636 case Halant:
1637 if (state == Nukta || state == Consonant)
1638 break;
1639 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1640 if (script == QChar::Script_Bengali && pos == 1 &&
1641 (uc[0] == 0x0985 || uc[0] == 0x098f))
1642 break;
1643 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1644 if (script == QChar::Script_Sinhala && state == Matra) {
1645 ++pos;
1646 continue;
1647 }
// Malayalam: a halant directly after U+0D41 is accepted the same way,
// leaving the state on Matra.
1648 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1649 ++pos;
1650 continue;
1651 }
1652 goto finish;
1653 case Nukta:
1654 if (state == Consonant)
1655 break;
1656 goto finish;
// StressMark, VowelMark and Matra form a fall-through chain: each case first
// tests its own accepting predecessors and otherwise falls through to the
// checks of the next case.
1657 case StressMark:
1658 if (state == VowelMark)
1659 break;
1660 Q_FALLTHROUGH();
1661 case VowelMark:
1662 if (state == Matra || state == LengthMark || state == IndependentVowel)
1663 break;
1664 Q_FALLTHROUGH();
1665 case Matra:
1666 if (state == Consonant || state == Nukta)
1667 break;
1668 if (state == Matra) {
1669 // ### needs proper testing for correct two/three part matras
1670 break;
1671 }
1672 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1673 // it work for all Indic languages?
1674 // the combination Independent_A + Vowel Sign AA is allowed.
1675 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1676 break;
// NOTE(review): state == Matra has already taken the break above, so this
// Tamil branch looks unreachable - confirm before relying on it.
1677 if (script == QChar::Script_Tamil && state == Matra) {
1678 if (uc[pos-1] == 0x0bc6 &&
1679 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1680 break;
1681 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1682 break;
1683 }
1684 goto finish;
1685
1686 case LengthMark:
1687 if (state == Matra) {
1688 // ### needs proper testing for correct two/three part matras
1689 break;
1690 }
1691 Q_FALLTHROUGH();
// These forms always start a new syllable.
1692 case IndependentVowel:
1693 case Invalid:
1694 case Other:
1695 goto finish;
1696 }
1697 state = newState;
1698 pos++;
1699 }
1700 finish:
// Convert the relative position back into an index into s.
1701 return pos+start;
1702}
1703
1704static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1705{
1706 qsizetype end = from + len;
1707 attributes += from;
1708 qsizetype i = 0;
1709 while (i < len) {
1710 bool invalid;
1711 qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1712 attributes[i].graphemeBoundary = true;
1713
1714 if (boundary > len-1) boundary = len;
1715 i++;
1716 while (i < boundary) {
1717 attributes[i].graphemeBoundary = false;
1718 ++i;
1719 }
1720 assert(i == boundary);
1721 }
1722
1723
1724}
1725
1726#if QT_CONFIG(library)
1727
1728#define LIBTHAI_MAJOR 0
1729
1730/*
1731 * if libthai changed please update these codes too.
1732 */
1733struct thcell_t {
1734 unsigned char base; /**< base character */
1735 unsigned char hilo; /**< upper/lower vowel/diacritic */
1736 unsigned char top; /**< top-level mark */
1737};
1738
1739using ThBrk = struct _ThBrk;
1740
1741namespace {
1742
// Wrapper around the dynamically loaded "thai" library (major version
// LIBTHAI_MAJOR). The constructor resolves the entry points by name; use
// isInitialized() to check that everything needed for word breaking is
// available before calling brk_find_breaks() or next_cell().
// NOTE(review): several lines of this class are not visible in this listing
// (the embedded line numbers have gaps); the comments here describe only what
// is shown.
1743class LibThai final
1744{
1746
// Function-pointer types of the libthai entry points resolved below.
1747 using th_brk_new_def = ThBrk *(*)(const char *);
1748 using th_brk_delete_def = void (*)(ThBrk *);
1749 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1750 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1751
1752public:
// Resolves "th_brk_find_breaks", "th_next_cell" and "th_brk_new". Only when
// th_brk_new is available is the break state created (th_brk_new(nullptr))
// and "th_brk_delete" resolved.
1753 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1754 {
1756 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks"));
1757 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell"));
1758
1759 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new"));
1760 if (th_brk_new) {
1761 m_state = th_brk_new(nullptr);
1763 reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete"));
1764 }
1765 }
1766
1767 ~LibThai()
1768 {
1769 if (m_state && m_th_brk_delete)
1771 m_library.unload();
1772 }
1773
// True when both resolved functions and the break state are non-null.
1774 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1775
1776 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1777 {
1781 }
1782
1783 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1784 {
1787 }
1788
1789private:
1791
1792 // Global state for th_brk_find_breaks().
1793 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1794 // state is read-only, and so it is safe to use it from multiple threads after
1795 // initialization. This is also stated in the libthai documentation.
1796 ThBrk *m_state = nullptr;
1797
1801};
1802
1803} // unnamed namespace
1804
1806
1807static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
1808{
1809 qsizetype i;
1810 unsigned char *result = reinterpret_cast<unsigned char *>(cstr);
1811
1812 for (i = 0; i < len; ++i) {
1813 if (string[i] <= 0xa0)
1814 result[i] = static_cast<unsigned char>(string[i]);
1815 else if (string[i] >= 0xe01 && string[i] <= 0xe5b)
1816 result[i] = static_cast<unsigned char>(string[i] - 0xe00 + 0xa0);
1817 else
1818 result[i] = static_cast<unsigned char>(~0); // Same encoding as libthai uses for invalid chars
1819 }
1820
1821 result[len] = 0;
1822}
1823
1824/*
1825 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1826 */
// Fills in word, line and grapheme attributes for a Thai run of `len` UTF-16
// code units using libthai. Returns without touching `attributes` when the
// library wrapper is missing or not initialized.
// NOTE(review): some declarations and statements of this function are not
// visible in this listing (the embedded line numbers have gaps).
1827static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1828{
1829 constexpr qsizetype Prealloc = 128;
// Buffer for the TIS-620 copy of the text, including the terminating NUL.
1830 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1833 struct thcell_t tis_cell;
1834
1836 if (!libThai || !libThai->isInitialized())
1837 return;
1838
1839 to_tis620(string, len, s.data());
1840
// Reset the word and line-break attributes of the whole run.
1841 for (i = 0; i < len; ++i) {
1842 attributes[i].wordBreak = false;
1843 attributes[i].wordStart = false;
1844 attributes[i].wordEnd = false;
1845 attributes[i].lineBreak = false;
1846 }
1847
// The run always starts a word.
1848 attributes[0].wordBreak = true;
1849 attributes[0].wordStart = true;
1850 attributes[0].wordEnd = false;
1851 numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()),
1853 static_cast<size_t>(break_positions.size()));
1854 for (i = 0; i < numbreaks; ++i) {
1859 }
1860 if (numbreaks > 0)
1862
1863 /* manage grapheme boundaries */
1864 i = 0;
1865 while (i < len) {
1867 libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i,
1868 size_t(len - i), &tis_cell, true);
1869
// Every unit of a cell after its first is not a grapheme boundary.
1871 for (size_t j = 1; j < cell_length; ++j)
1872 attributes[i + j].graphemeBoundary = false;
1873
1874 i += cell_length;
1875 }
1876}
1877
1878#endif // QT_CONFIG(library)
1879
1880static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1881{
1882 assert(script == QChar::Script_Thai);
1883#if QT_CONFIG(library)
1884 const char16_t *uc = text + from;
1885 attributes += from;
1886 Q_UNUSED(script);
1887 thaiAssignAttributes(uc, len, attributes);
1888#else
1889 Q_UNUSED(script);
1890 Q_UNUSED(text);
1891 Q_UNUSED(from);
1892 Q_UNUSED(len);
1893 Q_UNUSED(attributes);
1894#endif
1895}
1896
1897/*
1898 tibetan syllables are of the form:
1899 head position consonant
1900 first sub-joined consonant
1901 ....intermediate sub-joined consonants (if any)
1902 last sub-joined consonant
1903 sub-joined vowel (a-chung U+0F71)
1904 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1905*/
1906
1914
1915/* this table starts at U+0f40 */
1916static const unsigned char tibetanForm[0x80] = {
1921
1926
1931
1936
1941
1946
1951
1956};
1957
// Classifies a UTF-16 code unit for the Tibetan syllable scanner: units in
// [U+0F40, U+0FC0) are looked up in tibetanForm, anything else is
// TibetanOther. The trailing backslash keeps the expression part of the
// macro; without it the macro would expand to nothing.
#define tibetan_form(c) \
    ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1960
// Finds the end of the Tibetan syllable that begins at s[start]; only indices
// in [start, end) are examined after the first unit, which is read without a
// bounds check. Returns the index into s just past the syllable (at least
// start + 1).
// NOTE(review): the finish label unconditionally stores false into *invalid,
// so the `*invalid = true` below is never observed by the caller - confirm
// whether that is intended.
// NOTE(review): some case labels of the switch are not visible in this
// listing (the embedded line numbers have gaps).
1961static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1962{
1963 const char16_t *uc = s + start;
1964
// pos is relative to start.
1965 qsizetype pos = 0;
1966 TibetanForm state = tibetan_form(*uc);
1967
1968/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1969 pos++;
1970
// Anything that is not a head consonant is a one-unit syllable.
1971 if (state != TibetanHeadConsonant) {
1972 if (state != TibetanOther)
1973 *invalid = true;
1974 goto finish;
1975 }
1976
1977 while (pos < end - start) {
1978 TibetanForm newState = tibetan_form(uc[pos]);
1979 switch (newState) {
1982 if (state != TibetanHeadConsonant &&
1984 goto finish;
1985 state = newState;
1986 break;
1987 case TibetanVowel:
// A vowel is accepted after a head consonant, a subjoined consonant or a
// subjoined vowel; note that state is not updated in this case.
1988 if (state != TibetanHeadConsonant &&
1989 state != TibetanSubjoinedConsonant &&
1990 state != TibetanSubjoinedVowel)
1991 goto finish;
1992 break;
1993 case TibetanOther:
1995 goto finish;
1996 }
1997 pos++;
1998 }
1999
2000finish:
2001 *invalid = false;
2002 return start+pos;
2003}
2004
2005static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2006{
2007 qsizetype end = from + len;
2008 qsizetype i = 0;
2009 Q_UNUSED(script);
2010 attributes += from;
2011 while (i < len) {
2012 bool invalid;
2013 qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
2014
2015 attributes[i].graphemeBoundary = true;
2016
2017 if (boundary > len-1) boundary = len;
2018 i++;
2019 while (i < boundary) {
2020 attributes[i].graphemeBoundary = false;
2021 ++i;
2022 }
2023 assert(i == boundary);
2024 }
2025}
2026
2029 Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
2030 Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
2031 Mymr_CC_NGA = 3, /* Consonant NGA */
2032 Mymr_CC_YA = 4, /* Consonant YA */
2033 Mymr_CC_RA = 5, /* Consonant RA */
2034 Mymr_CC_WA = 6, /* Consonant WA */
2035 Mymr_CC_HA = 7, /* Consonant HA */
2036 Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
2037 Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
2038 Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
2039 Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */
2040 Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, below-base (Vowel u, uu) */
2041 Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, above-base (Vowel i, ii, ai) */
2042 Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, post-base (Vowel aa) */
2046 Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
2047 Mymr_CC_COUNT = 19 /* This is the number of character classes */
2048};
2049
2052
2053 Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2054 Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
2055 Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
2056 Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
2057 Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
2058 first in a syllable */
2059 Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */
2060
2061 /* position flags */
2063 Mymr_CF_POS_BELOW = 0x00040000,
2064 Mymr_CF_POS_ABOVE = 0x00020000,
2065 Mymr_CF_POS_AFTER = 0x00010000,
2066 Mymr_CF_POS_MASK = 0x000f0000,
2067
2069};
2070
2071Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
2072
2073/* Characters that get referred to by name */
2075{
2079 Mymr_C_RA = 0x101B,
2080 Mymr_C_YA = 0x101A,
2081 Mymr_C_NGA = 0x1004,
2084};
2085
2086enum
2087{
2105};
2106
2107
2108typedef int MymrCharClass;
2109
2110
2126
// Returns the Myanmar character class of a code unit. ZWJ and ZWNJ are
// handled first; other units outside U+1000..U+105F are Mymr_CC_RESERVED,
// and units inside that range are looked up in mymrCharClasses.
// NOTE(review): the signature line and the two early return statements are
// not visible in this listing (the embedded line numbers have gaps).
2127static MymrCharClass
2129{
2130 if (ch == Mymr_C_SIGN_ZWJ)
2132
2133 if (ch == Mymr_C_SIGN_ZWNJ)
2135
2136 if (ch < 0x1000 || ch > 0x105f)
2137 return Mymr_CC_RESERVED;
2138
2139 return mymrCharClasses[ch - 0x1000];
2140}
2141
2142static const signed char mymrStateTable[][Mymr_CC_COUNT] =
2143{
2144/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
2145 { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
2146 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
2147 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
2148 {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
2149 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
2150 {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
2151 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
2152 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
2153 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
2154 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
2155 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
2156 {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
2157 {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
2158 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
2159 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
2160 {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
2161 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
2162 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
2163 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
2164 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
2165 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
2166 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
2167 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
2168 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
2169 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
2170 {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
2171 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
2172 {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
2173/* exit state -2 is for invalid order of medials and combination of invalids
2174 with virama where virama should treat as start of next syllable
2175 */
2176};
2177
2178/*#define MYANMAR_DEBUG */
// Debug tracing for the Myanmar syllable scanner.
// With MYANMAR_DEBUG defined, MMDEBUG(...) is a plain qDebug(...) call.
// Otherwise it expands to `if (0) printf(...)`: the call is still compiled
// but never executed. The trailing backslashes are required so that all
// three lines belong to the macro definition.
#ifdef MYANMAR_DEBUG
#define MMDEBUG qDebug
#else
#  define MMDEBUG \
     if (0) \
         printf
#endif
2186
2187/*
2188// Given an input string of characters and a location in which to start looking
2189// calculate, using the state table, which one is the last character of the syllable
2190// that starts in the starting position.
2191*/
2192static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2193{
2194 const char16_t *uc = s + start;
2195 int state = 0;
2196 qsizetype pos = start;
2197 *invalid = false;
2198
2199 while (pos < end) {
2200 MymrCharClass charClass = getMyanmarCharClass(*uc);
2201 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
2202 if (pos == start)
2203 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
2204
2205 MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
2206
2207 if (state < 0) {
2208 if (state < -1)
2209 --pos;
2210 break;
2211 }
2212 ++uc;
2213 ++pos;
2214 }
2215 return pos;
2216}
2217
2218static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2219{
2220 qsizetype end = from + len;
2221 qsizetype i = 0;
2222 Q_UNUSED(script);
2223 attributes += from;
2224 while (i < len) {
2225 bool invalid;
2226 qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
2227
2228 attributes[i].graphemeBoundary = true;
2229 attributes[i].lineBreak = true;
2230
2231 if (boundary > len-1)
2232 boundary = len;
2233 i++;
2234 while (i < boundary) {
2235 attributes[i].graphemeBoundary = false;
2236 ++i;
2237 }
2238 assert(i == boundary);
2239 }
2240}
2241
2242/*
2243// Vocabulary
2244// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
2245// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
2246// split vowels, signs... but there is only one base in a syllable, it has to be coded as
2247// the first character of the syllable.
2248// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
2249// Khmer language has five of them. Khmer split vowels either have one part before the
2250// base and one after the base or they have a part before the base and a part above the base.
2251// The first part of all Khmer split vowels is the same character, identical to
2252// the glyph of Khmer dependent vowel SRA EI
2253// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
2254// Differently than indian languages, the coeng modifies the consonant that follows it,
2255// not the one preceding it Each consonant has two forms, the base form and the subscript form
2256// the base form is the normal one (using the consonants code-point), the subscript form is
2257// displayed when the combination coeng + consonant is encountered.
2258// Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
2259// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
2260// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
2261// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
2262// if it is attached to a consonant of the first series or a consonant of the second series
2263// Most consonants have an equivalent in the other series, but some of them exist only in
2264// one series (for example SA). If we want to use the consonant SA with a vowel sound that
2265// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
2266// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
2267// x17C9 and x17CA. TRIISAP changes a first series consonant to second series sound and
2268// MUSIKATOAN a second series consonant to have a first series vowel sound.
2269// Consonant shifters are both normally superscript marks, but, when they are followed by a
2270// superscript, they change shape and take the form of subscript dependent vowel SRA U.
2271// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
2272// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
2273// be placed after the coeng consonant.
2274// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2275// Each vowel has its own position. Only one vowel per syllable is allowed.
2276// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2277// Allowed in a syllable.
2278//
2279//
2280// order is important here! This order must be the same that is found in each horizontal
2281// line in the statetable for Khmer (see khmerStateTable) .
2282*/
2285 CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
2286 CC_CONSONANT2 = 2, /* Consonant of type 2 */
2287 CC_CONSONANT3 = 3, /* Consonant of type 3 */
2288 CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
2290 CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
2291 CC_COENG = 7, /* Subscript consonant combining character */
2295 CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
2296 CC_COUNT = 12 /* This is the number of character classes */
2297};
2298
2299
/*
// Flags that are ORed on top of a character class value to form a KhmerCharClass.
// The low 16 bits (CF_CLASS_MASK) hold the class value itself; the bits above
// carry per-character properties and the position of the glyph relative to its base.
*/
enum KhmerCharClassFlags {
    CF_CLASS_MASK = 0x0000FFFF, /* extracts the class value from a KhmerCharClass */

    CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
    CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
    CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
    CF_COENG = 0x08000000, /* flag to speed up comparing */
    CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
    CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */

    /* position flags */
    CF_POS_BEFORE = 0x00080000,
    CF_POS_BELOW = 0x00040000,
    CF_POS_ABOVE = 0x00020000,
    CF_POS_AFTER = 0x00010000,
    CF_POS_MASK = 0x000f0000 /* union of the four position flags */
};
2317
2318Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
2319
2320/* Characters that get referred to by name */
2322 C_SIGN_ZWNJ = 0x200C,
2323 C_SIGN_ZWJ = 0x200D,
2324 C_RO = 0x179A,
2325 C_VOWEL_AA = 0x17B6,
2327 C_VOWEL_E = 0x17C1,
2328 C_COENG = 0x17D2
2329};
2330
2331
2332/*
2333// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2334// they are also used to know where a character should be placed (location in reference to the base character)
2335// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2336// indicate error in syllable construction
2337*/
2338enum {
2352
2353 /* split vowel */
2356};
2357
2358
2359/*
2360// Character class: a character class value
2361// ORed with character class flags.
2362*/
2363typedef unsigned long KhmerCharClass;
2364
2365
2366/*
2367// Character class tables
2368// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
2369// _sa Sign placed above the base
2370// _sp Sign placed after the base
2371// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2372// _c2 Consonant of type 2 (only RO)
2373// _c3 Consonant of type 3
2374// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2375// _cs Consonant-shifter
2376// _dl Dependent vowel placed before the base (left of the base)
2377// _db Dependent vowel placed below the base
2378// _da Dependent vowel placed above the base
2379// _dr Dependent vowel placed behind the base (right of the base)
2380// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2381// it to create a subscript consonant or independent vowel
2382// _va Khmer split vowel in which the first part is before the base and the second one above the base
2383// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2384*/
2386 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2387 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2388 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2389 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2390 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2391 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2392};
2393
2394/* this enum must reflect the range of khmerCharClasses */
2399
2400/*
2401// Below we define how a character in the input string is either in the khmerCharClasses table
2402// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2403// within the syllable, but are not in the table) we also get their type back, or an unknown object
2404// in which case we get _xx (CC_RESERVED) back
2405*/
2407{
2408 if (uc == C_SIGN_ZWJ) {
2409 return CC_ZERO_WIDTH_J_MARK;
2410 }
2411
2412 if (uc == C_SIGN_ZWNJ) {
2413 return CC_ZERO_WIDTH_NJ_MARK;
2414 }
2415
2416 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2417 return CC_RESERVED;
2418 }
2419
2420 return khmerCharClasses[uc - KhmerFirstChar];
2421}
2422
2423
2424/*
2425// The stateTable is used to calculate the end (the length) of a well
2426// formed Khmer Syllable.
2427//
2428// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2429// CharClassValues. This coincidence of values allows the follow up of the table.
2430//
2431// Each line corresponds to a state, which does not necessarily need to be a type
2432// of component... for example, state 2 is a base, which is always a first character
2433// in the syllable, but the state could be produced by a consonant of any type when
2434// it is the first character that is analysed (in ground state).
2435//
2436// Differentiating 3 types of consonants is necessary in order to
2437// forbid the use of certain combinations, such as having a second
2438// coeng after a coeng RO,
2439// The inexistent possibility of having a type 3 after another type 3 is permitted,
2440// eliminating it would very much complicate the table, and it does not create typing
2441// problems, as the case above.
2442//
2443// The table is quite complex, in order to limit the number of coeng consonants
2444// to 2 (by means of the table).
2445//
2446// There is a peculiarity, as far as Unicode is concerned:
2447// - The consonant-shifter is considered in two possible different
2448// locations, the one considered in Unicode 3.0 and the one considered in
2449// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2450//
2451//
2452// xx independent character, such as a number, punctuation sign or non-khmer char
2453//
2454// c1 Khmer consonant of type 1 or an independent vowel
2455// that is, a letter in which the subscript form is only under the
2456// base, not taking any space to the right or to the left
2457//
2458// c2 Khmer consonant of type 2, the coeng form takes space under
2459// and to the left of the base (only RO is of this type)
2460//
2461// c3 Khmer consonant of type 3. Its subscript form takes space under
2462// and to the right of the base.
2463//
2464// cs Khmer consonant shifter
2465//
2466// rb Khmer robat
2467//
2468// co coeng character (u17D2)
2469//
2470// dv dependent vowel (including split vowels, they are treated in the same way).
2471// even if dv is not defined above, the component that is really tested for is
2472// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2473//
2474// zwj Zero Width joiner
2475//
2476// zwnj Zero width non joiner
2477//
2478// sa above sign
2479//
2480// sp post sign
2481//
2482// there are lines with equal content but for an easier understanding
2483// (and maybe change in the future) we did not join them
2484*/
/*
// Transition table of the Khmer syllable state machine.
// khmerStateTable[state][class] is the state entered when a character of the
// given class (the low CF_CLASS_MASK bits of its KhmerCharClass) is read in
// `state`. A negative entry means the character does not continue the current
// syllable; khmer_nextSyllableBoundary() stops scanning as soon as it sees one.
*/
2485static const signed char khmerStateTable[][CC_COUNT] =
2486{
2487 /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2488 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2489 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2490 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2491 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter. It can only be followed by a shifter or a vowel */
2492 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2493 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2494 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2495 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2496 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2497 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
2498 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2499 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2500 {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2501 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2502 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2503 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2504 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2505 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2506 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2507 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2508 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2509};
2510
2511
/* #define KHMER_DEBUG */
/*
// KHDEBUG(fmt, ...) traces the Khmer syllable state machine.
// With KHMER_DEBUG defined it forwards to qDebug. Otherwise it expands to
// `if (0) printf`, so the call still compiles (and its arguments are still
// checked) but is never executed and its arguments are never evaluated.
// Because the disabled form is a bare `if`, use KHDEBUG only as a complete
// statement that is not directly followed by an `else`.
*/
#ifdef KHMER_DEBUG
#  define KHDEBUG qDebug
#else
#  define KHDEBUG if (0) printf
#endif
2520
2521/*
2522// Given an input string of characters and a location in which to start looking
2523// calculate, using the state table, which one is the last character of the syllable
2524// that starts in the starting position.
2525*/
2526static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2527{
2528 const char16_t *uc = s + start;
2529 int state = 0;
2530 qsizetype pos = start;
2531 *invalid = false;
2532
2533 while (pos < end) {
2534 KhmerCharClass charClass = getKhmerCharClass(*uc);
2535 if (pos == start) {
2536 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2537 }
2538 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2539
2540 KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2541 charClass, *uc );
2542
2543 if (state < 0) {
2544 break;
2545 }
2546 ++uc;
2547 ++pos;
2548 }
2549 return pos;
2550}
2551
2552static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2553{
2554 qsizetype end = from + len;
2555 qsizetype i = 0;
2556 Q_UNUSED(script);
2557 attributes += from;
2558 while ( i < len ) {
2559 bool invalid;
2560 qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2561
2562 attributes[i].graphemeBoundary = true;
2563
2564 if ( boundary > len-1 ) boundary = len;
2565 i++;
2566 while ( i < boundary ) {
2567 attributes[i].graphemeBoundary = false;
2568 ++i;
2569 }
2570 assert( i == boundary );
2571 }
2572}
2573
2574
2576{
2577 switch (script) {
2578 case QChar::Script_Unknown:
2579 case QChar::Script_Inherited:
2580 case QChar::Script_Common:
2581 case QChar::Script_Latin:
2582 case QChar::Script_Greek:
2583 case QChar::Script_Cyrillic:
2584 case QChar::Script_Armenian:
2585 case QChar::Script_Hebrew:
2586 case QChar::Script_Arabic:
2587 case QChar::Script_Syriac:
2588 case QChar::Script_Thaana:
2589 return nullptr;
2590 case QChar::Script_Devanagari:
2591 case QChar::Script_Bengali:
2592 case QChar::Script_Gurmukhi:
2593 case QChar::Script_Gujarati:
2594 case QChar::Script_Oriya:
2595 case QChar::Script_Tamil:
2596 case QChar::Script_Telugu:
2597 case QChar::Script_Kannada:
2598 case QChar::Script_Malayalam:
2599 case QChar::Script_Sinhala:
2600 return &indicAttributes;
2601 case QChar::Script_Thai:
2602 return &thaiAttributes;
2603 case QChar::Script_Lao:
2604 return nullptr;
2605 case QChar::Script_Tibetan:
2606 return &tibetanAttributes;
2607 case QChar::Script_Myanmar:
2608 return &myanmarAttributes;
2609 case QChar::Script_Georgian:
2610 case QChar::Script_Hangul:
2611 case QChar::Script_Ethiopic:
2612 case QChar::Script_Cherokee:
2613 case QChar::Script_CanadianAboriginal:
2614 case QChar::Script_Ogham:
2615 case QChar::Script_Runic:
2616 return nullptr;
2617 case QChar::Script_Khmer:
2618 return &khmerAttributes;
2619 case QChar::Script_Mongolian:
2620 case QChar::Script_Hiragana:
2621 case QChar::Script_Katakana:
2622 case QChar::Script_Bopomofo:
2623 case QChar::Script_Han:
2624 case QChar::Script_Yi:
2625 case QChar::Script_OldItalic:
2626 case QChar::Script_Gothic:
2627 case QChar::Script_Deseret:
2628 case QChar::Script_Tagalog:
2629 case QChar::Script_Hanunoo:
2630 case QChar::Script_Buhid:
2631 case QChar::Script_Tagbanwa:
2632 case QChar::Script_Coptic:
2633 case QChar::Script_Limbu:
2634 case QChar::Script_TaiLe:
2635 case QChar::Script_LinearB:
2636 case QChar::Script_Ugaritic:
2637 case QChar::Script_Shavian:
2638 case QChar::Script_Osmanya:
2639 case QChar::Script_Cypriot:
2640 case QChar::Script_Braille:
2641 case QChar::Script_Buginese:
2642 case QChar::Script_NewTaiLue:
2643 case QChar::Script_Glagolitic:
2644 case QChar::Script_Tifinagh:
2645 case QChar::Script_SylotiNagri:
2646 case QChar::Script_OldPersian:
2647 case QChar::Script_Kharoshthi:
2648 case QChar::Script_Balinese:
2649 case QChar::Script_Cuneiform:
2650 case QChar::Script_Phoenician:
2651 case QChar::Script_PhagsPa:
2652 case QChar::Script_Nko:
2653 case QChar::Script_Sundanese:
2654 case QChar::Script_Lepcha:
2655 case QChar::Script_OlChiki:
2656 case QChar::Script_Vai:
2657 case QChar::Script_Saurashtra:
2658 case QChar::Script_KayahLi:
2659 case QChar::Script_Rejang:
2660 case QChar::Script_Lycian:
2661 case QChar::Script_Carian:
2662 case QChar::Script_Lydian:
2663 case QChar::Script_Cham:
2664 case QChar::Script_TaiTham:
2665 case QChar::Script_TaiViet:
2666 case QChar::Script_Avestan:
2667 case QChar::Script_EgyptianHieroglyphs:
2668 case QChar::Script_Samaritan:
2669 case QChar::Script_Lisu:
2670 case QChar::Script_Bamum:
2671 case QChar::Script_Javanese:
2672 case QChar::Script_MeeteiMayek:
2673 case QChar::Script_ImperialAramaic:
2674 case QChar::Script_OldSouthArabian:
2675 case QChar::Script_InscriptionalParthian:
2676 case QChar::Script_InscriptionalPahlavi:
2677 case QChar::Script_OldTurkic:
2678 case QChar::Script_Kaithi:
2679 case QChar::Script_Batak:
2680 case QChar::Script_Brahmi:
2681 case QChar::Script_Mandaic:
2682 case QChar::Script_Chakma:
2683 case QChar::Script_MeroiticCursive:
2684 case QChar::Script_MeroiticHieroglyphs:
2685 case QChar::Script_Miao:
2686 case QChar::Script_Sharada:
2687 case QChar::Script_SoraSompeng:
2688 case QChar::Script_Takri:
2689 case QChar::Script_CaucasianAlbanian:
2690 case QChar::Script_BassaVah:
2691 case QChar::Script_Duployan:
2692 case QChar::Script_Elbasan:
2693 case QChar::Script_Grantha:
2694 case QChar::Script_PahawhHmong:
2695 case QChar::Script_Khojki:
2696 case QChar::Script_LinearA:
2697 case QChar::Script_Mahajani:
2698 case QChar::Script_Manichaean:
2699 case QChar::Script_MendeKikakui:
2700 case QChar::Script_Modi:
2701 case QChar::Script_Mro:
2702 case QChar::Script_OldNorthArabian:
2703 case QChar::Script_Nabataean:
2704 case QChar::Script_Palmyrene:
2705 case QChar::Script_PauCinHau:
2706 case QChar::Script_OldPermic:
2707 case QChar::Script_PsalterPahlavi:
2708 case QChar::Script_Siddham:
2709 case QChar::Script_Khudawadi:
2710 case QChar::Script_Tirhuta:
2711 case QChar::Script_WarangCiti:
2712 case QChar::Script_Ahom:
2713 case QChar::Script_AnatolianHieroglyphs:
2714 case QChar::Script_Hatran:
2715 case QChar::Script_Multani:
2716 case QChar::Script_OldHungarian:
2717 case QChar::Script_SignWriting:
2718 case QChar::Script_Adlam:
2719 case QChar::Script_Bhaiksuki:
2720 case QChar::Script_Marchen:
2721 case QChar::Script_Newa:
2722 case QChar::Script_Osage:
2723 case QChar::Script_Tangut:
2724 case QChar::Script_MasaramGondi:
2725 case QChar::Script_Nushu:
2726 case QChar::Script_Soyombo:
2727 case QChar::Script_ZanabazarSquare:
2728 case QChar::Script_Dogra:
2729 case QChar::Script_GunjalaGondi:
2730 case QChar::Script_HanifiRohingya:
2731 case QChar::Script_Makasar:
2732 case QChar::Script_Medefaidrin:
2733 case QChar::Script_OldSogdian:
2734 case QChar::Script_Sogdian:
2735 case QChar::Script_Elymaic:
2736 case QChar::Script_Nandinagari:
2737 case QChar::Script_NyiakengPuachueHmong:
2738 case QChar::Script_Wancho:
2739 case QChar::Script_Chorasmian:
2740 case QChar::Script_DivesAkuru:
2741 case QChar::Script_KhitanSmallScript:
2742 case QChar::Script_Yezidi:
2743 case QChar::Script_CyproMinoan:
2744 case QChar::Script_OldUyghur:
2745 case QChar::Script_Tangsa:
2746 case QChar::Script_Toto:
2747 case QChar::Script_Vithkuqi:
2748 case QChar::Script_Kawi:
2749 case QChar::Script_NagMundari:
2750 case QChar::Script_Garay:
2751 case QChar::Script_GurungKhema:
2752 case QChar::Script_KiratRai:
2753 case QChar::Script_OlOnal:
2754 case QChar::Script_Sunuwar:
2755 case QChar::Script_Todhri:
2756 case QChar::Script_TuluTigalari:
2757 return nullptr;
2758 case QChar::ScriptCount:
2759 // Don't Q_UNREACHABLE here; this might be a newer value in later Qt versions
2760 // (incl. patch releases)
2761 ;
2762 }
2763 return nullptr;
2764};
2765
2766static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2767 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2768 QCharAttributes *attributes)
2769{
2770 if (stringLength == 0)
2771 return;
2772 for (qsizetype i = 0; i < numItems; ++i) {
2773 QChar::Script script = items[i].script;
2774 CharAttributeFunction attributeFunction = charAttributeFunction(script);
2775 if (!attributeFunction)
2776 continue;
2777 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2778 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2779 }
2780}
2781
2782}
2783
2784Q_CORE_EXPORT void initCharAttributes(QStringView string,
2785 const ScriptItem *items, qsizetype numItems,
2786 QCharAttributes *attributes, CharAttributeOptions options)
2787{
2788 if (string.size() <= 0)
2789 return;
2790
2791 if (!(options & DontClearAttributes))
2792 ::memset(attributes, 0, (string.size() + 1) * sizeof(QCharAttributes));
2793
2794 if (options & GraphemeBreaks)
2795 getGraphemeBreaks(string.utf16(), string.size(), attributes);
2796 if (options & WordBreaks)
2797 getWordBreaks(string.utf16(), string.size(), attributes);
2798 if (options & SentenceBreaks)
2799 getSentenceBreaks(string.utf16(), string.size(), attributes);
2800 if (options & LineBreaks)
2801 getLineBreaks(string.utf16(), string.size(), attributes, options);
2802 if (options & WhiteSpaces)
2803 getWhiteSpaces(string.utf16(), string.size(), attributes);
2804
2806 if (!items || numItems <= 0)
2807 return;
2808
2809 Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2810 }
2811}
2812
2813
2814// ----------------------------------------------------------------------------
2815//
2816// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2817//
2818// ----------------------------------------------------------------------------
2819
2820Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2821{
2822 qsizetype sor = 0;
2823 qsizetype eor = 0;
2824 QChar::Script script = QChar::Script_Common;
2825
2826 for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
2827 char32_t ucs4 = string[i].unicode();
2828 if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2829 ushort low = string[i + 1].unicode();
2830 if (QChar::isLowSurrogate(low)) {
2831 ucs4 = QChar::surrogateToUcs4(ucs4, low);
2832 ++i;
2833 }
2834 }
2835
2836 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2837
2838 QChar::Script nscript = QChar::Script(prop->script);
2839
2840 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2841 continue;
2842
2843 // inherit preceding Common-s
2844 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2845 // also covers a case where the base character of Common script followed
2846 // by one or more combining marks of non-Inherited, non-Common script
2847 script = nscript;
2848 continue;
2849 }
2850
2851 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2852 // Thus, a combining mark - whatever its script property value is - should inherit
2853 // the script property value of its base character.
2854 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2855 if (Q_UNLIKELY(FLAG(prop->category) & test))
2856 continue;
2857
2858 Q_ASSERT(script > QChar::Script_Common);
2859 Q_ASSERT(sor < eor);
2860 scripts->append(ScriptItem{sor, script});
2861 sor = eor;
2862
2863 script = nscript;
2864 }
2865
2866 Q_ASSERT(script >= QChar::Script_Common);
2867 Q_ASSERT(eor == string.size());
2868 scripts->append(ScriptItem{sor, script});
2869}
2870
2871} // namespace QUnicodeTools
2872
2873QT_END_NAMESPACE
static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first, QUnicodeTables::GraphemeBreakClass second)
static const GBTableEntryType Extend_SpacingMark_ZWJ
static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses]
static const GBTableEntryType HardBreak
State updateState(State state, LinebreakUnit lb)
constexpr char32_t DottedCircle
Class toClass(QUnicodeTables::LineBreakClass lbc)
static const uchar actionTable[CLCP+1][CLCP+1]
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ]
static const uchar breakTable[BAfter+1][QUnicodeTables::NumSentenceBreakClasses]
static const KhmerCharClass khmerCharClasses[]
static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static CharAttributeFunction charAttributeFunction(QChar::Script script)
static MymrCharClass getMyanmarCharClass(ushort ch)
static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const signed char mymrStateTable[][Mymr_CC_COUNT]
static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static const MymrCharClass mymrCharClasses[]
static Form form(unsigned short uc)
static const unsigned char indicForms[0xe00-0x900]
static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
void(*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) CharAttributeFunction
static const signed char khmerStateTable[][CC_COUNT]
static void getCharAttributes(const char16_t *string, qsizetype stringLength, const QUnicodeTools::ScriptItem *items, qsizetype numItems, QCharAttributes *attributes)
static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const unsigned char tibetanForm[0x80]
static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static KhmerCharClass getKhmerCharClass(ushort uc)
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses]
static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initCharAttributes(QStringView string, const ScriptItem *items, qsizetype numItems, QCharAttributes *attributes, CharAttributeOptions options)
static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
#define FLAG(x)
Definition qchar.cpp:14
#define KHDEBUG
#define IDEBUG
constexpr int qt_initcharattributes_default_algorithm_only
#define tibetan_form(c)
#define MMDEBUG
QUnicodeTables::LineBreakClass lbc