Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qunicodetools.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
5
8#if QT_CONFIG(library)
9#include "qlibrary.h"
10#endif
11
12#include <limits.h>
13
14#define FLAG(x) (1 << (x))
15
17
18using namespace Qt::StringLiterals;
19
20#ifdef QT_BUILD_INTERNAL
21Q_CONSTINIT Q_AUTOTEST_EXPORT
22#else
23constexpr
24#endif
26
27namespace QUnicodeTools {
28
29// -----------------------------------------------------------------------------------------------------
30//
31// The text boundaries determination algorithm.
32// See https://www.unicode.org/reports/tr29/tr29-37.html
33//
34// -----------------------------------------------------------------------------------------------------
35
// Grapheme cluster break support (UAX #29): a per-class bit table that answers
// "is there a break between these two classes?", plus the extra state that
// getGraphemeBreaks() keeps for the rules the table alone cannot express.
36namespace GB {
37
38// This table is indexed by the grapheme break classes of two
39// (adjacent) code points.
40// The class of the first code point selects an entry.
41// If the entry's bit at position second_cp_class is set
42// (in other words: if entry & (1u << second_cp_class) is non-zero)
43// then there is NO grapheme break between the two code points.
44
46
47// Check that we have enough bits in the table (in case
48// NumGraphemeBreakClasses grows too much).
50 "Internal error: increase the size in bits of GBTableEntryType");
51
52// GB9, GB9a
57
// An entry with no bits set: a break is reported before every following class.
58static const GBTableEntryType HardBreak = 0u;
59
61 Extend_SpacingMark_ZWJ, // Any
63 HardBreak, // LF
64 HardBreak, // Control
65 Extend_SpacingMark_ZWJ, // Extend
66 Extend_SpacingMark_ZWJ, // ZWJ
67 Extend_SpacingMark_ZWJ, // RegionalIndicator
78 ), // Prepend
79 Extend_SpacingMark_ZWJ, // SpacingMark
85 ), // L
89 ), // V
92 ), // T
96 ), // LV
99 ), // LVT
100 Extend_SpacingMark_ZWJ // Extended_Pictographic
101};
102
105{
    // A set bit in the table means "no break" (see the table comment above),
    // so a break is reported exactly when the bit for `second` is clear in the
    // entry selected by `first`.
106 return (breakTable[first] & FLAG(second)) == 0;
107}
108
109// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
110// so we need to store some local state.
111enum class State : uchar {
113 GB11_ExtPicExt, // saw an Extend after an Extended_Pictographic
114 GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
115 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
116};
117
118} // namespace GB
119
// Marks grapheme cluster boundaries for the UTF-16 text string[0 .. len).
//
// For every code point the basic decision comes from
// GB::shouldBreakBetweenClasses(previous class, current class); a small state
// machine (GB::State) then overrides that decision for the multi-code-point
// rules (GB11, GB12, GB13) that the pair table cannot express.
//
// Output: attributes[pos].graphemeBoundary is set to true at each break, where
// pos is the index of the first code unit of the code point that starts the new
// cluster. attributes[len] is always set as well (GB2), so `attributes` must
// provide at least len + 1 entries. In the lines visible here the flag is only
// ever set, never cleared.
120static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
121{
123 GB::State state = GB::State::Normal;
124 for (qsizetype i = 0; i != len; ++i) {
    // `pos` keeps the index of the first code unit; `i` may advance below
    // when a surrogate pair is consumed.
125 qsizetype pos = i;
126 char32_t ucs4 = string[i];
    // Combine a high surrogate with a following low surrogate into a single
    // code point; an unpaired surrogate is processed as it is.
127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
128 ushort low = string[i + 1];
129 if (QChar::isLowSurrogate(low)) {
130 ucs4 = QChar::surrogateToUcs4(ucs4, low);
131 ++i;
132 }
133 }
134
135 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
136 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
137
    // Table-driven decision between the previous class (lcls) and this one;
    // the state handling below may still force it to "no break".
138 bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
    // Set when a pending multi-code-point rule fully dealt with this code
    // point; otherwise the Normal-state logic further down runs.
139 bool handled = false;
140
141 switch (state) {
143 break; // will deal with it below
144
146 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
148 // keep going in the current state
149 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150 handled = true;
151 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
153 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154 handled = true;
155 } else {
156 state = GB::State::Normal;
157 }
158 break;
159
161 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
163 shouldBreak = false;
164 handled = true;
165 }
166
167 state = GB::State::Normal;
168 break;
169
173 shouldBreak = false;
174 handled = true;
175 }
176
177 state = GB::State::Normal;
178 break;
179 }
180
    // Not consumed by a pending rule: by now the state machine must be back
    // in Normal, and this code point may start a new multi-code-point rule.
181 if (!handled) {
182 Q_ASSERT(state == GB::State::Normal);
186 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
189 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190 }
191 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
192 state = GB::State::GB12_13_RI;
193 }
194 }
195
196 if (shouldBreak)
197 attributes[pos].graphemeBoundary = true;
198
    // The current class becomes the left-hand side of the next pair lookup.
199 lcls = cls;
200 }
201
202 attributes[len].graphemeBoundary = true; // GB2
203}
204
205
206namespace WB {
207
214
216// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
217 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
218 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
219 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
220 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
221 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
222 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
223 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
224 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
225 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
228 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
229 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
230 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
231 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
232 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
235 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
236};
237
238} // namespace WB
239
// Computes word boundaries for the UTF-16 text string[0 .. len).
//
// The decision for each code point starts from WB::breakTable[cls][ncls]
// (class of the previous code point, class of the current one) and is then
// refined per action: Break / NoBreak get rule-specific overrides, while
// Lookup / LookupW peek at the following code point(s) before deciding.
//
// Output, written at `pos` (index of the first code unit of the code point):
//  - wordBreak: set at every break position;
//  - wordEnd:   set at a break if a word (currentWordType != WordTypeNone)
//               was in progress;
//  - wordStart: set at a break when the class found there starts a word.
// attributes[len] is written too (WB2), so `attributes` must provide at least
// len + 1 entries.
240static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
241{
    // Kind of word currently being scanned; WordTypeNone means "not inside a word".
242 enum WordType {
243 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244 } currentWordType = WordTypeNone;
245
247 auto real_cls = cls; // Unaffected by WB4
248
249 for (qsizetype i = 0; i != len; ++i) {
    // `pos` keeps the index of the first code unit; `i` may advance past a
    // low surrogate or past looked-ahead code points below.
250 qsizetype pos = i;
251 char32_t ucs4 = string[i];
252 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
253 ushort low = string[i + 1];
254 if (QChar::isLowSurrogate(low)) {
255 ucs4 = QChar::surrogateToUcs4(ucs4, low);
256 ++i;
257 }
258 }
259
260 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
261 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
263 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264 // which caused "hi.there" to be treated as if it were just a single word;
265 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
267 if (ucs4 == 0x002E) // FULL STOP
269 else if (ucs4 == 0x003A) // COLON
271 }
272
273 uchar action = WB::breakTable[cls][ncls];
274 switch (action) {
275 case WB::Break:
276 if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
277 && prop->graphemeBreakClass
279 // WB3c: ZWJ × \p{Extended_Pictographic}
280 action = WB::NoBreak;
281 }
282 break;
283 case WB::NoBreak:
285 // WB4: X(Extend|Format)* -> X
    // `continue` skips the `cls = ncls` update at the end of the loop body,
    // so the ignored code point does not replace the left-hand class; only
    // real_cls records what was actually seen.
286 real_cls = ncls;
287 continue;
288 }
289 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
290 // WB15/WB16: break between pairs of Regional indicator
292 }
293 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
294 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295 // WB3d should not be affected by WB4
296 action = WB::Break;
297 }
298 break;
299 case WB::Lookup:
300 case WB::LookupW:
    // Peek ahead: the loop examines at most one code point that is not
    // skipped by the WB4 `continue`, because every other path ends at the
    // `break` at the bottom of the loop body.
301 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
302 ucs4 = string[lookahead];
303 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
304 ushort low = string[lookahead + 1];
305 if (QChar::isLowSurrogate(low)) {
306 ucs4 = QChar::surrogateToUcs4(ucs4, low);
307 ++lookahead;
308 }
309 }
310
311 prop = QUnicodeTables::properties(ucs4);
312 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
313
315 // WB4: X(Extend|Format)* -> X
316 continue;
317 }
318
    // Match: jump `i` to the looked-ahead code point so that everything in
    // between is consumed without a break, and continue from its class.
319 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
321 i = lookahead;
322 ncls = tcls;
323 action = WB::NoBreak;
324 }
325 break;
326 }
    // The look-ahead did not produce NoBreak: fall back to a break, unless
    // the WB7a exception below applies.
327 if (action != WB::NoBreak) {
328 action = WB::Break;
330 action = WB::NoBreak; // WB7a
331 }
332 break;
333 }
334
    // The current class becomes the left-hand side for the next code point.
335 cls = ncls;
336 real_cls = ncls;
337
338 if (action == WB::Break) {
339 attributes[pos].wordBreak = true;
    // A break while inside a word closes that word.
340 if (currentWordType != WordTypeNone)
341 attributes[pos].wordEnd = true;
    // The class at the break decides whether a new word starts here and of
    // which type; any class without a dedicated case leaves us outside a word.
342 switch (cls) {
344 currentWordType = WordTypeHiraganaKatakana;
345 attributes[pos].wordStart = true;
346 break;
350 currentWordType = WordTypeAlphaNumeric;
351 attributes[pos].wordStart = true;
352 break;
353 default:
354 currentWordType = WordTypeNone;
355 break;
356 }
357 }
358 }
359
    // End of text: close a word that is still open and always break (WB2).
360 if (currentWordType != WordTypeNone)
361 attributes[len].wordEnd = true;
362 attributes[len].wordBreak = true; // WB2
363}
364
365
366namespace SB {
367
384
386// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
390
391 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
392 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
393 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
394 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
395
396 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
397 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
398 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
399 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
400 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
401};
402
403} // namespace SB
404
// Computes sentence boundaries for the UTF-16 text string[0 .. len).
//
// The function runs a state machine: for each code point the next state is
// SB::breakTable[state][ncls]. Two resulting states get special treatment:
//  - SB::Lookup (SB8): peek at the following code point(s) to decide between
//    continuing the sentence and breaking;
//  - SB::Break: mark a boundary at this code point and restart the machine
//    from SB::Initial with the same class.
//
// Output: attributes[pos].sentenceBoundary is set at each boundary, where pos
// is the index of the first code unit of the code point. attributes[len] is
// always set as well (SB2), so `attributes` must provide at least len + 1
// entries.
405static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
406{
407 uchar state = SB::BAfter; // to meet SB1
408 for (qsizetype i = 0; i != len; ++i) {
    // `pos` keeps the index of the first code unit; `i` may advance past a
    // low surrogate or past looked-ahead code points below.
409 qsizetype pos = i;
410 char32_t ucs4 = string[i];
411 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
412 ushort low = string[i + 1];
413 if (QChar::isLowSurrogate(low)) {
414 ucs4 = QChar::surrogateToUcs4(ucs4, low);
415 ++i;
416 }
417 }
418
419 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
420 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
421
    // `state` is used as a row index into the table on the next line, so it
    // must be a real table state at this point.
422 Q_ASSERT(state <= SB::BAfter);
423 state = SB::breakTable[state][ncls];
424 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
    // Assume a break unless the look-ahead finds a reason to stay in the
    // current sentence.
425 state = SB::Break;
426 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
427 ucs4 = string[lookahead];
428 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
429 ushort low = string[lookahead + 1];
430 if (QChar::isLowSurrogate(low)) {
431 ucs4 = QChar::surrogateToUcs4(ucs4, low);
432 ++lookahead;
433 }
434 }
435
436 prop = QUnicodeTables::properties(ucs4);
437 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
    // Classes that `continue` are skipped over; the first class that is not
    // skipped settles the decision, because the loop body ends with `break`.
438 switch (tcls) {
445 continue;
    // No break: consume everything up to the looked-ahead code point and
    // carry on in the Initial state.
447 i = lookahead;
448 state = SB::Initial;
449 break;
450 default:
451 break;
452 }
453 break;
454 }
455 }
456 if (Q_UNLIKELY(state == SB::Break)) {
457 attributes[pos].sentenceBoundary = true;
    // Start the next sentence: feed this code point's class to the machine
    // again, this time from the Initial state.
458 state = SB::breakTable[SB::Initial][ncls];
459 }
460 }
461
462 attributes[len].sentenceBoundary = true; // SB2
463}
464
465
466// -----------------------------------------------------------------------------------------------------
467//
468// The line breaking algorithm.
469// See http://www.unicode.org/reports/tr14/tr14-39.html
470//
471// -----------------------------------------------------------------------------------------------------
472
473namespace LB {
474
// Helpers for tracking numeric sequences while line breaking (rule LB25):
// a reduced class set, the actions of the tracking state machine and the
// table that maps a pair of classes to an action.
475namespace NS { // Number Sequence
476
477// This namespace is used to implement LB25 which, as of Unicode 16, has this
478// definition:
479// NU ( SY | IS )* CL × PO
480// NU ( SY | IS )* CP × PO
481// NU ( SY | IS )* CL × PR
482// NU ( SY | IS )* CP × PR
483// NU ( SY | IS )* × PO
484// NU ( SY | IS )* × PR
485// PO × OP NU
486// PO × OP IS NU
487// PO × NU
488// PR × OP NU
489// PR × OP IS NU
490// PR × NU
491// HY × NU
492// IS × NU
493// NU ( SY | IS )* × NU
494
495enum Action {
500 NeedOPNU, // Like Start, but must be followed by sequence `(OP (IS)?)? NU`
501 // These are 'synthetic' actions and are not used in the table but are
502 // tracked otherwise in the code for LB25, to track the state of specific
503 // sequences:
504 CNeedNU, // Like Continue, but must be followed by NU
505 CNeedISNU, // Like Continue, but must be followed by IS? NU
506};
507
518
// Rows are selected by the class of the previous code point and columns by the
// class of the current one: the line breaking code reads
// actionTable[nelast][necur]. CLCP is used as the highest class value, hence
// the CLCP + 1 dimensions.
519static const uchar actionTable[CLCP + 1][CLCP + 1] = {
520// XX PRPO OP HY NU SY IS CLCP
521 { None , NeedOPNU, Start , None , Start , None , None , None }, // XX
522 { None , NeedOPNU, Continue, Break , Start , None , None , None }, // PRPO
523 { None , Start , Start , Break , Continue, None , Continue, None }, // OP
524 { None , None , None , Start , Continue, None , None , None }, // HY
528 { Break , Continue, Break , Break , Break , Break , Break , Break }, // CLCP
529};
530
532{
    // Map a line break class onto the reduced class set used by actionTable;
    // every class without a dedicated case collapses to XX.
533 switch (lbc) {
535 return PRPO;
537 return OP;
539 return HY;
541 return NU;
543 return SY;
545 return IS;
547 return CLCP;
548 default:
549 break;
550 }
551 return XX;
552}
553
554} // namespace NS
555
// State machine that recognizes the sequences of rule LB28a one code point at
// a time. The line breaking code feeds it (class, code point) pairs and reacts
// to the state it returns.
556namespace BRS { // Brahmic Sequence, used to implement LB28a
    // U+25CC DOTTED CIRCLE, written as [◌] in the rule comments below.
557 constexpr char32_t DottedCircle = U'\u25CC';
558
559 // The LB28a_{n} value maps to the 'regex' on the nth line in LB28a
560 // The only special case is LB28a_2VI which is a direct match to the 2nd
561 // line, but it also leads to LB28a_3VIAK, the 3rd line.
562 enum State {
564 Start, // => Have: `(AK | [◌] | AS)`
565 LB28a_2VF, // => Have: `(AK | [◌] | AS) VF`
566 LB28a_2VI, // => Have: `(AK | [◌] | AS) VI` May find: `(AK | [◌])`
567 LB28a_3VIAK, // => Have: `(AK | [◌] | AS) VI (AK | [◌])`
568 LB28a_4, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS)` May find: `VF`
569 LB28a_4VF, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS) VF`
571 };
581 {
582 using LBC = QUnicodeTables::LineBreakClass;
    // A combining mark (CM) never advances the machine: the current state is
    // returned unchanged.
583 if (lb.lbc == LBC::LineBreak_CM)
584 return state;
585
586 switch (state) {
587 case Start:
588 if (lb.lbc == LBC::LineBreak_VF)
589 return LB28a_2VF;
590 if (lb.lbc == LBC::LineBreak_VI)
591 return LB28a_2VI;
592 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
593 || lb.lbc == LBC::LineBreak_AS)
594 return LB28a_4;
595 break;
596 case LB28a_2VI:
    // Note that AS is not accepted here, unlike in the Start and None cases.
597 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK)
598 return LB28a_3VIAK;
599 break;
600 case LB28a_4:
601 if (lb.lbc == LBC::LineBreak_VF)
602 return LB28a_4VF;
603 // Had (AK | [◌] | AS) (AK | [◌] | AS), which could mean the 2nd capture is the start
604 // of a new sequence, so we need to check if it makes sense.
605 return Restart;
606 case None:
607 if (Q_UNLIKELY(lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
608 || lb.lbc == LBC::LineBreak_AS)) {
609 return Start;
610 }
611 break;
612 case LB28a_2VF:
613 case LB28a_4VF:
614 case LB28a_3VIAK:
615 case Restart:
616 // These are all terminal states, so no need to update
    // The visible caller replaces each of these states (with Start or None)
    // before the next code point is fed in, so they are not expected here.
617 Q_UNREACHABLE();
618 }
    // No transition matched: the sequence under construction is abandoned.
619 return None;
620 }
621}
622
633
634// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
635// about the table. It was removed in the later versions of the standard.
637/* 1↓ 2→ OP CL CP QU +Pi +Pf +19 GL NS EX SY IS PR PO NU AL HL ID IN HY +WS BA +WS HYBA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM AK AP AS VI VF*/
638/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
639/* CL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
640/* CP */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
641/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
642/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
643/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
644/* +19*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
645/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
646/* NS */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
647/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
648/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
649/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DN, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
650/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, DB, DB, DB },
651/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
652/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
653/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
654/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, CI, CI, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
655/* ID */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
656/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
657/* HY */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
658/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
659/* BA */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
660/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
661/*HYBA*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, DB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
662/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB },
663/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
664/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
665/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
666/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
667/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
668/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
669/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
670/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
671/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
672/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB },
673/* CB */ { DB, PB, PB, IB, IB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
674/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB },
675/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
676/* AK */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
677/* AP */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, IB, DB, DB },
678/* AS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
679/* VI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
680/* VF */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
681};
682
683// The following line break classes are not treated by the pair table
684// and must be resolved outside:
685// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX, ZWJ
686
687} // namespace LB
688
689static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
690{
691 qsizetype nestart = 0;
692 LB::NS::Class nelast = LB::NS::XX;
693 LB::NS::Action neactlast = LB::NS::None;
694
695 LB::BRS::ParseState brsState;
696
698 QUnicodeTables::LineBreakClass cls = lcls;
699 const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(U'\n');
700
701 constexpr static auto isEastAsian = [](QUnicodeTables::EastAsianWidth eaw) {
702 using EAW = QUnicodeTables::EastAsianWidth;
703 return eaw == EAW::W || eaw == EAW::F || eaw == EAW::H;
704 };
705
706 for (qsizetype i = 0; i != len; ++i) {
707 qsizetype pos = i;
708 char32_t ucs4 = string[i];
709 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
710 ushort low = string[i + 1];
711 if (QChar::isLowSurrogate(low)) {
712 ucs4 = QChar::surrogateToUcs4(ucs4, low);
713 ++i;
714 }
715 }
716
717 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
718 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
720
722 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
724 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
725 ) {
726 // LB27: use SPACE for line breaking
727 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
728 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
729 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
731 } else {
732 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
733 // LB1: resolve SA to AL, except for those with category Mn or Mc, which are resolved to CM
734 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
735 if (FLAG(prop->category) & test)
737 }
738 }
739 }
740
741 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
742 // LB1: resolve SA to AL, except for those with category Mn or Mc, which are resolved to CM
743 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
744 if (FLAG(prop->category) & test)
746 }
747
748 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
749 if (prop->category == QChar::Punctuation_InitialQuote) {
750 // LB15a: Do not break after an unresolved initial punctuation
751 // that lies at the start of the line, after a space, after
752 // opening punctuation, or after an unresolved quotation mark,
753 // even after spaces.
754 // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
755 // [\p{Pi}&QU] SP* ×
756 // Note: sot is treated as LF here due to initial loop setup.
757 constexpr QUnicodeTables::LineBreakClass lb15a[] = {
763 if (std::any_of(std::begin(lb15a), std::end(lb15a),
764 [lcls](auto x) { return x == lcls; })) {
766 }
767 } else if (prop->category == QChar::Punctuation_FinalQuote) {
768 // LB15b: Do not break before an unresolved final punctuation
769 // that lies at the end of the line, before a space, before
770 // a prohibited break, or before an unresolved quotation mark,
771 // even after spaces.
772 // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
773 // | SY | BK | CR | LF | NL | ZW | eot)
774 auto nncls = QUnicodeTables::LineBreak_LF;
775
776 if (i + 1 < len) {
777 char32_t c = string[i + 1];
778 if (QChar::isHighSurrogate(c) && i + 2 < len) {
779 ushort low = string[i + 2];
780 if (QChar::isLowSurrogate(low))
781 c = QChar::surrogateToUcs4(c, low);
782 }
783 nncls = QUnicodeTables::LineBreakClass(
784 QUnicodeTables::properties(c)->lineBreakClass);
785 }
786
787 constexpr QUnicodeTables::LineBreakClass lb15b[] = {
796 if (std::any_of(std::begin(lb15b), std::end(lb15b),
797 [nncls](auto x) { return x == nncls; })) {
799 }
800 }
801 }
802
803 if (Q_UNLIKELY((lcls >= QUnicodeTables::LineBreak_SP || lcls == QUnicodeTables::LineBreak_ZW
806 && (ncls == QUnicodeTables::LineBreak_HY || ucs4 == u'\u2010'))) {
807 // LB20a: Do not break after a word-initial hyphen.
808 // ( sot | BK | CR | LF | NL | SP | ZW | CB | GL ) ( HY | [\u2010] ) × AL
809
810 // Remap to the synthetic class WS_* (whitespace+*), which is just
811 // like the current respective linebreak class but with an IB action
812 // if the next class is AL.
813 if (ucs4 == u'\u2010')
815 else
817 }
818
819 if (Q_UNLIKELY(cls == QUnicodeTables::LineBreak_AP && ucs4 == LB::BRS::DottedCircle)) {
820 // LB28a: Do not break inside the orthographic syllables of Brahmic scripts
821 // AP × (AK | [◌] | AS)
822 // @note: AP × (AK | AS) is checked by the breakTable
823 goto next;
824 }
825 while (true) { // May need to recheck once.
826 // LB28a cont'd
827 LB::BRS::State oldState = brsState.state;
828 brsState.state = LB::BRS::updateState(brsState.state, {ncls, ucs4});
829 if (Q_LIKELY(brsState.state == oldState))
830 break;
831 switch (brsState.state) {
832 case LB::BRS::Start:
833 brsState.start = i;
834 break;
835 case LB::BRS::LB28a_2VI: // Wait for more characters, but also valid sequence
836 // We may get another character, but this is already a complete
837 // sequence that should not have any breaks:
838 for (qsizetype j = brsState.start + 1; j < i; ++j)
839 attributes[j].lineBreak = false;
840 // No need to mark this sequence again later, so move 'start'
841 // up to the current position:
842 brsState.start = i;
843 goto next;
844 case LB::BRS::Restart:
845 // The previous character was possibly the start of a new sequence
846 brsState.state = LB::BRS::Start;
847 brsState.start = pos - 1;
848 continue; // Doing the loop again!
852 for (qsizetype j = brsState.start + 1; j < i; ++j)
853 attributes[j].lineBreak = false;
854 if (brsState.state == LB::BRS::LB28a_3VIAK) {
855 // This might be the start of a new sequence
856 brsState.state = LB::BRS::Start;
857 brsState.start = i;
858 } else {
859 brsState.state = LB::BRS::None;
860 }
861 goto next;
862 case LB::BRS::LB28a_4: // Wait for more characters
863 Q_LIKELY_BRANCH
864 case LB::BRS::None: // Nothing to do
865 break;
866 }
867 break;
868 }
869
870 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_IS)) {
871 // LB15c Break before a decimal mark that follows a space, for instance, in
872 // ‘subtract .5’.
873 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_SP)) {
874 if (i + 1 < len) {
875 char32_t ch = string[i + 1];
876 if (QChar::isHighSurrogate(ch) && i + 2 < len) {
877 ushort low = string[i + 2];
878 if (QChar::isLowSurrogate(low))
879 ch = QChar::surrogateToUcs4(ch, low);
880 }
881 if (QUnicodeTables::properties(ch)->lineBreakClass
882 == QUnicodeTables::LineBreak_NU) {
883 attributes[pos].lineBreak = true;
884 goto next;
885 }
886 }
887 }
888 }
889
890 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_HL)) {
891 // LB21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
892 // HL (HY | [ BA - $EastAsian ]) × [^HL]
893 auto eaw = QUnicodeTables::EastAsianWidth(prop->eastAsianWidth);
894 const bool isNonEaBA = ncls == QUnicodeTables::LineBreak_BA && !isEastAsian(eaw);
895 if (isNonEaBA || ncls == QUnicodeTables::LineBreak_HY) {
896 // Remap to synthetic HYBA class which handles the next
897 // character. Generally (LB21) there are no breaks before
898 // HY or BA, so we can skip ahead to the next character.
900 goto next;
901 }
902 }
903
904 // LB25: do not break lines inside numbers
905 {
906 LB::NS::Class necur = LB::NS::toClass(ncls);
907 LB::NS::Action neact = LB::NS::Action(LB::NS::actionTable[nelast][necur]);
908 if (Q_UNLIKELY(neactlast == LB::NS::CNeedNU && necur != LB::NS::NU)) {
909 neact = LB::NS::None;
910 } else if (Q_UNLIKELY(neactlast == LB::NS::NeedOPNU)) {
911 if (necur == LB::NS::OP)
912 neact = LB::NS::CNeedISNU;
913 else if (necur == LB::NS::NU)
914 neact = LB::NS::Continue;
915 else // Anything else and we ignore the sequence
916 neact = LB::NS::None;
917 } else if (Q_UNLIKELY(neactlast == LB::NS::CNeedISNU)) {
918 if (necur == LB::NS::IS)
919 neact = LB::NS::CNeedNU;
920 else if (necur == LB::NS::NU)
921 neact = LB::NS::Continue;
922 else // Anything else and we ignore the sequence
923 neact = LB::NS::None;
924 }
925 switch (neact) {
926 case LB::NS::Break:
927 // do not change breaks before and after the expression
928 for (qsizetype j = nestart + 1; j < pos; ++j)
929 attributes[j].lineBreak = false;
930 Q_FALLTHROUGH();
931 Q_LIKELY_BRANCH
932 case LB::NS::None:
933 nelast = LB::NS::XX; // reset state
934 break;
935 case LB::NS::NeedOPNU:
936 case LB::NS::Start:
937 if (neactlast == LB::NS::Start || neactlast == LB::NS::Continue) {
938 // Apply the linebreaks for the previous stretch; we need to start a new one
939 for (qsizetype j = nestart + 1; j < pos; ++j)
940 attributes[j].lineBreak = false;
941 }
942 nestart = i;
943 Q_FALLTHROUGH();
944 case LB::NS::CNeedNU:
946 case LB::NS::Continue:
947 nelast = necur;
948 break;
949 }
950 neactlast = neact;
951 }
952
953 // LB19a Unless surrounded by East Asian characters, do not break either side of any
954 // unresolved quotation marks
955 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU
957 && lcls != QUnicodeTables::LineBreak_ZW)) {
958 using EAW = QUnicodeTables::EastAsianWidth;
959 constexpr static auto nextCharNonEastAsian = [](const char16_t *string, qsizetype len) {
960 if (len > 0) {
961 char32_t nch = string[0];
962 if (QChar::isHighSurrogate(nch) && len > 1) {
963 char16_t low = string[1];
964 if (QChar::isLowSurrogate(low))
965 nch = QChar::surrogateToUcs4(char16_t(nch), low);
966 }
967 const auto *nextProp = QUnicodeTables::properties(nch);
969 nextProp->lineBreakClass);
970 QUnicodeTables::EastAsianWidth neaw = EAW(nextProp->eastAsianWidth);
971 return nncls != QUnicodeTables::LineBreak_CM
972 && nncls <= QUnicodeTables::LineBreak_SP
973 && !isEastAsian(neaw);
974 }
975 return true; // end-of-text counts as non-East-Asian
976 };
977 if (Q_UNLIKELY(!isEastAsian(EAW(lastProp->eastAsianWidth))
978 || nextCharNonEastAsian(string + i + 1, len - i - 1))) {
979 // Remap to the synthetic QU_19 class which has indirect breaks
980 // for most following classes.
982 }
983 }
984
985 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
986 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
987 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
988 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
989 goto next;
990 }
991
992 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
994 goto next; // LB6: x(BK|CR|LF|NL)
995 goto next_no_cls_update; // LB7: xSP
996 }
997
998 // LB19 - do not break before non-initial unresolved quotation marks, or after non-final
999 // unresolved quotation marks
1000 if (Q_UNLIKELY(((ncls == QUnicodeTables::LineBreak_QU
1001 || ncls == QUnicodeTables::LineBreak_QU_19)
1002 && prop->category != QChar::Punctuation_InitialQuote)
1003 || (cls == QUnicodeTables::LineBreak_QU
1004 && lastProp->category != QChar::Punctuation_FinalQuote))) {
1005 // Make sure the previous character is not one that we have to break after.
1006 // Also skip if ncls is CM so it can be treated as lcls (LB9)
1008 && ncls != QUnicodeTables::LineBreak_CM) {
1009 goto next;
1010 }
1011 }
1012
1013 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
1014 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
1016 // don't update anything
1017 goto next_no_cls_update;
1018 }
1019
1020 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
1021 // LB8a: ZWJ x
1022 goto next;
1023 }
1024
1025 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
1026 // LB30a
1028 goto next;
1029 }
1030
1031 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
1032 && lastProp->category == QChar::Other_NotAssigned
1033 && lastProp->graphemeBreakClass
1034 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
1035 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
1036 goto next;
1037 }
1038
1039 // for South East Asian chars that require a complex analysis, the Unicode
1040 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
1041 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
1043
1044 tcls = cls;
1045
1046 constexpr static auto remapToAL = [](QUnicodeTables::LineBreakClass &c, auto &property) {
1047 if (Q_UNLIKELY(c == QUnicodeTables::LineBreak_CM
1050 property = QUnicodeTables::properties(U'\u0041');
1051 }
1052 };
1053 // LB10 Treat any remaining combining mark or ZWJ as AL,
1054 // as if it had the properties of U+0041 A LATIN CAPITAL LETTER
1055 remapToAL(tcls, lastProp);
1056 remapToAL(ncls, prop);
1057
1059 case LB::DirectBreak:
1060 attributes[pos].lineBreak = true;
1061 break;
1062 case LB::IndirectBreak:
1063 if (lcls == QUnicodeTables::LineBreak_SP)
1064 attributes[pos].lineBreak = true;
1065 break;
1067 if (lcls != QUnicodeTables::LineBreak_SP)
1068 goto next_no_cls_update;
1069 attributes[pos].lineBreak = true;
1070 break;
1072 if (lcls != QUnicodeTables::LineBreak_SP)
1073 goto next_no_cls_update;
1074 break;
1075 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
1076 if (lcls != QUnicodeTables::LineBreak_HL)
1077 attributes[pos].lineBreak = true;
1078 break;
1080 using EAW = QUnicodeTables::EastAsianWidth;
1081 switch (EAW(prop->eastAsianWidth)) {
1082 default:
1083 if (lcls != QUnicodeTables::LineBreak_SP)
1084 break;
1085 Q_FALLTHROUGH();
1086 case QUnicodeTables::EastAsianWidth::F:
1087 case QUnicodeTables::EastAsianWidth::W:
1088 case QUnicodeTables::EastAsianWidth::H:
1089 attributes[pos].lineBreak = true;
1090 break;
1091 }
1092 break;
1093 case LB::DirectBreakOutsideNumericSequence:
1094 if (neactlast == LB::NS::None || neactlast > LB::NS::Break)
1095 attributes[pos].lineBreak = true;
1096 break;
1098 // nothing to do
1099 default:
1100 break;
1101 }
1102
1103 next:
1105 cls = ncls;
1106 lastProp = prop;
1107 }
1108 next_no_cls_update:
1109 lcls = ncls;
1110 }
1111
1112 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
1113 // LB25: do not break lines inside numbers
1114 for (qsizetype j = nestart + 1; j < len; ++j)
1115 attributes[j].lineBreak = false;
1116 }
1117
1118 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
1119 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
1120}
1121
1122
1123static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1124{
1125 for (qsizetype i = 0; i != len; ++i) {
1126 uint ucs4 = string[i];
1127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
1128 ushort low = string[i + 1];
1129 if (QChar::isLowSurrogate(low)) {
1130 ucs4 = QChar::surrogateToUcs4(ucs4, low);
1131 ++i;
1132 }
1133 }
1134
1135 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
1136 attributes[i].whiteSpace = true;
1137 }
1138}
1139
1140namespace Tailored {
1141
1142using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
1143
1144
1159
1160static const unsigned char indicForms[0xe00-0x900] = {
1161 // Devanagari
1166
1171
1176
1181
1186
1191
1196
1201
1202 // Bengali
1207
1212
1217
1222
1227
1232
1237
1242
1243 // Gurmukhi
1248
1253
1258
1263
1268
1273
1278
1283
1284 // Gujarati
1289
1294
1299
1304
1309
1314
1319
1324
1325 // Oriya
1330
1335
1340
1345
1350
1355
1360
1365
1366 //Tamil
1371
1376
1381
1386
1391
1396
1401
1406
1407 // Telugu
1412
1417
1422
1427
1432
1437
1442
1447
1448 // Kannada
1453
1458
1463
1468
1473
1478
1483
1488
1489 // Malayalam
1494
1499
1504
1509
1514
1519
1524
1529
1530 // Sinhala
1535
1540
1545
1550
1555
1560
1565
1570};
1571
1572static inline Form form(unsigned short uc) {
1573 if (uc < 0x900 || uc > 0xdff) {
1574 if (uc == 0x25cc)
1575 return Consonant;
1576 if (uc == 0x200c || uc == 0x200d)
1577 return Control;
1578 return Other;
1579 }
1580 return (Form)indicForms[uc-0x900];
1581}
1582
1583// #define INDIC_DEBUG
1584#ifdef INDIC_DEBUG
1585#define IDEBUG qDebug
1586#else
1587#define IDEBUG if constexpr (1) ; else qDebug
1588#endif
1589
1590/* syllables are of the form:
1591
1592 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1593 (Consonant Nukta? Halant)* Consonant Halant
1594 IndependentVowel VowelMark? StressMark?
1595
1596 We return syllable boundaries on invalid combinations as well
1597*/
// Scans one Indic syllable that begins at s[start] and returns the index
// (relative to s) of the first code unit after it; the scan never looks at
// or beyond s[end].
//
// *invalid is cleared on entry and set to true only when the first code unit
// is neither a Consonant, an IndependentVowel nor Other.
//
// s[start] is read before any bounds check, so the caller must pass
// start < end.
1598static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1599{
1600 *invalid = false;
1601 IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1602 const char16_t *uc = s+start;
1603
  // pos is an offset from uc (i.e. from s + start), not from s.
1604 qsizetype pos = 0;
1605 Form state = form(uc[pos]);
1606 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1607 pos++;
1608
  // Anything that cannot start a syllable forms a one-unit "syllable".
1609 if (state != Consonant && state != IndependentVowel) {
1610 if (state != Other)
1611 *invalid = true;
1612 goto finish;
1613 }
1614
  // Inside the switch: `break` accepts uc[pos] into the syllable and makes
  // newState the current state; `goto finish` ends the syllable before
  // uc[pos] (unless pos was advanced first); `continue` is used by branches
  // that advance pos themselves and keep the current state.
1615 while (pos < end - start) {
1616 Form newState = form(uc[pos]);
1617 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1618 switch (newState) {
1619 case Control:
  // A control character never changes the tracked state.
1620 newState = state;
1621 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1622 break;
1623 // the control character should be the last char in the item
1624 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1625 break;
1626 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1627 break;
1628 // Bengali and Kannada have a special exception for rendering yaphala with ra (to avoid reph), see http://www.unicode.org/faq/indic.html#15
  // Otherwise the control character is included and terminates the syllable.
1629 ++pos;
1630 goto finish;
1631 case Consonant:
1632 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1633 break;
1634 goto finish;
1635 case Halant:
1636 if (state == Nukta || state == Consonant)
1637 break;
1638 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1639 if (script == QChar::Script_Bengali && pos == 1 &&
1640 (uc[0] == 0x0985 || uc[0] == 0x098f))
1641 break;
1642 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1643 if (script == QChar::Script_Sinhala && state == Matra) {
1644 ++pos;
1645 continue;
1646 }
1647 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1648 ++pos;
1649 continue;
1650 }
1651 goto finish;
1652 case Nukta:
1653 if (state == Consonant)
1654 break;
1655 goto finish;
1656 case StressMark:
1657 if (state == VowelMark)
1658 break;
1659 Q_FALLTHROUGH();
1660 case VowelMark:
1661 if (state == Matra || state == LengthMark || state == IndependentVowel)
1662 break;
1663 Q_FALLTHROUGH();
1664 case Matra:
1665 if (state == Consonant || state == Nukta)
1666 break;
1667 if (state == Matra) {
1668 // ### needs proper testing for correct two/three part matras
1669 break;
1670 }
1671 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1672 // it work for all Indic languages?
1673 // the combination Independent_A + Vowel Sign AA is allowed.
1674 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1675 break;
  // NOTE(review): state == Matra has already taken the `break` a few lines
  // above, so the Tamil-specific block below looks unreachable.
1676 if (script == QChar::Script_Tamil && state == Matra) {
1677 if (uc[pos-1] == 0x0bc6 &&
1678 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1679 break;
1680 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1681 break;
1682 }
1683 goto finish;
1684
1685 case LengthMark:
1686 if (state == Matra) {
1687 // ### needs proper testing for correct two/three part matras
1688 break;
1689 }
1690 Q_FALLTHROUGH();
1691 case IndependentVowel:
1692 case Invalid:
1693 case Other:
1694 goto finish;
1695 }
1696 state = newState;
1697 pos++;
1698 }
1699 finish:
  // Convert the uc-relative offset back into an index into s.
1700 return pos+start;
1701}
1702
1703static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1704{
1705 qsizetype end = from + len;
1706 attributes += from;
1707 qsizetype i = 0;
1708 while (i < len) {
1709 bool invalid;
1710 qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1711 attributes[i].graphemeBoundary = true;
1712
1713 if (boundary > len-1) boundary = len;
1714 i++;
1715 while (i < boundary) {
1716 attributes[i].graphemeBoundary = false;
1717 ++i;
1718 }
1719 assert(i == boundary);
1720 }
1721
1722
1723}
1724
1725#if QT_CONFIG(library)
1726
1727#define LIBTHAI_MAJOR 0
1728
1729/*
1730 * if libthai changed please update these codes too.
1731 */
1732struct thcell_t {
1733 unsigned char base; /**< base character */
1734 unsigned char hilo; /**< upper/lower vowel/diacritic */
1735 unsigned char top; /**< top-level mark */
1736};
1737
1738using ThBrk = struct _ThBrk;
1739
1740namespace {
1741
// Thin wrapper around a dynamically loaded "thai" library (major version
// LIBTHAI_MAJOR). The constructor resolves the entry points
// "th_brk_find_breaks", "th_next_cell", "th_brk_new" and "th_brk_delete" by
// name and, if "th_brk_new" is available, creates the break state with
// th_brk_new(nullptr). The destructor unloads the library.
// NOTE(review): several lines of this class (member declarations and the
// bodies of the forwarding methods) are not present in this listing.
1742class LibThai final
1743{
1745
 // Function-pointer types for the resolved library entry points.
1746 using th_brk_new_def = ThBrk *(*)(const char *);
1747 using th_brk_delete_def = void (*)(ThBrk *);
1748 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1749 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1750
1751public:
1752 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1753 {
1755 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks"));
1756 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell"));
1757
1758 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new"));
1759 if (th_brk_new) {
1760 m_state = th_brk_new(nullptr);
1762 reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete"));
1763 }
1764 }
1765
1766 ~LibThai()
1767 {
1768 if (m_state && m_th_brk_delete)
1770 m_library.unload();
1771 }
1772
 // True only when both entry points used for analysis were resolved and a
 // break state was created.
1773 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1774
1775 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1776 {
1780 }
1781
1782 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1783 {
1786 }
1787
1788private:
1790
1791 // Global state for th_brk_find_breaks().
1792 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1793 // state is read-only, and so it is safe to use it from multiple threads after
1794 // initialization. This is also stated in the libthai documentation.
1795 ThBrk *m_state = nullptr;
1796
1800};
1801
1802} // unnamed namespace
1803
1805
// Converts `len` UTF-16 code units from `string` into single-byte TIS-620 and
// writes them, followed by a terminating 0, to `cstr` (which must therefore
// hold at least len + 1 bytes).
//   - code units <= 0xa0 are copied unchanged,
//   - 0x0e01..0x0e5b are mapped to 0xa1..0xfb,
//   - everything else becomes 0xff.
static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
{
    auto *out = reinterpret_cast<unsigned char *>(cstr);

    for (qsizetype n = 0; n < len; ++n) {
        const char16_t unit = string[n];
        // Same encoding as libthai uses for invalid chars
        unsigned char encoded = static_cast<unsigned char>(~0);
        if (unit <= 0xa0)
            encoded = static_cast<unsigned char>(unit);
        else if (unit >= 0xe01 && unit <= 0xe5b)
            encoded = static_cast<unsigned char>(unit - 0xe00 + 0xa0);
        out[n] = encoded;
    }

    out[len] = 0;
}
1822
1823/*
1824 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1825 */
// Fills word, line-break and grapheme attributes for a Thai run of `len`
// UTF-16 code units using the dynamically loaded libthai wrapper.
// Does nothing when the wrapper is unavailable or not initialized.
// NOTE(review): several declarations and loop bodies of this function are not
// present in this listing.
1826static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1827{
1828 constexpr qsizetype Prealloc = 128;
 // One extra byte: to_tis620() appends a terminating 0 after len bytes.
1829 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1832 struct thcell_t tis_cell;
1833
1835 if (!libThai || !libThai->isInitialized())
1836 return;
1837
 // libthai operates on TIS-620 bytes, not UTF-16.
1838 to_tis620(string, len, s.data());
1839
 // Reset the word/line attributes for the whole run before applying the
 // library's result.
1840 for (i = 0; i < len; ++i) {
1841 attributes[i].wordBreak = false;
1842 attributes[i].wordStart = false;
1843 attributes[i].wordEnd = false;
1844 attributes[i].lineBreak = false;
1845 }
1846
 // The run always starts a word.
1847 attributes[0].wordBreak = true;
1848 attributes[0].wordStart = true;
1849 attributes[0].wordEnd = false;
1850 numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()),
1852 static_cast<size_t>(break_positions.size()));
1853 for (i = 0; i < numbreaks; ++i) {
1858 }
1859 if (numbreaks > 0)
1861
1862 /* manage grapheme boundaries */
1863 i = 0;
1864 while (i < len) {
1866 libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i,
1867 size_t(len - i), &tis_cell, true);
1868
 // Every position of a cell after its first one is not a grapheme boundary.
1870 for (size_t j = 1; j < cell_length; ++j)
1871 attributes[i + j].graphemeBoundary = false;
1872
1873 i += cell_length;
1874 }
1875}
1876
1877#endif // QT_CONFIG(library)
1878
1879static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1880{
1881 assert(script == QChar::Script_Thai);
1882#if QT_CONFIG(library)
1883 const char16_t *uc = text + from;
1884 attributes += from;
1885 Q_UNUSED(script);
1886 thaiAssignAttributes(uc, len, attributes);
1887#else
1888 Q_UNUSED(script);
1889 Q_UNUSED(text);
1890 Q_UNUSED(from);
1891 Q_UNUSED(len);
1892 Q_UNUSED(attributes);
1893#endif
1894}
1895
1896/*
1897 tibetan syllables are of the form:
1898 head position consonant
1899 first sub-joined consonant
1900 ....intermediate sub-joined consonants (if any)
1901 last sub-joined consonant
1902 sub-joined vowel (a-chung U+0F71)
1903 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1904*/
1905
1913
1914/* this table starts at U+0f40 */
1915static const unsigned char tibetanForm[0x80] = {
1920
1925
1930
1935
1940
1945
1950
1955};
1956
// Classifies a UTF-16 code unit: values in [0x0f40, 0x0fc0) are looked up in
// tibetanForm (which starts at U+0F40), everything else is TibetanOther.
// The definition must stay on one logical line; without the line continuation
// the macro would expand to nothing and leave a stray expression behind.
#define tibetan_form(c) \
    ((c) >= 0x0f40 && (c) < 0x0fc0 ? static_cast<TibetanForm>(tibetanForm[(c) - 0x0f40]) : TibetanOther)
1959
// Scans one Tibetan syllable that begins at s[start] and returns the index
// (relative to s) of the first code unit after it; the scan stops before
// s[end]. A first code unit that is not a head consonant forms a one-unit
// syllable.
//
// NOTE(review): the `finish:` label unconditionally stores false into
// *invalid, so the `*invalid = true` assignment made for a bad first code
// unit is discarded and callers always observe false.
// NOTE(review): some case labels of the switch are not present in this
// listing.
1960static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1961{
1962 const char16_t *uc = s + start;
1963
 // pos is an offset from uc (i.e. from s + start).
1964 qsizetype pos = 0;
1965 TibetanForm state = tibetan_form(*uc);
1966
1967/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1968 pos++;
1969
1970 if (state != TibetanHeadConsonant) {
1971 if (state != TibetanOther)
1972 *invalid = true;
1973 goto finish;
1974 }
1975
1976 while (pos < end - start) {
1977 TibetanForm newState = tibetan_form(uc[pos]);
1978 switch (newState) {
1981 if (state != TibetanHeadConsonant &&
1983 goto finish;
1984 state = newState;
1985 break;
1986 case TibetanVowel:
1987 if (state != TibetanHeadConsonant &&
1988 state != TibetanSubjoinedConsonant &&
1989 state != TibetanSubjoinedVowel)
1990 goto finish;
1991 break;
1992 case TibetanOther:
1994 goto finish;
1995 }
1996 pos++;
1997 }
1998
1999finish:
2000 *invalid = false;
2001 return start+pos;
2002}
2003
2004static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2005{
2006 qsizetype end = from + len;
2007 qsizetype i = 0;
2008 Q_UNUSED(script);
2009 attributes += from;
2010 while (i < len) {
2011 bool invalid;
2012 qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
2013
2014 attributes[i].graphemeBoundary = true;
2015
2016 if (boundary > len-1) boundary = len;
2017 i++;
2018 while (i < boundary) {
2019 attributes[i].graphemeBoundary = false;
2020 ++i;
2021 }
2022 assert(i == boundary);
2023 }
2024}
2025
2028 Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
2029 Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
2030 Mymr_CC_NGA = 3, /* Consonant NGA */
2031 Mymr_CC_YA = 4, /* Consonant YA */
2032 Mymr_CC_RA = 5, /* Consonant RA */
2033 Mymr_CC_WA = 6, /* Consonant WA */
2034 Mymr_CC_HA = 7, /* Consonant HA */
2035 Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
2036 Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
2037 Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
2038 Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */
2039 Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, prebase (Vowel u, uu) */
2040 Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, prebase (Vowel i, ii, ai) */
2041 Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, prebase (Vowel aa) */
2045 Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
2046 Mymr_CC_COUNT = 19 /* This is the number of character classes */
2047};
2048
2051
2052 Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2053 Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
2054 Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
2055 Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
2056 Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
2057 first in a syllable */
2058 Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */
2059
2060 /* position flags */
2062 Mymr_CF_POS_BELOW = 0x00040000,
2063 Mymr_CF_POS_ABOVE = 0x00020000,
2064 Mymr_CF_POS_AFTER = 0x00010000,
2065 Mymr_CF_POS_MASK = 0x000f0000,
2066
2068};
2069
2070Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
2071
2072/* Characters that get referred to by name */
2074{
2078 Mymr_C_RA = 0x101B,
2079 Mymr_C_YA = 0x101A,
2080 Mymr_C_NGA = 0x1004,
2083};
2084
2085enum
2086{
2104};
2105
2106
2107typedef int MymrCharClass;
2108
2109
2125
// Returns the Myanmar character class (class value plus flags) for a code
// unit. The two zero-width marks are special-cased, anything outside
// 0x1000..0x105f is Mymr_CC_RESERVED, and the rest is looked up in
// mymrCharClasses (indexed from 0x1000).
// NOTE(review): the signature line and the two special-case return
// statements are not present in this listing.
2126static MymrCharClass
2128{
2129 if (ch == Mymr_C_SIGN_ZWJ)
2131
2132 if (ch == Mymr_C_SIGN_ZWNJ)
2134
2135 if (ch < 0x1000 || ch > 0x105f)
2136 return Mymr_CC_RESERVED;
2137
2138 return mymrCharClasses[ch - 0x1000];
2139}
2140
// State-transition table for Myanmar syllable scanning.
// The first index is the current state (0 is the ground state), the second
// index is the character class (Mymr_CC_COUNT columns, in the order given by
// the column header below). Each entry is the next state; a negative entry
// ends the syllable, and -2 additionally tells the scanner to step back one
// position.
2141static const signed char mymrStateTable[][Mymr_CC_COUNT] =
2142{
2143/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
2144 { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
2145 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
2146 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
2147 {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
2148 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
2149 {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
2150 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
2151 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
2152 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
2153 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
2154 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
2155 {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
2156 {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
2157 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
2158 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
2159 {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
2160 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
2161 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
2162 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
2163 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
2164 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
2165 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
2166 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
2167 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
2168 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
2169 {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
2170 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
2171 {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
2172/* exit state -2 is for invalid order of medials and combination of invalids
2173 with virama where virama should treat as start of next syllable
2174 */
2175};
2176
/*#define MYANMAR_DEBUG */
// MMDEBUG(fmt, ...) traces the Myanmar state machine. With MYANMAR_DEBUG
// defined it forwards to qDebug; otherwise it expands to `if (0) printf`, so
// the call is still type-checked but never executed and its arguments are
// never evaluated.
// The three lines of the disabled variant must form ONE macro definition
// (hence the line continuations); otherwise MMDEBUG would be empty and a
// stray `if (0) printf` would be left at file scope.
// Note: because the expansion is an `if` without braces, do not follow an
// MMDEBUG(...) statement directly with `else`.
#ifdef MYANMAR_DEBUG
#define MMDEBUG qDebug
#else
#  define MMDEBUG \
    if (0) \
        printf
#endif
2185
2186/*
2187// Given an input string of characters and a location in which to start looking
2188// calculate, using the state table, which one is the last character of the syllable
2189// that starts in the starting position.
2190*/
2191static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2192{
2193 const char16_t *uc = s + start;
2194 int state = 0;
2195 qsizetype pos = start;
2196 *invalid = false;
2197
2198 while (pos < end) {
2199 MymrCharClass charClass = getMyanmarCharClass(*uc);
2200 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
2201 if (pos == start)
2202 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
2203
2204 MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
2205
2206 if (state < 0) {
2207 if (state < -1)
2208 --pos;
2209 break;
2210 }
2211 ++uc;
2212 ++pos;
2213 }
2214 return pos;
2215}
2216
2217static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2218{
2219 qsizetype end = from + len;
2220 qsizetype i = 0;
2221 Q_UNUSED(script);
2222 attributes += from;
2223 while (i < len) {
2224 bool invalid;
2225 qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
2226
2227 attributes[i].graphemeBoundary = true;
2228 attributes[i].lineBreak = true;
2229
2230 if (boundary > len-1)
2231 boundary = len;
2232 i++;
2233 while (i < boundary) {
2234 attributes[i].graphemeBoundary = false;
2235 ++i;
2236 }
2237 assert(i == boundary);
2238 }
2239}
2240
2241/*
2242// Vocabulary
2243// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
2244// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
2245// split vowels, signs... but there is only one base in a syllable, it has to be coded as
2246// the first character of the syllable.
2247// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
2248// Khmer language has five of them. Khmer split vowels either have one part before the
2249// base and one after the base or they have a part before the base and a part above the base.
2250// The first part of all Khmer split vowels is the same character, identical to
2251// the glyph of Khmer dependent vowel SRA EI
2252// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
2253// Unlike in Indian languages, the coeng modifies the consonant that follows it,
2254// not the one preceding it Each consonant has two forms, the base form and the subscript form
2255// the base form is the normal one (using the consonants code-point), the subscript form is
2256// displayed when the combination coeng + consonant is encountered.
2257// Consonant of type 1 -> A consonant which has subscript for that only occupies space under a base consonant
2258// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
2259// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
2260// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
2261// if it is attached to a consonant of the first series or a consonant of the second series
2262// Most consonants have an equivalent in the other series, but some of them exist only in
2263// one series (for example SA). If we want to use the consonant SA with a vowel sound that
2264// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
2265// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
2266// x17C9 and x17CA. TRIISAP changes a first series consonant to second series sound and
2267// MUSIKATOAN a second series consonant to have a first series vowel sound.
2268// Consonant shifters are both normally superscript marks, but, when they are followed by a
2269// superscript, they change shape and take the form of subscript dependent vowel SRA U.
2270// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
2271// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
2272// be placed after the coeng consonant.
2273// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2274// Each vowel has its own position. Only one vowel per syllable is allowed.
2275// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2276// Allowed in a syllable.
2277//
2278//
2279// order is important here! This order must be the same that is found in each horizontal
2280// line in the statetable for Khmer (see khmerStateTable) .
2281*/
2284 CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
2285 CC_CONSONANT2 = 2, /* Consonant of type 2 */
2286 CC_CONSONANT3 = 3, /* Consonant of type 3 */
2287 CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
2289 CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
2290 CC_COENG = 7, /* Subscript consonant combining character */
2294 CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
2295 CC_COUNT = 12 /* This is the number of character classes */
2296};
2297
2298
2300 CF_CLASS_MASK = 0x0000FFFF,
2301
2302 CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2303 CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
2304 CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
2305 CF_COENG = 0x08000000, /* flag to speed up comparing */
2306 CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
2307 CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */
2308
2309 /* position flags */
2310 CF_POS_BEFORE = 0x00080000,
2311 CF_POS_BELOW = 0x00040000,
2312 CF_POS_ABOVE = 0x00020000,
2313 CF_POS_AFTER = 0x00010000,
2314 CF_POS_MASK = 0x000f0000
2315};
2316
2317Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
2318
2319/* Characters that get referred to by name */
2321 C_SIGN_ZWNJ = 0x200C,
2322 C_SIGN_ZWJ = 0x200D,
2323 C_RO = 0x179A,
2324 C_VOWEL_AA = 0x17B6,
2326 C_VOWEL_E = 0x17C1,
2327 C_COENG = 0x17D2
2328};
2329
2330
2331/*
2332// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2333// they are also used to know where a character should be placed (location in reference to the base character)
2334// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2335// indicate error in syllable construction
2336*/
2337enum {
2351
2352 /* split vowel */
2355};
2356
2357
2358/*
2359// Character class: a character class value
2360// ORed with character class flags.
2361*/
using KhmerCharClass = unsigned long; // class value in the low bits, CF_* flags ORed on top
2363
2364
2365/*
2366// Character class tables
2367// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
2368// _sa Sign placed above the base
2369// _sp Sign placed after the base
2370// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2371// _c2 Consonant of type 2 (only RO)
2372// _c3 Consonant of type 3
2373// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2374// _cs Consonant-shifter
2375// _dl Dependent vowel placed before the base (left of the base)
2376// _db Dependent vowel placed below the base
2377// _da Dependent vowel placed above the base
2378// _dr Dependent vowel placed behind the base (right of the base)
2379// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2380// it to create a subscript consonant or independent vowel
2381// _va Khmer split vowel in which the first part is before the base and the second one above the base
2382// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2383*/
2385 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2386 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2387 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2388 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2389 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2390 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2391};
2392
2393/* this enum must reflect the range of khmerCharClasses */
2398
2399/*
2400// Below we define how a character in the input string is either in the khmerCharClasses table
2401// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2402// within the syllable, but are not in the table) we also get their type back, or an unknown object
2403// in which case we get _xx (CC_RESERVED) back
2404*/
2406{
2407 if (uc == C_SIGN_ZWJ) {
2408 return CC_ZERO_WIDTH_J_MARK;
2409 }
2410
2411 if (uc == C_SIGN_ZWNJ) {
2412 return CC_ZERO_WIDTH_NJ_MARK;
2413 }
2414
2415 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2416 return CC_RESERVED;
2417 }
2418
2419 return khmerCharClasses[uc - KhmerFirstChar];
2420}
2421
2422
2423/*
2424// The stateTable is used to calculate the end (the length) of a well
2425// formed Khmer Syllable.
2426//
2427// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2428// CharClassValues. This coincidence of values allows the follow up of the table.
2429//
2430// Each line corresponds to a state, which does not necessarily need to be a type
2431// of component... for example, state 2 is a base, which is always a first character
2432// in the syllable, but the state could be produced by a consonant of any type when
2433// it is the first character that is analysed (in ground state).
2434//
2435// Differentiating 3 types of consonants is necessary in order to
2436// forbid the use of certain combinations, such as having a second
2437// coeng after a coeng RO,
2438// The inexistent possibility of having a type 3 after another type 3 is permitted,
2439// eliminating it would very much complicate the table, and it does not create typing
2440// problems, as the case above.
2441//
2442// The table is quite complex, in order to limit the number of coeng consonants
2443// to 2 (by means of the table).
2444//
2445// There is a peculiarity, as far as Unicode is concerned:
2446// - The consonant-shifter is considered in two possible different
2447// locations, the one considered in Unicode 3.0 and the one considered in
2448// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2449//
2450//
2451// xx independent character, such as a number, punctuation sign or non-khmer char
2452//
2453// c1 Khmer consonant of type 1 or an independent vowel
2454// that is, a letter in which the subscript form is only under the
2455// base, not taking any space to the right or to the left
2456//
2457// c2 Khmer consonant of type 2, the coeng form takes space under
2458// and to the left of the base (only RO is of this type)
2459//
2460// c3 Khmer consonant of type 3. Its subscript form takes space under
2461// and to the right of the base.
2462//
2463// cs Khmer consonant shifter
2464//
2465// rb Khmer robat
2466//
2467// co coeng character (u17D2)
2468//
2469// dv dependent vowel (including split vowels, they are treated in the same way).
2470// even if dv is not defined above, the component that is really tested for is
2471// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2472//
2473// zwj Zero Width joiner
2474//
2475// zwnj Zero width non joiner
2476//
2477// sa above sign
2478//
2479// sp post sign
2480//
2481// there are lines with equal content but for an easier understanding
2482// (and maybe change in the future) we did not join them
2483*/
/* Rows are states, columns are character classes; each entry is the next state.
   A negative entry stops the scan in khmer_nextSyllableBoundary (end of syllable). */
2484static const signed char khmerStateTable[][CC_COUNT] =
2485{
2486 /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2487 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2488 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2489 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2490 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter. It can only be followed by a shifter or a vowel */
2491 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2492 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2493 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2494 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2495 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2496 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
2497 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2498 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2499 {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2500 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2501 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2502 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2503 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2504 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2505 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2506 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2507 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2508};
2509
2510
/* #define KHMER_DEBUG */
/*
// KHDEBUG takes printf-style arguments.
// With KHMER_DEBUG defined it forwards to qDebug; otherwise it expands to a
// never-taken `if (0) printf`, so the arguments are still compiled but are
// neither evaluated nor printed.
*/
#ifdef KHMER_DEBUG
#  define KHDEBUG qDebug
#else
#  define KHDEBUG if (0) printf
#endif
2519
2520/*
2521// Given an input string of characters and a location in which to start looking
2522// calculate, using the state table, which one is the last character of the syllable
2523// that starts in the starting position.
2524*/
2525static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2526{
2527 const char16_t *uc = s + start;
2528 int state = 0;
2529 qsizetype pos = start;
2530 *invalid = false;
2531
2532 while (pos < end) {
2533 KhmerCharClass charClass = getKhmerCharClass(*uc);
2534 if (pos == start) {
2535 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2536 }
2537 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2538
2539 KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2540 charClass, *uc );
2541
2542 if (state < 0) {
2543 break;
2544 }
2545 ++uc;
2546 ++pos;
2547 }
2548 return pos;
2549}
2550
2551static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2552{
2553 qsizetype end = from + len;
2554 qsizetype i = 0;
2555 Q_UNUSED(script);
2556 attributes += from;
2557 while ( i < len ) {
2558 bool invalid;
2559 qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2560
2561 attributes[i].graphemeBoundary = true;
2562
2563 if ( boundary > len-1 ) boundary = len;
2564 i++;
2565 while ( i < boundary ) {
2566 attributes[i].graphemeBoundary = false;
2567 ++i;
2568 }
2569 assert( i == boundary );
2570 }
2571}
2572
2573
2574const CharAttributeFunction charAttributeFunction[] = {
2575// Script_Unknown,
2576 nullptr,
2577// Script_Inherited,
2578 nullptr,
2579// Script_Common,
2580 nullptr,
2581// Script_Latin,
2582 nullptr,
2583// Script_Greek,
2584 nullptr,
2585// Script_Cyrillic,
2586 nullptr,
2587// Script_Armenian,
2588 nullptr,
2589// Script_Hebrew,
2590 nullptr,
2591// Script_Arabic,
2592 nullptr,
2593// Script_Syriac,
2594 nullptr,
2595// Script_Thaana,
2596 nullptr,
2597// Script_Devanagari,
2598 indicAttributes,
2599// Script_Bengali,
2600 indicAttributes,
2601// Script_Gurmukhi,
2602 indicAttributes,
2603// Script_Gujarati,
2604 indicAttributes,
2605// Script_Oriya,
2606 indicAttributes,
2607// Script_Tamil,
2608 indicAttributes,
2609// Script_Telugu,
2610 indicAttributes,
2611// Script_Kannada,
2612 indicAttributes,
2613// Script_Malayalam,
2614 indicAttributes,
2615// Script_Sinhala,
2616 indicAttributes,
2617// Script_Thai,
2618 thaiAttributes,
2619// Script_Lao,
2620 nullptr,
2621// Script_Tibetan,
2622 tibetanAttributes,
2623// Script_Myanmar,
2624 myanmarAttributes,
2625// Script_Georgian,
2626 nullptr,
2627// Script_Hangul,
2628 nullptr,
2629// Script_Ethiopic,
2630 nullptr,
2631// Script_Cherokee,
2632 nullptr,
2633// Script_CanadianAboriginal,
2634 nullptr,
2635// Script_Ogham,
2636 nullptr,
2637// Script_Runic,
2638 nullptr,
2639// Script_Khmer,
2640 khmerAttributes
2641};
2642
2643static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2644 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2645 QCharAttributes *attributes)
2646{
2647 if (stringLength == 0)
2648 return;
2649 for (qsizetype i = 0; i < numItems; ++i) {
2650 QChar::Script script = items[i].script;
2651 if (script > QChar::Script_Khmer)
2652 script = QChar::Script_Common;
2653 CharAttributeFunction attributeFunction = charAttributeFunction[script];
2654 if (!attributeFunction)
2655 continue;
2656 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2657 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2658 }
2659}
2660
2661}
2662
2663Q_CORE_EXPORT void initCharAttributes(QStringView string,
2664 const ScriptItem *items, qsizetype numItems,
2665 QCharAttributes *attributes, CharAttributeOptions options)
2666{
2667 if (string.size() <= 0)
2668 return;
2669
2670 if (!(options & DontClearAttributes))
2671 ::memset(attributes, 0, (string.size() + 1) * sizeof(QCharAttributes));
2672
2673 if (options & GraphemeBreaks)
2674 getGraphemeBreaks(string.utf16(), string.size(), attributes);
2675 if (options & WordBreaks)
2676 getWordBreaks(string.utf16(), string.size(), attributes);
2677 if (options & SentenceBreaks)
2678 getSentenceBreaks(string.utf16(), string.size(), attributes);
2679 if (options & LineBreaks)
2680 getLineBreaks(string.utf16(), string.size(), attributes, options);
2681 if (options & WhiteSpaces)
2682 getWhiteSpaces(string.utf16(), string.size(), attributes);
2683
2685 if (!items || numItems <= 0)
2686 return;
2687
2688 Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2689 }
2690}
2691
2692
2693// ----------------------------------------------------------------------------
2694//
2695// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2696//
2697// ----------------------------------------------------------------------------
2698
2699Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2700{
2701 qsizetype sor = 0;
2702 qsizetype eor = 0;
2703 QChar::Script script = QChar::Script_Common;
2704
2705 for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
2706 char32_t ucs4 = string[i].unicode();
2707 if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2708 ushort low = string[i + 1].unicode();
2709 if (QChar::isLowSurrogate(low)) {
2710 ucs4 = QChar::surrogateToUcs4(ucs4, low);
2711 ++i;
2712 }
2713 }
2714
2715 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2716
2717 QChar::Script nscript = QChar::Script(prop->script);
2718
2719 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2720 continue;
2721
2722 // inherit preceding Common-s
2723 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2724 // also covers a case where the base character of Common script followed
2725 // by one or more combining marks of non-Inherited, non-Common script
2726 script = nscript;
2727 continue;
2728 }
2729
2730 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2731 // Thus, a combining mark - whatever its script property value is - should inherit
2732 // the script property value of its base character.
2733 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2734 if (Q_UNLIKELY(FLAG(prop->category) & test))
2735 continue;
2736
2737 Q_ASSERT(script > QChar::Script_Common);
2738 Q_ASSERT(sor < eor);
2739 scripts->append(ScriptItem{sor, script});
2740 sor = eor;
2741
2742 script = nscript;
2743 }
2744
2745 Q_ASSERT(script >= QChar::Script_Common);
2746 Q_ASSERT(eor == string.size());
2747 scripts->append(ScriptItem{sor, script});
2748}
2749
2750} // namespace QUnicodeTools
2751
2752QT_END_NAMESPACE
Combined button and popup list for selecting options.
@ GraphemeBreak_Extended_Pictographic
static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first, QUnicodeTables::GraphemeBreakClass second)
static const GBTableEntryType Extend_SpacingMark_ZWJ
static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses]
static const GBTableEntryType HardBreak
State updateState(State state, LinebreakUnit lb)
constexpr char32_t DottedCircle
Class toClass(QUnicodeTables::LineBreakClass lbc)
static const uchar actionTable[CLCP+1][CLCP+1]
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ]
static const uchar breakTable[BAfter+1][QUnicodeTables::NumSentenceBreakClasses]
static const KhmerCharClass khmerCharClasses[]
static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static MymrCharClass getMyanmarCharClass(ushort ch)
static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const signed char mymrStateTable[][Mymr_CC_COUNT]
static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static const MymrCharClass mymrCharClasses[]
const CharAttributeFunction charAttributeFunction[]
static Form form(unsigned short uc)
static const unsigned char indicForms[0xe00-0x900]
static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const signed char khmerStateTable[][CC_COUNT]
static void getCharAttributes(const char16_t *string, qsizetype stringLength, const QUnicodeTools::ScriptItem *items, qsizetype numItems, QCharAttributes *attributes)
static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const unsigned char tibetanForm[0x80]
static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static KhmerCharClass getKhmerCharClass(ushort uc)
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses]
static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initCharAttributes(QStringView string, const ScriptItem *items, qsizetype numItems, QCharAttributes *attributes, CharAttributeOptions options)
static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
#define FLAG(x)
Definition qchar.cpp:13
#define KHDEBUG
#define IDEBUG
constexpr int qt_initcharattributes_default_algorithm_only
#define tibetan_form(c)
#define MMDEBUG
QUnicodeTables::LineBreakClass lbc