Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
qunicodetools.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3// Qt-Security score:critical reason:data-parser
4
6
7#include <QtCore/private/qstringiterator_p.h>
10#if QT_CONFIG(library)
11#include "qlibrary.h"
12#endif
13
14#include <limits.h>
15
// FLAG(x) yields an int with only bit number x set; it is used in this file
// to build and test small bit masks (e.g. against table entries).
16#define FLAG(x) (1 << (x))
17
19
20using namespace Qt::StringLiterals;
21
22#ifdef QT_BUILD_INTERNAL
23Q_CONSTINIT Q_AUTOTEST_EXPORT
24#else
25constexpr
26#endif
28
29namespace QUnicodeTools {
30
31// -----------------------------------------------------------------------------------------------------
32//
33// The text boundaries determination algorithm.
34// See https://www.unicode.org/reports/tr29/tr29-37.html
35//
36// -----------------------------------------------------------------------------------------------------
37
38namespace GB {
39
40// This table is indexed by the grapheme break classes of two
41// (adjacent) code points.
42// The class of the first code point selects an entry.
43// If the entry's bit at position second_cp_class is set
44// (in other words: if entry & (1u << second_cp_class) is non-zero)
45// then there is NO grapheme break between the two code points.
46
48
49// Check that we have enough bits in the table (in case
50// NumGraphemeBreakClasses grows too much).
52 "Internal error: increase the size in bits of GBTableEntryType");
53
54// GB9, GB9a
59
60static const GBTableEntryType HardBreak = 0u;
61
63 Extend_SpacingMark_ZWJ, // Any
65 HardBreak, // LF
66 HardBreak, // Control
67 Extend_SpacingMark_ZWJ, // Extend
68 Extend_SpacingMark_ZWJ, // ZWJ
69 Extend_SpacingMark_ZWJ, // RegionalIndicator
80 ), // Prepend
81 Extend_SpacingMark_ZWJ, // SpacingMark
87 ), // L
91 ), // V
94 ), // T
98 ), // LV
101 ), // LVT
102 Extend_SpacingMark_ZWJ // Extended_Pictographic
103};
104
107{
// A set bit for `second` in breakTable[first] means "no break", so a break
// is reported exactly when that bit is clear.
108 return (breakTable[first] & FLAG(second)) == 0;
109}
110
111// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
112// so we need to store some local state.
113enum class State : uchar {
115 GB11_ExtPicExt, // saw an Extend after an Extended_Pictographic
116 GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
117 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
118};
119
120} // namespace GB
121
// Marks grapheme cluster boundaries for the UTF-16 text `string` of `len`
// code units.
//
// The text is walked one code point at a time. For each code point, the pair
// (class of the previous code point, class of this one) is looked up via
// GB::shouldBreakBetweenClasses(); the small GB::State machine then overrides
// that answer for the rules the table cannot express. When a break is allowed,
// attributes[pos].graphemeBoundary is set, where pos is the index of the code
// point's first code unit.
//
// attributes[len] is written as well, so the array must hold at least
// len + 1 entries.
// NOTE(review): the declaration/initial value of `lcls` is not visible here.
122static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
123{
125 GB::State state = GB::State::Normal;
126 QStringIterator it(QStringView{string, len});
127 while (it.hasNext()) {
// Remember where this code point starts before the iterator advances.
128 const qsizetype pos = it.index();
129 const char32_t ucs4 = it.nextOrRawCodeUnit();
130
131 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
132 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
133
// Table-driven answer first; `handled` records whether the state machine
// below already took care of this code point.
134 bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
135 bool handled = false;
136
137 switch (state) {
139 break; // will deal with it below
140
142 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
144 // keep going in the current state
145 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
146 handled = true;
147 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
149 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
150 handled = true;
151 } else {
152 state = GB::State::Normal;
153 }
154 break;
155
157 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
159 shouldBreak = false;
160 handled = true;
161 }
162
163 state = GB::State::Normal;
164 break;
165
169 shouldBreak = false;
170 handled = true;
171 }
172
173 state = GB::State::Normal;
174 break;
175 }
176
// Not consumed by a multi-code-point rule: we must be back in the Normal
// state, and this code point may itself start a new special sequence.
177 if (!handled) {
178 Q_ASSERT(state == GB::State::Normal);
182 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
183 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
185 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
186 }
187 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
188 state = GB::State::GB12_13_RI;
189 }
190 }
191
192 if (shouldBreak)
193 attributes[pos].graphemeBoundary = true;
194
// The current class becomes the "previous" class for the next iteration.
195 lcls = cls;
196 }
197
198 attributes[len].graphemeBoundary = true; // GB2
199}
200
201
202namespace WB {
203
210
212// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
213 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
214 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
215 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
216 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
217 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
218 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
219 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
220 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
221 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
224 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
225 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
226 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
227 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
228 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
231 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
232};
233
234} // namespace WB
235
// Marks word boundaries for the UTF-16 text `string` of `len` code units.
//
// For each code point the action WB::breakTable[cls][ncls] is looked up
// (cls: class of the preceding context, ncls: class of the current code
// point) and then adjusted by the special cases handled in the switch below.
// On a break, attributes[pos].wordBreak is set, and wordEnd / wordStart are
// set depending on the kind of word being left and entered.
//
// attributes[len] is written as well, so the array must hold at least
// len + 1 entries.
// NOTE(review): the declaration/initial value of `cls` is not visible here.
236static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
237{
// Kind of word we are currently inside, used to decide whether a break also
// ends a word.
238 enum WordType {
239 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
240 } currentWordType = WordTypeNone;
241
243 auto real_cls = cls; // Unaffected by WB4
244
245 QStringIterator it(QStringView{string, len});
246 while (it.hasNext()) {
247 const qsizetype pos = it.index();
248 const char32_t ucs4 = it.nextOrRawCodeUnit();
249
250 const auto prop = QUnicodeTables::properties(ucs4);
251 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
253 // As of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet,
254 // which caused "hi.there" to be treated as if it were just a single word.
255 // We keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator,
256 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
257 if (ucs4 == 0x002E) // FULL STOP
259 else if (ucs4 == 0x003A) // COLON
261 }
262
263 uchar action = WB::breakTable[cls][ncls];
264 switch (action) {
265 case WB::Break:
266 if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
267 && prop->graphemeBreakClass
269 // WB3c: ZWJ × \p{Extended_Pictographic}
270 action = WB::NoBreak;
271 }
272 break;
273 case WB::NoBreak:
275 // WB4: X(Extend|Format)* -> X
// Only real_cls is updated; `continue` skips the `cls = ncls` below so
// the context class stays that of X.
276 real_cls = ncls;
277 continue;
278 }
279 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
280 // WB15/WB16: break between pairs of Regional indicator
282 }
283 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
284 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
285 // WB3d should not be affected by WB4
286 action = WB::Break;
287 }
288 break;
289 case WB::Lookup:
290 case WB::LookupW:
// The decision depends on what follows: scan ahead on a copy of the
// iterator. If a matching class is found, the main iterator jumps to the
// lookahead position and no break is placed.
291 for (auto lookahead = it; lookahead.hasNext(); /**/) {
292 const char32_t ucs4 = lookahead.nextOrRawCodeUnit();
293
294 const auto prop = QUnicodeTables::properties(ucs4);
295 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
296
298 // WB4: X(Extend|Format)* -> X
299 continue;
300 }
301
302 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
303 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
304 it = lookahead;
305 ncls = tcls;
306 action = WB::NoBreak;
307 }
308 break;
309 }
310 if (action != WB::NoBreak) {
311 action = WB::Break;
313 action = WB::NoBreak; // WB7a
314 }
315 break;
316 }
317
318 cls = ncls;
319 real_cls = ncls;
320
321 if (action == WB::Break) {
322 attributes[pos].wordBreak = true;
// A break while inside a word also ends that word.
323 if (currentWordType != WordTypeNone)
324 attributes[pos].wordEnd = true;
// Decide whether the code point after the break starts a new word.
325 switch (cls) {
327 currentWordType = WordTypeHiraganaKatakana;
328 attributes[pos].wordStart = true;
329 break;
333 currentWordType = WordTypeAlphaNumeric;
334 attributes[pos].wordStart = true;
335 break;
336 default:
337 currentWordType = WordTypeNone;
338 break;
339 }
340 }
341 }
342
// Close a word that runs to the end of the text.
343 if (currentWordType != WordTypeNone)
344 attributes[len].wordEnd = true;
345 attributes[len].wordBreak = true; // WB2
346}
347
348
349namespace SB {
350
367
369// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
373
374 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
375 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
376 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
377 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
378
379 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
380 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
381 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
382 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
383 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
384};
385
386} // namespace SB
387
// Marks sentence boundaries for the UTF-16 text `string` of `len` code units.
//
// Runs a table-driven state machine: the next state is
// SB::breakTable[state][class of current code point]. The pseudo-state
// SB::Lookup triggers a lookahead (SB8); the pseudo-state SB::Break sets
// attributes[pos].sentenceBoundary and restarts the machine from SB::Initial
// with the current class.
//
// attributes[len] is written as well, so the array must hold at least
// len + 1 entries.
388static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
389{
390 uchar state = SB::BAfter; // to meet SB1
391
392 QStringIterator it(QStringView{string, len});
393 while (it.hasNext()) {
394 const qsizetype pos = it.index();
395 const char32_t ucs4 = it.nextOrRawCodeUnit();
396
397 const auto prop = QUnicodeTables::properties(ucs4);
398 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
399
// `state` indexes the table rows, so it must be a real state here and not
// one of the pseudo-states.
400 Q_ASSERT(state <= SB::BAfter);
401 state = SB::breakTable[state][ncls];
402 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
// Assume a break unless the lookahead below finds a reason not to.
403 state = SB::Break;
404 for (auto lookahead = it; lookahead.hasNext(); /**/) {
405 const char32_t ucs4 = lookahead.nextOrRawCodeUnit();
406
407 const auto prop = QUnicodeTables::properties(ucs4);
408 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
409 switch (tcls) {
416 continue;
// No break: resume the main loop after the code points consumed by the
// lookahead.
418 it = lookahead;
419 state = SB::Initial;
420 break;
421 default:
422 break;
423 }
424 break;
425 }
426 }
427 if (Q_UNLIKELY(state == SB::Break)) {
428 attributes[pos].sentenceBoundary = true;
// Start the next sentence: feed the current class into the Initial row.
429 state = SB::breakTable[SB::Initial][ncls];
430 }
431 }
432
433 attributes[len].sentenceBoundary = true; // SB2
434}
435
436
437// -----------------------------------------------------------------------------------------------------
438//
439// The line breaking algorithm.
440// See http://www.unicode.org/reports/tr14/tr14-39.html
441//
442// -----------------------------------------------------------------------------------------------------
443
444namespace LB {
445
446namespace NS { // Number Sequence
447
448// This namespace is used to implement LB25 which, as of Unicode 16, has this
449// definition:
450// NU ( SY | IS )* CL × PO
451// NU ( SY | IS )* CP × PO
452// NU ( SY | IS )* CL × PR
453// NU ( SY | IS )* CP × PR
454// NU ( SY | IS )* × PO
455// NU ( SY | IS )* × PR
456// PO × OP NU
457// PO × OP IS NU
458// PO × NU
459// PR × OP NU
460// PR × OP IS NU
461// PR × NU
462// HY × NU
463// IS × NU
464// NU ( SY | IS )* × NU
465
// Actions produced while tracking a number sequence. The line-break loop
// switches on these values.
466enum Action {
471 NeedOPNU, // Like Start, but must be followed by sequence `(OP (IS)?)? NU`
472 // These are 'synthetic' actions and are not used in the table but are
473 // tracked otherwise in the code for LB25, to track the state of specific
474 // sequences:
475 CNeedNU, // Like Continue, but must be followed by NU
476 CNeedISNU, // Like Continue, but must be followed by IS? NU
477};
478
489
// Action lookup, indexed as actionTable[previous class][current class].
// Rows are labelled by the trailing comment, columns by the header line.
490static const uchar actionTable[CLCP + 1][CLCP + 1] = {
491// XX PRPO OP HY NU SY IS CLCP
492 { None , NeedOPNU, Start , None , Start , None , None , None }, // XX
493 { None , NeedOPNU, Continue, Break , Start , None , None , None }, // PRPO
494 { None , Start , Start , Break , Continue, None , Continue, None }, // OP
495 { None , None , None , Start , Continue, None , None , None }, // HY
499 { Break , Continue, Break , Break , Break , Break , Break , Break }, // CLCP
500};
501
503{
// Map a line break class onto the reduced class set used by actionTable.
// Anything without a dedicated case falls through to XX.
504 switch (lbc) {
506 return PRPO;
508 return OP;
510 return HY;
512 return NU;
514 return SY;
516 return IS;
518 return CLCP;
519 default:
520 break;
521 }
522 return XX;
523}
524
525} // namespace NS
526
527namespace BRS { // Brahmic Sequence, used to implement LB28a
// U+25CC DOTTED CIRCLE, written [◌] in the rule comments below; it is matched
// by code point rather than by line break class.
528 constexpr char32_t DottedCircle = U'\u25CC';
529
530 // The LB28a_{n} value maps to the 'regex' on the nth line in LB28a
531 // The only special case is LB28a_2VI which is a direct match to the 2nd
532 // line, but it also leads to LB28a_3VIAK, the 3rd line.
533 enum State {
535 Start, // => Have: `(AK | [◌] | AS)`
536 LB28a_2VF, // => Have: `(AK | [◌] | AS) VF`
537 LB28a_2VI, // => Have: `(AK | [◌] | AS) VI` May find: `(AK | [◌])`
538 LB28a_3VIAK, // => Have: `(AK | [◌] | AS) VI (AK | [◌])`
539 LB28a_4, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS)` May find: `VF`
540 LB28a_4VF, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS) VF`
542 };
552 {
// Computes the state that follows `state` once the unit `lb` (line break
// class plus code point) has been seen. Returns None when `lb` does not
// continue any sequence.
553 using LBC = QUnicodeTables::LineBreakClass;
// Combining marks do not affect the sequence: keep the current state.
554 if (lb.lbc == LBC::LineBreak_CM)
555 return state;
556
557 switch (state) {
558 case Start:
559 if (lb.lbc == LBC::LineBreak_VF)
560 return LB28a_2VF;
561 if (lb.lbc == LBC::LineBreak_VI)
562 return LB28a_2VI;
563 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
564 || lb.lbc == LBC::LineBreak_AS)
565 return LB28a_4;
566 break;
567 case LB28a_2VI:
568 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK)
569 return LB28a_3VIAK;
570 break;
571 case LB28a_4:
572 if (lb.lbc == LBC::LineBreak_VF)
573 return LB28a_4VF;
574 // Had (AK | [◌] | AS) (AK | [◌] | AS), which could mean the 2nd capture is the start
575 // of a new sequence, so we need to check if it makes sense.
576 return Restart;
577 case None:
578 if (Q_UNLIKELY(lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
579 || lb.lbc == LBC::LineBreak_AS)) {
580 return Start;
581 }
582 break;
583 case LB28a_2VF:
584 case LB28a_4VF:
585 case LB28a_3VIAK:
586 case Restart:
587 // These are all terminal states, so no need to update
// The caller is expected to have replaced these states before calling
// again; reaching this point is treated as a programming error.
588 Q_UNREACHABLE();
589 }
590 return None;
591 }
592}
593
604
605// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
606// about the table. It was removed in the later versions of the standard.
608/* 1↓ 2→ OP CL CP QU +Pi +Pf +19 GL NS EX SY IS PR PO NU AL HL ID IN HY +WS BA +WS HYBA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM AK AP AS VI VF*/
609/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
610/* CL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
611/* CP */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
612/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
613/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
614/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
615/* +19*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
616/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
617/* NS */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
618/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
619/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
620/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DN, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
621/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, DB, DB, DB },
622/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
623/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
624/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
625/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, CI, CI, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
626/* ID */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
627/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
628/* HY */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
629/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
630/* BA */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
631/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
632/*HYBA*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, DB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
633/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB },
634/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
635/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
636/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
637/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
638/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
639/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
640/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
641/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
642/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
643/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB },
644/* CB */ { DB, PB, PB, IB, IB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
645/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB },
646/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
647/* AK */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
648/* AP */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, IB, DB, DB },
649/* AS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
650/* VI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
651/* VF */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
652};
653
654// The following line break classes are not treated by the pair table
655// and must be resolved outside:
656// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX, ZWJ
657
658} // namespace LB
659
660static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
661{
662 qsizetype nestart = 0;
663 LB::NS::Class nelast = LB::NS::XX;
664 LB::NS::Action neactlast = LB::NS::None;
665
666 LB::BRS::ParseState brsState;
667
669 QUnicodeTables::LineBreakClass cls = lcls;
670 const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(U'\n');
671
672 constexpr static auto isEastAsian = [](QUnicodeTables::EastAsianWidth eaw) {
673 using EAW = QUnicodeTables::EastAsianWidth;
674 return eaw == EAW::W || eaw == EAW::F || eaw == EAW::H;
675 };
676
677 for (qsizetype i = 0; i != len; ++i) {
678 qsizetype pos = i;
679 char32_t ucs4 = string[i];
680 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
681 ushort low = string[i + 1];
682 if (QChar::isLowSurrogate(low)) {
683 ucs4 = QChar::surrogateToUcs4(ucs4, low);
684 ++i;
685 }
686 }
687
688 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
689 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
691
693 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
695 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
696 ) {
697 // LB27: use SPACE for line breaking
698 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
699 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
700 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
702 } else {
703 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
704 // LB1: resolve SA to AL, except that characters with category Mn or Mc are resolved to CM
705 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
706 if (FLAG(prop->category) & test)
708 }
709 }
710 }
711
712 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
713 // LB1: resolve SA to AL, except that characters with category Mn or Mc are resolved to CM
714 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
715 if (FLAG(prop->category) & test)
717 }
718
719 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
720 if (prop->category == QChar::Punctuation_InitialQuote) {
721 // LB15a: Do not break after an unresolved initial punctuation
722 // that lies at the start of the line, after a space, after
723 // opening punctuation, or after an unresolved quotation mark,
724 // even after spaces.
725 // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
726 // [\p{Pi}&QU] SP* ×
727 // Note: sot is treated as LF here due to initial loop setup.
728 constexpr QUnicodeTables::LineBreakClass lb15a[] = {
734 if (std::any_of(std::begin(lb15a), std::end(lb15a),
735 [lcls](auto x) { return x == lcls; })) {
737 }
738 } else if (prop->category == QChar::Punctuation_FinalQuote) {
739 // LB15b: Do not break before an unresolved final punctuation
740 // that lies at the end of the line, before a space, before
741 // a prohibited break, or before an unresolved quotation mark,
742 // even after spaces.
743 // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
744 // | SY | BK | CR | LF | NL | ZW | eot)
745 const auto nncls = [&] {
746 if (i + 1 >= len)
748 char32_t c = string[i + 1];
749 if (QChar::isHighSurrogate(c) && i + 2 < len) {
750 ushort low = string[i + 2];
751 if (QChar::isLowSurrogate(low))
752 c = QChar::surrogateToUcs4(c, low);
753 else
754 return QUnicodeTables::LineBreak_SG; // all surrogates
755 }
756 return QUnicodeTables::lineBreakClass(c);
757 }();
758
759 constexpr QUnicodeTables::LineBreakClass lb15b[] = {
768 if (std::any_of(std::begin(lb15b), std::end(lb15b),
769 [nncls](auto x) { return x == nncls; })) {
771 }
772 }
773 }
774
775 if (Q_UNLIKELY((lcls >= QUnicodeTables::LineBreak_SP || lcls == QUnicodeTables::LineBreak_ZW
778 && (ncls == QUnicodeTables::LineBreak_HY || ucs4 == u'\u2010'))) {
779 // LB20a: Do not break after a word-initial hyphen.
780 // ( sot | BK | CR | LF | NL | SP | ZW | CB | GL ) ( HY | [\u2010] ) × AL
781
782 // Remap to the synthetic class WS_* (whitespace+*), which is just
783 // like the current respective linebreak class but with an IB action
784 // if the next class is AL.
785 if (ucs4 == u'\u2010')
787 else
789 }
790
791 if (Q_UNLIKELY(cls == QUnicodeTables::LineBreak_AP && ucs4 == LB::BRS::DottedCircle)) {
792 // LB28a: Do not break inside the orthographic syllables of Brahmic scripts
793 // AP × (AK | [◌] | AS)
794 // @note: AP × (AK | AS) is checked by the breakTable
795 goto next;
796 }
797 while (true) { // May need to recheck once.
798 // LB28a cont'd
799 LB::BRS::State oldState = brsState.state;
800 brsState.state = LB::BRS::updateState(brsState.state, {ncls, ucs4});
801 if (Q_LIKELY(brsState.state == oldState))
802 break;
803 switch (brsState.state) {
804 case LB::BRS::Start:
805 brsState.start = i;
806 break;
807 case LB::BRS::LB28a_2VI: // Wait for more characters, but also valid sequence
808 // We may get another character, but this is already a complete
809 // sequence that should not have any breaks:
810 for (qsizetype j = brsState.start + 1; j < i; ++j)
811 attributes[j].lineBreak = false;
812 // No need to mark this sequence again later, so move 'start'
813 // up to the current position:
814 brsState.start = i;
815 goto next;
816 case LB::BRS::Restart:
817 // The previous character was possibly the start of a new sequence
818 brsState.state = LB::BRS::Start;
819 brsState.start = pos - 1;
820 continue; // Doing the loop again!
824 for (qsizetype j = brsState.start + 1; j < i; ++j)
825 attributes[j].lineBreak = false;
826 if (brsState.state == LB::BRS::LB28a_3VIAK) {
827 // This might be the start of a new sequence
828 brsState.state = LB::BRS::Start;
829 brsState.start = i;
830 } else {
831 brsState.state = LB::BRS::None;
832 }
833 goto next;
834 case LB::BRS::LB28a_4: // Wait for more characters
835 Q_LIKELY_BRANCH
836 case LB::BRS::None: // Nothing to do
837 break;
838 }
839 break;
840 }
841
842 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_IS)) {
843 // LB15c Break before a decimal mark that follows a space, for instance, in
844 // ‘subtract .5’.
845 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_SP)) {
846 if (i + 1 < len) {
847 constexpr char32_t Invalid = ~U'\0';
848 char32_t ch = string[i + 1];
849 if (QChar::isHighSurrogate(ch) && i + 2 < len) {
850 ushort low = string[i + 2];
851 if (QChar::isLowSurrogate(low))
852 ch = QChar::surrogateToUcs4(ch, low);
853 else
854 ch = Invalid;
855 }
856 if (ch != Invalid // surrogates won't match (ensured by util/unicode)
857 && QUnicodeTables::lineBreakClass(ch) == QUnicodeTables::LineBreak_NU) {
858 attributes[pos].lineBreak = true;
859 goto next;
860 }
861 }
862 }
863 }
864
865 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_HL)) {
866 // LB21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
867 // HL (HY | [ BA - $EastAsian ]) × [^HL]
868 auto eaw = QUnicodeTables::EastAsianWidth(prop->eastAsianWidth);
869 const bool isNonEaBA = ncls == QUnicodeTables::LineBreak_BA && !isEastAsian(eaw);
870 if (isNonEaBA || ncls == QUnicodeTables::LineBreak_HY) {
871 // Remap to synthetic HYBA class which handles the next
872 // character. Generally (LB21) there are no breaks before
873 // HY or BA, so we can skip ahead to the next character.
875 goto next;
876 }
877 }
878
879 // LB25: do not break lines inside numbers
880 {
881 LB::NS::Class necur = LB::NS::toClass(ncls);
882 LB::NS::Action neact = LB::NS::Action(LB::NS::actionTable[nelast][necur]);
883 if (Q_UNLIKELY(neactlast == LB::NS::CNeedNU && necur != LB::NS::NU)) {
884 neact = LB::NS::None;
885 } else if (Q_UNLIKELY(neactlast == LB::NS::NeedOPNU)) {
886 if (necur == LB::NS::OP)
887 neact = LB::NS::CNeedISNU;
888 else if (necur == LB::NS::NU)
889 neact = LB::NS::Continue;
890 else // Anything else and we ignore the sequence
891 neact = LB::NS::None;
892 } else if (Q_UNLIKELY(neactlast == LB::NS::CNeedISNU)) {
893 if (necur == LB::NS::IS)
894 neact = LB::NS::CNeedNU;
895 else if (necur == LB::NS::NU)
896 neact = LB::NS::Continue;
897 else // Anything else and we ignore the sequence
898 neact = LB::NS::None;
899 }
900 switch (neact) {
901 case LB::NS::Break:
902 // do not change breaks before and after the expression
903 for (qsizetype j = nestart + 1; j < pos; ++j)
904 attributes[j].lineBreak = false;
905 Q_FALLTHROUGH();
906 Q_LIKELY_BRANCH
907 case LB::NS::None:
908 nelast = LB::NS::XX; // reset state
909 break;
910 case LB::NS::NeedOPNU:
911 case LB::NS::Start:
912 if (neactlast == LB::NS::Start || neactlast == LB::NS::Continue) {
913 // Apply the linebreaks for the previous stretch; we need to start a new one
914 for (qsizetype j = nestart + 1; j < pos; ++j)
915 attributes[j].lineBreak = false;
916 }
917 nestart = i;
918 Q_FALLTHROUGH();
919 case LB::NS::CNeedNU:
921 case LB::NS::Continue:
922 nelast = necur;
923 break;
924 }
925 neactlast = neact;
926 }
927
928 // LB19a Unless surrounded by East Asian characters, do not break either side of any
929 // unresolved quotation marks
930 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU
932 && lcls != QUnicodeTables::LineBreak_ZW)) {
933 using EAW = QUnicodeTables::EastAsianWidth;
934 constexpr static auto nextCharNonEastAsian = [](const char16_t *string, qsizetype len) {
935 if (len > 0) {
936 char32_t nch = string[0];
937 if (QChar::isHighSurrogate(nch) && len > 1) {
938 char16_t low = string[1];
939 if (QChar::isLowSurrogate(low))
940 nch = QChar::surrogateToUcs4(char16_t(nch), low);
941 }
942 const auto *nextProp = QUnicodeTables::properties(nch);
944 nextProp->lineBreakClass);
945 QUnicodeTables::EastAsianWidth neaw = EAW(nextProp->eastAsianWidth);
946 return nncls != QUnicodeTables::LineBreak_CM
947 && nncls <= QUnicodeTables::LineBreak_SP
948 && !isEastAsian(neaw);
949 }
950 return true; // end-of-text counts as non-East-Asian
951 };
952 if (Q_UNLIKELY(!isEastAsian(EAW(lastProp->eastAsianWidth))
953 || nextCharNonEastAsian(string + i + 1, len - i - 1))) {
954 // Remap to the synthetic QU_19 class which has indirect breaks
955 // for most following classes.
957 }
958 }
959
960 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
961 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
962 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
963 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
964 goto next;
965 }
966
967 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
969 goto next; // LB6: x(BK|CR|LF|NL)
970 goto next_no_cls_update; // LB7: xSP
971 }
972
973 // LB19 - do not break before non-initial unresolved quotation marks, or after non-final
974 // unresolved quotation marks
975 if (Q_UNLIKELY(((ncls == QUnicodeTables::LineBreak_QU
976 || ncls == QUnicodeTables::LineBreak_QU_19)
977 && prop->category != QChar::Punctuation_InitialQuote)
978 || (cls == QUnicodeTables::LineBreak_QU
979 && lastProp->category != QChar::Punctuation_FinalQuote))) {
980 // Make sure the previous character is not one that we have to break after.
981 // Also skip if ncls is CM so it can be treated as lcls (LB9)
983 && ncls != QUnicodeTables::LineBreak_CM) {
984 goto next;
985 }
986 }
987
988 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
989 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
991 // don't update anything
992 goto next_no_cls_update;
993 }
994
995 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
996 // LB8a: ZWJ x
997 goto next;
998 }
999
1000 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
1001 // LB30a
1003 goto next;
1004 }
1005
1006 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
1007 && lastProp->category == QChar::Other_NotAssigned
1008 && lastProp->graphemeBreakClass
1009 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
1010 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
1011 goto next;
1012 }
1013
1014 // for South East Asian chars that require a complex analysis, the Unicode
1015 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
1016 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
1018
1019 tcls = cls;
1020
1021 constexpr static auto remapToAL = [](QUnicodeTables::LineBreakClass &c, auto &property) {
1022 if (Q_UNLIKELY(c == QUnicodeTables::LineBreak_CM
1025 property = QUnicodeTables::properties(U'\u0041');
1026 }
1027 };
1028 // LB10 Treat any remaining combining mark or ZWJ as AL,
1029 // as if it had the properties of U+0041 A LATIN CAPITAL LETTER
1030 remapToAL(tcls, lastProp);
1031 remapToAL(ncls, prop);
1032
1034 case LB::DirectBreak:
1035 attributes[pos].lineBreak = true;
1036 break;
1037 case LB::IndirectBreak:
1038 if (lcls == QUnicodeTables::LineBreak_SP)
1039 attributes[pos].lineBreak = true;
1040 break;
1042 if (lcls != QUnicodeTables::LineBreak_SP)
1043 goto next_no_cls_update;
1044 attributes[pos].lineBreak = true;
1045 break;
1047 if (lcls != QUnicodeTables::LineBreak_SP)
1048 goto next_no_cls_update;
1049 break;
1050 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
1051 if (lcls != QUnicodeTables::LineBreak_HL)
1052 attributes[pos].lineBreak = true;
1053 break;
1055 using EAW = QUnicodeTables::EastAsianWidth;
1056 switch (EAW(prop->eastAsianWidth)) {
1057 default:
1058 if (lcls != QUnicodeTables::LineBreak_SP)
1059 break;
1060 Q_FALLTHROUGH();
1061 case QUnicodeTables::EastAsianWidth::F:
1062 case QUnicodeTables::EastAsianWidth::W:
1063 case QUnicodeTables::EastAsianWidth::H:
1064 attributes[pos].lineBreak = true;
1065 break;
1066 }
1067 break;
1068 case LB::DirectBreakOutsideNumericSequence:
1069 if (neactlast == LB::NS::None || neactlast > LB::NS::Break)
1070 attributes[pos].lineBreak = true;
1071 break;
1073 // nothing to do
1074 default:
1075 break;
1076 }
1077
1078 next:
1080 cls = ncls;
1081 lastProp = prop;
1082 }
1083 next_no_cls_update:
1084 lcls = ncls;
1085 }
1086
1087 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
1088 // LB25: do not break lines inside numbers
1089 for (qsizetype j = nestart + 1; j < len; ++j)
1090 attributes[j].lineBreak = false;
1091 }
1092
1093 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
1094 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
1095}
1096
1097
1098static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1099{
1100 QStringIterator it(QStringView{string, len});
1101 while (it.hasNext()) {
1102 const auto pos = it.index();
1103 if (Q_UNLIKELY(QChar::isSpace(it.nextOrRawCodeUnit())))
1104 attributes[pos].whiteSpace = true;
1105 }
1106}
1107
1108namespace Tailored {
1109
1110using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
1111
1112
1127
1128static const unsigned char indicForms[0xe00-0x900] = {
1129 // Devanagari
1134
1139
1144
1149
1154
1159
1164
1169
1170 // Bengali
1175
1180
1185
1190
1195
1200
1205
1210
1211 // Gurmukhi
1216
1221
1226
1231
1236
1241
1246
1251
1252 // Gujarati
1257
1262
1267
1272
1277
1282
1287
1292
1293 // Oriya
1298
1303
1308
1313
1318
1323
1328
1333
1334 //Tamil
1339
1344
1349
1354
1359
1364
1369
1374
1375 // Telugu
1380
1385
1390
1395
1400
1405
1410
1415
1416 // Kannada
1421
1426
1431
1436
1441
1446
1451
1456
1457 // Malayalam
1462
1467
1472
1477
1482
1487
1492
1497
1498 // Sinhala
1503
1508
1513
1518
1523
1528
1533
1538};
1539
1540static inline Form form(unsigned short uc) {
1541 if (uc < 0x900 || uc > 0xdff) {
1542 if (uc == 0x25cc)
1543 return Consonant;
1544 if (uc == 0x200c || uc == 0x200d)
1545 return Control;
1546 return Other;
1547 }
1548 return (Form)indicForms[uc-0x900];
1549}
1550
1551// #define INDIC_DEBUG
1552#ifdef INDIC_DEBUG
1553#define IDEBUG qDebug
1554#else
1555#define IDEBUG if constexpr (1) ; else qDebug
1556#endif
1557
1558/* syllables are of the form:
1559
1560 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1561 (Consonant Nukta? Halant)* Consonant Halant
1562 IndependentVowel VowelMark? StressMark?
1563
1564 We return syllable boundaries on invalid combinations as well
1565*/
1566static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1567{
1568 *invalid = false;
1569 IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1570 const char16_t *uc = s+start;
1571
1572 qsizetype pos = 0;
1573 Form state = form(uc[pos]);
1574 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1575 pos++;
1576
1577 if (state != Consonant && state != IndependentVowel) {
1578 if (state != Other)
1579 *invalid = true;
1580 goto finish;
1581 }
1582
1583 while (pos < end - start) {
1584 Form newState = form(uc[pos]);
1585 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1586 switch (newState) {
1587 case Control:
1588 newState = state;
1589 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1590 break;
1591 // the control character should be the last char in the item
1592 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1593 break;
1594 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1595 break;
1596 // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1597 ++pos;
1598 goto finish;
1599 case Consonant:
1600 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1601 break;
1602 goto finish;
1603 case Halant:
1604 if (state == Nukta || state == Consonant)
1605 break;
1606 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1607 if (script == QChar::Script_Bengali && pos == 1 &&
1608 (uc[0] == 0x0985 || uc[0] == 0x098f))
1609 break;
1610 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1611 if (script == QChar::Script_Sinhala && state == Matra) {
1612 ++pos;
1613 continue;
1614 }
1615 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1616 ++pos;
1617 continue;
1618 }
1619 goto finish;
1620 case Nukta:
1621 if (state == Consonant)
1622 break;
1623 goto finish;
1624 case StressMark:
1625 if (state == VowelMark)
1626 break;
1627 Q_FALLTHROUGH();
1628 case VowelMark:
1629 if (state == Matra || state == LengthMark || state == IndependentVowel)
1630 break;
1631 Q_FALLTHROUGH();
1632 case Matra:
1633 if (state == Consonant || state == Nukta)
1634 break;
1635 if (state == Matra) {
1636 // ### needs proper testing for correct two/three part matras
1637 break;
1638 }
1639 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1640 // it work for all Indic languages?
1641 // the combination Independent_A + Vowel Sign AA is allowed.
1642 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1643 break;
1644 if (script == QChar::Script_Tamil && state == Matra) {
1645 if (uc[pos-1] == 0x0bc6 &&
1646 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1647 break;
1648 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1649 break;
1650 }
1651 goto finish;
1652
1653 case LengthMark:
1654 if (state == Matra) {
1655 // ### needs proper testing for correct two/three part matras
1656 break;
1657 }
1658 Q_FALLTHROUGH();
1659 case IndependentVowel:
1660 case Invalid:
1661 case Other:
1662 goto finish;
1663 }
1664 state = newState;
1665 pos++;
1666 }
1667 finish:
1668 return pos+start;
1669}
1670
1671static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1672{
1673 qsizetype end = from + len;
1674 attributes += from;
1675 qsizetype i = 0;
1676 while (i < len) {
1677 bool invalid;
1678 qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1679 attributes[i].graphemeBoundary = true;
1680
1681 if (boundary > len-1) boundary = len;
1682 i++;
1683 while (i < boundary) {
1684 attributes[i].graphemeBoundary = false;
1685 ++i;
1686 }
1687 assert(i == boundary);
1688 }
1689
1690
1691}
1692
1693#if QT_CONFIG(library)
1694
1695#define LIBTHAI_MAJOR 0
1696
1697/*
1698 * if libthai changed please update these codes too.
1699 */
1700struct thcell_t {
1701 unsigned char base; /**< base character */
1702 unsigned char hilo; /**< upper/lower vowel/diacritic */
1703 unsigned char top; /**< top-level mark */
1704};
1705
1706using ThBrk = struct _ThBrk;
1707
1708namespace {
1709
1710class LibThai final
1711{
1713
1714 using th_brk_new_def = ThBrk *(*)(const char *);
1715 using th_brk_delete_def = void (*)(ThBrk *);
1716 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1717 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1718
1719public:
1720 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1721 {
1723 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks"));
1724 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell"));
1725
1726 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new"));
1727 if (th_brk_new) {
1728 m_state = th_brk_new(nullptr);
1730 reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete"));
1731 }
1732 }
1733
1734 ~LibThai()
1735 {
1736 if (m_state && m_th_brk_delete)
1738 m_library.unload();
1739 }
1740
1741 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1742
1743 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1744 {
1748 }
1749
1750 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1751 {
1754 }
1755
1756private:
1758
1759 // Global state for th_brk_find_breaks().
1760 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1761 // state is read-only, and so it is safe to use it from multiple threads after
1762 // initialization. This is also stated in the libthai documentation.
1763 ThBrk *m_state = nullptr;
1764
1768};
1769
1770} // unnamed namespace
1771
1773
// Converts UTF-16 text to the single-byte representation handed to libthai.
//
// string/len: input code units.
// cstr:       output buffer; receives len bytes plus a terminating 0, so it
//             must provide room for at least len + 1 bytes.
//
// Mapping per code unit:
//   <= 0x00a0          -> copied unchanged
//   0x0e01 .. 0x0e5b   -> 0xa1 .. 0xfb
//   anything else      -> 0xff
static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
{
    auto *out = reinterpret_cast<unsigned char *>(cstr);

    for (qsizetype idx = 0; idx < len; ++idx) {
        const char16_t ch = string[idx];

        // Same encoding as libthai uses for invalid chars
        unsigned char tis = static_cast<unsigned char>(~0);
        if (ch <= 0xa0)
            tis = static_cast<unsigned char>(ch);
        else if (ch >= 0xe01 && ch <= 0xe5b)
            tis = static_cast<unsigned char>(ch - 0xe00 + 0xa0);

        out[idx] = tis;
    }

    out[len] = 0;
}
1790
1791/*
1792 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1793 */
1794static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1795{
1796 constexpr qsizetype Prealloc = 128;
1797 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1800 struct thcell_t tis_cell;
1801
1803 if (!libThai || !libThai->isInitialized())
1804 return;
1805
1806 to_tis620(string, len, s.data());
1807
1808 for (i = 0; i < len; ++i) {
1809 attributes[i].wordBreak = false;
1810 attributes[i].wordStart = false;
1811 attributes[i].wordEnd = false;
1812 attributes[i].lineBreak = false;
1813 }
1814
1815 attributes[0].wordBreak = true;
1816 attributes[0].wordStart = true;
1817 attributes[0].wordEnd = false;
1818 numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()),
1820 static_cast<size_t>(break_positions.size()));
1821 for (i = 0; i < numbreaks; ++i) {
1826 }
1827 if (numbreaks > 0)
1829
1830 /* manage grapheme boundaries */
1831 i = 0;
1832 while (i < len) {
1834 libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i,
1835 size_t(len - i), &tis_cell, true);
1836
1838 for (size_t j = 1; j < cell_length; ++j)
1839 attributes[i + j].graphemeBoundary = false;
1840
1841 i += cell_length;
1842 }
1843}
1844
1845#endif // QT_CONFIG(library)
1846
1847static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1848{
1849 assert(script == QChar::Script_Thai);
1850#if QT_CONFIG(library)
1851 const char16_t *uc = text + from;
1852 attributes += from;
1853 Q_UNUSED(script);
1854 thaiAssignAttributes(uc, len, attributes);
1855#else
1856 Q_UNUSED(script);
1857 Q_UNUSED(text);
1858 Q_UNUSED(from);
1859 Q_UNUSED(len);
1860 Q_UNUSED(attributes);
1861#endif
1862}
1863
1864/*
1865 tibetan syllables are of the form:
1866 head position consonant
1867 first sub-joined consonant
1868 ....intermediate sub-joined consonants (if any)
1869 last sub-joined consonant
1870 sub-joined vowel (a-chung U+0F71)
1871 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1872*/
1873
1881
1882/* this table starts at U+0f40 */
1883static const unsigned char tibetanForm[0x80] = {
1888
1893
1898
1903
1908
1913
1918
1923};
1924
1925#define tibetan_form(c)
1926 ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1927
1928static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1929{
1930 const char16_t *uc = s + start;
1931
1932 qsizetype pos = 0;
1933 TibetanForm state = tibetan_form(*uc);
1934
1935/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1936 pos++;
1937
1938 if (state != TibetanHeadConsonant) {
1939 if (state != TibetanOther)
1940 *invalid = true;
1941 goto finish;
1942 }
1943
1944 while (pos < end - start) {
1945 TibetanForm newState = tibetan_form(uc[pos]);
1946 switch (newState) {
1949 if (state != TibetanHeadConsonant &&
1951 goto finish;
1952 state = newState;
1953 break;
1954 case TibetanVowel:
1955 if (state != TibetanHeadConsonant &&
1956 state != TibetanSubjoinedConsonant &&
1957 state != TibetanSubjoinedVowel)
1958 goto finish;
1959 break;
1960 case TibetanOther:
1962 goto finish;
1963 }
1964 pos++;
1965 }
1966
1967finish:
1968 *invalid = false;
1969 return start+pos;
1970}
1971
1972static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1973{
1974 qsizetype end = from + len;
1975 qsizetype i = 0;
1976 Q_UNUSED(script);
1977 attributes += from;
1978 while (i < len) {
1979 bool invalid;
1980 qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1981
1982 attributes[i].graphemeBoundary = true;
1983
1984 if (boundary > len-1) boundary = len;
1985 i++;
1986 while (i < boundary) {
1987 attributes[i].graphemeBoundary = false;
1988 ++i;
1989 }
1990 assert(i == boundary);
1991 }
1992}
1993
1996 Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
1997 Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
1998 Mymr_CC_NGA = 3, /* Consonant NGA */
1999 Mymr_CC_YA = 4, /* Consonant YA */
2000 Mymr_CC_RA = 5, /* Consonant RA */
2001 Mymr_CC_WA = 6, /* Consonant WA */
2002 Mymr_CC_HA = 7, /* Consonant HA */
2003 Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
2004 Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
2005 Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
2006 Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */
2007 Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, prebase (Vowel u, uu) */
2008 Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, prebase (Vowel i, ii, ai) */
2009 Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, prebase (Vowel aa) */
2013 Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
2014 Mymr_CC_COUNT = 19 /* This is the number of character classes */
2015};
2016
2019
2020 Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2021 Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
2022 Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
2023 Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
2024 Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
2025 first in a syllable */
2026 Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */
2027
2028 /* position flags */
2030 Mymr_CF_POS_BELOW = 0x00040000,
2031 Mymr_CF_POS_ABOVE = 0x00020000,
2032 Mymr_CF_POS_AFTER = 0x00010000,
2033 Mymr_CF_POS_MASK = 0x000f0000,
2034
2036};
2037
2038Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
2039
2040/* Characters that get referred to by name */
2042{
2046 Mymr_C_RA = 0x101B,
2047 Mymr_C_YA = 0x101A,
2048 Mymr_C_NGA = 0x1004,
2051};
2052
2053enum
2054{
2072};
2073
2074
2075typedef int MymrCharClass;
2076
2077
2093
2094static MymrCharClass
2096{
2097 if (ch == Mymr_C_SIGN_ZWJ)
2099
2100 if (ch == Mymr_C_SIGN_ZWNJ)
2102
2103 if (ch < 0x1000 || ch > 0x105f)
2104 return Mymr_CC_RESERVED;
2105
2106 return mymrCharClasses[ch - 0x1000];
2107}
2108
// State transition table for the Myanmar syllable scanner.
//
// Each row is a state and each column a character class (Mymr_CC_COUNT
// columns; the abbreviations in the header line below name the classes).
// myanmar_nextSyllableBoundary() starts in row 0 and, for every character,
// replaces its state with the entry at [state][class of the character].
// A negative entry ends the scan: -1 stops at the current character, and
// -2 additionally steps back by one character (see the note at the end of
// the table).
2109static const signed char mymrStateTable[][Mymr_CC_COUNT] =
2110{
2111/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
2112 { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
2113 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
2114 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
2115 {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
2116 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
2117 {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
2118 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
2119 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
2120 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
2121 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
2122 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
2123 {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
2124 {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
2125 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
2126 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
2127 {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
2128 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
2129 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
2130 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
2131 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
2132 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
2133 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
2134 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
2135 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
2136 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
2137 {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
2138 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
2139 {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
2140/* exit state -2 is for invalid order of medials and combination of invalids
2141 with virama where virama should treat as start of next syllable
2142 */
2143};
2144
2145/*#define MYANMAR_DEBUG */
2146#ifdef MYANMAR_DEBUG
2147#define MMDEBUG qDebug
2148#else
2149# define MMDEBUG
2150 if (0)
2151 printf
2152#endif
2153
2154/*
2155// Given an input string of characters and a location in which to start looking
2156// calculate, using the state table, which one is the last character of the syllable
2157// that starts in the starting position.
2158*/
2159static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2160{
2161 const char16_t *uc = s + start;
2162 int state = 0;
2163 qsizetype pos = start;
2164 *invalid = false;
2165
2166 while (pos < end) {
2167 MymrCharClass charClass = getMyanmarCharClass(*uc);
2168 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
2169 if (pos == start)
2170 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
2171
2172 MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
2173
2174 if (state < 0) {
2175 if (state < -1)
2176 --pos;
2177 break;
2178 }
2179 ++uc;
2180 ++pos;
2181 }
2182 return pos;
2183}
2184
2185static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2186{
2187 qsizetype end = from + len;
2188 qsizetype i = 0;
2189 Q_UNUSED(script);
2190 attributes += from;
2191 while (i < len) {
2192 bool invalid;
2193 qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
2194
2195 attributes[i].graphemeBoundary = true;
2196 attributes[i].lineBreak = true;
2197
2198 if (boundary > len-1)
2199 boundary = len;
2200 i++;
2201 while (i < boundary) {
2202 attributes[i].graphemeBoundary = false;
2203 ++i;
2204 }
2205 assert(i == boundary);
2206 }
2207}
2208
2209/*
2210// Vocabulary
2211// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
2212// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
2213// split vowels, signs... but there is only one base in a syllable, it has to be coded as
2214// the first character of the syllable.
2215// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
2216// Khmer language has five of them. Khmer split vowels either have one part before the
2217// base and one after the base or they have a part before the base and a part above the base.
2218// The first part of all Khmer split vowels is the same character, identical to
2219// the glyph of Khmer dependent vowel SRA EI
2220// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
2221// Differently than indian languages, the coeng modifies the consonant that follows it,
2222// not the one preceding it Each consonant has two forms, the base form and the subscript form
2223// the base form is the normal one (using the consonants code-point), the subscript form is
2224// displayed when the combination coeng + consonant is encountered.
2225// Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
2226// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
2227// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
2228// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
2229// if it is attached to a consonant of the first series or a consonant of the second series
2230// Most consonants have an equivalent in the other series, but some of them exist only in
2231// one series (for example SA). If we want to use the consonant SA with a vowel sound that
2232// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
2233// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
2234// x17C9 y x17CA. TRIISAP changes a first series consonant to second series sound and
2235// MUSIKATOAN a second series consonant to have a first series vowel sound.
2236// Consonant shifters are both normally superscript marks, but, when they are followed by a
2237// superscript, they change shape and take the form of subscript dependent vowel SRA U.
2238// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
2239// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
2240// be placed after the coeng consonant.
2241// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2242// Each vowel has its own position. Only one vowel per syllable is allowed.
2243// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2244// Allowed in a syllable.
2245//
2246//
2247// order is important here! This order must be the same that is found in each horizontal
2248// line in the statetable for Khmer (see khmerStateTable) .
2249*/
2252 CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
2253 CC_CONSONANT2 = 2, /* Consonant of type 2 */
2254 CC_CONSONANT3 = 3, /* Consonant of type 3 */
2255 CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
2257 CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
2258 CC_COENG = 7, /* Subscript consonant combining character */
2262 CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
2263 CC_COUNT = 12 /* This is the number of character classes */
2264};
2265
2266
2268 CF_CLASS_MASK = 0x0000FFFF,
2269
2270 CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2271 CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
2272 CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
2273 CF_COENG = 0x08000000, /* flag to speed up comparing */
2274 CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
2275 CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */
2276
2277 /* position flags */
2278 CF_POS_BEFORE = 0x00080000,
2279 CF_POS_BELOW = 0x00040000,
2280 CF_POS_ABOVE = 0x00020000,
2281 CF_POS_AFTER = 0x00010000,
2282 CF_POS_MASK = 0x000f0000
2283};
2284
2285Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
2286
2287/* Characters that get referred to by name */
2289 C_SIGN_ZWNJ = 0x200C,
2290 C_SIGN_ZWJ = 0x200D,
2291 C_RO = 0x179A,
2292 C_VOWEL_AA = 0x17B6,
2294 C_VOWEL_E = 0x17C1,
2295 C_COENG = 0x17D2
2296};
2297
2298
2299/*
2300// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2301// they are also used to know where a character should be placed (location in reference to the base character)
2302// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2303// indicate error in syllable construction
2304*/
2305enum {
2319
2320 /* split vowel */
2323};
2324
2325
2326/*
2327// Character class: a character class value
2328// ORed with character class flags.
2329*/
// Holds one character class value in its low bits (see CF_CLASS_MASK) ORed
// with any number of character class flags.
using KhmerCharClass = unsigned long;
2331
2332
2333/*
2334// Character class tables
2335// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
2336// _sa Sign placed above the base
2337// _sp Sign placed after the base
2338// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2339// _c2 Consonant of type 2 (only RO)
2340// _c3 Consonant of type 3
2341// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2342// _cd Consonant-shifter
2343// _dl Dependent vowel placed before the base (left of the base)
2344// _db Dependent vowel placed below the base
2345// _da Dependent vowel placed above the base
2346// _dr Dependent vowel placed behind the base (right of the base)
2347// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2348// it to create a subscript consonant or independent vowel
2349// _va Khmer split vowel in which the first part is before the base and the second one above the base
2350// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2351*/
2353 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2354 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2355 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2356 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2357 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2358 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2359};
2360
2361/* this enum must reflect the range of khmerCharClasses */
2366
2367/*
2368// Below we define how a character in the input string is either in the khmerCharClasses table
2369// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2370// within the syllable, but are not in the table) we also get their type back, or an unknown object
2371// in which case we get _xx (CC_RESERVED) back
2372*/
2374{
2375 if (uc == C_SIGN_ZWJ) {
2376 return CC_ZERO_WIDTH_J_MARK;
2377 }
2378
2379 if (uc == C_SIGN_ZWNJ) {
2380 return CC_ZERO_WIDTH_NJ_MARK;
2381 }
2382
2383 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2384 return CC_RESERVED;
2385 }
2386
2387 return khmerCharClasses[uc - KhmerFirstChar];
2388}
2389
2390
2391/*
2392// The stateTable is used to calculate the end (the length) of a well
2393// formed Khmer Syllable.
2394//
2395// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2396// CharClassValues. This coincidence of values allows the follow up of the table.
2397//
2398// Each line corresponds to a state, which does not necessarily need to be a type
2399// of component... for example, state 2 is a base, which is always the first character
2400// in the syllable, but the state could be produced by a consonant of any type when
2401// it is the first character that is analysed (in ground state).
2402//
2403// Differentiating 3 types of consonants is necessary in order to
2404// forbid the use of certain combinations, such as having a second
2405// coeng after a coeng RO,
2406// The inexistent possibility of having a type 3 after another type 3 is permitted,
2407// eliminating it would very much complicate the table, and it does not create typing
2408// problems, as the case above.
2409//
2410// The table is quite complex, in order to limit the number of coeng consonants
2411// to 2 (by means of the table).
2412//
2413// There is a peculiarity, as far as Unicode is concerned:
2414// - The consonant-shifter is considered in two possible different
2415// locations, the one considered in Unicode 3.0 and the one considered in
2416// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2417//
2418//
2419// xx independent character, such as a number, punctuation sign or non-khmer char
2420//
2421// c1 Khmer consonant of type 1 or an independent vowel
2422// that is, a letter in which the subscript form is only under the
2423// base, not taking any space to the right or to the left
2424//
2425// c2 Khmer consonant of type 2, the coeng form takes space under
2426// and to the left of the base (only RO is of this type)
2427//
2428// c3 Khmer consonant of type 3. Its subscript form takes space under
2429// and to the right of the base.
2430//
2431// cs Khmer consonant shifter
2432//
2433// rb Khmer robat
2434//
2435// co coeng character (u17D2)
2436//
2437// dv dependent vowel (including split vowels, they are treated in the same way).
2438// even if dv is not defined above, the component that is really tested for is
2439// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2440//
2441// zwj Zero Width joiner
2442//
2443// zwnj Zero width non joiner
2444//
2445// sa above sign
2446//
2447// sp post sign
2448//
2449// there are lines with equal content but for an easier understanding
2450// (and maybe change in the future) we did not join them
2451*/
// Transition table of the Khmer syllable scanner. It is indexed as
// khmerStateTable[state][characterClass], where characterClass is the class
// value of the current character (the CF_CLASS_MASK bits of its
// KhmerCharClass), and yields the next state. A negative entry means the
// current character cannot continue the syllable; the scanner stops there.
// Each row has CC_COUNT columns, one per character class value.
2452static const signed char khmerStateTable[][CC_COUNT] =
2453{
2454 /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2455 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2456 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2457 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2458 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter. It can only be followed by a shifter or a vowel */
2459 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2460 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2461 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2462 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2463 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2464 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
2465 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2466 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2467 {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2468 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2469 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2470 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2471 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2472 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2473 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2474 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2475 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2476};
2477
2478
/* #define KHMER_DEBUG */
/*
// KHDEBUG is a printf-style tracing hook for the Khmer syllable scanner.
// With KHMER_DEBUG defined it forwards to qDebug. Otherwise it expands to
// `if (0) printf`, so the call still compiles (and its format string is
// still seen by the compiler) but is never executed and its arguments are
// never evaluated.
//
// The fallback definition spans several physical lines, so every line but
// the last has to end in a line continuation; without them the macro is
// empty and the `if (0)` / `printf` lines become stray tokens at file scope.
// Always use KHDEBUG(...) as a complete statement of its own.
*/
#ifdef KHMER_DEBUG
#define KHDEBUG qDebug
#else
#  define KHDEBUG \
    if (0) \
    printf
#endif
2487
2488/*
2489// Given an input string of characters and a location in which to start looking
2490// calculate, using the state table, which one is the last character of the syllable
2491// that starts in the starting position.
2492*/
2493static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2494{
2495 const char16_t *uc = s + start;
2496 int state = 0;
2497 qsizetype pos = start;
2498 *invalid = false;
2499
2500 while (pos < end) {
2501 KhmerCharClass charClass = getKhmerCharClass(*uc);
2502 if (pos == start) {
2503 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2504 }
2505 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2506
2507 KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2508 charClass, *uc );
2509
2510 if (state < 0) {
2511 break;
2512 }
2513 ++uc;
2514 ++pos;
2515 }
2516 return pos;
2517}
2518
2519static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2520{
2521 qsizetype end = from + len;
2522 qsizetype i = 0;
2523 Q_UNUSED(script);
2524 attributes += from;
2525 while ( i < len ) {
2526 bool invalid;
2527 qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2528
2529 attributes[i].graphemeBoundary = true;
2530
2531 if ( boundary > len-1 ) boundary = len;
2532 i++;
2533 while ( i < boundary ) {
2534 attributes[i].graphemeBoundary = false;
2535 ++i;
2536 }
2537 assert( i == boundary );
2538 }
2539}
2540
2541
2543{
2544 switch (script) {
2545 case QChar::Script_Unknown:
2546 case QChar::Script_Inherited:
2547 case QChar::Script_Common:
2548 case QChar::Script_Latin:
2549 case QChar::Script_Greek:
2550 case QChar::Script_Cyrillic:
2551 case QChar::Script_Armenian:
2552 case QChar::Script_Hebrew:
2553 case QChar::Script_Arabic:
2554 case QChar::Script_Syriac:
2555 case QChar::Script_Thaana:
2556 return nullptr;
2557 case QChar::Script_Devanagari:
2558 case QChar::Script_Bengali:
2559 case QChar::Script_Gurmukhi:
2560 case QChar::Script_Gujarati:
2561 case QChar::Script_Oriya:
2562 case QChar::Script_Tamil:
2563 case QChar::Script_Telugu:
2564 case QChar::Script_Kannada:
2565 case QChar::Script_Malayalam:
2566 case QChar::Script_Sinhala:
2567 return &indicAttributes;
2568 case QChar::Script_Thai:
2569 return &thaiAttributes;
2570 case QChar::Script_Lao:
2571 return nullptr;
2572 case QChar::Script_Tibetan:
2573 return &tibetanAttributes;
2574 case QChar::Script_Myanmar:
2575 return &myanmarAttributes;
2576 case QChar::Script_Georgian:
2577 case QChar::Script_Hangul:
2578 case QChar::Script_Ethiopic:
2579 case QChar::Script_Cherokee:
2580 case QChar::Script_CanadianAboriginal:
2581 case QChar::Script_Ogham:
2582 case QChar::Script_Runic:
2583 return nullptr;
2584 case QChar::Script_Khmer:
2585 return &khmerAttributes;
2586 case QChar::Script_Mongolian:
2587 case QChar::Script_Hiragana:
2588 case QChar::Script_Katakana:
2589 case QChar::Script_Bopomofo:
2590 case QChar::Script_Han:
2591 case QChar::Script_Yi:
2592 case QChar::Script_OldItalic:
2593 case QChar::Script_Gothic:
2594 case QChar::Script_Deseret:
2595 case QChar::Script_Tagalog:
2596 case QChar::Script_Hanunoo:
2597 case QChar::Script_Buhid:
2598 case QChar::Script_Tagbanwa:
2599 case QChar::Script_Coptic:
2600 case QChar::Script_Limbu:
2601 case QChar::Script_TaiLe:
2602 case QChar::Script_LinearB:
2603 case QChar::Script_Ugaritic:
2604 case QChar::Script_Shavian:
2605 case QChar::Script_Osmanya:
2606 case QChar::Script_Cypriot:
2607 case QChar::Script_Braille:
2608 case QChar::Script_Buginese:
2609 case QChar::Script_NewTaiLue:
2610 case QChar::Script_Glagolitic:
2611 case QChar::Script_Tifinagh:
2612 case QChar::Script_SylotiNagri:
2613 case QChar::Script_OldPersian:
2614 case QChar::Script_Kharoshthi:
2615 case QChar::Script_Balinese:
2616 case QChar::Script_Cuneiform:
2617 case QChar::Script_Phoenician:
2618 case QChar::Script_PhagsPa:
2619 case QChar::Script_Nko:
2620 case QChar::Script_Sundanese:
2621 case QChar::Script_Lepcha:
2622 case QChar::Script_OlChiki:
2623 case QChar::Script_Vai:
2624 case QChar::Script_Saurashtra:
2625 case QChar::Script_KayahLi:
2626 case QChar::Script_Rejang:
2627 case QChar::Script_Lycian:
2628 case QChar::Script_Carian:
2629 case QChar::Script_Lydian:
2630 case QChar::Script_Cham:
2631 case QChar::Script_TaiTham:
2632 case QChar::Script_TaiViet:
2633 case QChar::Script_Avestan:
2634 case QChar::Script_EgyptianHieroglyphs:
2635 case QChar::Script_Samaritan:
2636 case QChar::Script_Lisu:
2637 case QChar::Script_Bamum:
2638 case QChar::Script_Javanese:
2639 case QChar::Script_MeeteiMayek:
2640 case QChar::Script_ImperialAramaic:
2641 case QChar::Script_OldSouthArabian:
2642 case QChar::Script_InscriptionalParthian:
2643 case QChar::Script_InscriptionalPahlavi:
2644 case QChar::Script_OldTurkic:
2645 case QChar::Script_Kaithi:
2646 case QChar::Script_Batak:
2647 case QChar::Script_Brahmi:
2648 case QChar::Script_Mandaic:
2649 case QChar::Script_Chakma:
2650 case QChar::Script_MeroiticCursive:
2651 case QChar::Script_MeroiticHieroglyphs:
2652 case QChar::Script_Miao:
2653 case QChar::Script_Sharada:
2654 case QChar::Script_SoraSompeng:
2655 case QChar::Script_Takri:
2656 case QChar::Script_CaucasianAlbanian:
2657 case QChar::Script_BassaVah:
2658 case QChar::Script_Duployan:
2659 case QChar::Script_Elbasan:
2660 case QChar::Script_Grantha:
2661 case QChar::Script_PahawhHmong:
2662 case QChar::Script_Khojki:
2663 case QChar::Script_LinearA:
2664 case QChar::Script_Mahajani:
2665 case QChar::Script_Manichaean:
2666 case QChar::Script_MendeKikakui:
2667 case QChar::Script_Modi:
2668 case QChar::Script_Mro:
2669 case QChar::Script_OldNorthArabian:
2670 case QChar::Script_Nabataean:
2671 case QChar::Script_Palmyrene:
2672 case QChar::Script_PauCinHau:
2673 case QChar::Script_OldPermic:
2674 case QChar::Script_PsalterPahlavi:
2675 case QChar::Script_Siddham:
2676 case QChar::Script_Khudawadi:
2677 case QChar::Script_Tirhuta:
2678 case QChar::Script_WarangCiti:
2679 case QChar::Script_Ahom:
2680 case QChar::Script_AnatolianHieroglyphs:
2681 case QChar::Script_Hatran:
2682 case QChar::Script_Multani:
2683 case QChar::Script_OldHungarian:
2684 case QChar::Script_SignWriting:
2685 case QChar::Script_Adlam:
2686 case QChar::Script_Bhaiksuki:
2687 case QChar::Script_Marchen:
2688 case QChar::Script_Newa:
2689 case QChar::Script_Osage:
2690 case QChar::Script_Tangut:
2691 case QChar::Script_MasaramGondi:
2692 case QChar::Script_Nushu:
2693 case QChar::Script_Soyombo:
2694 case QChar::Script_ZanabazarSquare:
2695 case QChar::Script_Dogra:
2696 case QChar::Script_GunjalaGondi:
2697 case QChar::Script_HanifiRohingya:
2698 case QChar::Script_Makasar:
2699 case QChar::Script_Medefaidrin:
2700 case QChar::Script_OldSogdian:
2701 case QChar::Script_Sogdian:
2702 case QChar::Script_Elymaic:
2703 case QChar::Script_Nandinagari:
2704 case QChar::Script_NyiakengPuachueHmong:
2705 case QChar::Script_Wancho:
2706 case QChar::Script_Chorasmian:
2707 case QChar::Script_DivesAkuru:
2708 case QChar::Script_KhitanSmallScript:
2709 case QChar::Script_Yezidi:
2710 case QChar::Script_CyproMinoan:
2711 case QChar::Script_OldUyghur:
2712 case QChar::Script_Tangsa:
2713 case QChar::Script_Toto:
2714 case QChar::Script_Vithkuqi:
2715 case QChar::Script_Kawi:
2716 case QChar::Script_NagMundari:
2717 case QChar::Script_Garay:
2718 case QChar::Script_GurungKhema:
2719 case QChar::Script_KiratRai:
2720 case QChar::Script_OlOnal:
2721 case QChar::Script_Sunuwar:
2722 case QChar::Script_Todhri:
2723 case QChar::Script_TuluTigalari:
2724 return nullptr;
2725 case QChar::ScriptCount:
2726 // Don't Q_UNREACHABLE here; this might be a newer value in later Qt versions
2727 // (incl. patch releases)
2728 ;
2729 }
2730 return nullptr;
2731};
2732
2733static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2734 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2735 QCharAttributes *attributes)
2736{
2737 if (stringLength == 0)
2738 return;
2739 for (qsizetype i = 0; i < numItems; ++i) {
2740 QChar::Script script = items[i].script;
2741 CharAttributeFunction attributeFunction = charAttributeFunction(script);
2742 if (!attributeFunction)
2743 continue;
2744 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2745 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2746 }
2747}
2748
2749}
2750
2751Q_CORE_EXPORT void initCharAttributes(QStringView string,
2752 const ScriptItem *items, qsizetype numItems,
2753 QCharAttributes *attributes, CharAttributeOptions options)
2754{
2755 if (string.size() <= 0)
2756 return;
2757
2758 if (!(options & DontClearAttributes))
2759 ::memset(attributes, 0, (string.size() + 1) * sizeof(QCharAttributes));
2760
2761 if (options & GraphemeBreaks)
2762 getGraphemeBreaks(string.utf16(), string.size(), attributes);
2763 if (options & WordBreaks)
2764 getWordBreaks(string.utf16(), string.size(), attributes);
2765 if (options & SentenceBreaks)
2766 getSentenceBreaks(string.utf16(), string.size(), attributes);
2767 if (options & LineBreaks)
2768 getLineBreaks(string.utf16(), string.size(), attributes, options);
2769 if (options & WhiteSpaces)
2770 getWhiteSpaces(string.utf16(), string.size(), attributes);
2771
2773 if (!items || numItems <= 0)
2774 return;
2775
2776 Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2777 }
2778}
2779
2780
2781// ----------------------------------------------------------------------------
2782//
2783// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2784//
2785// ----------------------------------------------------------------------------
2786
2787Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2788{
2789 qsizetype sor = 0;
2790 QChar::Script script = QChar::Script_Common;
2791
2792 QStringIterator it(string);
2793 while (it.hasNext()) {
2794 const auto eor = it.index();
2795 const char32_t ucs4 = it.nextOrRawCodeUnit();
2796
2797 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2798
2799 QChar::Script nscript = QChar::Script(prop->script);
2800
2801 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2802 continue;
2803
2804 // inherit preceding Common-s
2805 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2806 // also covers a case where the base character of Common script followed
2807 // by one or more combining marks of non-Inherited, non-Common script
2808 script = nscript;
2809 continue;
2810 }
2811
2812 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2813 // Thus, a combining mark - whatever its script property value is - should inherit
2814 // the script property value of its base character.
2815 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2816 if (Q_UNLIKELY(FLAG(prop->category) & test))
2817 continue;
2818
2819 Q_ASSERT(script > QChar::Script_Common);
2820 Q_ASSERT(sor < eor);
2821 scripts->append(ScriptItem{sor, script});
2822 sor = eor;
2823
2824 script = nscript;
2825 }
2826
2827 Q_ASSERT(script >= QChar::Script_Common);
2828 scripts->append(ScriptItem{sor, script});
2829}
2830
2831} // namespace QUnicodeTools
2832
2833QT_END_NAMESPACE
Combined button and popup list for selecting options.
static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first, QUnicodeTables::GraphemeBreakClass second)
static const GBTableEntryType Extend_SpacingMark_ZWJ
static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses]
static const GBTableEntryType HardBreak
State updateState(State state, LinebreakUnit lb)
constexpr char32_t DottedCircle
Class toClass(QUnicodeTables::LineBreakClass lbc)
static const uchar actionTable[CLCP+1][CLCP+1]
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ]
static const uchar breakTable[BAfter+1][QUnicodeTables::NumSentenceBreakClasses]
static const KhmerCharClass khmerCharClasses[]
static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static CharAttributeFunction charAttributeFunction(QChar::Script script)
static MymrCharClass getMyanmarCharClass(ushort ch)
static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const signed char mymrStateTable[][Mymr_CC_COUNT]
static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static const MymrCharClass mymrCharClasses[]
static Form form(unsigned short uc)
static const unsigned char indicForms[0xe00-0x900]
static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
void(*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) CharAttributeFunction
static const signed char khmerStateTable[][CC_COUNT]
static void getCharAttributes(const char16_t *string, qsizetype stringLength, const QUnicodeTools::ScriptItem *items, qsizetype numItems, QCharAttributes *attributes)
static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const unsigned char tibetanForm[0x80]
static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static KhmerCharClass getKhmerCharClass(ushort uc)
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses]
static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initCharAttributes(QStringView string, const ScriptItem *items, qsizetype numItems, QCharAttributes *attributes, CharAttributeOptions options)
static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
#define FLAG(x)
Definition qchar.cpp:14
#define KHDEBUG
#define IDEBUG
constexpr int qt_initcharattributes_default_algorithm_only
#define tibetan_form(c)
#define MMDEBUG
QUnicodeTables::LineBreakClass lbc