Qt
Internal/Contributor docs for the Qt SDK. <b>Note:</b> These are NOT official API docs; those are found <a href='https://doc.qt.io/'>here</a>.
Loading...
Searching...
No Matches
qunicodetools.cpp
Go to the documentation of this file.
1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qunicodetools_p.h"
5
6#include "qunicodetables_p.h"
7#include "qvarlengtharray.h"
8#if QT_CONFIG(library)
9#include "qlibrary.h"
10#endif
11
12#include <limits.h>
13
14#define FLAG(x) (1 << (x))
15
17
18using namespace Qt::StringLiterals;
19
20#ifdef QT_BUILD_INTERNAL
21Q_CONSTINIT Q_AUTOTEST_EXPORT
22#else
23constexpr
24#endif
26
27namespace QUnicodeTools {
28
29// -----------------------------------------------------------------------------------------------------
30//
31// The text boundaries determination algorithm.
32// See https://www.unicode.org/reports/tr29/tr29-37.html
33//
34// -----------------------------------------------------------------------------------------------------
35
36namespace GB {
37
38// This table is indexed by the grapheme break classes of two
39// (adjacent) code points.
40// The class of the first code point selects an entry.
41// If the entry's bit at position second_cp_class is set
42// (in other words: if entry & (1u << second_cp_class) is non-zero)
43// then there is NO grapheme break between the two code points.
44
46
47// Check that we have enough bits in the table (in case
48// NumGraphemeBreakClasses grows too much).
49static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
50 "Internal error: increase the size in bits of GBTableEntryType");
51
52// GB9, GB9a
57
58static const GBTableEntryType HardBreak = 0u;
59
63 HardBreak, // LF
64 HardBreak, // Control
65 Extend_SpacingMark_ZWJ, // Extend
67 Extend_SpacingMark_ZWJ, // RegionalIndicator
78 ), // Prepend
79 Extend_SpacingMark_ZWJ, // SpacingMark
85 ), // L
89 ), // V
92 ), // T
96 ), // LV
99 ), // LVT
100 Extend_SpacingMark_ZWJ // Extended_Pictographic
101};
102
108
109// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
110// so we need to store some local state.
111enum class State : uchar {
112 Normal, // none of the GB11/GB12/GB13 contexts below is pending
113 GB11_ExtPicExt, // saw an Extend after an Extended_Pictographic
114 GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
115 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
116};
117
118} // namespace GB
119
120static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
121{
124 for (qsizetype i = 0; i != len; ++i) {
125 qsizetype pos = i;
126 char32_t ucs4 = string[i];
127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
128 ushort low = string[i + 1];
129 if (QChar::isLowSurrogate(low)) {
130 ucs4 = QChar::surrogateToUcs4(ucs4, low);
131 ++i;
132 }
133 }
134
137
138 bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
139 bool handled = false;
140
141 switch (state) {
143 break; // will deal with it below
144
148 // keep going in the current state
149 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150 handled = true;
151 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
153 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154 handled = true;
155 } else {
157 }
158 break;
159
163 shouldBreak = false;
164 handled = true;
165 }
166
168 break;
169
173 shouldBreak = false;
174 handled = true;
175 }
176
178 break;
179 }
180
181 if (!handled) {
186 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
189 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190 }
191 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
193 }
194 }
195
196 if (shouldBreak)
197 attributes[pos].graphemeBoundary = true;
198
199 lcls = cls;
200 }
201
202 attributes[len].graphemeBoundary = true; // GB2
203}
204
205
206namespace WB {
207
214
216// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
217 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
218 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
219 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
220 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
221 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
222 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
223 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
224 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
225 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
228 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
229 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
230 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
231 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
232 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
235 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
236};
237
238} // namespace WB
239
240static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
241{
242 enum WordType {
243 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244 } currentWordType = WordTypeNone;
245
247 auto real_cls = cls; // Unaffected by WB4
248
249 for (qsizetype i = 0; i != len; ++i) {
250 qsizetype pos = i;
251 char32_t ucs4 = string[i];
252 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
253 ushort low = string[i + 1];
254 if (QChar::isLowSurrogate(low)) {
255 ucs4 = QChar::surrogateToUcs4(ucs4, low);
256 ++i;
257 }
258 }
259
263 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264 // which caused "hi.there" to be treated as if it were just a single word;
265 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
267 if (ucs4 == 0x002E) // FULL STOP
269 else if (ucs4 == 0x003A) // COLON
271 }
272
273 uchar action = WB::breakTable[cls][ncls];
274 switch (action) {
275 case WB::Break:
277 && prop->graphemeBreakClass
279 // WB3c: ZWJ × \p{Extended_Pictographic}
280 action = WB::NoBreak;
281 }
282 break;
283 case WB::NoBreak:
285 // WB4: X(Extend|Format)* -> X
286 real_cls = ncls;
287 continue;
288 }
290 // WB15/WB16: break between pairs of Regional indicator
292 }
294 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295 // WB3d should not be affected by WB4
296 action = WB::Break;
297 }
298 break;
299 case WB::Lookup:
300 case WB::LookupW:
301 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
302 ucs4 = string[lookahead];
303 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
304 ushort low = string[lookahead + 1];
305 if (QChar::isLowSurrogate(low)) {
306 ucs4 = QChar::surrogateToUcs4(ucs4, low);
307 ++lookahead;
308 }
309 }
310
311 prop = QUnicodeTables::properties(ucs4);
313
315 // WB4: X(Extend|Format)* -> X
316 continue;
317 }
318
319 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
321 i = lookahead;
322 ncls = tcls;
323 action = WB::NoBreak;
324 }
325 break;
326 }
327 if (action != WB::NoBreak) {
328 action = WB::Break;
330 action = WB::NoBreak; // WB7a
331 }
332 break;
333 }
334
335 cls = ncls;
336 real_cls = ncls;
337
338 if (action == WB::Break) {
339 attributes[pos].wordBreak = true;
340 if (currentWordType != WordTypeNone)
341 attributes[pos].wordEnd = true;
342 switch (cls) {
344 currentWordType = WordTypeHiraganaKatakana;
345 attributes[pos].wordStart = true;
346 break;
350 currentWordType = WordTypeAlphaNumeric;
351 attributes[pos].wordStart = true;
352 break;
353 default:
354 currentWordType = WordTypeNone;
355 break;
356 }
357 }
358 }
359
360 if (currentWordType != WordTypeNone)
361 attributes[len].wordEnd = true;
362 attributes[len].wordBreak = true; // WB2
363}
364
365
366namespace SB {
367
384
386// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
390
391 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
392 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
393 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
394 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
395
396 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
397 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
398 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
399 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
400 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
401};
402
403} // namespace SB
404
405static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
406{
407 uchar state = SB::BAfter; // to meet SB1
408 for (qsizetype i = 0; i != len; ++i) {
409 qsizetype pos = i;
410 char32_t ucs4 = string[i];
411 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
412 ushort low = string[i + 1];
413 if (QChar::isLowSurrogate(low)) {
414 ucs4 = QChar::surrogateToUcs4(ucs4, low);
415 ++i;
416 }
417 }
418
421
423 state = SB::breakTable[state][ncls];
424 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
426 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
427 ucs4 = string[lookahead];
428 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
429 ushort low = string[lookahead + 1];
430 if (QChar::isLowSurrogate(low)) {
431 ucs4 = QChar::surrogateToUcs4(ucs4, low);
432 ++lookahead;
433 }
434 }
435
436 prop = QUnicodeTables::properties(ucs4);
438 switch (tcls) {
445 continue;
447 i = lookahead;
449 break;
450 default:
451 break;
452 }
453 break;
454 }
455 }
456 if (Q_UNLIKELY(state == SB::Break)) {
457 attributes[pos].sentenceBoundary = true;
459 }
460 }
461
462 attributes[len].sentenceBoundary = true; // SB2
463}
464
465
466// -----------------------------------------------------------------------------------------------------
467//
468// The line breaking algorithm.
469// See http://www.unicode.org/reports/tr14/tr14-39.html
470//
471// -----------------------------------------------------------------------------------------------------
472
473namespace LB {
474
475namespace NS { // Number Sequence
476
477// LB25 recommends to not break lines inside numbers of the form
478// described by the following regular expression:
479// (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
480
487
496
// Transition table of the LB25 number-sequence tracker.
// The first index is the class of the previous element of the sequence and the
// second index is the class of the current one; getLineBreaks() reads it as
// actionTable[nelast][necur] and applies the resulting action in its switch.
497static const uchar actionTable[CLCP + 1][CLCP + 1] = {
498// columns (current class): XX PRPO OPHY NU SYIS CLCP; each row is tagged with the previous class
499 { None , Start , Start , Start , None , None }, // XX
500 { None , Start , Continue, Continue, None , None }, // PRPO
501 { None , Start , Start , Continue, None , None }, // OPHY
502 { Break , Break , Break , Continue, Continue, Continue }, // NU
503 { Break , Break , Break , Continue, Continue, Continue }, // SYIS
504 { Break , Continue, Break , Break , Break , Break }, // CLCP
505};
506
508{
509 switch (lbc) {
510 case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
511 // resolve AI math symbols in numerical context to IS
512 if (category == QChar::Symbol_Math)
513 return SYIS;
514 break;
516 return PRPO;
518 return OPHY;
520 return NU;
522 return SYIS;
524 return CLCP;
525 default:
526 break;
527 }
528 return XX;
529}
530
531} // namespace NS
532
533/* In order to support the tailored implementation of LB25 properly
534 the following changes were made in the pair table to allow breaks
535 where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
536 (CL)(PO) from IB to DB
537 (CP)(PO) from IB to DB
538 (CL)(PR) from IB to DB
539 (CP)(PR) from IB to DB
540 (PO)(OP) from IB to DB
541 (PR)(OP) from IB to DB
542 (IS)(NU) from IB to DB
543 (SY)(NU) from IB to DB
544*/
545
546/* In order to implement LB21a properly, a special rule HH has been introduced and
547 the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
548 (HL)(HY|BA) from IB to CI
549 (HY|BA)(!CB) from DB to HH
550*/
551
561
562// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
563// about the table. It was removed in the later versions of the standard.
565/* 1↓ 2→ OP CL CP QU +Pi +Pf GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/
566/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
567/* CL */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
568/* CP */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
569/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
570/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
571/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
572/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
573/* NS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
574/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
575/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
576/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
577/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
578/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
579/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
580/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
581/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
582/* ID */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
583/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
584/* HY */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
585/* BA */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
586/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
587/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
588/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
589/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
590/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
591/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
592/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
593/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
594/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
595/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
596/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
597/* CB */ { DB, PB, PB, IB, IB, PB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
598/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
599/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
600};
601
602// The following line break classes are not treated by the pair table
603// and must be resolved outside:
604// AI, AK, AP, AS, BK, CB, CJ, CR, LF, NL, SA, SG, SP, VF, VI, XX, ZWJ
605
606} // namespace LB
607
608static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
609{
610 qsizetype nestart = 0;
611 LB::NS::Class nelast = LB::NS::XX;
612
616
617 for (qsizetype i = 0; i != len; ++i) {
618 qsizetype pos = i;
619 char32_t ucs4 = string[i];
620 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
621 ushort low = string[i + 1];
622 if (QChar::isLowSurrogate(low)) {
623 ucs4 = QChar::surrogateToUcs4(ucs4, low);
624 ++i;
625 }
626 }
627
631
635 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
636 ) {
637 // LB27: use SPACE for line breaking
638 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
639 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
640 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
642 } else {
644 // LB1: resolve SA to AL, except for those of category Mn or Mc, which are resolved to CM
645 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
646 if (FLAG(prop->category) & test)
648 }
650 // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
653 }
654 }
655 }
656
658 // LB1: resolve SA to AL, except for those of category Mn or Mc, which are resolved to CM
659 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
660 if (FLAG(prop->category) & test)
662 }
663
665 if (prop->category == QChar::Punctuation_InitialQuote) {
666 // LB15a: Do not break after an unresolved initial punctuation
667 // that lies at the start of the line, after a space, after
668 // opening punctuation, or after an unresolved quotation mark,
669 // even after spaces.
670 // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
671 // [\p{Pi}&QU] SP* ×
672 // Note: sot is treated as LF here due to initial loop setup.
673 constexpr QUnicodeTables::LineBreakClass lb15a[] = {
679 if (std::any_of(std::begin(lb15a), std::end(lb15a),
680 [lcls](auto x) { return x == lcls; })) {
682 }
683 } else if (prop->category == QChar::Punctuation_FinalQuote) {
684 // LB15b: Do not break before an unresolved final punctuation
685 // that lies at the end of the line, before a space, before
686 // a prohibited break, or before an unresolved quotation mark,
687 // even after spaces.
688 // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
689 // | SY | BK | CR | LF | NL | ZW | eot)
690 auto nncls = QUnicodeTables::LineBreak_LF;
691
692 if (i + 1 < len) {
693 char32_t c = string[i + 1];
694 if (QChar::isHighSurrogate(c) && i + 2 != len) {
695 ushort low = string[i + 2];
696 if (QChar::isLowSurrogate(low))
697 c = QChar::surrogateToUcs4(c, low);
698 }
700 QUnicodeTables::properties(c)->lineBreakClass);
701 }
702
703 constexpr QUnicodeTables::LineBreakClass lb15b[] = {
712 if (std::any_of(std::begin(lb15b), std::end(lb15b),
713 [nncls](auto x) { return x == nncls; })) {
715 }
716 }
717 }
718
720 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
722 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
725 goto next_no_cls_update;
726 }
727 goto next;
728 }
729
732 goto next; // LB6: x(BK|CR|LF|NL)
733 goto next_no_cls_update; // LB7: xSP
734 }
735
737 // LB9: treat CM that doesn't follow SP, BK, CR, LF, NL, or ZW as X
739 // don't update anything
740 goto next_no_cls_update;
741 }
742
744 // LB8a: ZWJ x
745 goto next;
746 }
747
748 // LB25: do not break lines inside numbers
749 {
750 LB::NS::Class necur = LB::NS::toClass(ncls, (QChar::Category)prop->category);
751 switch (LB::NS::actionTable[nelast][necur]) {
752 case LB::NS::Break:
753 // do not change breaks before and after the expression
754 for (qsizetype j = nestart + 1; j < pos; ++j)
755 attributes[j].lineBreak = false;
757 case LB::NS::None:
758 nelast = LB::NS::XX; // reset state
759 break;
760 case LB::NS::Start:
761 nestart = i;
763 default:
764 nelast = necur;
765 break;
766 }
767 }
768
770 // LB30a
772 goto next;
773 }
774
776 && lastProp->category == QChar::Other_NotAssigned
777 && lastProp->graphemeBreakClass
779 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
780 goto next;
781 }
782
783 // for South East Asian chars that require a complex analysis, the Unicode
784 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
787
788 tcls = cls;
790 // LB10
793 case LB::DirectBreak:
794 attributes[pos].lineBreak = true;
795 break;
798 attributes[pos].lineBreak = true;
799 break;
802 goto next_no_cls_update;
803 attributes[pos].lineBreak = true;
804 break;
807 goto next_no_cls_update;
808 break;
811 attributes[pos].lineBreak = true;
812 break;
814 switch (static_cast<QUnicodeTables::EastAsianWidth>(prop->eastAsianWidth)) {
815 default:
817 break;
822 attributes[pos].lineBreak = true;
823 break;
824 }
825 break;
827 // nothing to do
828 default:
829 break;
830 }
831
832 next:
833 cls = ncls;
834 lastProp = prop;
835 next_no_cls_update:
836 lcls = ncls;
837 }
838
840 // LB25: do not break lines inside numbers
841 for (qsizetype j = nestart + 1; j < len; ++j)
842 attributes[j].lineBreak = false;
843 }
844
845 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
846 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
847}
848
849
850static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
851{
852 for (qsizetype i = 0; i != len; ++i) {
853 uint ucs4 = string[i];
854 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
855 ushort low = string[i + 1];
856 if (QChar::isLowSurrogate(low)) {
857 ucs4 = QChar::surrogateToUcs4(ucs4, low);
858 ++i;
859 }
860 }
861
862 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
863 attributes[i].whiteSpace = true;
864 }
865}
866
867namespace Tailored {
868
869using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
870
871
886
887static const unsigned char indicForms[0xe00-0x900] = {
888 // Devanagari
893
898
903
908
913
918
923
928
929 // Bengali
934
939
944
949
954
959
964
969
970 // Gurmukhi
975
980
985
990
995
1000
1005
1010
1011 // Gujarati
1016
1021
1026
1031
1036
1041
1046
1051
1052 // Oriya
1057
1062
1067
1072
1077
1082
1087
1092
1093 //Tamil
1098
1103
1108
1113
1118
1123
1128
1133
1134 // Telugu
1139
1144
1149
1154
1159
1164
1169
1174
1175 // Kannada
1180
1185
1190
1195
1200
1205
1210
1215
1216 // Malayalam
1221
1226
1231
1236
1241
1246
1251
1256
1257 // Sinhala
1262
1267
1272
1277
1282
1287
1292
1297};
1298
1299static inline Form form(unsigned short uc) {
1300 if (uc < 0x900 || uc > 0xdff) {
1301 if (uc == 0x25cc)
1302 return Consonant;
1303 if (uc == 0x200c || uc == 0x200d)
1304 return Control;
1305 return Other;
1306 }
1307 return (Form)indicForms[uc-0x900];
1308}
1309
1310// #define INDIC_DEBUG
1311#ifdef INDIC_DEBUG
1312#define IDEBUG qDebug
1313#else
1314#define IDEBUG if constexpr (1) ; else qDebug
1315#endif
1316
1317/* syllables are of the form:
1318
1319 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1320 (Consonant Nukta? Halant)* Consonant Halant
1321 IndependentVowel VowelMark? StressMark?
1322
1323 We return syllable boundaries on invalid combinations as well
1324*/
// Finds the end of the Indic syllable that begins at s[start].
//
// script  - script of the run; enables the Bengali, Kannada, Sinhala, Malayalam
//           and Tamil special cases below
// s       - UTF-16 text
// start   - index in s of the first code unit of the syllable
// end     - index in s one past the last code unit that may be examined
// invalid - out parameter; set to true only when the first code unit is neither
//           a Consonant, an IndependentVowel nor Other, otherwise false
//
// Returns an index into s that is always greater than start, because at least
// one code unit is consumed. s[start] is read before any bounds check, so the
// caller has to pass start < end.
1325static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1326{
1327 *invalid = false;
1328 IDEBUG("indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1329 const char16_t *uc = s+start;
1330
1331 qsizetype pos = 0; // offset relative to start
1332 Form state = form(uc[pos]);
1333 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1334 pos++;
1335
 // a syllable can only be opened by a consonant or an independent vowel;
 // anything else is returned as a one-unit item
1336 if (state != Consonant && state != IndependentVowel) {
1337 if (state != Other)
1338 *invalid = true;
1339 goto finish;
1340 }
1341
 // leaving the switch with break accepts uc[pos] and makes newState current,
 // while goto finish ends the syllable before uc[pos]
1342 while (pos < end - start) {
1343 Form newState = form(uc[pos]);
1344 IDEBUG("state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1345 switch (newState) {
1346 case Control:
 // an accepted control character leaves the tracked state unchanged
1347 newState = state;
1348 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1349 break;
1350 // the control character should be the last char in the item
1351 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1352 break;
1353 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1354 break;
1355 // Bengali and Kannada have a special exception (the two checks above) for rendering yaphala with ra (to avoid reph), see http://www.unicode.org/faq/indic.html#15
 // in every other case the control character is included and closes the item
1356 ++pos;
1357 goto finish;
1358 case Consonant:
 // a consonant continues the syllable only after a Halant; for Sinhala the
 // code unit just before it must additionally be a ZWJ
1359 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1360 break;
1361 goto finish;
1362 case Halant:
1363 if (state == Nukta || state == Consonant)
1364 break;
1365 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1366 if (script == QChar::Script_Bengali && pos == 1 &&
1367 (uc[0] == 0x0985 || uc[0] == 0x098f))
1368 break;
1369 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1370 if (script == QChar::Script_Sinhala && state == Matra) {
1371 ++pos;
1372 continue;
1373 }
 // Malayalam: a Halant directly after the matra 0x0d41 is accepted the same way, state stays on Matra
1374 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1375 ++pos;
1376 continue;
1377 }
1378 goto finish;
1379 case Nukta:
1380 if (state == Consonant)
1381 break;
1382 goto finish;
 // StressMark, VowelMark and Matra deliberately fall through: a mark that is
 // not accepted by its own rule is also tried against the following rules
1383 case StressMark:
1384 if (state == VowelMark)
1385 break;
1386 Q_FALLTHROUGH();
1387 case VowelMark:
1388 if (state == Matra || state == LengthMark || state == IndependentVowel)
1389 break;
1390 Q_FALLTHROUGH();
1391 case Matra:
1392 if (state == Consonant || state == Nukta)
1393 break;
1394 if (state == Matra) {
1395 // ### needs proper testing for correct two/three part matras
1396 break;
1397 }
1398 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1399 // it work for all Indic languages?
1400 // the combination Independent_A + Vowel Sign AA is allowed.
1401 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1402 break;
1403 if (script == QChar::Script_Tamil && state == Matra) {
1404 if (uc[pos-1] == 0x0bc6 &&
1405 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1406 break;
1407 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1408 break;
1409 }
1410 goto finish;
1411
1412 case LengthMark:
1413 if (state == Matra) {
1414 // ### needs proper testing for correct two/three part matras
1415 break;
1416 }
1417 Q_FALLTHROUGH();
1418 case IndependentVowel:
1419 case Invalid:
1420 case Other:
1421 goto finish;
1422 }
1423 state = newState;
1424 pos++;
1425 }
1426 finish:
 // translate the relative offset back into an index into s
1427 return pos+start;
1428}
1429
1430static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1431{
1432 qsizetype end = from + len;
1433 attributes += from;
1434 qsizetype i = 0;
1435 while (i < len) {
1436 bool invalid;
1437 qsizetype boundary = indic_nextSyllableBoundary(script, text, from+i, end, &invalid) - from;
1438 attributes[i].graphemeBoundary = true;
1439
1440 if (boundary > len-1) boundary = len;
1441 i++;
1442 while (i < boundary) {
1443 attributes[i].graphemeBoundary = false;
1444 ++i;
1445 }
1446 assert(i == boundary);
1447 }
1448
1449
1450}
1451
1452#if QT_CONFIG(library)
1453
1454#define LIBTHAI_MAJOR 0
1455
1456/*
1457 * If libthai changes, please update this code too.
1458 */
// Local declaration of the cell structure filled in by libthai's th_next_cell().
// LibThai::next_cell() passes a pointer to it straight to the dynamically
// resolved function, so the member layout has to stay in sync with the library.
// NOTE(review): the meaning of the individual members is defined by libthai — confirm there.
1459struct thcell_t {
1460 unsigned char base;
1461 unsigned char hilo;
1462 unsigned char top;
1463};
1464
1465using ThBrk = struct _ThBrk;
1466
1467namespace {
1468
1469class LibThai final
1470{
1471 Q_DISABLE_COPY_MOVE(LibThai)
1472
1473 using th_brk_new_def = ThBrk *(*)(const char *);
1474 using th_brk_delete_def = void (*)(ThBrk *);
1475 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1476 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1477
1478public:
1479 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1480 {
1481 m_th_brk_find_breaks =
1482 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve("th_brk_find_breaks"));
1483 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve("th_next_cell"));
1484
1485 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve("th_brk_new"));
1486 if (th_brk_new) {
1487 m_state = th_brk_new(nullptr);
1488 m_th_brk_delete =
1489 reinterpret_cast<th_brk_delete_def>(m_library.resolve("th_brk_delete"));
1490 }
1491 }
1492
1493 ~LibThai()
1494 {
1495 if (m_state && m_th_brk_delete)
1496 m_th_brk_delete(m_state);
1497 m_library.unload();
1498 }
1499
1500 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1501
1502 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1503 {
1504 Q_ASSERT(m_state);
1505 Q_ASSERT(m_th_brk_find_breaks);
1506 return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
1507 }
1508
1509 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1510 {
1511 Q_ASSERT(m_th_next_cell);
1512 return m_th_next_cell(s, len, cell, is_decomp_am);
1513 }
1514
1515private:
1516 QLibrary m_library;
1517
1518 // Global state for th_brk_find_breaks().
1519 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1520 // state is read-only, and so it is safe to use it from multiple threads after
1521 // initialization. This is also stated in the libthai documentation.
1522 ThBrk *m_state = nullptr;
1523
1524 th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
1525 th_next_cell_def m_th_next_cell = nullptr;
1526 th_brk_delete_def m_th_brk_delete = nullptr;
1527};
1528
1529} // unnamed namespace
1530
1531Q_GLOBAL_STATIC(LibThai, g_libThai)
1532
1533static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
1534{
1535 qsizetype i;
1536 unsigned char *result = reinterpret_cast<unsigned char *>(cstr);
1537
1538 for (i = 0; i < len; ++i) {
1539 if (string[i] <= 0xa0)
1540 result[i] = static_cast<unsigned char>(string[i]);
1541 else if (string[i] >= 0xe01 && string[i] <= 0xe5b)
1542 result[i] = static_cast<unsigned char>(string[i] - 0xe00 + 0xa0);
1543 else
1544 result[i] = static_cast<unsigned char>(~0); // Same encoding as libthai uses for invalid chars
1545 }
1546
1547 result[len] = 0;
1548}
1549
1550/*
1551 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1552 */
1553static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1554{
1555 constexpr qsizetype Prealloc = 128;
1556 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1557 QVarLengthArray<int, Prealloc> break_positions(len);
1558 qsizetype numbreaks, i;
1559 struct thcell_t tis_cell;
1560
1561 LibThai *libThai = g_libThai;
1562 if (!libThai || !libThai->isInitialized())
1563 return;
1564
1565 to_tis620(string, len, s.data());
1566
1567 for (i = 0; i < len; ++i) {
1568 attributes[i].wordBreak = false;
1569 attributes[i].wordStart = false;
1570 attributes[i].wordEnd = false;
1571 attributes[i].lineBreak = false;
1572 }
1573
1574 attributes[0].wordBreak = true;
1575 attributes[0].wordStart = true;
1576 attributes[0].wordEnd = false;
1577 numbreaks = libThai->brk_find_breaks(reinterpret_cast<const unsigned char *>(s.data()),
1578 break_positions.data(),
1579 static_cast<size_t>(break_positions.size()));
1580 for (i = 0; i < numbreaks; ++i) {
1581 attributes[break_positions[i]].wordBreak = true;
1582 attributes[break_positions[i]].wordStart = true;
1583 attributes[break_positions[i]].wordEnd = true;
1584 attributes[break_positions[i]].lineBreak = true;
1585 }
1586 if (numbreaks > 0)
1587 attributes[break_positions[numbreaks - 1]].wordStart = false;
1588
1589 /* manage grapheme boundaries */
1590 i = 0;
1591 while (i < len) {
1592 size_t cell_length =
1593 libThai->next_cell(reinterpret_cast<const unsigned char *>(s.data()) + i,
1594 size_t(len - i), &tis_cell, true);
1595
1596 attributes[i].graphemeBoundary = true;
1597 for (size_t j = 1; j < cell_length; ++j)
1598 attributes[i + j].graphemeBoundary = false;
1599
1600 i += cell_length;
1601 }
1602}
1603
1604#endif // QT_CONFIG(library)
1605
1606static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1607{
1608 assert(script == QChar::Script_Thai);
1609#if QT_CONFIG(library)
1610 const char16_t *uc = text + from;
1611 attributes += from;
1612 Q_UNUSED(script);
1613 thaiAssignAttributes(uc, len, attributes);
1614#else
1615 Q_UNUSED(script);
1616 Q_UNUSED(text);
1617 Q_UNUSED(from);
1618 Q_UNUSED(len);
1619 Q_UNUSED(attributes);
1620#endif
1621}
1622
1623/*
1624 tibetan syllables are of the form:
1625 head position consonant
1626 first sub-joined consonant
1627 ....intermediate sub-joined consonants (if any)
1628 last sub-joined consonant
1629 sub-joined vowel (a-chung U+0F71)
1630 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1631*/
1632
1640
1641/* this table starts at U+0f40 */
1642static const unsigned char tibetanForm[0x80] = {
1647
1652
1657
1662
1667
1672
1677
1682};
1683
// Maps code unit c to its TibetanForm: code units in U+0F40..U+0FBF are looked
// up in tibetanForm (indexed from U+0F40), everything else is TibetanOther.
// Note that c is evaluated more than once.
1684#define tibetan_form(c) \
1685 ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1686
1687static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1688{
1689 const char16_t *uc = s + start;
1690
1691 qsizetype pos = 0;
1693
1694/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1695 pos++;
1696
1697 if (state != TibetanHeadConsonant) {
1698 if (state != TibetanOther)
1699 *invalid = true;
1700 goto finish;
1701 }
1702
1703 while (pos < end - start) {
1705 switch (newState) {
1708 if (state != TibetanHeadConsonant &&
1710 goto finish;
1711 state = newState;
1712 break;
1713 case TibetanVowel:
1714 if (state != TibetanHeadConsonant &&
1717 goto finish;
1718 break;
1719 case TibetanOther:
1721 goto finish;
1722 }
1723 pos++;
1724 }
1725
1726finish:
1727 *invalid = false;
1728 return start+pos;
1729}
1730
1731static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1732{
1733 qsizetype end = from + len;
1734 qsizetype i = 0;
1735 Q_UNUSED(script);
1736 attributes += from;
1737 while (i < len) {
1738 bool invalid;
1739 qsizetype boundary = tibetan_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1740
1741 attributes[i].graphemeBoundary = true;
1742
1743 if (boundary > len-1) boundary = len;
1744 i++;
1745 while (i < boundary) {
1746 attributes[i].graphemeBoundary = false;
1747 ++i;
1748 }
1749 assert(i == boundary);
1750 }
1751}
1752
1755 Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
1756 Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
1757 Mymr_CC_NGA = 3, /* Consonant NGA */
1758 Mymr_CC_YA = 4, /* Consonant YA */
1759 Mymr_CC_RA = 5, /* Consonant RA */
1760 Mymr_CC_WA = 6, /* Consonant WA */
1761 Mymr_CC_HA = 7, /* Consonant HA */
1762 Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
1763 Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
1764 Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
1765 Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */
1766 Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, prebase (Vowel u, uu) */
1767 Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, prebase (Vowel i, ii, ai) */
1768 Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, prebase (Vowel aa) */
1772 Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
1773 Mymr_CC_COUNT = 19 /* This is the number of character classes */
1775
1778
1779 Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
1780 Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
1781 Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
1782 Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
1783 Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
1784 first in a syllable */
1785 Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */
1786
1787 /* position flags */
1789 Mymr_CF_POS_BELOW = 0x00040000,
1790 Mymr_CF_POS_ABOVE = 0x00020000,
1791 Mymr_CF_POS_AFTER = 0x00010000,
1792 Mymr_CF_POS_MASK = 0x000f0000,
1793
1794 Mymr_CF_AFTER_KINZI = 0x00100000
1796
1798
1799/* Characters that get referred to by name */
1801{
1805 Mymr_C_RA = 0x101B,
1806 Mymr_C_YA = 0x101A,
1807 Mymr_C_NGA = 0x1004,
1809 Mymr_C_VIRAMA = 0x1039
1811
1812enum
1813{
1832
1833
1834typedef int MymrCharClass;
1835
1836
1852
1853static MymrCharClass
1855{
1856 if (ch == Mymr_C_SIGN_ZWJ)
1858
1859 if (ch == Mymr_C_SIGN_ZWNJ)
1861
1862 if (ch < 0x1000 || ch > 0x105f)
1863 return Mymr_CC_RESERVED;
1864
1865 return mymrCharClasses[ch - 0x1000];
1866}
1867
// State transition table of the Myanmar syllable scanner.
// Each row is one state (numbered in the trailing comment of the row), each
// column one of the Mymr_CC_COUNT character classes (abbreviated in the header
// line below). An entry is the state to continue with; negative entries end
// the syllable (see the note on -2 at the end of the table).
1868static const signed char mymrStateTable[][Mymr_CC_COUNT] =
1869{
1870/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
1871 { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
1872 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
1873 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
1874 {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
1875 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
1876 {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
1877 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
1878 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
1879 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
1880 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
1881 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
1882 {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
1883 {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
1884 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
1885 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
1886 {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
1887 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
1888 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
1889 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
1890 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
1891 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
1892 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
1893 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
1894 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
1895 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
1896 {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
1897 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
1898 {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
1899/* exit state -2 is for an invalid order of medials and for combinations of invalid
1900 characters with virama, where the virama should be treated as the start of the
1901 next syllable */
1902};
1903
// Tracing helper for the Myanmar syllable scanner. With MYANMAR_DEBUG defined
// MMDEBUG is qDebug; otherwise it expands to `if (0) printf`, so the call is
// still compiled but never executed.
1904/*#define MYANMAR_DEBUG */
1905#ifdef MYANMAR_DEBUG
1906#define MMDEBUG qDebug
1907#else
1908# define MMDEBUG \
1909 if (0) \
1910 printf
1911#endif
1912
1913/*
1914// Given an input string of characters and a location in which to start looking
1915// calculate, using the state table, which one is the last character of the syllable
1916// that starts in the starting position.
1917*/
1918static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1919{
1920 const char16_t *uc = s + start;
1921 int state = 0;
1923 *invalid = false;
1924
1925 while (pos < end) {
1926 MymrCharClass charClass = getMyanmarCharClass(*uc);
1928 if (pos == start)
1929 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
1930
1931 MMDEBUG("state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
1932
1933 if (state < 0) {
1934 if (state < -1)
1935 --pos;
1936 break;
1937 }
1938 ++uc;
1939 ++pos;
1940 }
1941 return pos;
1942}
1943
1944static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1945{
1946 qsizetype end = from + len;
1947 qsizetype i = 0;
1948 Q_UNUSED(script);
1949 attributes += from;
1950 while (i < len) {
1951 bool invalid;
1952 qsizetype boundary = myanmar_nextSyllableBoundary(text, from+i, end, &invalid) - from;
1953
1954 attributes[i].graphemeBoundary = true;
1955 attributes[i].lineBreak = true;
1956
1957 if (boundary > len-1)
1958 boundary = len;
1959 i++;
1960 while (i < boundary) {
1961 attributes[i].graphemeBoundary = false;
1962 ++i;
1963 }
1964 assert(i == boundary);
1965 }
1966}
1967
1968/*
1969// Vocabulary
1970// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
1971// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
1972// split vowels, signs... but there is only one base in a syllable, it has to be coded as
1973// the first character of the syllable.
1974// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
1975// Khmer language has five of them. Khmer split vowels either have one part before the
1976// base and one after the base or they have a part before the base and a part above the base.
1977// The first part of all Khmer split vowels is the same character, identical to
1978// the glyph of Khmer dependent vowel SRA EI
1979// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
1980// Differently than indian languages, the coeng modifies the consonant that follows it,
1981// not the one preceding it Each consonant has two forms, the base form and the subscript form
1982// the base form is the normal one (using the consonants code-point), the subscript form is
1983// displayed when the combination coeng + consonant is encountered.
1984// Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
1985// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
1986// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
1987// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
1988// if it is attached to a consonant of the first series or a consonant of the second series.
1989// Most consonants have an equivalent in the other series, but some of them exist only in
1990// one series (for example SA). If we want to use the consonant SA with a vowel sound that
1991// can only be produced by a vowel accompanying a consonant
1992// of the other series, then we need to use a consonant shifter: TRIISAP (U+17CA) or
1993// MUSIKATOAN (U+17C9). TRIISAP changes a first series consonant to a second series sound and
1994// MUSIKATOAN changes a second series consonant to have a first series vowel sound.
1995// The consonant shifters are both normally superscript marks, but, when they are followed by a
1996// superscript, they change shape and take the form of the subscript dependent vowel SRA U.
1997// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
1998// should be typed before the coeng. Unicode 4.0 breaks with that and says that they should
1999// be placed after the coeng consonant.
2000// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2001// Each vowel has its own position. Only one vowel per syllable is allowed.
2002// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2003// Allowed in a syllable.
2004//
2005//
2006// order is important here! This order must be the same that is found in each horizontal
2007// line in the statetable for Khmer (see khmerStateTable) .
2008*/
2011 CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
2012 CC_CONSONANT2 = 2, /* Consonant of type 2 */
2013 CC_CONSONANT3 = 3, /* Consonant of type 3 */
2014 CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
2016 CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
2017 CC_COENG = 7, /* Subscript consonant combining character */
2021 CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
2022 CC_COUNT = 12 /* This is the number of character classes */
2024
2025
2027 CF_CLASS_MASK = 0x0000FFFF,
2028
2029 CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2030 CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
2031 CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
2032 CF_COENG = 0x08000000, /* flag to speed up comparing */
2033 CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
2034 CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */
2035
2036 /* position flags */
2037 CF_POS_BEFORE = 0x00080000,
2038 CF_POS_BELOW = 0x00040000,
2039 CF_POS_ABOVE = 0x00020000,
2040 CF_POS_AFTER = 0x00010000,
2041 CF_POS_MASK = 0x000f0000
2043
2045
2046/* Characters that get referred to by name */
2048 C_SIGN_ZWNJ = 0x200C,
2049 C_SIGN_ZWJ = 0x200D,
2050 C_RO = 0x179A,
2051 C_VOWEL_AA = 0x17B6,
2053 C_VOWEL_E = 0x17C1,
2054 C_COENG = 0x17D2
2056
2057
2058/*
2059// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2060// they are also used to know where a character should be placed (location in reference to the base character)
2061// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2062// indicate error in syllable construction
2063*/
2064enum {
2078
2079 /* split vowel */
2083
2084
2085/*
2086// Character class: a character class value
2087// ORed with character class flags.
2088*/
2089typedef unsigned long KhmerCharClass;
2090
2091
2092/*
2093// Character class tables
2094// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
2095// _sa Sign placed above the base
2096// _sp Sign placed after the base
2097// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2098// _c2 Consonant of type 2 (only RO)
2099// _c3 Consonant of type 3
2100// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2101// _cd Consonant-shifter
2102// _dl Dependent vowel placed before the base (left of the base)
2103// _db Dependent vowel placed below the base
2104// _da Dependent vowel placed above the base
2105// _dr Dependent vowel placed behind the base (right of the base)
2106// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2107// it to create a subscript consonant or independent vowel
2108// _va Khmer split vowel in which the first part is before the base and the second one above the base
2109// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2110*/
2112 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2113 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2114 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2115 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2116 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2117 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2118};
2119
2120/* this enum must reflect the range of khmerCharClasses */
2125
2126/*
2127// Below we define how a character in the input string is either in the khmerCharClasses table
2128// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2129// within the syllable, but are not in the table) we also get their type back, or an unknown object
2130// in which case we get _xx (CC_RESERVED) back
2131*/
2133{
2134 if (uc == C_SIGN_ZWJ) {
2135 return CC_ZERO_WIDTH_J_MARK;
2136 }
2137
2138 if (uc == C_SIGN_ZWNJ) {
2139 return CC_ZERO_WIDTH_NJ_MARK;
2140 }
2141
2142 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2143 return CC_RESERVED;
2144 }
2145
2146 return khmerCharClasses[uc - KhmerFirstChar];
2147}
2148
2149
2150/*
2151// The stateTable is used to calculate the end (the length) of a well
2152// formed Khmer Syllable.
2153//
2154// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2155// CharClassValues. This coincidence of values allows the follow up of the table.
2156//
2157// Each line corresponds to a state, which does not necessarily need to be a type
2158// of component... for example, state 2 is a base, which is always the first character
2159// in the syllable, but the state could be produced by a consonant of any type when
2160// it is the first character that is analysed (in ground state).
2161//
2162// Differentiating 3 types of consonants is necessary in order to
2163// forbid the use of certain combinations, such as having a second
2164// coeng after a coeng RO,
2165// The inexistent possibility of having a type 3 after another type 3 is permitted,
2166// eliminating it would very much complicate the table, and it does not create typing
2167// problems, as the case above.
2168//
2169// The table is quite complex, in order to limit the number of coeng consonants
2170// to 2 (by means of the table).
2171//
2172// There is a peculiarity, as far as Unicode is concerned:
2173// - The consonant-shifter is considered in two possible different
2174// locations, the one considered in Unicode 3.0 and the one considered in
2175// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2176//
2177//
2178// xx independent character, such as a number, punctuation sign or non-khmer char
2179//
2180// c1 Khmer consonant of type 1 or an independent vowel
2181// that is, a letter in which the subscript form is only under the
2182// base, not taking any space to the right or to the left
2183//
2184// c2 Khmer consonant of type 2, the coeng form takes space under
2185// and to the left of the base (only RO is of this type)
2186//
2187// c3 Khmer consonant of type 3. Its subscript form takes space under
2188// and to the right of the base.
2189//
2190// cs Khmer consonant shifter
2191//
2192// rb Khmer robat
2193//
2194// co coeng character (u17D2)
2195//
2196// dv dependent vowel (including split vowels, they are treated in the same way).
2197// even if dv is not defined above, the component that is really tested for is
2198// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2199//
2200// zwj Zero Width joiner
2201//
2202// zwnj Zero width non joiner
2203//
2204// sa above sign
2205//
2206// sp post sign
2207//
2208// there are lines with equal content but for an easier understanding
2209// (and maybe change in the future) we did not join them
2210*/
// State transition table of the Khmer syllable scanner: indexed as
// khmerStateTable[state][character class], it yields the state to continue
// with. Rows are states (numbered in the trailing comment of each row),
// columns are the CC_COUNT character classes in the order of the header line.
// A negative entry ends the syllable.
2211static const signed char khmerStateTable[][CC_COUNT] =
2212{
2213 /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2214 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2215 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2216 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2217 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter. It can only be followed by a shifter or a vowel */
2218 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2219 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2220 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2221 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2222 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2223 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
2224 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2225 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2226 {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2227 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2228 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2229 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2230 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2231 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2232 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2233 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2234 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2235};
2236
2237
// Tracing helper for the Khmer syllable scanner. With KHMER_DEBUG defined
// KHDEBUG is qDebug; otherwise it expands to `if (0) printf`, so the call is
// still compiled but never executed.
2238/* #define KHMER_DEBUG */
2239#ifdef KHMER_DEBUG
2240#define KHDEBUG qDebug
2241#else
2242# define KHDEBUG \
2243 if (0) \
2244 printf
2245#endif
2246
2247/*
2248// Given an input string of characters and a location in which to start looking
2249// calculate, using the state table, which one is the last character of the syllable
2250// that starts in the starting position.
2251*/
2252static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2253{
2254 const char16_t *uc = s + start;
2255 int state = 0;
2257 *invalid = false;
2258
2259 while (pos < end) {
2260 KhmerCharClass charClass = getKhmerCharClass(*uc);
2261 if (pos == start) {
2262 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2263 }
2264 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2265
2266 KHDEBUG("state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2267 charClass, *uc );
2268
2269 if (state < 0) {
2270 break;
2271 }
2272 ++uc;
2273 ++pos;
2274 }
2275 return pos;
2276}
2277
2278static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2279{
2280 qsizetype end = from + len;
2281 qsizetype i = 0;
2282 Q_UNUSED(script);
2283 attributes += from;
2284 while ( i < len ) {
2285 bool invalid;
2286 qsizetype boundary = khmer_nextSyllableBoundary( text, from+i, end, &invalid ) - from;
2287
2288 attributes[i].graphemeBoundary = true;
2289
2290 if ( boundary > len-1 ) boundary = len;
2291 i++;
2292 while ( i < boundary ) {
2293 attributes[i].graphemeBoundary = false;
2294 ++i;
2295 }
2296 assert( i == boundary );
2297 }
2298}
2299
2300
2302// Script_Unknown,
2303 nullptr,
2304// Script_Inherited,
2305 nullptr,
2306// Script_Common,
2307 nullptr,
2308// Script_Latin,
2309 nullptr,
2310// Script_Greek,
2311 nullptr,
2312// Script_Cyrillic,
2313 nullptr,
2314// Script_Armenian,
2315 nullptr,
2316// Script_Hebrew,
2317 nullptr,
2318// Script_Arabic,
2319 nullptr,
2320// Script_Syriac,
2321 nullptr,
2322// Script_Thaana,
2323 nullptr,
2324// Script_Devanagari,
2326// Script_Bengali,
2328// Script_Gurmukhi,
2330// Script_Gujarati,
2332// Script_Oriya,
2334// Script_Tamil,
2336// Script_Telugu,
2338// Script_Kannada,
2340// Script_Malayalam,
2342// Script_Sinhala,
2344// Script_Thai,
2346// Script_Lao,
2347 nullptr,
2348// Script_Tibetan,
2350// Script_Myanmar,
2352// Script_Georgian,
2353 nullptr,
2354// Script_Hangul,
2355 nullptr,
2356// Script_Ethiopic,
2357 nullptr,
2358// Script_Cherokee,
2359 nullptr,
2360// Script_CanadianAboriginal,
2361 nullptr,
2362// Script_Ogham,
2363 nullptr,
2364// Script_Runic,
2365 nullptr,
2366// Script_Khmer,
2368};
2369
2370static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2371 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2372 QCharAttributes *attributes)
2373{
2374 if (stringLength == 0)
2375 return;
2376 for (qsizetype i = 0; i < numItems; ++i) {
2377 QChar::Script script = items[i].script;
2378 if (script > QChar::Script_Khmer)
2379 script = QChar::Script_Common;
2380 CharAttributeFunction attributeFunction = charAttributeFunction[script];
2381 if (!attributeFunction)
2382 continue;
2383 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2384 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2385 }
2386}
2387
2388}
2389
2390Q_CORE_EXPORT void initCharAttributes(QStringView string,
2391 const ScriptItem *items, qsizetype numItems,
2392 QCharAttributes *attributes, CharAttributeOptions options)
2393{
2394 if (string.size() <= 0)
2395 return;
2396
2397 if (!(options & DontClearAttributes))
2398 ::memset(attributes, 0, (string.size() + 1) * sizeof(QCharAttributes));
2399
2400 if (options & GraphemeBreaks)
2401 getGraphemeBreaks(string.utf16(), string.size(), attributes);
2402 if (options & WordBreaks)
2403 getWordBreaks(string.utf16(), string.size(), attributes);
2404 if (options & SentenceBreaks)
2405 getSentenceBreaks(string.utf16(), string.size(), attributes);
2406 if (options & LineBreaks)
2407 getLineBreaks(string.utf16(), string.size(), attributes, options);
2408 if (options & WhiteSpaces)
2409 getWhiteSpaces(string.utf16(), string.size(), attributes);
2410
2412 if (!items || numItems <= 0)
2413 return;
2414
2415 Tailored::getCharAttributes(string.utf16(), string.size(), items, numItems, attributes);
2416 }
2417}
2418
2419
2420// ----------------------------------------------------------------------------
2421//
2422// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2423//
2424// ----------------------------------------------------------------------------
2425
2426Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2427{
2428 qsizetype sor = 0;
2429 qsizetype eor = 0;
2430 QChar::Script script = QChar::Script_Common;
2431
2432 for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
2433 char32_t ucs4 = string[i].unicode();
2434 if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2435 ushort low = string[i + 1].unicode();
2436 if (QChar::isLowSurrogate(low)) {
2437 ucs4 = QChar::surrogateToUcs4(ucs4, low);
2438 ++i;
2439 }
2440 }
2441
2443
2444 QChar::Script nscript = QChar::Script(prop->script);
2445
2446 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2447 continue;
2448
2449 // inherit preceding Common-s
2450 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2451 // also covers a case where the base character of Common script followed
2452 // by one or more combining marks of non-Inherited, non-Common script
2453 script = nscript;
2454 continue;
2455 }
2456
2457 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2458 // Thus, a combining mark - whatever its script property value is - should inherit
2459 // the script property value of its base character.
2460 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2461 if (Q_UNLIKELY(FLAG(prop->category) & test))
2462 continue;
2463
2464 Q_ASSERT(script > QChar::Script_Common);
2465 Q_ASSERT(sor < eor);
2466 scripts->append(ScriptItem{sor, script});
2467 sor = eor;
2468
2469 script = nscript;
2470 }
2471
2472 Q_ASSERT(script >= QChar::Script_Common);
2473 Q_ASSERT(eor == string.size());
2474 scripts->append(ScriptItem{sor, script});
2475}
2476
2477} // namespace QUnicodeTools
2478
\inmodule QtCore \reentrant
Definition qlibrary.h:17
\inmodule QtCore
Definition qstringview.h:78
void append(const T &t)
const QLoggingCategory & category()
[1]
QString text
else opt state
[0]
void newState(QList< State > &states, const char *token, const char *lexem, bool pre)
short next
Definition keywords.cpp:445
Combined button and popup list for selecting options.
Q_DECL_CONST_FUNCTION Q_CORE_EXPORT const Properties *QT_FASTCALL properties(char32_t ucs4) noexcept
@ GraphemeBreak_Extended_Pictographic
static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first, QUnicodeTables::GraphemeBreakClass second)
static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses]
static const GBTableEntryType Extend_SpacingMark_ZWJ
static const GBTableEntryType HardBreak
Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
static const uchar actionTable[CLCP+1][CLCP+1]
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ]
static const uchar breakTable[BAfter+1][QUnicodeTables::NumSentenceBreakClasses]
static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
const CharAttributeFunction charAttributeFunction[]
static const MymrCharClass mymrCharClasses[]
static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static const signed char mymrStateTable[][Mymr_CC_COUNT]
static MymrCharClass getMyanmarCharClass(ushort ch)
static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const unsigned char tibetanForm[0x80]
void(*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes) CharAttributeFunction
static const KhmerCharClass khmerCharClasses[]
static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static const signed char khmerStateTable[][CC_COUNT]
static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static const unsigned char indicForms[0xe00-0x900]
static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static void getCharAttributes(const char16_t *string, qsizetype stringLength, const QUnicodeTools::ScriptItem *items, qsizetype numItems, QCharAttributes *attributes)
static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
static KhmerCharClass getKhmerCharClass(ushort uc)
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses]
static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initCharAttributes(QStringView string, const ScriptItem *items, qsizetype numItems, QCharAttributes *attributes, CharAttributeOptions options)
static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
#define assert
#define FLAG(x)
Definition qchar.cpp:15
#define Q_FALLTHROUGH()
#define Q_UNLIKELY(x)
#define Q_LIKELY(x)
DBusConnection const char DBusError DBusBusType DBusError return DBusConnection DBusHandleMessageFunction void DBusFreeFunction return DBusConnection return DBusConnection return const char DBusError return DBusConnection DBusMessage dbus_uint32_t return DBusConnection dbus_bool_t DBusConnection DBusAddWatchFunction DBusRemoveWatchFunction DBusWatchToggledFunction void DBusFreeFunction return DBusConnection DBusDispatchStatusFunction void DBusFreeFunction DBusTimeout return DBusTimeout return DBusWatch return DBusWatch unsigned int return DBusError const DBusError return const DBusMessage return DBusMessage return DBusMessage return DBusMessage return DBusMessage return DBusMessage return DBusMessageIter int const void return DBusMessageIter DBusMessageIter return DBusMessageIter void DBusMessageIter void int return DBusMessage DBusMessageIter return DBusMessageIter return DBusMessageIter DBusMessageIter const char const char const char const char return DBusMessage return DBusMessage const char return DBusMessage dbus_bool_t return DBusMessage dbus_uint32_t return DBusMessage void
#define Q_DECLARE_MIXED_ENUM_OPERATORS(Ret, Flags, Enum)
Definition qflags.h:241
#define Q_GLOBAL_STATIC(TYPE, NAME,...)
#define NS(x)
Definition qmetatype.cpp:64
GLint GLint GLint GLint GLint x
[0]
GLenum GLuint GLintptr GLsizeiptr size
[1]
GLuint GLuint end
GLdouble GLdouble GLdouble GLdouble top
GLuint start
GLint first
GLdouble s
[6]
Definition qopenglext.h:235
const GLubyte * c
GLuint64EXT * result
[6]
GLenum GLsizei len
static qreal position(const QQuickItem *item, QQuickAnchors::Anchor anchorLine)
#define Q_ASSERT(cond)
Definition qrandom.cpp:47
static QString lineBreak(QString s)
Definition main.cpp:752
#define Q_AUTOTEST_EXPORT
#define Q_UNUSED(x)
unsigned char uchar
Definition qtypes.h:32
unsigned short quint16
Definition qtypes.h:48
ptrdiff_t qsizetype
Definition qtypes.h:165
unsigned int uint
Definition qtypes.h:34
unsigned short ushort
Definition qtypes.h:33
qint64 qlonglong
Definition qtypes.h:63
#define KHDEBUG
#define IDEBUG
constexpr int qt_initcharattributes_default_algorithm_only
#define tibetan_form(c)
#define MMDEBUG
static const uint base
Definition qurlidna.cpp:20
QList< QTreeWidgetItem * > items