|
Lines 23-28
a/Source/WebCore/platform/text/TextBreakIteratorICU.cpp_sec1
|
| 23 |
#include "TextBreakIterator.h" |
23 |
#include "TextBreakIterator.h" |
| 24 |
|
24 |
|
| 25 |
#include "LineBreakIteratorPoolICU.h" |
25 |
#include "LineBreakIteratorPoolICU.h" |
|
|
26 |
#include <unicode/ubrk.h> |
| 27 |
#include <unicode/uloc.h> |
| 26 |
#include <wtf/Atomics.h> |
28 |
#include <wtf/Atomics.h> |
| 27 |
#include <wtf/text/WTFString.h> |
29 |
#include <wtf/text/WTFString.h> |
| 28 |
|
30 |
|
|
Lines 260-268
TextBreakIterator* wordBreakIterator(const UChar* string, int length)
a/Source/WebCore/platform/text/TextBreakIteratorICU.cpp_sec2
|
| 260 |
staticWordBreakIterator, UBRK_WORD, string, length); |
262 |
staticWordBreakIterator, UBRK_WORD, string, length); |
| 261 |
} |
263 |
} |
| 262 |
|
264 |
|
| 263 |
TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale) |
265 |
TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, LineBreakIteratorMode mode, bool isCJK) |
| 264 |
{ |
266 |
{ |
| 265 |
UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale); |
267 |
TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale, mode, isCJK); |
| 266 |
if (!iterator) |
268 |
if (!iterator) |
| 267 |
return 0; |
269 |
return 0; |
| 268 |
|
270 |
|
|
Lines 278-285
TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, con
a/Source/WebCore/platform/text/TextBreakIteratorICU.cpp_sec3
|
| 278 |
return 0; |
280 |
return 0; |
| 279 |
} |
281 |
} |
| 280 |
|
282 |
|
|
|
283 |
UBreakIterator* ubrkIter = reinterpret_cast<UBreakIterator*>(iterator); |
| 281 |
UErrorCode setTextStatus = U_ZERO_ERROR; |
284 |
UErrorCode setTextStatus = U_ZERO_ERROR; |
| 282 |
ubrk_setUText(iterator, uTextLatin1, &setTextStatus); |
285 |
ubrk_setUText(ubrkIter, uTextLatin1, &setTextStatus); |
| 283 |
if (U_FAILURE(setTextStatus)) { |
286 |
if (U_FAILURE(setTextStatus)) { |
| 284 |
LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); |
287 |
LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); |
| 285 |
return 0; |
288 |
return 0; |
|
Lines 287-316
TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, con
a/Source/WebCore/platform/text/TextBreakIteratorICU.cpp_sec4
|
| 287 |
|
290 |
|
| 288 |
utext_close(uTextLatin1); |
291 |
utext_close(uTextLatin1); |
| 289 |
|
292 |
|
| 290 |
return reinterpret_cast<TextBreakIterator*>(iterator); |
293 |
return iterator; |
| 291 |
} |
294 |
} |
| 292 |
|
295 |
|
| 293 |
// Takes a line break iterator matching |locale|, |mode| and |isCJK| from the
// shared LineBreakIteratorPool and points it at the UTF-16 text |string| of
// |length| code units.
//
// Returns 0 when the pool hands out no iterator or when ICU refuses the text.
// A non-null result is given back to the pool with releaseLineBreakIterator().
TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, LineBreakIteratorMode mode, bool isCJK)
{
    TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale, mode, isCJK);
    if (!iterator)
        return 0;

    // TextBreakIterator is only an opaque alias here; ICU needs its own handle type.
    UBreakIterator* ubrkIter = reinterpret_cast<UBreakIterator*>(iterator);
    UErrorCode setTextStatus = U_ZERO_ERROR;
    ubrk_setText(ubrkIter, string, length, &setTextStatus);
    if (U_FAILURE(setTextStatus)) {
        LOG_ERROR("ubrk_setText failed with status %d", setTextStatus);
        // The caller only ever sees 0 on this path and so cannot release the
        // iterator itself; hand it back to the pool here instead of stranding it.
        LineBreakIteratorPool::sharedPool().put(iterator);
        return 0;
    }

    return iterator;
}
| 308 |
|
312 |
|
| 309 |
// Gives |iterator| back to the shared LineBreakIteratorPool, the same pool
// acquireLineBreakIterator() takes its iterators from.
void releaseLineBreakIterator(TextBreakIterator* iterator)
{
    // Passing a null iterator is treated as a caller error.
    ASSERT_ARG(iterator, iterator);

    LineBreakIteratorPool::sharedPool().put(iterator);
}
| 319 |
|
| 320 |
// Returns true when the locale text |s| (|length| characters) starts with one
// of the two-letter primary language subtags 'zh', 'ja' or 'ko', in any
// combination of case, and that subtag is either the whole string or is
// followed by a NUL, '-', '_' or '@'.
//
// The 3-letter variants 'chi'/'zho', 'jpn' and 'kor' are deliberately not
// recognized, since BCP47 requires use of the shortest language tag.
template<typename T>
static bool isCJKLocale(const T* s, size_t length)
{
    if (!s || length < 2)
        return false;

    // Anything longer than the bare subtag must continue with a terminator or
    // a subtag separator; otherwise the primary subtag is not two letters long.
    if (length > 2) {
        const T following = s[2];
        if (following && following != '-' && following != '_' && following != '@')
            return false;
    }

    const T second = s[1];
    switch (s[0]) {
    case 'z':
    case 'Z':
        return second == 'h' || second == 'H';
    case 'j':
    case 'J':
        return second == 'a' || second == 'A';
    case 'k':
    case 'K':
        return second == 'o' || second == 'O';
    default:
        return false;
    }
}
| 342 |
|
| 343 |
bool isCJKLocale(const AtomicString& locale) |
| 344 |
{ |
| 345 |
if (locale.isEmpty()) |
| 346 |
return false; |
| 347 |
size_t length = locale.length(); |
| 348 |
if (locale.is8Bit()) |
| 349 |
return isCJKLocale<LChar>(locale.characters8(), length); |
| 350 |
return isCJKLocale<UChar>(locale.characters16(), length); |
| 351 |
} |
| 352 |
|
| 353 |
static void mapLineIteratorModeToRules(LineBreakIteratorMode, bool isCJK, String& rules); |
| 354 |
|
| 355 |
// Creates a new ICU line break iterator for |locale|, |mode| and |isCJK|.
//
// Plain UAX14 mode for non-CJK content uses ICU's built-in line break rules
// for the locale (or for currentTextBreakLocaleID() when |locale| is empty).
// Every other combination compiles the customized rule source produced by
// mapLineIteratorModeToRules().
//
// If that first attempt fails, the built-in rules for the default locale are
// tried once. Returns the iterator, or 0 if nothing could be opened.
TextBreakIterator* openLineBreakIterator(const AtomicString& locale, LineBreakIteratorMode mode, bool isCJK)
{
    UBreakIterator* ubrkIter = 0;
    UErrorCode openStatus = U_ZERO_ERROR;
    const bool isLocaleEmpty = locale.isEmpty();
    const bool useBuiltInRules = (mode == LineBreakIteratorModeUAX14) && !isCJK;
    if (useBuiltInRules)
        ubrkIter = ubrk_open(UBRK_LINE, isLocaleEmpty ? currentTextBreakLocaleID() : locale.string().utf8().data(), 0, 0, &openStatus);
    else {
        UParseError parseStatus;
        String rules;
        mapLineIteratorModeToRules(mode, isCJK, rules);
        ubrkIter = ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus);
    }

    // Locale comes from a web page and it can be invalid, leading ICU
    // to fail, in which case we fall back to the default locale (with default rules).
    // The same fallback applies when the custom rules fail to compile, whether
    // or not a locale was given. Only the case where the first attempt already
    // was "default locale + built-in rules" is not retried, since the retry
    // would be the identical call.
    const bool firstAttemptWasDefault = useBuiltInRules && isLocaleEmpty;
    if (U_FAILURE(openStatus) && !firstAttemptWasDefault) {
        openStatus = U_ZERO_ERROR;
        ubrkIter = ubrk_open(UBRK_LINE, currentTextBreakLocaleID(), 0, 0, &openStatus);
    }

    if (U_FAILURE(openStatus)) {
        LOG_ERROR("ubrk_open failed with status %d", openStatus);
        ASSERT(!ubrkIter);
    }
    return reinterpret_cast<TextBreakIterator*>(ubrkIter);
}
| 381 |
|
| 382 |
// Closes the ICU break iterator behind |iterator| and resets the caller's
// pointer to 0 so the closed handle cannot be reused by accident.
void closeLineBreakIterator(TextBreakIterator*& iterator)
{
    UBreakIterator* ubrkIter = reinterpret_cast<UBreakIterator*>(iterator);
    ASSERT(ubrkIter);

    ubrk_close(ubrkIter);
    iterator = 0;
}
| 315 |
|
389 |
|
| 316 |
static TextBreakIterator* nonSharedCharacterBreakIterator; |
390 |
static TextBreakIterator* nonSharedCharacterBreakIterator; |
|
Lines 510-513
TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
a/Source/WebCore/platform/text/TextBreakIteratorICU.cpp_sec5
|
| 510 |
return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length); |
584 |
return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length); |
| 511 |
} |
585 |
} |
| 512 |
|
586 |
|
|
|
587 |
// Option directives that open the customized line break rule source; this is
// the first piece mapLineIteratorModeToRules() appends.
static const char* uax14Prologue =
    "!!chain;" "!!LBCMNoChain;" "!!lookAheadHardBreak;";
| 591 |
|
| 592 |
// Defines the $CJ set, which the mode-specific tables and uax14AssignmentsAfter
// refer to, so it has to come before both of them in the rule source.
//
// ICU versions prior to 49 don't support
// :LineBreak=Conditional_Japanese_Starter:, so for those the members of $CJ are
// enumerated explicitly. ICU 49 and later report their release number directly
// as U_ICU_VERSION_MAJOR_NUM (49, 50, ...) with a small minor number, so the
// test must be on the major number alone; a "major >= 4 && minor >= 9" test is
// false for 49.1, 50.1 and so on.
static const char* uax14AssignmentsBefore =
    "$CJ = ["
#if U_ICU_VERSION_MAJOR_NUM >= 49
    ":LineBreak=Conditional_Japanese_Starter:"
#else
    "\\u3041\\u3043\\u3045\\u3047\\u3049\\u3063\\u3083\\u3085\\u3087\\u308E\\u3095\\u3096\\u30A1\\u30A3\\u30A5\\u30A7"
    "\\u30A9\\u30C3\\u30E3\\u30E5\\u30E7\\u30EE\\u30F5\\u30F6\\u30FC"
    "\\u31F0\\u31F1\\u31F2\\u31F3\\u31F4\\u31F5\\u31F6\\u31F7\\u31F8\\u31F9\\u31FA\\u31FB\\u31FC\\u31FD\\u31FE\\u31FF"
    "\\uFF67\\uFF68\\uFF69\\uFF6A\\uFF6B\\uFF6C\\uFF6D\\uFF6E\\uFF6F\\uFF70"
#endif
    "];";
| 604 |
|
| 605 |
// Loose mode, CJK content. Each $xx_SUB set is subtracted from line break class
// xx in uax14AssignmentsAfter, and $ID_ADD / $NS_ADD are added to the
// Ideographic / Nonstarter sets there.
static const char* uax14AssignmentsCustomLooseCJK =
    // Sets taken out of their usual classes.
    "$BA_SUB = [\\u2010\\u2013];"
    "$EX_SUB = [\\u0021\\u003F\\uFF01\\uFF1F];"
    "$ID_SUB = '';"
    "$IN_SUB = [\\u2025\\u2026];"
    "$IS_SUB = [\\u003A\\u003B];"
    "$NS_SUB = [\\u203C\\u2047\\u2048\\u2049\\u3005\\u301C\\u303B\\u309D\\u309E\\u30A0\\u30FB\\u30FD\\u30FE\\uFF1A\\uFF1B\\uFF65];"
    "$PO_SUB = [\\u0025\\u00A2\\u00B0\\u2030\\u2032\\u2033\\u2103\\uFF05\\uFFE0];"
    "$PR_SUB = [\\u0024\\u00A3\\u00A5\\u20AC\\u2116\\uFF04\\uFFE1\\uFFE5];"
    // Everything removed above, plus $CJ, is added to the Ideographic set.
    "$ID_ADD = [$CJ $BA_SUB $EX_SUB $IN_SUB $IS_SUB $NS_SUB $PO_SUB $PR_SUB];"
    "$NS_ADD = '';";
| 616 |
|
| 617 |
// Loose mode, non-CJK content. Only the Inseparable and Nonstarter classes
// lose characters; those characters and $CJ are added to the Ideographic set.
static const char* uax14AssignmentsCustomLooseNonCJK =
    "$BA_SUB = '';" "$EX_SUB = '';" "$ID_SUB = '';"
    "$IN_SUB = [\\u2025\\u2026];"
    "$IS_SUB = '';"
    "$NS_SUB = [\\u3005\\u303B\\u309D\\u309E\\u30FD\\u30FE];"
    "$PO_SUB = '';" "$PR_SUB = '';"
    "$ID_ADD = [$CJ $IN_SUB $NS_SUB];"
    "$NS_ADD = '';";
| 628 |
|
| 629 |
// Normal mode, CJK content. Two Break_After and two Nonstarter characters are
// pulled out of their classes and, together with $CJ, added to the
// Ideographic set.
static const char* uax14AssignmentsCustomNormalCJK =
    "$BA_SUB = [\\u2010\\u2013];"
    "$EX_SUB = '';" "$IN_SUB = '';" "$ID_SUB = '';" "$IS_SUB = '';"
    "$NS_SUB = [\\u301C\\u30A0];"
    "$PO_SUB = '';" "$PR_SUB = '';"
    "$ID_ADD = [$CJ $BA_SUB $NS_SUB];"
    "$NS_ADD = '';";
| 640 |
|
| 641 |
// Normal mode, non-CJK content. No class loses any characters; only $CJ is
// added to the Ideographic set.
static const char* uax14AssignmentsCustomNormalNonCJK =
    "$BA_SUB = '';" "$EX_SUB = '';" "$ID_SUB = '';" "$IN_SUB = '';"
    "$IS_SUB = '';" "$NS_SUB = '';" "$PO_SUB = '';" "$PR_SUB = '';"
    "$ID_ADD = [$CJ];"
    "$NS_ADD = '';";
| 652 |
|
| 653 |
// Strict mode. No class loses any characters, and $CJ is added to the
// Nonstarter set rather than the Ideographic set.
static const char* uax14AssignmentsCustomStrictCJK =
    "$BA_SUB = '';" "$EX_SUB = '';" "$ID_SUB = '';" "$IN_SUB = '';"
    "$IS_SUB = '';" "$NS_SUB = '';" "$PO_SUB = '';" "$PR_SUB = '';"
    "$ID_ADD = '';"
    "$NS_ADD = [$CJ];";
| 664 |
|
| 665 |
// Combinations that have no table of their own and reuse an existing one.

// Strict mode applies the same assignments to CJK and non-CJK content.
#define uax14AssignmentsCustomStrictNonCJK uax14AssignmentsCustomStrictCJK

// Plain UAX14 mode (LineBreakIteratorModeUAX14): CJK content gets the normal
// CJK table, non-CJK content gets the strict table.
#define uax14AssignmentsCustomDefaultCJK uax14AssignmentsCustomNormalCJK
#define uax14AssignmentsCustomDefaultNonCJK uax14AssignmentsCustomStrictNonCJK
| 668 |
|
| 669 |
// Line break class definitions for the customized rule source. They reference
// $CJ (from uax14AssignmentsBefore) and the $xx_SUB / $xx_ADD sets from the
// mode-specific tables, so mapLineIteratorModeToRules() appends this after
// both of those.
//
// The Hebrew_Letter property value is only used on ICU 49 and later. ICU 49+
// reports its release number directly as U_ICU_VERSION_MAJOR_NUM with a small
// minor number, so the test is on the major number alone; a
// "major >= 4 && minor >= 9" test is false for 49.1, 50.1 and so on.
static const char* uax14AssignmentsAfter =
    // Character classes; the ones with a matching $xx_SUB have that set removed.
    "$AI = [:LineBreak = Ambiguous:];"
    "$AL = [:LineBreak = Alphabetic:];"
    "$BA = [[:LineBreak = Break_After:] - $BA_SUB];"
    "$BB = [:LineBreak = Break_Before:];"
    "$BK = [:LineBreak = Mandatory_Break:];"
    "$B2 = [:LineBreak = Break_Both:];"
    "$CB = [:LineBreak = Contingent_Break:];"
    "$CL = [:LineBreak = Close_Punctuation:];"
    "$CM = [:LineBreak = Combining_Mark:];"
    "$CP = [:LineBreak = Close_Parenthesis:];"
    "$CR = [:LineBreak = Carriage_Return:];"
    "$EX = [[:LineBreak = Exclamation:] - $EX_SUB];"
    "$GL = [:LineBreak = Glue:];"
#if U_ICU_VERSION_MAJOR_NUM >= 49
    "$HL = [:LineBreak = Hebrew_Letter:];"
#else
    "$HL = [[:Hebrew:] & [:Letter:]];"
#endif
    "$HY = [:LineBreak = Hyphen:];"
    "$H2 = [:LineBreak = H2:];"
    "$H3 = [:LineBreak = H3:];"
    "$ID = [[[[:LineBreak = Ideographic:] - $CJ] $ID_ADD] - $ID_SUB];"
    "$IN = [[:LineBreak = Inseparable:] - $IN_SUB];"
    "$IS = [[:LineBreak = Infix_Numeric:] - $IS_SUB];"
    "$JL = [:LineBreak = JL:];"
    "$JV = [:LineBreak = JV:];"
    "$JT = [:LineBreak = JT:];"
    "$LF = [:LineBreak = Line_Feed:];"
    "$NL = [:LineBreak = Next_Line:];"
    "$NS = [[[[:LineBreak = Nonstarter:] - $CJ] $NS_ADD] - $NS_SUB];"
    "$NU = [:LineBreak = Numeric:];"
    "$OP = [:LineBreak = Open_Punctuation:];"
    "$PO = [[:LineBreak = Postfix_Numeric:] - $PO_SUB];"
    "$PR = [[:LineBreak = Prefix_Numeric:] - $PR_SUB];"
    "$QU = [:LineBreak = Quotation:];"
    "$SA = [:LineBreak = Complex_Context:];"
    "$SG = [:LineBreak = Surrogate:];"
    "$SP = [:LineBreak = Space:];"
    "$SY = [:LineBreak = Break_Symbols:];"
    "$WJ = [:LineBreak = Word_Joiner:];"
    "$XX = [:LineBreak = Unknown:];"
    "$ZW = [:LineBreak = ZWSpace:];"
    "$dictionary = [:LineBreak = Complex_Context:];"
    "$ALPlus = [$AL $AI $SA $SG $XX];"
    // Each class followed by any number of combining marks.
    "$ALcm = $ALPlus $CM*;"
    "$BAcm = $BA $CM*;"
    "$BBcm = $BB $CM*;"
    "$B2cm = $B2 $CM*;"
    "$CLcm = $CL $CM*;"
    "$CPcm = $CP $CM*;"
    "$EXcm = $EX $CM*;"
    "$GLcm = $GL $CM*;"
    "$HLcm = $HL $CM*;"
    "$HYcm = $HY $CM*;"
    "$H2cm = $H2 $CM*;"
    "$H3cm = $H3 $CM*;"
    "$IDcm = $ID $CM*;"
    "$INcm = $IN $CM*;"
    "$IScm = $IS $CM*;"
    "$JLcm = $JL $CM*;"
    "$JVcm = $JV $CM*;"
    "$JTcm = $JT $CM*;"
    "$NScm = $NS $CM*;"
    "$NUcm = $NU $CM*;"
    "$OPcm = $OP $CM*;"
    "$POcm = $PO $CM*;"
    "$PRcm = $PR $CM*;"
    "$QUcm = $QU $CM*;"
    "$SYcm = $SY $CM*;"
    "$WJcm = $WJ $CM*;";
| 740 |
|
| 741 |
// The "!!forward" section of the customized line break rule source, appended
// by mapLineIteratorModeToRules() after the class assignments it refers to.
//
// The rule that keeps leading combining marks together with a following glue
// character is spelled "$CM+ $GLcm;". Without the '$', "GLcm" is not a
// variable reference but the literal text "GLcm".
static const char* uax14Forward =
    "!!forward;"
    // Helper sets used by the rules below.
    "$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM];"
    "$CANT_CM = [$SP $BK $CR $LF $NL $ZW $CM];"
    "$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];"
    "$AL_FOLLOW_CM = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];"
    "$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];"
    "$LB4Breaks = [$BK $CR $LF $NL];"
    "$LB4NonBreaks = [^$BK $CR $LF $NL];"
    "$LB8Breaks = [$LB4Breaks $ZW];"
    "$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];"
    "$LB18NonBreaks = [$LB8NonBreaks - [$SP]];"
    "$LB18Breaks = [$LB8Breaks $SP];"
    "$LB20NonBreaks = [$LB18NonBreaks - $CB];"
    // A class character followed by at least one combining mark.
    "$ALPlus $CM+;"
    "$BA $CM+;"
    "$BB $CM+;"
    "$B2 $CM+;"
    "$CL $CM+;"
    "$CP $CM+;"
    "$EX $CM+;"
    "$GL $CM+;"
    "$HL $CM+;"
    "$HY $CM+;"
    "$H2 $CM+;"
    "$H3 $CM+;"
    "$ID $CM+;"
    "$IN $CM+;"
    "$IS $CM+;"
    "$JL $CM+;"
    "$JV $CM+;"
    "$JT $CM+;"
    "$NS $CM+;"
    "$NU $CM+;"
    "$OP $CM+;"
    "$PO $CM+;"
    "$PR $CM+;"
    "$QU $CM+;"
    "$SY $CM+;"
    "$WJ $CM+;"
    // Rules tagged {100} end in a mandatory-break character.
    "$CR $LF {100};"
    "$LB4NonBreaks? $LB4Breaks {100};"
    "$CAN_CM $CM* $LB4Breaks {100};"
    "$CM+ $LB4Breaks {100};"
    "$LB4NonBreaks [$SP $ZW];"
    "$CAN_CM $CM* [$SP $ZW];"
    "$CM+ [$SP $ZW];"
    "$CAN_CM $CM+;"
    "$CM+;"
    "$CAN_CM $CM* $WJcm;"
    "$LB8NonBreaks $WJcm;"
    "$CM+ $WJcm;"
    "$WJcm $CANT_CM;"
    "$WJcm $CAN_CM $CM*;"
    "$GLcm $CAN_CM $CM*;"
    "$GLcm $CANT_CM;"
    "[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;"
    "$CM+ $GLcm;"
    "$LB8NonBreaks $CL;"
    "$CAN_CM $CM* $CL;"
    "$CM+ $CL;"
    "$LB8NonBreaks $CP;"
    "$CAN_CM $CM* $CP;"
    "$CM+ $CP;"
    "$LB8NonBreaks $EX;"
    "$CAN_CM $CM* $EX;"
    "$CM+ $EX;"
    "$LB8NonBreaks $IS;"
    "$CAN_CM $CM* $IS;"
    "$CM+ $IS;"
    "$LB8NonBreaks $SY;"
    "$CAN_CM $CM* $SY;"
    "$CM+ $SY;"
    "$OPcm $SP* $CAN_CM $CM*;"
    "$OPcm $SP* $CANT_CM;"
    "$OPcm $SP+ $CM+ $AL_FOLLOW?;"
    "$QUcm $SP* $OPcm;"
    "($CLcm | $CPcm) $SP* $NScm;"
    "$B2cm $SP* $B2cm;"
    "$LB18NonBreaks $CM* $QUcm;"
    "$CM+ $QUcm;"
    "$QUcm .?;"
    "$QUcm $LB18NonBreaks $CM*;"
    "$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); "
    "$BBcm [^$CB];"
    "$BBcm $LB20NonBreaks $CM*;"
    "$HLcm ($HYcm | $BAcm) [^$CB]?;"
    "($ALcm | $HLcm) $INcm;"
    "$CM+ $INcm;"
    "$IDcm $INcm;"
    "$INcm $INcm;"
    "$NUcm $INcm;"
    "$IDcm $POcm;"
    "$ALcm $NUcm;"
    "$HLcm $NUcm;"
    "$CM+ $NUcm;"
    "$NUcm $ALcm;"
    "$NUcm $HLcm;"
    "$PRcm $IDcm;"
    "$PRcm ($ALcm | $HLcm);"
    "$POcm ($ALcm | $HLcm);"
    "($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;"
    "$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);"
    "($JVcm | $H2cm) ($JVcm | $JTcm);"
    "($JTcm | $H3cm) $JTcm;"
    "($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;"
    "($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;"
    "$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);"
    "($ALcm | $HLcm) ($ALcm | $HLcm);"
    "$CM+ ($ALcm | $HLcm);"
    "$IScm ($ALcm | $HLcm);"
    "($ALcm | $HLcm | $NUcm) $OPcm;"
    "$CM+ $OPcm;"
    "$CPcm ($ALcm | $HLcm | $NUcm);";
| 855 |
|
| 856 |
// The "!!reverse" section of the customized line break rule source, appended
// by mapLineIteratorModeToRules() right after uax14Forward.
static const char* uax14Reverse =
    "!!reverse;"
    // Combining marks followed by a single character of each class.
    "$CM+ $ALPlus;" "$CM+ $BA;" "$CM+ $BB;" "$CM+ $B2;"
    "$CM+ $CL;" "$CM+ $CP;" "$CM+ $EX;" "$CM+ $GL;"
    "$CM+ $HL;" "$CM+ $HY;" "$CM+ $H2;" "$CM+ $H3;"
    "$CM+ $ID;" "$CM+ $IN;" "$CM+ $IS;" "$CM+ $JL;"
    "$CM+ $JV;" "$CM+ $JT;" "$CM+ $NS;" "$CM+ $NU;"
    "$CM+ $OP;" "$CM+ $PO;" "$CM+ $PR;" "$CM+ $QU;"
    "$CM+ $SY;" "$CM+ $WJ;"
    "$CM+;"
    "$AL_FOLLOW $CM+ / ([$BK $CR $LF $NL $ZW {eof}] | $SP+ $CM+ $SP | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));"
    "[$PR] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];"
    "$LB4Breaks [$LB4NonBreaks-$CM];"
    "$LB4Breaks $CM+ $CAN_CM;"
    "$LF $CR;"
    "[$SP $ZW] [$LB4NonBreaks-$CM];"
    "[$SP $ZW] $CM+ $CAN_CM;"
    "$CM+ $CAN_CM;"
    "$CM* $WJ $CM* $CAN_CM;"
    "$CM* $WJ [$LB8NonBreaks-$CM];"
    "$CANT_CM $CM* $WJ;"
    "$CM* $CAN_CM $CM* $WJ;"
    "$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];"
    "$CANT_CM $CM* $GL;"
    "$CM* $CAN_CM $CM* $GL;"
    "$CL $CM+ $CAN_CM;" "$CP $CM+ $CAN_CM;" "$EX $CM+ $CAN_CM;" "$IS $CM+ $CAN_CM;" "$SY $CM+ $CAN_CM;"
    "$CL [$LB8NonBreaks-$CM];" "$CP [$LB8NonBreaks-$CM];" "$EX [$LB8NonBreaks-$CM];"
    "$IS [$LB8NonBreaks-$CM];" "$SY [$LB8NonBreaks-$CM];"
    "[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP; "
    "$CM* $CAN_CM $SP* $CM* $OP;"
    "$CANT_CM $SP* $CM* $OP;"
    "$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP;"
    "$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;"
    "$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;"
    "$SY $CM $SP+ $OP;"
    "$CM* $OP $SP* $CM* $QU;"
    "$CM* $NS $SP* $CM* ($CL | $CP);"
    "$CM* $B2 $SP* $CM* $B2;"
    "$CM* $QU $CM* $CAN_CM;"
    "$CM* $QU $LB18NonBreaks;"
    "$CM* $CAN_CM $CM* $QU;"
    "$CANT_CM $CM* $QU;"
    "$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];"
    "$CM* [$LB20NonBreaks-$CM] $CM* $BB;"
    "[^$CB] $CM* $BB;"
    "[^$CB] $CM* ($HY | $BA) $CM* $HL;"
    "$CM* $IN $CM* ($ALPlus | $HL);"
    "$CM* $IN $CM* $ID;"
    "$CM* $IN $CM* $IN;"
    "$CM* $IN $CM* $NU;"
    "$CM* $PO $CM* $ID;"
    "$CM* $NU $CM* ($ALPlus | $HL);"
    "$CM* ($ALPlus | $HL) $CM* $NU;"
    "$CM* $ID $CM* $PR;"
    "$CM* ($ALPlus | $HL) $CM* $PR;"
    "$CM* ($ALPlus | $HL) $CM* $PO;"
    "($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;"
    "$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;"
    "$CM* ($JT | $JV) $CM* ($H2 | $JV);"
    "$CM* $JT $CM* ($H3 | $JT);"
    "$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);"
    "$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);"
    "$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;"
    "$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);"
    "$CM* ($ALPlus | $HL) $CM* $IS;"
    "$CM* $OP $CM* ($ALPlus | $HL | $NU);"
    "$CM* ($ALPlus | $HL | $NU) $CM* $CP;";
| 949 |
|
| 950 |
// The "!!safe_forward" section of the customized line break rule source,
// appended by mapLineIteratorModeToRules() after uax14Reverse.
static const char* uax14SafeForward =
    "!!safe_forward;"
    "[$CM $OP $QU $CL $CP $B2 $PR $HY $BA $SP $dictionary]+ "
    "[^$CM $OP $QU $CL $CP $B2 $PR $HY $BA $dictionary];"
    "$dictionary $dictionary;";
| 954 |
|
| 955 |
// The "!!safe_reverse" section of the customized line break rule source; the
// last piece mapLineIteratorModeToRules() appends.
static const char* uax14SafeReverse =
    "!!safe_reverse;"
    "$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];"
    "$CM+ $SP / .;"
    // Spaces and combining marks followed by one of OP, QU, CL/CP or B2.
    "$SP+ $CM* $OP;" "$SP+ $CM* $QU;" "$SP+ $CM* ($CL | $CP);" "$SP+ $CM* $B2;"
    "$CM* ($HY | $BA) $CM* $HL;"
    "($CM* ($IS | $SY))+ $CM* $NU;"
    "($CL | $CP) $CM* ($NU | $IS | $SY);"
    "$dictionary $dictionary;";
| 967 |
|
| 968 |
// Assembles the complete line break rule source for |mode| and |isCJK| and
// stores it in |rules|.
//
// The pieces are concatenated in a fixed order: prologue, the $CJ definition,
// the mode-specific assignment table, the class definitions that depend on
// both, and then the forward, reverse, safe-forward and safe-reverse sections.
static void mapLineIteratorModeToRules(LineBreakIteratorMode mode, bool isCJK, String& rules)
{
    // Choose the table first. It stays 0 when |mode| matches none of the cases,
    // in which case no table is appended at all.
    const char* customAssignments = 0;
    switch (mode) {
    case LineBreakIteratorModeUAX14:
        customAssignments = isCJK ? uax14AssignmentsCustomDefaultCJK : uax14AssignmentsCustomDefaultNonCJK;
        break;
    case LineBreakIteratorModeUAX14Loose:
        customAssignments = isCJK ? uax14AssignmentsCustomLooseCJK : uax14AssignmentsCustomLooseNonCJK;
        break;
    case LineBreakIteratorModeUAX14Normal:
        customAssignments = isCJK ? uax14AssignmentsCustomNormalCJK : uax14AssignmentsCustomNormalNonCJK;
        break;
    case LineBreakIteratorModeUAX14Strict:
        customAssignments = isCJK ? uax14AssignmentsCustomStrictCJK : uax14AssignmentsCustomStrictNonCJK;
        break;
    }

    StringBuilder rulesBuilder;
    rulesBuilder.append(uax14Prologue);
    rulesBuilder.append(uax14AssignmentsBefore);
    if (customAssignments)
        rulesBuilder.append(customAssignments);
    rulesBuilder.append(uax14AssignmentsAfter);
    rulesBuilder.append(uax14Forward);
    rulesBuilder.append(uax14Reverse);
    rulesBuilder.append(uax14SafeForward);
    rulesBuilder.append(uax14SafeReverse);
    rules = rulesBuilder.toString();
}
| 994 |
|
| 513 |
} |
995 |
} |