tdf#49885 BreakIterator rule upgrades
This change re-bases the BreakIterator rule customizations on top of a clean copy of the ICU 74.2 rules. Change-Id: Iadcf16cab138cc6c869fac61ad64e996e65b5ae4 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166273 Tested-by: Jenkins Tested-by: Caolán McNamara <caolan.mcnamara@collabora.com> Reviewed-by: Caolán McNamara <caolan.mcnamara@collabora.com>
This commit is contained in:
parent
3956472eb2
commit
44699b3de3
12 changed files with 1342 additions and 1760 deletions
|
@ -16,16 +16,12 @@ $(call gb_CustomTarget_get_target,i18npool/breakiterator) : \
|
|||
|
||||
i18npool_BRKTXTS := \
|
||||
count_word.brk \
|
||||
$(call gb_Helper_optional_locale,he,dict_word_he.brk) \
|
||||
$(call gb_Helper_optional_locale,hu,dict_word_hu.brk) \
|
||||
dict_word_nodash.brk \
|
||||
dict_word_prepostdash.brk \
|
||||
dict_word.brk \
|
||||
$(call gb_Helper_optional_locale,he,edit_word_he.brk) \
|
||||
$(call gb_Helper_optional_locale,hu,edit_word_hu.brk) \
|
||||
edit_word.brk \
|
||||
line.brk \
|
||||
sent.brk
|
||||
line.brk
|
||||
|
||||
# 'gencmn', 'genbrk' and 'genccode' are tools generated and delivered by icu project to process icu breakiterator rules.
|
||||
# The output of gencmn generates warnings under Windows. We want to minimize the patches to external tools,
|
||||
|
|
|
@ -184,11 +184,10 @@ void TestBreakIterator::testLineBreaking()
|
|||
|
||||
{
|
||||
// Per the bug, the line break should leave -bar clumped together on the next line.
|
||||
// However, this change was reverted at some point. This test asserts the new behavior.
|
||||
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
|
||||
u"foo -bar"_ustr, strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash",
|
||||
static_cast<sal_Int32>(5), aResult.breakIndex);
|
||||
static_cast<sal_Int32>(4), aResult.breakIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -198,11 +197,29 @@ void TestBreakIterator::testLineBreaking()
|
|||
aLocale.Country = "US";
|
||||
|
||||
{
|
||||
// Here we want the line break to leave C:\Program Files\ on the first line
|
||||
// Note that the current behavior deviates from the original fix for this bug.
|
||||
//
|
||||
// The original report was filed due to wrapping all of "\Program Files\aaaa" to the
|
||||
// next line, even though only "aaaa" overflowed. The original fix was to simply make
|
||||
// U+005C reverse solidus (backslash) a breaking character.
|
||||
//
|
||||
// However, the root cause for this bug was not the behavior of '\', but rather some
|
||||
// other bug making all of "\Program Files\" behave like a single token, despite it
|
||||
// even containing whitespace.
|
||||
//
|
||||
// Reverting to the ICU line rules fixes this root issue. Now, in the following,
|
||||
// "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also
|
||||
// consistent with the behavior of other office programs.
|
||||
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
|
||||
u"C:\\Program Files\\LibreOffice"_ustr, strlen("C:\\Program Files\\Libre"), aLocale, 0,
|
||||
aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
|
||||
|
||||
// An identical result should be generated for solidus.
|
||||
aResult = m_xBreak->getLineBreak(
|
||||
"C:/Program Files/LibreOffice", strlen("C:/Program Files/Libre"), aLocale, 0,
|
||||
aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -251,23 +268,125 @@ void TestBreakIterator::testLineBreaking()
|
|||
aLocale.Country = "US";
|
||||
|
||||
{
|
||||
// The root cause for this bug was the Unicode standard introducing special treatment
|
||||
// for '-' in a number range context. This change makes number ranges (e.g. "100-199")
|
||||
// behave as if they are single tokens for the purposes of line breaking. Unfortunately,
|
||||
// this caused a significant appearance change to existing documents.
|
||||
//
|
||||
// Despite being a user-visible layout change, this isn't exactly a bug. Wrapping
|
||||
// number ranges as a single token is consistent with other applications, including web
|
||||
// browsers, and other office suites as mentioned in the bug discussion. Removing this
|
||||
// customization seems like it would be a major change, however.
|
||||
//
|
||||
// Here we want the line break to leave 100- clumped on the first line.
|
||||
|
||||
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
|
||||
u"word 100-199 word"_ustr, strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// i#83649: Line break should be between typographical quote and left bracket
|
||||
{
|
||||
{
|
||||
// From the same bug: "the leading minus must stay with numbers and strings"
|
||||
|
||||
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
|
||||
"range of -100.000 to 100.000", strlen("range of -1"), aLocale, 0,
|
||||
aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
|
||||
|
||||
constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr;
|
||||
aResult = m_xBreak->getLineBreak(
|
||||
str, strlen("range of -"), aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
|
||||
}
|
||||
|
||||
aLocale.Language = "de";
|
||||
aLocale.Country = "DE";
|
||||
|
||||
{
|
||||
// Here we want the line break to leave »angetan werden« on the first line
|
||||
// From the same bug: "the leading minus must stay with numbers and strings"
|
||||
|
||||
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
|
||||
"EURO is -10,50", strlen("EURO is -1"), aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
|
||||
|
||||
// Also the mathematical minus sign:
|
||||
|
||||
constexpr OUString str = u"EURO is \u221210,50"_ustr;
|
||||
aResult = m_xBreak->getLineBreak(
|
||||
str, strlen("EURO is -"), aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
|
||||
}
|
||||
|
||||
{
|
||||
// From the same bug: "the leading minus must stay with numbers and strings"
|
||||
|
||||
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
|
||||
"und -kosten", strlen("und -ko"), aLocale, 0,
|
||||
aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex);
|
||||
|
||||
// But not the non-breaking hyphen:
|
||||
|
||||
constexpr OUString str = u"und \u2011"_ustr;
|
||||
aResult = m_xBreak->getLineBreak(
|
||||
str, strlen("und -ko"), aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// i#83649: "Line break should be between typographical quote and left bracket"
|
||||
// - Actually: Spaces between quotation mark and opening punctuation not treated as a break.
|
||||
// - Note that per the Unicode standard, prohibiting breaks in this context is intentional
|
||||
// because it may cause issues in certain languages due to the various ways quotation
|
||||
// characters are used.
|
||||
// - We do it anyway by customizing the ICU line breaking rules.
|
||||
{
|
||||
{
|
||||
// This uses the sample text provided in the bug report. Based on usage, it is assumed
|
||||
// they were in the de_DE locale.
|
||||
|
||||
aLocale.Language = "de";
|
||||
aLocale.Country = "DE";
|
||||
|
||||
// Per the bug report, it is expected that »angetan werden« remains on the first line.
|
||||
const OUString str = u"»angetan werden« [Passiv]"_ustr;
|
||||
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
|
||||
str, strlen("Xangetan werdenX ["), aLocale, 0, aHyphOptions, aUserOptions);
|
||||
str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
|
||||
|
||||
// The same result should be returned for this and the first case.
|
||||
const OUString str2 = u"»angetan werden« Passiv"_ustr;
|
||||
aResult = m_xBreak->getLineBreak(
|
||||
str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
|
||||
|
||||
// Under ICU rules, no amount of spaces would cause this to wrap.
|
||||
const OUString str3 = u"»angetan werden« [Passiv]"_ustr;
|
||||
aResult = m_xBreak->getLineBreak(
|
||||
str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), aResult.breakIndex);
|
||||
|
||||
// However, a tab between the quotation mark and the bracket does allow a break.
|
||||
const OUString str4 = u"»angetan werden«\t[Passiv]"_ustr;
|
||||
aResult = m_xBreak->getLineBreak(
|
||||
str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
|
||||
}
|
||||
|
||||
{
|
||||
// The same behavior is seen in English
|
||||
|
||||
aLocale.Language = "en";
|
||||
aLocale.Country = "US";
|
||||
|
||||
const OUString str = u"\"angetan werden\" [Passiv]"_ustr;
|
||||
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
|
||||
str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
|
||||
|
||||
const OUString str2 = u"\"angetan werden\" Passiv"_ustr;
|
||||
aResult = m_xBreak->getLineBreak(
|
||||
str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
|
||||
}
|
||||
}
|
||||
|
@ -355,7 +474,7 @@ void TestBreakIterator::testLineBreaking()
|
|||
auto res = m_xBreak->getLineBreak(u"Wort -prinzessinnen, wort"_ustr,
|
||||
strlen("Wort -prinzessinnen,"), aLocale, 0,
|
||||
aHyphOptions, aUserOptions);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32{ 6 }, res.breakIndex);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -638,7 +757,8 @@ void TestBreakIterator::testWordBoundaries()
|
|||
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
|
||||
}
|
||||
|
||||
//See https://bz.apache.org/ooo/show_bug.cgi?id=85411
|
||||
// i#85411: ZWSP should be a word separator for spellchecking
|
||||
// - This fix was applied to both dict and edit customizations
|
||||
for (int j = 0; j < 3; ++j)
|
||||
{
|
||||
switch (j)
|
||||
|
@ -660,21 +780,23 @@ void TestBreakIterator::testWordBoundaries()
|
|||
break;
|
||||
}
|
||||
|
||||
static constexpr OUString aTest =
|
||||
u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
|
||||
static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
|
||||
|
||||
sal_Int32 nPos = 0;
|
||||
sal_Int32 aExpected[] = {1, 6, 9, 12};
|
||||
sal_Int32 aExpected[] = { 1, 6, 9, 12 };
|
||||
size_t i = 0;
|
||||
do
|
||||
{
|
||||
CPPUNIT_ASSERT(i < std::size(aExpected));
|
||||
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
||||
i18n::WordType::DICTIONARY_WORD, true).endPos;
|
||||
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
||||
auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
||||
i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos);
|
||||
auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos);
|
||||
nPos = dwPos.endPos;
|
||||
++i;
|
||||
}
|
||||
while (nPos++ < aTest.getLength());
|
||||
} while (nPos++ < aTest.getLength());
|
||||
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
|
||||
}
|
||||
|
||||
|
@ -814,121 +936,45 @@ void TestBreakIterator::testWordBoundaries()
|
|||
}
|
||||
|
||||
// i#56347: "BreakIterator patch for Hungarian"
|
||||
// i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian)
|
||||
// Rules for Hungarian affixes after numbers and certain symbols
|
||||
{
|
||||
auto mode = i18n::WordType::DICTIONARY_WORD;
|
||||
aLocale.Language = "hu";
|
||||
aLocale.Country = "HU";
|
||||
|
||||
OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
|
||||
for (auto mode :
|
||||
{ i18n::WordType::DICTIONARY_WORD, i18n::WordType::ANYWORD_IGNOREWHITESPACES })
|
||||
{
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
|
||||
}
|
||||
|
||||
// i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian)
|
||||
// Rules for Hungarian affixes after numbers and certain symbols in edit mode.
|
||||
// The patch was merged, but the original bug was never closed and the current behavior seems
|
||||
// identical to the ICU default behavior. Added this test to ensure that doesn't change.
|
||||
{
|
||||
auto mode = i18n::WordType::ANY_WORD;
|
||||
aLocale.Language = "hu";
|
||||
aLocale.Country = "HU";
|
||||
|
||||
OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 17, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 19, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 20, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(20), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 21, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(21), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 24, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 26, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(26), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(27), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 30, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 31, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
|
||||
}
|
||||
}
|
||||
|
||||
// tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based analysis
|
||||
|
@ -983,6 +1029,56 @@ void TestBreakIterator::testSentenceBoundaries()
|
|||
CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale));
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale));
|
||||
}
|
||||
|
||||
// i#55063: Sentence selection in Thai should select a space-delimited phrase.
|
||||
// - This customization broke at some point. It works in an English locale in a synthetic test
|
||||
// like this one, but does not work in the Thai locale, nor on Thai text in practice.
|
||||
{
|
||||
static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr;
|
||||
|
||||
aLocale.Language = "en";
|
||||
aLocale.Country = "US";
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
|
||||
|
||||
aLocale.Language = "th";
|
||||
aLocale.Country = "TH";
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
|
||||
}
|
||||
|
||||
// i#55063: Thai phrases should delimit English sentence selection.
|
||||
// - This customization broke at some point. It works in an English locale in a synthetic test
|
||||
// like this one, but does not work in the Thai locale, nor on Thai text in practice.
|
||||
{
|
||||
static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr;
|
||||
|
||||
aLocale.Language = "en";
|
||||
aLocale.Country = "US";
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
|
||||
|
||||
aLocale.Language = "th";
|
||||
aLocale.Country = "TH";
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
|
||||
}
|
||||
|
||||
// i#55063: Characteristic test for English text delimiting Thai phrases (sentences)
|
||||
// - English text should not delimit Thai phrases.
|
||||
{
|
||||
static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr;
|
||||
|
||||
aLocale.Language = "en";
|
||||
aLocale.Country = "US";
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
|
||||
|
||||
aLocale.Language = "th";
|
||||
aLocale.Country = "TH";
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
|
||||
}
|
||||
}
|
||||
|
||||
//See https://bugs.libreoffice.org/show_bug.cgi?id=40292
|
||||
|
@ -1559,6 +1655,7 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
|
|||
aLocale.Language = "he";
|
||||
aLocale.Country = "IL";
|
||||
|
||||
// i#51661: Add quotation mark as middle letter for Hebrew
|
||||
{
|
||||
auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
|
||||
|
||||
|
@ -1572,6 +1669,21 @@ void TestBreakIterator::testLegacyHebrewQuoteInsideWord()
|
|||
CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
|
||||
}
|
||||
|
||||
// i#51661: Add quotation mark as middle letter for Hebrew
|
||||
{
|
||||
auto aTest = u"פַּרְדּ״ס פַּרְדּ\"ס"_ustr;
|
||||
|
||||
i18n::Boundary aBounds = m_xBreak->getWordBoundary(
|
||||
aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aTest, 13, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBreakIterator::testLegacySurrogatePairs()
|
||||
|
|
|
@ -1,148 +1,199 @@
|
|||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: dict_word.txt
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
##############################################################################
|
||||
|
||||
$Ideographic = [:Ideographic:];
|
||||
$Hangul = [:Script = HANGUL:];
|
||||
### BEGIN CUSTOMIZATION
|
||||
### This file contains LibreOffice-specific rule customizations.
|
||||
###
|
||||
### To aid future maintainability:
|
||||
### - The change location should be bracketed by comments of this form.
|
||||
### - The original rule should be commented out, and the modified rule placed alongside.
|
||||
### - By doing this, maintainers can more easily compare to an upstream baseline.
|
||||
###
|
||||
### END CUSTOMIZATION
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- $Ideographic
|
||||
- $Katakana
|
||||
- $Hangul
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:]
|
||||
[:name = HYPHEN-MINUS:] ];
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$SufixLetter = [:name= FULL STOP:];
|
||||
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
|
||||
[:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
|
||||
[:name = PRIME:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
|
||||
$TheZWSP = \u200b;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:] - $TheZWSP];
|
||||
|
||||
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
$Han = [:Han:];
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$SufixLetterEx= $SufixLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$IdeographicEx= $Ideographic $Extend*;
|
||||
$HangulEx = $Hangul $Extend*;
|
||||
$FormatEx = $Format $Extend*;
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}-$Han];
|
||||
$ZWJ = [\p{Word_Break = ZWJ}];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [\p{Extended_Pictographic}];
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### Unknown issue number: Dictionary words can contain hyphens
|
||||
### tdf#49885: Sync custom BreakIterator rules with ICU originals
|
||||
### - ICU is now more permissive about punctuation inside words.
|
||||
### - For compatibility, exclude certain characters that were previously excluded.
|
||||
|
||||
$IncludedML = [:name = HYPHEN-MINUS:];
|
||||
$ExcludedML = [[:name = COLON:]
|
||||
[:name = GREEK ANO TELEIA:]
|
||||
[:name = PRESENTATION FORM FOR VERTICAL COLON:]
|
||||
[:name = SMALL COLON:]
|
||||
[:name = FULLWIDTH COLON:]];
|
||||
|
||||
# $MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNum punctuation.
|
||||
#
|
||||
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
|
||||
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
[[:P:][:S:]]*;
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
|
||||
$HangulEx ($FormatEx* $HangulEx)* {400};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c Do not break within emoji zwj sequences.
|
||||
# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
$ZWJ $Extended_Pict;
|
||||
|
||||
# Rule 3d - Keep horizontal whitespace together.
|
||||
#
|
||||
$WSegSpace $WSegSpace;
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text.
|
||||
|
||||
$ExFm = [$Extend $Format $ZWJ];
|
||||
|
||||
^$ExFm+; # This rule fires only when there are format or extend characters at the
|
||||
# start of text, or immediately following another boundary. It groups them, in
|
||||
# the event there are more than one.
|
||||
|
||||
[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words,
|
||||
# with no special rule status value.
|
||||
|
||||
$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
|
||||
$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
|
||||
$HangulSyllable {200};
|
||||
$Hebrew_Letter $ExFm* {200};
|
||||
$Katakana $ExFm* {400}; # note: these status values override those from rule 5
|
||||
$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
|
||||
$Ideographic $ExFm* {400}; #
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
#
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
# rule 6 and 7
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
|
||||
|
||||
# rule 7a
|
||||
$Hebrew_Letter $ExFm* $Single_Quote {200};
|
||||
|
||||
# rule 7b and 7c
|
||||
$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
|
||||
|
||||
# rule 8
|
||||
|
||||
$Numeric $ExFm* $Numeric;
|
||||
|
||||
# rule 9
|
||||
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
|
||||
|
||||
# rule 10
|
||||
|
||||
$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
|
||||
|
||||
# rule 13
|
||||
# to be consistent with $KanaKanji $KanaKanji, changed
|
||||
# from 300 to 400.
|
||||
# See also TestRuleStatus in intltest/rbbiapts.cpp
|
||||
$Katakana $ExFm* $Katakana {400};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
|
||||
$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
|
||||
$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
|
||||
$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
|
||||
|
||||
# rules 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
^$Regional_Indicator $ExFm* $Regional_Indicator;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
|
|
@ -1,139 +0,0 @@
|
|||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: dict_word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
#
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- $Katakana
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]];
|
||||
|
||||
$SufixLetter = [:name= FULL STOP:];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
|
||||
[:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
|
||||
[:name = PRIME:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
|
||||
$TheZWSP = \u200b;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:] - $TheZWSP];
|
||||
|
||||
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$SufixLetterEx= $SufixLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$FormatEx = $Format $Extend*;
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNum punctuation.
|
||||
#
|
||||
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
|
||||
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
|
||||
|
||||
[[:P:][:S:]]*;
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
# [:IDEOGRAPHIC:] $Extend* {400};
|
||||
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
#
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
|
|
@ -1,176 +1,222 @@
|
|||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: dict_word.txt
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
##############################################################################
|
||||
|
||||
$Ideographic = [:Ideographic:];
|
||||
$Hangul = [:Script = HANGUL:];
|
||||
### BEGIN CUSTOMIZATION
|
||||
### This file contains LibreOffice-specific rule customizations.
|
||||
###
|
||||
### To aid future maintainability:
|
||||
### - The change location should be bracketed by comments of this form.
|
||||
### - The original rule should be commented out, and the modified rule placed alongside.
|
||||
### - By doing this, maintainers can more easily compare to an upstream baseline.
|
||||
###
|
||||
### END CUSTOMIZATION
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
# Fix spelling of a)-ban, b)-ben, when the letter is a reference
|
||||
# resulting bad word breaking "ban" and "ben"
|
||||
# (reference fields are not expanded in spell checking, yet, only
|
||||
# for grammar checking).
|
||||
|
||||
$PrefixLetter = [[:name = RIGHT PARENTHESIS:]];
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
[:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
|
||||
[:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:]
|
||||
[:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
|
||||
[:name = DIGIT ZERO:]
|
||||
[:name = DIGIT ONE:]
|
||||
[:name = DIGIT TWO:]
|
||||
[:name = DIGIT THREE:]
|
||||
[:name = DIGIT FOUR:]
|
||||
[:name = DIGIT FIVE:]
|
||||
[:name = DIGIT SIX:]
|
||||
[:name = DIGIT SEVEN:]
|
||||
[:name = DIGIT EIGHT:]
|
||||
[:name = DIGIT NINE:]
|
||||
- $Ideographic
|
||||
- $Katakana
|
||||
- $Hangul
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] [:name = HYPHEN-MINUS:]
|
||||
[:name = EURO SIGN:] [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
|
||||
[:name = EN DASH:] [:name = EM DASH:]
|
||||
[:name = RIGHT DOUBLE QUOTATION MARK:]
|
||||
[:name = LEFT PARENTHESIS:]
|
||||
[:name = RIGHT PARENTHESIS:]
|
||||
[:name = RIGHT SQUARE BRACKET:]
|
||||
[:name = EXCLAMATION MARK:]
|
||||
[:name = QUESTION MARK:]
|
||||
[:name = FULL STOP:] [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]];
|
||||
|
||||
$SufixLetter = [:name= FULL STOP:];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
|
||||
[:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
|
||||
[:name = PRIME:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
|
||||
$TheZWSP = \u200b;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:] - $TheZWSP];
|
||||
|
||||
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
$Han = [:Han:];
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$SufixLetterEx= $SufixLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$IdeographicEx= $Ideographic $Extend*;
|
||||
$HangulEx = $Hangul $Extend*;
|
||||
$FormatEx = $Format $Extend*;
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}-$Han];
|
||||
$ZWJ = [\p{Word_Break = ZWJ}];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [\p{Extended_Pictographic}];
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### Unknown issue number: Dictionary words can contain hyphens
|
||||
### tdf#49885: Sync custom BreakIterator rules with ICU originals
|
||||
### - ICU is now more permissive about punctuation inside words.
|
||||
### - For compatibility, exclude certain characters that were previously excluded.
|
||||
### tdf#116072: Extend MidLetter in Hungarian word breaking
|
||||
### i#56347: BreakIterator patch for Hungarian
|
||||
### i#56348: Special chars in first pos not handled by spell checking for Hungarian
|
||||
|
||||
$Symbols_hu = [[:name = PERCENT SIGN:]
|
||||
[:name = PER MILLE SIGN:]
|
||||
[:name = PER TEN THOUSAND SIGN:]
|
||||
[:name = SECTION SIGN:]
|
||||
[:name = DEGREE SIGN:]
|
||||
[:name = EURO SIGN:]
|
||||
[:name = HYPHEN-MINUS:]
|
||||
[:name = EN DASH:]
|
||||
[:name = EM DASH:]];
|
||||
|
||||
#$ALetter = [\p{Word_Break = ALetter}];
|
||||
$ALetter = [\p{Word_Break = ALetter} $Symbols_hu];
|
||||
|
||||
$IncludedML = [:name = HYPHEN-MINUS:];
|
||||
$ExcludedML = [[:name = COLON:]
|
||||
[:name = GREEK ANO TELEIA:]
|
||||
[:name = PRESENTATION FORM FOR VERTICAL COLON:]
|
||||
[:name = SMALL COLON:]
|
||||
[:name = FULLWIDTH COLON:]];
|
||||
|
||||
$IncludedML_hu = [[:name = RIGHT DOUBLE QUOTATION MARK:]
|
||||
[:name = LEFT PARENTHESIS:]
|
||||
[:name = RIGHT PARENTHESIS:]
|
||||
[:name = RIGHT SQUARE BRACKET:]
|
||||
[:name = EXCLAMATION MARK:]
|
||||
[:name = QUESTION MARK:]
|
||||
$Symbols_hu];
|
||||
|
||||
# $MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML $IncludedML_hu];
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNum punctuation.
|
||||
#
|
||||
$LetterSequence = $PrefixLetter? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
|
||||
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
[[:P:][:S:]]*;
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
|
||||
$HangulEx ($FormatEx* $HangulEx)* {400};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c Do not break within emoji zwj sequences.
|
||||
# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
$ZWJ $Extended_Pict;
|
||||
|
||||
# Rule 3d - Keep horizontal whitespace together.
|
||||
#
|
||||
$WSegSpace $WSegSpace;
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text.
|
||||
|
||||
$ExFm = [$Extend $Format $ZWJ];
|
||||
|
||||
^$ExFm+; # This rule fires only when there are format or extend characters at the
|
||||
# start of text, or immediately following another boundary. It groups them, in
|
||||
# the event there are more than one.
|
||||
|
||||
[^$CR $LF $Newline $ExFm] $ExFm*; # This rule rule attaches trailing format/extends to words,
|
||||
# with no special rule status value.
|
||||
|
||||
$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
|
||||
$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
|
||||
$HangulSyllable {200};
|
||||
$Hebrew_Letter $ExFm* {200};
|
||||
$Katakana $ExFm* {400}; # note: these status values override those from rule 5
|
||||
$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
|
||||
$Ideographic $ExFm* {400}; #
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
#
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
# rule 6 and 7
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
|
||||
|
||||
# rule 7a
|
||||
$Hebrew_Letter $ExFm* $Single_Quote {200};
|
||||
|
||||
# rule 7b and 7c
|
||||
$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
|
||||
|
||||
# rule 8
|
||||
|
||||
$Numeric $ExFm* $Numeric;
|
||||
|
||||
# rule 9
|
||||
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
|
||||
|
||||
# rule 10
|
||||
|
||||
$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
|
||||
|
||||
# rule 13
|
||||
# to be consistent with $KanaKanji $KanaKanji, changed
|
||||
# from 300 to 400.
|
||||
# See also TestRuleStatus in intltest/rbbiapts.cpp
|
||||
$Katakana $ExFm* $Katakana {400};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
|
||||
$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
|
||||
$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
|
||||
$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
|
||||
|
||||
# rules 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
^$Regional_Indicator $ExFm* $Regional_Indicator;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
|
|
@ -1,147 +0,0 @@
|
|||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: dict_word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
#
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
|
||||
$Ideographic = [:Ideographic:];
|
||||
$Hangul = [:Script = HANGUL:];
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- $Ideographic
|
||||
- $Katakana
|
||||
- $Hangul
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:] ];
|
||||
|
||||
$SufixLetter = [:name= FULL STOP:];
|
||||
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
|
||||
[:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
|
||||
[:name = PRIME:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
|
||||
$TheZWSP = \u200b;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:] - $TheZWSP];
|
||||
|
||||
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$SufixLetterEx= $SufixLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$IdeographicEx= $Ideographic $Extend*;
|
||||
$HangulEx = $Hangul $Extend*;
|
||||
$FormatEx = $Format $Extend*;
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNum punctuation.
|
||||
#
|
||||
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
|
||||
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
|
||||
|
||||
[[:P:][:S:]]*;
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
|
||||
$HangulEx ($FormatEx* $HangulEx)* {400};
|
||||
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
#
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
|
|
@ -1,157 +1,221 @@
|
|||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: dict_word.txt
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
##############################################################################
|
||||
|
||||
$Ideographic = [:Ideographic:];
|
||||
$Hangul = [:Script = HANGUL:];
|
||||
### BEGIN CUSTOMIZATION
|
||||
### This file contains LibreOffice-specific rule customizations.
|
||||
###
|
||||
### To aid future maintainability:
|
||||
### - The change location should be bracketed by comments of this form.
|
||||
### - The original rule should be commented out, and the modified rule placed alongside.
|
||||
### - By doing this, maintainers can more easily compare to an upstream baseline.
|
||||
###
|
||||
### END CUSTOMIZATION
|
||||
|
||||
# list of dashes or hyphens that should be accepted as part of the word if a single one of these
|
||||
# pre- or postfixes a word. E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to
|
||||
# be part of the word in order to have it properly spell checked etc.
|
||||
$PrePostDashHyphen = [ [:name = HYPHEN-MINUS:] ];
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- $Ideographic
|
||||
- $Katakana
|
||||
- $Hangul
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:]
|
||||
[:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:]
|
||||
[:name = HYPHEN-MINUS:] ];
|
||||
|
||||
$SufixLetter = [:name= FULL STOP:];
|
||||
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
|
||||
[:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
|
||||
[:name = PRIME:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
|
||||
$TheZWSP = \u200b;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:] - $TheZWSP];
|
||||
|
||||
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
$Han = [:Han:];
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$SufixLetterEx= $SufixLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$IdeographicEx= $Ideographic $Extend*;
|
||||
$HangulEx = $Hangul $Extend*;
|
||||
$FormatEx = $Format $Extend*;
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}-$Han];
|
||||
$ZWJ = [\p{Word_Break = ZWJ}];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [\p{Extended_Pictographic}];
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### Unknown issue number: Dictionary words can contain hyphens
|
||||
### tdf#49885: Sync custom BreakIterator rules with ICU originals
|
||||
### - ICU is now more permissive about punctuation inside words.
|
||||
### - For compatibility, exclude certain characters that were previously excluded.
|
||||
|
||||
$IncludedML = [:name = HYPHEN-MINUS:];
|
||||
$ExcludedML = [[:name = COLON:]
|
||||
[:name = GREEK ANO TELEIA:]
|
||||
[:name = PRESENTATION FORM FOR VERTICAL COLON:]
|
||||
[:name = SMALL COLON:]
|
||||
[:name = FULLWIDTH COLON:]];
|
||||
|
||||
# $MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidLetter = [[\p{Word_Break = MidLetter}]-$ExcludedML $IncludedML];
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### Unknown issue number: Allow leading and trailing hyphens in certain languages
|
||||
### This part of the customization does not replace any rules.
|
||||
|
||||
$PrePostHyphen = [:name = HYPHEN-MINUS:];
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNumber punctuation.
|
||||
#
|
||||
# At most one leading or trailing dash/hyphen should be accepted as well.
|
||||
# E.g. in German: "Arbeits-" or "-nehmer" where that hyphen needs to
|
||||
# be part of the word in order to have it properly spell checked etc.
|
||||
$LetterSequence = $PrePostDashHyphen? $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)* $PrePostDashHyphen?; # rules #6, #7
|
||||
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
[[:P:][:S:]]*;
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
|
||||
$HangulEx ($FormatEx* $HangulEx)* {400};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c Do not break within emoji zwj sequences.
|
||||
# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
$ZWJ $Extended_Pict;
|
||||
|
||||
# Rule 3d - Keep horizontal whitespace together.
|
||||
#
|
||||
$WSegSpace $WSegSpace;
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text.
|
||||
|
||||
$ExFm = [$Extend $Format $ZWJ];
|
||||
|
||||
^$ExFm+; # This rule fires only when there are format or extend characters at the
|
||||
# start of text, or immediately following another boundary. It groups them, in
|
||||
# the event there are more than one.
|
||||
|
||||
[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words,
|
||||
# with no special rule status value.
|
||||
|
||||
$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
|
||||
$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
|
||||
$HangulSyllable {200};
|
||||
$Hebrew_Letter $ExFm* {200};
|
||||
$Katakana $ExFm* {400}; # note: these status values override those from rule 5
|
||||
$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
|
||||
$Ideographic $ExFm* {400}; #
|
||||
|
||||
#
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
#
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
### BEGIN CUSTOMIZATION
|
||||
### Unknown issue number: Allow leading and trailing hyphens in certain languages
|
||||
|
||||
# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
($PrePostHyphen) ? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)?;
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
# rule 6 and 7
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### Unknown issue number: Allow leading and trailing hyphens in certain languages
|
||||
|
||||
# ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
|
||||
($PrePostHyphen)? ($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) ($PrePostHyphen)? {200};
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
# rule 7a
|
||||
$Hebrew_Letter $ExFm* $Single_Quote {200};
|
||||
|
||||
# rule 7b and 7c
|
||||
$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
|
||||
|
||||
# rule 8
|
||||
|
||||
$Numeric $ExFm* $Numeric;
|
||||
|
||||
# rule 9
|
||||
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
|
||||
|
||||
# rule 10
|
||||
|
||||
$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
|
||||
|
||||
# rule 13
|
||||
# to be consistent with $KanaKanji $KanaKanji, changed
|
||||
# from 300 to 400.
|
||||
# See also TestRuleStatus in intltest/rbbiapts.cpp
|
||||
$Katakana $ExFm* $Katakana {400};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
|
||||
$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
|
||||
$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
|
||||
$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
|
||||
|
||||
# rules 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];
|
||||
^$Regional_Indicator $ExFm* $Regional_Indicator;
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
|
|
@ -1,142 +1,199 @@
|
|||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: edit_word.txt
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
##############################################################################
|
||||
|
||||
$Ideographic = [:Ideographic:];
|
||||
$Hangul = [:Script = HANGUL:];
|
||||
### BEGIN CUSTOMIZATION
|
||||
### This file contains LibreOffice-specific rule customizations.
|
||||
###
|
||||
### To aid future maintainability:
|
||||
### - The change location should be bracketed by comments of this form.
|
||||
### - The original rule should be commented out, and the modified rule placed alongside.
|
||||
### - By doing this, maintainers can more easily compare to an upstream baseline.
|
||||
###
|
||||
### END CUSTOMIZATION
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- $Ideographic
|
||||
- $Katakana
|
||||
- $Hangul
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
|
||||
$TheZWSP = \u200b;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:] - $TheZWSP];
|
||||
|
||||
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
$Han = [:Han:];
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$IdeographicEx= $Ideographic $Extend*;
|
||||
$HangulEx = $Hangul $Extend*;
|
||||
$FormatEx = $Format $Extend*;
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}-$Han];
|
||||
$ZWJ = [\p{Word_Break = ZWJ}];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [\p{Extended_Pictographic}];
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
|
||||
### This change subtracts undesired characters from the above families
|
||||
|
||||
# $MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
|
||||
|
||||
# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]];
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNumber punctuation.
|
||||
#
|
||||
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
|
||||
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
# Punctuations by themselves
|
||||
[[:P:][:S:]-[:name = FULL STOP:]]*;
|
||||
[[:name = FULL STOP:]]*;
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
|
||||
$HangulEx ($FormatEx* $HangulEx)* {400};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c Do not break within emoji zwj sequences.
|
||||
# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
$ZWJ $Extended_Pict;
|
||||
|
||||
# Rule 3d - Keep horizontal whitespace together.
|
||||
#
|
||||
$WSegSpace $WSegSpace;
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text.
|
||||
|
||||
$ExFm = [$Extend $Format $ZWJ];
|
||||
|
||||
^$ExFm+; # This rule fires only when there are format or extend characters at the
|
||||
# start of text, or immediately following another boundary. It groups them, in
|
||||
# the event there are more than one.
|
||||
|
||||
[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words,
|
||||
# with no special rule status value.
|
||||
|
||||
$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
|
||||
$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
|
||||
$HangulSyllable {200};
|
||||
$Hebrew_Letter $ExFm* {200};
|
||||
$Katakana $ExFm* {400}; # note: these status values override those from rule 5
|
||||
$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
|
||||
$Ideographic $ExFm* {400}; #
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format];
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
#
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
# rule 6 and 7
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
|
||||
|
||||
# rule 7a
|
||||
$Hebrew_Letter $ExFm* $Single_Quote {200};
|
||||
|
||||
# rule 7b and 7c
|
||||
$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
|
||||
|
||||
# rule 8
|
||||
|
||||
$Numeric $ExFm* $Numeric;
|
||||
|
||||
# rule 9
|
||||
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
|
||||
|
||||
# rule 10
|
||||
|
||||
$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
|
||||
|
||||
# rule 13
|
||||
# to be consistent with $KanaKanji $KanaKanji, changed
|
||||
# from 300 to 400.
|
||||
# See also TestRuleStatus in intltest/rbbiapts.cpp
|
||||
$Katakana $ExFm* $Katakana {400};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
|
||||
$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
|
||||
$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
|
||||
$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
|
||||
|
||||
# rules 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
^$Regional_Indicator $ExFm* $Regional_Indicator;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
|
||||
### This customization does not replace any rules.
|
||||
# Standalone punctuation as a word: a run of punctuation/symbol characters other
# than FULL STOP, and a run of FULL STOP characters.
# NOTE(review): the first set expression below has no terminating ';', so the two
# lines are read as ONE rule (a non-full-stop punctuation run immediately followed
# by a full-stop run). The pre-upgrade rules listed these as two separate
# ';'-terminated rules - confirm that merging them is intended.
[[:P:][:S:]-[:name = FULL STOP:]]*
|
||||
[[:name = FULL STOP:]]*;
|
||||
### END CUSTOMIZATION
|
||||
|
||||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
|
|
@ -1,142 +0,0 @@
|
|||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: edit_word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
#
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
|
||||
$Ideographic = [:Ideographic:];
|
||||
$Hangul = [:Script = HANGUL:];
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- $Ideographic
|
||||
- $Katakana
|
||||
- $Hangul
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = QUOTATION MARK:] [:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
|
||||
$TheZWSP = \u200b;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:] - $TheZWSP];
|
||||
|
||||
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$IdeographicEx= $Ideographic $Extend*;
|
||||
$HangulEx = $Hangul $Extend*;
|
||||
$FormatEx = $Format $Extend*;
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNumber punctuation.
|
||||
#
|
||||
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
|
||||
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};
|
||||
|
||||
# Punctuations by themselves
|
||||
[[:P:][:S:]-[:name = FULL STOP:]]*;
|
||||
[[:name = FULL STOP:]]*;
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
|
||||
$HangulEx ($FormatEx* $HangulEx)* {400};
|
||||
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
#
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format];
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
|
|
@ -1,159 +1,215 @@
|
|||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: edit_word.txt
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on Version 4.0.0, dated 2003-04-17
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
##############################################################################
|
||||
|
||||
$Ideographic = [:Ideographic:];
|
||||
$Hangul = [:Script = HANGUL:];
|
||||
### BEGIN CUSTOMIZATION
|
||||
### This file contains LibreOffice-specific rule customizations.
|
||||
###
|
||||
### To aid future maintainability:
|
||||
### - The change location should be bracketed by comments of this form.
|
||||
### - The original rule should be commented out, and the modified rule placed alongside.
|
||||
### - By doing this, maintainers can more easily compare to an upstream baseline.
|
||||
###
|
||||
### END CUSTOMIZATION
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
[:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
|
||||
[:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:]
|
||||
[:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:]
|
||||
[:name = DIGIT ZERO:]
|
||||
[:name = DIGIT ONE:]
|
||||
[:name = DIGIT TWO:]
|
||||
[:name = DIGIT THREE:]
|
||||
[:name = DIGIT FOUR:]
|
||||
[:name = DIGIT FIVE:]
|
||||
[:name = DIGIT SIX:]
|
||||
[:name = DIGIT SEVEN:]
|
||||
[:name = DIGIT EIGHT:]
|
||||
[:name = DIGIT NINE:]
|
||||
- $Ideographic
|
||||
- $Katakana
|
||||
- $Hangul
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]
|
||||
[:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT SIGN:]
|
||||
[:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:]
|
||||
[:name = EN DASH:] [:name = EM DASH:]
|
||||
[:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
|
||||
$TheZWSP = \u200b;
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Word Break Rules. Definitions and Rules specific to word break begin Here.
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Format = [[:Cf:] - $TheZWSP];
|
||||
|
||||
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
|
||||
$Han = [:Han:];
|
||||
|
||||
#
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$KatakanaEx = $Katakana $Extend*;
|
||||
$IdeographicEx= $Ideographic $Extend*;
|
||||
$HangulEx = $Hangul $Extend*;
|
||||
$FormatEx = $Format $Extend*;
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}-$Han];
|
||||
$ZWJ = [\p{Word_Break = ZWJ}];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [\p{Extended_Pictographic}];
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
|
||||
### This change subtracts undesired characters from the above families
|
||||
### i#56347: BreakIterator patch for Hungarian
|
||||
### i#56348: Special chars in first pos not handled by spell checking for Hungarian
|
||||
|
||||
$Symbols_hu = [[:name = PERCENT SIGN:]
|
||||
[:name = PER MILLE SIGN:]
|
||||
[:name = PER TEN THOUSAND SIGN:]
|
||||
[:name = SECTION SIGN:]
|
||||
[:name = DEGREE SIGN:]
|
||||
[:name = EURO SIGN:]
|
||||
[:name = HYPHEN-MINUS:]
|
||||
[:name = EN DASH:]
|
||||
[:name = EM DASH:]];
|
||||
|
||||
# $ALetter = [\p{Word_Break = ALetter}];
|
||||
$ALetter = [\p{Word_Break = ALetter} $Symbols_hu];
|
||||
|
||||
# $MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter} $Symbols_hu];
|
||||
|
||||
# $MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}-[:name= FULL STOP:]];
|
||||
|
||||
# $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}-[:name= LOW LINE:]];
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNum punctuation.
|
||||
#
|
||||
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7
|
||||
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
# Punctuations by themselves
|
||||
[[:P:][:S:]-[:name = FULL STOP:]]*;
|
||||
[[:name = FULL STOP:]]*;
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
|
||||
#
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
|
||||
$HangulEx ($FormatEx* $HangulEx)* {400};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c Do not break within emoji zwj sequences.
|
||||
# ZWJ × \p{Extended_Pictographic}. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up too far,
|
||||
# but must back up at least enough, and must stop on a boundary.)
|
||||
$ZWJ $Extended_Pict;
|
||||
|
||||
# Rule 3d - Keep horizontal whitespace together.
|
||||
#
|
||||
$WSegSpace $WSegSpace;
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text.
|
||||
|
||||
$ExFm = [$Extend $Format $ZWJ];
|
||||
|
||||
^$ExFm+; # This rule fires only when there are format or extend characters at the
|
||||
# start of text, or immediately following another boundary. It groups them, in
|
||||
# the event there are more than one.
|
||||
|
||||
[^$CR $LF $Newline $ExFm] $ExFm*; # This rule attaches trailing format/extends to words,
|
||||
# with no special rule status value.
|
||||
|
||||
$Numeric $ExFm* {100}; # This group of rules also attach trailing format/extends, but
|
||||
$ALetterPlus $ExFm* {200}; # with rule status set based on the word's final base character.
|
||||
$HangulSyllable {200};
|
||||
$Hebrew_Letter $ExFm* {200};
|
||||
$Katakana $ExFm* {400}; # note: these status values override those from rule 5
|
||||
$Hiragana $ExFm* {400}; # by virtue of being numerically larger.
|
||||
$Ideographic $ExFm* {400}; #
|
||||
|
||||
# NonStarters are the set of all characters that can appear at the 2nd - nth position of
|
||||
# a word. (They may also be the first.) The reverse rule skips over these, until it
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format];
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
#
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
#!.*;
|
||||
! ($NonStarters* | \n \r) .;
|
||||
# rule 6 and 7
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* ($MidLetter | $MidNumLet | $Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
|
||||
|
||||
# rule 7a
|
||||
$Hebrew_Letter $ExFm* $Single_Quote {200};
|
||||
|
||||
# rule 7b and 7c
|
||||
$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
|
||||
|
||||
# rule 8
|
||||
|
||||
$Numeric $ExFm* $Numeric;
|
||||
|
||||
# rule 9
|
||||
|
||||
($ALetterPlus | $Hebrew_Letter) $ExFm* $Numeric;
|
||||
|
||||
# rule 10
|
||||
|
||||
$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
|
||||
|
||||
# rule 13
|
||||
# to be consistent with $KanaKanji $KanaKanji, changed
|
||||
# from 300 to 400.
|
||||
# See also TestRuleStatus in intltest/rbbiapts.cpp
|
||||
$Katakana $ExFm* $Katakana {400};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterPlus $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Hebrew_Letter $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
$Numeric $ExFm* $ExtendNumLet {100}; # (13a)
|
||||
$Katakana $ExFm* $ExtendNumLet {400}; # (13a)
|
||||
$ExtendNumLet $ExFm* $ExtendNumLet {200}; # (13a)
|
||||
|
||||
$ExtendNumLet $ExFm* $ALetterPlus {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Hebrew_Letter {200}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Numeric {100}; # (13b)
|
||||
$ExtendNumLet $ExFm* $Katakana {400}; # (13b)
|
||||
|
||||
# rules 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With incoming rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
^$Regional_Indicator $ExFm* $Regional_Indicator;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### i#13494: For the purposes of editing, standalone punctuation should be treated as a word.
|
||||
### This customization does not replace any rules.
|
||||
[[:P:][:S:]-[:name = FULL STOP:]]*
|
||||
[[:name = FULL STOP:]]*;
|
||||
### END CUSTOMIZATION
|
||||
|
||||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
|
|
@ -1,177 +1,117 @@
|
|||
# Copyright (c) 2002-2006 International Business Machines Corporation and
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
|
||||
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
|
||||
# for Unicode 14.0, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule LB 20.09
|
||||
#
|
||||
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### This file contains LibreOffice-specific rule customizations.
|
||||
###
|
||||
### To aid future maintainability:
|
||||
### - The change location should be bracketed by comments of this form.
|
||||
### - The original rule should be commented out, and the modified rule placed alongside.
|
||||
### - By doing this, maintainers can more easily compare to an upstream baseline.
|
||||
###
|
||||
### END CUSTOMIZATION
|
||||
|
||||
!!chain;
|
||||
!!LBCMNoChain;
|
||||
|
||||
|
||||
!!lookAheadHardBreak;
|
||||
#
|
||||
# !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere
|
||||
# and only used for the line break rules.
|
||||
#
|
||||
# It is used in the implementation of the incredibly annoying rule LB 10
|
||||
# which says to treat any combining mark that is not attached to a base
|
||||
# character as if it were of class AL (alphabetic).
|
||||
#
|
||||
# The problem occurs in the reverse rules.
|
||||
#
|
||||
# Consider a sequence like, with correct breaks as shown
|
||||
# LF ID CM AL AL
|
||||
# ^ ^ ^
|
||||
# Then consider the sequence without the initial ID (ideographic)
|
||||
# LF CM AL AL
|
||||
# ^ ^
|
||||
# Our CM, which in the first example was attached to the ideograph,
|
||||
# is now unattached, becomes an alpha, and joins in with the other
|
||||
# alphas.
|
||||
#
|
||||
# When iterating forwards, these sequences do not present any problems
|
||||
# When iterating backwards, we need to look ahead when encountering
|
||||
# a CM to see whether it attaches to something further on or not.
|
||||
# (Look-ahead in a reverse rule is looking towards the start)
|
||||
#
|
||||
# If the CM is unattached, we need to force a break.
|
||||
#
|
||||
# !!lookAheadHardBreak forces the run time state machine to
|
||||
# stop immediately when a look ahead rule ( '/' operator) matches,
|
||||
# and set the match position to that of the look-ahead operator,
|
||||
# no matter what other rules may be in play at the time.
|
||||
#
|
||||
# See rule LB 19 for an example.
|
||||
#
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$DG = \u00B0;
|
||||
$AL = [[:LineBreak = Alphabetic:] $DG];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [[:LineBreak = Close_Punctuation:] [:LineBreak = Close_Parenthesis:]]; # tdf#31271
|
||||
$CM = [:LineBreak = Combining_Mark:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
$HL = [:LineBreak = Hebrew_Letter:];
|
||||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [[:LineBreak = Ideographic:] - [\ufe30]];
|
||||
$IN = [:LineBreak = Inseparable:];
|
||||
$IS = [[:LineBreak = Infix_Numeric:] [\ufe30]];
|
||||
$ID = [:LineBreak = Ideographic:];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
$JV = [:LineBreak = JV:];
|
||||
$JT = [:LineBreak = JT:];
|
||||
$LF = [:LineBreak = Line_Feed:];
|
||||
$NL = [:LineBreak = Next_Line:];
|
||||
# NS includes CJ for CSS strict line breaking.
|
||||
$NS = [[:LineBreak = Nonstarter:] $CJ];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [[:LineBreak = Open_Punctuation:] - $DG];
|
||||
$OP = [:LineBreak = Open_Punctuation:];
|
||||
$PO = [:LineBreak = Postfix_Numeric:];
|
||||
$BS = \u005C;
|
||||
$PR = [[:LineBreak = Prefix_Numeric:] - $BS];
|
||||
$PR = [:LineBreak = Prefix_Numeric:];
|
||||
$QU = [:LineBreak = Quotation:];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [[:LineBreak = Break_Symbols:] $BS];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
||||
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
|
||||
$CMX = [[$CM] - [$ZWJ]];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
# limited to LineBreak=Complex_Context (SA).
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$dictionary = [$SA];
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
# SA (South East Asian: Thai, Lao, Khmer)
|
||||
# SA (Dictionary chars, excluding Mn and Mc)
|
||||
# SG (Unpaired Surrogates)
|
||||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
$ALPlus = [$AL $AI $SA $SG $XX];
|
||||
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HLcm = $HL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$H2cm = $H2 $CM*;
|
||||
$H3cm = $H3 $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$JLcm = $JL $CM*;
|
||||
$JVcm = $JV $CM*;
|
||||
$JTcm = $JT $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
#
|
||||
$ALPlus $CM+;
|
||||
$BA $CM+;
|
||||
$BB $CM+;
|
||||
$B2 $CM+;
|
||||
$CL $CM+;
|
||||
$EX $CM+;
|
||||
$GL $CM+;
|
||||
$HL $CM+;
|
||||
$HY $CM+;
|
||||
$H2 $CM+;
|
||||
$H3 $CM+;
|
||||
$ID $CM+;
|
||||
$IN $CM+;
|
||||
$IS $CM+;
|
||||
$JL $CM+;
|
||||
$JV $CM+;
|
||||
$JT $CM+;
|
||||
$NS $CM+;
|
||||
$NU $CM+;
|
||||
$OP $CM+;
|
||||
$PO $CM+;
|
||||
$PR $CM+;
|
||||
$QU $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
|
@ -186,19 +126,15 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
# Chaining is disabled with CM because it causes other failures,
|
||||
# so for this one case we need to manually list out longer sequences.
|
||||
#
|
||||
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
|
||||
$AL_FOLLOW_CM = [$CL $EX $HL $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];
|
||||
$AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
|
@ -206,91 +142,124 @@ $CR $LF {100};
|
|||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
$CM+ $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
$CM+ [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ [^$CM];
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
$CM+;
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJcm;
|
||||
$LB8NonBreaks $WJcm;
|
||||
$CM+ $WJcm;
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJcm [^$CAN_CM];
|
||||
$WJcm $CAN_CM $CM*;
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break before or after NBSP and related characters.
|
||||
#
|
||||
# (!SP) x GL
|
||||
[$LB8NonBreaks-$SP] $CM* $GLcm;
|
||||
$CM+ $GLcm;
|
||||
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
$GLcm ($LB8Breaks | $SP);
|
||||
$GLcm [$LB8NonBreaks-$SP] $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think we need this rule.
|
||||
# All but $CM will chain off of preceding rule.
|
||||
# $GLcm will pick up the CM case by itself.
|
||||
|
||||
|
||||
|
||||
#
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
|
||||
# LB 13 Don't break before ']' or '!' or '/', even after spaces.
|
||||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaced
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
# Note subtle interaction with "SP IS /" rules in LB14a.
|
||||
# This rule consumes the SP, chaining happens on the IS, effectively overriding the SP IS rules,
|
||||
# which is the desired behavior.
|
||||
#
|
||||
$OPcm $SP* $CAN_CM $CM*;
|
||||
$OPcm $SP* $CANT_CM;
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
|
||||
# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
|
||||
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
#
|
||||
# LB 14b Do not break before numeric separators (IS), even after spaces.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
$SP $IS $CM* $ZWJ [^$CM $NU];
|
||||
|
||||
$CAN_CM $CM* $IS;
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$OPcm $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
# LB 15
|
||||
# $QUcm $SP* $OPcm;
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### i#83649: Allow line break between quote and opening punctuation.
|
||||
### This customization simply disables rule LB 15.
|
||||
###
|
||||
# $QU $CM* $SP* $OP;
|
||||
###
|
||||
### END CUSTOMIZATION
|
||||
|
||||
# LB 16
|
||||
$CLcm $SP* $NScm;
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2cm $SP* $B2cm;
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
|
@ -301,347 +270,134 @@ $LB18Breaks = [$LB8Breaks $SP];
|
|||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QUcm;
|
||||
$CM+ $QUcm;
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QUcm .?;
|
||||
$QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
# TODO: I don't think this rule is needed.
|
||||
|
||||
$QU $CM* .;
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
$BBcm [^$CB]; # $BB x
|
||||
$BBcm $LB20NonBreaks $CM*;
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HLcm ($HYcm | $BAcm) [^$CB]?;
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 22
|
||||
($ALcm | $HLcm) $INcm;
|
||||
$CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$IDcm $INcm;
|
||||
$INcm $INcm;
|
||||
$NUcm $INcm;
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
$LB20NonBreaks $CM* $IN;
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
$IDcm $POcm;
|
||||
$ALcm $NUcm; # includes $LB19
|
||||
$HLcm $NUcm;
|
||||
$CM+ $NUcm; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NUcm $ALcm;
|
||||
$NUcm $HLcm;
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
$PRcm $IDcm;
|
||||
$ALcm $PRcm;
|
||||
$PRcm ($ALcm | $HLcm);
|
||||
$POcm ($ALcm | $HLcm);
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
($PRcm | $POcm)? ($OPcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? ($PRcm | $POcm)?;
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
### BEGIN CUSTOMIZATION
|
||||
### i#83229: Allow line break after hyphen in number range context.
|
||||
### The default ICU rules treat number ranges (e.g. 100-199) as a single token. This change forces
|
||||
### a break opportunity after the embedded '-', but only if followed by another numeral.
|
||||
###
|
||||
### This customization does not replace any existing rule.
|
||||
### Maintainers: note that this rule should consist of two instances of the LB 25 numbers rule,
|
||||
### separated by a hyphen and an explicit break.
|
||||
|
||||
((($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?)
|
||||
($HY $CM*) /
|
||||
((($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?);
|
||||
|
||||
### END CUSTOMIZATION
|
||||
|
||||
### TODO
|
||||
### ((PrefixNumeric | PostfixNumeric) CombMark*) ? ((OpenPunc | Hyphen) CombMark*)?
|
||||
### (InfixNumeric CombMark*)? Numeric (CombMark* (Numeric | BreakSym | InfixNumeric))*
|
||||
### (CombMark* (ClosePunc | CloseParen))? (CombMark* (PrefixNumeric | PostfixNumeric))?
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
|
||||
($JVcm | $H2cm) ($JVcm | $JTcm);
|
||||
($JTcm | $H3cm) $JTcm;
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat a Korean syllable block the same as ID (don't break it)
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
|
||||
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
|
||||
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALcm | $HLcm) ($ALcm | $HLcm);
|
||||
$CM+ ($ALcm | $HLcm); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IScm ($ALcm | $NUcm);
|
||||
|
||||
#
|
||||
# Rule 30 Do not break between letters, numbers or ordinary symbols
|
||||
# and opening or closing punctuation
|
||||
#
|
||||
($ALcm | $HLcm | $NUcm) $OPcm;
|
||||
$CM+ $OPcm;
|
||||
$CLcm ($ALcm | $HLcm | $NUcm);
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $BB;
|
||||
$CM+ $B2;
|
||||
$CM+ $CL;
|
||||
$CM+ $EX;
|
||||
$CM+ $GL;
|
||||
$CM+ $HL;
|
||||
$CM+ $HY;
|
||||
$CM+ $H2;
|
||||
$CM+ $H3;
|
||||
$CM+ $ID;
|
||||
$CM+ $IN;
|
||||
$CM+ $IS;
|
||||
$CM+ $JL;
|
||||
$CM+ $JV;
|
||||
$CM+ $JT;
|
||||
$CM+ $NS;
|
||||
$CM+ $NU;
|
||||
$CM+ $OP;
|
||||
$CM+ $PO;
|
||||
$CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
$CM+;
|
||||
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] [whatever]
|
||||
# The CM needs to behave as an AL
|
||||
#
|
||||
$AL_FOLLOW $CM+ / (
|
||||
[$BK $CR $LF $NL $ZW {eof}] |
|
||||
$SP+ $CM+ $SP |
|
||||
$SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.
|
||||
# LB14 says OP SP* x .
|
||||
# becomes OP SP* x AL
|
||||
# becomes OP SP* x CM+ AL_FOLLOW
|
||||
#
|
||||
# Further note: the $AL in [$AL {eof}] is only to work around
|
||||
# a rule compiler bug which complains about
|
||||
# empty sets otherwise.
|
||||
|
||||
#
|
||||
# Sequences of the form (shown forwards)
|
||||
# [CANT_CM] <break> [CM] <break> [PR]
|
||||
# The CM needs to behave as an AL
|
||||
# This rule is concerned about getting the second of the two <breaks> in place.
|
||||
#
|
||||
|
||||
[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
|
||||
|
||||
|
||||
|
||||
# LB 4, 5, 6
|
||||
|
||||
$LB4Breaks [$LB4NonBreaks-$CM];
|
||||
$LB4Breaks $CM+ $CAN_CM;
|
||||
$LF $CR;
|
||||
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
[$SP $ZW] [$LB4NonBreaks-$CM];
|
||||
[$SP $ZW] $CM+ $CAN_CM;
|
||||
|
||||
# LB 8 Break after zero width space
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
# X $CM needs to behave like X, where X is not $SP or controls.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# Stick together any combining sequences that don't match other rules.
|
||||
$CM+ $CAN_CM;
|
||||
|
||||
|
||||
# LB 11
|
||||
$CM* $WJ $CM* $CAN_CM;
|
||||
$CM* $WJ [$LB8NonBreaks-$CM];
|
||||
|
||||
$CANT_CM $CM* $WJ;
|
||||
$CM* $CAN_CM $CM* $WJ;
|
||||
|
||||
# LB 12
|
||||
# x GL
|
||||
#
|
||||
$CM* $GL $CM* [$LB8NonBreaks-$CM-$SP];
|
||||
|
||||
#
|
||||
# GL x
|
||||
#
|
||||
$CANT_CM $CM* $GL;
|
||||
$CM* $CAN_CM $CM* $GL;
|
||||
|
||||
|
||||
# LB 13
|
||||
$CL $CM+ $CAN_CM;
|
||||
$EX $CM+ $CAN_CM;
|
||||
$IS $CM+ $CAN_CM;
|
||||
$SY $CM+ $CAN_CM;
|
||||
|
||||
$CL [$LB8NonBreaks-$CM];
|
||||
$EX [$LB8NonBreaks-$CM];
|
||||
$IS [$LB8NonBreaks-$CM];
|
||||
$SY [$LB8NonBreaks-$CM];
|
||||
|
||||
# Rule 13 & 14 taken together for an edge case.
|
||||
# Match this, shown forward
|
||||
# OP SP+ ($CM+ behaving as $AL) (CL | EX | IS | SY)
|
||||
# This really wants to chain at the $CM+ (which is acting as an $AL)
|
||||
# except for $CM chaining being disabled.
|
||||
[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP;
|
||||
|
||||
# LB 14 OP SP* x
|
||||
#
|
||||
$CM* $CAN_CM $SP* $CM* $OP;
|
||||
$CANT_CM $SP* $CM* $OP;
|
||||
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
|
||||
|
||||
$AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
|
||||
$CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;
|
||||
$SY $CM $SP+ $OP; # TODO: Experiment. Remove.
|
||||
|
||||
|
||||
|
||||
# LB 15
|
||||
# $CM* $OP $SP* $CM* $QU;
|
||||
|
||||
# LB 16
|
||||
$CM* $NS $SP* $CM* $CL;
|
||||
|
||||
# LB 17
|
||||
$CM* $B2 $SP* $CM* $B2;
|
||||
|
||||
# LB 18 break after spaces
|
||||
# Nothing explicit needed here.
|
||||
|
||||
|
||||
#
|
||||
# LB 19
|
||||
#
|
||||
$CM* $QU $CM* $CAN_CM; # . x QU
|
||||
$CM* $QU $LB18NonBreaks;
|
||||
|
||||
|
||||
$CM* $CAN_CM $CM* $QU; # QU x .
|
||||
$CANT_CM $CM* $QU;
|
||||
|
||||
#
|
||||
# LB 20 Break before and after CB.
|
||||
# nothing needed here.
|
||||
#
|
||||
|
||||
# LB 21
|
||||
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)
|
||||
|
||||
$CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .
|
||||
[^$CB] $CM* $BB; #
|
||||
|
||||
# LB21a
|
||||
[^$CB] $CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
$CM* $IN $CM* ($ALPlus | $HL);
|
||||
$CM* $IN $CM* $ID;
|
||||
$CM* $IN $CM* $IN;
|
||||
$CM* $IN $CM* $NU;
|
||||
|
||||
# LB 23
|
||||
$CM* $PO $CM* $ID;
|
||||
$CM* $NU $CM* ($ALPlus | $HL);
|
||||
$CM* ($ALPlus | $HL) $CM* $NU;
|
||||
|
||||
# LB 24
|
||||
$CM* $ID $CM* $PR;
|
||||
$CM* $PR $CM* $ALPlus;
|
||||
$CM* ($ALPlus | $HL) $CM* $PR;
|
||||
$CM* ($ALPlus | $HL) $CM* $PO;
|
||||
|
||||
$CM* $ALPlus $CM* ($IS | $SY | $HY)+ / $SP;
|
||||
$CM* $NU+ $CM* $HY+ / $SP;
|
||||
|
||||
# LB 25
|
||||
($CM* ($PR | $PO))? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26
|
||||
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
|
||||
$CM* ($JT | $JV) $CM* ($H2 | $JV);
|
||||
$CM* $JT $CM* ($H3 | $JT);
|
||||
|
||||
# LB 27
|
||||
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
|
||||
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
|
||||
|
||||
# LB 28
|
||||
$CM* ($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 29
|
||||
$CM* ($NU | $ALPlus) $CM* $IS+ [^$SP];
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
$CM* $OP $CM* ($ALPlus | $HL | $NU);
|
||||
$CM* ($ALPlus | $HL | $NU) $CM* ($CL | $SY)+ [^$SP];
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 7
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
|
||||
# LB 9
|
||||
$SP+ $CM* $OP;
|
||||
|
||||
# LB 10
|
||||
$SP+ $CM* $QU;
|
||||
|
||||
# LB 11
|
||||
$SP+ $CM* $CL;
|
||||
$SP+ $CM* $B2;
|
||||
|
||||
# LB 21
|
||||
$CM* ($HY | $BA) $CM* $HL;
|
||||
|
||||
# LB 18
|
||||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
$CL $CM* ($NU | $IS | $SY);
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# Skip forward over all character classes that are involved in
|
||||
# rules containing patterns with possibly more than one char
|
||||
# of context.
|
||||
#
|
||||
# It might be slightly more efficient to have specific rules
|
||||
# instead of one generic one, but only if we could
|
||||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $B2 $PR $HY $BA $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $BA $dictionary];
|
||||
$dictionary $dictionary;
|
||||
# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
|
||||
$EB $CM* $EM;
|
||||
$ExtPictUnassigned $CM* $EM;
|
||||
|
||||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
|
|
@ -1,128 +0,0 @@
|
|||
#
|
||||
# Copyright (C) 2002-2006, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: sent.txt
|
||||
#
|
||||
# ICU Sentence Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on SA 29 version 5.0.0
|
||||
# Includes post 5.0 changes to treat Japanese half width voicing marks
|
||||
# as Grapheme Extend.
|
||||
#
|
||||
|
||||
|
||||
$VoiceMarks = [\uff9e\uff9f];
|
||||
$Thai = [:Script = Thai:];
|
||||
|
||||
#
|
||||
# Character categories as defined in TR 29
|
||||
#
|
||||
$Sep = [\p{Sentence_Break = Sep}];
|
||||
$Format = [\p{Sentence_Break = Format}];
|
||||
$Sp = [\p{Sentence_Break = Sp}];
|
||||
$Lower = [\p{Sentence_Break = Lower}];
|
||||
$Upper = [\p{Sentence_Break = Upper}];
|
||||
$OLetter = [\p{Sentence_Break = OLetter}-$VoiceMarks];
|
||||
$Numeric = [\p{Sentence_Break = Numeric}];
|
||||
$ATerm = [\p{Sentence_Break = ATerm}];
|
||||
$STerm = [\p{Sentence_Break = STerm}];
|
||||
$Close = [\p{Sentence_Break = Close}];
|
||||
|
||||
#
|
||||
# Define extended forms of the character classes,
|
||||
# incorporate grapheme cluster + format chars.
|
||||
# Rules 4 and 5.
|
||||
|
||||
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
|
||||
|
||||
$SpEx = $Sp ($Extend | $Format)*;
|
||||
$LowerEx = $Lower ($Extend | $Format)*;
|
||||
$UpperEx = $Upper ($Extend | $Format)*;
|
||||
$OLetterEx = $OLetter ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ATermEx = $ATerm ($Extend | $Format)*;
|
||||
$STermEx = $STerm ($Extend | $Format)*;
|
||||
$CloseEx = $Close ($Extend | $Format)*;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!chain;
|
||||
!!forward;
|
||||
|
||||
# Rule 3 - break after separators. Keep CR/LF together.
|
||||
#
|
||||
$CR $LF;
|
||||
|
||||
$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*;
|
||||
$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*;
|
||||
|
||||
# Rule 4 - Break after $Sep.
|
||||
# Rule 5 - Ignore $Format and $Extend
|
||||
#
|
||||
[^$Sep]? ($Extend | $Format)*;
|
||||
|
||||
|
||||
# Rule 6
|
||||
$ATermEx $NumericEx;
|
||||
|
||||
# Rule 7
|
||||
$UpperEx $ATermEx $UpperEx;
|
||||
|
||||
#Rule 8
|
||||
# Note: follows errata for Unicode 5.0 boundary rules.
|
||||
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*;
|
||||
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
|
||||
|
||||
# Rule 8a
|
||||
($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx);
|
||||
|
||||
#Rule 9, 10, 11
|
||||
($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?;
|
||||
|
||||
#Rule 12
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai];
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100};
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!reverse;
|
||||
|
||||
$SpEx_R = ($Extend | $Format)* $Sp;
|
||||
$ATermEx_R = ($Extend | $Format)* $ATerm;
|
||||
$STermEx_R = ($Extend | $Format)* $STerm;
|
||||
$CloseEx_R = ($Extend | $Format)* $Close;
|
||||
|
||||
#
|
||||
# Reverse rules.
|
||||
# For now, use the old style inexact reverse rules, which are easier
|
||||
# to write, but less efficient.
|
||||
# TODO: exact reverse rules. It appears that exact reverse rules
|
||||
# may require improving support for look-ahead breaks in the
|
||||
# builder. Needs more investigation.
|
||||
#
|
||||
|
||||
[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
|
||||
#.*;
|
||||
|
||||
# Explanation for this rule:
|
||||
#
|
||||
# It needs to back over
|
||||
# The $Sep at which we probably begin
|
||||
# All of the non $Sep chars leading to the preceding $Sep
|
||||
# The preceding $Sep, which will be the second one that the rule matches.
|
||||
# Any immediately preceding STerm or ATerm sequences. We need to see these
|
||||
# to get the correct rule status when moving forwards again.
|
||||
#
|
||||
# [{bof}] inhibit rule chaining. Without this, rule would loop on itself and match
|
||||
# the entire string.
|
||||
#
|
||||
# (.? | $LF $CR) Match one $Sep instance. Use .? rather than $Sep because position might be
|
||||
# at the beginning of the string at this point, and we don't want to fail.
|
||||
# Can only use {eof} once, and it is used later.
|
||||
#
|
||||
|
Loading…
Reference in a new issue