tdf#162912 i18npool: Updated CJK BreakIterator to use custom rules
Regression from commit 14c6cde779: "tdf#49885 Updated CJK BreakIterator to use ICU"
Previously, languages requiring dictionary-based break iterators were
handled by instantiating a stock ICU break iterator as a special case.
tdf#49885 upgraded our custom rules to support passthrough for
dictionary-based breaking, so this special case is no longer necessary.
Change-Id: Iebb06de82eb511946e5b220e5dc414440838b03c
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177713
Tested-by: Jenkins
Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
This commit is contained in:
parent
94afced019
commit
9a14a0fd8b
2 changed files with 249 additions and 12 deletions
|
@ -49,6 +49,7 @@ public:
|
|||
void testHebrewGereshGershaim();
|
||||
void testLegacySurrogatePairs();
|
||||
void testWordCount();
|
||||
void testDictionaryIteratorLanguages();
|
||||
|
||||
CPPUNIT_TEST_SUITE(TestBreakIterator);
|
||||
CPPUNIT_TEST(testLineBreaking);
|
||||
|
@ -70,6 +71,7 @@ public:
|
|||
CPPUNIT_TEST(testHebrewGereshGershaim);
|
||||
CPPUNIT_TEST(testLegacySurrogatePairs);
|
||||
CPPUNIT_TEST(testWordCount);
|
||||
CPPUNIT_TEST(testDictionaryIteratorLanguages);
|
||||
CPPUNIT_TEST_SUITE_END();
|
||||
|
||||
private:
|
||||
|
@ -1612,6 +1614,25 @@ void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > co
|
|||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
|
||||
}
|
||||
|
||||
{
|
||||
// tdf#162912: Double-clicking should only select one Basic identifier
|
||||
static constexpr OUString aTest = u"ThisComponent.CurrentSelection"_ustr;
|
||||
|
||||
aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
|
||||
|
||||
aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos);
|
||||
|
||||
aBounds = xBreak->getWordBoundary(aTest, 15, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBreakIterator::testJapanese()
|
||||
|
@ -1914,7 +1935,7 @@ void TestBreakIterator::testWordCount()
|
|||
|
||||
const OUString aStr = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
|
||||
|
||||
CPPUNIT_ASSERT_EQUAL(7, fnCountWords(aStr, aLocale));
|
||||
CPPUNIT_ASSERT_EQUAL(8, fnCountWords(aStr, aLocale));
|
||||
}
|
||||
|
||||
// tdf#150621 Korean words should be counted individually, rather than by syllable.
|
||||
|
@ -1941,6 +1962,232 @@ void TestBreakIterator::testWordCount()
|
|||
}
|
||||
}
|
||||
|
||||
void TestBreakIterator::testDictionaryIteratorLanguages()
|
||||
{
|
||||
// Thai
|
||||
{
|
||||
lang::Locale aLocale{ "th", "TH", "" };
|
||||
|
||||
const OUString aStr = u"รอนานหรือเปล่า"_ustr;
|
||||
|
||||
i18n::Boundary aBounds;
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
|
||||
}
|
||||
|
||||
// Japanese
|
||||
{
|
||||
lang::Locale aLocale{ "ja", "JP", "" };
|
||||
|
||||
const OUString aStr = u"通産省工業技術院北海道"_ustr;
|
||||
|
||||
i18n::Boundary aBounds;
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
|
||||
}
|
||||
|
||||
// Chinese
|
||||
{
|
||||
lang::Locale aLocale{ "zh", "CN", "" };
|
||||
|
||||
const OUString aStr = u"很高兴认识你"_ustr;
|
||||
|
||||
i18n::Boundary aBounds;
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::ANY_WORD, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
|
||||
aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale,
|
||||
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBreakIterator::setUp()
|
||||
{
|
||||
BootstrapFixtureBase::setUp();
|
||||
|
|
|
@ -74,16 +74,6 @@ class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
|
|||
|
||||
};
|
||||
|
||||
// Returns true when rLocale.Language is one of the language codes listed
// below, false for every other language. Only the Language field is examined.
bool locale_requires_dictionary_iterator(const css::lang::Locale& rLocale)
{
    const auto& rLanguage = rLocale.Language;

    if (rLanguage == "bo") // Tibetan
        return true;
    if (rLanguage == "dz") // Dzongkha
        return true;
    if (rLanguage == "ja") // Japanese
        return true;
    if (rLanguage == "km") // Khmer
        return true;
    if (rLanguage == "lo") // Lao
        return true;
    if (rLanguage == "th") // Thai
        return true;

    return rLanguage == "zh"; // Chinese
}
|
||||
}
|
||||
|
||||
// loading ICU breakiterator on demand.
|
||||
|
@ -189,7 +179,7 @@ void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocal
|
|||
rbi.reset();
|
||||
}
|
||||
}
|
||||
else if(!locale_requires_dictionary_iterator(rLocale))
|
||||
else
|
||||
{
|
||||
// language;rule (not langtag, unless we'd actually load such)
|
||||
OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
|
||||
|
|
Loading…
Reference in a new issue