From 9a14a0fd8b4227b5d08b3154cddca46f82ec2a03 Mon Sep 17 00:00:00 2001 From: Jonathan Clark Date: Mon, 2 Dec 2024 16:03:43 -0700 Subject: [PATCH] tdf#162912 i18npool: Updated CJK BreakIterator to use custom rules Regression from commit 14c6cde779d64596eab0f4d3f32f181ce2243929: "tdf#49885 Updated CJK BreakIterator to use ICU" Previously, languages requiring dictionary-based break iterators were handled by instantiating a stock ICU break iterator as a special case. tdf#49885 upgraded our custom rules to support passthrough for dictionary-based breaking, so this special case is no longer necessary. Change-Id: Iebb06de82eb511946e5b220e5dc414440838b03c Reviewed-on: https://gerrit.libreoffice.org/c/core/+/177713 Tested-by: Jenkins Reviewed-by: Jonathan Clark --- i18npool/qa/cppunit/test_breakiterator.cxx | 249 +++++++++++++++++- .../breakiterator/breakiterator_unicode.cxx | 12 +- 2 files changed, 249 insertions(+), 12 deletions(-) diff --git a/i18npool/qa/cppunit/test_breakiterator.cxx b/i18npool/qa/cppunit/test_breakiterator.cxx index 24666ca4ac80..80bdeb15c7be 100644 --- a/i18npool/qa/cppunit/test_breakiterator.cxx +++ b/i18npool/qa/cppunit/test_breakiterator.cxx @@ -49,6 +49,7 @@ public: void testHebrewGereshGershaim(); void testLegacySurrogatePairs(); void testWordCount(); + void testDictionaryIteratorLanguages(); CPPUNIT_TEST_SUITE(TestBreakIterator); CPPUNIT_TEST(testLineBreaking); @@ -70,6 +71,7 @@ public: CPPUNIT_TEST(testHebrewGereshGershaim); CPPUNIT_TEST(testLegacySurrogatePairs); CPPUNIT_TEST(testWordCount); + CPPUNIT_TEST(testDictionaryIteratorLanguages); CPPUNIT_TEST_SUITE_END(); private: @@ -1612,6 +1614,25 @@ void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > co CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); } + + { + // tdf#162912: Double-clicking should only select one Basic identifier + static constexpr OUString aTest = u"ThisComponent.CurrentSelection"_ustr; + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + + aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(13), aBounds.endPos); + + aBounds = xBreak->getWordBoundary(aTest, 15, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos); + } } void TestBreakIterator::testJapanese() @@ -1914,7 +1935,7 @@ void TestBreakIterator::testWordCount() const OUString aStr = u"Wordの様にワード数をするのにTest\n植松町"_ustr; - CPPUNIT_ASSERT_EQUAL(7, fnCountWords(aStr, aLocale)); + CPPUNIT_ASSERT_EQUAL(8, fnCountWords(aStr, aLocale)); } // tdf#150621 Korean words should be counted individually, rather than by syllable. @@ -1941,6 +1962,232 @@ void TestBreakIterator::testWordCount() } } +void TestBreakIterator::testDictionaryIteratorLanguages() +{ + // Thai + { + lang::Locale aLocale{ "th", "TH", "" }; + + const OUString aStr = u"รอนานหรือเปล่า"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 10, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos); + } + + // Japanese + { + lang::Locale aLocale{ "ja", "JP", "" }; + + const OUString aStr = u"通産省工業技術院北海道"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 2, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 4, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 6, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 7, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 9, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos); + } + + // Chinese + { + lang::Locale aLocale{ "zh", "CN", "" }; + + const OUString aStr = u"很高兴认识你"_ustr; + + i18n::Boundary aBounds; + + aBounds + = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds + = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::DICTIONARY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, i18n::WordType::ANY_WORD, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 0, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 1, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 3, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos); + + aBounds = m_xBreak->getWordBoundary(aStr, 5, aLocale, + i18n::WordType::ANYWORD_IGNOREWHITESPACES, true); + CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos); + CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos); + } +} + void TestBreakIterator::setUp() { BootstrapFixtureBase::setUp(); diff --git a/i18npool/source/breakiterator/breakiterator_unicode.cxx b/i18npool/source/breakiterator/breakiterator_unicode.cxx index 5992b6144b0b..4e5df75d2701 100644 --- a/i18npool/source/breakiterator/breakiterator_unicode.cxx +++ b/i18npool/source/breakiterator/breakiterator_unicode.cxx @@ -74,16 +74,6 @@ class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator }; -bool locale_requires_dictionary_iterator(const css::lang::Locale& rLocale) -{ - return rLocale.Language == "bo" || // Tibetan - rLocale.Language == "dz" || // Dzongkha - rLocale.Language == "ja" || // Japanese - rLocale.Language == "km" || // Khmer - rLocale.Language == "lo" || // Lao - rLocale.Language == "th" || // Thai - rLocale.Language == "zh"; // Chinese -} } // loading ICU breakiterator on demand. @@ -189,7 +179,7 @@ void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocal rbi.reset(); } } - else if(!locale_requires_dictionary_iterator(rLocale)) + else { // language;rule (not langtag, unless we'd actually load such) OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());