tdf#150621 Changed Korean word counting to use words

Previously, Writer counted characters for all CJK languages, rather than
words. This is the correct behavior for Chinese and Japanese, which make
extensive use of ideographs. However, it is not correct for Korean.

This change adjusts the Writer word count algorithm to count Korean
words, rather than Korean characters.
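
For context only, not part of the patch: the sketch below, built directly on ICU (the engine that
i18npool wraps), illustrates the intended distinction. The countWords helper, its parameters, and
the main driver are hypothetical illustrations; Writer's real code path goes through SwScanner and
the i18npool break iterator changed in the diffs below.

    // Minimal sketch, assuming plain ICU4C; not the Writer implementation.
    #include <unicode/brkiter.h>
    #include <unicode/ubrk.h>
    #include <unicode/unistr.h>
    #include <iostream>
    #include <memory>

    static int countWords(const icu::UnicodeString& rText, const icu::Locale& rLocale)
    {
        UErrorCode nStatus = U_ZERO_ERROR;
        std::unique_ptr<icu::BreakIterator> pIter(
            icu::BreakIterator::createWordInstance(rLocale, nStatus));
        if (U_FAILURE(nStatus))
            return 0;
        pIter->setText(rText);

        int nWords = 0;
        for (pIter->first(); pIter->next() != icu::BreakIterator::DONE;)
        {
            // Segments whose rule status is UBRK_WORD_NONE are whitespace or
            // punctuation; every other segment (letters, numbers, ideographs)
            // counts as one word.
            if (pIter->getRuleStatus() != UBRK_WORD_NONE)
                ++nWords;
        }
        return nWords;
    }

    int main()
    {
        // "저는 영화를 봤어요" is three space-delimited Korean words; counting
        // each of its eight hangul syllables (the old CJK behavior) overstates
        // the word count.
        std::cout << countWords(icu::UnicodeString::fromUTF8("저는 영화를 봤어요"),
                                icu::Locale("ko", "KR"))
                  << '\n';
    }

The actual fix below does not call ICU directly; it keeps the existing SwScanner flow and simply
skips the per-character truncation when the character's language is Korean
(primary(aCurrentLang) == primary(LANGUAGE_KOREAN)).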

Change-Id: I6e77136867baca1a7b51248886ee5fd7073ad364
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170621
Tested-by: Jenkins
Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
Jonathan Clark 2024-07-16 16:50:10 -06:00
parent c9a96f2724
commit aa938fe03c
7 changed files with 153 additions and 65 deletions

@@ -2904,7 +2904,8 @@ EditSelection ImpEditEngine::TransliterateText( const EditSelection& rSelection,
             }
             i18n::Boundary aCurWordBndry( aSttBndry );
-            while (aCurWordBndry.endPos && aCurWordBndry.startPos <= aEndBndry.startPos)
+            while (aCurWordBndry.startPos != aCurWordBndry.endPos
+                   && aCurWordBndry.startPos <= aEndBndry.startPos)
             {
                 nCurrentStart = aCurWordBndry.startPos;
                 nCurrentEnd = aCurWordBndry.endPos;

@@ -1867,33 +1867,32 @@ void TestBreakIterator::testLegacySurrogatePairs()
 void TestBreakIterator::testWordCount()
 {
-    auto count_words_fn = [&](const OUString& str, const lang::Locale& aLocale) -> int
+    auto fnCountWords = [&](const OUString& aStr, const lang::Locale& aLocale) -> int
     {
-        int num_words = 0;
-        sal_Int32 next_pos = 0;
-        int iter_guard = 0;
+        int nWords = 0;
+        sal_Int32 nNextPos = 0;
+        int nIterGuard = 0;
-        if (m_xBreak->isBeginWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT))
+        if (m_xBreak->isBeginWord(aStr, nNextPos, aLocale, i18n::WordType::WORD_COUNT))
         {
-            ++num_words;
+            ++nWords;
         }
         while (true)
         {
-            CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100);
+            CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++nIterGuard < 100);
-            auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT);
-            if (aBounds.endPos < next_pos || aBounds.startPos == aBounds.endPos)
+            auto aBounds = m_xBreak->nextWord(aStr, nNextPos, aLocale, i18n::WordType::WORD_COUNT);
+            if (aBounds.endPos == aBounds.startPos)
             {
                 break;
             }
-            next_pos = aBounds.endPos;
-            ++num_words;
+            nNextPos = aBounds.endPos;
+            ++nWords;
         }
-        return num_words;
+        return nWords;
     };
     // i#80815: "Word count differs from MS Word"
@@ -1903,29 +1902,29 @@ void TestBreakIterator::testWordCount()
         aLocale.Language = "en";
         aLocale.Country = "US";
-        const OUString str = u""
-            "test data for word count issue #80815\n"
-            "fo\\\'sforos\n"
-            "archipi\\\'elago\n"
-            "do\\^me\n"
-            "f**k\n"
-            "\n"
-            "battery-driven\n"
-            "and/or\n"
-            "apple(s)\n"
-            "money+opportunity\n"
-            "Micro$oft\n"
-            "\n"
-            "300$\n"
-            "I(not you)\n"
-            "a****n\n"
-            "1+3=4\n"
-            "\n"
-            "aaaaaaa.aaaaaaa\n"
-            "aaaaaaa,aaaaaaa\n"
-            "aaaaaaa;aaaaaaa\n"_ustr;
+        const OUString aStr = u""
+            "test data for word count issue #80815\n"
+            "fo\\\'sforos\n"
+            "archipi\\\'elago\n"
+            "do\\^me\n"
+            "f**k\n"
+            "\n"
+            "battery-driven\n"
+            "and/or\n"
+            "apple(s)\n"
+            "money+opportunity\n"
+            "Micro$oft\n"
+            "\n"
+            "300$\n"
+            "I(not you)\n"
+            "a****n\n"
+            "1+3=4\n"
+            "\n"
+            "aaaaaaa.aaaaaaa\n"
+            "aaaaaaa,aaaaaaa\n"
+            "aaaaaaa;aaaaaaa\n"_ustr;
-        CPPUNIT_ASSERT_EQUAL(24, count_words_fn(str, aLocale));
+        CPPUNIT_ASSERT_EQUAL(24, fnCountWords(aStr, aLocale));
     }
     // Test that the switch to upstream ICU for CJ word boundary analysis doesn't change word count.
@@ -1934,9 +1933,32 @@ void TestBreakIterator::testWordCount()
         aLocale.Language = "ja";
         aLocale.Country = "JP";
-        const OUString str = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
+        const OUString aStr = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
-        CPPUNIT_ASSERT_EQUAL(7, count_words_fn(str, aLocale));
+        CPPUNIT_ASSERT_EQUAL(7, fnCountWords(aStr, aLocale));
     }
+    // tdf#150621 Korean words should be counted individually, rather than by syllable.
+    //
+    // Per i#80815, the intention for the word count feature is to emulate the behavior of MS Word.
+    {
+        lang::Locale aLocale;
+        aLocale.Language = "ko";
+        aLocale.Country = "KR";
+        // Basic case: Korean words are counted as space-delimited. In particular, grammatical
+        // particles are treated as part of the previous word.
+        CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"저는 영화를 봤어요"_ustr, aLocale));
+        // Mixed script: Korean is mostly written in hangul, but hanja are still used in certain
+        // situations (e.g. abbreviations in newspaper articles). For Chinese and Japanese, such
+        // ideographs would be counted individually as words. In Korean, however, they are treated
+        // no differently than hangul characters.
+        CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"불렀다...與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"불렀다 ...與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"불렀다 ... 與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"尹탄핵"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"尹 탄핵"_ustr, aLocale));
+    }
 }

@@ -364,7 +364,7 @@ Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int
         Boundary rv;
         rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
         if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
-            rv.endPos = result.startPos;
+            rv.endPos = rv.startPos;
         else {
             if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
                 && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))

@@ -654,7 +654,7 @@ void SwDocTest::testSwScanner()
         pTextNode = aPaM.GetPointNode().GetTextNode();
         pTextNode->CountWords(aDocStat, 0, test.getLength());
         CPPUNIT_ASSERT_EQUAL_MESSAGE("words", static_cast<sal_uLong>(58), aDocStat.nWord);
-        CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean syllables", static_cast<sal_uLong>(43), aDocStat.nAsianWord);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean words", static_cast<sal_uLong>(43), aDocStat.nAsianWord);
         CPPUNIT_ASSERT_EQUAL_MESSAGE("non-whitespace chars", static_cast<sal_uLong>(105), aDocStat.nCharExcludingSpaces);
         CPPUNIT_ASSERT_EQUAL_MESSAGE("characters", static_cast<sal_uLong>(128), aDocStat.nChar);
     }
@@ -929,6 +929,46 @@ void SwDocTest::testSwScanner()
         CPPUNIT_ASSERT_EQUAL(sal_uLong(17), aDocStat.nChar);
         aDocStat.Reset();
     }
+    // tdf#150621 Korean words should be counted individually, rather than by syllable.
+    //
+    // Per i#80815, the intention for the word count feature is to emulate the behavior of MS Word.
+    {
+        auto fnAssertWords = [&](const OUString& aStr, sal_uLong nWords, sal_uLong nAsianWords)
+        {
+            m_pDoc->getIDocumentContentOperations().AppendTextNode(*aPaM.GetPoint());
+            SvxLanguageItem aCJKLangItem(LANGUAGE_KOREAN, RES_CHRATR_CJK_LANGUAGE);
+            SvxLanguageItem aWestLangItem(LANGUAGE_ENGLISH_US, RES_CHRATR_LANGUAGE);
+            m_pDoc->getIDocumentContentOperations().InsertPoolItem(aPaM, aCJKLangItem);
+            m_pDoc->getIDocumentContentOperations().InsertPoolItem(aPaM, aWestLangItem);
+            m_pDoc->getIDocumentContentOperations().InsertString(aPaM, aStr);
+            SwDocStat aDocStat;
+            pTextNode = aPaM.GetPointNode().GetTextNode();
+            pTextNode->CountWords(aDocStat, 0, aStr.getLength());
+            CPPUNIT_ASSERT_EQUAL_MESSAGE("words", nWords, aDocStat.nWord);
+            CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean words", nAsianWords,
+                                         aDocStat.nAsianWord);
+        };
+        // Basic case: Korean words are counted as space-delimited. In particular, grammatical
+        // particles are treated as part of the previous word.
+        fnAssertWords(u"저는 영화를 봤어요"_ustr, 3, 3);
+        // Mixed script: Korean is mostly written in hangul, but hanja are still used in certain
+        // situations (e.g. abbreviations in newspaper articles). For Chinese and Japanese, such
+        // ideographs would be counted individually as words. In Korean, however, they are treated
+        // no differently than hangul characters.
+        fnAssertWords(u"尹탄핵"_ustr, 1, 1);
+        fnAssertWords(u"尹 탄핵"_ustr, 2, 2);
+        // These mixed-script results are anomalous, but reflect the behavior of MSW.
+        fnAssertWords(u"불렀다...與"_ustr, 1, 1);
+        fnAssertWords(u"불렀다 ...與"_ustr, 2, 1);
+        fnAssertWords(u"불렀다 ... 與"_ustr, 3, 2);
+    }
 }
 void SwDocTest::testMergePortionsDeleteNotSorted()

@@ -793,40 +793,62 @@ SwScanner::SwScanner(std::function<LanguageType(sal_Int32, sal_Int32, bool)> aGe
 namespace
 {
-    //fdo#45271 for Asian words count characters instead of words
-    sal_Int32 forceEachAsianCodePointToWord(const OUString &rText, sal_Int32 nBegin, sal_Int32 nLen)
-    {
-        if (nLen > 1)
-        {
-            const uno::Reference< XBreakIterator > &rxBreak = g_pBreakIt->GetBreakIter();
-            sal_uInt16 nCurrScript = rxBreak->getScriptType( rText, nBegin );
-            sal_Int32 indexUtf16 = nBegin;
-            rText.iterateCodePoints(&indexUtf16);
-            //First character is Asian, consider it a word :-(
-            if (nCurrScript == i18n::ScriptType::ASIAN)
-            {
-                nLen = indexUtf16 - nBegin;
-                return nLen;
-            }
-            //First character was not Asian, consider appearance of any Asian character
-            //to be the end of the word
-            while (indexUtf16 < nBegin + nLen)
-            {
-                nCurrScript = rxBreak->getScriptType( rText, indexUtf16 );
-                if (nCurrScript == i18n::ScriptType::ASIAN)
-                {
-                    nLen = indexUtf16 - nBegin;
-                    return nLen;
-                }
-                rText.iterateCodePoints(&indexUtf16);
-            }
-        }
-        return nLen;
-    }
+// tdf#45271 For Chinese and Japanese, count characters instead of words
+sal_Int32
+forceEachCJCodePointToWord(const OUString& rText, sal_Int32 nBegin, sal_Int32 nLen,
+                           const ModelToViewHelper* pModelToView,
+                           std::function<LanguageType(sal_Int32, sal_Int32, bool)>& fnGetLangOfChar)
+{
+    if (nLen > 1)
+    {
+        const uno::Reference<XBreakIterator>& rxBreak = g_pBreakIt->GetBreakIter();
+        sal_uInt16 nCurrScript = rxBreak->getScriptType(rText, nBegin);
+        sal_Int32 indexUtf16 = nBegin;
+        rText.iterateCodePoints(&indexUtf16);
+        // First character is Asian
+        if (nCurrScript == i18n::ScriptType::ASIAN)
+        {
+            auto aModelBeginPos = pModelToView->ConvertToModelPosition(nBegin);
+            auto aCurrentLang = fnGetLangOfChar(aModelBeginPos.mnPos, nCurrScript, false);
+            // tdf#150621 Korean words must be counted as-is
+            if (primary(aCurrentLang) == primary(LANGUAGE_KOREAN))
+            {
+                return nLen;
+            }
+            // Word is Chinese or Japanese, and must be truncated to a single character
+            return indexUtf16 - nBegin;
+        }
+        // First character was not Asian, consider appearance of any Asian character
+        // to be the end of the word
+        while (indexUtf16 < nBegin + nLen)
+        {
+            nCurrScript = rxBreak->getScriptType(rText, indexUtf16);
+            if (nCurrScript == i18n::ScriptType::ASIAN)
+            {
+                auto aModelBeginPos = pModelToView->ConvertToModelPosition(indexUtf16);
+                auto aCurrentLang = fnGetLangOfChar(aModelBeginPos.mnPos, nCurrScript, false);
+                // tdf#150621 Korean words must be counted as-is.
+                // Note that script changes intentionally do not delimit words for counting.
+                if (primary(aCurrentLang) == primary(LANGUAGE_KOREAN))
+                {
+                    return nLen;
+                }
+                // Word tail contains Chinese or Japanese, and must be truncated
+                return indexUtf16 - nBegin;
+            }
+            rText.iterateCodePoints(&indexUtf16);
+        }
+    }
+    return nLen;
+}
 }
 bool SwScanner::NextWord()
@@ -959,8 +981,11 @@ bool SwScanner::NextWord()
     if( ! m_nLength )
         return false;
-    if ( m_nWordType == i18n::WordType::WORD_COUNT )
-        m_nLength = forceEachAsianCodePointToWord(m_aText, m_nBegin, m_nLength);
+    if (m_nWordType == i18n::WordType::WORD_COUNT)
+    {
+        m_nLength = forceEachCJCodePointToWord(m_aText, m_nBegin, m_nLength, &m_ModelToView,
+                                               m_pGetLangOfChar);
+    }
     m_aPrevWord = m_aWord;
     m_aWord = m_aPreDashReplacementText.copy( m_nBegin, m_nLength );

@@ -202,7 +202,7 @@
                 <object class="GtkLabel" id="cjkcharsft">
                   <property name="can-focus">False</property>
                   <property name="no-show-all">True</property>
-                  <property name="label" translatable="yes" context="wordcount-mobile|cjkcharsft">Asian characters and Korean syllables</property>
+                  <property name="label" translatable="yes" context="wordcount-mobile|cjkcharsft">Asian characters and Korean words</property>
                   <property name="xalign">1</property>
                 </object>
                 <packing>

@@ -274,7 +274,7 @@
                 <object class="GtkLabel" id="cjkcharsft2">
                   <property name="can-focus">False</property>
                   <property name="no-show-all">True</property>
-                  <property name="label" translatable="yes" context="wordcount-mobile|cjkcharsft">Asian characters and Korean syllables</property>
+                  <property name="label" translatable="yes" context="wordcount-mobile|cjkcharsft">Asian characters and Korean words</property>
                   <property name="xalign">1</property>
                 </object>
                 <packing>

@@ -229,7 +229,7 @@
             <object class="GtkLabel" id="cjkcharsft">
               <property name="can-focus">False</property>
               <property name="no-show-all">True</property>
-              <property name="label" translatable="yes" context="wordcount|cjkcharsft">Asian characters and Korean syllables</property>
+              <property name="label" translatable="yes" context="wordcount|cjkcharsft">Asian characters and Korean words</property>
               <property name="xalign">1</property>
             </object>
             <packing>