tdf#150621 Changed Korean word counting to use words

Previously, Writer counted characters for all CJK languages, rather than
words. This is the correct behavior for Chinese and Japanese, which make
extensive use of ideographs. However, it is not correct for Korean.

This change adjusts the Writer word count algorithm to count Korean
words, rather than Korean characters.
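
For context only, not part of the patch: the sketch below, built directly on ICU (the engine that
i18npool wraps), illustrates the intended distinction. The countWords helper, its parameters, and
the main driver are hypothetical illustrations; Writer's real code path goes through SwScanner and
the i18npool break iterator changed in the diffs below.

    // Minimal sketch, assuming plain ICU4C; not the Writer implementation.
    #include <unicode/brkiter.h>
    #include <unicode/ubrk.h>
    #include <unicode/unistr.h>
    #include <iostream>
    #include <memory>

    static int countWords(const icu::UnicodeString& rText, const icu::Locale& rLocale)
    {
        UErrorCode nStatus = U_ZERO_ERROR;
        std::unique_ptr<icu::BreakIterator> pIter(
            icu::BreakIterator::createWordInstance(rLocale, nStatus));
        if (U_FAILURE(nStatus))
            return 0;
        pIter->setText(rText);

        int nWords = 0;
        for (pIter->first(); pIter->next() != icu::BreakIterator::DONE;)
        {
            // Segments whose rule status is UBRK_WORD_NONE are whitespace or
            // punctuation; every other segment (letters, numbers, ideographs)
            // counts as one word.
            if (pIter->getRuleStatus() != UBRK_WORD_NONE)
                ++nWords;
        }
        return nWords;
    }

    int main()
    {
        // "저는 영화를 봤어요" is three space-delimited Korean words; counting
        // each of its eight hangul syllables (the old CJK behavior) overstates
        // the word count.
        std::cout << countWords(icu::UnicodeString::fromUTF8("저는 영화를 봤어요"),
                                icu::Locale("ko", "KR"))
                  << '\n';
    }

The actual fix below does not call ICU directly; it keeps the existing SwScanner flow and simply
skips the per-character truncation when the character's language is Korean
(primary(aCurrentLang) == primary(LANGUAGE_KOREAN)).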

Change-Id: I6e77136867baca1a7b51248886ee5fd7073ad364
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170621
Tested-by: Jenkins
Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
Jonathan Clark 2024-07-16 16:50:10 -06:00
parent c9a96f2724
commit aa938fe03c
7 changed files with 153 additions and 65 deletions

@@ -2904,7 +2904,8 @@ EditSelection ImpEditEngine::TransliterateText( const EditSelection& rSelection,
             }
             i18n::Boundary aCurWordBndry( aSttBndry );
-            while (aCurWordBndry.endPos && aCurWordBndry.startPos <= aEndBndry.startPos)
+            while (aCurWordBndry.startPos != aCurWordBndry.endPos
+                   && aCurWordBndry.startPos <= aEndBndry.startPos)
             {
                 nCurrentStart = aCurWordBndry.startPos;
                 nCurrentEnd = aCurWordBndry.endPos;

@@ -1867,33 +1867,32 @@ void TestBreakIterator::testLegacySurrogatePairs()
 void TestBreakIterator::testWordCount()
 {
-    auto count_words_fn = [&](const OUString& str, const lang::Locale& aLocale) -> int
+    auto fnCountWords = [&](const OUString& aStr, const lang::Locale& aLocale) -> int
     {
-        int num_words = 0;
-        sal_Int32 next_pos = 0;
-        int iter_guard = 0;
+        int nWords = 0;
+        sal_Int32 nNextPos = 0;
+        int nIterGuard = 0;
-        if (m_xBreak->isBeginWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT))
+        if (m_xBreak->isBeginWord(aStr, nNextPos, aLocale, i18n::WordType::WORD_COUNT))
         {
-            ++num_words;
+            ++nWords;
         }
         while (true)
         {
-            CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100);
+            CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++nIterGuard < 100);
-            auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT);
-            if (aBounds.endPos < next_pos || aBounds.startPos == aBounds.endPos)
+            auto aBounds = m_xBreak->nextWord(aStr, nNextPos, aLocale, i18n::WordType::WORD_COUNT);
+            if (aBounds.endPos == aBounds.startPos)
             {
                 break;
             }
-            next_pos = aBounds.endPos;
-            ++num_words;
+            nNextPos = aBounds.endPos;
+            ++nWords;
         }
-        return num_words;
+        return nWords;
     };
     // i#80815: "Word count differs from MS Word"
@@ -1903,29 +1902,29 @@ void TestBreakIterator::testWordCount()
         aLocale.Language = "en";
         aLocale.Country = "US";
-        const OUString str = u""
-            "test data for word count issue #80815\n"
-            "fo\\\'sforos\n"
-            "archipi\\\'elago\n"
-            "do\\^me\n"
-            "f**k\n"
-            "\n"
-            "battery-driven\n"
-            "and/or\n"
-            "apple(s)\n"
-            "money+opportunity\n"
-            "Micro$oft\n"
-            "\n"
-            "300$\n"
-            "I(not you)\n"
-            "a****n\n"
-            "1+3=4\n"
-            "\n"
-            "aaaaaaa.aaaaaaa\n"
-            "aaaaaaa,aaaaaaa\n"
-            "aaaaaaa;aaaaaaa\n"_ustr;
+        const OUString aStr = u""
+            "test data for word count issue #80815\n"
+            "fo\\\'sforos\n"
+            "archipi\\\'elago\n"
+            "do\\^me\n"
+            "f**k\n"
+            "\n"
+            "battery-driven\n"
+            "and/or\n"
+            "apple(s)\n"
+            "money+opportunity\n"
+            "Micro$oft\n"
+            "\n"
+            "300$\n"
+            "I(not you)\n"
+            "a****n\n"
+            "1+3=4\n"
+            "\n"
+            "aaaaaaa.aaaaaaa\n"
+            "aaaaaaa,aaaaaaa\n"
+            "aaaaaaa;aaaaaaa\n"_ustr;
-        CPPUNIT_ASSERT_EQUAL(24, count_words_fn(str, aLocale));
+        CPPUNIT_ASSERT_EQUAL(24, fnCountWords(aStr, aLocale));
     }
     // Test that the switch to upstream ICU for CJ word boundary analysis doesn't change word count.
@@ -1934,9 +1933,32 @@ void TestBreakIterator::testWordCount()
         aLocale.Language = "ja";
         aLocale.Country = "JP";
-        const OUString str = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
+        const OUString aStr = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
-        CPPUNIT_ASSERT_EQUAL(7, count_words_fn(str, aLocale));
+        CPPUNIT_ASSERT_EQUAL(7, fnCountWords(aStr, aLocale));
     }
+    // tdf#150621 Korean words should be counted individually, rather than by syllable.
+    //
+    // Per i#80815, the intention for the word count feature is to emulate the behavior of MS Word.
+    {
+        lang::Locale aLocale;
+        aLocale.Language = "ko";
+        aLocale.Country = "KR";
+        // Basic case: Korean words are counted as space-delimited. In particular, grammatical
+        // particles are treated as part of the previous word.
+        CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"저는 영화를 봤어요"_ustr, aLocale));
+        // Mixed script: Korean is mostly written in hangul, but hanja are still used in certain
+        // situations (e.g. abbreviations in newspaper articles). For Chinese and Japanese, such
+        // ideographs would be counted individually as words. In Korean, however, they are treated
+        // no differently than hangul characters.
+        CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"불렀다...與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"불렀다 ...與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"불렀다 ... 與"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"尹탄핵"_ustr, aLocale));
+        CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"尹 탄핵"_ustr, aLocale));
+    }
 }

@@ -364,7 +364,7 @@ Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int
         Boundary rv;
         rv.startPos = icuBI->mpValue->mpBreakIterator->following(nStartPos);
         if( rv.startPos >= Text.getLength() || rv.startPos == icu::BreakIterator::DONE )
-            rv.endPos = result.startPos;
+            rv.endPos = rv.startPos;
         else {
             if ((rWordType == WordType::ANYWORD_IGNOREWHITESPACES
                 && u_isUWhiteSpace(Text.iterateCodePoints(&rv.startPos, 0)))

@@ -654,7 +654,7 @@ void SwDocTest::testSwScanner()
         pTextNode = aPaM.GetPointNode().GetTextNode();
         pTextNode->CountWords(aDocStat, 0, test.getLength());
         CPPUNIT_ASSERT_EQUAL_MESSAGE("words", static_cast<sal_uLong>(58), aDocStat.nWord);
-        CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean syllables", static_cast<sal_uLong>(43), aDocStat.nAsianWord);
+        CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean words", static_cast<sal_uLong>(43), aDocStat.nAsianWord);
         CPPUNIT_ASSERT_EQUAL_MESSAGE("non-whitespace chars", static_cast<sal_uLong>(105), aDocStat.nCharExcludingSpaces);
         CPPUNIT_ASSERT_EQUAL_MESSAGE("characters", static_cast<sal_uLong>(128), aDocStat.nChar);
     }
@@ -929,6 +929,46 @@ void SwDocTest::testSwScanner()
         CPPUNIT_ASSERT_EQUAL(sal_uLong(17), aDocStat.nChar);
         aDocStat.Reset();
     }
+    // tdf#150621 Korean words should be counted individually, rather than by syllable.
+    //
+    // Per i#80815, the intention for the word count feature is to emulate the behavior of MS Word.
+    {
+        auto fnAssertWords = [&](const OUString& aStr, sal_uLong nWords, sal_uLong nAsianWords)
+        {
+            m_pDoc->getIDocumentContentOperations().AppendTextNode(*aPaM.GetPoint());
+            SvxLanguageItem aCJKLangItem(LANGUAGE_KOREAN, RES_CHRATR_CJK_LANGUAGE);
+            SvxLanguageItem aWestLangItem(LANGUAGE_ENGLISH_US, RES_CHRATR_LANGUAGE);
+            m_pDoc->getIDocumentContentOperations().InsertPoolItem(aPaM, aCJKLangItem);
+            m_pDoc->getIDocumentContentOperations().InsertPoolItem(aPaM, aWestLangItem);
+            m_pDoc->getIDocumentContentOperations().InsertString(aPaM, aStr);
+            SwDocStat aDocStat;
+            pTextNode = aPaM.GetPointNode().GetTextNode();
+            pTextNode->CountWords(aDocStat, 0, aStr.getLength());
+            CPPUNIT_ASSERT_EQUAL_MESSAGE("words", nWords, aDocStat.nWord);
+            CPPUNIT_ASSERT_EQUAL_MESSAGE("Asian characters and Korean words", nAsianWords,
+                                         aDocStat.nAsianWord);
+        };
+        // Basic case: Korean words are counted as space-delimited. In particular, grammatical
+        // particles are treated as part of the previous word.
+        fnAssertWords(u"저는 영화를 봤어요"_ustr, 3, 3);
+        // Mixed script: Korean is mostly written in hangul, but hanja are still used in certain
+        // situations (e.g. abbreviations in newspaper articles). For Chinese and Japanese, such
+        // ideographs would be counted individually as words. In Korean, however, they are treated
+        // no differently than hangul characters.
+        fnAssertWords(u"尹탄핵"_ustr, 1, 1);
+        fnAssertWords(u"尹 탄핵"_ustr, 2, 2);
+        // These mixed-script results are anomalous, but reflect the behavior of MSW.
+        fnAssertWords(u"불렀다...與"_ustr, 1, 1);
+        fnAssertWords(u"불렀다 ...與"_ustr, 2, 1);
+        fnAssertWords(u"불렀다 ... 與"_ustr, 3, 2);
+    }
 }
 void SwDocTest::testMergePortionsDeleteNotSorted()

@@ -793,40 +793,62 @@ SwScanner::SwScanner(std::function<LanguageType(sal_Int32, sal_Int32, bool)> aGe
 namespace
 {
-    //fdo#45271 for Asian words count characters instead of words
-    sal_Int32 forceEachAsianCodePointToWord(const OUString &rText, sal_Int32 nBegin, sal_Int32 nLen)
-    {
-        if (nLen > 1)
-        {
-            const uno::Reference< XBreakIterator > &rxBreak = g_pBreakIt->GetBreakIter();
-            sal_uInt16 nCurrScript = rxBreak->getScriptType( rText, nBegin );
-            sal_Int32 indexUtf16 = nBegin;
-            rText.iterateCodePoints(&indexUtf16);
-            //First character is Asian, consider it a word :-(
-            if (nCurrScript == i18n::ScriptType::ASIAN)
-            {
-                nLen = indexUtf16 - nBegin;
-                return nLen;
-            }
-            //First character was not Asian, consider appearance of any Asian character
-            //to be the end of the word
-            while (indexUtf16 < nBegin + nLen)
-            {
-                nCurrScript = rxBreak->getScriptType( rText, indexUtf16 );
-                if (nCurrScript == i18n::ScriptType::ASIAN)
-                {
-                    nLen = indexUtf16 - nBegin;
-                    return nLen;
-                }
-                rText.iterateCodePoints(&indexUtf16);
-            }
-        }
-        return nLen;
-    }
+// tdf#45271 For Chinese and Japanese, count characters instead of words
+sal_Int32
+forceEachCJCodePointToWord(const OUString& rText, sal_Int32 nBegin, sal_Int32 nLen,
+                           const ModelToViewHelper* pModelToView,
+                           std::function<LanguageType(sal_Int32, sal_Int32, bool)>& fnGetLangOfChar)
+{
+    if (nLen > 1)
+    {
+        const uno::Reference<XBreakIterator>& rxBreak = g_pBreakIt->GetBreakIter();
+        sal_uInt16 nCurrScript = rxBreak->getScriptType(rText, nBegin);
+        sal_Int32 indexUtf16 = nBegin;
+        rText.iterateCodePoints(&indexUtf16);
+        // First character is Asian
+        if (nCurrScript == i18n::ScriptType::ASIAN)
+        {
+            auto aModelBeginPos = pModelToView->ConvertToModelPosition(nBegin);
+            auto aCurrentLang = fnGetLangOfChar(aModelBeginPos.mnPos, nCurrScript, false);
+            // tdf#150621 Korean words must be counted as-is
+            if (primary(aCurrentLang) == primary(LANGUAGE_KOREAN))
+            {
+                return nLen;
+            }
+            // Word is Chinese or Japanese, and must be truncated to a single character
+            return indexUtf16 - nBegin;
+        }
+        // First character was not Asian, consider appearance of any Asian character
+        // to be the end of the word
+        while (indexUtf16 < nBegin + nLen)
+        {
+            nCurrScript = rxBreak->getScriptType(rText, indexUtf16);
+            if (nCurrScript == i18n::ScriptType::ASIAN)
+            {
+                auto aModelBeginPos = pModelToView->ConvertToModelPosition(indexUtf16);
+                auto aCurrentLang = fnGetLangOfChar(aModelBeginPos.mnPos, nCurrScript, false);
+                // tdf#150621 Korean words must be counted as-is.
+                // Note that script changes intentionally do not delimit words for counting.
+                if (primary(aCurrentLang) == primary(LANGUAGE_KOREAN))
+                {
+                    return nLen;
+                }
+                // Word tail contains Chinese or Japanese, and must be truncated
+                return indexUtf16 - nBegin;
+            }
+            rText.iterateCodePoints(&indexUtf16);
+        }
+    }
+    return nLen;
+}
 }
 bool SwScanner::NextWord()
@@ -959,8 +981,11 @@ bool SwScanner::NextWord()
     if( ! m_nLength )
         return false;
-    if ( m_nWordType == i18n::WordType::WORD_COUNT )
-        m_nLength = forceEachAsianCodePointToWord(m_aText, m_nBegin, m_nLength);
+    if (m_nWordType == i18n::WordType::WORD_COUNT)
+    {
+        m_nLength = forceEachCJCodePointToWord(m_aText, m_nBegin, m_nLength, &m_ModelToView,
+                                               m_pGetLangOfChar);
+    }
     m_aPrevWord = m_aWord;
     m_aWord = m_aPreDashReplacementText.copy( m_nBegin, m_nLength );

@@ -202,7 +202,7 @@
                 <object class="GtkLabel" id="cjkcharsft">
                   <property name="can-focus">False</property>
                   <property name="no-show-all">True</property>
-                  <property name="label" translatable="yes" context="wordcount-mobile|cjkcharsft">Asian characters and Korean syllables</property>
+                  <property name="label" translatable="yes" context="wordcount-mobile|cjkcharsft">Asian characters and Korean words</property>
                   <property name="xalign">1</property>
                 </object>
                 <packing>

@@ -274,7 +274,7 @@
                 <object class="GtkLabel" id="cjkcharsft2">
                   <property name="can-focus">False</property>
                   <property name="no-show-all">True</property>
-                  <property name="label" translatable="yes" context="wordcount-mobile|cjkcharsft">Asian characters and Korean syllables</property>
+                  <property name="label" translatable="yes" context="wordcount-mobile|cjkcharsft">Asian characters and Korean words</property>
                   <property name="xalign">1</property>
                 </object>
                 <packing>

@@ -229,7 +229,7 @@
             <object class="GtkLabel" id="cjkcharsft">
               <property name="can-focus">False</property>
               <property name="no-show-all">True</property>
-              <property name="label" translatable="yes" context="wordcount|cjkcharsft">Asian characters and Korean syllables</property>
+              <property name="label" translatable="yes" context="wordcount|cjkcharsft">Asian characters and Korean words</property>
               <property name="xalign">1</property>
             </object>
             <packing>