office-gobmx/i18npool/qa/cppunit/test_breakiterator.cxx
Noel Grandin 5de73f04f3 new loplugin:staticconstexpr
Change-Id: Ida1996dfffa106bf95fd064e8191b8033b4002f3
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/175336
Tested-by: Jenkins
Reviewed-by: Noel Grandin <noel.grandin@collabora.co.uk>
2024-11-04 08:51:00 +01:00

1981 lines
79 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include <com/sun/star/i18n/XBreakIterator.hpp>
#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
#include <com/sun/star/i18n/ScriptType.hpp>
#include <com/sun/star/i18n/WordType.hpp>
#include <o3tl/cppunittraitshelper.hxx>
#include <unotest/bootstrapfixturebase.hxx>
#include <unicode/uvernum.h>
#include <string.h>
#include <stack>
#include <string_view>
using namespace ::com::sun::star;
class TestBreakIterator : public test::BootstrapFixtureBase
{
public:
virtual void setUp() override;
virtual void tearDown() override;
void testLineBreaking();
void testWordBoundaries();
void testSentenceBoundaries();
void testGraphemeIteration();
void testWeak();
void testAsian();
void testThai();
void testLao();
#ifdef TODO
void testNorthernThai();
void testKhmer();
#endif
void testJapanese();
void testChinese();
void testLegacyDictWordPrepostDash_de_DE();
void testLegacyDictWordPrepostDash_nds_DE();
void testLegacyDictWordPrepostDash_nl_NL();
void testLegacyDictWordPrepostDash_sv_SE();
void testHebrewGereshGershaim();
void testLegacySurrogatePairs();
void testWordCount();
CPPUNIT_TEST_SUITE(TestBreakIterator);
CPPUNIT_TEST(testLineBreaking);
CPPUNIT_TEST(testWordBoundaries);
CPPUNIT_TEST(testSentenceBoundaries);
CPPUNIT_TEST(testGraphemeIteration);
CPPUNIT_TEST(testWeak);
CPPUNIT_TEST(testAsian);
CPPUNIT_TEST(testThai);
CPPUNIT_TEST(testLao);
#ifdef TODO
CPPUNIT_TEST(testKhmer);
CPPUNIT_TEST(testNorthernThai);
#endif
CPPUNIT_TEST(testJapanese);
CPPUNIT_TEST(testChinese);
CPPUNIT_TEST(testLegacyDictWordPrepostDash_de_DE);
CPPUNIT_TEST(testLegacyDictWordPrepostDash_nds_DE);
CPPUNIT_TEST(testLegacyDictWordPrepostDash_nl_NL);
CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE);
CPPUNIT_TEST(testHebrewGereshGershaim);
CPPUNIT_TEST(testLegacySurrogatePairs);
CPPUNIT_TEST(testWordCount);
CPPUNIT_TEST_SUITE_END();
private:
uno::Reference<i18n::XBreakIterator> m_xBreak;
void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
};
/**
 * Regression tests for XBreakIterator::getLineBreak().
 *
 * Each case below sets a locale, calls m_xBreak->getLineBreak() with a text and a
 * position inside that text (usually spelled as strlen() of an ASCII prefix so the
 * intended position is readable), and asserts the returned breakIndex. Hyphenation
 * and user options are left default-constructed, and the fourth argument is always 0.
 *
 * NOTE(review): the position argument is presumably the character at which the line
 * overflows, and breakIndex the index where the next line starts - confirm against
 * the XBreakIterator IDL documentation.
 */
void TestBreakIterator::testLineBreaking()
{
    i18n::LineBreakHyphenationOptions aHyphOptions;
    i18n::LineBreakUserOptions aUserOptions;
    lang::Locale aLocale;

    //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
    {
        OUString aTest(u"(some text here)"_ustr);

        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            //Here we want the line break to leave text here) on the next line
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
        }

        {
            //Here we want the line break to leave "here)" on the next line
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
        }
    }

    //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
    {
        static constexpr OUString aWord = u"\u05DE\u05D9\u05DC\u05D9\u05DD"_ustr;
        OUString aTest(aWord + " " + aWord);

        aLocale.Language = "he";
        aLocale.Country = "IL";

        {
            //Here we want the line break to happen at the whitespace
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
        }
    }

    //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            //Here we want the line break to leave /bar/ba clumped together on the next line
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(u"foo /bar/baz"_ustr, strlen("foo /bar/ba"), aLocale, 0,
                aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
        }
    }

    // i#22602: writer breaks word after dot immediately followed by a letter
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            //Here we want the line break to leave ./bar/baz clumped together on the next line
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"foo ./bar/baz"_ustr, strlen("foo ./bar/ba"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first period",
                                         static_cast<sal_Int32>(4), aResult.breakIndex);
        }
    }

    // i#81448: slash and backslash make non-breaking spaces of preceding spaces
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            // Per the bug, the line break should leave ...BE clumped together on the next line.
            // However, the current behavior does not wrap the string at all. This test asserts the
            // current behavior as a point of reference.
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"THIS... ...BE"_ustr, strlen("THIS... ...B"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aResult.breakIndex);
        }
    }

    // i#81448: slash and backslash make non-breaking spaces of preceding spaces
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            // The line break should leave /BE clumped together on the next line.
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"THIS... /BE"_ustr, strlen("THIS... /B"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(8), aResult.breakIndex);
        }
    }

    // i#80548: Bad word wrap between dash and word
    {
        aLocale.Language = "fi";
        aLocale.Country = "FI";

        {
            // Per the bug, the line break should leave -bar clumped together on the next line.
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"foo -bar"_ustr, strlen("foo -ba"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first dash",
                                         static_cast<sal_Int32>(4), aResult.breakIndex);
        }
    }

    // i#80645: Line erroneously breaks at backslash
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            // Note that the current behavior deviates from the original fix for this bug.
            //
            // The original report was filed due to wrapping all of "\Program Files\aaaa" to the
            // next line, even though only "aaaa" overflowed. The original fix was to simply make
            // U+005C reverse solidus (backslash) a breaking character.
            //
            // However, the root cause for this bug was not the behavior of '\', but rather some
            // other bug making all of "\Program Files\" behave like a single token, despite it
            // even containing whitespace.
            //
            // Reverting to the ICU line rules fixes this root issue. Now, in the following,
            // "C:\Program" and "Files\LibreOffice" are treated as separate tokens. This is also
            // consistent with the behavior of other office programs.
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"C:\\Program Files\\LibreOffice"_ustr, strlen("C:\\Program Files\\Libre"), aLocale, 0,
                aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);

            // An identical result should be generated for solidus.
            aResult = m_xBreak->getLineBreak(
                u"C:/Program Files/LibreOffice"_ustr, strlen("C:/Program Files/Libre"), aLocale, 0,
                aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(11), aResult.breakIndex);
        }
    }

    // i#80841: Words separated by hyphens will always break to next line
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            // Here we want the line break to leave toll- on the first line
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"toll-free"_ustr, strlen("toll-fr"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
        }
    }

    // i#83464: Line break between letter and $
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            // Here we want the line break to leave US$ clumped on the next line.
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"word US$ 123"_ustr, strlen("word U"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
        }
    }

    // Unknown bug number: "fix line break problem of dot after letter and before number"
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            // Here we want the line break to leave L.5 clumped on the next line.
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"word L.5 word"_ustr, strlen("word L"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(5), aResult.breakIndex);
        }
    }

    // i#83229: Wrong line break when word contains a hyphen
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            // The root cause for this bug was the Unicode standard introducing special treatment
            // for '-' in a number range context. This change makes number ranges (e.g. "100-199")
            // behave as if they are single tokens for the purposes of line breaking. Unfortunately,
            // this caused a significant appearance change to existing documents.
            //
            // Despite being a user-visible layout change, this isn't exactly a bug. Wrapping
            // number ranges as a single token is consistent with other applications, including web
            // browsers, and other office suites as mentioned in the bug discussion. Removing this
            // customization seems like it would be a major change, however.
            //
            // Here we want the line break to leave 100- clumped on the first line.
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"word 100-199 word"_ustr, strlen("word 100-1"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(9), aResult.breakIndex);
        }

        {
            // From the same bug: "the leading minus must stay with numbers and strings"
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"range of -100.000 to 100.000"_ustr, strlen("range of -1"), aLocale, 0,
                aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);

            // Same text with U+2212 MINUS SIGN instead of the ASCII hyphen-minus.
            static constexpr OUString str = u"range of \u2212100.000 to 100.000"_ustr;
            aResult = m_xBreak->getLineBreak(
                str, strlen("range of -"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32{9}, aResult.breakIndex);
        }

        // The remaining cases of this bug run under the de-DE locale.
        aLocale.Language = "de";
        aLocale.Country = "DE";

        {
            // From the same bug: "the leading minus must stay with numbers and strings"
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"EURO is -10,50"_ustr, strlen("EURO is -1"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);

            // Also the mathematical minus sign:
            static constexpr OUString str = u"EURO is \u221210,50"_ustr;
            aResult = m_xBreak->getLineBreak(
                str, strlen("EURO is -"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32{8}, aResult.breakIndex);
        }

        {
            // From the same bug: "the leading minus must stay with numbers and strings"
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                u"und -kosten"_ustr, strlen("und -ko"), aLocale, 0,
                aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32{4}, aResult.breakIndex);

            // But not the non-breaking hyphen:
            // NOTE(review): the position passed here (7) is past the end of this
            // 5-character string; kept unchanged, confirm that this is intended.
            static constexpr OUString str = u"und \u2011"_ustr;
            aResult = m_xBreak->getLineBreak(
                str, strlen("und -ko"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32{5}, aResult.breakIndex);
        }
    }

    // i#83649: "Line break should be between typographical quote and left bracket"
    // - Actually: Spaces between quotation mark and opening punctuation not treated as a break.
    // - Note that per the Unicode standard, prohibiting breaks in this context is intentional
    //   because it may cause issues in certain languages due to the various ways quotation
    //   characters are used.
    // - We do it anyway by customizing the ICU line breaking rules.
    {
        {
            // This uses the sample text provided in the bug report. Based on usage, it is assumed
            // they were in the de_DE locale.
            aLocale.Language = "de";
            aLocale.Country = "DE";

            // Per the bug report, it is expected that »angetan werden« remains on the first line.
            const OUString str = u"»angetan werden« [Passiv]"_ustr;
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);

            // The same result should be returned for this and the first case.
            const OUString str2 = u"»angetan werden« Passiv"_ustr;
            aResult = m_xBreak->getLineBreak(
                str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);

            // Under ICU rules, no amount of spaces would cause this to wrap.
            //
            // This variant needs a run of FOUR spaces between « and [, which puts '[' at
            // index 20 (16 characters for »angetan werden« plus 4 spaces). With a single
            // space the literal would be identical to str above and could not yield a
            // different break index. The spaces are written as \u0020 escapes so that the
            // run cannot be collapsed by whitespace normalisation.
            const OUString str3 = u"»angetan werden«\u0020\u0020\u0020\u0020[Passiv]"_ustr;
            aResult = m_xBreak->getLineBreak(
                str3, str3.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(20), aResult.breakIndex);

            // However, tabs will
            const OUString str4 = u"»angetan werden«\t[Passiv]"_ustr;
            aResult = m_xBreak->getLineBreak(
                str4, str4.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
        }

        {
            // The same behavior is seen in English
            aLocale.Language = "en";
            aLocale.Country = "US";

            const OUString str = u"\"angetan werden\" [Passiv]"_ustr;
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                str, str.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);

            const OUString str2 = u"\"angetan werden\" Passiv"_ustr;
            aResult = m_xBreak->getLineBreak(
                str2, str2.getLength() - 4, aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(17), aResult.breakIndex);
        }
    }

    // i#72868: Writer/Impress line does not break after Chinese punctuation and Latin letters
    {
        aLocale.Language = "zh";
        aLocale.Country = "HK";

        {
            // Per the bug, this should break at the ideographic comma. However, this change has
            // been reverted at some point. This test only verifies current behavior.
            const OUString str = u"word word、word word"_ustr;
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(
                str, strlen("word wordXwor"), aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(13), aResult.breakIndex);
        }
    }

    // i#80891: Character in the forbidden list sometimes appears at the start of line
    {
        aLocale.Language = "zh";
        aLocale.Country = "HK";

        {
            // Per the bug, the ideographic two-dot leader should be a forbidden character. However,
            // this change seems to have been reverted or broken at some point.
            const OUString str = u"電話︰電話"_ustr;
            i18n::LineBreakResults aResult
                = m_xBreak->getLineBreak(str, 2, aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(2), aResult.breakIndex);
        }
    }

    //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
    {
        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            OUString aTest(u"aaa]aaa"_ustr);
            //Here we want the line break to move the whole lot to the next line
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
                aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
        }
    }

    //this is an example sequence from tdf92993-1.docx caught by the load crashtesting
    {
        static constexpr OUStringLiteral aTest = u"\U0001f356\U0001f357\U0001f346"
                                                  "\U0001f364\u2668\ufe0f\U0001f3c6";

        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            //This must not assert/crash
            (void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
        }
    }

    //See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
    {
        static constexpr OUString aTest = u"\uc560\uad6D\uac00\uc758 \uac00"
                                          "\uc0ac\ub294"_ustr;

        aLocale.Language = "ko";
        aLocale.Country = "KR";

        {
            i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
                aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
        }
    }

    // i#65267: Comma is badly broken at end of line
    // - The word should be wrapped along with the comma
    {
        aLocale.Language = "de";
        aLocale.Country = "DE";

        {
            auto res = m_xBreak->getLineBreak(u"Wort -prinzessinnen, wort"_ustr,
                                              strlen("Wort -prinzessinnen,"), aLocale, 0,
                                              aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32{ 5 }, res.breakIndex);
        }
    }

    // tdf#114160: ZWJ shouldn't be treated as a breaking character
    {
        aLocale.Language = "mn";
        aLocale.Country = "MN";

        {
            auto res = m_xBreak->getLineBreak(u"\u1828\u1820\u200d\u00a0\u200d\u1873\u1873"_ustr, 6,
                                              aLocale, 0, aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res.breakIndex);
        }

        aLocale.Language = "en";
        aLocale.Country = "US";

        {
            auto res = m_xBreak->getLineBreak(u"AB\u200d\u00a0\u200dCD"_ustr, 6, aLocale, 0,
                                              aHyphOptions, aUserOptions);
            CPPUNIT_ASSERT_EQUAL(sal_Int32(0), res.breakIndex);
        }
    }
}
//See https://bugs.libreoffice.org/show_bug.cgi?id=49629
void TestBreakIterator::testWordBoundaries()
{
lang::Locale aLocale;
aLocale.Language = "en";
aLocale.Country = "US";
i18n::Boundary aBounds;
//See https://bz.apache.org/ooo/show_bug.cgi?id=11993
{
OUString aTest(u"abcd ef ghi??? KLM"_ustr);
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
//next word
aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
//previous word
aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
}
//See https://bz.apache.org/ooo/show_bug.cgi?id=21907
{
OUString aTest(u"b a?"_ustr);
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
}
//See https://bz.apache.org/ooo/show_bug.cgi?id=14904
{
static constexpr OUString aTest =
u"Working \u201CWords"
" starting wit"
"h quotes\u201D Work"
"ing \u2018Broken\u2019 "
"?Spanish? doe"
"sn\u2019t work. No"
"t even \u00BFreal? "
"Spanish"_ustr;
aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(44), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(46), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(52), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(55), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(62), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(64), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(71), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(88), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(92), aBounds.endPos);
}
//See https://bugs.libreoffice.org/show_bug.cgi?id=49629
sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
{
//make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
for (auto const& i: aBreakTests)
{
OUString aTest = "Word" + OUStringChar(i) + "Word";
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
switch (mode)
{
case i18n::WordType::ANY_WORD:
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
break;
case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
break;
case i18n::WordType::DICTIONARY_WORD:
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
break;
case i18n::WordType::WORD_COUNT:
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
break;
}
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
}
}
sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
{
//make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
for (auto const& p: aJoinTests)
{
OUString aTest = "Word" + OUStringChar(p) + "Word";
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
switch (mode)
{
case i18n::WordType::ANY_WORD:
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
break;
case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
break;
case i18n::WordType::DICTIONARY_WORD:
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
break;
case i18n::WordType::WORD_COUNT:
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
break;
}
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
}
}
//See https://bz.apache.org/ooo/show_bug.cgi?id=13494
{
static constexpr OUString aBase(u"xxAAxxBBxxCCxx"_ustr);
const sal_Unicode aTests[] =
{
'\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
'(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
'\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
};
const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
for (auto const& r: aTests)
{
OUString aTest = aBase.replace('x', r);
sal_Int32 nPos = -1;
size_t i = 0;
do
{
CPPUNIT_ASSERT(i < std::size(aDoublePositions));
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
++i;
}
while (nPos < aTest.getLength());
nPos = aTest.getLength();
i = std::size(aDoublePositions)-1;
do
{
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
--i;
CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
}
while (nPos > 0);
}
const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
for (size_t j = 1; j < std::size(aTests); ++j)
{
OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j]));
sal_Int32 nPos = -1;
size_t i = 0;
do
{
CPPUNIT_ASSERT(i < std::size(aSinglePositions));
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
++i;
}
while (nPos < aTest.getLength());
nPos = aTest.getLength();
i = std::size(aSinglePositions)-1;
do
{
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
--i;
CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
}
while (nPos > 0);
}
const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
{
OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0]));
sal_Int32 nPos = -1;
size_t i = 0;
do
{
CPPUNIT_ASSERT(i < std::size(aSingleQuotePositions));
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
++i;
}
while (nPos < aTest.getLength());
nPos = aTest.getLength();
i = std::size(aSingleQuotePositions)-1;
do
{
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
--i;
CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
}
while (nPos > 0);
}
}
//See https://bz.apache.org/ooo/show_bug.cgi?id=13451
{
aLocale.Language = "ca";
aLocale.Country = "ES";
OUString aTest(u"mirar-se comprar-vos donem-nos les mans aneu-vos-en!"_ustr);
sal_Int32 nPos = 0;
sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
size_t i = 0;
do
{
CPPUNIT_ASSERT(i < std::size(aExpected));
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
i18n::WordType::DICTIONARY_WORD, true).endPos;
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
++i;
}
while (nPos++ < aTest.getLength());
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
}
// i#85411: ZWSP should be a word separator for spellchecking
// - This fix was applied to both dict and edit customizations
for (int j = 0; j < 3; ++j)
{
switch (j)
{
case 0:
aLocale.Language = "en";
aLocale.Country = "US";
break;
case 1:
aLocale.Language = "ca";
aLocale.Country = "ES";
break;
case 2:
aLocale.Language = "fi";
aLocale.Country = "FI";
break;
default:
CPPUNIT_ASSERT(false);
break;
}
static constexpr OUString aTest = u"I\u200Bwant\u200Bto\u200Bgo"_ustr;
sal_Int32 nPos = 0;
sal_Int32 aExpected[] = { 1, 6, 9, 12 };
size_t i = 0;
do
{
CPPUNIT_ASSERT(i < std::size(aExpected));
auto dwPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(aExpected[i], dwPos.endPos);
auto ewPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
i18n::WordType::ANYWORD_IGNOREWHITESPACES, true);
CPPUNIT_ASSERT_EQUAL(aExpected[i], ewPos.endPos);
nPos = dwPos.endPos;
++i;
} while (nPos++ < aTest.getLength());
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
}
//https://bz.apache.org/ooo/show_bug.cgi?id=21290
for (int j = 0; j < 2; ++j)
{
switch (j)
{
case 0:
aLocale.Language = "en";
aLocale.Country = "US";
break;
case 1:
aLocale.Language = "grc";
aLocale.Country.clear();
break;
default:
CPPUNIT_ASSERT(false);
break;
}
static constexpr OUString aTest =
u"\u1F0C\u03BD\u03B4\u03C1\u03B1 \u1F00"
"\u03C1\u03BD\u1F7B\u03BC\u03B5\u03BD\u03BF"
"\u03C2 \u1F00\u03BB\u03BB \u1F24"
"\u03C3\u03B8\u03B9\u03BF\u03BD"_ustr;
sal_Int32 nPos = 0;
sal_Int32 aExpected[] = {5, 15, 19, 26};
size_t i = 0;
do
{
CPPUNIT_ASSERT(i < std::size(aExpected));
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
i18n::WordType::DICTIONARY_WORD, true).endPos;
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
++i;
}
while (nPos++ < aTest.getLength());
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
}
//See https://bz.apache.org/ooo/show_bug.cgi?id=58513
//See https://bugs.libreoffice.org/show_bug.cgi?id=55707
{
aLocale.Language = "fi";
aLocale.Country = "FI";
OUString aTest(u"Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n"_ustr);
{
sal_Int32 nPos = 0;
sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
size_t i = 0;
do
{
CPPUNIT_ASSERT(i < std::size(aExpected));
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
i18n::WordType::WORD_COUNT, true).endPos;
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
++i;
}
while (nPos++ < aTest.getLength());
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
}
{
sal_Int32 nPos = 0;
sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
40, 41, 42, 43, 45, 46, 47, 50, 51};
size_t i = 0;
do
{
CPPUNIT_ASSERT(i < std::size(aExpected));
aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
++i;
CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
++i;
nPos = aBounds.endPos;
}
while (nPos++ < aTest.getLength());
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
}
}
//See https://bz.apache.org/ooo/show_bug.cgi?id=107843
{
aLocale.Language = "en";
aLocale.Country = "US";
static constexpr OUString aTest =
u"ru\uFB00le \uFB01sh"_ustr;
aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
}
//See https://bz.apache.org/ooo/show_bug.cgi?id=113785
{
aLocale.Language = "en";
aLocale.Country = "US";
static constexpr OUString aTest =
u"a\u2013b\u2014c"_ustr;
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
}
// i#55778: Words containing numbers get broken up
{
aLocale.Language = "en";
aLocale.Country = "US";
static constexpr OUString aTest = u"first i18n third"_ustr;
aBounds
= m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.endPos);
}
// i#56347: "BreakIterator patch for Hungarian"
// i#56348: Special chars in first pos not handled by spell checking in Writer (Hungarian)
// Rules for Hungarian affixes after numbers and certain symbols
{
aLocale.Language = "hu";
aLocale.Country = "HU";
OUString aTest = u"szavak 15 15-tel 15%-kal €-val szavak"_ustr;
for (auto mode :
{ i18n::WordType::DICTIONARY_WORD, i18n::WordType::ANYWORD_IGNOREWHITESPACES })
{
aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale, mode, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, mode, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 11, aLocale, mode, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(10), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 18, aLocale, mode, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(17), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(24), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 25, aLocale, mode, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 27, aLocale, mode, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(25), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(30), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 34, aLocale, mode, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
}
}
// tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based analysis
{
aLocale.Language = "ja";
aLocale.Country = "JP";
static constexpr OUString aTest = u"通産省工業技術院北海道工業開発試験所"_ustr;
aBounds
= m_xBreak->getWordBoundary(aTest, 9, aLocale, i18n::WordType::DICTIONARY_WORD, false);
// When using the old LO custom dictionaries, this will select the entire phrase.
// When using ICU, it will select only 北海道.
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
}
// tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
{
aLocale.Language = "en";
aLocale.Country = "US";
OUString aTest(u"Lespace fine insécable\u202F!"_ustr);
aBounds
= m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
// This was 24 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
}
// tdf#161737: narrow no-break space between digits resulted spelling mistakes
// as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking
// TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow
// to check numbers with thousand separators and with correct suffix
{
aLocale.Language = "en";
aLocale.Country = "US";
OUString aTest(u"1\u202F000\u202F000"_ustr);
aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
// This was 0 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
// This was 8 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
}
// tdf#161737: narrow no-break space at the end of words resulted spelling mistakes
{
aLocale.Language = "hu";
aLocale.Country = "HU";
OUString aTest(u"Lespace fine insécable\u202F!"_ustr);
aBounds
= m_xBreak->getWordBoundary(aTest, 14, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.startPos);
// This was 24 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(23), aBounds.endPos);
}
// tdf#161737: narrow no-break space between digits resulted spelling mistakes
// as a quick fix, limit NBSP as word-part character only for editing, and not for spell checking
// TODO: remove NBSP by the linguistic module or by the spell checking dictionaries to allow
// to check numbers with thousand separators and with correct suffix
{
aLocale.Language = "hu";
aLocale.Country = "HU";
OUString aTest(u"1\u202F000\u202F000"_ustr);
aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
// This was 0 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
// This was 8 (word + NNBSP)
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
}
}
// Characteristic tests for XBreakIterator::beginOfSentence() and endOfSentence().
//
// Each case probes a single offset inside a fixed text and asserts the UTF-16
// offsets reported for the beginning and the end of the enclosing sentence.
void TestBreakIterator::testSentenceBoundaries()
{
    lang::Locale aLocale;
    aLocale.Language = "en";
    aLocale.Country = "US";

    // Trivial characteristic test for sentence boundary detection
    {
        OUString aTest(u"This is a sentence. This is a different sentence."_ustr);

        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 5, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 5, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 31, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 31, aLocale));
    }

    // i#24098: i18n API beginOfSentence/endOfSentence
    // fix beginOfSentence, ... when cursor is on the beginning of the sentence
    {
        OUString aTest(u"This is a sentence. This is a different sentence."_ustr);

        CPPUNIT_ASSERT_EQUAL(sal_Int32(20), m_xBreak->beginOfSentence(aTest, 20, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(49), m_xBreak->endOfSentence(aTest, 20, aLocale));
    }

    // i#24098: i18n API beginOfSentence/endOfSentence
    // "skip preceding space for beginOfSentence"
    //
    // The two sentences are separated by FIVE spaces (offsets 19-23): offset 20
    // lies inside that gap, the second sentence starts at 24 and the text is
    // 53 UTF-16 units long.
    {
        OUString aTest(u"This is a sentence.     This is a different sentence."_ustr);

        // Guard against the run of spaces being collapsed by an editor or tool,
        // which would silently invalidate every offset asserted below.
        CPPUNIT_ASSERT_EQUAL(sal_Int32(53), aTest.getLength());

        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 20, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(19), m_xBreak->endOfSentence(aTest, 20, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(24), m_xBreak->beginOfSentence(aTest, 26, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(53), m_xBreak->endOfSentence(aTest, 26, aLocale));
    }

    // i#55063: Sentence selection in Thai should select a space-delimited phrase.
    // - This customization broke at some point. It works in an English locale in a synthetic test
    //   like this one, but does not work in the Thai locale, nor on Thai text in practice.
    {
        static constexpr OUString aTest = u"ว้อย โหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอก โปรโมเตอร์"_ustr;

        aLocale.Language = "en";
        aLocale.Country = "US";
        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));

        aLocale.Language = "th";
        aLocale.Country = "TH";
        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(46), m_xBreak->endOfSentence(aTest, 23, aLocale));
    }

    // i#55063: Thai phrases should delimit English sentence selection.
    // - This customization broke at some point. It works in an English locale in a synthetic test
    //   like this one, but does not work in the Thai locale, nor on Thai text in practice.
    {
        static constexpr OUString aTest = u"ว้อย English usually ends with a period โปรโมเตอร์."_ustr;

        aLocale.Language = "en";
        aLocale.Country = "US";
        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));

        aLocale.Language = "th";
        aLocale.Country = "TH";
        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(51), m_xBreak->endOfSentence(aTest, 23, aLocale));
    }

    // i#55063: Characteristic test for English text delimiting Thai phrases (sentences)
    // - English text should not delimit Thai phrases.
    {
        static constexpr OUString aTest = u"Englishโหลยโท่ยคอร์รัปชันโอเพ่นฮอตดอกEnglish"_ustr;

        aLocale.Language = "en";
        aLocale.Country = "US";
        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));

        aLocale.Language = "th";
        aLocale.Country = "TH";
        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), m_xBreak->beginOfSentence(aTest, 23, aLocale));
        CPPUNIT_ASSERT_EQUAL(sal_Int32(44), m_xBreak->endOfSentence(aTest, 23, aLocale));
    }
}
//See https://bugs.libreoffice.org/show_bug.cgi?id=40292
//See https://bz.apache.org/ooo/show_bug.cgi?id=80412
//See https://bz.apache.org/ooo/show_bug.cgi?id=111152
//See https://bz.apache.org/ooo/show_bug.cgi?id=50172
void TestBreakIterator::testGraphemeIteration()
{
lang::Locale aLocale;
aLocale.Language = "bn";
aLocale.Country = "IN";
{
static constexpr OUString aTest = u"\u09AC\u09CD\u09AF"_ustr; // BA HALANT LA
sal_Int32 nDone=0;
sal_Int32 nPos;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
}
{
static constexpr OUString aTest = u"\u09B9\u09CD\u09A3\u09BF"_ustr;
// HA HALANT NA VOWELSIGNI
sal_Int32 nDone=0;
sal_Int32 nPos;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
}
{
static constexpr OUString aTest = u"\u09A4\u09CD\u09AE\u09CD\u09AF"_ustr;
// TA HALANT MA HALANT YA
sal_Int32 nDone=0;
sal_Int32 nPos;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
}
aLocale.Language = "ta";
aLocale.Country = "IN";
{
static constexpr OUString aTest = u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"_ustr; // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
sal_Int32 nDone=0;
sal_Int32 nPos = 0;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
}
{
static constexpr OUString aTest = u"\u0B95\u0BC1"_ustr; // KA VOWELSIGNU
sal_Int32 nDone=0;
sal_Int32 nPos = 0;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
}
{
static constexpr OUString aTest =
u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"_ustr;
// CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
sal_Int32 nDone=0;
sal_Int32 nPos=0;
for (sal_Int32 i = 0; i < 4; ++i)
{
sal_Int32 nOldPos = nPos;
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
}
for (sal_Int32 i = 0; i < 4; ++i)
{
sal_Int32 nOldPos = nPos;
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
}
}
{
static constexpr OUString aText = u"\u05D0\u05B8"_ustr; // ALEF QAMATS
sal_Int32 nGraphemeCount = 0;
sal_Int32 nCurPos = 0;
while (nCurPos < aText.getLength())
{
sal_Int32 nCount2 = 1;
nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
++nGraphemeCount;
}
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
}
aLocale.Language = "hi";
aLocale.Country = "IN";
{
static constexpr OUString aTest = u"\u0936\u0940"_ustr; // SHA VOWELSIGNII
sal_Int32 nDone=0;
sal_Int32 nPos = 0;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
}
// tdf#49885: Replace custom Thai implementation with ICU
{
aLocale.Language = "th";
aLocale.Country = "TH";
static constexpr OUString aTest = u"กำ"_ustr;
CPPUNIT_ASSERT_EQUAL(sal_Int32{ 2 }, aTest.getLength());
sal_Int32 nDone = 0;
sal_Int32 nPos = 0;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1,
nDone);
CPPUNIT_ASSERT_EQUAL(aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, nPos);
}
// Korean may also use grapheme clusters for character composition
{
aLocale.Language = "ko";
aLocale.Country = "KR";
static constexpr OUString aTest = u"각"_ustr;
CPPUNIT_ASSERT_EQUAL(sal_Int32{ 3 }, aTest.getLength());
sal_Int32 nDone = 0;
sal_Int32 nPos = 0;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1,
nDone);
CPPUNIT_ASSERT_EQUAL(aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL(sal_Int32{ 0 }, nPos);
}
}
//A test to ensure that certain ranges and codepoints that are categorized as
//weak remain as weak, so that existing docs that depend on this don't silently
//change font for those weak chars
//
//Each listed pair gives the first and last code point of a range; every unit
//of the string is passed to getScriptType() and must come back as WEAK.
void TestBreakIterator::testWeak()
{
    // getScriptType() takes no locale, so none is set up here.
    static constexpr OUString aWeaks =
        u"\u0001\u0002"
        " \u00A0"
        "\u0300\u036F"  //Combining Diacritical Marks
        "\u1AB0\u1AFF"  //Combining Diacritical Marks Extended
        "\u1DC0\u1DFF"  //Combining Diacritical Marks Supplement
        "\u20D0\u20FF"  //Combining Diacritical Marks for Symbols
        "\u2150\u215F"  //Number Forms, fractions
        "\u2160\u2180"  //Number Forms, roman numerals
        "\u2200\u22FF"  //Mathematical Operators
        "\u27C0\u27EF"  //Miscellaneous Mathematical Symbols-A
        "\u2980\u29FF"  //Miscellaneous Mathematical Symbols-B
        "\u2A00\u2AFF"  //Supplemental Mathematical Operators
        "\u2100\u214F"  //Letterlike Symbols
        "\u2308\u230B"  //Miscellaneous technical
        "\u25A0\u25FF"  //Geometric Shapes
        "\u2B30\u2B4C"_ustr; //Miscellaneous Symbols and Arrows

    for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
    {
        sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
        // Name the offending code unit (in hex) in the failure message.
        OString aMsg =
            "Char 0x" +
            OString::number(static_cast<sal_Int32>(std::u16string_view(aWeaks)[i]), 16) +
            " should have been weak";
        CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
            i18n::ScriptType::WEAK, nScript);
    }
}
//A test to ensure that certain ranges and codepoints that are categorized as
//asian remain as asian, so that existing docs that depend on this don't silently
//change font for those asian chars.
//See https://bugs.libreoffice.org/show_bug.cgi?id=38095
//
//Every unit of the sample string is passed to getScriptType() and must come
//back as ASIAN.
void TestBreakIterator::testAsian()
{
    // getScriptType() takes no locale, so none is set up here.
    static constexpr OUString aAsians =
        //some typical CJK chars
        u"\u4E00\u62FF"
        //The full HalfWidth and FullWidth block has historically been
        //designated as taking the CJK font :-(
        //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
        //UAX24 as "Common" i.e. by that logic WEAK
        "\uFF10\uFF19"
        //HalfWidth and FullWidth forms of ASCII A-z, categorized under
        //UAX24 as "Latin", i.e. by that logic LATIN
        "\uFF21\uFF5A"_ustr;

    for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
    {
        sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
        // Name the offending code unit (in hex) in the failure message.
        OString aMsg =
            "Char 0x" +
            OString::number(static_cast<sal_Int32>(std::u16string_view(aAsians)[i]), 16) +
            " should have been asian";
        CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
            i18n::ScriptType::ASIAN, nScript);
    }
}
//A test to ensure that our Lao word boundary detection is useful
void TestBreakIterator::testLao()
{
lang::Locale aLocale;
aLocale.Language = "lo";
aLocale.Country = "LA";
static constexpr OUString aTest = u"\u0e8d\u0eb4\u0e99\u0e94\u0eb5\u0e95\u0ec9\u0ead\u0e99\u0eae\u0eb1\u0e9a"_ustr;
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
#if (U_ICU_VERSION_MAJOR_NUM < 70)
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
#else
// FIXME:
// In ICU 70/71 for yet unknown reason the word boundary 9 is not detected and
// instead the length 12 is returned as endpos.
// Deep in
// icu_70::RuleBasedBreakIterator::BreakCache::next()
// icu_70::RuleBasedBreakIterator::BreakCache::following()
// icu_70::RuleBasedBreakIterator::following()
// i18npool::BreakIterator_Unicode::getWordBoundary()
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
#endif
}
//A test to ensure that our thai word boundary detection is useful
void TestBreakIterator::testThai()
{
lang::Locale aLocale;
aLocale.Language = "th";
aLocale.Country = "TH";
//See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
{
static constexpr OUString aTest = u"\u0E01\u0E38\u0E2B\u0E25\u0E32\u0E1A"_ustr;
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
aTest.getLength(), aBounds.endPos);
}
//See https://bz.apache.org/ooo/show_bug.cgi?id=29548
//make sure forwards and back are consistent
{
static constexpr OUString aTest =
u"\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
"\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
"\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"
"\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
"\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
"\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"_ustr;
std::stack<sal_Int32> aPositions;
sal_Int32 nPos = -1;
do
{
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
aPositions.push(nPos);
}
while (nPos < aTest.getLength());
nPos = aTest.getLength();
CPPUNIT_ASSERT(!aPositions.empty());
aPositions.pop();
do
{
CPPUNIT_ASSERT(!aPositions.empty());
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
aPositions.pop();
}
while (nPos > 0);
}
// tdf#113694
{
static constexpr OUString aTest = u"\U00010000"_ustr;
sal_Int32 nDone=0;
sal_Int32 nPos;
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest.getLength(), nPos);
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
}
}
#ifdef TODO
void TestBreakIterator::testNorthernThai()
{
lang::Locale aLocale;
aLocale.Language = "nod";
aLocale.Country = "TH";
const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_MESSAGE("Should skip full word",
aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
}
// Not sure if any version earlier than 49 did have Khmer word boundary
// dictionaries, 4.6 does not.
// As of icu 54, word boundary detection for Khmer is still considered
// insufficient, so icu khmer stuff is disabled
//A test to ensure that our khmer word boundary detection is useful
//https://bugs.libreoffice.org/show_bug.cgi?id=52020
void TestBreakIterator::testKhmer()
{
lang::Locale aLocale;
aLocale.Language = "km";
aLocale.Country = "KH";
const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
}
#endif
// Japanese (ja-JP) dictionary-word checks, run against the break iterator that
// is handed in so the same expectations can be applied to several instances.
void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
{
    lang::Locale aJapanese;
    aJapanese.Language = "ja";
    aJapanese.Country = "JP";

    // Katakana sample: offset 5 belongs to the word spanning [4,7).
    {
        static constexpr OUString aKatakana = u"シャットダウン"_ustr;

        const i18n::Boundary aWord = xBreak->getWordBoundary(
            aKatakana, 5, aJapanese, i18n::WordType::DICTIONARY_WORD, true);
        CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aWord.startPos);
        CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aWord.endPos);
    }

    // The same three-character sequence twice: each occurrence is one word.
    {
        static constexpr OUString aRepeated = u"\u9EBB\u306E\u8449\u9EBB\u306E\u8449"_ustr;

        const i18n::Boundary aFirstWord = xBreak->getWordBoundary(
            aRepeated, 1, aJapanese, i18n::WordType::DICTIONARY_WORD, true);
        CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aFirstWord.startPos);
        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aFirstWord.endPos);

        const i18n::Boundary aSecondWord = xBreak->getWordBoundary(
            aRepeated, 5, aJapanese, i18n::WordType::DICTIONARY_WORD, true);
        CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aSecondWord.startPos);
        CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aSecondWord.endPos);
    }
}
void TestBreakIterator::testJapanese()
{
doTestJapanese(m_xBreak);
// fdo#78479 - test second / cached instantiation of xdictionary
uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
u"com.sun.star.i18n.BreakIterator"_ustr), uno::UNO_QUERY_THROW);
doTestJapanese(xTmpBreak);
}
void TestBreakIterator::testChinese()
{
lang::Locale aLocale;
aLocale.Language = "zh";
aLocale.Country = "CN";
{
static constexpr OUStringLiteral aTest = u"\u6A35\u6A30\u69FE\u8919\U00029EDB";
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
}
}
void TestBreakIterator::testLegacyDictWordPrepostDash_de_DE()
{
lang::Locale aLocale;
aLocale.Language = "de";
aLocale.Country = "DE";
{
auto aTest = u"Arbeits- -nehmer"_ustr;
i18n::Boundary aBounds
= m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
aBounds
= m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
}
}
void TestBreakIterator::testLegacyDictWordPrepostDash_nds_DE()
{
lang::Locale aLocale;
aLocale.Language = "nds";
aLocale.Country = "DE";
{
auto aTest = u"Arbeits- -nehmer"_ustr;
i18n::Boundary aBounds
= m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
aBounds
= m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
}
}
void TestBreakIterator::testLegacyDictWordPrepostDash_nl_NL()
{
lang::Locale aLocale;
aLocale.Language = "nl";
aLocale.Country = "NL";
{
auto aTest = u"Arbeits- -nehmer"_ustr;
i18n::Boundary aBounds
= m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
aBounds
= m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
}
}
void TestBreakIterator::testLegacyDictWordPrepostDash_sv_SE()
{
lang::Locale aLocale;
aLocale.Language = "sv";
aLocale.Country = "SE";
{
auto aTest = u"Arbeits- -nehmer"_ustr;
i18n::Boundary aBounds
= m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.endPos);
aBounds
= m_xBreak->getWordBoundary(aTest, 13, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.endPos);
}
}
void TestBreakIterator::testHebrewGereshGershaim()
{
// In Hebrew documents, there are multiple valid ways to represent the geresh and gershaim
// intra-word punctuation marks. This test exhaustively exercises them.
//
// See the following bugs:
// i#51661: Add quotation mark as middle letter for Hebrew
// tdf#46950: Spell-checking breaks Hebrew words at intra-word single and double quotes
lang::Locale aLocale;
aLocale.Language = "he";
aLocale.Country = "IL";
// Unicode U+05F3 HEBREW PUNCTUATION GERESH
{
auto aTest = u"ג׳ירפה"_ustr;
auto aBounds
= m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
}
// Apostrophe as geresh
{
auto aTest = u"ג'ירפה"_ustr;
auto aBounds
= m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
}
// Right single quote as geresh
{
auto aTest = u"ג’ירפה"_ustr;
auto aBounds
= m_xBreak->getWordBoundary(aTest, 3, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 3, aLocale,
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
}
// Unicode U+05F4 HEBREW PUNCTUATION GERSHAYIM
{
auto aTest = u"דו״ח"_ustr;
auto aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
}
// Double quote as gershayim
{
auto aTest = u"דו\"ח"_ustr;
auto aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
}
// Right double quote as gershayim
{
auto aTest = u"דו”ח"_ustr;
auto aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
aBounds = m_xBreak->getWordBoundary(aTest, 2, aLocale,
i18n::WordType::ANYWORD_IGNOREWHITESPACES, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
}
}
void TestBreakIterator::testLegacySurrogatePairs()
{
lang::Locale aLocale;
aLocale.Language = "ja";
aLocale.Country = "JP";
// i#75632: [surrogate pair] Japanese word break does not work properly for surrogate pairs.
// and many others to address bugs: i#75631 i#75633 i#75412 etc.
//
// BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test.
{
static constexpr OUString aTest = u"X 𠮟 X"_ustr;
auto aBounds
= m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
aBounds
= m_xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
}
}
void TestBreakIterator::testWordCount()
{
auto fnCountWords = [&](const OUString& aStr, const lang::Locale& aLocale) -> int
{
int nWords = 0;
sal_Int32 nNextPos = 0;
int nIterGuard = 0;
if (m_xBreak->isBeginWord(aStr, nNextPos, aLocale, i18n::WordType::WORD_COUNT))
{
++nWords;
}
while (true)
{
CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++nIterGuard < 100);
auto aBounds = m_xBreak->nextWord(aStr, nNextPos, aLocale, i18n::WordType::WORD_COUNT);
if (aBounds.endPos == aBounds.startPos)
{
break;
}
nNextPos = aBounds.endPos;
++nWords;
}
return nWords;
};
// i#80815: "Word count differs from MS Word"
// This is a characteristic test for word count using test data from the linked bug.
{
lang::Locale aLocale;
aLocale.Language = "en";
aLocale.Country = "US";
const OUString aStr = u""
"test data for word count issue #80815\n"
"fo\\\'sforos\n"
"archipi\\\'elago\n"
"do\\^me\n"
"f**k\n"
"\n"
"battery-driven\n"
"and/or\n"
"apple(s)\n"
"money+opportunity\n"
"Micro$oft\n"
"\n"
"300$\n"
"I(not you)\n"
"a****n\n"
"1+3=4\n"
"\n"
"aaaaaaa.aaaaaaa\n"
"aaaaaaa,aaaaaaa\n"
"aaaaaaa;aaaaaaa\n"_ustr;
CPPUNIT_ASSERT_EQUAL(24, fnCountWords(aStr, aLocale));
}
// Test that the switch to upstream ICU for CJ word boundary analysis doesn't change word count.
{
lang::Locale aLocale;
aLocale.Language = "ja";
aLocale.Country = "JP";
const OUString aStr = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
CPPUNIT_ASSERT_EQUAL(7, fnCountWords(aStr, aLocale));
}
// tdf#150621 Korean words should be counted individually, rather than by syllable.
//
// Per i#80815, the intention for the word count feature is to emulate the behavior of MS Word.
{
lang::Locale aLocale;
aLocale.Language = "ko";
aLocale.Country = "KR";
// Basic case: Korean words are counted as space-delimited. In particular, grammatical
// particles are treated as part of the previous word.
CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"저는 영화를 봤어요"_ustr, aLocale));
// Mixed script: Korean is mostly written in hangul, but hanja are still used in certain
// situations (e.g. abbreviations in newspaper articles). For Chinese and Japanese, such
// ideographs would be counted individually as words. In Korean, however, they are treated
// no differently than hangul characters.
CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"불렀다...與"_ustr, aLocale));
CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"불렀다 ...與"_ustr, aLocale));
CPPUNIT_ASSERT_EQUAL(3, fnCountWords(u"불렀다 ... 與"_ustr, aLocale));
CPPUNIT_ASSERT_EQUAL(1, fnCountWords(u"尹탄핵"_ustr, aLocale));
CPPUNIT_ASSERT_EQUAL(2, fnCountWords(u"尹 탄핵"_ustr, aLocale));
}
}
void TestBreakIterator::setUp()
{
BootstrapFixtureBase::setUp();
m_xBreak.set(m_xSFactory->createInstance(u"com.sun.star.i18n.BreakIterator"_ustr), uno::UNO_QUERY_THROW);
}
void TestBreakIterator::tearDown()
{
m_xBreak.clear();
BootstrapFixtureBase::tearDown();
}
// Register the TestBreakIterator suite with CppUnit so the test runner picks it up.
CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
// Emit CppUnit's test plug-in boilerplate for this test library.
CPPUNIT_PLUGIN_IMPLEMENT();
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */