f0b87e8162
We were requiring ICU 4.6 which was released in 2011, and ifdef'ing our way through newer ICU versions. ICU is a core dependency and it makes no sense to build LibreOffice with such ancient versions of it. This change requires ICU 66 (released in 2020), and removes all the ifdefs for older versions. There are more cleanups to do, but these will be done separately. Change-Id: I2e4f7608a08f4d531b0a4c74bbfdf91a451f833f Reviewed-on: https://gerrit.libreoffice.org/c/core/+/153387 Tested-by: Jenkins Reviewed-by: خالد حسني <khaled@libreoffice.org>
1062 lines
41 KiB
C++
1062 lines
41 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*/
|
|
|
|
#include <com/sun/star/i18n/XBreakIterator.hpp>
|
|
#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
|
|
#include <com/sun/star/i18n/ScriptType.hpp>
|
|
#include <com/sun/star/i18n/WordType.hpp>
|
|
#include <o3tl/cppunittraitshelper.hxx>
|
|
#include <unotest/bootstrapfixturebase.hxx>
|
|
|
|
#include <unicode/uvernum.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <stack>
|
|
#include <string_view>
|
|
|
|
using namespace ::com::sun::star;
|
|
|
|
class TestBreakIterator : public test::BootstrapFixtureBase
|
|
{
|
|
public:
|
|
virtual void setUp() override;
|
|
virtual void tearDown() override;
|
|
|
|
void testLineBreaking();
|
|
void testWordBoundaries();
|
|
void testGraphemeIteration();
|
|
void testWeak();
|
|
void testAsian();
|
|
void testThai();
|
|
void testLao();
|
|
#ifdef TODO
|
|
void testNorthernThai();
|
|
void testKhmer();
|
|
#endif
|
|
void testJapanese();
|
|
void testChinese();
|
|
|
|
CPPUNIT_TEST_SUITE(TestBreakIterator);
|
|
CPPUNIT_TEST(testLineBreaking);
|
|
CPPUNIT_TEST(testWordBoundaries);
|
|
CPPUNIT_TEST(testGraphemeIteration);
|
|
CPPUNIT_TEST(testWeak);
|
|
CPPUNIT_TEST(testAsian);
|
|
CPPUNIT_TEST(testThai);
|
|
CPPUNIT_TEST(testLao);
|
|
#ifdef TODO
|
|
CPPUNIT_TEST(testKhmer);
|
|
CPPUNIT_TEST(testNorthernThai);
|
|
#endif
|
|
CPPUNIT_TEST(testJapanese);
|
|
CPPUNIT_TEST(testChinese);
|
|
CPPUNIT_TEST_SUITE_END();
|
|
|
|
private:
|
|
uno::Reference<i18n::XBreakIterator> m_xBreak;
|
|
void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
|
|
};
|
|
|
|
void TestBreakIterator::testLineBreaking()
|
|
{
|
|
i18n::LineBreakHyphenationOptions aHyphOptions;
|
|
i18n::LineBreakUserOptions aUserOptions;
|
|
lang::Locale aLocale;
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=31271
|
|
{
|
|
OUString aTest("(some text here)");
|
|
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
//Here we want the line break to leave text here) on the next line
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
|
|
}
|
|
|
|
{
|
|
//Here we want the line break to leave "here)" on the next line
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
|
|
}
|
|
}
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=49849
|
|
{
|
|
static constexpr OUStringLiteral aWord = u"\u05DE\u05D9\u05DC\u05D9\u05DD";
|
|
OUString aTest(aWord + " " + aWord);
|
|
|
|
aLocale.Language = "he";
|
|
aLocale.Country = "IL";
|
|
|
|
{
|
|
//Here we want the line break to happen at the whitespace
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=17155
|
|
{
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
//Here we want the line break to leave /bar/ba clumped together on the next line
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak("foo /bar/baz", strlen("foo /bar/ba"), aLocale, 0,
|
|
aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=19716
|
|
{
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
OUString aTest("aaa]aaa");
|
|
//Here we want the line break to move the whole lot to the next line
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
|
|
aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
|
|
}
|
|
}
|
|
|
|
//this is an example sequence from tdf92993-1.docx caught by the load crashtesting
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\U0001f356\U0001f357\U0001f346"
|
|
"\U0001f364\u2668\ufe0f\U0001f3c6";
|
|
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
//This must not assert/crash
|
|
(void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
|
|
}
|
|
}
|
|
|
|
//See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\uc560\uad6D\uac00\uc758 \uac00"
|
|
"\uc0ac\ub294";
|
|
|
|
aLocale.Language = "ko";
|
|
aLocale.Country = "KR";
|
|
|
|
{
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
|
|
aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
|
|
}
|
|
}
|
|
}
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=49629
|
|
void TestBreakIterator::testWordBoundaries()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
i18n::Boundary aBounds;
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=11993
|
|
{
|
|
OUString aTest("abcd ef ghi??? KLM");
|
|
|
|
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
|
|
|
|
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
|
|
//next word
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
|
|
|
|
//previous word
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
|
|
|
|
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(16), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(19), aBounds.endPos);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=21907
|
|
{
|
|
OUString aTest("b a?");
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=14904
|
|
{
|
|
static constexpr OUStringLiteral aTest =
|
|
u"Working \u201CWords"
|
|
" starting wit"
|
|
"h quotes\u201D Work"
|
|
"ing \u2018Broken\u2019 "
|
|
"?Spanish? doe"
|
|
"sn\u2019t work. No"
|
|
"t even \u00BFreal? "
|
|
"Spanish";
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(14), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(44), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(46), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(52), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(55), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(62), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(64), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(71), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(88), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(92), aBounds.endPos);
|
|
}
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=49629
|
|
sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
|
|
for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
|
|
{
|
|
//make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
|
|
for (auto const& i: aBreakTests)
|
|
{
|
|
OUString aTest = "Word" + OUStringChar(i) + "Word";
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
|
|
switch (mode)
|
|
{
|
|
case i18n::WordType::ANY_WORD:
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
|
|
break;
|
|
case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
|
|
break;
|
|
case i18n::WordType::DICTIONARY_WORD:
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
|
|
break;
|
|
case i18n::WordType::WORD_COUNT:
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
|
|
break;
|
|
}
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
|
|
}
|
|
}
|
|
|
|
sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
|
|
for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
|
|
{
|
|
//make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
|
|
for (auto const& p: aJoinTests)
|
|
{
|
|
OUString aTest = "Word" + OUStringChar(p) + "Word";
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
|
|
switch (mode)
|
|
{
|
|
case i18n::WordType::ANY_WORD:
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
|
break;
|
|
case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
|
break;
|
|
case i18n::WordType::DICTIONARY_WORD:
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
|
break;
|
|
case i18n::WordType::WORD_COUNT:
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
|
break;
|
|
}
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=13494
|
|
{
|
|
const OUString aBase("xxAAxxBBxxCCxx");
|
|
const sal_Unicode aTests[] =
|
|
{
|
|
'\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
|
|
'(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
|
|
'\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
|
|
};
|
|
|
|
const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
|
|
for (auto const& r: aTests)
|
|
{
|
|
OUString aTest = aBase.replace('x', r);
|
|
sal_Int32 nPos = -1;
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < std::size(aDoublePositions));
|
|
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos < aTest.getLength());
|
|
nPos = aTest.getLength();
|
|
i = std::size(aDoublePositions)-1;
|
|
do
|
|
{
|
|
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
--i;
|
|
CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
|
|
}
|
|
while (nPos > 0);
|
|
}
|
|
|
|
const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
|
|
for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
|
|
{
|
|
OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[j]));
|
|
sal_Int32 nPos = -1;
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < std::size(aSinglePositions));
|
|
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos < aTest.getLength());
|
|
nPos = aTest.getLength();
|
|
i = std::size(aSinglePositions)-1;
|
|
do
|
|
{
|
|
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
--i;
|
|
CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
|
|
}
|
|
while (nPos > 0);
|
|
}
|
|
|
|
const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
|
|
CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
|
|
{
|
|
OUString aTest = aBase.replaceAll("xx", OUStringChar(aTests[0]));
|
|
sal_Int32 nPos = -1;
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < std::size(aSingleQuotePositions));
|
|
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos < aTest.getLength());
|
|
nPos = aTest.getLength();
|
|
i = std::size(aSingleQuotePositions)-1;
|
|
do
|
|
{
|
|
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
--i;
|
|
CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
|
|
}
|
|
while (nPos > 0);
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=13451
|
|
{
|
|
aLocale.Language = "ca";
|
|
aLocale.Country = "ES";
|
|
|
|
OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
|
|
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < std::size(aExpected));
|
|
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true).endPos;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=85411
|
|
for (int j = 0; j < 3; ++j)
|
|
{
|
|
switch (j)
|
|
{
|
|
case 0:
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
break;
|
|
case 1:
|
|
aLocale.Language = "ca";
|
|
aLocale.Country = "ES";
|
|
break;
|
|
case 2:
|
|
aLocale.Language = "fi";
|
|
aLocale.Country = "FI";
|
|
break;
|
|
default:
|
|
CPPUNIT_ASSERT(false);
|
|
break;
|
|
}
|
|
|
|
static constexpr OUStringLiteral aTest =
|
|
u"I\u200Bwant\u200Bto\u200Bgo";
|
|
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {1, 6, 9, 12};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < std::size(aExpected));
|
|
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true).endPos;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
|
|
}
|
|
|
|
//https://bz.apache.org/ooo/show_bug.cgi?id=21290
|
|
for (int j = 0; j < 2; ++j)
|
|
{
|
|
switch (j)
|
|
{
|
|
case 0:
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
break;
|
|
case 1:
|
|
aLocale.Language = "grc";
|
|
aLocale.Country.clear();
|
|
break;
|
|
default:
|
|
CPPUNIT_ASSERT(false);
|
|
break;
|
|
}
|
|
|
|
static constexpr OUStringLiteral aTest =
|
|
u"\u1F0C\u03BD\u03B4\u03C1\u03B1 \u1F00"
|
|
"\u03C1\u03BD\u1F7B\u03BC\u03B5\u03BD\u03BF"
|
|
"\u03C2 \u1F00\u03BB\u03BB \u1F24"
|
|
"\u03C3\u03B8\u03B9\u03BF\u03BD";
|
|
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {5, 15, 19, 26};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < std::size(aExpected));
|
|
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true).endPos;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=58513
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=55707
|
|
{
|
|
aLocale.Language = "fi";
|
|
aLocale.Country = "FI";
|
|
|
|
OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
|
|
|
|
{
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < std::size(aExpected));
|
|
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::WORD_COUNT, true).endPos;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
|
|
}
|
|
|
|
{
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
|
|
40, 41, 42, 43, 45, 46, 47, 50, 51};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < std::size(aExpected));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
|
|
++i;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
|
|
++i;
|
|
nPos = aBounds.endPos;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(std::size(aExpected), i);
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=107843
|
|
{
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
static constexpr OUStringLiteral aTest =
|
|
u"ru\uFB00le \uFB01sh";
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=113785
|
|
{
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
static constexpr OUStringLiteral aTest =
|
|
u"a\u2013b\u2014c";
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
|
}
|
|
}
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=40292
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=80412
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=111152
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=50172
|
|
void TestBreakIterator::testGraphemeIteration()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "bn";
|
|
aLocale.Country = "IN";
|
|
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\u09AC\u09CD\u09AF"; // BA HALANT LA
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos;
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\u09B9\u09CD\u09A3\u09BF";
|
|
// HA HALANT NA VOWELSIGNI
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos;
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\u09A4\u09CD\u09AE\u09CD\u09AF";
|
|
// TA HALANT MA HALANT YA
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos;
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
aLocale.Language = "ta";
|
|
aLocale.Country = "IN";
|
|
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8"; // CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos = 0;
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
|
|
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
|
|
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
|
|
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(6), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(4), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(2), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale, i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\u0B95\u0BC1"; // KA VOWELSIGNU
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos = 0;
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
{
|
|
static constexpr OUStringLiteral aTest =
|
|
u"\u0B9A\u0BBF\u0BA4\u0BCD\u0BA4\u0BBF\u0BB0\u0BC8";
|
|
// CA VOWELSIGNI TA VIRAMA TA VOWELSIGNI RA VOWELSIGNAI
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos=0;
|
|
|
|
for (sal_Int32 i = 0; i < 4; ++i)
|
|
{
|
|
sal_Int32 nOldPos = nPos;
|
|
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
|
|
}
|
|
|
|
for (sal_Int32 i = 0; i < 4; ++i)
|
|
{
|
|
sal_Int32 nOldPos = nPos;
|
|
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
|
|
}
|
|
}
|
|
|
|
{
|
|
static constexpr OUStringLiteral aText = u"\u05D0\u05B8"; // ALEF QAMATS
|
|
|
|
sal_Int32 nGraphemeCount = 0;
|
|
|
|
sal_Int32 nCurPos = 0;
|
|
while (nCurPos < aText.getLength())
|
|
{
|
|
sal_Int32 nCount2 = 1;
|
|
nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
|
|
i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
|
|
++nGraphemeCount;
|
|
}
|
|
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
|
|
}
|
|
|
|
aLocale.Language = "hi";
|
|
aLocale.Country = "IN";
|
|
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\u0936\u0940"; // SHA VOWELSIGNII
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos = 0;
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", aTest.getLength(), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
}
|
|
|
|
//A test to ensure that certain ranges and codepoints that are categorized as
|
|
//weak remain as weak, so that existing docs that depend on this don't silently
|
|
//change font for those weak chars
|
|
void TestBreakIterator::testWeak()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
static constexpr OUStringLiteral aWeaks =
|
|
u"\u0001\u0002"
|
|
" \u00A0"
|
|
"\u0300\u036F" //Combining Diacritical Marks
|
|
"\u1AB0\u1AFF" //Combining Diacritical Marks Extended
|
|
"\u1DC0\u1DFF" //Combining Diacritical Marks Supplement
|
|
"\u20D0\u20FF" //Combining Diacritical Marks for Symbols
|
|
"\u2150\u215F" //Number Forms, fractions
|
|
"\u2160\u2180" //Number Forms, roman numerals
|
|
"\u2200\u22FF" //Mathematical Operators
|
|
"\u27C0\u27EF" //Miscellaneous Mathematical Symbols-A
|
|
"\u2980\u29FF" //Miscellaneous Mathematical Symbols-B
|
|
"\u2A00\u2AFF" //Supplemental Mathematical Operators
|
|
"\u2100\u214F" //Letterlike Symbols
|
|
"\u2308\u230B" //Miscellaneous technical
|
|
"\u25A0\u25FF" //Geometric Shapes
|
|
"\u2B30\u2B4C"; //Miscellaneous Symbols and Arrows
|
|
|
|
for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
|
|
{
|
|
sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
|
|
OString aMsg =
|
|
"Char 0x" +
|
|
OString::number(static_cast<sal_Int32>(std::u16string_view(aWeaks)[i]), 16) +
|
|
" should have been weak";
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
|
|
i18n::ScriptType::WEAK, nScript);
|
|
}
|
|
}
|
|
}
|
|
|
|
//A test to ensure that certain ranges and codepoints that are categorized as
|
|
//asian remain as asian, so that existing docs that depend on this don't silently
|
|
//change font for those asian chars.
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=38095
|
|
void TestBreakIterator::testAsian()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
static constexpr OUStringLiteral aAsians =
|
|
//some typical CJK chars
|
|
u"\u4E00\u62FF"
|
|
//The full HalfWidth and FullWidth block has historically been
|
|
//designated as taking the CJK font :-(
|
|
//HalfWidth and FullWidth forms of ASCII 0-9, categorized under
|
|
//UAX24 as "Common" i.e. by that logic WEAK
|
|
"\uFF10\uFF19"
|
|
//HalfWidth and FullWidth forms of ASCII A-z, categorized under
|
|
//UAX25 as "Latin", i.e. by that logic LATIN
|
|
"\uFF21\uFF5A";
|
|
|
|
for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
|
|
{
|
|
sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
|
|
OString aMsg =
|
|
"Char 0x" +
|
|
OString::number(static_cast<sal_Int32>(std::u16string_view(aAsians)[i]), 16) +
|
|
" should have been asian";
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
|
|
i18n::ScriptType::ASIAN, nScript);
|
|
}
|
|
}
|
|
}
|
|
|
|
//A test to ensure that our Lao word boundary detection is useful
|
|
void TestBreakIterator::testLao()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "lo";
|
|
aLocale.Country = "LA";
|
|
|
|
static constexpr OUStringLiteral aTest = u"\u0e8d\u0eb4\u0e99\u0e94\u0eb5\u0e95\u0ec9\u0ead\u0e99\u0eae\u0eb1\u0e9a";
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
|
#if (U_ICU_VERSION_MAJOR_NUM < 70)
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
|
#else
|
|
// FIXME:
|
|
// In ICU 70/71 for yet unknown reason the word boundary 9 is not detected and
|
|
// instead the length 12 is returned as endpos.
|
|
// Deep in
|
|
// icu_70::RuleBasedBreakIterator::BreakCache::next()
|
|
// icu_70::RuleBasedBreakIterator::BreakCache::following()
|
|
// icu_70::RuleBasedBreakIterator::following()
|
|
// i18npool::BreakIterator_Unicode::getWordBoundary()
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(12), aBounds.endPos);
|
|
#endif
|
|
}
|
|
|
|
//A test to ensure that our thai word boundary detection is useful
|
|
void TestBreakIterator::testThai()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "th";
|
|
aLocale.Country = "TH";
|
|
|
|
//See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\u0E01\u0E38\u0E2B\u0E25\u0E32\u0E1A";
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
|
|
sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full word",
|
|
aTest.getLength(), aBounds.endPos);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=29548
|
|
//make sure forwards and back are consistent
|
|
{
|
|
static constexpr OUStringLiteral aTest =
|
|
u"\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
|
|
"\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
|
|
"\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27"
|
|
"\u0E2D\u0E38\u0E17\u0E22\u0E32\u0E19\u0E41"
|
|
"\u0E2B\u0E48\u0E07\u0E0A\u0E32\u0E15\u0E34"
|
|
"\u0E19\u0E49\u0E33\u0E2B\u0E19\u0E32\u0E27";
|
|
|
|
std::stack<sal_Int32> aPositions;
|
|
sal_Int32 nPos = -1;
|
|
do
|
|
{
|
|
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
aPositions.push(nPos);
|
|
}
|
|
while (nPos < aTest.getLength());
|
|
nPos = aTest.getLength();
|
|
CPPUNIT_ASSERT(!aPositions.empty());
|
|
aPositions.pop();
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(!aPositions.empty());
|
|
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
|
|
aPositions.pop();
|
|
}
|
|
while (nPos > 0);
|
|
}
|
|
|
|
// tdf#113694
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\U00010000";
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos;
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest.getLength(), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", aTest.getLength(), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, aTest.getLength(), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
}
|
|
|
|
#ifdef TODO
|
|
void TestBreakIterator::testNorthernThai()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "nod";
|
|
aLocale.Country = "TH";
|
|
|
|
const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
|
|
OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_MESSAGE("Should skip full word",
|
|
aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
|
|
}
|
|
|
|
// It is unclear whether any ICU version earlier than 49 shipped Khmer word
// boundary dictionaries; ICU 4.6 does not.

// As of ICU 54, word boundary detection for Khmer is still considered
// insufficient, so the ICU Khmer stuff is disabled.
|
|
|
|
//A test to ensure that our khmer word boundary detection is useful
|
|
//https://bugs.libreoffice.org/show_bug.cgi?id=52020
|
|
void TestBreakIterator::testKhmer()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "km";
|
|
aLocale.Country = "KH";
|
|
|
|
const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
|
|
|
|
OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
|
|
}
|
|
#endif
|
|
|
|
// Shared Japanese word boundary checks, run against the break iterator that
// is passed in as xBreak.
void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
{
    const lang::Locale aJaLocale("ja", "JP", "");

    // Seven katakana code units: a lookup at index 5 is expected to return
    // the whole text as one word.
    static constexpr OUStringLiteral aKatakana = u"\u30B7\u30E3\u30C3\u30C8\u30C0\u30A6\u30F3";
    const i18n::Boundary aKatakanaWord = xBreak->getWordBoundary(
        aKatakana, 5, aJaLocale, i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aKatakanaWord.startPos);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(7), aKatakanaWord.endPos);

    // The same three-unit sequence twice: each half is expected to be
    // reported as its own word.
    static constexpr OUStringLiteral aRepeated = u"\u9EBB\u306E\u8449\u9EBB\u306E\u8449";

    const i18n::Boundary aFirstHalf = xBreak->getWordBoundary(
        aRepeated, 1, aJaLocale, i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(0), aFirstHalf.startPos);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(3), aFirstHalf.endPos);

    const i18n::Boundary aSecondHalf = xBreak->getWordBoundary(
        aRepeated, 5, aJaLocale, i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(3), aSecondHalf.startPos);
    CPPUNIT_ASSERT_EQUAL(static_cast<sal_Int32>(6), aSecondHalf.endPos);
}
|
|
|
|
void TestBreakIterator::testJapanese()
|
|
{
|
|
doTestJapanese(m_xBreak);
|
|
|
|
// fdo#78479 - test second / cached instantiation of xdictionary
|
|
uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
|
|
"com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
|
|
|
|
doTestJapanese(xTmpBreak);
|
|
}
|
|
|
|
void TestBreakIterator::testChinese()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "zh";
|
|
aLocale.Country = "CN";
|
|
|
|
{
|
|
static constexpr OUStringLiteral aTest = u"\u6A35\u6A30\u69FE\u8919\U00029EDB";
|
|
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(6), aBounds.endPos);
|
|
}
|
|
}
|
|
void TestBreakIterator::setUp()
|
|
{
|
|
BootstrapFixtureBase::setUp();
|
|
m_xBreak.set(m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
|
|
}
|
|
|
|
void TestBreakIterator::tearDown()
|
|
{
|
|
m_xBreak.clear();
|
|
BootstrapFixtureBase::tearDown();
|
|
}
|
|
|
|
// Register the TestBreakIterator suite with CppUnit.
CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
|
|
|
|
// Emit CppUnit's test plug-in boilerplate for this test library.
CPPUNIT_PLUGIN_IMPLEMENT();
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|