5dc52ee001
BreakIterator_CTL in the non CharacterIteratorMode::SKIPCELL mode did not handle UTF-16 surrogate pairs at all, causing backspace to delete lone surrogates which is really bad. Just copied the corresponding code from BreakIterator_Unicode. Additionally, BreakIterator_th was not correctly skipping non-Thai text and always treating one character as Thai. Change-Id: Ia379327e042ff602fc19a485c4cbd1a3683f9230 Reviewed-on: https://gerrit.libreoffice.org/54631 Tested-by: Jenkins <ci@libreoffice.org> Reviewed-by: Eike Rathke <erack@redhat.com>
1067 lines
41 KiB
C++
1067 lines
41 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*/
|
|
|
|
#include <cppuhelper/bootstrap.hxx>
|
|
#include <cppuhelper/basemutex.hxx>
|
|
#include <com/sun/star/i18n/XBreakIterator.hpp>
|
|
#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
|
|
#include <com/sun/star/i18n/ScriptType.hpp>
|
|
#include <com/sun/star/i18n/WordType.hpp>
|
|
#include <unotest/bootstrapfixturebase.hxx>
|
|
|
|
#include <unicode/uversion.h>
|
|
|
|
#include <rtl/strbuf.hxx>
|
|
#include <rtl/ustrbuf.hxx>
|
|
|
|
#include <string.h>
|
|
|
|
#include <stack>
|
|
|
|
using namespace ::com::sun::star;
|
|
|
|
class TestBreakIterator : public test::BootstrapFixtureBase
|
|
{
|
|
public:
|
|
virtual void setUp() override;
|
|
virtual void tearDown() override;
|
|
|
|
void testLineBreaking();
|
|
void testWordBoundaries();
|
|
void testGraphemeIteration();
|
|
void testWeak();
|
|
void testAsian();
|
|
void testThai();
|
|
#if (U_ICU_VERSION_MAJOR_NUM > 51)
|
|
void testLao();
|
|
#ifdef TODO
|
|
void testNorthernThai();
|
|
void testKhmer();
|
|
#endif
|
|
#endif
|
|
void testJapanese();
|
|
void testChinese();
|
|
|
|
CPPUNIT_TEST_SUITE(TestBreakIterator);
|
|
CPPUNIT_TEST(testLineBreaking);
|
|
CPPUNIT_TEST(testWordBoundaries);
|
|
CPPUNIT_TEST(testGraphemeIteration);
|
|
CPPUNIT_TEST(testWeak);
|
|
CPPUNIT_TEST(testAsian);
|
|
CPPUNIT_TEST(testThai);
|
|
#if (U_ICU_VERSION_MAJOR_NUM > 51)
|
|
CPPUNIT_TEST(testLao);
|
|
#ifdef TODO
|
|
CPPUNIT_TEST(testKhmer);
|
|
CPPUNIT_TEST(testNorthernThai);
|
|
#endif
|
|
#endif
|
|
CPPUNIT_TEST(testJapanese);
|
|
CPPUNIT_TEST(testChinese);
|
|
CPPUNIT_TEST_SUITE_END();
|
|
|
|
private:
|
|
uno::Reference<i18n::XBreakIterator> m_xBreak;
|
|
void doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak);
|
|
};
|
|
|
|
void TestBreakIterator::testLineBreaking()
|
|
{
|
|
i18n::LineBreakHyphenationOptions aHyphOptions;
|
|
i18n::LineBreakUserOptions aUserOptions;
|
|
lang::Locale aLocale;
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=31271
|
|
{
|
|
OUString aTest("(some text here)");
|
|
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
//Here we want the line break to leave text here) on the next line
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(6), aResult.breakIndex);
|
|
}
|
|
|
|
{
|
|
//Here we want the line break to leave "here)" on the next line
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", static_cast<sal_Int32>(11), aResult.breakIndex);
|
|
}
|
|
}
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=49849
|
|
{
|
|
const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
|
|
OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
|
|
OUString aTest(aWord + " " + aWord);
|
|
|
|
aLocale.Language = "he";
|
|
aLocale.Country = "IL";
|
|
|
|
{
|
|
//Here we want the line break to happen at the whitespace
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the word", aWord.getLength()+1, aResult.breakIndex);
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=17155
|
|
{
|
|
OUString const aTest("foo /bar/baz");
|
|
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
//Here we want the line break to leave /bar/ba clumped together on the next line
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
|
|
aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the first slash", static_cast<sal_Int32>(4), aResult.breakIndex);
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=19716
|
|
{
|
|
OUString aTest("aaa]aaa");
|
|
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
//Here we want the line break to move the whole lot to the next line
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
|
|
aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break at the start of the line, not at ]", static_cast<sal_Int32>(0), aResult.breakIndex);
|
|
}
|
|
}
|
|
|
|
//this is an example sequence from tdf92993-1.docx caught by the load crashtesting
|
|
{
|
|
const sal_Unicode WEIRD1[] = { 0xd83c, 0xdf56, 0xd83c, 0xdf57, 0xd83c, 0xdf46,
|
|
0xd83c, 0xdf64, 0x2668, 0xfe0f, 0xd83c, 0xdfc6};
|
|
|
|
OUString aTest(WEIRD1, SAL_N_ELEMENTS(WEIRD1));
|
|
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
//This must not assert/crash
|
|
(void)m_xBreak->getLineBreak(aTest, 0, aLocale, 0, aHyphOptions, aUserOptions);
|
|
}
|
|
}
|
|
|
|
//See https://bugs.documentfoundation.org/show_bug.cgi?id=96197
|
|
{
|
|
const sal_Unicode HANGUL[] = { 0xc560, 0xad6D, 0xac00, 0xc758, 0x0020, 0xac00,
|
|
0xc0ac, 0xb294};
|
|
OUString aTest(HANGUL, SAL_N_ELEMENTS(HANGUL));
|
|
|
|
aLocale.Language = "ko";
|
|
aLocale.Country = "KR";
|
|
|
|
{
|
|
i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
|
|
aHyphOptions, aUserOptions);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Expected a break don't split the Korean word!", static_cast<sal_Int32>(5), aResult.breakIndex);
|
|
}
|
|
}
|
|
}
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=49629
|
|
void TestBreakIterator::testWordBoundaries()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
i18n::Boundary aBounds;
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=11993
|
|
{
|
|
OUString aTest("abcd ef ghi??? KLM");
|
|
|
|
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
|
|
|
|
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
|
|
//next word
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
|
|
|
|
//previous word
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
|
|
|
|
CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=21907
|
|
{
|
|
OUString aTest("b a?");
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=14904
|
|
{
|
|
const sal_Unicode TEST[] =
|
|
{
|
|
'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
|
|
' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
|
|
'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
|
|
'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
|
|
'?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
|
|
's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
|
|
't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
|
|
'S', 'p', 'a', 'n', 'i', 's', 'h'
|
|
};
|
|
OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
|
|
}
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=49629
|
|
sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
|
|
for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
|
|
{
|
|
//make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
|
|
for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
|
|
{
|
|
#if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
|
|
//Note the breakiterator test is known to fail on older icu
|
|
//versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
|
|
if (aBreakTests[i] == 0x200B)
|
|
continue;
|
|
#endif
|
|
OUString aTest = "Word" + OUStringLiteral1(aBreakTests[i]) + "Word";
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
|
|
switch (mode)
|
|
{
|
|
case i18n::WordType::ANY_WORD:
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
|
|
break;
|
|
case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
|
|
break;
|
|
case i18n::WordType::DICTIONARY_WORD:
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
|
|
break;
|
|
case i18n::WordType::WORD_COUNT:
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
|
|
break;
|
|
}
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
|
|
}
|
|
}
|
|
|
|
sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
|
|
for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
|
|
{
|
|
//make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
|
|
for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
|
|
{
|
|
OUString aTest = "Word" + OUStringLiteral1(aJoinTests[i]) + "Word";
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
|
|
switch (mode)
|
|
{
|
|
case i18n::WordType::ANY_WORD:
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
|
|
break;
|
|
case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
|
|
break;
|
|
case i18n::WordType::DICTIONARY_WORD:
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
|
|
break;
|
|
case i18n::WordType::WORD_COUNT:
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
|
|
break;
|
|
}
|
|
|
|
CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
|
|
CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=13494
|
|
{
|
|
const OUString aBase("xxAAxxBBxxCCxx");
|
|
const sal_Unicode aTests[] =
|
|
{
|
|
'\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
|
|
'(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
|
|
'\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
|
|
};
|
|
|
|
const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
|
|
for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
|
|
{
|
|
OUString aTest = aBase.replace('x', aTests[j]);
|
|
sal_Int32 nPos = -1;
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
|
|
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos < aTest.getLength());
|
|
nPos = aTest.getLength();
|
|
i = SAL_N_ELEMENTS(aDoublePositions)-1;
|
|
do
|
|
{
|
|
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
--i;
|
|
CPPUNIT_ASSERT_EQUAL(aDoublePositions[i], nPos);
|
|
}
|
|
while (nPos > 0);
|
|
}
|
|
|
|
const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
|
|
for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
|
|
{
|
|
OUString aTest = aBase.replaceAll("xx", OUStringLiteral1(aTests[j]));
|
|
sal_Int32 nPos = -1;
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
|
|
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos < aTest.getLength());
|
|
nPos = aTest.getLength();
|
|
i = SAL_N_ELEMENTS(aSinglePositions)-1;
|
|
do
|
|
{
|
|
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
--i;
|
|
CPPUNIT_ASSERT_EQUAL(aSinglePositions[i], nPos);
|
|
}
|
|
while (nPos > 0);
|
|
}
|
|
|
|
const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
|
|
CPPUNIT_ASSERT_EQUAL(u'\'', aTests[0]);
|
|
{
|
|
OUString aTest = aBase.replaceAll("xx", OUStringLiteral1(aTests[0]));
|
|
sal_Int32 nPos = -1;
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
|
|
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos < aTest.getLength());
|
|
nPos = aTest.getLength();
|
|
i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
|
|
do
|
|
{
|
|
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
--i;
|
|
CPPUNIT_ASSERT_EQUAL(aSingleQuotePositions[i], nPos);
|
|
}
|
|
while (nPos > 0);
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=13451
|
|
{
|
|
aLocale.Language = "ca";
|
|
aLocale.Country = "ES";
|
|
|
|
OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
|
|
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
|
|
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true).endPos;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=85411
|
|
for (int j = 0; j < 3; ++j)
|
|
{
|
|
switch (j)
|
|
{
|
|
case 0:
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
break;
|
|
case 1:
|
|
aLocale.Language = "ca";
|
|
aLocale.Country = "ES";
|
|
break;
|
|
case 2:
|
|
aLocale.Language = "fi";
|
|
aLocale.Country = "FI";
|
|
break;
|
|
default:
|
|
CPPUNIT_ASSERT(false);
|
|
break;
|
|
}
|
|
|
|
const sal_Unicode TEST[] =
|
|
{
|
|
'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
|
|
};
|
|
OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
|
|
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {1, 6, 9, 12};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
|
|
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true).endPos;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
|
|
}
|
|
|
|
//https://bz.apache.org/ooo/show_bug.cgi?id=21290
|
|
for (int j = 0; j < 2; ++j)
|
|
{
|
|
switch (j)
|
|
{
|
|
case 0:
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
break;
|
|
case 1:
|
|
aLocale.Language = "grc";
|
|
aLocale.Country.clear();
|
|
break;
|
|
default:
|
|
CPPUNIT_ASSERT(false);
|
|
break;
|
|
}
|
|
|
|
const sal_Unicode TEST[] =
|
|
{
|
|
0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
|
|
0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
|
|
0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
|
|
0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
|
|
};
|
|
OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
|
|
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {5, 15, 19, 26};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
|
|
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true).endPos;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=58513
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=55707
|
|
{
|
|
aLocale.Language = "fi";
|
|
aLocale.Country = "FI";
|
|
|
|
OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
|
|
|
|
{
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
|
|
nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::WORD_COUNT, true).endPos;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], nPos);
|
|
++i;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
|
|
}
|
|
|
|
{
|
|
sal_Int32 nPos = 0;
|
|
sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
|
|
40, 41, 42, 43, 45, 46, 47, 50, 51};
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.startPos);
|
|
++i;
|
|
CPPUNIT_ASSERT_EQUAL(aExpected[i], aBounds.endPos);
|
|
++i;
|
|
nPos = aBounds.endPos;
|
|
}
|
|
while (nPos++ < aTest.getLength());
|
|
CPPUNIT_ASSERT_EQUAL(SAL_N_ELEMENTS(aExpected), i);
|
|
}
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=107843
|
|
{
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
const sal_Unicode TEST[] =
|
|
{
|
|
'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
|
|
};
|
|
OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=113785
|
|
{
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
const sal_Unicode TEST[] =
|
|
{
|
|
'a', 0x2013, 'b', 0x2014, 'c'
|
|
};
|
|
OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
|
|
|
|
aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
|
|
|
|
aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
|
|
}
|
|
}
|
|
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=40292
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=80412
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=111152
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=50172
|
|
void TestBreakIterator::testGraphemeIteration()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "bn";
|
|
aLocale.Country = "IN";
|
|
|
|
{
|
|
const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
|
|
OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos;
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(BA_HALANT_LA)), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
{
|
|
const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
|
|
OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos;
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI)), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
{
|
|
const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
|
|
OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos;
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA)), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
aLocale.Language = "ta";
|
|
aLocale.Country = "IN";
|
|
|
|
{
|
|
const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
|
|
OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos = 0;
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VIRAMA_SSA)), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
{
|
|
const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
|
|
OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos = 0;
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(KA_VOWELSIGNU)), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
|
|
{
|
|
const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
|
|
{ 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
|
|
OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
|
|
SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos=0;
|
|
|
|
for (sal_Int32 i = 0; i < 4; ++i)
|
|
{
|
|
sal_Int32 nOldPos = nPos;
|
|
nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos+2, nPos);
|
|
}
|
|
|
|
for (sal_Int32 i = 0; i < 4; ++i)
|
|
{
|
|
sal_Int32 nOldPos = nPos;
|
|
nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip 2 units", nOldPos-2, nPos);
|
|
}
|
|
}
|
|
|
|
{
|
|
const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
|
|
OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
|
|
|
|
sal_Int32 nGraphemeCount = 0;
|
|
|
|
sal_Int32 nCurPos = 0;
|
|
while (nCurPos < aText.getLength())
|
|
{
|
|
sal_Int32 nCount2 = 1;
|
|
nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
|
|
i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
|
|
++nGraphemeCount;
|
|
}
|
|
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should be considered 1 grapheme", static_cast<sal_Int32>(1), nGraphemeCount);
|
|
}
|
|
|
|
aLocale.Language = "hi";
|
|
aLocale.Country = "IN";
|
|
|
|
{
|
|
const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
|
|
OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos = 0;
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(SAL_N_ELEMENTS(SHA_VOWELSIGNII)), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full grapheme", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
}
|
|
|
|
//A test to ensure that certain ranges and codepoints that are categorized as
|
|
//weak remain as weak, so that existing docs that depend on this don't silently
|
|
//change font for those weak chars
|
|
void TestBreakIterator::testWeak()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
const sal_Unicode WEAKS[] =
|
|
{
|
|
0x0001, 0x0002,
|
|
0x0020, 0x00A0,
|
|
0x0300, 0x036F, //Combining Diacritical Marks
|
|
0x1AB0, 0x1AFF, //Combining Diacritical Marks Extended
|
|
0x1DC0, 0x1DFF, //Combining Diacritical Marks Supplement
|
|
0x20D0, 0x20FF, //Combining Diacritical Marks for Symbols
|
|
0x2150, 0x215F, //Number Forms, fractions
|
|
0x2160, 0x2180, //Number Forms, roman numerals
|
|
0x2200, 0x22FF, //Mathematical Operators
|
|
0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
|
|
0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
|
|
0x2A00, 0x2AFF, //Supplemental Mathematical Operators
|
|
0x2100, 0x214F, //Letterlike Symbols
|
|
0x2308, 0x230B, //Miscellaneous technical
|
|
0x25A0, 0x25FF, //Geometric Shapes
|
|
0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
|
|
};
|
|
OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
|
|
|
|
for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
|
|
{
|
|
sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
|
|
OStringBuffer aMsg;
|
|
aMsg.append("Char 0x");
|
|
aMsg.append(static_cast<sal_Int32>(aWeaks[i]), 16);
|
|
aMsg.append(" should have been weak");
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
|
|
i18n::ScriptType::WEAK, nScript);
|
|
}
|
|
}
|
|
}
|
|
|
|
//A test to ensure that certain ranges and codepoints that are categorized as
|
|
//asian remain as asian, so that existing docs that depend on this don't silently
|
|
//change font for those asian chars.
|
|
//See https://bugs.libreoffice.org/show_bug.cgi?id=38095
|
|
void TestBreakIterator::testAsian()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "en";
|
|
aLocale.Country = "US";
|
|
|
|
{
|
|
const sal_Unicode ASIANS[] =
|
|
{
|
|
//some typical CJK chars
|
|
0x4E00, 0x62FF,
|
|
//The full HalfWidth and FullWidth block has historically been
|
|
//designated as taking the CJK font :-(
|
|
//HalfWidth and FullWidth forms of ASCII 0-9, categorized under
|
|
//UAX24 as "Common" i.e. by that logic WEAK
|
|
0xFF10, 0xFF19,
|
|
//HalfWidth and FullWidth forms of ASCII A-z, categorized under
|
|
//UAX25 as "Latin", i.e. by that logic LATIN
|
|
0xFF21, 0xFF5A
|
|
};
|
|
OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
|
|
|
|
for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
|
|
{
|
|
sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
|
|
OStringBuffer aMsg;
|
|
aMsg.append("Char 0x");
|
|
aMsg.append(static_cast<sal_Int32>(aAsians[i]), 16);
|
|
aMsg.append(" should have been asian");
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE(aMsg.getStr(),
|
|
i18n::ScriptType::ASIAN, nScript);
|
|
}
|
|
}
|
|
}
|
|
|
|
#if (U_ICU_VERSION_MAJOR_NUM > 51)
|
|
//A test to ensure that our Lao word boundary detection is useful
|
|
void TestBreakIterator::testLao()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "lo";
|
|
aLocale.Country = "LA";
|
|
|
|
const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
|
|
OUString aTest(LAO, SAL_N_ELEMENTS(LAO));
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.startPos);
|
|
CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aBounds.endPos);
|
|
}
|
|
#endif
|
|
|
|
//A test to ensure that our thai word boundary detection is useful
|
|
void TestBreakIterator::testThai()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "th";
|
|
aLocale.Country = "TH";
|
|
|
|
//See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
|
|
{
|
|
const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
|
|
OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_MESSAGE("Should skip full word",
|
|
aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
|
|
}
|
|
|
|
//See https://bz.apache.org/ooo/show_bug.cgi?id=29548
|
|
//make sure forwards and back are consistent
|
|
{
|
|
const sal_Unicode THAI[] =
|
|
{
|
|
0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
|
|
0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
|
|
0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
|
|
0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
|
|
0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
|
|
0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
|
|
};
|
|
OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
|
|
|
|
std::stack<sal_Int32> aPositions;
|
|
sal_Int32 nPos = -1;
|
|
do
|
|
{
|
|
nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
aPositions.push(nPos);
|
|
}
|
|
while (nPos < aTest.getLength());
|
|
nPos = aTest.getLength();
|
|
CPPUNIT_ASSERT(!aPositions.empty());
|
|
aPositions.pop();
|
|
do
|
|
{
|
|
CPPUNIT_ASSERT(!aPositions.empty());
|
|
nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
|
|
CPPUNIT_ASSERT_EQUAL(aPositions.top(), nPos);
|
|
aPositions.pop();
|
|
}
|
|
while (nPos > 0);
|
|
}
|
|
|
|
// tdf#113694
|
|
{
|
|
const sal_Unicode NON_BMP[] = { 0xD800, 0xDC00 };
|
|
OUString aTest(NON_BMP, SAL_N_ELEMENTS(NON_BMP));
|
|
|
|
sal_Int32 nDone=0;
|
|
sal_Int32 nPos;
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
|
|
|
|
nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(SAL_N_ELEMENTS(NON_BMP)), nPos);
|
|
nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(NON_BMP), aLocale,
|
|
i18n::CharacterIteratorMode::SKIPCHARACTER, 1, nDone);
|
|
CPPUNIT_ASSERT_EQUAL_MESSAGE("Should skip full surrogate pair", static_cast<sal_Int32>(0), nPos);
|
|
}
|
|
}
|
|
|
|
#ifdef TODO
|
|
void TestBreakIterator::testNorthernThai()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "nod";
|
|
aLocale.Country = "TH";
|
|
|
|
const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
|
|
OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT_MESSAGE("Should skip full word",
|
|
aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
|
|
}
|
|
|
|
// Not sure if any version earlier than 49 did have Khmer word boundary
|
|
// dictionaries, 4.6 does not.
|
|
|
|
// As of icu 54, word boundary detection for Khmer is still considered
|
|
// insufficient, so icu khmer stuff is disabled
|
|
|
|
//A test to ensure that our khmer word boundary detection is useful
|
|
//https://bugs.libreoffice.org/show_bug.cgi?id=52020
|
|
void TestBreakIterator::testKhmer()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "km";
|
|
aLocale.Country = "KH";
|
|
|
|
const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
|
|
|
|
OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
|
|
i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
|
|
|
|
aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
|
|
}
|
|
#endif
|
|
|
|
void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > const &xBreak)
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "ja";
|
|
aLocale.Country = "JP";
|
|
i18n::Boundary aBounds;
|
|
|
|
{
|
|
const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
|
|
|
|
OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
|
|
aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
|
|
}
|
|
|
|
{
|
|
const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
|
|
|
|
OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
|
|
aBounds = xBreak->getWordBoundary(aTest, 1, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
|
|
|
|
aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
|
|
CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
|
|
}
|
|
}
|
|
|
|
void TestBreakIterator::testJapanese()
|
|
{
|
|
doTestJapanese(m_xBreak);
|
|
|
|
// fdo#78479 - test second / cached instantiation of xdictionary
|
|
uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
|
|
"com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
|
|
|
|
doTestJapanese(xTmpBreak);
|
|
}
|
|
|
|
void TestBreakIterator::testChinese()
|
|
{
|
|
lang::Locale aLocale;
|
|
aLocale.Language = "zh";
|
|
aLocale.Country = "CN";
|
|
i18n::Boundary aBounds;
|
|
|
|
{
|
|
const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB };
|
|
|
|
OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
|
|
aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
|
|
i18n::WordType::DICTIONARY_WORD, true);
|
|
CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6);
|
|
}
|
|
}
|
|
void TestBreakIterator::setUp()
|
|
{
|
|
BootstrapFixtureBase::setUp();
|
|
m_xBreak.set(m_xSFactory->createInstance("com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
|
|
}
|
|
|
|
void TestBreakIterator::tearDown()
|
|
{
|
|
m_xBreak.clear();
|
|
BootstrapFixtureBase::tearDown();
|
|
}
|
|
|
|
CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
|
|
|
|
CPPUNIT_PLUGIN_IMPLEMENT();
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|