tdf#49885 Updated CJK BreakIterator to use ICU

Previously, the CJK BreakIterator used custom dictionaries for Chinese
and Japanese. This change removes these custom dictionaries in favor of
the upstream ICU implementation, which uses an externally-maintained
frequency dictionary for these languages.

This change also removes support code for dictionary-based break
iterators, as it is no longer used.

Change-Id: I55c4ce9c842d1751997309fd7446e0a6917915dc
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166136
Reviewed-by: Caolán McNamara <caolan.mcnamara@collabora.com>
Tested-by: Jenkins
Tested-by: Caolán McNamara <caolan.mcnamara@collabora.com>
This commit is contained in:
Jonathan Clark 2024-04-15 17:10:51 -06:00 committed by Caolán McNamara
parent d75a37a582
commit 14c6cde779
17 changed files with 75 additions and 372325 deletions

View file

@ -701,8 +701,6 @@ $(eval $(call gb_Helper_register_libraries_for_install,PLAINLIBS_OOO,ooo, \
))
$(eval $(call gb_Helper_register_plugins_for_install,PLAINLIBS_OOO,ooo, \
dict_ja \
dict_zh \
localedata_en \
localedata_es \
localedata_euro \

View file

@ -30,8 +30,4 @@ $(eval $(call gb_CppunitTest_use_components,i18npool_break_iterator,\
i18npool/util/i18npool \
))
$(call gb_CppunitTest_get_target,i18npool_break_iterator) : \
$(call gb_Library_get_target,dict_ja) \
$(call gb_Library_get_target,dict_zh)
# vim: set noet sw=4 ts=4:

View file

@ -11,37 +11,8 @@ $(eval $(call gb_CustomTarget_CustomTarget,i18npool/breakiterator))
i18npool_BIDIR := $(call gb_CustomTarget_get_workdir,i18npool/breakiterator)
ifneq ($(filter iOS ANDROID,$(OS)),)
$(call gb_CustomTarget_get_target,i18npool/breakiterator) : \
$(i18npool_BIDIR)/dict_ja.data $(i18npool_BIDIR)/dict_zh.data $(i18npool_BIDIR)/OpenOffice_dat.c
$(i18npool_BIDIR)/dict_%.data : \
$(SRCDIR)/i18npool/source/breakiterator/data/%.dic \
$(call gb_Executable_get_runtime_dependencies,gendict) \
| $(i18npool_BIDIR)/.dir
$(call gb_Output_announce,$(subst $(WORKDIR)/,,$@),$(true),DIC,1)
$(call gb_Trace_StartRange,$(subst $(WORKDIR)/,,$@),DIC)
$(call gb_Helper_abbreviate_dirs,\
$(call gb_Helper_execute,gendict) $< $@ $(patsubst $(i18npool_BIDIR)/dict_%.cxx,%,$@))
$(call gb_Trace_EndRange,$(subst $(WORKDIR)/,,$@),DIC)
else # !iOS ANDROID
$(call gb_CustomTarget_get_target,i18npool/breakiterator) : \
$(i18npool_BIDIR)/dict_ja.cxx $(i18npool_BIDIR)/dict_zh.cxx $(i18npool_BIDIR)/OpenOffice_dat.c
$(i18npool_BIDIR)/dict_%.cxx : \
$(SRCDIR)/i18npool/source/breakiterator/data/%.dic \
$(call gb_Executable_get_runtime_dependencies,gendict) \
| $(i18npool_BIDIR)/.dir
$(call gb_Output_announce,$(subst $(WORKDIR)/,,$@),$(true),DIC,1)
$(call gb_Trace_StartRange,$(subst $(WORKDIR)/,,$@),DIC)
$(call gb_Helper_abbreviate_dirs,\
$(call gb_Helper_execute,gendict) $< $@ $(patsubst $(i18npool_BIDIR)/dict_%.cxx,%,$@))
$(call gb_Trace_EndRange,$(subst $(WORKDIR)/,,$@),DIC)
endif
$(i18npool_BIDIR)/OpenOffice_dat.c
i18npool_BRKTXTS := \
count_word.brk \

View file

@ -1,18 +0,0 @@
# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
$(eval $(call gb_Library_Library,dict_ja))
$(eval $(call gb_Library_set_plugin_for_nodep,dict_ja,i18npool))
$(eval $(call gb_Library_add_generated_exception_objects,dict_ja,\
CustomTarget/i18npool/breakiterator/dict_ja \
))
# vim: set noet sw=4 ts=4:

View file

@ -1,18 +0,0 @@
# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
$(eval $(call gb_Library_Library,dict_zh))
$(eval $(call gb_Library_set_plugin_for_nodep,dict_zh,i18npool))
$(eval $(call gb_Library_add_generated_exception_objects,dict_zh,\
CustomTarget/i18npool/breakiterator/dict_zh \
))
# vim: set noet sw=4 ts=4:

View file

@ -51,7 +51,6 @@ $(eval $(call gb_Library_add_exception_objects,i18npool,\
i18npool/source/breakiterator/breakiterator_cjk \
i18npool/source/breakiterator/breakiteratorImpl \
i18npool/source/breakiterator/breakiterator_unicode \
i18npool/source/breakiterator/xdictionary \
i18npool/source/calendar/calendarImpl \
i18npool/source/calendar/calendar_gregorian \
i18npool/source/calendar/calendar_hijri \

View file

@ -15,9 +15,6 @@ $(eval $(call gb_Module_add_targets,i18npool,\
CustomTarget_indexentry \
CustomTarget_localedata \
CustomTarget_textconversion \
$(if $(filter-out iOS ANDROID,$(OS)), \
Library_dict_ja \
Library_dict_zh) \
Library_i18npool \
Library_i18nsearch \
Library_localedata_en \
@ -29,7 +26,6 @@ $(eval $(call gb_Module_add_targets,i18npool,\
$(eval $(call gb_Module_add_targets_for_build,i18npool,\
Executable_gencoll_rule \
Executable_genconv_dict \
Executable_gendict \
Executable_genindex_data \
Executable_saxparser \
Rdb_saxparser \

View file

@ -6,19 +6,6 @@ code modification. (Wow, that is such marketing-speak...)
Specifically for locale data documentation please see `i18npool/source/localedata/data/locale.dtd`
On iOS we put the largest data generated here, the `dict_ja` and `dict_zh`
stuff, into separate files and not into code to keep the size of an
app binary down. Temporary test code:
static bool beenhere = false;
if (!beenhere) {
beenhere = true;
uno::Reference< uno::XComponentContext > xComponentContext(::cppu::defaultBootstrap_InitialComponentContext());
uno::Reference< lang::XMultiComponentFactory > xMultiComponentFactoryClient( xComponentContext->getServiceManager() );
uno::Reference< uno::XInterface > xInterface =
xMultiComponentFactoryClient->createInstanceWithContext( "com.sun.star.i18n.BreakIterator_ja", xComponentContext );
}
## See Also
<http://wiki.documentfoundation.org/Category:I18n>

View file

@ -19,7 +19,6 @@
#pragma once
#include "breakiterator_unicode.hxx"
#include "xdictionary.hxx"
#include <optional>
#include <memory>
@ -31,19 +30,12 @@ class BreakIterator_CJK : public BreakIterator_Unicode
public:
BreakIterator_CJK();
css::i18n::Boundary SAL_CALL nextWord( const OUString& Text, sal_Int32 nStartPos,
const css::lang::Locale& nLocale, sal_Int16 WordType) override;
css::i18n::Boundary SAL_CALL previousWord( const OUString& Text, sal_Int32 nStartPos,
const css::lang::Locale& nLocale, sal_Int16 WordType) override;
css::i18n::Boundary SAL_CALL getWordBoundary( const OUString& Text, sal_Int32 nPos,
const css::lang::Locale& nLocale, sal_Int16 WordType, sal_Bool bDirection ) override;
css::i18n::LineBreakResults SAL_CALL getLineBreak( const OUString& Text, sal_Int32 nStartPos,
const css::lang::Locale& nLocale, sal_Int32 nMinBreakPos,
const css::i18n::LineBreakHyphenationOptions& hOptions,
const css::i18n::LineBreakUserOptions& bOptions ) override;
protected:
std::optional<xdictionary> m_oDict;
OUString hangingCharacters;
};

View file

@ -1,94 +0,0 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#pragma once

#include <osl/file.h>
#include <sal/types.h>

#include <com/sun/star/i18n/Boundary.hpp>

namespace i18npool {

#define CACHE_MAX 32 // max cache structure number
#define DEFAULT_SIZE 256 // for boundary size, to avoid alloc and release memory

// cache structure.
// Holds the word-boundary analysis of one previously seen text segment so
// repeated break queries over the same text can skip the dictionary lookup.
struct WordBreakCache {
    sal_Unicode *contents; // separated segment contents.
    sal_Int32* wordboundary; // word boundaries in segments.
    sal_Int32 length; // contents length saved here.
    sal_Int32 size; // size of wordboundary

    WordBreakCache();
    bool equals(const sal_Unicode *str, css::i18n::Boundary const & boundary) const; // checking cached string
};

// Non-owning pointers into one loaded dictionary: a code-unit presence
// bitmask (existMark) plus a two-level index (index1/index2) into the
// packed word data (lenArray/dataArea). All members stay null if the
// dictionary could not be loaded.
struct xdictionarydata
{
    const sal_uInt8 * existMark;
    const sal_Int16 * index1;
    const sal_Int32 * index2;
    const sal_Int32 * lenArray;
    const sal_Unicode* dataArea;
    xdictionarydata() :
        existMark( nullptr ),
        index1( nullptr ),
        index2( nullptr ),
        lenArray( nullptr ),
        dataArea( nullptr )
    {
    }
};

// Dictionary-based CJK word segmentation. Loads the generated dictionary
// for a language code ("ja" or "zh") and answers longest-match word
// boundary queries, caching results per text segment.
class xdictionary
{
private:
    xdictionarydata data;
    void initDictionaryData(const char *lang);

    css::i18n::Boundary boundary; // storage for the boundary returned by reference
    bool japaneseWordBreak; // Japanese-specific break handling, enabled via setJapaneseWordBreak()

#ifdef DICT_JA_ZH_IN_DATAFILE
    // In this mode the dictionary is mmapped from a separate data file
    // instead of being linked in as code (keeps app binaries small).
    oslFileHandle m_aFileHandle;
    sal_uInt64 m_nFileSize;
    char* m_pMapping;
#endif

public:
    xdictionary(const char *lang);
    ~xdictionary();
    css::i18n::Boundary nextWord( const OUString& rText, sal_Int32 nPos, sal_Int16 wordType);
    css::i18n::Boundary previousWord( const OUString& rText, sal_Int32 nPos, sal_Int16 wordType);
    css::i18n::Boundary const & getWordBoundary( const OUString& rText, sal_Int32 nPos, sal_Int16 wordType, bool bDirection );
    void setJapaneseWordBreak();

private:
    WordBreakCache cache[CACHE_MAX]; // per-segment analysis cache
    OUString segmentCachedString; // text the cached segment below belongs to
    css::i18n::Boundary segmentCachedBoundary; // boundary of that cached segment

    bool seekSegment(const OUString& rText, sal_Int32 pos, css::i18n::Boundary& boundary);
    WordBreakCache& getCache(const sal_Unicode *text, css::i18n::Boundary const & boundary);
    bool exists(const sal_uInt32 u) const;
    sal_Int32 getLongestMatch(const sal_Unicode *text, sal_Int32 len) const;
};

}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

View file

@ -50,7 +50,7 @@ public:
void testLegacyDictWordPrepostDash_sv_SE();
void testLegacyHebrewQuoteInsideWord();
void testLegacySurrogatePairs();
void testLegacyWordCountCompat();
void testWordCount();
CPPUNIT_TEST_SUITE(TestBreakIterator);
CPPUNIT_TEST(testLineBreaking);
@ -73,7 +73,7 @@ public:
CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE);
CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord);
CPPUNIT_TEST(testLegacySurrogatePairs);
CPPUNIT_TEST(testLegacyWordCountCompat);
CPPUNIT_TEST(testWordCount);
CPPUNIT_TEST_SUITE_END();
private:
@ -930,6 +930,22 @@ void TestBreakIterator::testWordBoundaries()
CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
}
// tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based analysis
{
aLocale.Language = "ja";
aLocale.Country = "JP";
static constexpr OUString aTest = u"通産省工業技術院北海道工業開発試験所"_ustr;
aBounds
= m_xBreak->getWordBoundary(aTest, 9, aLocale, i18n::WordType::DICTIONARY_WORD, false);
// When using the old LO custom dictionaries, this will select the entire phrase.
// When using ICU, it will select only 北海道.
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
}
}
void TestBreakIterator::testSentenceBoundaries()
@ -1399,12 +1415,12 @@ void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > co
i18n::Boundary aBounds;
{
static constexpr OUStringLiteral aTest = u"\u30B7\u30E3\u30C3\u30C8\u30C0\u30A6\u30F3";
static constexpr OUString aTest = u"シャットダウン"_ustr;
aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
i18n::WordType::DICTIONARY_WORD, true);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
}
@ -1570,8 +1586,7 @@ void TestBreakIterator::testLegacySurrogatePairs()
//
// BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test.
{
const sal_Unicode buf[] = { u"X 𠮟 X" };
OUString aTest(buf, SAL_N_ELEMENTS(buf));
static constexpr OUString aTest = u"X 𠮟 X"_ustr;
auto aBounds
= m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
@ -1581,7 +1596,7 @@ void TestBreakIterator::testLegacySurrogatePairs()
aBounds
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
aBounds
= m_xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, false);
@ -1590,16 +1605,44 @@ void TestBreakIterator::testLegacySurrogatePairs()
}
}
void TestBreakIterator::testLegacyWordCountCompat()
void TestBreakIterator::testWordCount()
{
lang::Locale aLocale;
auto count_words_fn = [&](const OUString& str, const lang::Locale& aLocale) -> int
{
int num_words = 0;
sal_Int32 next_pos = 0;
int iter_guard = 0;
aLocale.Language = "en";
aLocale.Country = "US";
if (m_xBreak->isBeginWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT))
{
++num_words;
}
while (true)
{
CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100);
auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT);
if (aBounds.endPos < next_pos || aBounds.startPos == aBounds.endPos)
{
break;
}
next_pos = aBounds.endPos;
++num_words;
}
return num_words;
};
// i#80815: "Word count differs from MS Word"
// This is a characteristic test for word count using test data from the linked bug.
{
lang::Locale aLocale;
aLocale.Language = "en";
aLocale.Country = "US";
const OUString str = u""
"test data for word count issue #80815\n"
"fo\\\'sforos\n"
@ -1622,25 +1665,18 @@ void TestBreakIterator::testLegacyWordCountCompat()
"aaaaaaa,aaaaaaa\n"
"aaaaaaa;aaaaaaa\n"_ustr;
int num_words = 0;
sal_Int32 next_pos = 0;
int iter_guard = 0;
while (true)
{
CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100);
CPPUNIT_ASSERT_EQUAL(24, count_words_fn(str, aLocale));
}
auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT);
// Test that the switch to upstream ICU for CJ word boundary analysis doesn't change word count.
{
lang::Locale aLocale;
aLocale.Language = "ja";
aLocale.Country = "JP";
if (aBounds.endPos < next_pos)
{
break;
}
const OUString str = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
next_pos = aBounds.endPos;
++num_words;
}
CPPUNIT_ASSERT_EQUAL(23, num_words);
CPPUNIT_ASSERT_EQUAL(7, count_words_fn(str, aLocale));
}
}

View file

@ -37,54 +37,6 @@ BreakIterator_CJK::BreakIterator_CJK()
cBreakIterator = u"com.sun.star.i18n.BreakIterator_CJK"_ustr;
}
Boundary SAL_CALL
BreakIterator_CJK::previousWord(const OUString& text, sal_Int32 anyPos,
const css::lang::Locale& nLocale, sal_Int16 wordType)
{
if (m_oDict) {
result = m_oDict->previousWord(text, anyPos, wordType);
// #109813# for non-CJK, single character word, fallback to ICU breakiterator.
if (result.endPos - result.startPos != 1 ||
getScriptType(text, result.startPos) == ScriptType::ASIAN)
return result;
result = BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
if (result.endPos < anyPos)
return result;
}
return BreakIterator_Unicode::previousWord(text, anyPos, nLocale, wordType);
}
Boundary SAL_CALL
BreakIterator_CJK::nextWord(const OUString& text, sal_Int32 anyPos,
const css::lang::Locale& nLocale, sal_Int16 wordType)
{
if (m_oDict) {
result = m_oDict->nextWord(text, anyPos, wordType);
// #109813# for non-CJK, single character word, fallback to ICU breakiterator.
if (result.endPos - result.startPos != 1 ||
getScriptType(text, result.startPos) == ScriptType::ASIAN)
return result;
result = BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
if (result.startPos > anyPos)
return result;
}
return BreakIterator_Unicode::nextWord(text, anyPos, nLocale, wordType);
}
Boundary SAL_CALL
BreakIterator_CJK::getWordBoundary( const OUString& text, sal_Int32 anyPos,
const css::lang::Locale& nLocale, sal_Int16 wordType, sal_Bool bDirection )
{
if (m_oDict) {
result = m_oDict->getWordBoundary(text, anyPos, wordType, bDirection);
// #109813# for non-CJK, single character word, fallback to ICU breakiterator.
if (result.endPos - result.startPos != 1 ||
getScriptType(text, result.startPos) == ScriptType::ASIAN)
return result;
}
return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
}
namespace {
bool isHangul( sal_Unicode cCh )
{
@ -143,7 +95,6 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
// ----------------------------------------------------;
BreakIterator_zh::BreakIterator_zh()
{
m_oDict.emplace("zh");
assert(hangingCharacters.pData);
hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("zh", "CN"));
cBreakIterator = u"com.sun.star.i18n.BreakIterator_zh"_ustr;
@ -154,7 +105,6 @@ BreakIterator_zh::BreakIterator_zh()
// ----------------------------------------------------;
BreakIterator_zh_TW::BreakIterator_zh_TW()
{
m_oDict.emplace("zh");
assert(hangingCharacters.pData);
hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("zh", "TW"));
cBreakIterator = u"com.sun.star.i18n.BreakIterator_zh_TW"_ustr;
@ -165,8 +115,6 @@ BreakIterator_zh_TW::BreakIterator_zh_TW()
// ----------------------------------------------------;
BreakIterator_ja::BreakIterator_ja()
{
m_oDict.emplace("ja");
m_oDict->setJapaneseWordBreak();
assert(hangingCharacters.pData);
hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("ja", "JP"));
cBreakIterator = u"com.sun.star.i18n.BreakIterator_ja"_ustr;

View file

@ -74,6 +74,16 @@ class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
};
// Returns true for locales where ICU's word segmentation is
// dictionary-based rather than purely rule-based (languages without
// space-delimited words); for these we must not install custom
// rule-based iterators.
bool locale_requires_dictionary_iterator(const css::lang::Locale& rLocale)
{
    return rLocale.Language == "bo" || // Tibetan
           rLocale.Language == "dz" || // Dzongkha
           rLocale.Language == "ja" || // Japanese
           rLocale.Language == "km" || // Khmer
           rLocale.Language == "lo" || // Lao
           rLocale.Language == "th" || // Thai
           rLocale.Language == "zh"; // Chinese
}
}
// loading ICU breakiterator on demand.
@ -179,8 +189,7 @@ void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocal
rbi.reset();
}
}
//use icu's breakiterator for Thai, Tibetan and Dzongkha
else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
else if(!locale_requires_dictionary_iterator(rLocale))
{
// language;rule (not langtag, unless we'd actually load such)
OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,340 +0,0 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <sal/main.h>
#include <sal/types.h>
#include <rtl/ustring.hxx>
#include <osl/diagnose.h>
#include <vector>
using std::vector;
// For iOS, where we must strive for a minimal executable size, we
// keep the data produced by this utility not as large const tables in
// source code but instead as separate data files, to be bundled with
// an app, and mmapped in at run time.
// To test this easier on a desktop OS, just make sure
// DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
#ifdef DICT_JA_ZH_IN_DATAFILE
static sal_Int64 dataAreaOffset = 0;
static sal_Int64 lenArrayOffset = 0;
static sal_Int64 index1Offset = 0;
static sal_Int64 index2Offset = 0;
static sal_Int64 existMarkOffset = 0;
#endif
/* Utility gendict:
"BreakIterator_CJK provides input string caching and dictionary searching for
longest matching. You can provide a sorted dictionary (the encoding must be
UTF-8) by creating the following file:
i18npool/source/breakiterator/data/<language>.dict.
The utility gendict will convert the file to C code, which will be compiled
into a shared library for dynamic loading.
All dictionary searching and loading is performed in the xdictionary class.
The only thing you need to do is to derive your class from BreakIterator_CJK
and create an instance of the xdictionary with the language name and
pass it to the parent class." (from https://wiki.documentfoundation.org/
Documentation/DevGuide/Office_Development#Implementing_a_New_Locale - 27/01/2011)
*/
// C-standard guarantees that static variables are automatically initialized to 0
// One bit per UTF-16 code unit (0x10000 bits = 0x2000 bytes): set when the
// code unit occurs anywhere in the dictionary.
static sal_uInt8 exists[0x2000];
// For each code unit that can start a word: index of its first entry in
// the lenArray built by printDataArea (0 = no words start with it).
static sal_uInt32 charArray[0x10000];

// Mark the given UTF-16 code unit as present in the dictionary.
static void set_exists(sal_uInt32 index)
{
    exists[index>>3] |= 1 << (index & 0x07);
}
// Write the boilerplate header of the generated C++ source file.
// In DICT_JA_ZH_IN_DATAFILE mode a raw data file is produced instead,
// so nothing is written.
static void printIncludes(FILE* source_fp)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp);
    fputs("#include <sal/types.h>\n\n", source_fp);
#else
    (void) source_fp;
#endif
}
// Write the exported accessor functions for the generated tables.
// With DISABLE_DYNLOADING the symbols carry the language code as a suffix
// so several dictionaries can be statically linked into one binary.
// In DICT_JA_ZH_IN_DATAFILE mode no code is generated at all.
static void printFunctions(FILE* source_fp, const char *lang)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp);
    fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp);
    fputs ("#else\n", source_fp);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang);
    fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang);
    fputs ("#endif\n", source_fp);
#else
    (void) source_fp;
    (void) lang;
#endif
}
// Read the sorted UTF-8 dictionary line by line and emit the packed
// dataArea (every word with its first code unit stripped).  Side effects:
// appends each word's start offset to 'lenArray', records in the
// file-scope charArray where each first character's word group begins,
// and marks every used code unit via set_exists().
static void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
{
    // generate main dict. data array
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
#else
    dataAreaOffset = ftell(source_fp);
#endif
    char str[1024];
    sal_uInt32 lenArrayCurr = 0;
    sal_Unicode current = 0;
    while (fgets(str, 1024, dictionary_fp)) {
        // input file is in UTF-8 encoding
        // don't convert last new line character to Ostr.
        OUString Ostr(str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
        const sal_Int32 len = Ostr.getLength();
        sal_Int32 i=0;
        Ostr.iterateCodePoints(&i);
        if (len == i)
            continue; // skip one character word
        if (Ostr[0] != current) {
            // new first character: record where its word group starts
            OSL_ENSURE( (Ostr[0] > current), "Dictionary file should be sorted");
            current = Ostr[0];
            charArray[current] = lenArray.size();
        }
        lenArray.push_back(lenArrayCurr);
        set_exists(Ostr[0]);
        // first character is stored in charArray, so start from second
        for (i = 1; i < len; i++, lenArrayCurr++) {
            set_exists(Ostr[i]);
#ifndef DICT_JA_ZH_IN_DATAFILE
            fprintf(source_fp, "0x%04x, ", Ostr[i]);
            if ((lenArrayCurr & 0x0f) == 0x0f)
                fputs("\n\t", source_fp);
#else
            sal_Unicode x = Ostr[i];
            fwrite(&x, sizeof(Ostr[i]), 1, source_fp);
#endif
        }
    }
    charArray[current+1] = lenArray.size();
    lenArray.push_back( lenArrayCurr ); // store last ending pointer
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("\n};\n", source_fp);
#endif
}
// Emit the lenArray table: cumulative offsets into dataArea marking where
// each dictionary word starts/ends.  A leading 0 is inserted because
// index2 stores (offset + 1), reserving 0 as the "no entry" marker.
static void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
    fprintf(source_fp, "0x%x, ", 0); // insert one slot for skipping 0 in index2 array.
#else
    lenArrayOffset = ftell(source_fp);
    sal_uInt32 zero(0);
    fwrite(&zero, sizeof(zero), 1, source_fp);
#endif
    for (size_t k = 0; k < lenArray.size(); k++)
    {
        if( !(k & 0xf) )
            fputs("\n\t", source_fp);
#ifndef DICT_JA_ZH_IN_DATAFILE
        fprintf(source_fp, "0x%" SAL_PRIxUINT32 ", ", lenArray[k]);
#else
        fwrite(&lenArray[k], sizeof(lenArray[k]), 1, source_fp);
#endif
    }
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("\n};\n", source_fp );
#endif
}
/* FIXME?: what happens if in every range i there is at least one charArray != 0
   => this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
   => then in index2, the last range will be ignored incorrectly */
// Emit index1: for each high byte of a code unit, either a sequential
// block number addressing index2, or 0xff when no dictionary word starts
// in that 256-character range.  The computed values are also returned
// through 'set' for use by printIndex2().
static void printIndex1(FILE *source_fp, sal_Int16 *set)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
#else
    index1Offset = ftell(source_fp);
#endif
    sal_Int16 count = 0;
    for (sal_Int32 i = 0; i < 0x100; i++) {
        sal_Int32 j = 0;
        // scan range [i<<8, (i+1)<<8) for any character that starts a word
        while( j < 0x100 && charArray[(i<<8) + j] == 0)
            j++;
        set[i] = (j < 0x100 ? count++ : 0xff);
#ifndef DICT_JA_ZH_IN_DATAFILE
        fprintf(source_fp, "0x%02x, ", set[i]);
        if ((i & 0x0f) == 0x0f)
            fputs ("\n\t", source_fp);
#else
        fwrite(&set[i], sizeof(set[i]), 1, source_fp);
#endif
    }
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("};\n", source_fp);
#endif
}
// Emit index2: for every 256-character range marked used in index1, one
// entry per character giving (offset into lenArray + 1) of the words
// starting with that character, or 0 for "none".  Characters that start
// no word are pointed at the next used position so range scans work.
static void printIndex2(FILE *source_fp, sal_Int16 const *set)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
#else
    index2Offset = ftell(source_fp);
#endif
    sal_Int32 prev = 0;
    for (sal_Int32 i = 0; i < 0x100; i++) {
        if (set[i] != 0xff) {
            for (sal_Int32 j = 0; j < 0x100; j++) {
                sal_Int32 k = (i<<8) + j;
                // after the first entry, skip forward over characters
                // that start no word
                if (prev != 0 )
                    while( k < 0x10000 && charArray[k] == 0 )
                        k++;
                prev = charArray[(i<<8) + j];
#ifndef DICT_JA_ZH_IN_DATAFILE
                fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
                if ((j & 0x0f) == 0x0f)
                    fputs ("\n\t", source_fp);
#else
                sal_uInt32 n = (k < 0x10000 ? charArray[k] + 1 : 0);
                fwrite(&n, sizeof(n), 1, source_fp);
#endif
            }
#ifndef DICT_JA_ZH_IN_DATAFILE
            fputs ("\n\t", source_fp);
#endif
        }
    }
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs ("\n};\n", source_fp);
#endif
}
/* Generates a bitmask for the existence of sal_Unicode values in dictionary;
   it packs 8 sal_Bool values in 1 sal_uInt8 */
// Dumps the 0x2000-byte file-scope 'exists' bitmask filled in by
// set_exists() during printDataArea().
static void printExistsMask(FILE *source_fp)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
#else
    existMarkOffset = ftell(source_fp);
#endif
    for (unsigned int i = 0; i < 0x2000; i++)
    {
#ifndef DICT_JA_ZH_IN_DATAFILE
        fprintf(source_fp, "0x%02x, ", exists[i]);
        if ( (i & 0xf) == 0xf )
            fputs("\n\t", source_fp);
#else
        fwrite(&exists[i], sizeof(exists[i]), 1, source_fp);
#endif
    }
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("\n};\n", source_fp);
#endif
}
// Entry point: gendict <dictionary_file> <output_file> <language_code>
// Reads the sorted UTF-8 dictionary and writes either generated C++
// tables or (in DICT_JA_ZH_IN_DATAFILE mode) a raw data file.
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
{
    FILE *dictionary_fp, *source_fp;

    // All three arguments are mandatory: printFunctions() reads argv[3]
    // unconditionally, so the previous check ("argc == 1 || argc > 4")
    // that tolerated argc == 2 or 3 ended up dereferencing the
    // guaranteed-null argv[argc] slot (and the argc == 2 "write to
    // stdout" path was unusable for the same reason).
    if (argc != 4)
    {
        fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
        exit(-1);
    }

    dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
    if (dictionary_fp == nullptr)
    {
        fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
        exit(1);
    }

    // create the C source file to write
    source_fp = fopen(argv[2], "wb");
    if (source_fp == nullptr) {
        fclose(dictionary_fp);
        fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
        exit(1);
    }

    vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
    sal_Int16 set[0x100];        // index1 block numbers, shared with printIndex2

    printIncludes(source_fp);
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("extern \"C\" {\n", source_fp);
#endif
    printDataArea(dictionary_fp, source_fp, lenArray);
    printLenArray(source_fp, lenArray);
    printIndex1(source_fp, set);
    printIndex2(source_fp, set);
    printExistsMask(source_fp);
    printFunctions(source_fp, argv[3]);
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("}\n", source_fp);
#else
    // Put pointers to the tables at the end of the file...
    fwrite(&dataAreaOffset, sizeof(dataAreaOffset), 1, source_fp);
    fwrite(&lenArrayOffset, sizeof(lenArrayOffset), 1, source_fp);
    fwrite(&index1Offset, sizeof(index1Offset), 1, source_fp);
    fwrite(&index2Offset, sizeof(index2Offset), 1, source_fp);
    fwrite(&existMarkOffset, sizeof(existMarkOffset), 1, source_fp);
#endif

    fclose(dictionary_fp);
    fclose(source_fp);
    return 0;
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

View file

@ -1,490 +0,0 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
#include <config_folders.h>
#include <o3tl/temporary.hxx>
#include <osl/file.h>
#include <osl/module.h>
#include <osl/mutex.hxx>
#include <rtl/bootstrap.hxx>
#include <com/sun/star/i18n/ScriptType.hpp>
#include <com/sun/star/i18n/WordType.hpp>
#include <xdictionary.hxx>
#include <unicode/uchar.h>
#include <string.h>
#include <breakiteratorImpl.hxx>
using namespace com::sun::star::i18n;
namespace i18npool {
#ifdef DICT_JA_ZH_IN_DATAFILE
#elif !defined DISABLE_DYNLOADING
extern "C" { static void thisModule() {} }
#else
extern "C" {
sal_uInt8* getExistMark_ja();
sal_Int16* getIndex1_ja();
sal_Int32* getIndex2_ja();
sal_Int32* getLenArray_ja();
sal_Unicode* getDataArea_ja();
sal_uInt8* getExistMark_zh();
sal_Int16* getIndex1_zh();
sal_Int32* getIndex2_zh();
sal_Int32* getLenArray_zh();
sal_Unicode* getDataArea_zh();
}
#endif
/**
 * Load the word-break dictionary for the given language, if one exists.
 *
 * Depending on the build configuration the dictionary data comes from one of:
 *  - DICT_JA_ZH_IN_DATAFILE: a memory-mapped "dict_*.data" file in the
 *    installation's share folder (part offsets are stored at the end of the
 *    file, see gendict.cxx),
 *  - the common case: a dynamically loaded per-language "dict_*" library
 *    (see initDictionaryData), or
 *  - DISABLE_DYNLOADING: generated getter functions linked in statically.
 *
 * If no dictionary is available for @p lang, the data pointers stay null and
 * dictionary lookups simply find nothing.
 *
 * @param lang  language code, e.g. "ja" or "zh"
 */
xdictionary::xdictionary(const char *lang) :
    japaneseWordBreak( false )
#ifdef DICT_JA_ZH_IN_DATAFILE
    , m_aFileHandle(nullptr),
    m_nFileSize(-1),
    m_pMapping(nullptr)
#endif
{
#ifdef DICT_JA_ZH_IN_DATAFILE
    if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
    {
        OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
        rtl::Bootstrap::expandMacros(sUrl);

        if( strcmp( lang, "ja" ) == 0 )
            sUrl += "ja.data";
        else if( strcmp( lang, "zh" ) == 0 )
            sUrl += "zh.data";

        if( osl_openFile( sUrl.pData, &m_aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
            osl_getFileSize( m_aFileHandle, &m_nFileSize) == osl_File_E_None &&
            osl_mapFile( m_aFileHandle, (void **) &m_pMapping, m_nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
        {
            // We have the offsets to the parts of the file at its end, see gendict.cxx
            sal_Int64 *pEOF = (sal_Int64*)(m_pMapping + m_nFileSize);

            data.existMark = (sal_uInt8*) (m_pMapping + pEOF[-1]);
            data.index2 = (sal_Int32*) (m_pMapping + pEOF[-2]);
            data.index1 = (sal_Int16*) (m_pMapping + pEOF[-3]);
            data.lenArray = (sal_Int32*) (m_pMapping + pEOF[-4]);
            data.dataArea = (sal_Unicode*) (m_pMapping + pEOF[-5]);
        }
    }
#elif !defined DISABLE_DYNLOADING
    initDictionaryData( lang );
#else
    if( strcmp( lang, "ja" ) == 0 ) {
        data.existMark = getExistMark_ja();
        data.index1 = getIndex1_ja();
        data.index2 = getIndex2_ja();
        data.lenArray = getLenArray_ja();
        data.dataArea = getDataArea_ja();
    }
    else if( strcmp( lang, "zh" ) == 0 ) {
        data.existMark = getExistMark_zh();
        data.index1 = getIndex1_zh();
        data.index2 = getIndex2_zh();
        data.lenArray = getLenArray_zh();
        data.dataArea = getDataArea_zh();
    }
#endif

    // Mark all word-break cache slots as unused; buffers are allocated
    // lazily in getCache().
    // (Note: japaneseWordBreak is already set to false in the initializer
    // list; the previous redundant re-assignment here has been dropped.)
    for (WordBreakCache & i : cache)
        i.size = 0;
}
/// Releases the word-break cache buffers and, when the dictionary was
/// memory-mapped from a data file, unmaps and closes that file.
xdictionary::~xdictionary()
{
    // Only slots that were ever populated (size > 0) own allocations.
    for (const WordBreakCache & rSlot : cache)
    {
        if (rSlot.size <= 0)
            continue;
        delete [] rSlot.contents;
        delete [] rSlot.wordboundary;
    }
#ifdef DICT_JA_ZH_IN_DATAFILE
    if (m_aFileHandle)
    {
        if (m_pMapping)
            osl_unmapMappedFile(m_aFileHandle, m_pMapping, m_nFileSize);
        osl_closeFile(m_aFileHandle);
    }
#endif
}
namespace {

// One entry in the process-global dictionary-module cache kept by
// xdictionary::initDictionaryData(): the loaded module handle, the language
// it was loaded for, and the resolved dictionary data pointers.  maData's
// pointers stay null when no module could be loaded for the language,
// serving as a negative cache entry.
struct datacache {
    oslModule       mhModule;  // handle of the loaded dict_* library, or null
    OString         maLang;    // language code this entry belongs to
    xdictionarydata maData;    // pointers into the module's dictionary arrays
};

}
#if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
void xdictionary::initDictionaryData(const char *pLang)
{
// Global cache, never released for performance
static std::vector< datacache > aLoadedCache;
osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
for(const datacache & i : aLoadedCache)
{
if( i.maLang == pLang )
{
data = i.maData;
return;
}
}
// otherwise add to the cache, positive or negative.
datacache aEntry;
aEntry.maLang = OString( pLang, strlen( pLang ) );
#ifdef SAL_DLLPREFIX
OString sModuleName = // mostly "lib*.so" (with * == dict_zh)
OString::Concat(SAL_DLLPREFIX "dict_") + pLang + SAL_DLLEXTENSION;
#else
OString sModuleName = // mostly "*.dll" (with * == dict_zh)
OString::Concat("dict_") + pLang + SAL_DLLEXTENSION;
#endif
aEntry.mhModule = osl_loadModuleRelativeAscii( &thisModule, sModuleName.getStr(), SAL_LOADMODULE_DEFAULT );
if( aEntry.mhModule ) {
oslGenericFunction func;
func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
}
data = aEntry.maData;
aLoadedCache.push_back( aEntry );
}
#endif
/** Enable the Japanese word-breaking fallback: exists() then also accepts
    any character of the ASIAN script class, and getCache() groups
    consecutive characters of the same Japanese script type into one word
    when no dictionary entry matches. */
void xdictionary::setJapaneseWordBreak()
{
    japaneseWordBreak = true;
}
/// Whether code point c belongs to a dictionary segment: either its bit is
/// set in the dictionary's existence bitmap, or (in Japanese mode) it is an
/// ASIAN-script character.
bool xdictionary::exists(const sal_uInt32 c) const
{
    // 0x1FFF is the hardcoded limit in gendict for data.existMarks
    if (data.existMark && (c >> 3) < 0x1FFF
        && (data.existMark[c >> 3] & (1 << (c & 0x07))) != 0)
        return true;

    if (japaneseWordBreak)
        return BreakIteratorImpl::getScriptClass(c) == css::i18n::ScriptType::ASIAN;

    return false;
}
/**
 * Return the length (in UTF-16 code units) of the longest dictionary entry
 * matching the text at @p str, or 0 if nothing matches.
 *
 * Dictionary layout, as used below: entries are grouped by their first
 * character.  index1 maps the high byte of the first character to a group
 * (0xFF meaning "no entries"), index2 maps the full first character to a
 * range of entry indices, and lenArray holds cumulative offsets into
 * dataArea, where the entries are stored without their first character.
 * Entries of a group are scanned from the back, so the first hit found is
 * presumably the longest (ordering established by gendict — not visible
 * here, TODO confirm).
 *
 * @param str   pointer to the first character of the candidate word
 * @param sLen  number of code units available at @p str
 */
sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) const
{
    // No dictionary loaded for this language.
    if ( !data.index1 ) return 0;

    sal_Int16 idx = data.index1[str[0] >> 8];
    if (idx == 0xFF) return 0;

    idx = (idx<<8) | (str[0]&0xff);

    sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
    if (begin == 0) return 0;

    str++; sLen--; // first character is not stored in the dictionary

    for (sal_uInt32 i = end; i > begin; i--) {
        // Length of entry i, from the cumulative offsets.
        sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
        if (sLen >= len) {
            const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
            sal_Int32 pos = 0;

            while (pos < len && dstr[pos] == str[pos]) { pos++; }

            if (pos == len)
                return len + 1; // +1 for the implicit first character
        }
    }
    return 0;
}
/*
 * c-tor: creates an empty, unused cache slot.  Buffers are allocated
 * lazily by xdictionary::getCache() when the slot is first populated.
 */
WordBreakCache::WordBreakCache() :
    contents( nullptr ),     // cached text span
    wordboundary( nullptr ), // word-start offsets within contents
    length( 0 ),             // number of code units currently cached
    size( 0 )                // buffer capacity; 0 marks the slot unused
{
}
/*
* Compare two unicode string,
*/
bool WordBreakCache::equals(const sal_Unicode* str, Boundary const & boundary) const
{
// Different length, different string.
if (length != boundary.endPos - boundary.startPos) return false;
for (sal_Int32 i = 0; i < length; i++)
if (contents[i] != str[i + boundary.startPos]) return false;
return true;
}
/*
 * Retrieve the segment containing the character at pos.
 *
 * A segment is a maximal run around pos of characters that are either
 * whitespace or present in the dictionary (see exists()); word breaking is
 * subsequently performed within this run only.  The most recently computed
 * segment is cached together with the string it was computed for, so
 * repeated calls over the same text can reuse it.
 *
 * @param pos : Position of the given character.
 * @return true if the segment spans more than a single code point, i.e.
 *         there is something to break.
 */
bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
    Boundary& segBoundary)
{
    sal_Int32 indexUtf16;

    if (segmentCachedString.pData != rText.pData) {
        // Cache the passed text so we can avoid regenerating the segment if it's the same
        // (pData is refcounted and assigning the OUString references it, which ensures that
        // the object is the same if we get the same pointer back later)
        segmentCachedString = rText;
    } else {
        // If pos is within the cached boundary, use that boundary
        if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
            segBoundary.startPos = segmentCachedBoundary.startPos;
            segBoundary.endPos = segmentCachedBoundary.endPos;
            indexUtf16 = segmentCachedBoundary.startPos;
            rText.iterateCodePoints(&indexUtf16);
            return segmentCachedBoundary.endPos > indexUtf16;
        }
    }

    segBoundary.endPos = segBoundary.startPos = pos;

    // Scan backwards from pos for the start of the run.
    indexUtf16 = pos;
    while (indexUtf16 > 0)
    {
        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
        if (u_isWhitespace(ch) || exists(ch))
            segBoundary.startPos = indexUtf16;
        else
            break;
    }

    // Scan forwards from pos for the end of the run.
    indexUtf16 = pos;
    while (indexUtf16 < rText.getLength())
    {
        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
        if (u_isWhitespace(ch) || exists(ch))
            segBoundary.endPos = indexUtf16;
        else
            break;
    }

    // Cache the calculated boundary
    segmentCachedBoundary.startPos = segBoundary.startPos;
    segmentCachedBoundary.endPos = segBoundary.endPos;

    indexUtf16 = segBoundary.startPos;
    rText.iterateCodePoints(&indexUtf16);
    return segBoundary.endPos > indexUtf16;
}
// Character classes used by the Japanese word-break fallback: when no
// dictionary entry matches, getCache() groups consecutive characters of the
// same class into one word.  constexpr constants replace the previous
// #define macros (same values, typed and scoped).
constexpr sal_Int16 KANJA = 1;    // kanji and any other character
constexpr sal_Int16 KATAKANA = 2; // katakana, incl. half-width forms
constexpr sal_Int16 HIRAKANA = 3; // hiragana

/** Classify a UTF-16 code unit for Japanese word breaking. */
static sal_Int16 JapaneseCharType(sal_Unicode c)
{
    if (0x3041 <= c && c <= 0x309e)
        return HIRAKANA;
    if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
        return KATAKANA;
    return KANJA;
}
/**
 * Return the word-break cache slot for the text span described by
 * wordBoundary, (re)computing the word boundaries when the slot does not
 * already hold exactly this text.
 *
 * Slots are selected by the low 5 bits of the span's first character.  On
 * return, rCache.wordboundary holds increasing offsets (relative to
 * wordBoundary.startPos) of word starts, with the entry after the last
 * boundary set to length + 1 as a terminator.
 */
WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary const & wordBoundary)
{
    WordBreakCache& rCache = cache[text[0] & 0x1f];

    if (rCache.size != 0 && rCache.equals(text, wordBoundary))
        return rCache;

    sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;

    // (Re)allocate the slot's buffers when missing or too small.
    if (rCache.size == 0 || len > rCache.size) {
        if (rCache.size != 0) {
            delete [] rCache.contents;
            delete [] rCache.wordboundary;
            rCache.size = len;
        }
        else
            rCache.size = std::max<sal_Int32>(len, DEFAULT_SIZE);
        rCache.contents = new sal_Unicode[rCache.size + 1];
        rCache.wordboundary = new sal_Int32[rCache.size + 2];
    }
    rCache.length = len;
    memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
    *(rCache.contents + len) = 0x0000; // NUL-terminate the cached copy
    // reset the wordboundary in cache
    memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));

    sal_Int32 i = 0;        // index of the last boundary written so far
    while (rCache.wordboundary[i] < rCache.length) {
        len = 0;
        // look the continuous white space as one word and cache it
        while (u_isWhitespace(static_cast<sal_uInt32>(text[wordBoundary.startPos + rCache.wordboundary[i] + len])))
            len ++;

        if (len == 0) {
            const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
            sal_Int32 slen = rCache.length - rCache.wordboundary[i];
            sal_Int16 type = 0, count = 0;
            // Try the dictionary at each position; in Japanese mode, fall
            // back to grouping consecutive characters of the same script
            // type (counted in 'count') when no entry matches.
            for (;len == 0 && slen > 0; str++, slen--) {
                len = getLongestMatch(str, slen);
                if (len == 0) {
                    if (!japaneseWordBreak) {
                        len = 1; // no match: single character is its own word
                    } else {
                        if (count == 0)
                            type = JapaneseCharType(*str);
                        else if (type != JapaneseCharType(*str))
                            break;
                        count++;
                    }
                }
            }
            if (count)
            {
                rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
                i++;
            }
        }

        if (len) {
            rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
            i++;
        }
    }
    rCache.wordboundary[i + 1] = rCache.length + 1; // terminator

    return rCache;
}
/**
 * Return the boundary of the word preceding anyPos, skipping any whitespace
 * directly before it.
 *
 * Bug fix: the original discarded the code point returned by the first
 * backward iterateCodePoints() call, so 'ch' stayed 0, u_isWhitespace(0) was
 * false, and the whitespace-skipping loop never executed.  The returned code
 * point is now captured, mirroring the forward logic in nextWord().
 */
Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
{
    // looking for the first non-whitespace character from anyPos
    sal_uInt32 ch = 0;
    if (anyPos > 0)
        ch = rText.iterateCodePoints(&anyPos, -1);

    while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);

    return getWordBoundary(rText, anyPos, wordType, true);
}
/// Return the boundary of the word following anyPos: find the current word,
/// skip any whitespace after it, and compute the boundary there.
Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
{
    boundary = getWordBoundary(rText, anyPos, wordType, true);
    anyPos = boundary.endPos;
    const sal_Int32 nTextLen = rText.getLength();
    if (anyPos < nTextLen)
    {
        // Advance past whitespace to the first character of the next word.
        sal_uInt32 nCh = rText.iterateCodePoints(&anyPos);
        while (u_isWhitespace(nCh) && (anyPos < nTextLen))
            nCh = rText.iterateCodePoints(&anyPos);
        // Step back onto that character, since the loop above consumed it.
        if (anyPos > 0)
            rText.iterateCodePoints(&anyPos, -1);
    }

    return getWordBoundary(rText, anyPos, wordType, true);
}
/**
 * Compute the boundary of the word at anyPos.
 *
 * Out-of-range positions yield an empty boundary clamped to the text.
 * Within a dictionary segment (see seekSegment), boundaries come from the
 * word-break cache; outside one, the single code point at anyPos forms the
 * "word".  For WORD_COUNT, trailing punctuation is folded into the word.
 *
 * @param bDirection  when false and anyPos sits exactly on a word start
 *                    preceded by whitespace, the preceding (whitespace)
 *                    "word" is returned instead.  (NOTE(review): semantics
 *                    inferred from the code below — confirm with callers.)
 * @return reference to the member 'boundary', which is overwritten.
 */
Boundary const & xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
{
    const sal_Unicode *text=rText.getStr();
    sal_Int32 len=rText.getLength();
    if (anyPos >= len || anyPos < 0) {
        // Out of range: empty boundary at the nearest end of the text.
        boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
    } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
        WordBreakCache& aCache = getCache(text, boundary);
        sal_Int32 i = 0;

        // Find the first cached boundary past anyPos (offsets are relative
        // to the segment start).
        while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;

        sal_Int32 startPos = aCache.wordboundary[i - 1];
        // if bDirection is false
        if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
        {
            // Read the code point at anyPos-1 without modifying anyPos.
            sal_uInt32 ch = rText.iterateCodePoints(&o3tl::temporary(sal_Int32(anyPos-1)));
            if (u_isWhitespace(ch))
                i--;
        }
        boundary.endPos = boundary.startPos;
        boundary.endPos += aCache.wordboundary[i];
        boundary.startPos += aCache.wordboundary[i-1];
    } else {
        // Not in a dictionary segment: the word is the single code point at
        // anyPos.
        boundary.startPos = anyPos;
        if (anyPos < len) rText.iterateCodePoints(&anyPos);
        boundary.endPos = std::min(anyPos, len);
    }
    if (wordType == WordType::WORD_COUNT) {
        // skip punctuation for word count.
        while (boundary.endPos < len)
        {
            sal_Int32 indexUtf16 = boundary.endPos;
            if (u_ispunct(rText.iterateCodePoints(&indexUtf16)))
                boundary.endPos = indexUtf16;
            else
                break;
        }
    }

    return boundary;
}
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */