tdf#49885 Updated CJK BreakIterator to use ICU
Previously, the CJK BreakIterator used custom dictionaries for Chinese and Japanese. This change removes these custom dictionaries in favor of the upstream ICU implementation, which uses an externally-maintained frequency dictionary for these languages. This change also removes support code for dictionary-based break iterators, as it is no longer used. Change-Id: I55c4ce9c842d1751997309fd7446e0a6917915dc Reviewed-on: https://gerrit.libreoffice.org/c/core/+/166136 Reviewed-by: Caolán McNamara <caolan.mcnamara@collabora.com> Tested-by: Jenkins Tested-by: Caolán McNamara <caolan.mcnamara@collabora.com>
This commit is contained in:
parent
d75a37a582
commit
14c6cde779
17 changed files with 75 additions and 372325 deletions
|
@ -701,8 +701,6 @@ $(eval $(call gb_Helper_register_libraries_for_install,PLAINLIBS_OOO,ooo, \
|
|||
))
|
||||
|
||||
$(eval $(call gb_Helper_register_plugins_for_install,PLAINLIBS_OOO,ooo, \
|
||||
dict_ja \
|
||||
dict_zh \
|
||||
localedata_en \
|
||||
localedata_es \
|
||||
localedata_euro \
|
||||
|
|
|
@ -30,8 +30,4 @@ $(eval $(call gb_CppunitTest_use_components,i18npool_break_iterator,\
|
|||
i18npool/util/i18npool \
|
||||
))
|
||||
|
||||
$(call gb_CppunitTest_get_target,i18npool_break_iterator) : \
|
||||
$(call gb_Library_get_target,dict_ja) \
|
||||
$(call gb_Library_get_target,dict_zh)
|
||||
|
||||
# vim: set noet sw=4 ts=4:
|
||||
|
|
|
@ -11,37 +11,8 @@ $(eval $(call gb_CustomTarget_CustomTarget,i18npool/breakiterator))
|
|||
|
||||
i18npool_BIDIR := $(call gb_CustomTarget_get_workdir,i18npool/breakiterator)
|
||||
|
||||
ifneq ($(filter iOS ANDROID,$(OS)),)
|
||||
|
||||
$(call gb_CustomTarget_get_target,i18npool/breakiterator) : \
|
||||
$(i18npool_BIDIR)/dict_ja.data $(i18npool_BIDIR)/dict_zh.data $(i18npool_BIDIR)/OpenOffice_dat.c
|
||||
|
||||
$(i18npool_BIDIR)/dict_%.data : \
|
||||
$(SRCDIR)/i18npool/source/breakiterator/data/%.dic \
|
||||
$(call gb_Executable_get_runtime_dependencies,gendict) \
|
||||
| $(i18npool_BIDIR)/.dir
|
||||
$(call gb_Output_announce,$(subst $(WORKDIR)/,,$@),$(true),DIC,1)
|
||||
$(call gb_Trace_StartRange,$(subst $(WORKDIR)/,,$@),DIC)
|
||||
$(call gb_Helper_abbreviate_dirs,\
|
||||
$(call gb_Helper_execute,gendict) $< $@ $(patsubst $(i18npool_BIDIR)/dict_%.cxx,%,$@))
|
||||
$(call gb_Trace_EndRange,$(subst $(WORKDIR)/,,$@),DIC)
|
||||
|
||||
else # !iOS ANDROID
|
||||
|
||||
$(call gb_CustomTarget_get_target,i18npool/breakiterator) : \
|
||||
$(i18npool_BIDIR)/dict_ja.cxx $(i18npool_BIDIR)/dict_zh.cxx $(i18npool_BIDIR)/OpenOffice_dat.c
|
||||
|
||||
$(i18npool_BIDIR)/dict_%.cxx : \
|
||||
$(SRCDIR)/i18npool/source/breakiterator/data/%.dic \
|
||||
$(call gb_Executable_get_runtime_dependencies,gendict) \
|
||||
| $(i18npool_BIDIR)/.dir
|
||||
$(call gb_Output_announce,$(subst $(WORKDIR)/,,$@),$(true),DIC,1)
|
||||
$(call gb_Trace_StartRange,$(subst $(WORKDIR)/,,$@),DIC)
|
||||
$(call gb_Helper_abbreviate_dirs,\
|
||||
$(call gb_Helper_execute,gendict) $< $@ $(patsubst $(i18npool_BIDIR)/dict_%.cxx,%,$@))
|
||||
$(call gb_Trace_EndRange,$(subst $(WORKDIR)/,,$@),DIC)
|
||||
|
||||
endif
|
||||
$(i18npool_BIDIR)/OpenOffice_dat.c
|
||||
|
||||
i18npool_BRKTXTS := \
|
||||
count_word.brk \
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
|
||||
#
|
||||
# This file is part of the LibreOffice project.
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
|
||||
$(eval $(call gb_Library_Library,dict_ja))
|
||||
|
||||
$(eval $(call gb_Library_set_plugin_for_nodep,dict_ja,i18npool))
|
||||
|
||||
$(eval $(call gb_Library_add_generated_exception_objects,dict_ja,\
|
||||
CustomTarget/i18npool/breakiterator/dict_ja \
|
||||
))
|
||||
|
||||
# vim: set noet sw=4 ts=4:
|
|
@ -1,18 +0,0 @@
|
|||
# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t -*-
|
||||
#
|
||||
# This file is part of the LibreOffice project.
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
#
|
||||
|
||||
$(eval $(call gb_Library_Library,dict_zh))
|
||||
|
||||
$(eval $(call gb_Library_set_plugin_for_nodep,dict_zh,i18npool))
|
||||
|
||||
$(eval $(call gb_Library_add_generated_exception_objects,dict_zh,\
|
||||
CustomTarget/i18npool/breakiterator/dict_zh \
|
||||
))
|
||||
|
||||
# vim: set noet sw=4 ts=4:
|
|
@ -51,7 +51,6 @@ $(eval $(call gb_Library_add_exception_objects,i18npool,\
|
|||
i18npool/source/breakiterator/breakiterator_cjk \
|
||||
i18npool/source/breakiterator/breakiteratorImpl \
|
||||
i18npool/source/breakiterator/breakiterator_unicode \
|
||||
i18npool/source/breakiterator/xdictionary \
|
||||
i18npool/source/calendar/calendarImpl \
|
||||
i18npool/source/calendar/calendar_gregorian \
|
||||
i18npool/source/calendar/calendar_hijri \
|
||||
|
|
|
@ -15,9 +15,6 @@ $(eval $(call gb_Module_add_targets,i18npool,\
|
|||
CustomTarget_indexentry \
|
||||
CustomTarget_localedata \
|
||||
CustomTarget_textconversion \
|
||||
$(if $(filter-out iOS ANDROID,$(OS)), \
|
||||
Library_dict_ja \
|
||||
Library_dict_zh) \
|
||||
Library_i18npool \
|
||||
Library_i18nsearch \
|
||||
Library_localedata_en \
|
||||
|
@ -29,7 +26,6 @@ $(eval $(call gb_Module_add_targets,i18npool,\
|
|||
$(eval $(call gb_Module_add_targets_for_build,i18npool,\
|
||||
Executable_gencoll_rule \
|
||||
Executable_genconv_dict \
|
||||
Executable_gendict \
|
||||
Executable_genindex_data \
|
||||
Executable_saxparser \
|
||||
Rdb_saxparser \
|
||||
|
|
|
@ -6,19 +6,6 @@ code modification. (Wow, that is such marketing-speak...)
|
|||
|
||||
Specifically for locale data documentation please see `i18npool/source/localedata/data/locale.dtd`
|
||||
|
||||
On iOS we put the largest data generated here, the `dict_ja` and `dict_zh`
|
||||
stuff, into separate files and not into code to keep the size of an
|
||||
app binary down. Temporary test code:
|
||||
|
||||
static bool beenhere = false;
|
||||
if (!beenhere) {
|
||||
beenhere = true;
|
||||
uno::Reference< uno::XComponentContext > xComponentContext(::cppu::defaultBootstrap_InitialComponentContext());
|
||||
uno::Reference< lang::XMultiComponentFactory > xMultiComponentFactoryClient( xComponentContext->getServiceManager() );
|
||||
uno::Reference< uno::XInterface > xInterface =
|
||||
xMultiComponentFactoryClient->createInstanceWithContext( "com.sun.star.i18n.BreakIterator_ja", xComponentContext );
|
||||
}
|
||||
|
||||
## See Also
|
||||
|
||||
<http://wiki.documentfoundation.org/Category:I18n>
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "breakiterator_unicode.hxx"
|
||||
#include "xdictionary.hxx"
|
||||
#include <optional>
|
||||
#include <memory>
|
||||
|
||||
|
@ -31,19 +30,12 @@ class BreakIterator_CJK : public BreakIterator_Unicode
|
|||
public:
|
||||
BreakIterator_CJK();
|
||||
|
||||
css::i18n::Boundary SAL_CALL nextWord( const OUString& Text, sal_Int32 nStartPos,
|
||||
const css::lang::Locale& nLocale, sal_Int16 WordType) override;
|
||||
css::i18n::Boundary SAL_CALL previousWord( const OUString& Text, sal_Int32 nStartPos,
|
||||
const css::lang::Locale& nLocale, sal_Int16 WordType) override;
|
||||
css::i18n::Boundary SAL_CALL getWordBoundary( const OUString& Text, sal_Int32 nPos,
|
||||
const css::lang::Locale& nLocale, sal_Int16 WordType, sal_Bool bDirection ) override;
|
||||
css::i18n::LineBreakResults SAL_CALL getLineBreak( const OUString& Text, sal_Int32 nStartPos,
|
||||
const css::lang::Locale& nLocale, sal_Int32 nMinBreakPos,
|
||||
const css::i18n::LineBreakHyphenationOptions& hOptions,
|
||||
const css::i18n::LineBreakUserOptions& bOptions ) override;
|
||||
|
||||
protected:
|
||||
std::optional<xdictionary> m_oDict;
|
||||
OUString hangingCharacters;
|
||||
};
|
||||
|
||||
|
|
|
@ -1,94 +0,0 @@
|
|||
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
||||
/*
|
||||
* This file is part of the LibreOffice project.
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* This file incorporates work covered by the following license notice:
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed
|
||||
* with this work for additional information regarding copyright
|
||||
* ownership. The ASF licenses this file to you under the Apache
|
||||
* License, Version 2.0 (the "License"); you may not use this file
|
||||
* except in compliance with the License. You may obtain a copy of
|
||||
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <osl/file.h>
|
||||
#include <sal/types.h>
|
||||
|
||||
#include <com/sun/star/i18n/Boundary.hpp>
|
||||
|
||||
namespace i18npool {
|
||||
|
||||
#define CACHE_MAX 32 // max cache structure number
|
||||
#define DEFAULT_SIZE 256 // for boundary size, to avoid alloc and release memory
|
||||
|
||||
// cache structure.
|
||||
struct WordBreakCache {
|
||||
sal_Unicode *contents; // separated segment contents.
|
||||
sal_Int32* wordboundary; // word boundaries in segments.
|
||||
sal_Int32 length; // contents length saved here.
|
||||
sal_Int32 size; // size of wordboundary
|
||||
|
||||
WordBreakCache();
|
||||
bool equals(const sal_Unicode *str, css::i18n::Boundary const & boundary) const; // checking cached string
|
||||
};
|
||||
|
||||
struct xdictionarydata
|
||||
{
|
||||
const sal_uInt8 * existMark;
|
||||
const sal_Int16 * index1;
|
||||
const sal_Int32 * index2;
|
||||
const sal_Int32 * lenArray;
|
||||
const sal_Unicode* dataArea;
|
||||
xdictionarydata() :
|
||||
existMark( nullptr ),
|
||||
index1( nullptr ),
|
||||
index2( nullptr ),
|
||||
lenArray( nullptr ),
|
||||
dataArea( nullptr )
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
class xdictionary
|
||||
{
|
||||
private:
|
||||
xdictionarydata data;
|
||||
void initDictionaryData(const char *lang);
|
||||
|
||||
css::i18n::Boundary boundary;
|
||||
bool japaneseWordBreak;
|
||||
#ifdef DICT_JA_ZH_IN_DATAFILE
|
||||
oslFileHandle m_aFileHandle;
|
||||
sal_uInt64 m_nFileSize;
|
||||
char* m_pMapping;
|
||||
#endif
|
||||
|
||||
public:
|
||||
xdictionary(const char *lang);
|
||||
~xdictionary();
|
||||
css::i18n::Boundary nextWord( const OUString& rText, sal_Int32 nPos, sal_Int16 wordType);
|
||||
css::i18n::Boundary previousWord( const OUString& rText, sal_Int32 nPos, sal_Int16 wordType);
|
||||
css::i18n::Boundary const & getWordBoundary( const OUString& rText, sal_Int32 nPos, sal_Int16 wordType, bool bDirection );
|
||||
void setJapaneseWordBreak();
|
||||
|
||||
private:
|
||||
WordBreakCache cache[CACHE_MAX];
|
||||
OUString segmentCachedString;
|
||||
css::i18n::Boundary segmentCachedBoundary;
|
||||
|
||||
bool seekSegment(const OUString& rText, sal_Int32 pos, css::i18n::Boundary& boundary);
|
||||
WordBreakCache& getCache(const sal_Unicode *text, css::i18n::Boundary const & boundary);
|
||||
bool exists(const sal_uInt32 u) const;
|
||||
sal_Int32 getLongestMatch(const sal_Unicode *text, sal_Int32 len) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
|
@ -50,7 +50,7 @@ public:
|
|||
void testLegacyDictWordPrepostDash_sv_SE();
|
||||
void testLegacyHebrewQuoteInsideWord();
|
||||
void testLegacySurrogatePairs();
|
||||
void testLegacyWordCountCompat();
|
||||
void testWordCount();
|
||||
|
||||
CPPUNIT_TEST_SUITE(TestBreakIterator);
|
||||
CPPUNIT_TEST(testLineBreaking);
|
||||
|
@ -73,7 +73,7 @@ public:
|
|||
CPPUNIT_TEST(testLegacyDictWordPrepostDash_sv_SE);
|
||||
CPPUNIT_TEST(testLegacyHebrewQuoteInsideWord);
|
||||
CPPUNIT_TEST(testLegacySurrogatePairs);
|
||||
CPPUNIT_TEST(testLegacyWordCountCompat);
|
||||
CPPUNIT_TEST(testWordCount);
|
||||
CPPUNIT_TEST_SUITE_END();
|
||||
|
||||
private:
|
||||
|
@ -930,6 +930,22 @@ void TestBreakIterator::testWordBoundaries()
|
|||
CPPUNIT_ASSERT_EQUAL(sal_Int32(31), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(37), aBounds.endPos);
|
||||
}
|
||||
|
||||
// tdf#49885: Upgrade CJ word boundary analysis to ICU frequency-based analysis
|
||||
{
|
||||
aLocale.Language = "ja";
|
||||
aLocale.Country = "JP";
|
||||
|
||||
static constexpr OUString aTest = u"通産省工業技術院北海道工業開発試験所"_ustr;
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aTest, 9, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
||||
|
||||
// When using the old LO custom dictionaries, this will select the entire phrase.
|
||||
// When using ICU, it will select only 北海道.
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(8), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(11), aBounds.endPos);
|
||||
}
|
||||
}
|
||||
|
||||
void TestBreakIterator::testSentenceBoundaries()
|
||||
|
@ -1399,12 +1415,12 @@ void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > co
|
|||
i18n::Boundary aBounds;
|
||||
|
||||
{
|
||||
static constexpr OUStringLiteral aTest = u"\u30B7\u30E3\u30C3\u30C8\u30C0\u30A6\u30F3";
|
||||
static constexpr OUString aTest = u"シャットダウン"_ustr;
|
||||
|
||||
aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
|
||||
i18n::WordType::DICTIONARY_WORD, true);
|
||||
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(7), aBounds.endPos);
|
||||
}
|
||||
|
||||
|
@ -1570,8 +1586,7 @@ void TestBreakIterator::testLegacySurrogatePairs()
|
|||
//
|
||||
// BreakIterator supports surrogate pairs (UTF-16). This is a simple characteristic test.
|
||||
{
|
||||
const sal_Unicode buf[] = { u"X 𠮟 X" };
|
||||
OUString aTest(buf, SAL_N_ELEMENTS(buf));
|
||||
static constexpr OUString aTest = u"X 𠮟 X"_ustr;
|
||||
|
||||
auto aBounds
|
||||
= m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
||||
|
@ -1581,7 +1596,7 @@ void TestBreakIterator::testLegacySurrogatePairs()
|
|||
aBounds
|
||||
= m_xBreak->getWordBoundary(aTest, 2, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(2), aBounds.startPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
|
||||
CPPUNIT_ASSERT_EQUAL(sal_Int32(4), aBounds.endPos);
|
||||
|
||||
aBounds
|
||||
= m_xBreak->getWordBoundary(aTest, 5, aLocale, i18n::WordType::DICTIONARY_WORD, false);
|
||||
|
@ -1590,16 +1605,44 @@ void TestBreakIterator::testLegacySurrogatePairs()
|
|||
}
|
||||
}
|
||||
|
||||
void TestBreakIterator::testLegacyWordCountCompat()
|
||||
void TestBreakIterator::testWordCount()
|
||||
{
|
||||
lang::Locale aLocale;
|
||||
auto count_words_fn = [&](const OUString& str, const lang::Locale& aLocale) -> int
|
||||
{
|
||||
int num_words = 0;
|
||||
sal_Int32 next_pos = 0;
|
||||
int iter_guard = 0;
|
||||
|
||||
aLocale.Language = "en";
|
||||
aLocale.Country = "US";
|
||||
if (m_xBreak->isBeginWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT))
|
||||
{
|
||||
++num_words;
|
||||
}
|
||||
|
||||
while (true)
|
||||
{
|
||||
CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100);
|
||||
|
||||
auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT);
|
||||
|
||||
if (aBounds.endPos < next_pos || aBounds.startPos == aBounds.endPos)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
next_pos = aBounds.endPos;
|
||||
++num_words;
|
||||
}
|
||||
|
||||
return num_words;
|
||||
};
|
||||
|
||||
// i#80815: "Word count differs from MS Word"
|
||||
// This is a characteristic test for word count using test data from the linked bug.
|
||||
{
|
||||
lang::Locale aLocale;
|
||||
aLocale.Language = "en";
|
||||
aLocale.Country = "US";
|
||||
|
||||
const OUString str = u""
|
||||
"test data for word count issue #80815\n"
|
||||
"fo\\\'sforos\n"
|
||||
|
@ -1622,25 +1665,18 @@ void TestBreakIterator::testLegacyWordCountCompat()
|
|||
"aaaaaaa,aaaaaaa\n"
|
||||
"aaaaaaa;aaaaaaa\n"_ustr;
|
||||
|
||||
int num_words = 0;
|
||||
sal_Int32 next_pos = 0;
|
||||
int iter_guard = 0;
|
||||
while (true)
|
||||
{
|
||||
CPPUNIT_ASSERT_MESSAGE("Tripped infinite loop check", ++iter_guard < 100);
|
||||
CPPUNIT_ASSERT_EQUAL(24, count_words_fn(str, aLocale));
|
||||
}
|
||||
|
||||
auto aBounds = m_xBreak->nextWord(str, next_pos, aLocale, i18n::WordType::WORD_COUNT);
|
||||
// Test that the switch to upstream ICU for CJ word boundary analysis doesn't change word count.
|
||||
{
|
||||
lang::Locale aLocale;
|
||||
aLocale.Language = "ja";
|
||||
aLocale.Country = "JP";
|
||||
|
||||
if (aBounds.endPos < next_pos)
|
||||
{
|
||||
break;
|
||||
}
|
||||
const OUString str = u"Wordの様にワード数をするのにTest\n植松町"_ustr;
|
||||
|
||||
next_pos = aBounds.endPos;
|
||||
++num_words;
|
||||
}
|
||||
|
||||
CPPUNIT_ASSERT_EQUAL(23, num_words);
|
||||
CPPUNIT_ASSERT_EQUAL(7, count_words_fn(str, aLocale));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -37,54 +37,6 @@ BreakIterator_CJK::BreakIterator_CJK()
|
|||
cBreakIterator = u"com.sun.star.i18n.BreakIterator_CJK"_ustr;
|
||||
}
|
||||
|
||||
Boundary SAL_CALL
|
||||
BreakIterator_CJK::previousWord(const OUString& text, sal_Int32 anyPos,
|
||||
const css::lang::Locale& nLocale, sal_Int16 wordType)
|
||||
{
|
||||
if (m_oDict) {
|
||||
result = m_oDict->previousWord(text, anyPos, wordType);
|
||||
// #109813# for non-CJK, single character word, fallback to ICU breakiterator.
|
||||
if (result.endPos - result.startPos != 1 ||
|
||||
getScriptType(text, result.startPos) == ScriptType::ASIAN)
|
||||
return result;
|
||||
result = BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
|
||||
if (result.endPos < anyPos)
|
||||
return result;
|
||||
}
|
||||
return BreakIterator_Unicode::previousWord(text, anyPos, nLocale, wordType);
|
||||
}
|
||||
|
||||
Boundary SAL_CALL
|
||||
BreakIterator_CJK::nextWord(const OUString& text, sal_Int32 anyPos,
|
||||
const css::lang::Locale& nLocale, sal_Int16 wordType)
|
||||
{
|
||||
if (m_oDict) {
|
||||
result = m_oDict->nextWord(text, anyPos, wordType);
|
||||
// #109813# for non-CJK, single character word, fallback to ICU breakiterator.
|
||||
if (result.endPos - result.startPos != 1 ||
|
||||
getScriptType(text, result.startPos) == ScriptType::ASIAN)
|
||||
return result;
|
||||
result = BreakIterator_Unicode::getWordBoundary(text, result.startPos, nLocale, wordType, true);
|
||||
if (result.startPos > anyPos)
|
||||
return result;
|
||||
}
|
||||
return BreakIterator_Unicode::nextWord(text, anyPos, nLocale, wordType);
|
||||
}
|
||||
|
||||
Boundary SAL_CALL
|
||||
BreakIterator_CJK::getWordBoundary( const OUString& text, sal_Int32 anyPos,
|
||||
const css::lang::Locale& nLocale, sal_Int16 wordType, sal_Bool bDirection )
|
||||
{
|
||||
if (m_oDict) {
|
||||
result = m_oDict->getWordBoundary(text, anyPos, wordType, bDirection);
|
||||
// #109813# for non-CJK, single character word, fallback to ICU breakiterator.
|
||||
if (result.endPos - result.startPos != 1 ||
|
||||
getScriptType(text, result.startPos) == ScriptType::ASIAN)
|
||||
return result;
|
||||
}
|
||||
return BreakIterator_Unicode::getWordBoundary(text, anyPos, nLocale, wordType, bDirection);
|
||||
}
|
||||
|
||||
namespace {
|
||||
bool isHangul( sal_Unicode cCh )
|
||||
{
|
||||
|
@ -143,7 +95,6 @@ LineBreakResults SAL_CALL BreakIterator_CJK::getLineBreak(
|
|||
// ----------------------------------------------------;
|
||||
BreakIterator_zh::BreakIterator_zh()
|
||||
{
|
||||
m_oDict.emplace("zh");
|
||||
assert(hangingCharacters.pData);
|
||||
hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("zh", "CN"));
|
||||
cBreakIterator = u"com.sun.star.i18n.BreakIterator_zh"_ustr;
|
||||
|
@ -154,7 +105,6 @@ BreakIterator_zh::BreakIterator_zh()
|
|||
// ----------------------------------------------------;
|
||||
BreakIterator_zh_TW::BreakIterator_zh_TW()
|
||||
{
|
||||
m_oDict.emplace("zh");
|
||||
assert(hangingCharacters.pData);
|
||||
hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("zh", "TW"));
|
||||
cBreakIterator = u"com.sun.star.i18n.BreakIterator_zh_TW"_ustr;
|
||||
|
@ -165,8 +115,6 @@ BreakIterator_zh_TW::BreakIterator_zh_TW()
|
|||
// ----------------------------------------------------;
|
||||
BreakIterator_ja::BreakIterator_ja()
|
||||
{
|
||||
m_oDict.emplace("ja");
|
||||
m_oDict->setJapaneseWordBreak();
|
||||
assert(hangingCharacters.pData);
|
||||
hangingCharacters = LocaleDataImpl::get()->getHangingCharacters(LOCALE("ja", "JP"));
|
||||
cBreakIterator = u"com.sun.star.i18n.BreakIterator_ja"_ustr;
|
||||
|
|
|
@ -74,6 +74,16 @@ class OOoRuleBasedBreakIterator : public icu::RuleBasedBreakIterator
|
|||
|
||||
};
|
||||
|
||||
bool locale_requires_dictionary_iterator(const css::lang::Locale& rLocale)
|
||||
{
|
||||
return rLocale.Language == "bo" || // Tibetan
|
||||
rLocale.Language == "dz" || // Dzongkha
|
||||
rLocale.Language == "ja" || // Japanese
|
||||
rLocale.Language == "km" || // Khmer
|
||||
rLocale.Language == "lo" || // Lao
|
||||
rLocale.Language == "th" || // Thai
|
||||
rLocale.Language == "zh"; // Chinese
|
||||
}
|
||||
}
|
||||
|
||||
// loading ICU breakiterator on demand.
|
||||
|
@ -179,8 +189,7 @@ void BreakIterator_Unicode::loadICUBreakIterator(const css::lang::Locale& rLocal
|
|||
rbi.reset();
|
||||
}
|
||||
}
|
||||
//use icu's breakiterator for Thai, Tibetan and Dzongkha
|
||||
else if (rLocale.Language != "th" && rLocale.Language != "lo" && rLocale.Language != "bo" && rLocale.Language != "dz" && rLocale.Language != "km")
|
||||
else if(!locale_requires_dictionary_iterator(rLocale))
|
||||
{
|
||||
// language;rule (not langtag, unless we'd actually load such)
|
||||
OString aLanguage( LanguageTag( rLocale).getLanguage().toUtf8());
|
||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,340 +0,0 @@
|
|||
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
||||
/*
|
||||
* This file is part of the LibreOffice project.
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* This file incorporates work covered by the following license notice:
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed
|
||||
* with this work for additional information regarding copyright
|
||||
* ownership. The ASF licenses this file to you under the Apache
|
||||
* License, Version 2.0 (the "License"); you may not use this file
|
||||
* except in compliance with the License. You may obtain a copy of
|
||||
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <sal/main.h>
|
||||
#include <sal/types.h>
|
||||
#include <rtl/ustring.hxx>
|
||||
#include <osl/diagnose.h>
|
||||
#include <vector>
|
||||
|
||||
using std::vector;
|
||||
|
||||
|
||||
// For iOS, where we must strive for a minimal executable size, we
|
||||
// keep the data produced by this utility not as large const tables in
|
||||
// source code but instead as separate data files, to be bundled with
|
||||
// an app, and mmapped in at run time.
|
||||
|
||||
// To test this easier on a desktop OS, just make sure
|
||||
// DICT_JA_ZH_IN_DATAFILE is defined when building i18npool.
|
||||
|
||||
#ifdef DICT_JA_ZH_IN_DATAFILE
|
||||
static sal_Int64 dataAreaOffset = 0;
|
||||
static sal_Int64 lenArrayOffset = 0;
|
||||
static sal_Int64 index1Offset = 0;
|
||||
static sal_Int64 index2Offset = 0;
|
||||
static sal_Int64 existMarkOffset = 0;
|
||||
#endif
|
||||
|
||||
/* Utility gendict:
|
||||
|
||||
"BreakIterator_CJK provides input string caching and dictionary searching for
|
||||
longest matching. You can provide a sorted dictionary (the encoding must be
|
||||
UTF-8) by creating the following file:
|
||||
i18npool/source/breakiterator/data/<language>.dict.
|
||||
|
||||
The utility gendict will convert the file to C code, which will be compiled
|
||||
into a shared library for dynamic loading.
|
||||
|
||||
All dictionary searching and loading is performed in the xdictionary class.
|
||||
The only thing you need to do is to derive your class from BreakIterator_CJK
|
||||
and create an instance of the xdictionary with the language name and
|
||||
pass it to the parent class." (from https://wiki.documentfoundation.org/
|
||||
Documentation/DevGuide/Office_Development#Implementing_a_New_Locale - 27/01/2011)
|
||||
*/
|
||||
|
||||
// C-standard guarantees that static variables are automatically initialized to 0
|
||||
static sal_uInt8 exists[0x2000];
|
||||
static sal_uInt32 charArray[0x10000];
|
||||
|
||||
static void set_exists(sal_uInt32 index)
|
||||
{
|
||||
exists[index>>3] |= 1 << (index & 0x07);
|
||||
}
|
||||
|
||||
static void printIncludes(FILE* source_fp)
|
||||
{
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fputs("/* !!!The file is generated automatically. DO NOT edit the file manually!!! */\n\n", source_fp);
|
||||
fputs("#include <sal/types.h>\n\n", source_fp);
|
||||
#else
|
||||
(void) source_fp;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void printFunctions(FILE* source_fp, const char *lang)
|
||||
{
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fputs ("#ifndef DISABLE_DYNLOADING\n", source_fp);
|
||||
fputs ("SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark() { return existMark; }\n", source_fp);
|
||||
fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1() { return index1; }\n", source_fp);
|
||||
fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2() { return index2; }\n", source_fp);
|
||||
fputs ("SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray() { return lenArray; }\n", source_fp);
|
||||
fputs ("SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea() { return dataArea; }\n", source_fp);
|
||||
fputs ("#else\n", source_fp);
|
||||
fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_uInt8* getExistMark_%s() { return existMark; }\n", lang);
|
||||
fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int16* getIndex1_%s() { return index1; }\n", lang);
|
||||
fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getIndex2_%s() { return index2; }\n", lang);
|
||||
fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Int32* getLenArray_%s() { return lenArray; }\n", lang);
|
||||
fprintf (source_fp, "SAL_DLLPUBLIC_EXPORT const sal_Unicode* getDataArea_%s() { return dataArea; }\n", lang);
|
||||
fputs ("#endif\n", source_fp);
|
||||
#else
|
||||
(void) source_fp;
|
||||
(void) lang;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void printDataArea(FILE *dictionary_fp, FILE *source_fp, vector<sal_uInt32>& lenArray)
|
||||
{
|
||||
// generate main dict. data array
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fputs("static const sal_Unicode dataArea[] = {\n\t", source_fp);
|
||||
#else
|
||||
dataAreaOffset = ftell(source_fp);
|
||||
#endif
|
||||
char str[1024];
|
||||
sal_uInt32 lenArrayCurr = 0;
|
||||
sal_Unicode current = 0;
|
||||
|
||||
while (fgets(str, 1024, dictionary_fp)) {
|
||||
// input file is in UTF-8 encoding
|
||||
// don't convert last new line character to Ostr.
|
||||
OUString Ostr(str, strlen(str) - 1, RTL_TEXTENCODING_UTF8);
|
||||
|
||||
const sal_Int32 len = Ostr.getLength();
|
||||
|
||||
sal_Int32 i=0;
|
||||
Ostr.iterateCodePoints(&i);
|
||||
if (len == i)
|
||||
continue; // skip one character word
|
||||
|
||||
if (Ostr[0] != current) {
|
||||
OSL_ENSURE( (Ostr[0] > current), "Dictionary file should be sorted");
|
||||
current = Ostr[0];
|
||||
charArray[current] = lenArray.size();
|
||||
}
|
||||
|
||||
lenArray.push_back(lenArrayCurr);
|
||||
|
||||
set_exists(Ostr[0]);
|
||||
// first character is stored in charArray, so start from second
|
||||
for (i = 1; i < len; i++, lenArrayCurr++) {
|
||||
set_exists(Ostr[i]);
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fprintf(source_fp, "0x%04x, ", Ostr[i]);
|
||||
if ((lenArrayCurr & 0x0f) == 0x0f)
|
||||
fputs("\n\t", source_fp);
|
||||
#else
|
||||
sal_Unicode x = Ostr[i];
|
||||
fwrite(&x, sizeof(Ostr[i]), 1, source_fp);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
charArray[current+1] = lenArray.size();
|
||||
lenArray.push_back( lenArrayCurr ); // store last ending pointer
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fputs("\n};\n", source_fp);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void printLenArray(FILE* source_fp, const vector<sal_uInt32>& lenArray)
|
||||
{
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fprintf(source_fp, "static const sal_Int32 lenArray[] = {\n\t");
|
||||
fprintf(source_fp, "0x%x, ", 0); // insert one slat for skipping 0 in index2 array.
|
||||
#else
|
||||
lenArrayOffset = ftell(source_fp);
|
||||
sal_uInt32 zero(0);
|
||||
fwrite(&zero, sizeof(zero), 1, source_fp);
|
||||
#endif
|
||||
for (size_t k = 0; k < lenArray.size(); k++)
|
||||
{
|
||||
if( !(k & 0xf) )
|
||||
fputs("\n\t", source_fp);
|
||||
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fprintf(source_fp, "0x%" SAL_PRIxUINT32 ", ", lenArray[k]);
|
||||
#else
|
||||
fwrite(&lenArray[k], sizeof(lenArray[k]), 1, source_fp);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fputs("\n};\n", source_fp );
|
||||
#endif
|
||||
}
|
||||
|
||||
/* FIXME?: what happens if in every range i there is at least one charArray != 0
|
||||
=> this will make index1[] = {0x00, 0x01, 0x02,... 0xfe, 0xff }
|
||||
=> then in index2, the last range will be ignored incorrectly */
|
||||
static void printIndex1(FILE *source_fp, sal_Int16 *set)
|
||||
{
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fprintf (source_fp, "static const sal_Int16 index1[] = {\n\t");
|
||||
#else
|
||||
index1Offset = ftell(source_fp);
|
||||
#endif
|
||||
|
||||
sal_Int16 count = 0;
|
||||
for (sal_Int32 i = 0; i < 0x100; i++) {
|
||||
sal_Int32 j = 0;
|
||||
while( j < 0x100 && charArray[(i<<8) + j] == 0)
|
||||
j++;
|
||||
|
||||
set[i] = (j < 0x100 ? count++ : 0xff);
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fprintf(source_fp, "0x%02x, ", set[i]);
|
||||
if ((i & 0x0f) == 0x0f)
|
||||
fputs ("\n\t", source_fp);
|
||||
#else
|
||||
fwrite(&set[i], sizeof(set[i]), 1, source_fp);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef DICT_JA_ZH_IN_DATAFILE
|
||||
fputs("};\n", source_fp);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
 * Emit "index2": for every character cell of every page that printIndex1()
 * kept, the value charArray[k] + 1 of the nearest used slot at or after that
 * cell, or 0 when no used slot follows.  The consumer (xdictionary's
 * getLongestMatch) reads adjacent entries as a [begin, end) range into the
 * length array.  Written as C source or, in the DICT_JA_ZH_IN_DATAFILE
 * build, as raw sal_uInt32 values with the start offset remembered in
 * index2Offset.
 */
static void printIndex2(FILE *source_fp, sal_Int16 const *set)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs ("static const sal_Int32 index2[] = {\n\t", source_fp);
#else
    index2Offset = ftell(source_fp);
#endif
    sal_Int32 prev = 0;
    for (sal_Int32 i = 0; i < 0x100; i++) {
        if (set[i] != 0xff) {   // only pages printIndex1() marked as used
            for (sal_Int32 j = 0; j < 0x100; j++) {
                sal_Int32 k = (i<<8) + j;
                // After the first used entry, advance k over empty cells so
                // this cell maps to the next used slot (range end for the
                // previous character's words).
                if (prev != 0 )
                    while( k < 0x10000 && charArray[k] == 0 )
                        k++;

                prev = charArray[(i<<8) + j];
#ifndef DICT_JA_ZH_IN_DATAFILE
                fprintf(source_fp, "0x%lx, ", static_cast<long unsigned int>(k < 0x10000 ? charArray[k] + 1 : 0));
                if ((j & 0x0f) == 0x0f)  // line break after every 16 values
                    fputs ("\n\t", source_fp);
#else
                sal_uInt32 n = (k < 0x10000 ? charArray[k] + 1 : 0);
                fwrite(&n, sizeof(n), 1, source_fp);
#endif
            }
#ifndef DICT_JA_ZH_IN_DATAFILE
            fputs ("\n\t", source_fp);
#endif
        }
    }
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs ("\n};\n", source_fp);
#endif
}
|
||||
|
||||
/* Generates a bitmask for the existence of sal_Unicode values in dictionary;
   it packs 8 sal_Bool values in 1 sal_uInt8.
   Emits exactly 0x2000 bytes (0x2000 * 8 bits == the whole 0x10000 BMP),
   either as C source (array existMark) or, in the DICT_JA_ZH_IN_DATAFILE
   build, as raw bytes with the start offset remembered in existMarkOffset. */
static void printExistsMask(FILE *source_fp)
{
#ifndef DICT_JA_ZH_IN_DATAFILE
    fprintf (source_fp, "static const sal_uInt8 existMark[] = {\n\t");
#else
    existMarkOffset = ftell(source_fp);
#endif
    for (unsigned int i = 0; i < 0x2000; i++)
    {
#ifndef DICT_JA_ZH_IN_DATAFILE
        fprintf(source_fp, "0x%02x, ", exists[i]);
        if ( (i & 0xf) == 0xf )  // line break after every 16 values
            fputs("\n\t", source_fp);
#else
        fwrite(&exists[i], sizeof(exists[i]), 1, source_fp);
#endif
    }

#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("\n};\n", source_fp);
#endif
}
|
||||
|
||||
/**
 * gendict entry point.
 *
 * Usage: gendict dictionary_file_name source_file_name language_code
 *
 * Reads the dictionary word list and writes either generated C++ source or,
 * in the DICT_JA_ZH_IN_DATAFILE build, a raw data file with the five table
 * offsets appended at its end (read back by xdictionary's constructor).
 */
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc, argv)
{
    FILE *dictionary_fp, *source_fp;

    // printFunctions() below needs the language code in argv[3], so all
    // three arguments are mandatory.  The previous check
    // (argc == 1 || argc > 4) let argc == 2/3 through and then read argv[3]
    // beyond the end of the argument list (argv[argc] is a null pointer);
    // it also routed output to stdout for argc == 2 and later fclose()d it.
    if (argc != 4)
    {
        fputs("3 arguments required: dictionary_file_name source_file_name language_code", stderr);
        exit(-1);
    }

    dictionary_fp = fopen(argv[1], "rb"); // open the source file for read;
    if (dictionary_fp == nullptr)
    {
        fprintf(stderr, "Opening the dictionary source file %s for reading failed: %s\n", argv[1], strerror(errno));
        exit(1);
    }

    // create the C source (or raw data) file to write
    source_fp = fopen(argv[2], "wb");
    if (source_fp == nullptr) {
        fclose(dictionary_fp);
        fprintf(stderr, "Opening %s for writing failed: %s\n", argv[2], strerror(errno));
        exit(1);
    }

    vector<sal_uInt32> lenArray; // stores the word boundaries in DataArea
    sal_Int16 set[0x100];        // page map filled by printIndex1()

    printIncludes(source_fp);
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("extern \"C\" {\n", source_fp);
#endif
    // Emit the tables in their fixed order; the data-file build records the
    // start offset of each one in the *Offset globals as a side effect.
    printDataArea(dictionary_fp, source_fp, lenArray);
    printLenArray(source_fp, lenArray);
    printIndex1(source_fp, set);
    printIndex2(source_fp, set);
    printExistsMask(source_fp);
    printFunctions(source_fp, argv[3]);
#ifndef DICT_JA_ZH_IN_DATAFILE
    fputs("}\n", source_fp);
#else
    // Put pointers to the tables at the end of the file...
    fwrite(&dataAreaOffset, sizeof(dataAreaOffset), 1, source_fp);
    fwrite(&lenArrayOffset, sizeof(lenArrayOffset), 1, source_fp);
    fwrite(&index1Offset, sizeof(index1Offset), 1, source_fp);
    fwrite(&index2Offset, sizeof(index2Offset), 1, source_fp);
    fwrite(&existMarkOffset, sizeof(existMarkOffset), 1, source_fp);
#endif

    fclose(dictionary_fp);
    fclose(source_fp);

    return 0;
}
|
||||
|
||||
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
|
@ -1,490 +0,0 @@
|
|||
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
||||
/*
|
||||
* This file is part of the LibreOffice project.
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* This file incorporates work covered by the following license notice:
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed
|
||||
* with this work for additional information regarding copyright
|
||||
* ownership. The ASF licenses this file to you under the Apache
|
||||
* License, Version 2.0 (the "License"); you may not use this file
|
||||
* except in compliance with the License. You may obtain a copy of
|
||||
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
||||
*/
|
||||
|
||||
#include <config_folders.h>
|
||||
#include <o3tl/temporary.hxx>
|
||||
#include <osl/file.h>
|
||||
#include <osl/module.h>
|
||||
#include <osl/mutex.hxx>
|
||||
#include <rtl/bootstrap.hxx>
|
||||
#include <com/sun/star/i18n/ScriptType.hpp>
|
||||
#include <com/sun/star/i18n/WordType.hpp>
|
||||
#include <xdictionary.hxx>
|
||||
#include <unicode/uchar.h>
|
||||
#include <string.h>
|
||||
#include <breakiteratorImpl.hxx>
|
||||
|
||||
using namespace com::sun::star::i18n;
|
||||
|
||||
namespace i18npool {
|
||||
|
||||
#ifdef DICT_JA_ZH_IN_DATAFILE
|
||||
|
||||
#elif !defined DISABLE_DYNLOADING
|
||||
|
||||
extern "C" { static void thisModule() {} }
|
||||
|
||||
#else
|
||||
|
||||
extern "C" {
|
||||
|
||||
sal_uInt8* getExistMark_ja();
|
||||
sal_Int16* getIndex1_ja();
|
||||
sal_Int32* getIndex2_ja();
|
||||
sal_Int32* getLenArray_ja();
|
||||
sal_Unicode* getDataArea_ja();
|
||||
|
||||
sal_uInt8* getExistMark_zh();
|
||||
sal_Int16* getIndex1_zh();
|
||||
sal_Int32* getIndex2_zh();
|
||||
sal_Int32* getLenArray_zh();
|
||||
sal_Unicode* getDataArea_zh();
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
 * Load the word dictionary for the given language code ("ja" or "zh").
 *
 * Three build variants:
 *  - DICT_JA_ZH_IN_DATAFILE: memory-map the dict_<lang>.data file from the
 *    installation; the five table offsets sit at the end of the file
 *    (written by gendict).
 *  - dynamic loading (neither macro): resolve the dict_<lang> shared
 *    library at runtime via initDictionaryData().
 *  - DISABLE_DYNLOADING: use the statically linked per-language accessors.
 *
 * On failure the table pointers in 'data' are simply left unset; the lookup
 * functions guard against that (see exists() and getLongestMatch()).
 */
xdictionary::xdictionary(const char *lang) :
    japaneseWordBreak( false )
#ifdef DICT_JA_ZH_IN_DATAFILE
    , m_aFileHandle(nullptr),
    m_nFileSize(-1),
    m_pMapping(nullptr)
#endif
{

#ifdef DICT_JA_ZH_IN_DATAFILE

    if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
    {
        OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
        rtl::Bootstrap::expandMacros(sUrl);

        if( strcmp( lang, "ja" ) == 0 )
            sUrl += "ja.data";
        else if( strcmp( lang, "zh" ) == 0 )
            sUrl += "zh.data";

        if( osl_openFile( sUrl.pData, &m_aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
            osl_getFileSize( m_aFileHandle, &m_nFileSize) == osl_File_E_None &&
            osl_mapFile( m_aFileHandle, (void **) &m_pMapping, m_nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
        {
            // We have the offsets to the parts of the file at its end, see gendict.cxx
            sal_Int64 *pEOF = (sal_Int64*)(m_pMapping + m_nFileSize);

            // Offsets were appended in write order (dataArea first), so
            // reading backwards from EOF yields them in reverse.
            data.existMark = (sal_uInt8*) (m_pMapping + pEOF[-1]);
            data.index2 = (sal_Int32*) (m_pMapping + pEOF[-2]);
            data.index1 = (sal_Int16*) (m_pMapping + pEOF[-3]);
            data.lenArray = (sal_Int32*) (m_pMapping + pEOF[-4]);
            data.dataArea = (sal_Unicode*) (m_pMapping + pEOF[-5]);
        }
    }

#elif !defined DISABLE_DYNLOADING

    initDictionaryData( lang );

#else

    // Static build: the per-language accessor functions are linked in.
    if( strcmp( lang, "ja" ) == 0 ) {
        data.existMark = getExistMark_ja();
        data.index1 = getIndex1_ja();
        data.index2 = getIndex2_ja();
        data.lenArray = getLenArray_ja();
        data.dataArea = getDataArea_ja();
    }
    else if( strcmp( lang, "zh" ) == 0 ) {
        data.existMark = getExistMark_zh();
        data.index1 = getIndex1_zh();
        data.index2 = getIndex2_zh();
        data.lenArray = getLenArray_zh();
        data.dataArea = getDataArea_zh();
    }

#endif

    // Mark all word-break cache slots as unused; getCache() allocates them
    // on demand.
    for (WordBreakCache & i : cache)
        i.size = 0;

    // Redundant with the member initializer above; kept for clarity.
    japaneseWordBreak = false;
}
|
||||
|
||||
/// Release the lazily allocated word-break caches and, in the data-file
/// build, unmap and close the dictionary data file.
xdictionary::~xdictionary()
{
    // Free the per-slot buffers that getCache() allocated (size > 0 marks
    // a slot as in use).
    for (const WordBreakCache & i : cache) {
        if (i.size > 0) {
            delete [] i.contents;
            delete [] i.wordboundary;
        }
    }
#ifdef DICT_JA_ZH_IN_DATAFILE
    // Undo the osl_mapFile/osl_openFile done in the constructor.
    if (m_aFileHandle) {
        if (m_pMapping) {
            osl_unmapMappedFile(m_aFileHandle, m_pMapping, m_nFileSize);
        }
        osl_closeFile(m_aFileHandle);
    }
#endif
}
|
||||
|
||||
namespace {
    // One entry of the process-global dictionary cache maintained by
    // initDictionaryData(): a loaded dict_<lang> module together with the
    // table pointers resolved from it.  Entries are never released.
    struct datacache {
        oslModule mhModule;      // handle of the loaded library (may be null on failed load)
        OString maLang;          // language code this entry is for, e.g. "ja" or "zh"
        xdictionarydata maData;  // table pointers fetched from the module
    };
}
|
||||
|
||||
#if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)

/**
 * Load the dictionary tables for pLang from the dict_<lang> shared library
 * and store the resulting pointers in this->data.
 *
 * Results — including failed loads, where the pointers stay unset — are
 * kept in a process-global cache so each library is loaded at most once.
 */
void xdictionary::initDictionaryData(const char *pLang)
{
    // Global cache, never released for performance
    static std::vector< datacache > aLoadedCache;

    // Serialize access to the global cache.
    osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
    for(const datacache & i : aLoadedCache)
    {
        if( i.maLang == pLang )
        {
            data = i.maData;
            return;
        }
    }

    // otherwise add to the cache, positive or negative.
    datacache aEntry;
    aEntry.maLang = OString( pLang, strlen( pLang ) );

#ifdef SAL_DLLPREFIX
    OString sModuleName = // mostly "lib*.so" (with * == dict_zh)
        OString::Concat(SAL_DLLPREFIX "dict_") + pLang + SAL_DLLEXTENSION;
#else
    OString sModuleName = // mostly "*.dll" (with * == dict_zh)
        OString::Concat("dict_") + pLang + SAL_DLLEXTENSION;
#endif
    // Resolve the library relative to the location of this module.
    aEntry.mhModule = osl_loadModuleRelativeAscii( &thisModule, sModuleName.getStr(), SAL_LOADMODULE_DEFAULT );
    if( aEntry.mhModule ) {
        // Fetch the five exported table accessors (presumably emitted by
        // gendict's printFunctions() — see gendict.cxx).
        oslGenericFunction func;
        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
        aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
        aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
        aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
        aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
        func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
        aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
    }

    data = aEntry.maData;
    aLoadedCache.push_back( aEntry );
}

#endif
|
||||
|
||||
void xdictionary::setJapaneseWordBreak()
|
||||
{
|
||||
japaneseWordBreak = true;
|
||||
}
|
||||
|
||||
bool xdictionary::exists(const sal_uInt32 c) const
|
||||
{
|
||||
// 0x1FFF is the hardcoded limit in gendict for data.existMarks
|
||||
bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
|
||||
if (!exist && japaneseWordBreak)
|
||||
return BreakIteratorImpl::getScriptClass(c) == css::i18n::ScriptType::ASIAN;
|
||||
else
|
||||
return exist;
|
||||
}
|
||||
|
||||
/**
 * Find the longest dictionary entry matching a prefix of str.
 *
 * @param str   candidate text (UTF-16 code units)
 * @param sLen  number of code units available at str
 * @return total match length in code units (including the first character),
 *         or 0 if no dictionary entry starts here.
 */
sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) const
{
    if ( !data.index1 ) return 0;   // no dictionary data loaded

    // index1 maps the high byte of the first code unit to a page number.
    sal_Int16 idx = data.index1[str[0] >> 8];

    if (idx == 0xFF) return 0;      // page has no dictionary entries

    // Page number plus low byte indexes into index2.
    idx = (idx<<8) | (str[0]&0xff);

    // Adjacent index2 entries delimit this character's slice of lenArray.
    sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];

    if (begin == 0) return 0;       // no word starts with this character

    str++; sLen--; // first character is not stored in the dictionary
    // Walk the slice from the back; the first entry that fits and compares
    // equal wins (assumes later entries are the longer ones -- TODO confirm
    // against gendict's output ordering).
    for (sal_uInt32 i = end; i > begin; i--) {
        // lenArray holds cumulative offsets into dataArea; the difference of
        // neighbours is this entry's stored length (without the first char).
        sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
        if (sLen >= len) {
            const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
            sal_Int32 pos = 0;

            while (pos < len && dstr[pos] == str[pos]) { pos++; }

            if (pos == len)
                return len + 1;   // +1 re-adds the unstored first character
        }
    }
    return 0;
}
|
||||
|
||||
|
||||
/*
|
||||
* c-tor
|
||||
*/
|
||||
|
||||
/// Construct an empty, unallocated cache slot.  size == 0 marks the slot
/// as unused; xdictionary::getCache() allocates the buffers on first use.
WordBreakCache::WordBreakCache()
{
    contents = nullptr;
    wordboundary = nullptr;
    length = 0;
    size = 0;
}
|
||||
|
||||
/*
|
||||
* Compare two unicode string,
|
||||
*/
|
||||
|
||||
bool WordBreakCache::equals(const sal_Unicode* str, Boundary const & boundary) const
|
||||
{
|
||||
// Different length, different string.
|
||||
if (length != boundary.endPos - boundary.startPos) return false;
|
||||
|
||||
for (sal_Int32 i = 0; i < length; i++)
|
||||
if (contents[i] != str[i + boundary.startPos]) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Retrieve the segment containing the character at pos.
|
||||
* @param pos : Position of the given character.
|
||||
* @return true if CJK.
|
||||
*/
|
||||
/**
 * Find the dictionary-coverable segment of rText that contains pos: the
 * maximal run around pos consisting only of whitespace and characters for
 * which exists() is true.  The result (and the text it was computed for)
 * is cached in segmentCachedString/segmentCachedBoundary.
 *
 * @param rText        full text
 * @param pos          UTF-16 position the segment must contain
 * @param segBoundary  out: start/end of the segment
 * @return true if the segment spans more than one code point ("is CJK")
 */
bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
    Boundary& segBoundary)
{
    sal_Int32 indexUtf16;

    if (segmentCachedString.pData != rText.pData) {
        // Cache the passed text so we can avoid regenerating the segment if it's the same
        // (pData is refcounted and assigning the OUString references it, which ensures that
        // the object is the same if we get the same pointer back later)
        segmentCachedString = rText;
    } else {
        // If pos is within the cached boundary, use that boundary
        if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
            segBoundary.startPos = segmentCachedBoundary.startPos;
            segBoundary.endPos = segmentCachedBoundary.endPos;
            // Same "longer than one code point" test as the uncached path.
            indexUtf16 = segmentCachedBoundary.startPos;
            rText.iterateCodePoints(&indexUtf16);
            return segmentCachedBoundary.endPos > indexUtf16;
        }
    }

    segBoundary.endPos = segBoundary.startPos = pos;

    // Extend the segment backwards from pos over whitespace and
    // dictionary-covered characters.
    indexUtf16 = pos;
    while (indexUtf16 > 0)
    {
        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
        if (u_isWhitespace(ch) || exists(ch))
            segBoundary.startPos = indexUtf16;
        else
            break;
    }

    // Extend the segment forwards from pos likewise.
    indexUtf16 = pos;
    while (indexUtf16 < rText.getLength())
    {
        sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16);
        if (u_isWhitespace(ch) || exists(ch))
            segBoundary.endPos = indexUtf16;
        else
            break;
    }

    // Cache the calculated boundary
    segmentCachedBoundary.startPos = segBoundary.startPos;
    segmentCachedBoundary.endPos = segBoundary.endPos;

    // True when the segment extends beyond the first code point.
    indexUtf16 = segBoundary.startPos;
    rText.iterateCodePoints(&indexUtf16);
    return segBoundary.endPos > indexUtf16;
}
|
||||
|
||||
#define KANJA 1
|
||||
#define KATAKANA 2
|
||||
#define HIRAKANA 3
|
||||
|
||||
/// Coarse classification of a Japanese UTF-16 code unit; everything that is
/// neither hiragana nor katakana counts as kanji (KANJA).
static sal_Int16 JapaneseCharType(sal_Unicode c)
{
    // Katakana: the main block plus the halfwidth forms.
    if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
        return KATAKANA;
    // Hiragana block.
    if (0x3041 <= c && c <= 0x309e)
        return HIRAKANA;
    return KANJA;
}
|
||||
|
||||
/**
 * Return the word-break cache entry for the segment [wordBoundary.startPos,
 * wordBoundary.endPos) of text, computing the word boundaries if the entry
 * does not already hold exactly this segment.
 *
 * Cache slots are picked by text[0] & 0x1f; on a miss the slot's buffers
 * are (re)allocated as needed, the segment is copied in, and
 * rCache.wordboundary is filled with the increasing offsets of each word
 * start (terminated by length + 1).
 */
WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary const & wordBoundary)
{
    WordBreakCache& rCache = cache[text[0] & 0x1f];

    // Cache hit: slot in use and holding exactly this segment.
    if (rCache.size != 0 && rCache.equals(text, wordBoundary))
        return rCache;

    sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;

    // (Re)allocate the slot's buffers if unused or too small.
    if (rCache.size == 0 || len > rCache.size) {
        if (rCache.size != 0) {
            delete [] rCache.contents;
            delete [] rCache.wordboundary;
            rCache.size = len;
        }
        else
            rCache.size = std::max<sal_Int32>(len, DEFAULT_SIZE);
        rCache.contents = new sal_Unicode[rCache.size + 1];
        rCache.wordboundary = new sal_Int32[rCache.size + 2];
    }
    rCache.length = len;
    memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
    *(rCache.contents + len) = 0x0000;  // NUL-terminate the copy
    // reset the wordboundary in cache
    memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));

    // Walk the segment, appending one boundary per detected word.
    sal_Int32 i = 0; // loop variable
    while (rCache.wordboundary[i] < rCache.length) {
        len = 0;
        // look the continuous white space as one word and cache it
        while (u_isWhitespace(static_cast<sal_uInt32>(text[wordBoundary.startPos + rCache.wordboundary[i] + len])))
            len ++;

        if (len == 0) {
            // No whitespace here: try dictionary matches starting at each
            // successive position until one is found.
            const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
            sal_Int32 slen = rCache.length - rCache.wordboundary[i];
            sal_Int16 type = 0, count = 0;
            for (;len == 0 && slen > 0; str++, slen--) {
                len = getLongestMatch(str, slen);
                if (len == 0) {
                    if (!japaneseWordBreak) {
                        // Not in dictionary: treat the single character as
                        // its own word.
                        len = 1;
                    } else {
                        // Japanese mode: group a run of same-type
                        // characters (kana/kanji) into one word.
                        if (count == 0)
                            type = JapaneseCharType(*str);
                        else if (type != JapaneseCharType(*str))
                            break;
                        count++;
                    }
                }
            }
            if (count)
            {
                // Emit the boundary of the same-type character run.
                rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
                i++;
            }
        }

        if (len) {
            // Emit the boundary of the whitespace run or dictionary match.
            rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
            i++;
        }
    }
    // Sentinel past the end of the segment.
    rCache.wordboundary[i + 1] = rCache.length + 1;

    return rCache;
}
|
||||
|
||||
/**
 * Return the boundary of the word preceding anyPos, skipping whitespace
 * found immediately before it.
 */
Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
{
    // looking for the first non-whitespace character from anyPos
    sal_uInt32 ch = 0;
    if (anyPos > 0)
        ch = rText.iterateCodePoints(&anyPos, -1);

    // BUG FIX: previously ch stayed 0 here, so u_isWhitespace(ch) was always
    // false and this loop was dead code; capturing the first code point
    // above makes the backwards whitespace skip actually run.
    while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);

    return getWordBoundary(rText, anyPos, wordType, true);
}
|
||||
|
||||
/**
 * Return the boundary of the word following the one at anyPos, skipping
 * any whitespace between the two words.
 */
Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
{
    // First find the current word's end.
    boundary = getWordBoundary(rText, anyPos, wordType, true);
    anyPos = boundary.endPos;
    const sal_Int32 nLen = rText.getLength();
    if (anyPos < nLen) {
        // looking for the first non-whitespace character from anyPos
        sal_uInt32 ch = rText.iterateCodePoints(&anyPos);
        while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos);
        // iterateCodePoints has advanced past the first non-whitespace
        // code point; step back onto it.
        if (anyPos > 0)
            rText.iterateCodePoints(&anyPos, -1);
    }

    return getWordBoundary(rText, anyPos, wordType, true);
}
|
||||
|
||||
/**
 * Compute the boundary of the word containing anyPos and store it in the
 * member 'boundary' (also returned by reference).
 *
 * @param rText       full text
 * @param anyPos      UTF-16 position inside the word of interest
 * @param wordType    css::i18n::WordType; WORD_COUNT additionally swallows
 *                    trailing punctuation
 * @param bDirection  forward direction flag; when false and anyPos sits
 *                    exactly on a boundary after whitespace, the previous
 *                    word is chosen
 */
Boundary const & xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
{
    const sal_Unicode *text=rText.getStr();
    sal_Int32 len=rText.getLength();
    if (anyPos >= len || anyPos < 0) {
        // Out of range: clamp to an empty boundary at the text's edge.
        boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
    } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
        // Look the position up in the per-segment word-break cache.
        WordBreakCache& aCache = getCache(text, boundary);
        sal_Int32 i = 0;

        // Find the first boundary past anyPos (offsets are segment-relative).
        while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;

        sal_Int32 startPos = aCache.wordboundary[i - 1];
        // if bDirection is false
        if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
        {
            // anyPos sits exactly on a word start preceded by whitespace:
            // backwards direction selects the previous word instead.
            sal_uInt32 ch = rText.iterateCodePoints(&o3tl::temporary(sal_Int32(anyPos-1)));
            if (u_isWhitespace(ch))
                i--;
        }

        // Translate the segment-relative offsets back to text positions.
        boundary.endPos = boundary.startPos;
        boundary.endPos += aCache.wordboundary[i];
        boundary.startPos += aCache.wordboundary[i-1];

    } else {
        // Not dictionary territory: the "word" is the single code point at
        // anyPos.
        boundary.startPos = anyPos;
        if (anyPos < len) rText.iterateCodePoints(&anyPos);
        boundary.endPos = std::min(anyPos, len);
    }
    if (wordType == WordType::WORD_COUNT) {
        // skip punctuation for word count.
        while (boundary.endPos < len)
        {
            sal_Int32 indexUtf16 = boundary.endPos;
            if (u_ispunct(rText.iterateCodePoints(&indexUtf16)))
                boundary.endPos = indexUtf16;
            else
                break;
        }
    }

    return boundary;
}
|
||||
|
||||
}
|
||||
|
||||
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
Loading…
Reference in a new issue