diff --git a/editeng/source/editeng/impedit3.cxx b/editeng/source/editeng/impedit3.cxx index 6f7d1e7ac928..b961393bb24b 100644 --- a/editeng/source/editeng/impedit3.cxx +++ b/editeng/source/editeng/impedit3.cxx @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -232,93 +233,6 @@ static void lcl_DrawRedLines( OutputDevice& rOutDev, } } -// For Kashidas from sw/source/core/text/porlay.cxx - -#define IS_JOINING_GROUP(c, g) ( u_getIntPropertyValue( (c), UCHAR_JOINING_GROUP ) == U_JG_##g ) -#define isAinChar(c) IS_JOINING_GROUP((c), AIN) -#define isAlefChar(c) IS_JOINING_GROUP((c), ALEF) -#define isDalChar(c) IS_JOINING_GROUP((c), DAL) -#define isFehChar(c) (IS_JOINING_GROUP((c), FEH) || IS_JOINING_GROUP((c), AFRICAN_FEH)) -#define isGafChar(c) IS_JOINING_GROUP((c), GAF) -#define isHehChar(c) IS_JOINING_GROUP((c), HEH) -#define isKafChar(c) IS_JOINING_GROUP((c), KAF) -#define isLamChar(c) IS_JOINING_GROUP((c), LAM) -#define isQafChar(c) (IS_JOINING_GROUP((c), QAF) || IS_JOINING_GROUP((c), AFRICAN_QAF)) -#define isRehChar(c) IS_JOINING_GROUP((c), REH) -#define isTahChar(c) IS_JOINING_GROUP((c), TAH) -#define isTehMarbutaChar(c) IS_JOINING_GROUP((c), TEH_MARBUTA) -#define isWawChar(c) IS_JOINING_GROUP((c), WAW) -#define isSeenOrSadChar(c) (IS_JOINING_GROUP((c), SAD) || IS_JOINING_GROUP((c), SEEN)) - -// Beh and characters that behave like Beh in medial form. -static bool isBehChar(sal_Unicode cCh) -{ - bool bRet = false; - switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) - { - case U_JG_BEH: - case U_JG_NOON: - case U_JG_AFRICAN_NOON: - case U_JG_NYA: - case U_JG_YEH: - case U_JG_FARSI_YEH: - case U_JG_BURUSHASKI_YEH_BARREE: - bRet = true; - break; - default: - bRet = false; - break; - } - - return bRet; -} - -// Yeh and characters that behave like Yeh in final form. 
-static bool isYehChar(sal_Unicode cCh) -{ - bool bRet = false; - switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) - { - case U_JG_YEH: - case U_JG_FARSI_YEH: - case U_JG_YEH_BARREE: - case U_JG_BURUSHASKI_YEH_BARREE: - case U_JG_YEH_WITH_TAIL: - bRet = true; - break; - default: - bRet = false; - break; - } - - return bRet; -} - -static bool isTransparentChar ( sal_Unicode cCh ) -{ - return u_getIntPropertyValue( cCh, UCHAR_JOINING_TYPE ) == U_JT_TRANSPARENT; -} - -static bool lcl_IsLigature( sal_Unicode cCh, sal_Unicode cNextCh ) -{ - // Lam + Alef - return ( isLamChar ( cCh ) && isAlefChar ( cNextCh )); -} - -static bool lcl_ConnectToPrev( sal_Unicode cCh, sal_Unicode cPrevCh ) -{ - const int32_t nJoiningType = u_getIntPropertyValue( cPrevCh, UCHAR_JOINING_TYPE ); - bool bRet = nJoiningType != U_JT_RIGHT_JOINING && nJoiningType != U_JT_NON_JOINING; - - // check for ligatures cPrevChar + cChar - if ( bRet ) - bRet = ! lcl_IsLigature( cPrevCh, cCh ); - - return bRet; -} - - - void ImpEditEngine::UpdateViews( EditView* pCurView ) { if ( !IsUpdateLayout() || IsFormatting() || maInvalidRect.IsEmpty() ) @@ -2317,9 +2231,6 @@ void ImpEditEngine::ImpAdjustBlocks(ParaPortion& rParaPortion, EditLine& rLine, { EditPaM aPaM( pNode, nChar+1 ); sal_uInt16 nScript = GetI18NScriptType(aPaM); - // Arabic script is handled above, but if no Kashida positions are found, use blanks. 
- if (nKashidas) - continue; if ( pNode->GetChar(nChar) == ' ' ) { @@ -2460,154 +2371,12 @@ void ImpEditEngine::ImpFindKashidas(ContentNode* pNode, sal_Int32 nStart, sal_In // restore selection for proper iteration at the end of the function aWordSel.Max().SetIndex( nSavPos ); - sal_Int32 nIdx = 0, nPrevIdx = 0; - sal_Int32 nKashidaPos = -1; - sal_Unicode cCh, cPrevCh = 0; + auto stKashidaPos = i18nutil::GetWordKashidaPosition(aWord); - int nPriorityLevel = 7; // 0..6 = level found - // 7 not found - - sal_Int32 nWordLen = aWord.getLength(); - - // ignore trailing vowel chars - while( nWordLen && isTransparentChar( aWord[ nWordLen - 1 ] )) - --nWordLen; - - while ( nIdx < nWordLen ) + if (stKashidaPos.has_value()) { - cCh = aWord[ nIdx ]; + sal_Int32 nKashidaPos = aWordSel.Min().GetIndex() + stKashidaPos->nIndex; - // 1. Priority: - // after user inserted kashida - if ( 0x640 == cCh ) - { - nKashidaPos = aWordSel.Min().GetIndex() + nIdx; - nPriorityLevel = 0; - } - - // 2. Priority: - // after a Seen or Sad - if (nPriorityLevel >= 1 && nIdx < nWordLen - 1) - { - if( isSeenOrSadChar( cCh ) - && (aWord[ nIdx+1 ] != 0x200C) ) // #i98410#: prevent ZWNJ expansion - { - nKashidaPos = aWordSel.Min().GetIndex() + nIdx; - nPriorityLevel = 1; - } - } - - // 3. Priority: - // before final form of Teh Marbuta, Heh, Dal - if ( nPriorityLevel >= 2 && nIdx > 0 ) - { - if ( isTehMarbutaChar ( cCh ) || // Teh Marbuta (right joining) - isDalChar ( cCh ) || // Dal (right joining) final form may appear in the middle of word - ( isHehChar ( cCh ) && nIdx == nWordLen - 1)) // Heh (dual joining) only at end of word - { - - SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx; - nPriorityLevel = 2; - } - } - } - - // 4. 
Priority: - // before final form of Alef, Tah, Lam, Kaf or Gaf - if ( nPriorityLevel >= 3 && nIdx > 0 ) - { - if ( isAlefChar ( cCh ) || // Alef (right joining) final form may appear in the middle of word - (( isLamChar ( cCh ) || // Lam, - isTahChar ( cCh ) || // Tah, - isKafChar ( cCh ) || // Kaf (all dual joining) - isGafChar ( cCh ) ) - && nIdx == nWordLen - 1)) // only at end of word - { - SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx; - nPriorityLevel = 3; - } - } - } - - // 5. Priority: - // before medial Beh-like - if ( nPriorityLevel >= 4 && nIdx > 0 && nIdx < nWordLen - 1 ) - { - if ( isBehChar ( cCh ) ) - { - // check if next character is Reh or Yeh-like - sal_Unicode cNextCh = aWord[ nIdx + 1 ]; - if ( isRehChar ( cNextCh ) || isYehChar ( cNextCh )) - { - SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx; - nPriorityLevel = 4; - } - } - } - } - - // 6. 
Priority: - // before the final form of Waw, Ain, Qaf and Feh - if ( nPriorityLevel >= 5 && nIdx > 0 ) - { - if ( isWawChar ( cCh ) || // Wav (right joining) - // final form may appear in the middle of word - (( isAinChar ( cCh ) || // Ain (dual joining) - isQafChar ( cCh ) || // Qaf (dual joining) - isFehChar ( cCh ) ) // Feh (dual joining) - && nIdx == nWordLen - 1)) // only at end of word - { - SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx; - nPriorityLevel = 5; - } - } - } - - // other connecting possibilities - if ( nPriorityLevel >= 6 && nIdx > 0 ) - { - // Reh, Zain - if ( isRehChar ( cCh ) ) - { - SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx; - nPriorityLevel = 6; - } - } - } - - // Do not consider vowel marks when checking if a character - // can be connected to previous character. - if ( !isTransparentChar ( cCh) ) - { - cPrevCh = cCh; - nPrevIdx = nIdx; - } - - ++nIdx; - } // end of current word - - if (nKashidaPos >= 0) - { SeekCursor(pNode, nKashidaPos + 1, aTmpFont); aTmpFont.SetPhysFont(*GetRefDevice()); diff --git a/i18nutil/CppunitTest_i18nutil_kashida.mk b/i18nutil/CppunitTest_i18nutil_kashida.mk new file mode 100644 index 000000000000..4920f0a79a54 --- /dev/null +++ b/i18nutil/CppunitTest_i18nutil_kashida.mk @@ -0,0 +1,24 @@ +For makefiles: + +# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t; fill-column: 100 -*- +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+# + +$(eval $(call gb_CppunitTest_CppunitTest,i18nutil_kashida)) + +$(eval $(call gb_CppunitTest_add_exception_objects,i18nutil_kashida,\ + i18nutil/qa/cppunit/test_kashida \ +)) + +$(eval $(call gb_CppunitTest_use_libraries,i18nutil_kashida,\ + i18nutil \ + sal \ + test \ +)) + +# vim: set noet sw=4 ts=4: diff --git a/i18nutil/Library_i18nutil.mk b/i18nutil/Library_i18nutil.mk index fd6518a4173f..264c9c9f969e 100644 --- a/i18nutil/Library_i18nutil.mk +++ b/i18nutil/Library_i18nutil.mk @@ -44,6 +44,7 @@ $(eval $(call gb_Library_use_libraries,i18nutil,\ $(eval $(call gb_Library_add_exception_objects,i18nutil,\ i18nutil/source/utility/casefolding \ + i18nutil/source/utility/kashida \ i18nutil/source/utility/oneToOneMapping \ i18nutil/source/utility/paper \ i18nutil/source/utility/scripttypedetector \ diff --git a/i18nutil/Module_i18nutil.mk b/i18nutil/Module_i18nutil.mk index 9b543dfdc359..bb8ef7056c5c 100644 --- a/i18nutil/Module_i18nutil.mk +++ b/i18nutil/Module_i18nutil.mk @@ -12,4 +12,8 @@ $(eval $(call gb_Module_add_targets,i18nutil,\ Library_i18nutil \ )) +$(eval $(call gb_Module_add_check_targets,i18nutil,\ + CppunitTest_i18nutil_kashida \ +)) + # vim: set noet sw=4: diff --git a/i18nutil/qa/cppunit/test_kashida.cxx b/i18nutil/qa/cppunit/test_kashida.cxx new file mode 100644 index 000000000000..99ce2a1a969c --- /dev/null +++ b/i18nutil/qa/cppunit/test_kashida.cxx @@ -0,0 +1,58 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include +#include +#include +#include +#include +#include + +using namespace i18nutil; + +namespace +{ +class KashidaTest : public CppUnit::TestFixture +{ +public: + void testCharacteristic(); + + CPPUNIT_TEST_SUITE(KashidaTest); + CPPUNIT_TEST(testCharacteristic); + CPPUNIT_TEST_SUITE_END(); +}; + +void KashidaTest::testCharacteristic() +{ + // Characteristic tests for kashida candidate selection. + // Uses words from sample documents. + CPPUNIT_ASSERT(!GetWordKashidaPosition(u"متن"_ustr).has_value()); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"فارسی"_ustr).value().nIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), GetWordKashidaPosition(u"با"_ustr).value().nIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"نویسه"_ustr).value().nIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), GetWordKashidaPosition(u"کشیده"_ustr).value().nIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(0), GetWordKashidaPosition(u"برای"_ustr).value().nIndex); + CPPUNIT_ASSERT(!GetWordKashidaPosition(u"چینش"_ustr).has_value()); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), GetWordKashidaPosition(u"بهتر"_ustr).value().nIndex); + CPPUNIT_ASSERT(!GetWordKashidaPosition(u"ببببب"_ustr).has_value()); + CPPUNIT_ASSERT(!GetWordKashidaPosition(u"بپپپپ"_ustr).has_value()); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"تطویل"_ustr).value().nIndex); + CPPUNIT_ASSERT(!GetWordKashidaPosition(u"بپ"_ustr).has_value()); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), GetWordKashidaPosition(u"تطوی"_ustr).value().nIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"تحویل"_ustr).value().nIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(1), GetWordKashidaPosition(u"تشویل"_ustr).value().nIndex); + CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"تمثیل"_ustr).value().nIndex); +} + +CPPUNIT_TEST_SUITE_REGISTRATION(KashidaTest); +} + +CPPUNIT_PLUGIN_IMPLEMENT(); + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git 
a/i18nutil/source/utility/kashida.cxx b/i18nutil/source/utility/kashida.cxx new file mode 100644 index 000000000000..dbf2b818abf1 --- /dev/null +++ b/i18nutil/source/utility/kashida.cxx @@ -0,0 +1,286 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include +#include +#include + +namespace +{ +/* + https://www.khtt.net/en/page/1821/the-big-kashida-secret + + the rules of priorities that govern the addition of kashidas in Arabic text + made ... for ... Explorer 5.5 browser. + + The kashida justification is based on a connection priority scheme that + decides where kashidas are put automatically. + + This is how the software decides on kashida-inserting priorities: + 1. First it looks for characters with the highest priority in each word, + which means kashida-extensions will only been used in one position in each + word. Not more. + 2. The kashida will be connected to the character with the highest priority. + 3. If kashida connection opportunities are found with an equal level of + priority in one word, the kashida will be placed towards the end of the + word. + + The priority list of characters and the positioning is as follows: + 1. after a kashida that is manually placed in the text by the user, + 2. after a Seen or Sad (initial and medial form), + 3. before the final form of Taa Marbutah, Haa, Dal, + 4. before the final form of Alef, Tah Lam, Kaf and Gaf, + 5. before the preceding medial Baa of Ra, Ya and Alef Maqsurah, + 6. before the final form of Waw, Ain, Qaf and Fa, + 7. before the final form of other characters that can be connected. 
+*/ + +#define IS_JOINING_GROUP(c, g) (u_getIntPropertyValue((c), UCHAR_JOINING_GROUP) == U_JG_##g) +#define isAinChar(c) IS_JOINING_GROUP((c), AIN) +#define isAlefChar(c) IS_JOINING_GROUP((c), ALEF) +#define isDalChar(c) IS_JOINING_GROUP((c), DAL) +#define isFehChar(c) (IS_JOINING_GROUP((c), FEH) || IS_JOINING_GROUP((c), AFRICAN_FEH)) +#define isGafChar(c) IS_JOINING_GROUP((c), GAF) +#define isHehChar(c) IS_JOINING_GROUP((c), HEH) +#define isKafChar(c) IS_JOINING_GROUP((c), KAF) +#define isLamChar(c) IS_JOINING_GROUP((c), LAM) +#define isQafChar(c) (IS_JOINING_GROUP((c), QAF) || IS_JOINING_GROUP((c), AFRICAN_QAF)) +#define isRehChar(c) IS_JOINING_GROUP((c), REH) +#define isTahChar(c) IS_JOINING_GROUP((c), TAH) +#define isTehMarbutaChar(c) IS_JOINING_GROUP((c), TEH_MARBUTA) +#define isWawChar(c) IS_JOINING_GROUP((c), WAW) +#define isSeenOrSadChar(c) (IS_JOINING_GROUP((c), SAD) || IS_JOINING_GROUP((c), SEEN)) + +// Beh and characters that behave like Beh in medial form. +bool isBehChar(sal_Unicode cCh) +{ + bool bRet = false; + switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) + { + case U_JG_BEH: + case U_JG_NOON: + case U_JG_AFRICAN_NOON: + case U_JG_NYA: + case U_JG_YEH: + case U_JG_FARSI_YEH: + case U_JG_BURUSHASKI_YEH_BARREE: + bRet = true; + break; + default: + bRet = false; + break; + } + + return bRet; +} + +// Yeh and characters that behave like Yeh in final form. 
+bool isYehChar(sal_Unicode cCh) +{ + bool bRet = false; + switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) + { + case U_JG_YEH: + case U_JG_FARSI_YEH: + case U_JG_YEH_BARREE: + case U_JG_BURUSHASKI_YEH_BARREE: + case U_JG_YEH_WITH_TAIL: + bRet = true; + break; + default: + bRet = false; + break; + } + + return bRet; +} + +bool isTransparentChar(sal_Unicode cCh) +{ + return u_getIntPropertyValue(cCh, UCHAR_JOINING_TYPE) == U_JT_TRANSPARENT; +} + +// Checks if cCh + cNextCh builds a ligature (used for Kashidas) +bool isLigature(sal_Unicode cCh, sal_Unicode cNextCh) +{ + // Lam + Alef + return (isLamChar(cCh) && isAlefChar(cNextCh)); +} + +// Checks if cCh is connectable to cPrevCh (used for Kashidas) +bool CanConnectToPrev(sal_Unicode cCh, sal_Unicode cPrevCh) +{ + const int32_t nJoiningType = u_getIntPropertyValue(cPrevCh, UCHAR_JOINING_TYPE); + bool bRet = nJoiningType != U_JT_RIGHT_JOINING && nJoiningType != U_JT_NON_JOINING; + + // check for ligatures cPrevCh + cCh + if (bRet) + bRet = !isLigature(cPrevCh, cCh); + + return bRet; +} +} + +std::optional i18nutil::GetWordKashidaPosition(const OUString& rWord) +{ + sal_Int32 nIdx = 0; + sal_Int32 nPrevIdx = 0; + sal_Int32 nKashidaPos = -1; + sal_Unicode cCh = 0; + sal_Unicode cPrevCh = 0; + + int nPriorityLevel = 7; // 0..6 = level found, 7 not found + + sal_Int32 nWordLen = rWord.getLength(); + + // ignore trailing vowel chars + while (nWordLen && isTransparentChar(rWord[nWordLen - 1])) + { + --nWordLen; + } + + while (nIdx < nWordLen) + { + cCh = rWord[nIdx]; + + // 1. Priority: + // after user inserted kashida + if (0x640 == cCh) + { + nKashidaPos = nIdx; + nPriorityLevel = 0; + } + + // 2. Priority: + // after a Seen or Sad + if (nPriorityLevel >= 1 && nIdx < nWordLen - 1) + { + if (isSeenOrSadChar(cCh) + && (rWord[nIdx + 1] != 0x200C)) // #i98410#: prevent ZWNJ expansion + { + nKashidaPos = nIdx; + nPriorityLevel = 1; + } + } + + // 3. 
Priority: + // before final form of Teh Marbuta, Heh, Dal + if (nPriorityLevel >= 2 && nIdx > 0) + { + // Teh Marbuta (right joining) + // Dal (right joining) final form may appear in the middle of word + // Heh (dual joining) only at end of word + if (isTehMarbutaChar(cCh) || isDalChar(cCh) || (isHehChar(cCh) && nIdx == nWordLen - 1)) + { + SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character"); + // check if character is connectable to previous character, + if (CanConnectToPrev(cCh, cPrevCh)) + { + nKashidaPos = nPrevIdx; + nPriorityLevel = 2; + } + } + } + + // 4. Priority: + // before final form of Alef, Tah, Lam, Kaf or Gaf + if (nPriorityLevel >= 3 && nIdx > 0) + { + // Alef (right joining) final form may appear in the middle of word + // Lam, Tah, Kaf, Gaf (all dual joining) only at end of word + if (isAlefChar(cCh) + || ((isLamChar(cCh) || isTahChar(cCh) || isKafChar(cCh) || isGafChar(cCh)) + && nIdx == nWordLen - 1)) + { + SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character"); + // check if character is connectable to previous character, + if (CanConnectToPrev(cCh, cPrevCh)) + { + nKashidaPos = nPrevIdx; + nPriorityLevel = 3; + } + } + } + + // 5. Priority: + // before medial Beh-like + if (nPriorityLevel >= 4 && nIdx > 0 && nIdx < nWordLen - 1) + { + if (isBehChar(cCh)) + { + // check if next character is Reh or Yeh-like + sal_Unicode cNextCh = rWord[nIdx + 1]; + if (isRehChar(cNextCh) || isYehChar(cNextCh)) + { + SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character"); + // check if character is connectable to previous character, + if (CanConnectToPrev(cCh, cPrevCh)) + { + nKashidaPos = nPrevIdx; + nPriorityLevel = 4; + } + } + } + } + + // 6. 
Priority: + // before the final form of Waw, Ain, Qaf and Feh + if (nPriorityLevel >= 5 && nIdx > 0) + { + // Waw (right joining) final form may appear in the middle of word + // Ain, Qaf, Feh (all dual joining) only at end of word + if (isWawChar(cCh) + || ((isAinChar(cCh) || isQafChar(cCh) || isFehChar(cCh)) && nIdx == nWordLen - 1)) + { + SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character"); + // check if character is connectable to previous character, + if (CanConnectToPrev(cCh, cPrevCh)) + { + nKashidaPos = nPrevIdx; + nPriorityLevel = 5; + } + } + } + + // other connecting possibilities + if (nPriorityLevel >= 6 && nIdx > 0) + { + // Reh, Zain + if (isRehChar(cCh)) + { + SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character"); + // check if character is connectable to previous character, + if (CanConnectToPrev(cCh, cPrevCh)) + { + nKashidaPos = nPrevIdx; + nPriorityLevel = 6; + } + } + } + + // Do not consider vowel marks when checking if a character + // can be connected to previous character. + if (!isTransparentChar(cCh)) + { + cPrevCh = cCh; + nPrevIdx = nIdx; + } + + ++nIdx; + } // end of current word + + if (-1 != nKashidaPos) + { + return KashidaPosition{ nKashidaPos }; + } + + return std::nullopt; +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/include/i18nutil/kashida.hxx b/include/i18nutil/kashida.hxx new file mode 100644 index 000000000000..54797143143c --- /dev/null +++ b/include/i18nutil/kashida.hxx @@ -0,0 +1,24 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ + +#include +#include +#include + +namespace i18nutil +{ +struct KashidaPosition +{ + sal_Int32 nIndex; +}; + +I18NUTIL_DLLPUBLIC std::optional GetWordKashidaPosition(const OUString& rWord); +} + +/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */ diff --git a/sw/source/core/text/porlay.cxx b/sw/source/core/text/porlay.cxx index 923d5286c458..8574f6d31d12 100644 --- a/sw/source/core/text/porlay.cxx +++ b/sw/source/core/text/porlay.cxx @@ -79,124 +79,12 @@ #include #include #include +#include #include using namespace ::com::sun::star; using namespace i18n::ScriptType; -/* - https://www.khtt.net/en/page/1821/the-big-kashida-secret - - the rules of priorities that govern the addition of kashidas in Arabic text - made ... for ... Explorer 5.5 browser. - - The kashida justification is based on a connection priority scheme that - decides where kashidas are put automatically. - - This is how the software decides on kashida-inserting priorities: - 1. First it looks for characters with the highest priority in each word, - which means kashida-extensions will only been used in one position in each - word. Not more. - 2. The kashida will be connected to the character with the highest priority. - 3. If kashida connection opportunities are found with an equal level of - priority in one word, the kashida will be placed towards the end of the - word. - - The priority list of characters and the positioning is as follows: - 1. after a kashida that is manually placed in the text by the user, - 2. after a Seen or Sad (initial and medial form), - 3. before the final form of Taa Marbutah, Haa, Dal, - 4. before the final form of Alef, Tah Lam, Kaf and Gaf, - 5. before the preceding medial Baa of Ra, Ya and Alef Maqsurah, - 6. before the final form of Waw, Ain, Qaf and Fa, - 7. before the final form of other characters that can be connected. 
-*/ - -#define IS_JOINING_GROUP(c, g) ( u_getIntPropertyValue( (c), UCHAR_JOINING_GROUP ) == U_JG_##g ) -#define isAinChar(c) IS_JOINING_GROUP((c), AIN) -#define isAlefChar(c) IS_JOINING_GROUP((c), ALEF) -#define isDalChar(c) IS_JOINING_GROUP((c), DAL) -#define isFehChar(c) (IS_JOINING_GROUP((c), FEH) || IS_JOINING_GROUP((c), AFRICAN_FEH)) -#define isGafChar(c) IS_JOINING_GROUP((c), GAF) -#define isHehChar(c) IS_JOINING_GROUP((c), HEH) -#define isKafChar(c) IS_JOINING_GROUP((c), KAF) -#define isLamChar(c) IS_JOINING_GROUP((c), LAM) -#define isQafChar(c) (IS_JOINING_GROUP((c), QAF) || IS_JOINING_GROUP((c), AFRICAN_QAF)) -#define isRehChar(c) IS_JOINING_GROUP((c), REH) -#define isTahChar(c) IS_JOINING_GROUP((c), TAH) -#define isTehMarbutaChar(c) IS_JOINING_GROUP((c), TEH_MARBUTA) -#define isWawChar(c) IS_JOINING_GROUP((c), WAW) -#define isSeenOrSadChar(c) (IS_JOINING_GROUP((c), SAD) || IS_JOINING_GROUP((c), SEEN)) - -// Beh and characters that behave like Beh in medial form. -static bool isBehChar(sal_Unicode cCh) -{ - bool bRet = false; - switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) - { - case U_JG_BEH: - case U_JG_NOON: - case U_JG_AFRICAN_NOON: - case U_JG_NYA: - case U_JG_YEH: - case U_JG_FARSI_YEH: - case U_JG_BURUSHASKI_YEH_BARREE: - bRet = true; - break; - default: - bRet = false; - break; - } - - return bRet; -} - -// Yeh and characters that behave like Yeh in final form. 
-static bool isYehChar(sal_Unicode cCh) -{ - bool bRet = false; - switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP)) - { - case U_JG_YEH: - case U_JG_FARSI_YEH: - case U_JG_YEH_BARREE: - case U_JG_BURUSHASKI_YEH_BARREE: - case U_JG_YEH_WITH_TAIL: - bRet = true; - break; - default: - bRet = false; - break; - } - - return bRet; -} - -static bool isTransparentChar ( sal_Unicode cCh ) -{ - return u_getIntPropertyValue( cCh, UCHAR_JOINING_TYPE ) == U_JT_TRANSPARENT; -} - -// Checks if cCh + cNectCh builds a ligature (used for Kashidas) -static bool lcl_IsLigature( sal_Unicode cCh, sal_Unicode cNextCh ) -{ - // Lam + Alef - return ( isLamChar ( cCh ) && isAlefChar ( cNextCh )); -} - -// Checks if cCh is connectable to cPrevCh (used for Kashidas) -static bool lcl_ConnectToPrev( sal_Unicode cCh, sal_Unicode cPrevCh ) -{ - const int32_t nJoiningType = u_getIntPropertyValue( cPrevCh, UCHAR_JOINING_TYPE ); - bool bRet = nJoiningType != U_JT_RIGHT_JOINING && nJoiningType != U_JT_NON_JOINING; - - // check for ligatures cPrevChar + cChar - if( bRet ) - bRet = !lcl_IsLigature( cPrevCh, cCh ); - - return bRet; -} - static bool lcl_HasStrongLTR ( std::u16string_view rText, sal_Int32 nStart, sal_Int32 nEnd ) { for( sal_Int32 nCharIdx = nStart; nCharIdx < nEnd; ++nCharIdx ) @@ -1618,157 +1506,16 @@ void SwScriptInfo::InitScriptInfo(const SwTextNode& rNode, while ( aScanner.NextWord() ) { const OUString& rWord = aScanner.GetWord(); + auto stKashidaPos = i18nutil::GetWordKashidaPosition(rWord); - sal_Int32 nIdx = 0, nPrevIdx = 0; - sal_Int32 nKashidaPos = -1; - sal_Unicode cCh, cPrevCh = 0; - - int nPriorityLevel = 7; // 0..6 = level found - // 7 not found - - sal_Int32 nWordLen = rWord.getLength(); - - // ignore trailing vowel chars - while( nWordLen && isTransparentChar( rWord[ nWordLen - 1 ] )) - --nWordLen; - - while (nIdx < nWordLen) + if (stKashidaPos.has_value()) { - cCh = rWord[ nIdx ]; - - // 1. 
Priority: - // after user inserted kashida - if ( 0x640 == cCh ) - { - nKashidaPos = aScanner.GetBegin() + nIdx; - nPriorityLevel = 0; + // Only populate kashida positions for the invalidated tail + TextFrameIndex nNewKashidaPos{aScanner.GetBegin() + stKashidaPos->nIndex}; + if(nNewKashidaPos >= nLastKashida) { + m_Kashida.insert(m_Kashida.begin() + nCntKash, nNewKashidaPos); + nCntKash++; } - - // 2. Priority: - // after a Seen or Sad - if (nPriorityLevel >= 1 && nIdx < nWordLen - 1) - { - if( isSeenOrSadChar( cCh ) - && (rWord[ nIdx+1 ] != 0x200C) ) // #i98410#: prevent ZWNJ expansion - { - nKashidaPos = aScanner.GetBegin() + nIdx; - nPriorityLevel = 1; - } - } - - // 3. Priority: - // before final form of Teh Marbuta, Heh, Dal - if ( nPriorityLevel >= 2 && nIdx > 0 ) - { - if ( isTehMarbutaChar ( cCh ) || // Teh Marbuta (right joining) - isDalChar ( cCh ) || // Dal (right joining) final form may appear in the middle of word - ( isHehChar ( cCh ) && nIdx == nWordLen - 1)) // Heh (dual joining) only at end of word - { - - SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aScanner.GetBegin() + nPrevIdx; - nPriorityLevel = 2; - } - } - } - - // 4. Priority: - // before final form of Alef, Tah, Lam, Kaf or Gaf - if ( nPriorityLevel >= 3 && nIdx > 0 ) - { - if ( isAlefChar ( cCh ) || // Alef (right joining) final form may appear in the middle of word - (( isLamChar ( cCh ) || // Lam, - isTahChar ( cCh ) || // Tah, - isKafChar ( cCh ) || // Kaf (all dual joining) - isGafChar ( cCh ) ) - && nIdx == nWordLen - 1)) // only at end of word - { - SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aScanner.GetBegin() + nPrevIdx; - nPriorityLevel = 3; - } - } - } - - // 5. 
Priority: - // before medial Beh-like - if ( nPriorityLevel >= 4 && nIdx > 0 && nIdx < nWordLen - 1 ) - { - if ( isBehChar ( cCh ) ) - { - // check if next character is Reh or Yeh-like - sal_Unicode cNextCh = rWord[ nIdx + 1 ]; - if ( isRehChar ( cNextCh ) || isYehChar ( cNextCh )) - { - SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aScanner.GetBegin() + nPrevIdx; - nPriorityLevel = 4; - } - } - } - } - - // 6. Priority: - // before the final form of Waw, Ain, Qaf and Feh - if ( nPriorityLevel >= 5 && nIdx > 0 ) - { - if ( isWawChar ( cCh ) || // Wav (right joining) - // final form may appear in the middle of word - (( isAinChar ( cCh ) || // Ain (dual joining) - isQafChar ( cCh ) || // Qaf (dual joining) - isFehChar ( cCh ) ) // Feh (dual joining) - && nIdx == nWordLen - 1)) // only at end of word - { - SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aScanner.GetBegin() + nPrevIdx; - nPriorityLevel = 5; - } - } - } - - // other connecting possibilities - if ( nPriorityLevel >= 6 && nIdx > 0 ) - { - // Reh, Zain - if ( isRehChar ( cCh ) ) - { - SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" ); - // check if character is connectable to previous character, - if ( lcl_ConnectToPrev( cCh, cPrevCh ) ) - { - nKashidaPos = aScanner.GetBegin() + nPrevIdx; - nPriorityLevel = 6; - } - } - } - - // Do not consider vowel marks when checking if a character - // can be connected to previous character. - if ( !isTransparentChar ( cCh) ) - { - cPrevCh = cCh; - nPrevIdx = nIdx; - } - - ++nIdx; - } // end of current word - - if ( -1 != nKashidaPos ) - { - m_Kashida.insert(m_Kashida.begin() + nCntKash, TextFrameIndex(nKashidaPos)); - nCntKash++; } } // end of kashida search }