tdf#163105 Consolidated duplicated kashida justification code

The kashida candidate position selection logic was copied-and-pasted
from Writer into Edit Engine. This change consolidates the shared code
into a library. This change also adds some minimal characteristic tests,
which previously did not exist.

Change-Id: I2bfbfa79858347803474b754566436f3e74d1a54
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/173883
Reviewed-by: Jonathan Clark <jonathan@libreoffice.org>
Tested-by: Jenkins
This commit is contained in:
Jonathan Clark 2024-09-23 15:26:45 -06:00
parent 2fc1034de4
commit fe4687ed17
8 changed files with 409 additions and 496 deletions

View file

@ -68,6 +68,7 @@
#include <com/sun/star/i18n/InputSequenceChecker.hpp>
#include <vcl/pdfextoutdevdata.hxx>
#include <i18nlangtag/mslangid.hxx>
#include <i18nutil/kashida.hxx>
#include <comphelper/processfactory.hxx>
#include <comphelper/lok.hxx>
@ -232,93 +233,6 @@ static void lcl_DrawRedLines( OutputDevice& rOutDev,
}
}
// For Kashidas from sw/source/core/text/porlay.cxx
#define IS_JOINING_GROUP(c, g) ( u_getIntPropertyValue( (c), UCHAR_JOINING_GROUP ) == U_JG_##g )
#define isAinChar(c) IS_JOINING_GROUP((c), AIN)
#define isAlefChar(c) IS_JOINING_GROUP((c), ALEF)
#define isDalChar(c) IS_JOINING_GROUP((c), DAL)
#define isFehChar(c) (IS_JOINING_GROUP((c), FEH) || IS_JOINING_GROUP((c), AFRICAN_FEH))
#define isGafChar(c) IS_JOINING_GROUP((c), GAF)
#define isHehChar(c) IS_JOINING_GROUP((c), HEH)
#define isKafChar(c) IS_JOINING_GROUP((c), KAF)
#define isLamChar(c) IS_JOINING_GROUP((c), LAM)
#define isQafChar(c) (IS_JOINING_GROUP((c), QAF) || IS_JOINING_GROUP((c), AFRICAN_QAF))
#define isRehChar(c) IS_JOINING_GROUP((c), REH)
#define isTahChar(c) IS_JOINING_GROUP((c), TAH)
#define isTehMarbutaChar(c) IS_JOINING_GROUP((c), TEH_MARBUTA)
#define isWawChar(c) IS_JOINING_GROUP((c), WAW)
#define isSeenOrSadChar(c) (IS_JOINING_GROUP((c), SAD) || IS_JOINING_GROUP((c), SEEN))
// Beh and characters that behave like Beh in medial form.
static bool isBehChar(sal_Unicode cCh)
{
bool bRet = false;
switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP))
{
case U_JG_BEH:
case U_JG_NOON:
case U_JG_AFRICAN_NOON:
case U_JG_NYA:
case U_JG_YEH:
case U_JG_FARSI_YEH:
case U_JG_BURUSHASKI_YEH_BARREE:
bRet = true;
break;
default:
bRet = false;
break;
}
return bRet;
}
// Yeh and characters that behave like Yeh in final form.
static bool isYehChar(sal_Unicode cCh)
{
bool bRet = false;
switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP))
{
case U_JG_YEH:
case U_JG_FARSI_YEH:
case U_JG_YEH_BARREE:
case U_JG_BURUSHASKI_YEH_BARREE:
case U_JG_YEH_WITH_TAIL:
bRet = true;
break;
default:
bRet = false;
break;
}
return bRet;
}
static bool isTransparentChar ( sal_Unicode cCh )
{
return u_getIntPropertyValue( cCh, UCHAR_JOINING_TYPE ) == U_JT_TRANSPARENT;
}
static bool lcl_IsLigature( sal_Unicode cCh, sal_Unicode cNextCh )
{
// Lam + Alef
return ( isLamChar ( cCh ) && isAlefChar ( cNextCh ));
}
static bool lcl_ConnectToPrev( sal_Unicode cCh, sal_Unicode cPrevCh )
{
const int32_t nJoiningType = u_getIntPropertyValue( cPrevCh, UCHAR_JOINING_TYPE );
bool bRet = nJoiningType != U_JT_RIGHT_JOINING && nJoiningType != U_JT_NON_JOINING;
// check for ligatures cPrevChar + cChar
if ( bRet )
bRet = ! lcl_IsLigature( cPrevCh, cCh );
return bRet;
}
void ImpEditEngine::UpdateViews( EditView* pCurView )
{
if ( !IsUpdateLayout() || IsFormatting() || maInvalidRect.IsEmpty() )
@ -2317,9 +2231,6 @@ void ImpEditEngine::ImpAdjustBlocks(ParaPortion& rParaPortion, EditLine& rLine,
{
EditPaM aPaM( pNode, nChar+1 );
sal_uInt16 nScript = GetI18NScriptType(aPaM);
// Arabic script is handled above, but if no Kashida positions are found, use blanks.
if (nKashidas)
continue;
if ( pNode->GetChar(nChar) == ' ' )
{
@ -2460,154 +2371,12 @@ void ImpEditEngine::ImpFindKashidas(ContentNode* pNode, sal_Int32 nStart, sal_In
// restore selection for proper iteration at the end of the function
aWordSel.Max().SetIndex( nSavPos );
sal_Int32 nIdx = 0, nPrevIdx = 0;
sal_Int32 nKashidaPos = -1;
sal_Unicode cCh, cPrevCh = 0;
auto stKashidaPos = i18nutil::GetWordKashidaPosition(aWord);
int nPriorityLevel = 7; // 0..6 = level found
// 7 not found
sal_Int32 nWordLen = aWord.getLength();
// ignore trailing vowel chars
while( nWordLen && isTransparentChar( aWord[ nWordLen - 1 ] ))
--nWordLen;
while ( nIdx < nWordLen )
if (stKashidaPos.has_value())
{
cCh = aWord[ nIdx ];
sal_Int32 nKashidaPos = aWordSel.Min().GetIndex() + stKashidaPos->nIndex;
// 1. Priority:
// after user inserted kashida
if ( 0x640 == cCh )
{
nKashidaPos = aWordSel.Min().GetIndex() + nIdx;
nPriorityLevel = 0;
}
// 2. Priority:
// after a Seen or Sad
if (nPriorityLevel >= 1 && nIdx < nWordLen - 1)
{
if( isSeenOrSadChar( cCh )
&& (aWord[ nIdx+1 ] != 0x200C) ) // #i98410#: prevent ZWNJ expansion
{
nKashidaPos = aWordSel.Min().GetIndex() + nIdx;
nPriorityLevel = 1;
}
}
// 3. Priority:
// before final form of Teh Marbuta, Heh, Dal
if ( nPriorityLevel >= 2 && nIdx > 0 )
{
if ( isTehMarbutaChar ( cCh ) || // Teh Marbuta (right joining)
isDalChar ( cCh ) || // Dal (right joining) final form may appear in the middle of word
( isHehChar ( cCh ) && nIdx == nWordLen - 1)) // Heh (dual joining) only at end of word
{
SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx;
nPriorityLevel = 2;
}
}
}
// 4. Priority:
// before final form of Alef, Tah, Lam, Kaf or Gaf
if ( nPriorityLevel >= 3 && nIdx > 0 )
{
if ( isAlefChar ( cCh ) || // Alef (right joining) final form may appear in the middle of word
(( isLamChar ( cCh ) || // Lam,
isTahChar ( cCh ) || // Tah,
isKafChar ( cCh ) || // Kaf (all dual joining)
isGafChar ( cCh ) )
&& nIdx == nWordLen - 1)) // only at end of word
{
SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx;
nPriorityLevel = 3;
}
}
}
// 5. Priority:
// before medial Beh-like
if ( nPriorityLevel >= 4 && nIdx > 0 && nIdx < nWordLen - 1 )
{
if ( isBehChar ( cCh ) )
{
// check if next character is Reh or Yeh-like
sal_Unicode cNextCh = aWord[ nIdx + 1 ];
if ( isRehChar ( cNextCh ) || isYehChar ( cNextCh ))
{
SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx;
nPriorityLevel = 4;
}
}
}
}
// 6. Priority:
// before the final form of Waw, Ain, Qaf and Feh
if ( nPriorityLevel >= 5 && nIdx > 0 )
{
if ( isWawChar ( cCh ) || // Wav (right joining)
// final form may appear in the middle of word
(( isAinChar ( cCh ) || // Ain (dual joining)
isQafChar ( cCh ) || // Qaf (dual joining)
isFehChar ( cCh ) ) // Feh (dual joining)
&& nIdx == nWordLen - 1)) // only at end of word
{
SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx;
nPriorityLevel = 5;
}
}
}
// other connecting possibilities
if ( nPriorityLevel >= 6 && nIdx > 0 )
{
// Reh, Zain
if ( isRehChar ( cCh ) )
{
SAL_WARN_IF( 0 == cPrevCh, "editeng", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aWordSel.Min().GetIndex() + nPrevIdx;
nPriorityLevel = 6;
}
}
}
// Do not consider vowel marks when checking if a character
// can be connected to previous character.
if ( !isTransparentChar ( cCh) )
{
cPrevCh = cCh;
nPrevIdx = nIdx;
}
++nIdx;
} // end of current word
if (nKashidaPos >= 0)
{
SeekCursor(pNode, nKashidaPos + 1, aTmpFont);
aTmpFont.SetPhysFont(*GetRefDevice());

View file

@ -0,0 +1,24 @@
# -*- Mode: makefile-gmake; tab-width: 4; indent-tabs-mode: t; fill-column: 100 -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
$(eval $(call gb_CppunitTest_CppunitTest,i18nutil_kashida))
$(eval $(call gb_CppunitTest_add_exception_objects,i18nutil_kashida,\
i18nutil/qa/cppunit/test_kashida \
))
$(eval $(call gb_CppunitTest_use_libraries,i18nutil_kashida,\
i18nutil \
sal \
test \
))
# vim: set noet sw=4 ts=4:

View file

@ -44,6 +44,7 @@ $(eval $(call gb_Library_use_libraries,i18nutil,\
$(eval $(call gb_Library_add_exception_objects,i18nutil,\
i18nutil/source/utility/casefolding \
i18nutil/source/utility/kashida \
i18nutil/source/utility/oneToOneMapping \
i18nutil/source/utility/paper \
i18nutil/source/utility/scripttypedetector \

View file

@ -12,4 +12,8 @@ $(eval $(call gb_Module_add_targets,i18nutil,\
Library_i18nutil \
))
$(eval $(call gb_Module_add_check_targets,i18nutil,\
CppunitTest_i18nutil_kashida \
))
# vim: set noet sw=4:

View file

@ -0,0 +1,58 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include <sal/types.h>
#include <cppunit/TestAssert.h>
#include <cppunit/TestFixture.h>
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/plugin/TestPlugIn.h>
#include <i18nutil/kashida.hxx>
using namespace i18nutil;
namespace
{
// CppUnit fixture for the kashida candidate selection exposed through
// GetWordKashidaPosition() in <i18nutil/kashida.hxx>.
class KashidaTest : public CppUnit::TestFixture
{
public:
// Pins the candidate positions returned for a fixed list of sample words.
void testCharacteristic();
// Suite declaration: lists the test methods of this fixture for CppUnit.
CPPUNIT_TEST_SUITE(KashidaTest);
CPPUNIT_TEST(testCharacteristic);
CPPUNIT_TEST_SUITE_END();
};
// Characteristic test: each assertion records what GetWordKashidaPosition()
// returns for one word, so that any change to the selection rules shows up
// as a failure here.
void KashidaTest::testCharacteristic()
{
// Characteristic tests for kashida candidate selection.
// Uses words from sample documents.
// Two kinds of expectation are used below:
//  - !has_value(): no kashida candidate is reported for the word;
//  - value().nIndex == N: the candidate is at zero-based index N of the word.
CPPUNIT_ASSERT(!GetWordKashidaPosition(u"متن"_ustr).has_value());
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"فارسی"_ustr).value().nIndex);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), GetWordKashidaPosition(u"با"_ustr).value().nIndex);
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"نویسه"_ustr).value().nIndex);
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), GetWordKashidaPosition(u"کشیده"_ustr).value().nIndex);
CPPUNIT_ASSERT_EQUAL(sal_Int32(0), GetWordKashidaPosition(u"برای"_ustr).value().nIndex);
CPPUNIT_ASSERT(!GetWordKashidaPosition(u"چینش"_ustr).has_value());
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), GetWordKashidaPosition(u"بهتر"_ustr).value().nIndex);
CPPUNIT_ASSERT(!GetWordKashidaPosition(u"ببببب"_ustr).has_value());
CPPUNIT_ASSERT(!GetWordKashidaPosition(u"بپپپپ"_ustr).has_value());
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"تطویل"_ustr).value().nIndex);
CPPUNIT_ASSERT(!GetWordKashidaPosition(u"بپ"_ustr).has_value());
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), GetWordKashidaPosition(u"تطوی"_ustr).value().nIndex);
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"تحویل"_ustr).value().nIndex);
CPPUNIT_ASSERT_EQUAL(sal_Int32(1), GetWordKashidaPosition(u"تشویل"_ustr).value().nIndex);
CPPUNIT_ASSERT_EQUAL(sal_Int32(3), GetWordKashidaPosition(u"تمثیل"_ustr).value().nIndex);
}
CPPUNIT_TEST_SUITE_REGISTRATION(KashidaTest);
}
CPPUNIT_PLUGIN_IMPLEMENT();
/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */

View file

@ -0,0 +1,286 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include <i18nutil/kashida.hxx>
#include <i18nutil/unicode.hxx>
#include <sal/log.hxx>
namespace
{
/*
https://www.khtt.net/en/page/1821/the-big-kashida-secret
the rules of priorities that govern the addition of kashidas in Arabic text
made ... for ... Explorer 5.5 browser.
The kashida justification is based on a connection priority scheme that
decides where kashidas are put automatically.
This is how the software decides on kashida-inserting priorities:
1. First it looks for characters with the highest priority in each word,
which means kashida-extensions will only be used in one position in each
word. Not more.
2. The kashida will be connected to the character with the highest priority.
3. If kashida connection opportunities are found with an equal level of
priority in one word, the kashida will be placed towards the end of the
word.
The priority list of characters and the positioning is as follows:
1. after a kashida that is manually placed in the text by the user,
2. after a Seen or Sad (initial and medial form),
3. before the final form of Taa Marbutah, Haa, Dal,
4. before the final form of Alef, Tah, Lam, Kaf and Gaf,
5. before the preceding medial Baa of Ra, Ya and Alef Maqsurah,
6. before the final form of Waw, Ain, Qaf and Fa,
7. before the final form of other characters that can be connected.
*/
#define IS_JOINING_GROUP(c, g) (u_getIntPropertyValue((c), UCHAR_JOINING_GROUP) == U_JG_##g)
#define isAinChar(c) IS_JOINING_GROUP((c), AIN)
#define isAlefChar(c) IS_JOINING_GROUP((c), ALEF)
#define isDalChar(c) IS_JOINING_GROUP((c), DAL)
#define isFehChar(c) (IS_JOINING_GROUP((c), FEH) || IS_JOINING_GROUP((c), AFRICAN_FEH))
#define isGafChar(c) IS_JOINING_GROUP((c), GAF)
#define isHehChar(c) IS_JOINING_GROUP((c), HEH)
#define isKafChar(c) IS_JOINING_GROUP((c), KAF)
#define isLamChar(c) IS_JOINING_GROUP((c), LAM)
#define isQafChar(c) (IS_JOINING_GROUP((c), QAF) || IS_JOINING_GROUP((c), AFRICAN_QAF))
#define isRehChar(c) IS_JOINING_GROUP((c), REH)
#define isTahChar(c) IS_JOINING_GROUP((c), TAH)
#define isTehMarbutaChar(c) IS_JOINING_GROUP((c), TEH_MARBUTA)
#define isWawChar(c) IS_JOINING_GROUP((c), WAW)
#define isSeenOrSadChar(c) (IS_JOINING_GROUP((c), SAD) || IS_JOINING_GROUP((c), SEEN))
// Beh-like letters: Beh itself plus the joining groups that behave like Beh
// in medial form.
bool isBehChar(sal_Unicode cCh)
{
    switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP))
    {
        case U_JG_BEH:
        case U_JG_NOON:
        case U_JG_AFRICAN_NOON:
        case U_JG_NYA:
        case U_JG_YEH:
        case U_JG_FARSI_YEH:
        case U_JG_BURUSHASKI_YEH_BARREE:
            return true;
        default:
            return false;
    }
}
// Yeh-like letters: Yeh itself plus the joining groups that behave like Yeh
// in final form.
bool isYehChar(sal_Unicode cCh)
{
    switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP))
    {
        case U_JG_YEH:
        case U_JG_FARSI_YEH:
        case U_JG_YEH_BARREE:
        case U_JG_BURUSHASKI_YEH_BARREE:
        case U_JG_YEH_WITH_TAIL:
            return true;
        default:
            return false;
    }
}
// True when ICU reports the joining type of cCh as "transparent".
bool isTransparentChar(sal_Unicode cCh)
{
    const int32_t nJoiningType = u_getIntPropertyValue(cCh, UCHAR_JOINING_TYPE);
    return U_JT_TRANSPARENT == nJoiningType;
}
// Tells whether cCh followed by cNextCh forms a ligature (used for Kashidas).
// The only combination recognised is Lam followed by Alef.
bool isLigature(sal_Unicode cCh, sal_Unicode cNextCh)
{
    if (!isLamChar(cCh))
        return false;
    return isAlefChar(cNextCh);
}
// Tells whether cCh can be connected to the preceding character cPrevCh
// (used for Kashidas).
bool CanConnectToPrev(sal_Unicode cCh, sal_Unicode cPrevCh)
{
    // A right-joining or non-joining predecessor offers no connection.
    switch (u_getIntPropertyValue(cPrevCh, UCHAR_JOINING_TYPE))
    {
        case U_JT_RIGHT_JOINING:
        case U_JT_NON_JOINING:
            return false;
        default:
            break;
    }
    // cPrevCh + cCh must not form a ligature either.
    return !isLigature(cPrevCh, cCh);
}
}
std::optional<i18nutil::KashidaPosition> i18nutil::GetWordKashidaPosition(const OUString& rWord)
{
sal_Int32 nIdx = 0;
sal_Int32 nPrevIdx = 0;
sal_Int32 nKashidaPos = -1;
sal_Unicode cCh = 0;
sal_Unicode cPrevCh = 0;
int nPriorityLevel = 7; // 0..6 = level found, 7 not found
sal_Int32 nWordLen = rWord.getLength();
// ignore trailing vowel chars
while (nWordLen && isTransparentChar(rWord[nWordLen - 1]))
{
--nWordLen;
}
while (nIdx < nWordLen)
{
cCh = rWord[nIdx];
// 1. Priority:
// after user inserted kashida
if (0x640 == cCh)
{
nKashidaPos = nIdx;
nPriorityLevel = 0;
}
// 2. Priority:
// after a Seen or Sad
if (nPriorityLevel >= 1 && nIdx < nWordLen - 1)
{
if (isSeenOrSadChar(cCh)
&& (rWord[nIdx + 1] != 0x200C)) // #i98410#: prevent ZWNJ expansion
{
nKashidaPos = nIdx;
nPriorityLevel = 1;
}
}
// 3. Priority:
// before final form of Teh Marbuta, Heh, Dal
if (nPriorityLevel >= 2 && nIdx > 0)
{
// Teh Marbuta (right joining)
// Dal (right joining) final form may appear in the middle of word
// Heh (dual joining) only at end of word
if (isTehMarbutaChar(cCh) || isDalChar(cCh) || (isHehChar(cCh) && nIdx == nWordLen - 1))
{
SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character");
// check if character is connectable to previous character,
if (CanConnectToPrev(cCh, cPrevCh))
{
nKashidaPos = nPrevIdx;
nPriorityLevel = 2;
}
}
}
// 4. Priority:
// before final form of Alef, Tah, Lam, Kaf or Gaf
if (nPriorityLevel >= 3 && nIdx > 0)
{
// Alef (right joining) final form may appear in the middle of word
// Lam, Tah, Kaf (all dual joining) only at end of word
if (isAlefChar(cCh)
|| ((isLamChar(cCh) || isTahChar(cCh) || isKafChar(cCh) || isGafChar(cCh))
&& nIdx == nWordLen - 1))
{
SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character");
// check if character is connectable to previous character,
if (CanConnectToPrev(cCh, cPrevCh))
{
nKashidaPos = nPrevIdx;
nPriorityLevel = 3;
}
}
}
// 5. Priority:
// before medial Beh-like
if (nPriorityLevel >= 4 && nIdx > 0 && nIdx < nWordLen - 1)
{
if (isBehChar(cCh))
{
// check if next character is Reh or Yeh-like
sal_Unicode cNextCh = rWord[nIdx + 1];
if (isRehChar(cNextCh) || isYehChar(cNextCh))
{
SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character");
// check if character is connectable to previous character,
if (CanConnectToPrev(cCh, cPrevCh))
{
nKashidaPos = nPrevIdx;
nPriorityLevel = 4;
}
}
}
}
// 6. Priority:
// before the final form of Waw, Ain, Qaf and Feh
if (nPriorityLevel >= 5 && nIdx > 0)
{
// Wav (right joining) final form may appear in the middle of word
// Ain, Qaf, Feh (all dual joining) only at end of word
if (isWawChar(cCh)
|| ((isAinChar(cCh) || isQafChar(cCh) || isFehChar(cCh)) && nIdx == nWordLen - 1))
{
SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character");
// check if character is connectable to previous character,
if (CanConnectToPrev(cCh, cPrevCh))
{
nKashidaPos = nPrevIdx;
nPriorityLevel = 5;
}
}
}
// other connecting possibilities
if (nPriorityLevel >= 6 && nIdx > 0)
{
// Reh, Zain
if (isRehChar(cCh))
{
SAL_WARN_IF(0 == cPrevCh, "i18n", "No previous character");
// check if character is connectable to previous character,
if (CanConnectToPrev(cCh, cPrevCh))
{
nKashidaPos = nPrevIdx;
nPriorityLevel = 6;
}
}
}
// Do not consider vowel marks when checking if a character
// can be connected to previous character.
if (!isTransparentChar(cCh))
{
cPrevCh = cCh;
nPrevIdx = nIdx;
}
++nIdx;
} // end of current word
if (-1 != nKashidaPos)
{
return KashidaPosition{ nKashidaPos };
}
return std::nullopt;
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */

View file

@ -0,0 +1,24 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#pragma once

#include <i18nutil/i18nutildllapi.h>
#include <rtl/ustring.hxx>
#include <optional>

namespace i18nutil
{
/** Kashida candidate position found inside a single word. */
struct KashidaPosition
{
    /// Zero-based index into the word that was passed to
    /// GetWordKashidaPosition(); callers add their own word start offset.
    sal_Int32 nIndex;
};

/** Selects the kashida candidate position for one word.

    @param rWord  the word to examine
    @return the candidate position relative to the start of rWord, or
            std::nullopt if the word offers no candidate
*/
I18NUTIL_DLLPUBLIC std::optional<KashidaPosition> GetWordKashidaPosition(const OUString& rWord);
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab cinoptions=b1,g0,N-s cinkeys+=0=break: */

View file

@ -79,124 +79,12 @@
#include <unicode/ubidi.h>
#include <i18nutil/scripttypedetector.hxx>
#include <i18nutil/unicode.hxx>
#include <i18nutil/kashida.hxx>
#include <unotxdoc.hxx>
using namespace ::com::sun::star;
using namespace i18n::ScriptType;
/*
https://www.khtt.net/en/page/1821/the-big-kashida-secret
the rules of priorities that govern the addition of kashidas in Arabic text
made ... for ... Explorer 5.5 browser.
The kashida justification is based on a connection priority scheme that
decides where kashidas are put automatically.
This is how the software decides on kashida-inserting priorities:
1. First it looks for characters with the highest priority in each word,
which means kashida-extensions will only been used in one position in each
word. Not more.
2. The kashida will be connected to the character with the highest priority.
3. If kashida connection opportunities are found with an equal level of
priority in one word, the kashida will be placed towards the end of the
word.
The priority list of characters and the positioning is as follows:
1. after a kashida that is manually placed in the text by the user,
2. after a Seen or Sad (initial and medial form),
3. before the final form of Taa Marbutah, Haa, Dal,
4. before the final form of Alef, Tah Lam, Kaf and Gaf,
5. before the preceding medial Baa of Ra, Ya and Alef Maqsurah,
6. before the final form of Waw, Ain, Qaf and Fa,
7. before the final form of other characters that can be connected.
*/
#define IS_JOINING_GROUP(c, g) ( u_getIntPropertyValue( (c), UCHAR_JOINING_GROUP ) == U_JG_##g )
#define isAinChar(c) IS_JOINING_GROUP((c), AIN)
#define isAlefChar(c) IS_JOINING_GROUP((c), ALEF)
#define isDalChar(c) IS_JOINING_GROUP((c), DAL)
#define isFehChar(c) (IS_JOINING_GROUP((c), FEH) || IS_JOINING_GROUP((c), AFRICAN_FEH))
#define isGafChar(c) IS_JOINING_GROUP((c), GAF)
#define isHehChar(c) IS_JOINING_GROUP((c), HEH)
#define isKafChar(c) IS_JOINING_GROUP((c), KAF)
#define isLamChar(c) IS_JOINING_GROUP((c), LAM)
#define isQafChar(c) (IS_JOINING_GROUP((c), QAF) || IS_JOINING_GROUP((c), AFRICAN_QAF))
#define isRehChar(c) IS_JOINING_GROUP((c), REH)
#define isTahChar(c) IS_JOINING_GROUP((c), TAH)
#define isTehMarbutaChar(c) IS_JOINING_GROUP((c), TEH_MARBUTA)
#define isWawChar(c) IS_JOINING_GROUP((c), WAW)
#define isSeenOrSadChar(c) (IS_JOINING_GROUP((c), SAD) || IS_JOINING_GROUP((c), SEEN))
// Beh and characters that behave like Beh in medial form.
static bool isBehChar(sal_Unicode cCh)
{
bool bRet = false;
switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP))
{
case U_JG_BEH:
case U_JG_NOON:
case U_JG_AFRICAN_NOON:
case U_JG_NYA:
case U_JG_YEH:
case U_JG_FARSI_YEH:
case U_JG_BURUSHASKI_YEH_BARREE:
bRet = true;
break;
default:
bRet = false;
break;
}
return bRet;
}
// Yeh and characters that behave like Yeh in final form.
static bool isYehChar(sal_Unicode cCh)
{
bool bRet = false;
switch (u_getIntPropertyValue(cCh, UCHAR_JOINING_GROUP))
{
case U_JG_YEH:
case U_JG_FARSI_YEH:
case U_JG_YEH_BARREE:
case U_JG_BURUSHASKI_YEH_BARREE:
case U_JG_YEH_WITH_TAIL:
bRet = true;
break;
default:
bRet = false;
break;
}
return bRet;
}
static bool isTransparentChar ( sal_Unicode cCh )
{
return u_getIntPropertyValue( cCh, UCHAR_JOINING_TYPE ) == U_JT_TRANSPARENT;
}
// Checks if cCh + cNectCh builds a ligature (used for Kashidas)
static bool lcl_IsLigature( sal_Unicode cCh, sal_Unicode cNextCh )
{
// Lam + Alef
return ( isLamChar ( cCh ) && isAlefChar ( cNextCh ));
}
// Checks if cCh is connectable to cPrevCh (used for Kashidas)
static bool lcl_ConnectToPrev( sal_Unicode cCh, sal_Unicode cPrevCh )
{
const int32_t nJoiningType = u_getIntPropertyValue( cPrevCh, UCHAR_JOINING_TYPE );
bool bRet = nJoiningType != U_JT_RIGHT_JOINING && nJoiningType != U_JT_NON_JOINING;
// check for ligatures cPrevChar + cChar
if( bRet )
bRet = !lcl_IsLigature( cPrevCh, cCh );
return bRet;
}
static bool lcl_HasStrongLTR ( std::u16string_view rText, sal_Int32 nStart, sal_Int32 nEnd )
{
for( sal_Int32 nCharIdx = nStart; nCharIdx < nEnd; ++nCharIdx )
@ -1618,157 +1506,16 @@ void SwScriptInfo::InitScriptInfo(const SwTextNode& rNode,
while ( aScanner.NextWord() )
{
const OUString& rWord = aScanner.GetWord();
auto stKashidaPos = i18nutil::GetWordKashidaPosition(rWord);
sal_Int32 nIdx = 0, nPrevIdx = 0;
sal_Int32 nKashidaPos = -1;
sal_Unicode cCh, cPrevCh = 0;
int nPriorityLevel = 7; // 0..6 = level found
// 7 not found
sal_Int32 nWordLen = rWord.getLength();
// ignore trailing vowel chars
while( nWordLen && isTransparentChar( rWord[ nWordLen - 1 ] ))
--nWordLen;
while (nIdx < nWordLen)
if (stKashidaPos.has_value())
{
cCh = rWord[ nIdx ];
// 1. Priority:
// after user inserted kashida
if ( 0x640 == cCh )
{
nKashidaPos = aScanner.GetBegin() + nIdx;
nPriorityLevel = 0;
// Only populate kashida positions for the invalidated tail
TextFrameIndex nNewKashidaPos{aScanner.GetBegin() + stKashidaPos->nIndex};
if(nNewKashidaPos >= nLastKashida) {
m_Kashida.insert(m_Kashida.begin() + nCntKash, nNewKashidaPos);
nCntKash++;
}
// 2. Priority:
// after a Seen or Sad
if (nPriorityLevel >= 1 && nIdx < nWordLen - 1)
{
if( isSeenOrSadChar( cCh )
&& (rWord[ nIdx+1 ] != 0x200C) ) // #i98410#: prevent ZWNJ expansion
{
nKashidaPos = aScanner.GetBegin() + nIdx;
nPriorityLevel = 1;
}
}
// 3. Priority:
// before final form of Teh Marbuta, Heh, Dal
if ( nPriorityLevel >= 2 && nIdx > 0 )
{
if ( isTehMarbutaChar ( cCh ) || // Teh Marbuta (right joining)
isDalChar ( cCh ) || // Dal (right joining) final form may appear in the middle of word
( isHehChar ( cCh ) && nIdx == nWordLen - 1)) // Heh (dual joining) only at end of word
{
SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aScanner.GetBegin() + nPrevIdx;
nPriorityLevel = 2;
}
}
}
// 4. Priority:
// before final form of Alef, Tah, Lam, Kaf or Gaf
if ( nPriorityLevel >= 3 && nIdx > 0 )
{
if ( isAlefChar ( cCh ) || // Alef (right joining) final form may appear in the middle of word
(( isLamChar ( cCh ) || // Lam,
isTahChar ( cCh ) || // Tah,
isKafChar ( cCh ) || // Kaf (all dual joining)
isGafChar ( cCh ) )
&& nIdx == nWordLen - 1)) // only at end of word
{
SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aScanner.GetBegin() + nPrevIdx;
nPriorityLevel = 3;
}
}
}
// 5. Priority:
// before medial Beh-like
if ( nPriorityLevel >= 4 && nIdx > 0 && nIdx < nWordLen - 1 )
{
if ( isBehChar ( cCh ) )
{
// check if next character is Reh or Yeh-like
sal_Unicode cNextCh = rWord[ nIdx + 1 ];
if ( isRehChar ( cNextCh ) || isYehChar ( cNextCh ))
{
SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aScanner.GetBegin() + nPrevIdx;
nPriorityLevel = 4;
}
}
}
}
// 6. Priority:
// before the final form of Waw, Ain, Qaf and Feh
if ( nPriorityLevel >= 5 && nIdx > 0 )
{
if ( isWawChar ( cCh ) || // Wav (right joining)
// final form may appear in the middle of word
(( isAinChar ( cCh ) || // Ain (dual joining)
isQafChar ( cCh ) || // Qaf (dual joining)
isFehChar ( cCh ) ) // Feh (dual joining)
&& nIdx == nWordLen - 1)) // only at end of word
{
SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aScanner.GetBegin() + nPrevIdx;
nPriorityLevel = 5;
}
}
}
// other connecting possibilities
if ( nPriorityLevel >= 6 && nIdx > 0 )
{
// Reh, Zain
if ( isRehChar ( cCh ) )
{
SAL_WARN_IF( 0 == cPrevCh, "sw.core", "No previous character" );
// check if character is connectable to previous character,
if ( lcl_ConnectToPrev( cCh, cPrevCh ) )
{
nKashidaPos = aScanner.GetBegin() + nPrevIdx;
nPriorityLevel = 6;
}
}
}
// Do not consider vowel marks when checking if a character
// can be connected to previous character.
if ( !isTransparentChar ( cCh) )
{
cPrevCh = cCh;
nPrevIdx = nIdx;
}
++nIdx;
} // end of current word
if ( -1 != nKashidaPos )
{
m_Kashida.insert(m_Kashida.begin() + nCntKash, TextFrameIndex(nKashidaPos));
nCntKash++;
}
} // end of kashida search
}