Fix bug in Khmer line breaking and update dictionary
Change-Id: I2b776925c2c95cb56ccd592d036823c26054e059
Reviewed-on: https://gerrit.libreoffice.org/23316
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Martin Hosken <martin_hosken@sil.org>
This commit is contained in:
parent
1caac28389
commit
a976a19ca8
2 changed files with 17 additions and 310 deletions
327
external/icu/khmerbreakengine.patch
vendored
327
external/icu/khmerbreakengine.patch
vendored
|
@ -2,7 +2,7 @@ diff --git a/source/common/dictbe.cpp b/source/common/dictbe.cpp
|
|||
index f1c874d..3ad1b3f 100644
|
||||
--- misc/icu/source/common/dictbe.cpp
|
||||
+++ build/icu/source/common/dictbe.cpp
|
||||
@@ -27,8 +27,16 @@ U_NAMESPACE_BEGIN
|
||||
@@ -27,8 +27,17 @@ U_NAMESPACE_BEGIN
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
|
@ -14,13 +14,14 @@ index f1c874d..3ad1b3f 100644
|
|||
fTypes = breakTypes;
|
||||
+ fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);
|
||||
+
|
||||
+ // note Skip Sets contain fIgnoreSet characters too.
|
||||
+ fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]]\\u200C\\u200D\\u2060"), status);
|
||||
+ fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]]\\u200C\\u200D\\u2060"), status);
|
||||
+ fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);
|
||||
}
|
||||
|
||||
DictionaryBreakEngine::~DictionaryBreakEngine() {
|
||||
@@ -90,7 +98,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
|
||||
@@ -90,7 +99,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
|
||||
utext_setNativeIndex(text, current);
|
||||
}
|
||||
|
@ -29,7 +30,7 @@ index f1c874d..3ad1b3f 100644
|
|||
return result;
|
||||
}
|
||||
|
||||
@@ -101,6 +109,163 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
|
||||
@@ -101,6 +110,169 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
|
||||
fSet.compact();
|
||||
}
|
||||
|
||||
|
@ -87,6 +88,8 @@ index f1c874d..3ad1b3f 100644
|
|||
+ }
|
||||
+ for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
|
||||
+ while (start > textStart) {
|
||||
+ while (fIgnoreSet.contains(c))
|
||||
+ c = utext_previous32(text);
|
||||
+ if (!fMarkSet.contains(c)) {
|
||||
+ if (fBaseSet.contains(c)) {
|
||||
+ c = utext_previous32(text);
|
||||
|
@ -125,6 +128,10 @@ index f1c874d..3ad1b3f 100644
|
|||
+ ++end;
|
||||
+ }
|
||||
+ for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
|
||||
+ while (fIgnoreSet.contains(c)) {
|
||||
+ utext_next32(text);
|
||||
+ c = utext_current32(text);
|
||||
+ }
|
||||
+ if (fBaseSet.contains(c)) {
|
||||
+ while (end < textEnd) {
|
||||
+ utext_next32(text);
|
||||
|
@ -193,7 +200,7 @@ index f1c874d..3ad1b3f 100644
|
|||
/*
|
||||
******************************************************************
|
||||
* PossibleWord
|
||||
@@ -128,35 +293,35 @@ private:
|
||||
@@ -128,35 +302,35 @@ private:
|
||||
public:
|
||||
PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
|
||||
~PossibleWord() {};
|
||||
|
@ -238,242 +245,7 @@ index f1c874d..3ad1b3f 100644
|
|||
// Dictionary leaves text after longest prefix, not longest word. Back up.
|
||||
if (count <= 0) {
|
||||
utext_setNativeIndex(text, start);
|
||||
@@ -261,16 +426,16 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t current;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
PossibleWord words[THAI_LOOKAHEAD];
|
||||
-
|
||||
+
|
||||
utext_setNativeIndex(text, rangeStart);
|
||||
-
|
||||
+
|
||||
while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
|
||||
cpWordLength = 0;
|
||||
cuWordLength = 0;
|
||||
|
||||
// Look for candidate words at the current position
|
||||
int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
-
|
||||
+
|
||||
// If we found exactly one, use that
|
||||
if (candidates == 1) {
|
||||
cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
|
||||
@@ -291,12 +456,12 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
|
||||
wordsMatched = 2;
|
||||
}
|
||||
-
|
||||
+
|
||||
// If we're already at the end of the range, we're done
|
||||
if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
|
||||
goto foundBest;
|
||||
}
|
||||
-
|
||||
+
|
||||
// See if any of the possible second words is followed by a third word
|
||||
do {
|
||||
// If we find a third word, stop right away
|
||||
@@ -315,13 +480,13 @@ foundBest:
|
||||
cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
|
||||
wordsFound += 1;
|
||||
}
|
||||
-
|
||||
+
|
||||
// We come here after having either found a word or not. We look ahead to the
|
||||
// next word. If it's not a dictionary word, we will combine it with the word we
|
||||
// just found (if there is one), but only if the preceding word does not exceed
|
||||
// the threshold.
|
||||
// The text iterator should now be positioned at the end of the word we found.
|
||||
-
|
||||
+
|
||||
UChar32 uc = 0;
|
||||
if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
|
||||
// if it is a dictionary word, do nothing. If it isn't, then if there is
|
||||
@@ -357,12 +522,12 @@ foundBest:
|
||||
}
|
||||
}
|
||||
}
|
||||
-
|
||||
+
|
||||
// Bump the word count if there wasn't already one
|
||||
if (cuWordLength <= 0) {
|
||||
wordsFound += 1;
|
||||
}
|
||||
-
|
||||
+
|
||||
// Update the length with the passed-over characters
|
||||
cuWordLength += chars;
|
||||
}
|
||||
@@ -371,14 +536,14 @@ foundBest:
|
||||
utext_setNativeIndex(text, current+cuWordLength);
|
||||
}
|
||||
}
|
||||
-
|
||||
+
|
||||
// Never stop before a combining mark.
|
||||
int32_t currPos;
|
||||
while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
|
||||
utext_next32(text);
|
||||
cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
|
||||
}
|
||||
-
|
||||
+
|
||||
// Look ahead for possible suffixes if a dictionary word does not follow.
|
||||
// We do this in code rather than using a rule so that the heuristic
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
@@ -496,16 +661,16 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t current;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
PossibleWord words[LAO_LOOKAHEAD];
|
||||
-
|
||||
+
|
||||
utext_setNativeIndex(text, rangeStart);
|
||||
-
|
||||
+
|
||||
while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
|
||||
cuWordLength = 0;
|
||||
cpWordLength = 0;
|
||||
|
||||
// Look for candidate words at the current position
|
||||
int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
-
|
||||
+
|
||||
// If we found exactly one, use that
|
||||
if (candidates == 1) {
|
||||
cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
|
||||
@@ -526,12 +691,12 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
words[wordsFound%LAO_LOOKAHEAD].markCurrent();
|
||||
wordsMatched = 2;
|
||||
}
|
||||
-
|
||||
+
|
||||
// If we're already at the end of the range, we're done
|
||||
if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
|
||||
goto foundBest;
|
||||
}
|
||||
-
|
||||
+
|
||||
// See if any of the possible second words is followed by a third word
|
||||
do {
|
||||
// If we find a third word, stop right away
|
||||
@@ -549,7 +714,7 @@ foundBest:
|
||||
cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
|
||||
wordsFound += 1;
|
||||
}
|
||||
-
|
||||
+
|
||||
// We come here after having either found a word or not. We look ahead to the
|
||||
// next word. If it's not a dictionary word, we will combine it withe the word we
|
||||
// just found (if there is one), but only if the preceding word does not exceed
|
||||
@@ -587,12 +752,12 @@ foundBest:
|
||||
}
|
||||
}
|
||||
}
|
||||
-
|
||||
+
|
||||
// Bump the word count if there wasn't already one
|
||||
if (cuWordLength <= 0) {
|
||||
wordsFound += 1;
|
||||
}
|
||||
-
|
||||
+
|
||||
// Update the length with the passed-over characters
|
||||
cuWordLength += chars;
|
||||
}
|
||||
@@ -601,14 +766,14 @@ foundBest:
|
||||
utext_setNativeIndex(text, current + cuWordLength);
|
||||
}
|
||||
}
|
||||
-
|
||||
+
|
||||
// Never stop before a combining mark.
|
||||
int32_t currPos;
|
||||
while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
|
||||
utext_next32(text);
|
||||
cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
|
||||
}
|
||||
-
|
||||
+
|
||||
// Look ahead for possible suffixes if a dictionary word does not follow.
|
||||
// We do this in code rather than using a rule so that the heuristic
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
@@ -689,16 +854,16 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t current;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
PossibleWord words[BURMESE_LOOKAHEAD];
|
||||
-
|
||||
+
|
||||
utext_setNativeIndex(text, rangeStart);
|
||||
-
|
||||
+
|
||||
while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
|
||||
cuWordLength = 0;
|
||||
cpWordLength = 0;
|
||||
|
||||
// Look for candidate words at the current position
|
||||
int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
-
|
||||
+
|
||||
// If we found exactly one, use that
|
||||
if (candidates == 1) {
|
||||
cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
|
||||
@@ -719,12 +884,12 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
|
||||
wordsMatched = 2;
|
||||
}
|
||||
-
|
||||
+
|
||||
// If we're already at the end of the range, we're done
|
||||
if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
|
||||
goto foundBest;
|
||||
}
|
||||
-
|
||||
+
|
||||
// See if any of the possible second words is followed by a third word
|
||||
do {
|
||||
// If we find a third word, stop right away
|
||||
@@ -742,7 +907,7 @@ foundBest:
|
||||
cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
|
||||
wordsFound += 1;
|
||||
}
|
||||
-
|
||||
+
|
||||
// We come here after having either found a word or not. We look ahead to the
|
||||
// next word. If it's not a dictionary word, we will combine it withe the word we
|
||||
// just found (if there is one), but only if the preceding word does not exceed
|
||||
@@ -780,12 +945,12 @@ foundBest:
|
||||
}
|
||||
}
|
||||
}
|
||||
-
|
||||
+
|
||||
// Bump the word count if there wasn't already one
|
||||
if (cuWordLength <= 0) {
|
||||
wordsFound += 1;
|
||||
}
|
||||
-
|
||||
+
|
||||
// Update the length with the passed-over characters
|
||||
cuWordLength += chars;
|
||||
}
|
||||
@@ -794,14 +959,14 @@ foundBest:
|
||||
utext_setNativeIndex(text, current + cuWordLength);
|
||||
}
|
||||
}
|
||||
-
|
||||
+
|
||||
// Never stop before a combining mark.
|
||||
int32_t currPos;
|
||||
while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
|
||||
utext_next32(text);
|
||||
cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
|
||||
}
|
||||
-
|
||||
+
|
||||
// Look ahead for possible suffixes if a dictionary word does not follow.
|
||||
// We do this in code rather than using a rule so that the heuristic
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
@@ -828,51 +993,28 @@ foundBest:
|
||||
@@ -828,51 +1002,28 @@ foundBest:
|
||||
* KhmerBreakEngine
|
||||
*/
|
||||
|
||||
|
@ -536,7 +308,7 @@ index f1c874d..3ad1b3f 100644
|
|||
}
|
||||
|
||||
KhmerBreakEngine::~KhmerBreakEngine() {
|
||||
@@ -884,180 +1027,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
@@ -884,180 +1036,204 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const {
|
||||
|
@ -560,10 +332,10 @@ index f1c874d..3ad1b3f 100644
|
|||
+ startZwsp = scanBeforeStart(text, scanStart, breakStart);
|
||||
+ }
|
||||
+ utext_setNativeIndex(text, rangeStart);
|
||||
+ scanFwdClusters(text, rangeEnd, initAfter);
|
||||
+ scanFwdClusters(text, rangeStart, initAfter);
|
||||
+ bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
|
||||
+ utext_setNativeIndex(text, rangeEnd - 1);
|
||||
+ scanBackClusters(text, rangeStart, finalBefore);
|
||||
+ scanBackClusters(text, rangeEnd, finalBefore);
|
||||
+ if (finalBefore < initAfter) { // the whole run is tented so no breaks
|
||||
+ if (breakStart || fTypes < UBRK_LINE)
|
||||
+ foundBreaks.push(rangeStart, status);
|
||||
|
@ -715,7 +487,7 @@ index f1c874d..3ad1b3f 100644
|
|||
+ if (count == 0) {
|
||||
+ utext_setNativeIndex(text, ix);
|
||||
+ int32_t c = utext_current32(text);
|
||||
+ if (fPuncSet.contains(c) || c == ZWSP || c == WJ) {
|
||||
+ if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
|
||||
+ values.setElementAt(0, count);
|
||||
+ lengths.setElementAt(1, count++);
|
||||
+ } else if (fBaseSet.contains(c)) {
|
||||
|
@ -767,7 +539,7 @@ index f1c874d..3ad1b3f 100644
|
|||
+ int32_t ln = lengths.elementAti(j);
|
||||
+ utext_setNativeIndex(text, ln+ix);
|
||||
+ int32_t c = utext_current32(text);
|
||||
+ while (fPuncSet.contains(c)) {
|
||||
+ while (fPuncSet.contains(c) || fIgnoreSet.contains(c)) {
|
||||
+ ++ln;
|
||||
+ utext_next32(text);
|
||||
+ c = utext_current32(text);
|
||||
|
@ -887,71 +659,6 @@ index f1c874d..3ad1b3f 100644
|
|||
}
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
@@ -1121,7 +1288,7 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) {
|
||||
return (int32_t)1 << bitIndex;
|
||||
}
|
||||
|
||||
-
|
||||
+
|
||||
/*
|
||||
* @param text A UText representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
@@ -1129,7 +1296,7 @@ static inline int32_t utext_i32_flag(int32_t bitIndex) {
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or 0
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
-int32_t
|
||||
+int32_t
|
||||
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
@@ -1192,7 +1359,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
-
|
||||
+
|
||||
UnicodeString fragment;
|
||||
UnicodeString normalizedFragment;
|
||||
for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
|
||||
@@ -1261,7 +1428,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
}
|
||||
}
|
||||
}
|
||||
-
|
||||
+
|
||||
// bestSnlp[i] is the snlp of the best segmentation of the first i
|
||||
// code points in the range to be matched.
|
||||
UVector32 bestSnlp(numCodePts + 1, status);
|
||||
@@ -1271,7 +1438,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
}
|
||||
|
||||
|
||||
- // prev[i] is the index of the last CJK code point in the previous word in
|
||||
+ // prev[i] is the index of the last CJK code point in the previous word in
|
||||
// the best segmentation of the first i characters.
|
||||
UVector32 prev(numCodePts + 1, status);
|
||||
for(int32_t i = 0; i <= numCodePts; i++){
|
||||
@@ -1305,8 +1472,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
// Note: lengths is filled with code point lengths
|
||||
// The NULL parameter is the ignored code unit lengths.
|
||||
|
||||
- // if there are no single character matches found in the dictionary
|
||||
- // starting with this charcter, treat character as a 1-character word
|
||||
+ // if there are no single character matches found in the dictionary
|
||||
+ // starting with this charcter, treat character as a 1-character word
|
||||
// with the highest value possible, i.e. the least likely to occur.
|
||||
// Exclude Korean characters from this treatment, as they should be left
|
||||
// together by default.
|
||||
@@ -1380,7 +1547,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
numBreaks++;
|
||||
}
|
||||
|
||||
- // Now that we're done, convert positions in t_boundary[] (indices in
|
||||
+ // Now that we're done, convert positions in t_boundary[] (indices in
|
||||
// the normalized input string) back to indices in the original input UText
|
||||
// while reversing t_boundary and pushing values to foundBreaks.
|
||||
for (int32_t i = numBreaks-1; i >= 0; i--) {
|
||||
diff --git a/source/common/dictbe.h b/source/common/dictbe.h
|
||||
index d3488cd..26caa75 100644
|
||||
--- misc/icu/source/common/dictbe.h
|
||||
|
|
BIN
external/icu/khmerdict.dict
vendored
BIN
external/icu/khmerdict.dict
vendored
Binary file not shown.
Loading…
Reference in a new issue