office-gobmx/external/icu/icu4c-khmerbreakengine.patch.1

diff -ur icu.org/source/common/dictbe.cpp icu/source/common/dictbe.cpp
--- icu.org/source/common/dictbe.cpp	2017-01-20 01:20:31.000000000 +0100
+++ icu/source/common/dictbe.cpp	2017-04-21 23:14:23.845894374 +0200
@@ -29,8 +29,17 @@
  ******************************************************************
  */

-DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
+DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) :
+    clusterLimit(3)   // default number of clusters scanned on each side of a break inhibitor
+{
+    UErrorCode status = U_ZERO_ERROR;   // NOTE(review): local only; applyPattern failures below are never reported
     fTypes = breakTypes;
+    fViramaSet.applyPattern(UNICODE_STRING_SIMPLE("[[:ccc=VR:]]"), status);   // canonical combining class "Virama"
+
+    // Note: the skip sets also list U+200C, U+200D and U+2060, the characters KhmerBreakEngine puts in fIgnoreSet.
+    fSkipStartSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=OP:][:lb=QU:]\\u200C\\u200D\\u2060]"), status);   // skipped by scanBeforeStart / scanFwdClusters
+    fSkipEndSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CP:][:lb=QU:][:lb=EX:][:lb=CL:]\\u200C\\u200D\\u2060]"), status);   // skipped by scanAfterEnd / scanBackClusters
+    fNBeforeSet.applyPattern(UNICODE_STRING_SIMPLE("[[:lb=CR:][:lb=LF:][:lb=NL:][:lb=SP:][:lb=ZW:][:lb=IS:][:lb=BA:][:lb=NS:]]"), status);   // scanAfterEnd clears doBreak when the character at `end` is in this set
 }

 DictionaryBreakEngine::~DictionaryBreakEngine() {
@@ -92,7 +101,7 @@
         result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
         utext_setNativeIndex(text, current);
     }
-
+
     return result;
 }

@@ -103,6 +112,169 @@
     fSet.compact();
 }

+bool   // true when the backward scan stops on a ZWSP; `start` and doBreak are outputs as well
+DictionaryBreakEngine::scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const {
+    UErrorCode status = U_ZERO_ERROR;   // NOTE(review): never checked after utext_clone
+    UText* ut = utext_clone(NULL, text, false, true, &status);   // iterate on a clone; `text` itself is not moved here
+    utext_setNativeIndex(ut, start);
+    UChar32 c = utext_current32(ut);    // character at `start`
+    bool res = false;
+    doBreak = true;                     // stays true only if nothing had to be skipped
+    while (start >= 0) {                // walk backwards over characters in fSkipStartSet
+        if (!fSkipStartSet.contains(c)) {
+            res = (c == ZWSP);          // first non-skipped character decides the result
+            break;
+        }
+        --start;
+        c = utext_previous32(ut);       // steps back one character and returns it
+        doBreak = false;                // at least one character was skipped
+    }
+    utext_close(ut);
+    return res;
+}
+
+// Scan forward from `end` over characters in fSkipEndSet, stopping at textEnd.
+// Returns true if the scan stops before textEnd on a ZWSP. `end` is advanced past the
+// skipped characters; doBreak is cleared if the character at `end` is in fNBeforeSet or anything was skipped.
+bool
+DictionaryBreakEngine::scanAfterEnd(UText *text, int32_t textEnd, int32_t& end, bool &doBreak) const {
+    UErrorCode status = U_ZERO_ERROR;   // NOTE(review): never checked after utext_clone
+    UText* ut = utext_clone(NULL, text, false, true, &status);   // iterate on a clone; `text` is not moved
+    utext_setNativeIndex(ut, end);
+    UChar32 c = utext_current32(ut);    // character at `end`
+    doBreak = !fNBeforeSet.contains(c);
+    while (end < textEnd && fSkipEndSet.contains(c)) {
+        ++end;
+        utext_next32(ut);               // advance first ...
+        c = utext_current32(ut);        // ... then read, so c is always the character at `end`
+        doBreak = false;                // at least one character was skipped
+    }
+    const bool res = (end < textEnd) && (c == ZWSP);   // false when the scan ran into textEnd
+    utext_close(ut);
+    return res;
+}
+
+void   // out: `start`, derived from text's iterator position after stepping back over at most clusterLimit clusters
+DictionaryBreakEngine::scanBackClusters(UText *text, int32_t textStart, int32_t& start) const {
+    UChar32 c = 0;
+    start = utext_getNativeIndex(text); // begin at the iterator's current position; `text` itself is moved by this function
+    while (start > textStart) {         // step back over trailing characters in fSkipEndSet
+        c = utext_previous32(text);
+        --start;
+        if (!fSkipEndSet.contains(c))
+            break;
+    }
+    for (int i = 0; i < clusterLimit; ++i) { // scan backwards clusterLimit clusters
+        while (start > textStart) {
+            while (fIgnoreSet.contains(c))      // ignorable characters: iterator moves, `start` is not adjusted
+                c = utext_previous32(text);
+            if (!fMarkSet.contains(c)) {
+                if (fBaseSet.contains(c)) {
+                    c = utext_previous32(text); // peek at the character before the base
+                    if (!fViramaSet.contains(c)) { // no virama before the base, so the base starts this cluster
+                        utext_next32(text);     // undo the peek
+                        c = utext_current32(text);
+                        break;
+                    } else {                    // virama (e.g. coeng) precedes the base: treat the pair as a mark
+                        --start;
+                    }
+                } else {
+                    break;                      // neither mark nor base: stop the inner scan
+                }
+            }
+            c = utext_previous32(text);
+            --start;
+        }
+        if (!fBaseSet.contains(c) || start < textStart) {  // not a cluster start so finish
+            break;
+        }
+        c = utext_previous32(text);
+        --start;        // go round again
+    }                   // ignore hitting previous inhibitor since scanning for it should have found us!
+    ++start;            // counteract --before
+}
+
+void   // out: `end`, advanced from text's iterator position over at most clusterLimit clusters (bounded by textEnd)
+DictionaryBreakEngine::scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const {
+    UChar32 c = utext_current32(text);
+    end = utext_getNativeIndex(text);   // begin at the iterator's current position; `text` itself is moved by this function
+    while (end < textEnd) {             // step over leading characters in fSkipStartSet
+        if (!fSkipStartSet.contains(c))
+            break;
+        utext_next32(text);
+        c = utext_current32(text);
+        ++end;
+    }
+    for (int i = 0; i < clusterLimit; ++i) { // scan forwards clusterLimit clusters
+        while (fIgnoreSet.contains(c)) {    // ignorable characters: iterator moves, `end` is not incremented
+            utext_next32(text);
+            c = utext_current32(text);
+        }
+        if (fBaseSet.contains(c)) {         // a cluster begins with a base character
+            while (end < textEnd) {
+                utext_next32(text);
+                c = utext_current32(text);
+                ++end;
+                if (!fMarkSet.contains(c))  // first non-mark ends the cluster
+                    break;
+                else if (fViramaSet.contains(c)) {  // handle coeng + base as mark
+                    utext_next32(text);
+                    c = utext_current32(text);
+                    ++end;
+                    if (!fBaseSet.contains(c))      // virama not followed by a base: stop here
+                        break;
+                }
+            }
+        } else {
+            --end;    // bad char so break after char before it
+            break;
+        }
+    }
+}
+
+bool   // true if a ZWSP or WJ was found in [start, end); on failure start, before and after are all set to the scan position
+DictionaryBreakEngine::scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const {
+    UErrorCode status = U_ZERO_ERROR;   // NOTE(review): never checked after utext_clone
+    UText* ut = utext_clone(NULL, text, false, true, &status);   // iterate on a clone; `text` is not moved
+    int32_t nat = start;                // index being examined
+    utext_setNativeIndex(ut, nat);
+    bool foundFirst = true;             // true while no inhibitor has been seen yet
+    int32_t curr = start;               // value written back to `start` on success
+    while (nat < end) {
+        UChar32 c = utext_current32(ut);
+        if (c == ZWSP || c == WJ) {
+            curr = nat + 1;             // index just after the inhibitor
+            if (foundFirst)     // only scan backwards for first inhibitor
+                scanBackClusters(ut, start, before);
+            foundFirst = false; // don't scan backwards if we go around again. Also marks found something
+            // NOTE(review): scanBackClusters moves `ut` itself, so the forward scan below starts from wherever it stopped.
+            utext_next32(ut);
+            scanFwdClusters(ut, end, after);
+            nat = after + 1;
+            // NOTE(review): c has not been re-read since the test above, so this is always true and the else is unreachable.
+            if (c == ZWSP || c == WJ) {  // did we hit another one?
+                continue;
+            } else {
+                break;
+            }
+        }
+
+        ++nat;                  // keep hunting
+        utext_next32(ut);
+    }
+
+    utext_close(ut);
+
+    if (nat >= end && foundFirst) {
+        start = before = after = nat;
+        return false;           // failed to find anything
+    }
+    else {
+        start = curr;
+    }
+    return true;                // yup hit one
+}
+
 /*
  ******************************************************************
  * PossibleWord
@@ -130,35 +302,35 @@
 public:
     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
     ~PossibleWord() {};
-
+
     // Fill the list of candidates if needed, select the longest, and return the number found
-    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
-
+    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet = NULL, int32_t minLength = 0 );
+
     // Select the currently marked candidate, point after it in the text, and invalidate self
     int32_t   acceptMarked( UText *text );
-
+
     // Back up from the current candidate to the next shorter one; return TRUE if that exists
     // and point the text after it
     UBool     backUp( UText *text );
-
+
     // Return the longest prefix this candidate location shares with a dictionary word
     // Return value is in code points.
     int32_t   longestPrefix() { return prefix; };
-
+
     // Mark the current candidate as the one we like
     void      markCurrent() { mark = current; };
-
+
     // Get length in code points of the marked word.
     int32_t   markedCPLength() { return cpLengths[mark]; };
 };


-int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
+int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd, UnicodeSet const *ignoreSet, int32_t minLength) {
     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
     int32_t start = (int32_t)utext_getNativeIndex(text);
     if (start != offset) {
         offset = start;
-        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
+        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix, ignoreSet, minLength);
         // Dictionary leaves text after longest prefix, not longest word. Back up.
         if (count <= 0) {
             utext_setNativeIndex(text, start);
@@ -830,51 +1002,28 @@
  * KhmerBreakEngine
  */

-// How many words in a row are "good enough"?
-static const int32_t KHMER_LOOKAHEAD = 3;
-
-// Will not combine a non-word with a preceding dictionary word longer than this
-static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
-
-// Will not combine a non-word that shares at least this much prefix with a
-// dictionary word, with a preceding word
-static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
-
-// Minimum word size
-static const int32_t KHMER_MIN_WORD = 2;
-
-// Minimum number of characters for two words
-static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
-
 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
     : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
       fDictionary(adoptDictionary)
 {
-    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
+    // Same value as the DictionaryBreakEngine default; stated explicitly for this engine.
+    clusterLimit = 3;
+    // Characters handled by this engine: Khmer script plus WJ, ZWNJ and ZWJ.
+    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]\\u2060\\u200C\\u200D]"), status);
     if (U_SUCCESS(status)) {
         setCharacters(fKhmerWordSet);
     }
     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
-    fMarkSet.add(0x0020);
-    fEndWordSet = fKhmerWordSet;
-    fBeginWordSet.add(0x1780, 0x17B3);
-    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
-    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
-    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
-    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
-    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
-//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
-//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
-//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
-//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
-//    fSuffixSet.add(THAI_PAIYANNOI);
-//    fSuffixSet.add(THAI_MAIYAMOK);
+    fIgnoreSet.add(0x2060);         // WJ
+    fIgnoreSet.add(0x200C, 0x200D); // ZWNJ, ZWJ
+    fBaseSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:lb=SA:]&[:^M:]]"), status);   // Khmer non-mark characters
+    fPuncSet.applyPattern(UNICODE_STRING_SIMPLE("[\\u17D4\\u17D5\\u17D6\\u17D7\\u17D9]"), status);   // Khmer signs treated as punctuation

     // Compact for caching.
     fMarkSet.compact();
-    fEndWordSet.compact();
-    fBeginWordSet.compact();
-//    fSuffixSet.compact();
+    fIgnoreSet.compact();
+    fBaseSet.compact();
+    fPuncSet.compact();
 }

 KhmerBreakEngine::~KhmerBreakEngine() {
@@ -886,180 +1035,204 @@
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UStack &foundBreaks ) const {
-    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
-        return 0;       // Not enough characters for two words
-    }
-
-    uint32_t wordsFound = 0;
-    int32_t cpWordLength = 0;
-    int32_t cuWordLength = 0;
-    int32_t current;
+    uint32_t wordsFound = foundBreaks.size();
     UErrorCode status = U_ZERO_ERROR;
-    PossibleWord words[KHMER_LOOKAHEAD];
-
+    int32_t before = 0;
+    int32_t after = 0;
+    int32_t finalBefore = 0;
+    int32_t initAfter = 0;
+    int32_t scanStart = rangeStart;
+    int32_t scanEnd = rangeEnd;
+
+    bool startZwsp = false;
+    bool breakStart = false;
+    bool breakEnd = false;
+
+    if (rangeStart > 0) {
+        --scanStart;
+        startZwsp = scanBeforeStart(text, scanStart, breakStart);
+    }
     utext_setNativeIndex(text, rangeStart);
+    scanFwdClusters(text, rangeEnd, initAfter);
+    bool endZwsp = scanAfterEnd(text, utext_nativeLength(text), scanEnd, breakEnd);
+    utext_setNativeIndex(text, rangeEnd - 1);
+    scanBackClusters(text, rangeStart, finalBefore);
+    if (finalBefore < initAfter) {   // the whole run is tented so no breaks
+        if (breakStart || fTypes < UBRK_LINE)
+            foundBreaks.push(rangeStart, status);
+        if (breakEnd || fTypes < UBRK_LINE)
+            foundBreaks.push(rangeEnd, status);
+        return foundBreaks.size() - wordsFound;
+    }
+
+    scanStart = rangeStart;
+    scanWJ(text, scanStart, rangeEnd, before, after);
+    if (startZwsp || initAfter >= before) {
+        after = initAfter;
+        before = 0;
+    }
+    if (!endZwsp && after > finalBefore && after < rangeEnd)
+        endZwsp = true;
+    if (endZwsp && before > finalBefore)
+        before = finalBefore;

-    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
-        cuWordLength = 0;
-        cpWordLength = 0;
-
-        // Look for candidate words at the current position
-        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
-
-        // If we found exactly one, use that
-        if (candidates == 1) {
-            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
-            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
-            wordsFound += 1;
-        }
-
-        // If there was more than one, see which one can take us forward the most words
-        else if (candidates > 1) {
-            // If we're already at the end of the range, we're done
-            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
-                goto foundBest;
-            }
-            do {
-                int32_t wordsMatched = 1;
-                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
-                    if (wordsMatched < 2) {
-                        // Followed by another dictionary word; mark first word as a good candidate
-                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
-                        wordsMatched = 2;
-                    }
-
-                    // If we're already at the end of the range, we're done
-                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
-                        goto foundBest;
-                    }
-
-                    // See if any of the possible second words is followed by a third word
-                    do {
-                        // If we find a third word, stop right away
-                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
-                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
-                            goto foundBest;
-                        }
-                    }
-                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
-                }
+    utext_setNativeIndex(text, rangeStart);
+    int32_t numCodePts = rangeEnd - rangeStart;
+    // bestSnlp[i] is the snlp of the best segmentation of the first i
+    // code points in the range to be matched.
+    UVector32 bestSnlp(numCodePts + 1, status);
+    bestSnlp.addElement(0, status);
+    for(int32_t i = 1; i <= numCodePts; i++) {
+        bestSnlp.addElement(kuint32max, status);
+    }
+
+    // prev[i] is the index of the last code point in the previous word in
+    // the best segmentation of the first i characters. Note negative implies
+	// that the code point is part of an unknown word.
+    UVector32 prev(numCodePts + 1, status);
+    for(int32_t i = 0; i <= numCodePts; i++) {
+        prev.addElement(kuint32max, status);
+    }
+
+    const int32_t maxWordSize = 20;
+    UVector32 values(maxWordSize, status);
+    values.setSize(maxWordSize);
+    UVector32 lengths(maxWordSize, status);
+    lengths.setSize(maxWordSize);
+
+    // Dynamic programming to find the best segmentation.
+
+    // In outer loop, i  is the code point index,
+    //                ix is the corresponding string (code unit) index.
+    //    They differ when the string contains supplementary characters.
+    int32_t ix = rangeStart;
+    for (int32_t i = 0;  i < numCodePts;  ++i, utext_setNativeIndex(text, ++ix)) {
+        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
+            continue;
+        }
+
+        int32_t count;
+        count = fDictionary->matches(text, numCodePts - i, maxWordSize,
+                             NULL, lengths.getBuffer(), values.getBuffer(), NULL, &fIgnoreSet, 2);
+                             // Note: lengths is filled with code point lengths
+                             //       The NULL parameter is the ignored code unit lengths.
+
+        for (int32_t j = 0; j < count; j++) {
+            int32_t ln = lengths.elementAti(j);
+            if (ln + i >= numCodePts)
+                continue;
+            utext_setNativeIndex(text, ln+ix);
+            int32_t c = utext_current32(text);
+            if (fMarkSet.contains(c) || c == 0x17D2) { // Coeng
+                lengths.removeElementAt(j);
+                values.removeElementAt(j);
+                --j;
+                --count;
             }
-            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
-foundBest:
-            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
-            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
-            wordsFound += 1;
         }
-
-        // We come here after having either found a word or not. We look ahead to the
-        // next word. If it's not a dictionary word, we will combine it with the word we
-        // just found (if there is one), but only if the preceding word does not exceed
-        // the threshold.
-        // The text iterator should now be positioned at the end of the word we found.
-        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
-            // if it is a dictionary word, do nothing. If it isn't, then if there is
-            // no preceding word, or the non-word shares less than the minimum threshold
-            // of characters with a dictionary word, then scan to resynchronize
-            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
-                  && (cuWordLength == 0
-                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
-                // Look for a plausible word boundary
-                int32_t remaining = rangeEnd - (current+cuWordLength);
-                UChar32 pc;
-                UChar32 uc;
-                int32_t chars = 0;
-                for (;;) {
-                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
-                    pc = utext_next32(text);
-                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
-                    chars += pcSize;
-                    remaining -= pcSize;
-                    if (remaining <= 0) {
+        if (count == 0) {
+            utext_setNativeIndex(text, ix);
+            int32_t c = utext_current32(text);
+            if (fPuncSet.contains(c) || fIgnoreSet.contains(c) || c == ZWSP) {
+                values.setElementAt(0, count);
+                lengths.setElementAt(1, count++);
+            } else if (fBaseSet.contains(c)) {
+                int32_t currix = utext_getNativeIndex(text);
+                do {
+                    utext_next32(text);
+                    c = utext_current32(text);
+                    if (utext_getNativeIndex(text) >= rangeEnd)
                         break;
-                    }
-                    uc = utext_current32(text);
-                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
-                        // Maybe. See if it's in the dictionary.
-                        int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
-                        utext_setNativeIndex(text, current+cuWordLength+chars);
-                        if (candidates > 0) {
+                    if (c == 0x17D2) { // Coeng
+                        utext_next32(text);
+                        c = utext_current32(text);
+                        if (!fBaseSet.contains(c) || utext_getNativeIndex(text) >= rangeEnd) {
                             break;
+                        } else {
+                            utext_next32(text);
+                            c = utext_current32(text);
+                            if (utext_getNativeIndex(text) >= rangeEnd)
+                                break;
                         }
                     }
-                }
-
-                // Bump the word count if there wasn't already one
-                if (cuWordLength <= 0) {
-                    wordsFound += 1;
-                }
-
-                // Update the length with the passed-over characters
-                cuWordLength += chars;
-            }
-            else {
-                // Back up to where we were for next iteration
-                utext_setNativeIndex(text, current+cuWordLength);
+                } while (fMarkSet.contains(c) || fIgnoreSet.contains(c));
+                values.setElementAt(BADSNLP, count);
+                lengths.setElementAt(utext_getNativeIndex(text) - currix, count++);
+            } else {
+                values.setElementAt(BADSNLP, count);
+                lengths.setElementAt(1, count++);
             }
         }

-        // Never stop before a combining mark.
-        int32_t currPos;
-        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
-            utext_next32(text);
-            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
+        for (int32_t j = 0; j < count; j++) {
+            uint32_t v = values.elementAti(j);
+            int32_t newSnlp = bestSnlp.elementAti(i) + v;
+            int32_t ln = lengths.elementAti(j);
+            utext_setNativeIndex(text, ln+ix);
+            int32_t c = utext_current32(text);
+            while ((fPuncSet.contains(c) || fIgnoreSet.contains(c)) && ln + i < numCodePts) {
+                ++ln;
+                utext_next32(text);
+                c = utext_current32(text);
+            }
+            int32_t ln_j_i = ln + i;   // yes really i!
+            if (newSnlp < bestSnlp.elementAti(ln_j_i)) {
+                if (v == BADSNLP) {
+                    int32_t p = prev.elementAti(i);
+                    if (p < 0)
+                        prev.setElementAt(p, ln_j_i);
+                    else
+                        prev.setElementAt(-i, ln_j_i);
+                }
+                else
+                    prev.setElementAt(i, ln_j_i);
+                bestSnlp.setElementAt(newSnlp, ln_j_i);
+            }
         }
-
-        // Look ahead for possible suffixes if a dictionary word does not follow.
-        // We do this in code rather than using a rule so that the heuristic
-        // resynch continues to function. For example, one of the suffix characters
-        // could be a typo in the middle of a word.
-//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
-//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
-//                && fSuffixSet.contains(uc = utext_current32(text))) {
-//                if (uc == KHMER_PAIYANNOI) {
-//                    if (!fSuffixSet.contains(utext_previous32(text))) {
-//                        // Skip over previous end and PAIYANNOI
-//                        utext_next32(text);
-//                        utext_next32(text);
-//                        wordLength += 1;            // Add PAIYANNOI to word
-//                        uc = utext_current32(text);     // Fetch next character
-//                    }
-//                    else {
-//                        // Restore prior position
-//                        utext_next32(text);
-//                    }
-//                }
-//                if (uc == KHMER_MAIYAMOK) {
-//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
-//                        // Skip over previous end and MAIYAMOK
-//                        utext_next32(text);
-//                        utext_next32(text);
-//                        wordLength += 1;            // Add MAIYAMOK to word
-//                    }
-//                    else {
-//                        // Restore prior position
-//                        utext_next32(text);
-//                    }
-//                }
-//            }
-//            else {
-//                utext_setNativeIndex(text, current+wordLength);
-//            }
-//        }
-
-        // Did we find a word on this iteration? If so, push it on the break stack
-        if (cuWordLength > 0) {
-            foundBreaks.push((current+cuWordLength), status);
+    }
+    // Start pushing the optimal offset index into t_boundary (t for tentative).
+    // prev[numCodePts] is guaranteed to be meaningful.
+    // We'll first push in the reverse order, i.e.,
+    // t_boundary[0] = numCodePts, and afterwards do a swap.
+    UVector32 t_boundary(numCodePts+1, status);
+
+    int32_t numBreaks = 0;
+    // No segmentation found, set boundary to end of range
+    while (numCodePts >= 0 && (uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
+        --numCodePts;
+    }
+    if (numCodePts < 0) {
+        t_boundary.addElement(numCodePts, status);
+        numBreaks++;
+    } else {
+        for (int32_t i = numCodePts; (uint32_t)i != kuint32max; i = prev.elementAti(i)) {
+            if (i < 0) i = -i;
+            t_boundary.addElement(i, status);
+            numBreaks++;
+        }
+        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
+    }
+
+    // Now that we're done, convert positions in t_boundary[] (indices in
+    // the normalized input string) back to indices in the original input UText
+    // while reversing t_boundary and pushing values to foundBreaks.
+    for (int32_t i = numBreaks-1; i >= 0; i--) {
+        int32_t cpPos = t_boundary.elementAti(i);
+        if (cpPos == 0 && !breakStart && fTypes >= UBRK_LINE) continue;
+        int32_t utextPos = cpPos + rangeStart;
+        while (utextPos > after && scanWJ(text, utextPos, scanEnd, before, after));
+        if (utextPos < before) {
+        // Boundaries are added to foundBreaks output in ascending order.
+            U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos);
+            foundBreaks.push(utextPos, status);
         }
     }
-
+
     // Don't return a break for the end of the dictionary range if there is one there.
-    if (foundBreaks.peeki() >= rangeEnd) {
+    if (!breakEnd && fTypes >= UBRK_LINE && foundBreaks.peeki() >= rangeEnd) {
         (void) foundBreaks.popi();
-        wordsFound -= 1;
     }
-
-    return wordsFound;
+    return foundBreaks.size() - wordsFound;
 }

 #if !UCONFIG_NO_NORMALIZATION
diff -ur icu.org/source/common/dictbe.h icu/source/common/dictbe.h
--- icu.org/source/common/dictbe.h	2017-01-20 01:20:31.000000000 +0100
+++ icu/source/common/dictbe.h	2017-04-21 23:14:23.845894374 +0200
@@ -34,6 +34,15 @@
  */
 class DictionaryBreakEngine : public LanguageBreakEngine {
  private:
+
+  /**
+   * <p>Default constructor.</p>
+   *
+   */
+  DictionaryBreakEngine();
+
+ protected:
+
     /**
      * The set of characters handled by this engine
      * @internal
@@ -48,11 +57,63 @@

   uint32_t      fTypes;

+  const int32_t WJ   = 0x2060;
+  const int32_t ZWSP = 0x200B;
+
   /**
-   * <p>Default constructor.</p>
-   *
+   * A Unicode set of all viramas
+   * @internal
    */
-  DictionaryBreakEngine();
+  UnicodeSet    fViramaSet;
+
+  /**
+   * A Unicode set of all base characters
+   * @internal
+   */
+  UnicodeSet    fBaseSet;
+
+  /**
+   * A Unicode set of all marks
+   * @internal
+   */
+  UnicodeSet    fMarkSet;
+
+  /**
+   * A Unicode set of all characters ignored in dictionary matching
+   * @internal
+   */
+  UnicodeSet    fIgnoreSet;
+
+  /**
+   * A Unicode set of characters skipped when scanning at the start of a run (includes the ignored characters)
+   * @internal
+   */
+  UnicodeSet    fSkipStartSet;
+
+  /**
+   * A Unicode set of characters skipped when scanning at the end of a run (includes the ignored characters)
+   * @internal
+   */
+  UnicodeSet    fSkipEndSet;
+
+  /**
+   * A Unicode set of all characters that should not be broken before
+   * @internal
+   */
+  UnicodeSet    fNBeforeSet;
+
+  /**
+   * The number of clusters within which breaks are inhibited
+   * @internal
+   */
+  int32_t clusterLimit;
+
+  bool scanWJ(UText *text, int32_t &start, int32_t end, int32_t &before, int32_t &after) const;
+
+  bool scanBeforeStart(UText *text, int32_t& start, bool &doBreak) const;
+  bool scanAfterEnd(UText *text, int32_t rangeEnd, int32_t& end, bool &doBreak) const;
+  void scanBackClusters(UText *text, int32_t textStart, int32_t& start) const;
+  void scanFwdClusters(UText *text, int32_t textEnd, int32_t& end) const;

  public:

@@ -83,7 +144,7 @@
    * <p>Find any breaks within a run in the supplied text.</p>
    *
    * @param text A UText representing the text. The iterator is left at
-   * the end of the run of characters which the engine is capable of handling
+   * the end of the run of characters which the engine is capable of handling
    * that starts from the first (or last) character in the range.
    * @param startPos The start of the run within the supplied text.
    * @param endPos The end of the run within the supplied text.
@@ -245,118 +306,120 @@

 };

-/*******************************************************************
- * BurmeseBreakEngine
- */
-
-/**
- * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
- * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
- *
- * <p>After it is constructed a BurmeseBreakEngine may be shared between
- * threads without synchronization.</p>
- */
-class BurmeseBreakEngine : public DictionaryBreakEngine {
- private:
-    /**
-     * The set of characters handled by this engine
-     * @internal
-     */
-
-  UnicodeSet                fBurmeseWordSet;
-  UnicodeSet                fEndWordSet;
-  UnicodeSet                fBeginWordSet;
-  UnicodeSet                fMarkSet;
-  DictionaryMatcher  *fDictionary;
-
- public:
-
-  /**
-   * <p>Default constructor.</p>
-   *
-   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
-   * engine is deleted.
-   */
-  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
-
-  /**
-   * <p>Virtual destructor.</p>
-   */
-  virtual ~BurmeseBreakEngine();
-
- protected:
- /**
-  * <p>Divide up a range of known dictionary characters.</p>
-  *
-  * @param text A UText representing the text
-  * @param rangeStart The start of the range of dictionary characters
-  * @param rangeEnd The end of the range of dictionary characters
-  * @param foundBreaks Output of C array of int32_t break positions, or 0
-  * @return The number of breaks found
-  */
-  virtual int32_t divideUpDictionaryRange( UText *text,
-                                           int32_t rangeStart,
-                                           int32_t rangeEnd,
-                                           UStack &foundBreaks ) const;
-
-};
-
-/*******************************************************************
- * KhmerBreakEngine
- */
-
-/**
- * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
- * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
- *
- * <p>After it is constructed a KhmerBreakEngine may be shared between
- * threads without synchronization.</p>
- */
-class KhmerBreakEngine : public DictionaryBreakEngine {
- private:
-    /**
-     * The set of characters handled by this engine
-     * @internal
-     */
-
-  UnicodeSet                fKhmerWordSet;
-  UnicodeSet                fEndWordSet;
-  UnicodeSet                fBeginWordSet;
-  UnicodeSet                fMarkSet;
-  DictionaryMatcher  *fDictionary;
-
- public:
-
-  /**
-   * <p>Default constructor.</p>
-   *
-   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
-   * engine is deleted.
-   */
-  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
-
-  /**
-   * <p>Virtual destructor.</p>
-   */
-  virtual ~KhmerBreakEngine();
-
- protected:
- /**
-  * <p>Divide up a range of known dictionary characters.</p>
-  *
-  * @param text A UText representing the text
-  * @param rangeStart The start of the range of dictionary characters
-  * @param rangeEnd The end of the range of dictionary characters
-  * @param foundBreaks Output of C array of int32_t break positions, or 0
-  * @return The number of breaks found
-  */
-  virtual int32_t divideUpDictionaryRange( UText *text,
-                                           int32_t rangeStart,
-                                           int32_t rangeEnd,
-                                           UStack &foundBreaks ) const;
-
-};
-
+/*******************************************************************
+ * BurmeseBreakEngine
+ */
+
+/**
+ * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
+ *
+ * <p>After it is constructed a BurmeseBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class BurmeseBreakEngine : public DictionaryBreakEngine {
+ private:
+    /**
+     * The set of characters handled by this engine
+     * @internal
+     */
+
+  UnicodeSet                fBurmeseWordSet;
+  UnicodeSet                fEndWordSet;
+  UnicodeSet                fBeginWordSet;
+  UnicodeSet                fMarkSet;
+  DictionaryMatcher  *fDictionary;
+
+ public:
+
+  /**
+   * <p>Default constructor.</p>
+   *
+   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+   * engine is deleted.
+   */
+  BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+  /**
+   * <p>Virtual destructor.</p>
+   */
+  virtual ~BurmeseBreakEngine();
+
+ protected:
+ /**
+  * <p>Divide up a range of known dictionary characters.</p>
+  *
+  * @param text A UText representing the text
+  * @param rangeStart The start of the range of dictionary characters
+  * @param rangeEnd The end of the range of dictionary characters
+  * @param foundBreaks Output of C array of int32_t break positions, or 0
+  * @return The number of breaks found
+  */
+  virtual int32_t divideUpDictionaryRange( UText *text,
+                                           int32_t rangeStart,
+                                           int32_t rangeEnd,
+                                           UStack &foundBreaks ) const;
+
+};
+
+/*******************************************************************
+ * KhmerBreakEngine
+ */
+
+/**
+ * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
+ * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
+ *
+ * <p>After it is constructed a KhmerBreakEngine may be shared between
+ * threads without synchronization.</p>
+ */
+class KhmerBreakEngine : public DictionaryBreakEngine {
+ private:
+    /**
+     * The set of characters handled by this engine
+     * @internal
+     */
+
+  UnicodeSet                fKhmerWordSet;
+  UnicodeSet                fBeginWordSet;
+  UnicodeSet                fPuncSet;
+  DictionaryMatcher        *fDictionary;
+
+  const uint32_t BADSNLP = 256 * 20;
+  const uint32_t kuint32max = 0x7FFFFFFF;
+
+ public:
+
+  /**
+   * <p>Default constructor.</p>
+   *
+   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
+   * engine is deleted.
+   */
+  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
+
+  /**
+   * <p>Virtual destructor.</p>
+   */
+  virtual ~KhmerBreakEngine();
+
+ protected:
+ /**
+  * <p>Divide up a range of known dictionary characters.</p>
+  *
+  * @param text A UText representing the text
+  * @param rangeStart The start of the range of dictionary characters
+  * @param rangeEnd The end of the range of dictionary characters
+  * @param foundBreaks Output of C array of int32_t break positions, or 0
+  * @return The number of breaks found
+  */
+  virtual int32_t divideUpDictionaryRange( UText *text,
+                                           int32_t rangeStart,
+                                           int32_t rangeEnd,
+                                           UStack &foundBreaks ) const;
+
+};
+
 #if !UCONFIG_NO_NORMALIZATION

 /*******************************************************************
diff -ur icu.org/source/common/dictionarydata.cpp icu/source/common/dictionarydata.cpp
--- icu.org/source/common/dictionarydata.cpp	2017-01-20 01:20:31.000000000 +0100
+++ icu/source/common/dictionarydata.cpp	2017-04-21 23:14:23.846894372 +0200
@@ -44,7 +44,7 @@

 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
-                            int32_t *prefix) const {
+                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {

     UCharsTrie uct(characters);
     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
@@ -55,7 +55,13 @@
         UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
         codePointsMatched += 1;
+        if (ignoreSet != NULL && ignoreSet->contains(c)) {
+            continue;
+        }
         if (USTRINGTRIE_HAS_VALUE(result)) {
+            if (codePointsMatched < minLength) {
+                continue;
+            }
             if (wordCount < limit) {
                 if (values != NULL) {
                     values[wordCount] = uct.getValue();
@@ -112,7 +118,7 @@

 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
-                            int32_t *prefix) const {
+                            int32_t *prefix, UnicodeSet const* ignoreSet, int32_t minLength) const {
     BytesTrie bt(characters);
     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
     int32_t wordCount = 0;
@@ -122,7 +128,13 @@
         UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
         codePointsMatched += 1;
+        if (ignoreSet != NULL && ignoreSet->contains(c)) {
+            continue;
+        }
         if (USTRINGTRIE_HAS_VALUE(result)) {
+            if (codePointsMatched < minLength) {
+                continue;
+            }
             if (wordCount < limit) {
                 if (values != NULL) {
                     values[wordCount] = bt.getValue();
diff -ur icu.org/source/common/dictionarydata.h icu/source/common/dictionarydata.h
--- icu.org/source/common/dictionarydata.h	2017-01-20 01:20:31.000000000 +0100
+++ icu/source/common/dictionarydata.h	2017-04-21 23:14:23.846894372 +0200
@@ -21,6 +21,7 @@
 #include "unicode/utext.h"
 #include "unicode/udata.h"
 #include "udataswp.h"
+#include "unicode/uniset.h"
 #include "unicode/uobject.h"
 #include "unicode/ustringtrie.h"

@@ -92,7 +93,7 @@
      */
     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
-                            int32_t *prefix) const = 0;
+                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const = 0;

     /** @return DictionaryData::TRIE_TYPE_XYZ */
     virtual int32_t getType() const = 0;
@@ -107,7 +108,7 @@
     virtual ~UCharsDictionaryMatcher();
     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
-                            int32_t *prefix) const;
+                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const;
     virtual int32_t getType() const;
 private:
     const UChar *characters;
@@ -125,7 +126,7 @@
     virtual ~BytesDictionaryMatcher();
     virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
-                            int32_t *prefix) const;
+                            int32_t *prefix, UnicodeSet const* ignoreSet = NULL, int32_t minLength = 0) const;
     virtual int32_t getType() const;
 private:
     UChar32 transform(UChar32 c) const;
diff -ur icu.org/source/data/Makefile.in icu/source/data/Makefile.in
--- icu.org/source/data/Makefile.in	2017-04-21 23:13:03.248087545 +0200
+++ icu/source/data/Makefile.in	2017-04-21 23:14:23.846894372 +0200
@@ -183,7 +183,7 @@
 endif
 endif

-packagedata: icupkg.inc $(PKGDATA_LIST) build-local
+packagedata: icupkg.inc $(PKGDATA_LIST) build-local $(MAINBUILDDIR)/khmerdict.stamp
 ifneq ($(ENABLE_STATIC),)
 ifeq ($(PKGDATA_MODE),dll)
 	$(PKGDATA_INVOKE) $(PKGDATA) -e $(ICUDATA_ENTRY_POINT) -T $(OUTTMPDIR) -p $(ICUDATA_NAME) $(PKGDATA_LIBSTATICNAME) -m static $(PKGDATA_VERSIONING) $(PKGDATA_LIST)
@@ -567,8 +567,14 @@
 	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1000 -c -i $(BUILDDIR) $(DICTSRCDIR)/burmesedict.txt $(BRKBLDDIR)/burmesedict.dict

 # TODO: figure out why combining characters are here?
-$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
-	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
+#$(BRKBLDDIR)/khmerdict.dict: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(DAT_FILES)
+#	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
+
+#$(MAINBUILDDIR)/khmerdict.stamp: $(TOOLBINDIR)/gendict$(TOOLEXEEXT) $(BRKSRCDIR)/khmerdict.txt build-local
+# 	$(INVOKE) $(TOOLBINDIR)/gendict --bytes --transform offset-0x1780 -c -i $(BUILDDIR) $(DICTSRCDIR)/khmerdict.txt $(BRKBLDDIR)/khmerdict.dict
+$(MAINBUILDDIR)/khmerdict.stamp: $(BRKSRCDIR)/khmerdict.dict build-local
+	cp $< $(BRKBLDDIR)
+	echo "timestamp" > $@

 ####################################################    CFU
 # CFU FILES