3a02490e1a
Similar to the straight (typewriter or ASCII) apostrophe, straight
double quotation mark (") matches its typographic variants now,
like other word processors do.
Note: regex search doesn't use this matching, similar to the apostrophe
search.
Follow-up to commit d40f2d02df
"tdf#117643 Writer: fix apostrophe search regression".
Change-Id: If6a3ee00750828583cd0cfc4aa7f7b656ea9bd1e
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/169605
Reviewed-by: László Németh <nemeth@numbertext.org>
Tested-by: Jenkins
1591 lines
58 KiB
C++
1591 lines
58 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* This file incorporates work covered by the following license notice:
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
*/
|
|
|
|
#include "textsearch.hxx"
|
|
#include "levdis.hxx"
|
|
#include <com/sun/star/i18n/BreakIterator.hpp>
|
|
#include <com/sun/star/util/SearchAlgorithms2.hpp>
|
|
#include <com/sun/star/util/SearchFlags.hpp>
|
|
#include <com/sun/star/i18n/WordType.hpp>
|
|
#include <com/sun/star/i18n/ScriptType.hpp>
|
|
#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
|
|
#include <com/sun/star/i18n/CharacterClassification.hpp>
|
|
#include <com/sun/star/i18n/KCharacterType.hpp>
|
|
#include <com/sun/star/i18n/Transliteration.hpp>
|
|
#include <cppuhelper/supportsservice.hxx>
|
|
#include <cppuhelper/weak.hxx>
|
|
#include <i18nutil/transliteration.hxx>
|
|
#include <rtl/ustrbuf.hxx>
|
|
#include <sal/log.hxx>
|
|
|
|
#include <unicode/regex.h>
|
|
|
|
using namespace ::com::sun::star::util;
|
|
using namespace ::com::sun::star::uno;
|
|
using namespace ::com::sun::star::lang;
|
|
using namespace ::com::sun::star::i18n;
|
|
using namespace ::com::sun::star;
|
|
|
|
const TransliterationFlags COMPLEX_TRANS_MASK =
|
|
TransliterationFlags::ignoreBaFa_ja_JP |
|
|
TransliterationFlags::ignoreIterationMark_ja_JP |
|
|
TransliterationFlags::ignoreTiJi_ja_JP |
|
|
TransliterationFlags::ignoreHyuByu_ja_JP |
|
|
TransliterationFlags::ignoreSeZe_ja_JP |
|
|
TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
|
|
TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
|
|
TransliterationFlags::ignoreProlongedSoundMark_ja_JP;
|
|
|
|
namespace
|
|
{
|
|
TransliterationFlags maskComplexTrans( TransliterationFlags n )
|
|
{
|
|
// IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
|
|
// in complex transliteration.
|
|
return
|
|
n & (COMPLEX_TRANS_MASK | // all set ignore bits
|
|
TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit
|
|
TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value
|
|
}
|
|
|
|
bool isComplexTrans( TransliterationFlags n )
|
|
{
|
|
return bool(n & COMPLEX_TRANS_MASK);
|
|
}
|
|
|
|
TransliterationFlags maskSimpleTrans( TransliterationFlags n )
|
|
{
|
|
return n & ~COMPLEX_TRANS_MASK;
|
|
}
|
|
|
|
bool isSimpleTrans( TransliterationFlags n )
|
|
{
|
|
return bool(maskSimpleTrans(n));
|
|
}
|
|
|
|
// Regex patterns are case sensitive.
|
|
TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
|
|
{
|
|
TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
|
|
TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
|
|
if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
|
|
v = TransliterationFlags::NONE;
|
|
return (m | v) & ~COMPLEX_TRANS_MASK;
|
|
}
|
|
|
|
bool isSimpleRegexTrans( TransliterationFlags n )
|
|
{
|
|
return bool(maskSimpleRegexTrans(n));
|
|
}
|
|
|
|
bool isReplacePunctuation( OUString &rStr )
|
|
{
|
|
return rStr.indexOf(u'\u2018') > -1 ||
|
|
rStr.indexOf(u'\u2019') > -1 ||
|
|
rStr.indexOf(u'\u201A') > -1 ||
|
|
rStr.indexOf(u'\u201B') > -1 ||
|
|
rStr.indexOf(u'\u201C') > -1 ||
|
|
rStr.indexOf(u'\u201D') > -1 ||
|
|
rStr.indexOf(u'\u201E') > -1 ||
|
|
rStr.indexOf(u'\u201F') > -1;
|
|
}
|
|
|
|
OUString replacePunctuation( OUString &rStr )
|
|
{
|
|
return rStr.replace(u'\u2018', '\'')
|
|
.replace(u'\u2019', '\'')
|
|
.replace(u'\u201A', '\'')
|
|
.replace(u'\u201B', '\'')
|
|
.replace(u'\u201C', '"')
|
|
.replace(u'\u201D', '"')
|
|
.replace(u'\u201E', '"')
|
|
.replace(u'\u201F', '"');
|
|
}
|
|
};
|
|
|
|
TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
|
|
: m_xContext( rxContext )
|
|
{
|
|
SearchOptions2 aOpt;
|
|
aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
|
|
aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
|
|
aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
|
|
//aOpt.Locale = ???;
|
|
setOptions( aOpt );
|
|
}
|
|
|
|
TextSearch::~TextSearch()
|
|
{
|
|
pRegexMatcher.reset();
|
|
pWLD.reset();
|
|
pJumpTable.reset();
|
|
pJumpTable2.reset();
|
|
}
|
|
|
|
void TextSearch::setOptions2( const SearchOptions2& rOptions )
|
|
{
|
|
std::unique_lock g(m_aMutex);
|
|
|
|
aSrchPara = rOptions;
|
|
|
|
pRegexMatcher.reset();
|
|
pWLD.reset();
|
|
pJumpTable.reset();
|
|
pJumpTable2.reset();
|
|
maWildcardReversePattern.clear();
|
|
maWildcardReversePattern2.clear();
|
|
TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
|
|
bSearchApostrophe = false;
|
|
bool bReplaceApostrophe = false;
|
|
if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
|
|
{
|
|
// RESrchPrepare will consider aSrchPara.transliterateFlags when
|
|
// picking the actual regex pattern
|
|
// (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
|
|
// case-insensitivity. Create transliteration instance, if any, without
|
|
// ignore-case so later in TextSearch::searchForward() the string to
|
|
// match is not case-altered, leave case-(in)sensitive to regex engine.
|
|
transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
|
|
}
|
|
else if ( aSrchPara.searchString.indexOf('\'') > - 1 || aSrchPara.searchString.indexOf('"') > - 1 )
|
|
{
|
|
bSearchApostrophe = true;
|
|
bReplaceApostrophe = isReplacePunctuation(aSrchPara.searchString);
|
|
}
|
|
|
|
// Create Transliteration class
|
|
if( isSimpleTrans( transliterateFlags) )
|
|
{
|
|
if( !xTranslit.is() )
|
|
xTranslit.set( Transliteration::create( m_xContext ) );
|
|
xTranslit->loadModule(
|
|
static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
|
|
aSrchPara.Locale);
|
|
}
|
|
else if( xTranslit.is() )
|
|
xTranslit = nullptr;
|
|
|
|
// Create Transliteration for 2<->1, 2<->2 transliteration
|
|
if ( isComplexTrans( transliterateFlags) )
|
|
{
|
|
if( !xTranslit2.is() )
|
|
xTranslit2.set( Transliteration::create( m_xContext ) );
|
|
// Load transliteration module
|
|
xTranslit2->loadModule(
|
|
static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
|
|
aSrchPara.Locale);
|
|
}
|
|
|
|
if ( !xBreak.is() )
|
|
xBreak = css::i18n::BreakIterator::create( m_xContext );
|
|
|
|
sSrchStr = aSrchPara.searchString;
|
|
|
|
// Transliterate search string.
|
|
if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
|
|
{
|
|
if (isSimpleRegexTrans(transliterateFlags))
|
|
{
|
|
if (maskSimpleRegexTrans(transliterateFlags) !=
|
|
maskSimpleTrans(transliterateFlags))
|
|
{
|
|
css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
|
|
Transliteration::create( m_xContext ));
|
|
if (xTranslitPattern.is())
|
|
{
|
|
xTranslitPattern->loadModule(
|
|
static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
|
|
aSrchPara.Locale);
|
|
sSrchStr = xTranslitPattern->transliterateString2String(
|
|
aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (xTranslit.is())
|
|
sSrchStr = xTranslit->transliterateString2String(
|
|
aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
|
|
}
|
|
// xTranslit2 complex transliterated sSrchStr2 is not used in
|
|
// regex, see TextSearch::searchForward() and
|
|
// TextSearch::searchBackward()
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
|
|
sSrchStr = xTranslit->transliterateString2String(
|
|
aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
|
|
|
|
if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
|
|
sSrchStr2 = xTranslit2->transliterateString2String(
|
|
aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
|
|
}
|
|
|
|
if ( bReplaceApostrophe )
|
|
sSrchStr = replacePunctuation(sSrchStr);
|
|
|
|
// Take the new SearchOptions2::AlgorithmType2 field and ignore
|
|
// SearchOptions::algorithmType
|
|
switch( aSrchPara.AlgorithmType2)
|
|
{
|
|
case SearchAlgorithms2::REGEXP:
|
|
fnForward = &TextSearch::RESrchFrwrd;
|
|
fnBackward = &TextSearch::RESrchBkwrd;
|
|
RESrchPrepare( aSrchPara);
|
|
break;
|
|
|
|
case SearchAlgorithms2::APPROXIMATE:
|
|
fnForward = &TextSearch::ApproxSrchFrwrd;
|
|
fnBackward = &TextSearch::ApproxSrchBkwrd;
|
|
|
|
pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
|
|
aSrchPara.insertedChars, aSrchPara.deletedChars,
|
|
0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );
|
|
|
|
nLimit = pWLD->GetLimit();
|
|
break;
|
|
|
|
case SearchAlgorithms2::WILDCARD:
|
|
mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
|
|
mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
|
|
fnForward = &TextSearch::WildcardSrchFrwrd;
|
|
fnBackward = &TextSearch::WildcardSrchBkwrd;
|
|
break;
|
|
|
|
default:
|
|
SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
|
|
[[fallthrough]];
|
|
case SearchAlgorithms2::ABSOLUTE:
|
|
fnForward = &TextSearch::NSrchFrwrd;
|
|
fnBackward = &TextSearch::NSrchBkwrd;
|
|
break;
|
|
}
|
|
}
|
|
|
|
void TextSearch::setOptions( const SearchOptions& rOptions )
|
|
{
|
|
sal_Int16 nAlgorithmType2;
|
|
switch (rOptions.algorithmType)
|
|
{
|
|
case SearchAlgorithms_REGEXP:
|
|
nAlgorithmType2 = SearchAlgorithms2::REGEXP;
|
|
break;
|
|
case SearchAlgorithms_APPROXIMATE:
|
|
nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
|
|
break;
|
|
default:
|
|
SAL_WARN("i18npool","TextSearch::setOptions - default what?");
|
|
[[fallthrough]];
|
|
case SearchAlgorithms_ABSOLUTE:
|
|
nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
|
|
break;
|
|
}
|
|
// It would be nice if an inherited struct had a ctor that takes an
|
|
// instance of the object the struct derived from...
|
|
SearchOptions2 aOptions2(
|
|
rOptions.algorithmType,
|
|
rOptions.searchFlag,
|
|
rOptions.searchString,
|
|
rOptions.replaceString,
|
|
rOptions.Locale,
|
|
rOptions.changedChars,
|
|
rOptions.deletedChars,
|
|
rOptions.insertedChars,
|
|
rOptions.transliterateFlags,
|
|
nAlgorithmType2,
|
|
0 // no wildcard search, no escape character...
|
|
);
|
|
setOptions2( aOptions2);
|
|
}
|
|
|
|
static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
|
|
{
|
|
auto pOff = std::find_if(rOff.begin(), rOff.end(),
|
|
[nPos](const sal_Int32 nOff) { return nOff >= nPos; });
|
|
return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
|
|
}
|
|
|
|
SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
|
|
{
|
|
std::unique_lock g(m_aMutex);
|
|
|
|
SearchResult sres;
|
|
|
|
OUString in_str(searchStr);
|
|
|
|
// in non-regex mode, allow searching typographical apostrophe with the ASCII one
|
|
// to avoid regression after using automatic conversion to U+2019 during typing in Writer
|
|
bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);
|
|
|
|
bUsePrimarySrchStr = true;
|
|
|
|
if ( xTranslit.is() )
|
|
{
|
|
// apply normal transliteration (1<->1, 1<->0)
|
|
|
|
sal_Int32 nInStartPos = startPos;
|
|
if (pRegexMatcher && startPos > 0)
|
|
{
|
|
// tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
|
|
// transliterate enough of the leading text to allow sensible look-behind assertions.
|
|
// 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
|
|
// See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
|
|
// When search regex doesn't start with an assertion, 3 is to allow startPos to be in
|
|
// the middle of a surrogate pair, preceded by another surrogate pair.
|
|
const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
|
|
nInStartPos -= std::min(nMaxLeadingLen, startPos);
|
|
}
|
|
sal_Int32 nInEndPos = endPos;
|
|
if (pRegexMatcher && endPos < searchStr.getLength())
|
|
{
|
|
// tdf#65038: ditto for look-ahead assertions
|
|
const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
|
|
nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
|
|
}
|
|
|
|
css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
|
|
in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);
|
|
|
|
if ( bReplaceApostrophe )
|
|
in_str = replacePunctuation(in_str);
|
|
|
|
// JP 20.6.2001: also the start and end positions must be corrected!
|
|
sal_Int32 newStartPos =
|
|
(startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );
|
|
|
|
sal_Int32 newEndPos = (endPos < searchStr.getLength())
|
|
? FindPosInSeq_Impl( offset, endPos )
|
|
: in_str.getLength();
|
|
|
|
sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
|
|
|
|
// Map offsets back to untransliterated string.
|
|
const sal_Int32 nOffsets = offset.getLength();
|
|
if (nOffsets)
|
|
{
|
|
auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
|
|
auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
|
|
// For regex nGroups is the number of groups+1 with group 0 being
|
|
// the entire match.
|
|
const sal_Int32 nGroups = sres.startOffset.getLength();
|
|
for ( sal_Int32 k = 0; k < nGroups; k++ )
|
|
{
|
|
const sal_Int32 nStart = sres.startOffset[k];
|
|
// Result offsets are negative (-1) if a group expression was
|
|
// not matched.
|
|
if (nStart >= 0)
|
|
sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
|
|
// JP 20.6.2001: end is ever exclusive and then don't return
|
|
// the position of the next character - return the
|
|
// next position behind the last found character!
|
|
// "a b c" find "b" must return 2,3 and not 2,4!!!
|
|
const sal_Int32 nStop = sres.endOffset[k];
|
|
if (nStop >= 0)
|
|
{
|
|
if (nStop > 0)
|
|
sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
|
|
else
|
|
sres_endOffsetRange[k] = offset[0];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ( bReplaceApostrophe )
|
|
in_str = in_str.replace(u'\u2019', '\'');
|
|
|
|
sres = (this->*fnForward)( in_str, startPos, endPos );
|
|
}
|
|
|
|
if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
|
|
{
|
|
SearchResult sres2;
|
|
|
|
in_str = searchStr;
|
|
css::uno::Sequence <sal_Int32> offset( in_str.getLength());
|
|
|
|
in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
|
|
|
|
if( startPos )
|
|
startPos = FindPosInSeq_Impl( offset, startPos );
|
|
|
|
if( endPos < searchStr.getLength() )
|
|
endPos = FindPosInSeq_Impl( offset, endPos );
|
|
else
|
|
endPos = in_str.getLength();
|
|
|
|
bUsePrimarySrchStr = false;
|
|
sres2 = (this->*fnForward)( in_str, startPos, endPos );
|
|
auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
|
|
auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
|
|
|
|
for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
|
|
{
|
|
if (sres2.startOffset[k])
|
|
sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;
|
|
if (sres2.endOffset[k])
|
|
sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
|
|
}
|
|
|
|
// pick first and long one
|
|
if ( sres.subRegExpressions == 0)
|
|
return sres2;
|
|
if ( sres2.subRegExpressions == 1)
|
|
{
|
|
if ( sres.startOffset[0] > sres2.startOffset[0])
|
|
return sres2;
|
|
else if ( sres.startOffset[0] == sres2.startOffset[0] &&
|
|
sres.endOffset[0] < sres2.endOffset[0])
|
|
return sres2;
|
|
}
|
|
}
|
|
|
|
return sres;
|
|
}
|
|
|
|
SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
|
|
{
|
|
std::unique_lock g(m_aMutex);
|
|
|
|
SearchResult sres;
|
|
|
|
OUString in_str(searchStr);
|
|
|
|
// in non-regex mode, allow searching typographical apostrophe with the ASCII one
|
|
// to avoid regression after using automatic conversion to U+2019 during typing in Writer
|
|
bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);
|
|
|
|
bUsePrimarySrchStr = true;
|
|
|
|
if ( xTranslit.is() )
|
|
{
|
|
// apply only simple 1<->1 transliteration here
|
|
css::uno::Sequence<sal_Int32> offset(startPos - endPos);
|
|
in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );
|
|
|
|
if ( bReplaceApostrophe )
|
|
in_str = replacePunctuation(in_str);
|
|
|
|
// JP 20.6.2001: also the start and end positions must be corrected!
|
|
sal_Int32 const newStartPos = (startPos < searchStr.getLength())
|
|
? FindPosInSeq_Impl( offset, startPos )
|
|
: in_str.getLength();
|
|
|
|
sal_Int32 const newEndPos =
|
|
(endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );
|
|
|
|
// TODO: this would need nExtraOffset handling to avoid $ matching
|
|
// if (pRegexMatcher && startPos < searchStr.getLength())
|
|
// but that appears to be impossible with ICU regex
|
|
|
|
sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
|
|
|
|
// Map offsets back to untransliterated string.
|
|
const sal_Int32 nOffsets = offset.getLength();
|
|
if (nOffsets)
|
|
{
|
|
auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
|
|
auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
|
|
// For regex nGroups is the number of groups+1 with group 0 being
|
|
// the entire match.
|
|
const sal_Int32 nGroups = sres.startOffset.getLength();
|
|
for ( sal_Int32 k = 0; k < nGroups; k++ )
|
|
{
|
|
const sal_Int32 nStart = sres.startOffset[k];
|
|
// Result offsets are negative (-1) if a group expression was
|
|
// not matched.
|
|
if (nStart >= 0)
|
|
{
|
|
if (nStart > 0)
|
|
sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
|
|
else
|
|
sres_startOffsetRange[k] = offset[0];
|
|
}
|
|
// JP 20.6.2001: end is ever exclusive and then don't return
|
|
// the position of the next character - return the
|
|
// next position behind the last found character!
|
|
// "a b c" find "b" must return 2,3 and not 2,4!!!
|
|
const sal_Int32 nStop = sres.endOffset[k];
|
|
if (nStop >= 0)
|
|
sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if ( bReplaceApostrophe )
|
|
in_str = replacePunctuation(in_str);
|
|
|
|
sres = (this->*fnBackward)( in_str, startPos, endPos );
|
|
}
|
|
|
|
if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
|
|
{
|
|
SearchResult sres2;
|
|
|
|
in_str = searchStr;
|
|
css::uno::Sequence <sal_Int32> offset( in_str.getLength());
|
|
|
|
in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
|
|
|
|
if( startPos < searchStr.getLength() )
|
|
startPos = FindPosInSeq_Impl( offset, startPos );
|
|
else
|
|
startPos = in_str.getLength();
|
|
|
|
if( endPos )
|
|
endPos = FindPosInSeq_Impl( offset, endPos );
|
|
|
|
bUsePrimarySrchStr = false;
|
|
sres2 = (this->*fnBackward)( in_str, startPos, endPos );
|
|
auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
|
|
auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);
|
|
|
|
for( int k = 0; k < sres2.startOffset.getLength(); k++ )
|
|
{
|
|
if (sres2.startOffset[k])
|
|
sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;
|
|
if (sres2.endOffset[k])
|
|
sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
|
|
}
|
|
|
|
// pick last and long one
|
|
if ( sres.subRegExpressions == 0 )
|
|
return sres2;
|
|
if ( sres2.subRegExpressions == 1 )
|
|
{
|
|
if ( sres.startOffset[0] < sres2.startOffset[0] )
|
|
return sres2;
|
|
if ( sres.startOffset[0] == sres2.startOffset[0] &&
|
|
sres.endOffset[0] > sres2.endOffset[0] )
|
|
return sres2;
|
|
}
|
|
}
|
|
|
|
return sres;
|
|
}
|
|
|
|
|
|
bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
|
|
{
|
|
bool bRet = true;
|
|
if( '\x7f' != rStr[nPos])
|
|
{
|
|
if ( !xCharClass.is() )
|
|
xCharClass = CharacterClassification::create( m_xContext );
|
|
sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
|
|
aSrchPara.Locale );
|
|
if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
|
|
KCharacterType::LETTER ) & nCType ) )
|
|
bRet = false;
|
|
}
|
|
return bRet;
|
|
}
|
|
|
|
// --------- helper methods for Boyer-Moore like text searching ----------
|
|
// TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
|
|
|
|
void TextSearch::MakeForwardTab()
|
|
{
|
|
// create the jumptable for the search text
|
|
|
|
if( pJumpTable && bIsForwardTab )
|
|
{
|
|
return; // the jumpTable is ok
|
|
}
|
|
bIsForwardTab = true;
|
|
|
|
sal_Int32 n, nLen = sSrchStr.getLength();
|
|
pJumpTable.reset( new TextSearchJumpTable );
|
|
|
|
for( n = 0; n < nLen - 1; ++n )
|
|
{
|
|
sal_Unicode cCh = sSrchStr[n];
|
|
sal_Int32 nDiff = nLen - n - 1;
|
|
TextSearchJumpTable::value_type aEntry( cCh, nDiff );
|
|
|
|
::std::pair< TextSearchJumpTable::iterator, bool > aPair =
|
|
pJumpTable->insert( aEntry );
|
|
if ( !aPair.second )
|
|
(*(aPair.first)).second = nDiff;
|
|
}
|
|
}
|
|
|
|
void TextSearch::MakeForwardTab2()
|
|
{
|
|
// create the jumptable for the search text
|
|
if( pJumpTable2 && bIsForwardTab )
|
|
{
|
|
return; // the jumpTable is ok
|
|
}
|
|
bIsForwardTab = true;
|
|
|
|
sal_Int32 n, nLen = sSrchStr2.getLength();
|
|
pJumpTable2.reset( new TextSearchJumpTable );
|
|
|
|
for( n = 0; n < nLen - 1; ++n )
|
|
{
|
|
sal_Unicode cCh = sSrchStr2[n];
|
|
sal_Int32 nDiff = nLen - n - 1;
|
|
|
|
TextSearchJumpTable::value_type aEntry( cCh, nDiff );
|
|
::std::pair< TextSearchJumpTable::iterator, bool > aPair =
|
|
pJumpTable2->insert( aEntry );
|
|
if ( !aPair.second )
|
|
(*(aPair.first)).second = nDiff;
|
|
}
|
|
}
|
|
|
|
void TextSearch::MakeBackwardTab()
|
|
{
|
|
// create the jumptable for the search text
|
|
if( pJumpTable && !bIsForwardTab)
|
|
{
|
|
return; // the jumpTable is ok
|
|
}
|
|
bIsForwardTab = false;
|
|
|
|
sal_Int32 n, nLen = sSrchStr.getLength();
|
|
pJumpTable.reset( new TextSearchJumpTable );
|
|
|
|
for( n = nLen-1; n > 0; --n )
|
|
{
|
|
sal_Unicode cCh = sSrchStr[n];
|
|
TextSearchJumpTable::value_type aEntry( cCh, n );
|
|
::std::pair< TextSearchJumpTable::iterator, bool > aPair =
|
|
pJumpTable->insert( aEntry );
|
|
if ( !aPair.second )
|
|
(*(aPair.first)).second = n;
|
|
}
|
|
}
|
|
|
|
void TextSearch::MakeBackwardTab2()
|
|
{
|
|
// create the jumptable for the search text
|
|
if( pJumpTable2 && !bIsForwardTab )
|
|
{
|
|
return; // the jumpTable is ok
|
|
}
|
|
bIsForwardTab = false;
|
|
|
|
sal_Int32 n, nLen = sSrchStr2.getLength();
|
|
pJumpTable2.reset( new TextSearchJumpTable );
|
|
|
|
for( n = nLen-1; n > 0; --n )
|
|
{
|
|
sal_Unicode cCh = sSrchStr2[n];
|
|
TextSearchJumpTable::value_type aEntry( cCh, n );
|
|
::std::pair< TextSearchJumpTable::iterator, bool > aPair =
|
|
pJumpTable2->insert( aEntry );
|
|
if ( !aPair.second )
|
|
(*(aPair.first)).second = n;
|
|
}
|
|
}
|
|
|
|
sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
|
|
{
|
|
TextSearchJumpTable *pJump;
|
|
OUString sSearchKey;
|
|
|
|
if ( bUsePrimarySrchStr ) {
|
|
pJump = pJumpTable.get();
|
|
sSearchKey = sSrchStr;
|
|
} else {
|
|
pJump = pJumpTable2.get();
|
|
sSearchKey = sSrchStr2;
|
|
}
|
|
|
|
TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
|
|
if ( iLook == pJump->end() )
|
|
return sSearchKey.getLength();
|
|
return (*iLook).second;
|
|
}
|
|
|
|
|
|
SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
|
|
{
|
|
SearchResult aRet;
|
|
aRet.subRegExpressions = 0;
|
|
|
|
OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
|
|
|
|
sal_Int32 nSuchIdx = searchStr.getLength();
|
|
sal_Int32 nEnd = endPos;
|
|
if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
|
|
return aRet;
|
|
|
|
|
|
if( nEnd < sSearchKey.getLength() ) // position inside the search region ?
|
|
return aRet;
|
|
|
|
nEnd -= sSearchKey.getLength();
|
|
|
|
if (bUsePrimarySrchStr)
|
|
MakeForwardTab(); // create the jumptable
|
|
else
|
|
MakeForwardTab2();
|
|
|
|
for (sal_Int32 nCmpIdx = startPos; // start position for the search
|
|
nCmpIdx <= nEnd;
|
|
nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
|
|
{
|
|
nSuchIdx = sSearchKey.getLength() - 1;
|
|
while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
|
|
{
|
|
if( nSuchIdx == 0 )
|
|
{
|
|
if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
|
|
{
|
|
sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
|
|
bool bAtStart = !nCmpIdx;
|
|
bool bAtEnd = nFndEnd == endPos;
|
|
bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
|
|
bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd );
|
|
// * 1 -> only one word in the paragraph
|
|
// * 2 -> at begin of paragraph
|
|
// * 3 -> at end of paragraph
|
|
// * 4 -> inside the paragraph
|
|
if( !( ( bAtStart && bAtEnd ) || // 1
|
|
( bAtStart && bDelimBehind ) || // 2
|
|
( bAtEnd && bDelimBefore ) || // 3
|
|
( bDelimBefore && bDelimBehind ))) // 4
|
|
break;
|
|
}
|
|
|
|
aRet.subRegExpressions = 1;
|
|
aRet.startOffset = { nCmpIdx };
|
|
aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };
|
|
|
|
return aRet;
|
|
}
|
|
else
|
|
nSuchIdx--;
|
|
}
|
|
}
|
|
return aRet;
|
|
}
|
|
|
|
SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
|
|
{
|
|
SearchResult aRet;
|
|
aRet.subRegExpressions = 0;
|
|
|
|
OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
|
|
|
|
sal_Int32 nSuchIdx = searchStr.getLength();
|
|
sal_Int32 nEnd = endPos;
|
|
if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
|
|
return aRet;
|
|
|
|
if (bUsePrimarySrchStr)
|
|
MakeBackwardTab(); // create the jumptable
|
|
else
|
|
MakeBackwardTab2();
|
|
|
|
if( nEnd == nSuchIdx ) // end position for the search
|
|
nEnd = sSearchKey.getLength();
|
|
else
|
|
nEnd += sSearchKey.getLength();
|
|
|
|
sal_Int32 nCmpIdx = startPos; // start position for the search
|
|
|
|
while (nCmpIdx >= nEnd)
|
|
{
|
|
nSuchIdx = 0;
|
|
while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
|
|
searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
|
|
nSuchIdx++;
|
|
if( nSuchIdx >= sSearchKey.getLength() )
|
|
{
|
|
if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
|
|
{
|
|
sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
|
|
bool bAtStart = !nFndStt;
|
|
bool bAtEnd = nCmpIdx == startPos;
|
|
bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
|
|
bool bDelimBefore = bAtStart || // begin of paragraph
|
|
IsDelimiter( searchStr, nFndStt-1 );
|
|
// * 1 -> only one word in the paragraph
|
|
// * 2 -> at begin of paragraph
|
|
// * 3 -> at end of paragraph
|
|
// * 4 -> inside the paragraph
|
|
if( ( bAtStart && bAtEnd ) || // 1
|
|
( bAtStart && bDelimBehind ) || // 2
|
|
( bAtEnd && bDelimBefore ) || // 3
|
|
( bDelimBefore && bDelimBehind )) // 4
|
|
{
|
|
aRet.subRegExpressions = 1;
|
|
aRet.startOffset = { nCmpIdx };
|
|
aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
|
|
return aRet;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
aRet.subRegExpressions = 1;
|
|
aRet.startOffset = { nCmpIdx };
|
|
aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
|
|
return aRet;
|
|
}
|
|
}
|
|
nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
|
|
if( nCmpIdx < nSuchIdx )
|
|
return aRet;
|
|
nCmpIdx -= nSuchIdx;
|
|
}
|
|
return aRet;
|
|
}
|
|
|
|
void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
|
|
{
|
|
TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
|
|
// select the transliterated pattern string
|
|
const OUString& rPatternStr =
|
|
(isSimpleTrans(transliterateFlags) ? sSrchStr
|
|
: (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));
|
|
|
|
sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
|
|
// map css::util::SearchFlags to ICU uregex.h flags
|
|
// TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
|
|
// REG_NEWLINE is neither properly defined nor used anywhere => not implemented
|
|
// REG_NOSUB is not used anywhere => not implemented
|
|
// NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
|
|
// LEV_RELAXED is only used for SearchAlgorithm==Approximate
|
|
// Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
|
|
// probably because the transliteration flag IGNORE_CASE handles it as well.
|
|
if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
|
|
|| (transliterateFlags & TransliterationFlags::IGNORE_CASE))
|
|
nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
|
|
UErrorCode nIcuErr = U_ZERO_ERROR;
|
|
// assumption: transliteration didn't mangle regexp control chars
|
|
icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
|
|
#ifndef DISABLE_WORDBOUND_EMULATION
|
|
// for convenience specific syntax elements of the old regex engine are emulated
|
|
// - by replacing \< with "word-break followed by a look-ahead word-char"
|
|
static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant);
|
|
static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant);
|
|
static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
|
|
aChevronMatcherB.reset( aIcuSearchPatStr);
|
|
aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
|
|
aChevronMatcherB.reset();
|
|
// - by replacing \> with "look-behind word-char followed by a word-break"
|
|
static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant);
|
|
static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant);
|
|
static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
|
|
aChevronMatcherE.reset( aIcuSearchPatStr);
|
|
aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
|
|
aChevronMatcherE.reset();
|
|
#endif
|
|
pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
|
|
if (nIcuErr)
|
|
{
|
|
SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
|
|
pRegexMatcher.reset();
|
|
}
|
|
else
|
|
{
|
|
// Pathological patterns may result in exponential run time making the
|
|
// application appear to be frozen. Limit that. Documentation for this
|
|
// call says
|
|
// https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
|
|
// "The units of the limit are steps of the match engine.
|
|
// Correspondence with actual processor time will depend on the speed
|
|
// of the processor and the details of the specific pattern, but will
|
|
// typically be on the order of milliseconds."
|
|
// Just what is a good value? 42 is always an answer ... the 23 enigma
|
|
// as well... which on the dev's machine is roughly 50 seconds with the
|
|
// pattern of fdo#70627.
|
|
/* TODO: make this a configuration settable value and possibly take
|
|
* complexity of expression into account and maybe even length of text
|
|
* to be matched; currently (2013-11-25) that is at most one 64k
|
|
* paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
|
|
pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
|
|
}
|
|
}
|
|
|
|
|
|
static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
|
|
sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
|
|
{
|
|
pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
|
|
pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
|
|
pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
|
|
// look-ahead/behind assertions
|
|
|
|
if (!pRegexMatcher->find(rIcuErr))
|
|
{
|
|
/* TODO: future versions could pass the UErrorCode or translations
|
|
* thereof to the caller, for example to inform the user of
|
|
* U_REGEX_TIME_OUT. The strange thing though is that an error is set
|
|
* only after the second call that returns immediately and not if
|
|
* timeout occurred on the first call?!? */
|
|
SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
|
|
sal_Int32 startPos, sal_Int32 endPos )
|
|
{
|
|
SearchResult aRet;
|
|
aRet.subRegExpressions = 0;
|
|
if( !pRegexMatcher)
|
|
return aRet;
|
|
|
|
if( endPos > searchStr.getLength())
|
|
endPos = searchStr.getLength();
|
|
|
|
// use the ICU RegexMatcher to find the matches
|
|
UErrorCode nIcuErr = U_ZERO_ERROR;
|
|
const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
|
|
searchStr.getLength());
|
|
pRegexMatcher->reset( aSearchTargetStr);
|
|
// search until there is a valid match
|
|
for(;;)
|
|
{
|
|
if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
|
|
return aRet;
|
|
|
|
// #i118887# ignore zero-length matches e.g. "a*" in "bc"
|
|
int nStartOfs = pRegexMatcher->start( nIcuErr);
|
|
int nEndOfs = pRegexMatcher->end( nIcuErr);
|
|
if( nStartOfs < nEndOfs)
|
|
break;
|
|
// If the zero-length match is behind the string, do not match it again
|
|
// and again until startPos reaches there. A match behind the string is
|
|
// a "$" anchor.
|
|
if (nStartOfs == endPos)
|
|
break;
|
|
// try at next position if there was a zero-length match
|
|
if( ++startPos >= endPos)
|
|
return aRet;
|
|
}
|
|
|
|
// extract the result of the search
|
|
const int nGroupCount = pRegexMatcher->groupCount();
|
|
aRet.subRegExpressions = nGroupCount + 1;
|
|
aRet.startOffset.realloc( aRet.subRegExpressions);
|
|
auto pstartOffset = aRet.startOffset.getArray();
|
|
aRet.endOffset.realloc( aRet.subRegExpressions);
|
|
auto pendOffset = aRet.endOffset.getArray();
|
|
pstartOffset[0] = pRegexMatcher->start( nIcuErr);
|
|
pendOffset[0] = pRegexMatcher->end( nIcuErr);
|
|
for( int i = 1; i <= nGroupCount; ++i) {
|
|
pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
|
|
pendOffset[i] = pRegexMatcher->end( i, nIcuErr);
|
|
}
|
|
|
|
return aRet;
|
|
}
|
|
|
|
SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
|
|
sal_Int32 startPos, sal_Int32 endPos )
|
|
{
|
|
// NOTE: for backwards search callers provide startPos/endPos inverted!
|
|
SearchResult aRet;
|
|
aRet.subRegExpressions = 0;
|
|
if( !pRegexMatcher)
|
|
return aRet;
|
|
|
|
if( startPos > searchStr.getLength())
|
|
startPos = searchStr.getLength();
|
|
|
|
// use the ICU RegexMatcher to find the matches
|
|
// TODO: use ICU's backward searching once it becomes available
|
|
// as its replacement using forward search is not as good as the real thing
|
|
UErrorCode nIcuErr = U_ZERO_ERROR;
|
|
const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
|
|
searchStr.getLength());
|
|
pRegexMatcher->reset( aSearchTargetStr);
|
|
if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
|
|
return aRet;
|
|
|
|
// find the last match
|
|
int nLastPos = 0;
|
|
int nFoundEnd = 0;
|
|
int nGoodPos = 0, nGoodEnd = 0;
|
|
bool bFirst = true;
|
|
do {
|
|
nLastPos = pRegexMatcher->start( nIcuErr);
|
|
nFoundEnd = pRegexMatcher->end( nIcuErr);
|
|
if (nLastPos < nFoundEnd)
|
|
{
|
|
// remember last non-zero-length match
|
|
nGoodPos = nLastPos;
|
|
nGoodEnd = nFoundEnd;
|
|
}
|
|
if( nFoundEnd >= startPos)
|
|
break;
|
|
bFirst = false;
|
|
if( nFoundEnd == nLastPos)
|
|
++nFoundEnd;
|
|
} while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));
|
|
|
|
// Ignore all zero-length matches except "$" anchor on first match.
|
|
if (nGoodPos == nGoodEnd)
|
|
{
|
|
if (bFirst && nLastPos == startPos)
|
|
nGoodPos = nLastPos;
|
|
else
|
|
return aRet;
|
|
}
|
|
|
|
// find last match again to get its details
|
|
lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);
|
|
|
|
// fill in the details of the last match
|
|
const int nGroupCount = pRegexMatcher->groupCount();
|
|
aRet.subRegExpressions = nGroupCount + 1;
|
|
aRet.startOffset.realloc( aRet.subRegExpressions);
|
|
auto pstartOffset = aRet.startOffset.getArray();
|
|
aRet.endOffset.realloc( aRet.subRegExpressions);
|
|
auto pendOffset = aRet.endOffset.getArray();
|
|
// NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
|
|
pstartOffset[0] = pRegexMatcher->end( nIcuErr);
|
|
pendOffset[0] = pRegexMatcher->start( nIcuErr);
|
|
for( int i = 1; i <= nGroupCount; ++i) {
|
|
pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
|
|
pendOffset[i] = pRegexMatcher->start( i, nIcuErr);
|
|
}
|
|
|
|
return aRet;
|
|
}
|
|
|
|
|
|
// search for words phonetically
|
|
SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
|
|
sal_Int32 startPos, sal_Int32 endPos )
|
|
{
|
|
SearchResult aRet;
|
|
aRet.subRegExpressions = 0;
|
|
|
|
if( !xBreak.is() )
|
|
return aRet;
|
|
|
|
sal_Int32 nStt, nEnd;
|
|
|
|
Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
|
|
aSrchPara.Locale,
|
|
WordType::ANYWORD_IGNOREWHITESPACES, true );
|
|
|
|
do
|
|
{
|
|
if( aWBnd.startPos >= endPos )
|
|
break;
|
|
nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
|
|
nEnd = std::min(aWBnd.endPos, endPos);
|
|
|
|
if( nStt < nEnd &&
|
|
pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
|
|
{
|
|
aRet.subRegExpressions = 1;
|
|
aRet.startOffset = { nStt };
|
|
aRet.endOffset = { nEnd };
|
|
break;
|
|
}
|
|
|
|
nStt = nEnd - 1;
|
|
aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
|
|
WordType::ANYWORD_IGNOREWHITESPACES);
|
|
} while( aWBnd.startPos != aWBnd.endPos ||
|
|
(aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
|
|
// #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
|
|
// whitespace) in searchStr, getWordBoundary() returned startPos,startPos
|
|
// and nextWord() does also => don't loop forever.
|
|
return aRet;
|
|
}
|
|
|
|
SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
|
|
sal_Int32 startPos, sal_Int32 endPos )
|
|
{
|
|
SearchResult aRet;
|
|
aRet.subRegExpressions = 0;
|
|
|
|
if( !xBreak.is() )
|
|
return aRet;
|
|
|
|
sal_Int32 nStt, nEnd;
|
|
|
|
Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
|
|
aSrchPara.Locale,
|
|
WordType::ANYWORD_IGNOREWHITESPACES, true );
|
|
|
|
do
|
|
{
|
|
if( aWBnd.endPos <= endPos )
|
|
break;
|
|
nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
|
|
nEnd = std::min(aWBnd.endPos, startPos);
|
|
|
|
if( nStt < nEnd &&
|
|
pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
|
|
{
|
|
aRet.subRegExpressions = 1;
|
|
aRet.startOffset = { nEnd };
|
|
aRet.endOffset = { nStt };
|
|
break;
|
|
}
|
|
if( !nStt )
|
|
break;
|
|
|
|
aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
|
|
WordType::ANYWORD_IGNOREWHITESPACES);
|
|
} while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
|
|
return aRet;
|
|
}
|
|
|
|
|
|
namespace {
|
|
void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
|
|
{
|
|
rRes.subRegExpressions = 1;
|
|
rRes.startOffset = { nStartOffset };
|
|
rRes.endOffset = { nEndOffset };
|
|
}
|
|
}
|
|
|
|
SearchResult TextSearch::WildcardSrchFrwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
|
|
{
|
|
SearchResult aRes;
|
|
aRes.subRegExpressions = 0; // no match
|
|
sal_Int32 nStartOffset = nStartPos;
|
|
sal_Int32 nEndOffset = nEndPos;
|
|
|
|
const sal_Int32 nStringLen = searchStr.getLength();
|
|
|
|
// Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
|
|
// string match with [0,0).
|
|
if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
|
|
(nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
|
|
return aRes;
|
|
|
|
const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
|
|
const sal_Int32 nPatternLen = rPattern.getLength();
|
|
|
|
// Handle special cases empty pattern and/or string outside of the loop to
|
|
// not add performance penalties there and simplify.
|
|
if (nStartPos == nEndPos)
|
|
{
|
|
sal_Int32 i = 0;
|
|
while (i < nPatternLen && rPattern[i] == '*')
|
|
++i;
|
|
if (i == nPatternLen)
|
|
setWildcardMatch( aRes, nStartOffset, nEndOffset);
|
|
return aRes;
|
|
}
|
|
|
|
// Empty pattern does not match any non-empty string.
|
|
if (!nPatternLen)
|
|
return aRes;
|
|
|
|
bool bRewind = false;
|
|
sal_uInt32 cPattern = 0;
|
|
sal_Int32 nPattern = 0;
|
|
sal_Int32 nAfterFakePattern = nPattern;
|
|
if (mbWildcardAllowSubstring)
|
|
{
|
|
// Fake a leading '*' wildcard.
|
|
cPattern = '*';
|
|
bRewind = true;
|
|
// Assume a non-'*' pattern character follows. If it is a '*' instead
|
|
// that will be handled in the loop by setting nPat.
|
|
sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
|
|
if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
|
|
rPattern.iterateCodePoints( &nAfterFakePattern);
|
|
}
|
|
|
|
sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
|
|
sal_uInt32 cPatternAfterAsterisk = 0;
|
|
bool bEscaped = false, bEscapedAfterAsterisk = false;
|
|
|
|
// The loop code tries to avoid costly calls to iterateCodePoints() when
|
|
// possible.
|
|
|
|
do
|
|
{
|
|
if (bRewind)
|
|
{
|
|
// Reuse cPattern after '*', nPattern was correspondingly
|
|
// incremented to point behind cPattern.
|
|
bRewind = false;
|
|
}
|
|
else if (nPattern < nPatternLen)
|
|
{
|
|
// nPattern will be incremented by iterateCodePoints().
|
|
cPattern = rPattern.iterateCodePoints( &nPattern);
|
|
if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
|
|
{
|
|
bEscaped = true;
|
|
cPattern = rPattern.iterateCodePoints( &nPattern);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// A trailing '*' is handled below.
|
|
if (mbWildcardAllowSubstring)
|
|
{
|
|
// If the pattern is consumed and substring match allowed we're good.
|
|
setWildcardMatch( aRes, nStartOffset, nString);
|
|
return aRes;
|
|
}
|
|
else if (nString < nEndPos && nLastAsterisk >= 0)
|
|
{
|
|
// If substring match is not allowed try a greedy '*' match.
|
|
nPattern = nLastAsterisk;
|
|
continue; // do
|
|
}
|
|
else
|
|
return aRes;
|
|
}
|
|
|
|
if (cPattern == '*' && !bEscaped)
|
|
{
|
|
// '*' is one code unit, so not using iterateCodePoints() is ok.
|
|
while (nPattern < nPatternLen && rPattern[nPattern] == '*')
|
|
++nPattern;
|
|
|
|
if (nPattern >= nPatternLen)
|
|
{
|
|
// Last pattern is '*', remaining string matches.
|
|
setWildcardMatch( aRes, nStartOffset, nEndOffset);
|
|
return aRes;
|
|
}
|
|
|
|
nLastAsterisk = nPattern; // Remember last encountered '*'.
|
|
|
|
// cPattern will be the next non-'*' character, nPattern
|
|
// incremented.
|
|
cPattern = rPattern.iterateCodePoints( &nPattern);
|
|
if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
|
|
{
|
|
bEscaped = true;
|
|
cPattern = rPattern.iterateCodePoints( &nPattern);
|
|
}
|
|
|
|
cPatternAfterAsterisk = cPattern;
|
|
bEscapedAfterAsterisk = bEscaped;
|
|
nPat = nPattern; // Remember position of pattern behind '*', already incremented.
|
|
nStr = nString; // Remember the current string to be matched.
|
|
}
|
|
|
|
if (nString >= nEndPos)
|
|
// Whatever follows in pattern, string will not match.
|
|
return aRes;
|
|
|
|
// nString will be incremented by iterateCodePoints().
|
|
sal_uInt32 cString = searchStr.iterateCodePoints( &nString);
|
|
|
|
if ((cPattern != '?' || bEscaped) && cPattern != cString)
|
|
{
|
|
if (nPat == -1)
|
|
// Non-match already without any '*' pattern.
|
|
return aRes;
|
|
|
|
bRewind = true;
|
|
nPattern = nPat; // Rewind pattern to character behind '*', already incremented.
|
|
cPattern = cPatternAfterAsterisk;
|
|
bEscaped = bEscapedAfterAsterisk;
|
|
searchStr.iterateCodePoints( &nStr);
|
|
nString = nStr; // Restore incremented remembered string position.
|
|
if (nPat == nAfterFakePattern)
|
|
{
|
|
// Next start offset will be the next character.
|
|
nStartOffset = nString;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// An unescaped '?' pattern matched any character, or characters
|
|
// matched. Reset only escaped state.
|
|
bEscaped = false;
|
|
}
|
|
}
|
|
while (nString < nEndPos);
|
|
|
|
if (bRewind)
|
|
return aRes;
|
|
|
|
// Eat trailing '*' pattern that matches anything, including nothing.
|
|
// '*' is one code unit, so not using iterateCodePoints() is ok.
|
|
while (nPattern < nPatternLen && rPattern[nPattern] == '*')
|
|
++nPattern;
|
|
|
|
if (nPattern == nPatternLen)
|
|
setWildcardMatch( aRes, nStartOffset, nEndOffset);
|
|
return aRes;
|
|
}
|
|
|
|
SearchResult TextSearch::WildcardSrchBkwrd( const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
|
|
{
|
|
SearchResult aRes;
|
|
aRes.subRegExpressions = 0; // no match
|
|
|
|
sal_Int32 nStartOffset = nStartPos;
|
|
sal_Int32 nEndOffset = nEndPos;
|
|
|
|
const sal_Int32 nStringLen = searchStr.getLength();
|
|
|
|
// Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
|
|
// string match with (0,0].
|
|
if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
|
|
(nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
|
|
return aRes;
|
|
|
|
const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
|
|
sal_Int32 nPatternLen = rPattern.getLength();
|
|
|
|
// Handle special cases empty pattern and/or string outside of the loop to
|
|
// not add performance penalties there and simplify.
|
|
if (nStartPos == nEndPos)
|
|
{
|
|
sal_Int32 i = 0;
|
|
while (i < nPatternLen && rPattern[i] == '*')
|
|
++i;
|
|
if (i == nPatternLen)
|
|
setWildcardMatch( aRes, nStartOffset, nEndOffset);
|
|
return aRes;
|
|
}
|
|
|
|
// Empty pattern does not match any non-empty string.
|
|
if (!nPatternLen)
|
|
return aRes;
|
|
|
|
// Reverse escaped patterns to ease the handling of escapes, keeping escape
|
|
// and following character as one sequence in backward direction.
|
|
if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
|
|
(!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
|
|
{
|
|
OUStringBuffer aPatternBuf( rPattern);
|
|
sal_Int32 nIndex = 0;
|
|
while (nIndex < nPatternLen)
|
|
{
|
|
const sal_Int32 nOld = nIndex;
|
|
const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
|
|
if (cu == mcWildcardEscapeChar)
|
|
{
|
|
if (nIndex < nPatternLen)
|
|
{
|
|
if (nIndex - nOld == 1)
|
|
{
|
|
// Simply move code units, we already memorized the one
|
|
// in 'cu'.
|
|
const sal_Int32 nOld2 = nIndex;
|
|
rPattern.iterateCodePoints( &nIndex);
|
|
for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
|
|
aPatternBuf[nOld+i] = rPattern[nOld2+i];
|
|
aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
|
|
}
|
|
else
|
|
{
|
|
// Copy the escape character code units first in the
|
|
// unlikely case that it would not be of BMP.
|
|
assert(nIndex - nOld == 2); // it's UTF-16, so...
|
|
sal_Unicode buf[2];
|
|
buf[0] = rPattern[nOld];
|
|
buf[1] = rPattern[nOld+1];
|
|
const sal_Int32 nOld2 = nIndex;
|
|
rPattern.iterateCodePoints( &nIndex);
|
|
for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
|
|
aPatternBuf[nOld+i] = rPattern[nOld2+i];
|
|
aPatternBuf[nIndex-2] = buf[0];
|
|
aPatternBuf[nIndex-1] = buf[1];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Trailing escape would become leading escape, do what?
|
|
// Eliminate.
|
|
aPatternBuf.remove( nOld, nIndex - nOld);
|
|
}
|
|
}
|
|
}
|
|
if (bUsePrimarySrchStr)
|
|
maWildcardReversePattern = aPatternBuf.makeStringAndClear();
|
|
else
|
|
maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
|
|
}
|
|
const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
|
|
nPatternLen = rReversePattern.getLength();
|
|
|
|
bool bRewind = false;
|
|
sal_uInt32 cPattern = 0;
|
|
sal_Int32 nPattern = nPatternLen;
|
|
sal_Int32 nAfterFakePattern = nPattern;
|
|
if (mbWildcardAllowSubstring)
|
|
{
|
|
// Fake a trailing '*' wildcard.
|
|
cPattern = '*';
|
|
bRewind = true;
|
|
// Assume a non-'*' pattern character follows. If it is a '*' instead
|
|
// that will be handled in the loop by setting nPat.
|
|
sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
|
|
if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
|
|
rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
|
|
}
|
|
|
|
sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
|
|
sal_uInt32 cPatternAfterAsterisk = 0;
|
|
bool bEscaped = false, bEscapedAfterAsterisk = false;
|
|
|
|
// The loop code tries to avoid costly calls to iterateCodePoints() when
|
|
// possible.
|
|
|
|
do
|
|
{
|
|
if (bRewind)
|
|
{
|
|
// Reuse cPattern after '*', nPattern was correspondingly
|
|
// decremented to point before cPattern.
|
|
bRewind = false;
|
|
}
|
|
else if (nPattern > 0)
|
|
{
|
|
// nPattern will be decremented by iterateCodePoints().
|
|
cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
|
|
if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
|
|
{
|
|
bEscaped = true;
|
|
cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// A trailing '*' is handled below.
|
|
if (mbWildcardAllowSubstring)
|
|
{
|
|
// If the pattern is consumed and substring match allowed we're good.
|
|
setWildcardMatch( aRes, nStartOffset, nString);
|
|
return aRes;
|
|
}
|
|
else if (nString > nEndPos && nLastAsterisk >= 0)
|
|
{
|
|
// If substring match is not allowed try a greedy '*' match.
|
|
nPattern = nLastAsterisk;
|
|
continue; // do
|
|
}
|
|
else
|
|
return aRes;
|
|
}
|
|
|
|
if (cPattern == '*' && !bEscaped)
|
|
{
|
|
// '*' is one code unit, so not using iterateCodePoints() is ok.
|
|
while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
|
|
--nPattern;
|
|
|
|
if (nPattern <= 0)
|
|
{
|
|
// First pattern is '*', remaining string matches.
|
|
setWildcardMatch( aRes, nStartOffset, nEndOffset);
|
|
return aRes;
|
|
}
|
|
|
|
nLastAsterisk = nPattern; // Remember last encountered '*'.
|
|
|
|
// cPattern will be the previous non-'*' character, nPattern
|
|
// decremented.
|
|
cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
|
|
if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
|
|
{
|
|
bEscaped = true;
|
|
cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
|
|
}
|
|
|
|
cPatternAfterAsterisk = cPattern;
|
|
bEscapedAfterAsterisk = bEscaped;
|
|
nPat = nPattern; // Remember position of pattern before '*', already decremented.
|
|
nStr = nString; // Remember the current string to be matched.
|
|
}
|
|
|
|
if (nString <= nEndPos)
|
|
// Whatever leads in pattern, string will not match.
|
|
return aRes;
|
|
|
|
// nString will be decremented by iterateCodePoints().
|
|
sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);
|
|
|
|
if ((cPattern != '?' || bEscaped) && cPattern != cString)
|
|
{
|
|
if (nPat == -1)
|
|
// Non-match already without any '*' pattern.
|
|
return aRes;
|
|
|
|
bRewind = true;
|
|
nPattern = nPat; // Rewind pattern to character before '*', already decremented.
|
|
cPattern = cPatternAfterAsterisk;
|
|
bEscaped = bEscapedAfterAsterisk;
|
|
searchStr.iterateCodePoints( &nStr, -1);
|
|
nString = nStr; // Restore decremented remembered string position.
|
|
if (nPat == nAfterFakePattern)
|
|
{
|
|
// Next start offset will be this character (exclusive).
|
|
nStartOffset = nString;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// An unescaped '?' pattern matched any character, or characters
|
|
// matched. Reset only escaped state.
|
|
bEscaped = false;
|
|
}
|
|
}
|
|
while (nString > nEndPos);
|
|
|
|
if (bRewind)
|
|
return aRes;
|
|
|
|
// Eat leading '*' pattern that matches anything, including nothing.
|
|
// '*' is one code unit, so not using iterateCodePoints() is ok.
|
|
while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
|
|
--nPattern;
|
|
|
|
if (nPattern == 0)
|
|
setWildcardMatch( aRes, nStartOffset, nEndOffset);
|
|
return aRes;
|
|
}
|
|
|
|
|
|
OUString SAL_CALL
|
|
TextSearch::getImplementationName()
|
|
{
|
|
return u"com.sun.star.util.TextSearch_i18n"_ustr;
|
|
}
|
|
|
|
sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
|
|
{
|
|
return cppu::supportsService(this, rServiceName);
|
|
}
|
|
|
|
Sequence< OUString > SAL_CALL
|
|
TextSearch::getSupportedServiceNames()
|
|
{
|
|
return { u"com.sun.star.util.TextSearch"_ustr, u"com.sun.star.util.TextSearch2"_ustr };
|
|
}
|
|
|
|
extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
|
|
i18npool_TextSearch_get_implementation(
|
|
css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
|
|
{
|
|
return cppu::acquire(new TextSearch(context));
|
|
}
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|