office-gobmx/i18npool/source/search/levdis.cxx
Mike Kaganski 995b3186fa Use std algorithms here
Change-Id: Ib7bb92cca1f52067f9030b6c6fdc088409ca10ef
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/113601
Tested-by: Jenkins
Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
2021-04-05 15:16:23 +02:00

373 lines
12 KiB
C++

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
/*
Weighted Levenshtein Distance
including wildcards
'*' for any number (0 or more) of arbitrary characters
'?' for exactly one arbitrary character
escapable with backslash, "\*" or "\?"
Return:
WLD if WLD <= nLimit, else nLimit+1
or, if bSplitCount:
WLD if WLD <= nLimit
-WLD if Replace and Insert and Delete <= nLimit
else nLimit+1
Recursive definition of WLD:
WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
WLD( X(i) , Y(j-1) ) + q ,
WLD( X(i-1), Y(j) ) + r )
X(i) := the first i characters of the word X
Y(j) := the first j characters of the word Y
p(i,j) := 0 if i-th character of X == j-th character of Y,
p else
Boundary conditions:
WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
WLD( X(0), Y(0) ) := 0
Instead of recursions a dynamic algorithm is used.
See also: German computer magazine
c't 07/89 pages 192-208 and c't 03/94 pages 230-239
*/
#include <algorithm>
#include <numeric>
#include "levdis.hxx"
#define LEVDISBIG (nLimit + 1) // "too far" return value, used when the distance exceeds nLimit; needs nLimit in scope
#define LEVDISDOUBLEBUF 2048 // border for growing the distance array: string lengths below it get the array doubled
// Number of characters in pStr in front of the terminating 0 character.
static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
{
    const sal_Unicode* pEnd = pStr;
    for ( ; *pEnd != 0; ++pEnd )
    {
        // nothing to do, just walk to the terminator
    }
    return static_cast<sal_Int32>( pEnd - pStr );
}
// Weighted Levenshtein distance from a string to the stored pattern.
//
// cString     characters to compare; indices 0 .. nStringLen-1 are read
// nStringLen  number of characters of cString to take into account
//
// Returns
//   the distance, if the distance is <= nLimit;
//   the negated distance, if bSplitCount is set, the distance is
//     <= 2 * nLimit and the counted replacements weigh <= nLimit;
//   nLimit + 1 (LEVDISBIG) in every other case, including the early exit
//     taken when the length difference alone already exceeds the limit.
int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
{
    int nSPMin = 0;     // minimum of the penalty points in the current column
    int nRepS = 0;      // replacement counter, only maintained for bSplitCount

    // length difference between pattern (not counting '*' wildcards) and string
    int nLenDiff = nPatternLen - nStars - nStringLen;
    // more insertions or deletions necessary than the limit allows? Then leave
    if ( (nLenDiff * nInsQ0 > nLimit)
            || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
        return LEVDISBIG;

    // string does not fit into the current distance array -> enlarge the array
    if ( nStringLen >= nArrayLen )
    {
        // Grow with generous headroom so that slightly longer strings in
        // later calls do not force yet another reallocation: double the size
        // below the LEVDISDOUBLEBUF border, add LEVDISDOUBLEBUF spare entries
        // at or above it.
        if ( nStringLen < LEVDISDOUBLEBUF )
            nArrayLen = 2 * nStringLen;
        else
            nArrayLen = nStringLen + LEVDISDOUBLEBUF;
        npDistance = aDisMem.NewMem( nArrayLen );
    }

    // Calculate start values of the second column (first pattern value).
    // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
    // therefore the minimum is 0
    if ( nPatternLen == 0 )
    {
        // Count of deletions to reach pattern
        for ( sal_Int32 i=0; i <= nStringLen; i++ )
            npDistance[i] = i * nDelR0;
    }
    else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
    {
        // instead of a '*' you can fit in anything
        for ( sal_Int32 i=0; i <= nStringLen; i++ )
            npDistance[i] = 0;
    }
    else
    {
        sal_Unicode c;
        int nP;
        c = cpPattern[0];
        if ( c == '?' && bpPatIsWild[0] )
            nP = 0;     // a '?' could be any character.
        else
            // Minimum of replacement and deletion+insertion weighting
            nP = std::min( nRepP0, nDelR0 + nInsQ0 );
        // NOTE(review): indices 1 and 2 are written here and read below for
        // nSPMin regardless of nStringLen; this presumably relies on the
        // distance buffer always having at least three entries - confirm
        // against the allocator in levdis.hxx.
        npDistance[0] = nInsQ0;     // start with simple insert
        npDistance[1] = nInsQ0;
        npDistance[2] = nInsQ0;
        int nReplacePos = -1;       // tristate flag: <0 none yet, >0 position, 0 withdrawn
        int nDelCnt = 0;
        for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
        {
            if ( cString[i-1] == c )
                nP = 0;     // Replace from this position is 0
            // Deletions to match pattern + Replace
            npDistance[i] = nDelCnt + nP;
            if ( bSplitCount )
            {
                if ( nReplacePos < 0 && nP )
                {   // this position will be replaced
                    nRepS++;
                    nReplacePos = i;
                }
                else if ( nReplacePos > 0 && !nP )
                {
                    // same count of c
                    int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
                    if ( !nBalance )
                    {   // one was replaced that was an insertion instead
                        nRepS--;
                        nReplacePos = 0;
                    }
                }
            }
        }
        nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] });
    }

    // calculate distance matrix
    sal_Int32 j = 0;        // for all columns of the pattern, as long as the limit is not exceeded
    while ( (j < nPatternLen-1)
             && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
    {
        sal_Unicode c;
        int nP, nQ, nR, nPij, d2;

        j++;
        c = cpPattern[j];
        if ( bpPatIsWild[j] )   // '*' or '?' not escaped
            nP = 0;     // could be replaced without penalty
        else
            nP = nRepP0;
        if ( c == '*' && bpPatIsWild[j] )
        {
            nQ = 0;     // insertion and deletion without penalty
            nR = 0;
        }
        else
        {
            nQ = nInsQ0;    // usual weighting
            nR = nDelR0;
        }
        d2 = npDistance[0];
        // increase insert count to get from null string to pattern
        npDistance[0] = npDistance[0] + nQ;
        nSPMin = npDistance[0];
        int nReplacePos = -1;       // tristate flag: <0 none yet, >0 position, 0 withdrawn
        // for each pattern column run through the string
        for ( sal_Int32 i=1; i <= nStringLen; i++ )
        {
            int d1 = d2;            // WLD( X(i-1), Y(j-1) )
            d2 = npDistance[i];     // WLD( X(i)  , Y(j-1) )
            if ( cString[i-1] == c )
            {
                nPij = 0;           // p(i,j)
                if ( nReplacePos < 0 )
                {
                    // same count of c
                    int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
                    if ( !nBalance )
                        nReplacePos = 0;    // no replacement
                }
            }
            else
                nPij = nP;
            // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
            //                          WLD( X(i)  , Y(j-1) ) + q      ,
            //                          WLD( X(i-1), Y(j)   ) + r      )
            npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR });
            if ( npDistance[i] < nSPMin )
                nSPMin = npDistance[i];
            if ( bSplitCount )
            {
                if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
                {   // this position will be replaced
                    nRepS++;
                    nReplacePos = i;
                }
                else if ( nReplacePos > 0 && !nPij )
                {
                    // character is equal in string and pattern
                    //
                    // If from this point:
                    // * pattern and string have the same count of this
                    //   character
                    // * and character count is the same before this position
                    // then the replace was none.
                    //
                    // Scrambled letters are recognized here and the nRepS
                    // replacement is withdrawn, whereby the double limit kicks
                    // in.

                    // Same count of c
                    int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
                    if ( !nBalance )
                    {   // one was replaced that was an insertion instead
                        nRepS--;
                        nReplacePos = 0;
                    }
                }
            }
        }
    }

    // strict result: everything stayed within the limit
    if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
        return npDistance[nStringLen];

    // relaxed result: up to the double limit is tolerated as long as the
    // replacements on their own stay within the limit
    if ( bSplitCount )
    {
        if ( nRepS && nLenDiff > 0 )
            nRepS -= nLenDiff;      // Inserts were counted
        if ( (nSPMin <= 2 * nLimit)
                && (npDistance[nStringLen] <= 2 * nLimit)
                && (nRepS * nRepP0 <= nLimit) )
            return -npDistance[nStringLen];
    }
    return LEVDISBIG;
}
// Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
// from user values nOtherX, nShorterY, nLongerZ, bRelaxed
void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
{
if ( nX < 0 ) nX = 0; // only positive values
if ( nY < 0 ) nY = 0;
if ( nZ < 0 ) nZ = 0;
if (0 == std::min({ nX, nY, nZ })) // at least one 0
{
int nMid, nMax;
nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max
if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
nLimit = nMax; // either 0 or the only one >0
else // one is 0
nLimit = std::lcm( nMid, nMax );
}
else // all three of them are not 0
nLimit = std::lcm(std::lcm(nX, nY), nZ);
nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
bSplitCount = bRelaxed;
}
// Median of three values: the one that is neither the smallest nor the
// largest (with equal values any of the equal ones qualifies).
int WLevDistance::Mid3( int x, int y, int z )
{
    const int nLowXY = std::min( x, y );
    const int nHighXY = std::max( x, y );
    // z below the x/y pair -> lower of the pair; z above -> higher of the
    // pair; z in between -> z itself
    return std::max( nLowXY, std::min( nHighXY, z ) );
}
// initialize data from CTOR
void WLevDistance::InitData( const sal_Unicode* cPattern )
{
cpPattern = aPatMem.GetcPtr();
bpPatIsWild = aPatMem.GetbPtr();
npDistance = aDisMem.GetPtr();
nStars = 0;
const sal_Unicode* cp1 = cPattern;
sal_Unicode* cp2 = cpPattern;
bool* bp = bpPatIsWild;
// copy pattern, count asterisks, escaped Jokers
while ( *cp1 )
{
if ( *cp1 == '\\' ) // maybe escaped
{
if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker?
{
cp1++; // skip '\\'
nPatternLen--;
}
*bp++ = false;
}
else if ( *cp1 == '*' || *cp1 == '?' ) // Joker
{
if ( *cp1 == '*' )
nStars++;
*bp++ = true;
}
else
*bp++ = false;
*cp2++ = *cp1++;
}
*cp2 = '\0';
}
// Constructs a matcher for the zero-terminated pattern cPattern.
// nOtherX, nShorterY, nLongerZ and bRelaxed are the user values that are
// handed on to CalcLPQR, which derives the limit and the weights from them.
WLevDistance::WLevDistance( const sal_Unicode* cPattern,
int nOtherX, int nShorterY, int nLongerZ,
bool bRelaxed ) :
// raw length of cPattern; InitData reduces it for every escaped joker
nPatternLen( Impl_WLD_StringLen(cPattern) ),
// one more than the pattern length, for the terminating 0 written by InitData
aPatMem( nPatternLen + 1 ),
nArrayLen( nPatternLen + 1 ),
aDisMem( nArrayLen )
{
// copy the pattern and mark the jokers
InitData( cPattern );
// compute nLimit, nRepP0, nInsQ0, nDelR0 and bSplitCount
CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
}
// Copy constructor: takes over pattern length, limit, weights and flags of
// rWLD and duplicates its pattern and joker marks into buffers owned by
// this object.
WLevDistance::WLevDistance( const WLevDistance& rWLD )
    : nPatternLen( rWLD.nPatternLen )
    , aPatMem( nPatternLen + 1 )
    , nArrayLen( nPatternLen + 1 )
    , aDisMem( nArrayLen )
    , nLimit( rWLD.nLimit )
    , nRepP0( rWLD.nRepP0 )
    , nInsQ0( rWLD.nInsQ0 )
    , nDelR0( rWLD.nDelR0 )
    , nStars( rWLD.nStars )
    , bSplitCount( rWLD.bSplitCount )
{
    cpPattern = aPatMem.GetcPtr();
    bpPatIsWild = aPatMem.GetbPtr();
    npDistance = aDisMem.GetPtr();

    // duplicate the pattern characters and their joker marks, then terminate
    std::copy_n( rWLD.cpPattern, nPatternLen, cpPattern );
    std::copy_n( rWLD.bpPatIsWild, nPatternLen, bpPatIsWild );
    cpPattern[nPatternLen] = '\0';
}
// Destructor: nothing to do explicitly here.
WLevDistance::~WLevDistance() = default;
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */