995b3186fa
Change-Id: Ib7bb92cca1f52067f9030b6c6fdc088409ca10ef Reviewed-on: https://gerrit.libreoffice.org/c/core/+/113601 Tested-by: Jenkins Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
373 lines
12 KiB
C++
373 lines
12 KiB
C++
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
|
/*
|
|
* This file is part of the LibreOffice project.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
*
|
|
* This file incorporates work covered by the following license notice:
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
|
|
*/
|
|
|
|
/*
|
|
|
|
Weighted Levenshtein Distance
|
|
including wildcards
|
|
'*' for any number (0 or more) of arbitrary characters
|
|
'?' for exactly one arbitrary character
|
|
escapable with backslash, "\*" or "\?"
|
|
|
|
Return:
|
|
WLD if WLD <= nLimit, else nLimit+1
|
|
|
|
or, if bSplitCount:
|
|
WLD if WLD <= nLimit
|
|
-WLD if Replace and Insert and Delete <= nLimit
|
|
else nLimit+1
|
|
|
|
Recursive definition of WLD:
|
|
|
|
WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
|
|
WLD( X(i) , Y(j-1) ) + q ,
|
|
WLD( X(i-1), Y(j) ) + r )
|
|
|
|
X(i) := the first i characters of the word X
|
|
Y(j) := the first j characters of the word Y
|
|
p(i,j) := 0 if i-th character of X == j-th character of Y,
|
|
p else
|
|
|
|
Boundary conditions:
|
|
WLD( X(0), Y(j) ) := j*q (Y created by j inserts)
|
|
WLD( X(i), Y(0) ) := i*r (Y created by i deletes)
|
|
WLD( X(0), Y(0) ) := 0
|
|
|
|
Instead of recursions a dynamic algorithm is used.
|
|
|
|
See also: German computer magazine
|
|
c't 07/89 pages 192-208 and c't 03/94 pages 230-239
|
|
*/
|
|
|
|
#include <algorithm>
|
|
#include <numeric>
|
|
|
|
#include "levdis.hxx"
|
|
|
|
#define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit
|
|
#define LEVDISDOUBLEBUF 2048 // no doubling atop this border
|
|
|
|
static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
|
|
{
|
|
const sal_Unicode* pTempStr = pStr;
|
|
while( *pTempStr )
|
|
pTempStr++;
|
|
return static_cast<sal_Int32>(pTempStr-pStr);
|
|
}
|
|
|
|
// Distance from string to pattern
|
|
int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
|
|
{
|
|
int nSPMin = 0; // penalty point Minimum
|
|
int nRepS = 0; // for SplitCount
|
|
|
|
// length difference between pattern and string
|
|
int nLenDiff = nPatternLen - nStars - nStringLen;
|
|
// more insertions or deletions necessary as the limit? Then leave
|
|
if ( (nLenDiff * nInsQ0 > nLimit)
|
|
|| ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
|
|
return LEVDISBIG;
|
|
|
|
// comparative String greater than instantaneous array
|
|
// -> adapt array size
|
|
if ( nStringLen >= nArrayLen )
|
|
{
|
|
// increase size much more to avoid reallocation
|
|
if ( nStringLen < LEVDISDOUBLEBUF )
|
|
nArrayLen = 2 * nStringLen;
|
|
else
|
|
nArrayLen = nStringLen + 1;
|
|
npDistance = aDisMem.NewMem( nArrayLen );
|
|
}
|
|
|
|
// Calculate start values of the second column (first pattern value).
|
|
// First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
|
|
// therefore the minimum is 0
|
|
if ( nPatternLen == 0 )
|
|
{
|
|
// Count of deletions to reach pattern
|
|
for ( sal_Int32 i=0; i <= nStringLen; i++ )
|
|
npDistance[i] = i * nDelR0;
|
|
}
|
|
else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
|
|
{
|
|
// instead of a '*' you can fit in anything
|
|
for ( sal_Int32 i=0; i <= nStringLen; i++ )
|
|
npDistance[i] = 0;
|
|
}
|
|
else
|
|
{
|
|
sal_Unicode c;
|
|
int nP;
|
|
c = cpPattern[0];
|
|
if ( c == '?' && bpPatIsWild[0] )
|
|
nP = 0; // a '?' could be any character.
|
|
else
|
|
// Minimum of replacement and deletion+insertion weighting
|
|
nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 });
|
|
npDistance[0] = nInsQ0; // start with simple insert
|
|
npDistance[1] = nInsQ0;
|
|
npDistance[2] = nInsQ0;
|
|
int nReplacePos = -1; // tristate flag
|
|
int nDelCnt = 0;
|
|
for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
|
|
{
|
|
if ( cString[i-1] == c )
|
|
nP = 0; // Replace from this position is 0
|
|
// Deletions to match pattern + Replace
|
|
npDistance[i] = nDelCnt + nP;
|
|
if ( bSplitCount )
|
|
{
|
|
if ( nReplacePos < 0 && nP )
|
|
{ // this position will be replaced
|
|
nRepS++;
|
|
nReplacePos = i;
|
|
}
|
|
else if ( nReplacePos > 0 && !nP )
|
|
{
|
|
// same count of c
|
|
int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
|
|
if ( !nBalance )
|
|
{ // one was replaced that was an insertion instead
|
|
nRepS--;
|
|
nReplacePos = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] });
|
|
}
|
|
|
|
// calculate distance matrix
|
|
sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached
|
|
while ( (j < nPatternLen-1)
|
|
&& nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
|
|
{
|
|
sal_Unicode c;
|
|
int nP, nQ, nR, nPij, d2;
|
|
|
|
j++;
|
|
c = cpPattern[j];
|
|
if ( bpPatIsWild[j] ) // '*' or '?' not escaped
|
|
nP = 0; // could be replaced without penalty
|
|
else
|
|
nP = nRepP0;
|
|
if ( c == '*' && bpPatIsWild[j] )
|
|
{
|
|
nQ = 0; // insertion and deletion without penalty
|
|
nR = 0;
|
|
}
|
|
else
|
|
{
|
|
nQ = nInsQ0; // usual weighting
|
|
nR = nDelR0;
|
|
}
|
|
d2 = npDistance[0];
|
|
// increase insert count to get from null string to pattern
|
|
npDistance[0] = npDistance[0] + nQ;
|
|
nSPMin = npDistance[0];
|
|
int nReplacePos = -1; // tristate flag
|
|
// for each pattern column run through the string
|
|
for ( sal_Int32 i=1; i <= nStringLen; i++ )
|
|
{
|
|
int d1 = d2; // WLD( X(i-1), Y(j-1) )
|
|
d2 = npDistance[i]; // WLD( X(i) , Y(j-1) )
|
|
if ( cString[i-1] == c )
|
|
{
|
|
nPij = 0; // p(i,j)
|
|
if ( nReplacePos < 0 )
|
|
{
|
|
// same count of c
|
|
int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
|
|
if ( !nBalance )
|
|
nReplacePos = 0; // no replacement
|
|
}
|
|
}
|
|
else
|
|
nPij = nP;
|
|
// WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
|
|
// WLD( X(i) , Y(j-1) ) + q ,
|
|
// WLD( X(i-1), Y(j) ) + r )
|
|
npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR });
|
|
if ( npDistance[i] < nSPMin )
|
|
nSPMin = npDistance[i];
|
|
if ( bSplitCount )
|
|
{
|
|
if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
|
|
{ // this position will be replaced
|
|
nRepS++;
|
|
nReplacePos = i;
|
|
}
|
|
else if ( nReplacePos > 0 && !nPij )
|
|
{
|
|
// character is equal in string and pattern
|
|
//
|
|
// If from this point:
|
|
// * pattern and string have the same count of this
|
|
// character
|
|
// * and character count is the same before this position
|
|
// then the replace was none.
|
|
//
|
|
// Scrambled letters are recognized here and the nRepS
|
|
// replacement is withdrawn, whereby the double limit kicks
|
|
// in.
|
|
|
|
// Same count of c
|
|
int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
|
|
if ( !nBalance )
|
|
{ // one was replaced that was an insertion instead
|
|
nRepS--;
|
|
nReplacePos = 0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
|
|
return npDistance[nStringLen];
|
|
else
|
|
{
|
|
if ( bSplitCount )
|
|
{
|
|
if ( nRepS && nLenDiff > 0 )
|
|
nRepS -= nLenDiff; // Inserts were counted
|
|
if ( (nSPMin <= 2 * nLimit)
|
|
&& (npDistance[nStringLen] <= 2 * nLimit)
|
|
&& (nRepS * nRepP0 <= nLimit) )
|
|
return -npDistance[nStringLen];
|
|
return LEVDISBIG;
|
|
}
|
|
return LEVDISBIG;
|
|
}
|
|
}
|
|
|
|
// Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount
|
|
// from user values nOtherX, nShorterY, nLongerZ, bRelaxed
|
|
void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
|
|
{
|
|
if ( nX < 0 ) nX = 0; // only positive values
|
|
if ( nY < 0 ) nY = 0;
|
|
if ( nZ < 0 ) nZ = 0;
|
|
if (0 == std::min({ nX, nY, nZ })) // at least one 0
|
|
{
|
|
int nMid, nMax;
|
|
nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max
|
|
if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
|
|
nLimit = nMax; // either 0 or the only one >0
|
|
else // one is 0
|
|
nLimit = std::lcm( nMid, nMax );
|
|
}
|
|
else // all three of them are not 0
|
|
nLimit = std::lcm(std::lcm(nX, nY), nZ);
|
|
nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
|
|
nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
|
|
nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
|
|
bSplitCount = bRelaxed;
|
|
}
|
|
|
|
// The value in the middle
|
|
int WLevDistance::Mid3( int x, int y, int z )
|
|
{
|
|
int min = std::min({ x, y, z });
|
|
if ( x == min )
|
|
return std::min(y, z);
|
|
else if ( y == min )
|
|
return std::min(x, z);
|
|
else // z == min
|
|
return std::min(x, y);
|
|
}
|
|
|
|
// initialize data from CTOR
|
|
void WLevDistance::InitData( const sal_Unicode* cPattern )
|
|
{
|
|
cpPattern = aPatMem.GetcPtr();
|
|
bpPatIsWild = aPatMem.GetbPtr();
|
|
npDistance = aDisMem.GetPtr();
|
|
nStars = 0;
|
|
const sal_Unicode* cp1 = cPattern;
|
|
sal_Unicode* cp2 = cpPattern;
|
|
bool* bp = bpPatIsWild;
|
|
// copy pattern, count asterisks, escaped Jokers
|
|
while ( *cp1 )
|
|
{
|
|
if ( *cp1 == '\\' ) // maybe escaped
|
|
{
|
|
if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker?
|
|
{
|
|
cp1++; // skip '\\'
|
|
nPatternLen--;
|
|
}
|
|
*bp++ = false;
|
|
}
|
|
else if ( *cp1 == '*' || *cp1 == '?' ) // Joker
|
|
{
|
|
if ( *cp1 == '*' )
|
|
nStars++;
|
|
*bp++ = true;
|
|
}
|
|
else
|
|
*bp++ = false;
|
|
*cp2++ = *cp1++;
|
|
}
|
|
*cp2 = '\0';
|
|
}
|
|
|
|
WLevDistance::WLevDistance( const sal_Unicode* cPattern,
|
|
int nOtherX, int nShorterY, int nLongerZ,
|
|
bool bRelaxed ) :
|
|
nPatternLen( Impl_WLD_StringLen(cPattern) ),
|
|
aPatMem( nPatternLen + 1 ),
|
|
nArrayLen( nPatternLen + 1 ),
|
|
aDisMem( nArrayLen )
|
|
{
|
|
InitData( cPattern );
|
|
CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
|
|
}
|
|
|
|
// CopyCTor
|
|
WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
|
|
nPatternLen( rWLD.nPatternLen ),
|
|
aPatMem( nPatternLen + 1 ),
|
|
nArrayLen( nPatternLen + 1 ),
|
|
aDisMem( nArrayLen ),
|
|
nLimit( rWLD.nLimit ),
|
|
nRepP0( rWLD.nRepP0 ),
|
|
nInsQ0( rWLD.nInsQ0 ),
|
|
nDelR0( rWLD.nDelR0 ),
|
|
nStars( rWLD.nStars ),
|
|
bSplitCount( rWLD.bSplitCount )
|
|
{
|
|
cpPattern = aPatMem.GetcPtr();
|
|
bpPatIsWild = aPatMem.GetbPtr();
|
|
npDistance = aDisMem.GetPtr();
|
|
sal_Int32 i;
|
|
for ( i=0; i<nPatternLen; i++ )
|
|
{
|
|
cpPattern[i] = rWLD.cpPattern[i];
|
|
bpPatIsWild[i] = rWLD.bpPatIsWild[i];
|
|
}
|
|
cpPattern[i] = '\0';
|
|
}
|
|
|
|
// DTor
|
|
WLevDistance::~WLevDistance()
|
|
{
|
|
}
|
|
|
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|