Content of character-based-string-similarity module

xquery version "1.0";

(:
 : Copyright 2006-2009 The FLWOR Foundation.
 :                      
 : Licensed under the Apache License, Version 2.0 (the "License");
 : you may not use this file except in compliance with the License.
 : You may obtain a copy of the License at
 :
 : http://www.apache.org/licenses/LICENSE-2.0
 :
 : Unless required by applicable law or agreed to in writing, software
 : distributed under the License is distributed on an "AS IS" BASIS,
 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 : See the License for the specific language governing permissions and
 : limitations under the License.
 :)

(:~
 : <p>This library module provides character-based string similarity functions 
 : that view strings as sequences of characters, generally computing a similarity score
 : that corresponds to the cost of transforming one string into another.
 : These functions are particularly useful for matching near duplicate strings  
 : in the presence of typographical errors. </p>
 : <p>The logic contained in this module is not specific to any particular XQuery implementation.</p>
 :
 : @author Bruno Martins and Diogo Simões
 : @project Zorba/Data Cleaning/Character-Based String Similarity
 :)

module namespace simc = "http://zorba.io/modules/data-cleaning/character-based-string-similarity";

declare namespace ver = "http://zorba.io/options/versioning";
declare option ver:module-version "2.0";

(:~
 : <p>Returns the edit distance between two strings.</p>
 : <p/>
 : <p>This distance, also refered to as the Levenshtein distance, is defined as the minimum number 
 : of edits needed to transform one string into the other, with the allowable edit operations 
 : being insertion, deletion, or substitution of a single character.</p>
 : <p/>
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery">edit-distance("FLWOR", "FLOWER")</pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery">2</pre></p>
 :
 : @param $s1 The first string.
 : @param $s2 The second string.
 : @return The edit distance between the two strings.
 : @example test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq
 :)
declare function simc:edit-distance ( $s1 as xs:string, $s2 as xs:string ) as xs:integer {
 if(string-length($s1) = 0) then string-length($s2) else
 if(string-length($s2) = 0) then string-length($s1) else
 min((
  simc:edit-distance(substring($s1, 2), $s2) + 1 ,
  simc:edit-distance($s1, substring($s2, 2)) + 1 ,
  simc:edit-distance(substring($s1, 2), substring($s2, 2)) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then 0 else 1 )
 ))
};

(:~
 : <p>Returns the Jaro similarity coefficient between two strings.</p>
 : <p/>
 : <p>This similarity coefficient is based on the number of transposed characters and on a 
 : weighted sum of the percentage of matched characters held within the strings. The higher 
 : the Jaro-Winkler value is, the more similar the strings are. The coefficient is 
 : normalized such that 0 equates to no similarity and 1 is an exact match.</p>
 : <p/>
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery">jaro("FLWOR Found.", "FLWOR Foundation")</pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery">0.5853174603174603</pre></p>
 :
 : @param $s1 The first string.
 : @param $s2 The second string.
 : @return The Jaro similarity coefficient between the two strings.
 : @example test/Queries/data-cleaning/character-based-string-similarity/jaro.xq
 :)
declare function simc:jaro ( $s1 as xs:string, $s2 as xs:string ) as xs:double {
 let $s    := for $i in ($s1,$s2) order by string-length($i) return $i
 let $l1   := string-length($s[1])
 let $l2   := string-length($s[2])
 let $mt   := xs:integer((max(($l1,$l2)) div 2.0) - 1)
 let $mc   := for $i in 1 to min( ($l1 , $l2) ) 
              let $auxmatch := substring($s[2], max((1,$i - $mt)), $mt * 2 )
              return for $j in 1 to string-length($auxmatch)  
                     where substring($auxmatch, $j, 1) = substring($s[1], $i, 1)
                     return <match char="{substring($s[1], $i, 1)}" pos1="{$i}" pos2="{$j + max((1,$i - $mt)) - 1}" />
 let $m    := if (count($mc) = 0) then (1) else (count($mc))
 let $t    := count( for $i in $mc, $j in $mc where $i/@pos1>$j/@pos1 and $i/@pos2<$j/@pos2 return $i )
 let $dist := xs:double((($m div $l1) + ($m div $l2) + (($m - $t) div $m)) div 3)
 return $dist
};

(:~
 : <p>Returns the Jaro-Winkler similarity coefficient between two strings.</p>
 : <p/>
 : <p>This similarity coefficient corresponds to an extension of the Jaro similarity coefficient that weights or
 : penalizes strings based on their similarity at the beginning of the string, up to a given prefix size.</p>
 : <p/>
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery">jaro-winkler("DWAYNE", "DUANE", 4, 0.1 )</pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery">0.8577777777777778</pre></p>
 :
 : @param $s1 The first string.
 : @param $s2 The second string.
 : @param $prefix The number of characters to consider when testing for equal prefixes in the strings.
 : @param $fact The weighting factor to consider when the input strings have equal prefixes.
 : @return The Jaro-Winkler similarity coefficient between the two strings.
 : @example test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq
 :)
declare function simc:jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $prefix as xs:integer, $fact as xs:double ) as xs:double {
 let $jaro := simc:jaro( $s1 , $s2 )
 let $cc   := for $i in 1 to min(($prefix, string-length($s1), string-length($s2))) 
              where substring($s1, 0, $i) = substring($s2, 0, $i) return $i
 return ($jaro + ( $fact * max($cc) * ( 1 - $jaro ) ) )
};

(:~
 : <p>Returns the Needleman-Wunsch distance between two strings.</p>
 : <p/>
 : <p>The Needleman-Wunsch distance is similar to the basic edit distance metric, adding a 
 : variable cost adjustment to the cost of a gap (i.e., an insertion or deletion) in the 
 : distance metric.</p>
 : <p/>
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery">needleman-wunsch("KAK", "KQRK", 1, 1)</pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery">0</pre></p>
 :
 : @param $s1 The first string.
 : @param $s2 The second string.
 : @param $score The score value.
 : @param $penalty The penalty value.
 : @return The Needleman-Wunsch distance between the two strings.
 : @example test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq
 :)
declare function simc:needleman-wunsch ( $s1 as xs:string, $s2 as xs:string, $score as xs:integer, $penalty as xs:integer ) as xs:double{
 
 if(string-length($s1) = 0) then string-length($s2)* - $penalty else
 if(string-length($s2) = 0) then string-length($s1)* - $penalty else
 max((
  simc:needleman-wunsch(substring($s1, 2), $s2, $score, $penalty) - $penalty ,
  simc:needleman-wunsch($s1, substring($s2, 2), $score, $penalty) - $penalty ,
  simc:needleman-wunsch(substring($s1, 2), substring($s2, 2), $score, $penalty) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then $score else -$penalty )
 ))
};

(:~
 : <p>Returns the Smith-Waterman distance between two strings.</p>
 : <p/>
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery">smith-waterman("ACACACTA", "AGCACACA", 2, 1)</pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery">12</pre></p>
 :
 : @param $s1 The first string.
 : @param $s2 The second string.
 : @param $score The score value.
 : @param $penalty The penalty value.
 : @return The Smith-Waterman distance between the two strings.
 :)
declare function simc:smith-waterman ( $s1 as xs:string, $s2 as xs:string, $score as xs:integer, $penalty as xs:integer ) as xs:double{ 
 if(string-length($s1) = 0) then 0 else
 if(string-length($s2) = 0) then 0 else
 max((
  0,
  simc:smith-waterman(substring($s1, 2), $s2, $score, $penalty) - $penalty ,
  simc:smith-waterman($s1, substring($s2, 2), $score, $penalty) - $penalty ,
  simc:smith-waterman(substring($s1, 2), substring($s2, 2), $score, $penalty) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then $score else -$penalty )
 ))
};