Content of set-similarity module

xquery version "1.0";

(:
 : Copyright 2006-2009 The FLWOR Foundation.
 :
 : Licensed under the Apache License, Version 2.0 (the "License");
 : you may not use this file except in compliance with the License.
 : You may obtain a copy of the License at
 :
 : http://www.apache.org/licenses/LICENSE-2.0
 :
 : Unless required by applicable law or agreed to in writing, software
 : distributed under the License is distributed on an "AS IS" BASIS,
 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 : See the License for the specific language governing permissions and
 : limitations under the License.
 :)

(:~
 : <p>This library module provides similarity functions for comparing sets of XML 
 : items (e.g., sets of XML elements, attributes or atomic values).</p>
 : <p/>
 : <p>These functions are particularly useful for matching near duplicate sets of XML nodes.</p>
 : <p/>
 : <p>The logic contained in this module is not specific to any particular XQuery implementation.</p>
 :
 : @author Bruno Martins
 : @project Zorba/Data Cleaning/Set Similarity
 :)

module namespace set = "http://zorba.io/modules/data-cleaning/set-similarity";

(: The "ver" prefix is bound only so that the module version (2.0) can be
   declared through the ver:module-version option below. :)
declare namespace ver = "http://zorba.io/options/versioning";
declare option ver:module-version "2.0";

(:~
 : <p>Computes the union of two sets. Items are considered equal when the
 : deep-equal() function reports them as equal.</p>
 : <p/>
 : <p>Both inputs are concatenated ($s1 first, then $s2) and every item that is
 : deep-equal to an item occurring earlier in the concatenation is dropped, so
 : the result keeps the first occurrence of each distinct item, in input order.</p>
 : <p/>
 :
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery"> deep-union ( ( "a", "b", "c") , ( "a", "a", <d/> ) ) </pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery"> ("a", "b", "c", <d/> ) </pre></p>
 :
 : @param $s1 The first set.
 : @param $s2 The second set.
 : @return The union of both sets, without deep-equal duplicates.
 : @example test/Queries/data-cleaning/set-similarity/deep-union.xq
 :)
declare function set:deep-union ( $s1 , $s2 ) as item()* {
  let $combined := ( $s1 , $s2 )
  for $pos in 1 to count($combined)
  let $candidate := $combined[$pos]
  (: Everything that precedes the candidate in the concatenated sequence. :)
  let $earlier := $combined[position() lt $pos]
  return
    if (some $seen in $earlier satisfies deep-equal($seen, $candidate))
    then ()           (: already emitted at an earlier position :)
    else $candidate
};

(:~
 : <p>Computes the intersection of two sets. Items are considered equal when the
 : deep-equal() function reports them as equal.</p>
 : <p/>
 : <p>The result contains the items of $s1, in their original order, that have a
 : deep-equal counterpart in $s2; only the first occurrence of each such item in
 : $s1 is returned.</p>
 : <p/>
 :
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery"> deep-intersect ( ( "a", "b", "c") , ( "a", "a", <d/> ) ) </pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery"> ("a") </pre></p>
 :
 : @param $s1 The first set.
 : @param $s2 The second set.
 : @return The intersection of both sets, without deep-equal duplicates.
 : @example test/Queries/data-cleaning/set-similarity/deep-intersect.xq
 :)
declare function set:deep-intersect ( $s1 , $s2 ) as item()* {
  for $candidate at $idx in $s1
  let $prior := subsequence($s1, 1, $idx - 1)
  (: True when no earlier item of $s1 is deep-equal to the candidate. :)
  let $first-occurrence := not(some $p in $prior satisfies deep-equal($p, $candidate))
  (: True when $s2 holds at least one item deep-equal to the candidate. :)
  let $in-second := exists($s2[deep-equal(., $candidate)])
  where $first-occurrence and $in-second
  return $candidate
};

(:~
 : <p>Removes exact duplicates from a set. Items are considered duplicates when
 : the deep-equal() function reports them as equal.</p>
 : <p/>
 : <p>The first occurrence of each item is kept and the input order is preserved.</p>
 : <p/>
 :
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery"> distinct ( ( "a", "a", <b/> ) ) </pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery"> ("a", <b/> ) </pre></p>
 :
 : @param $s A set.
 : @return The items of the input set with all deep-equal duplicates removed.
 : @example test/Queries/data-cleaning/set-similarity/distinct.xq
 :)
declare function set:distinct ( $s ) as item()* {
  for $item at $idx in $s
  (: Earlier items that are deep-equal to the current one, if any. :)
  let $duplicates-before := subsequence($s, 1, $idx - 1)[deep-equal(., $item)]
  return
    if (empty($duplicates-before)) then $item else ()
};

(:~
 : <p>Returns the overlap coefficient between two sets of XML nodes.</p>
 : <p>The overlap coefficient is defined as the shared information between the input sets 
 : (i.e., the size of the intersection) over the size of the smallest input set.
 : Set sizes are computed after removing deep-equal duplicates.</p>
 : <p/>
 : <p>When at least one of the input sets is empty the coefficient is mathematically
 : undefined (the denominator is zero); in that case the function returns 0.0
 : instead of raising a division-by-zero error.</p>
 : <p/>
 : 
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery"> overlap ( ( "a", "b", <c/> ) , ( "a", "a", "b" ) ) </pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery"> 1.0 </pre></p>
 :
 : @param $s1 The first set.
 : @param $s2 The second set.
 : @return The overlap coefficient between the two sets, or 0.0 if either set is empty.
 : @example test/Queries/data-cleaning/set-similarity/overlap.xq
 :)
declare function set:overlap ( $s1 , $s2 ) as xs:double {
  let $shared := count( set:deep-intersect($s1, $s2) )
  let $smallest := min(( count(set:distinct($s1)) , count(set:distinct($s2)) ))
  return
    (: Dividing integers by zero raises err:FOAR0001, so an empty input set
       must be handled before the division. :)
    if ($smallest eq 0)
    then 0.0e0
    else $shared div $smallest
};

(:~
 : <p>Returns the Dice similarity coefficient between two sets of XML nodes.</p>
 : <p>The Dice coefficient is defined as twice the shared information between the input sets 
 : (i.e., the size of the intersection) over the sum of the cardinalities for the input sets.
 : Cardinalities are computed after removing deep-equal duplicates.</p>
 : <p/>
 : <p>When both input sets are empty the coefficient is mathematically undefined
 : (the denominator is zero); in that case the function returns 0.0 instead of
 : raising a division-by-zero error.</p>
 : <p/>
 : 
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery"> dice ( ( "a", "b", <c/> ) , ( "a", "a", "d") ) </pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery"> 0.4 </pre></p>
 :
 : @param $s1 The first set.
 : @param $s2 The second set.
 : @return The Dice similarity coefficient between the two sets, or 0.0 if both sets are empty.
 : @example test/Queries/data-cleaning/set-similarity/dice.xq
 :)
declare function set:dice ( $s1 , $s2 ) as xs:double {
  let $shared := count( set:deep-intersect($s1, $s2) )
  let $total := count(set:distinct($s1)) + count(set:distinct($s2))
  return
    (: Dividing integers by zero raises err:FOAR0001; $total is zero exactly
       when both input sets are empty. :)
    if ($total eq 0)
    then 0.0e0
    else (2 * $shared) div $total
};

(:~
 : <p>Returns the Jaccard similarity coefficient between two sets of XML nodes.</p>
 : <p>The Jaccard coefficient is defined as the size of the intersection divided by the size of the 
 : union of the input sets.</p>
 : <p/>
 : <p>When both input sets are empty the coefficient is mathematically undefined
 : (the union is empty); in that case the function returns 0.0 instead of
 : raising a division-by-zero error.</p>
 : <p/>
 : 
 : <p>Example usage : <pre class="ace-static" ace-mode="xquery"> jaccard ( ( "a", "b", <c/> ) , ( "a", "a", "d") ) </pre></p>
 : <p/>
 : <p>The function invocation in the example above returns : <pre class="ace-static" ace-mode="xquery"> 0.25 </pre></p>
 :
 : @param $s1 The first set.
 : @param $s2 The second set.
 : @return The Jaccard similarity coefficient between the two sets, or 0.0 if both sets are empty.
 : @example test/Queries/data-cleaning/set-similarity/jaccard.xq
 :)
declare function set:jaccard ( $s1 , $s2  ) as xs:double {
  let $shared := count( set:deep-intersect($s1, $s2) )
  let $combined := count( set:deep-union($s1, $s2) )
  return
    (: Dividing integers by zero raises err:FOAR0001; the union is empty
       exactly when both input sets are empty. :)
    if ($combined eq 0)
    then 0.0e0
    else $shared div $combined
};