Or you could do it completely out of indexes :)
for $c in doc()//colors
is likely to create an EXPANDED TREE CACHE error on larger data sets.
Here is a slightly more complicated way to attack this when the data is huge: make sure the URI Lexicon is turned on, add an element range index on the element color, and compute the distinct color values that have duplication somewhere. Then loop over only the documents that contain such a color, one by one, and compute the item-frequency counts of the colors of interest in each document. If you get a frequency over 1, that document needs de-duplication.
(: QName of the element whose duplicated values we are hunting for.
   Requires the URI lexicon and an element range index on "color". :)
let $qn := xs:QName("color")
(: Distinct color values, ascending item order, with item frequencies. :)
let $colorsWithItemFreq := cts:element-values($qn, (), ("ascending", "item-order", "item-frequency"))
(: A color is "of interest" when its item frequency exceeds its fragment
   frequency, i.e. at least one fragment contains that value more than once.
   Both lexicon calls use ascending item order, so position $i lines up
   between the two sequences. :)
let $colorsOfInterest :=
  for $color at $i in cts:element-values($qn, (), ("ascending", "item-order", "fragment-frequency"))
  let $fragFrequency := cts:frequency($color)
  let $itemFrequency := cts:frequency($colorsWithItemFreq[$i])
  where $itemFrequency gt $fragFrequency
  return
    $color
(: Only visit documents that contain at least one color of interest.
   Fixed: the original was missing the closing paren on cts:uris(...). :)
for $uri in cts:uris((), ("document"), cts:element-value-query($qn, $colorsOfInterest))
let $colorsWithDuplicationInThisDoc :=
  for $color in cts:element-values($qn, (), ("item-frequency"), cts:document-query($uri))
  where $color = $colorsOfInterest and cts:frequency($color) gt 1
  return
    $color
(: Any duplicated color at all means this document needs de-duplication.
   NOTE(review): the original tested "gt 1", which would require two DISTINCT
   duplicated colors per document — contradicting the stated intent above. :)
where fn:count($colorsWithDuplicationInThisDoc) gt 0
return
  $uri
Hope that helps.