You should use actual sets to perform the operations. For a brute force approach, a recursive function is easiest to program:
This one expects a list of sets and tries each one with a recursion on the remaining sets in the list. It returns the shortest list of sets that it finds.
def setCover(setList,target=None):
if not setList: return None
if target is None: target = set.union(*setList)
bestCover = []
for i,values in enumerate(setList):
remaining = target - values
if remaining == target: continue
if not remaining: return [values]
subCover = setCover(setList[i+1:],remaining)
if not subCover: continue
if not bestCover or len(subCover)<len(bestCover)-1:
bestCover = [values] + subCover
return bestCover
output:
M = [
{ "1", "4", "7" },
{ "1", "2" },
{ "2", "5", "6" },
{ "2", "5" },
{ "3" },
{ "8", "6" }
]
print(setCover(M))
# [{'7', '1', '4'}, {'6', '5', '2'}, {'3'}, {'6', '8'}]
Brute fore is quite slow but it can be optimized. The above function does skip sets that don't add more coverage but it performs in roughly O(n!).
There are several optimization strategies that will make it go faster.
- Sort the set list in decreasing order of set length (combined with the other optimizations)
- avoid memory allocation by not creating sub lists (i.e. pass the same list object with a starting index)
- keep track of the current best result to constrain the recursion to a maximum count (i.e. skipping over sub-solutions that have more items than the current best)
- isolate values that appear in only one set, the sets that contain them are mandatory and will always be part of the solution. Only analyse the remaining sets
- compute the maximum coverage of remaining sets for each position in the list so that you can know in advance that there will not be a solution with the remaining items
Here is an optimized version of the function:
from itertools import accumulate
def setCover4(setList,start=0,target=None,maxCount=None,cumSets=None):
# short circuit recursion when imposible to cover with fewer sets than current best
if maxCount is None: maxCount = len(setList)
if maxCount == 0: return None
# sort sets in descending order of their size to maximize initial coverage
if target is None:
target = set.union(*setList)
setList = sorted(setList,key=len,reverse=True)
# values that exist in only one set make that set mandatory in the solution
# set them apart and combine them with the solution for the remaining sets
if start == 0:
singletons = target
foundOnce = set()
for s in setList:
singletons = singletons - (s & foundOnce)
foundOnce.update(s)
if singletons:
mandatorySets = [ s for s in setList if s&singletons ]
remaining = target - set.union(*mandatorySets)
if not remaining: return mandatorySets
setList = [s for s in setList if not s&singletons]
subCover = setCover4(setList,0,remaining)
if subCover : return mandatorySets + subCover
return None
# predetermine the remaining coverage from each position to the end of the list
if cumSets is None:
cumSets = [ u for u in accumulate(reversed(setList),set.union) ][::-1]
# try sets at each position (from start to end) recursing with remaining sets
bestCover = []
for i in range(start,len(setList)):
if not cumSets[i].issuperset(target): break # no solution in remaining sets
values = setList[i]
remaining = target - values
if remaining == target: continue
if not remaining: return [values]
subCover = setCover4(setList,i+1,remaining,maxCount-1,cumSets)
if not subCover: continue
if not bestCover or len(subCover)<len(bestCover)-1:
bestCover = [values] + subCover
maxCount = len(bestCover)
return bestCover
Performance tests show that setCover4
responds orders of magnitude faster than the original setCover
function
from timeit import timeit
import random
samples = 10
values = list(range(100))
subsetSize = 10
subsetCount = 20
M = [ set(random.sample(values,random.randrange(1,subsetSize))) for _ in range(subsetCount) ]
t = timeit(lambda:setCover(M),number=samples)
print("setCover ",f"{t:.5f}")
t = timeit(lambda:setCover4(M),number=samples)
print("setCover4",f"{t:.5f}")
# setCover 9.11923
# setCover4 0.00095
More tests with varying numbers of sets and set sizes confirm the performance difference but it also shows that setCover4
, despite being optimized, also has an exponential time pattern.
for subsetSize in (10,20,30):
print("")
for subsetCount in (10,15,18,19,20,25,30,35,40,45,50):
t1 = t4 = 0
for _ in range(samples):
M = [ set(random.sample(values,random.randrange(1,subsetSize))) for _ in range(subsetCount) ]
if subsetCount < 25: t1 += timeit(lambda:setCover(M),number=1)
t4 += timeit(lambda:setCover4(M),number=1)
print(f"subsetSize={subsetSize}",f"subsetCount={subsetCount}",
f" setCover:{t1:8.5f}" if t1 else " setCover: -------",
f" setCover4:{t4:8.5f}")
Results:
subsetSize=10 subsetCount=10 setCover: 0.01501 setCover4: 0.00039
subsetSize=10 subsetCount=15 setCover: 0.28903 setCover4: 0.00034
subsetSize=10 subsetCount=18 setCover: 2.05937 setCover4: 0.00042
subsetSize=10 subsetCount=19 setCover: 4.32700 setCover4: 0.00044
subsetSize=10 subsetCount=20 setCover: 8.08408 setCover4: 0.00045
subsetSize=10 subsetCount=25 setCover: ------- setCover4: 0.00101
subsetSize=10 subsetCount=30 setCover: ------- setCover4: 0.00158
subsetSize=10 subsetCount=35 setCover: ------- setCover4: 0.00215
subsetSize=10 subsetCount=40 setCover: ------- setCover4: 0.00813
subsetSize=10 subsetCount=45 setCover: ------- setCover4: 0.01751
subsetSize=10 subsetCount=50 setCover: ------- setCover4: 0.13528
subsetSize=20 subsetCount=10 setCover: 0.01878 setCover4: 0.00049
subsetSize=20 subsetCount=15 setCover: 0.35464 setCover4: 0.00050
subsetSize=20 subsetCount=18 setCover: 2.66359 setCover4: 0.00057
subsetSize=20 subsetCount=19 setCover: 4.73091 setCover4: 0.00074
subsetSize=20 subsetCount=20 setCover: 8.37055 setCover4: 0.00069
subsetSize=20 subsetCount=25 setCover: ------- setCover4: 0.00176
subsetSize=20 subsetCount=30 setCover: ------- setCover4: 0.00979
subsetSize=20 subsetCount=35 setCover: ------- setCover4: 0.05368
subsetSize=20 subsetCount=40 setCover: ------- setCover4: 0.32195
subsetSize=20 subsetCount=45 setCover: ------- setCover4: 5.34897
subsetSize=20 subsetCount=50 setCover: ------- setCover4:44.98202
subsetSize=30 subsetCount=10 setCover: 0.01798 setCover4: 0.00056
subsetSize=30 subsetCount=15 setCover: 0.32203 setCover4: 0.00058
subsetSize=30 subsetCount=18 setCover: 2.10538 setCover4: 0.00089
subsetSize=30 subsetCount=19 setCover: 4.31587 setCover4: 0.00121
subsetSize=30 subsetCount=20 setCover: 8.22864 setCover4: 0.00118
subsetSize=30 subsetCount=25 setCover: ------- setCover4: 0.01261
subsetSize=30 subsetCount=30 setCover: ------- setCover4: 0.05015
subsetSize=30 subsetCount=35 setCover: ------- setCover4: 0.38848
subsetSize=30 subsetCount=40 setCover: ------- setCover4: 3.29696
subsetSize=30 subsetCount=45 setCover: ------- setCover4: 8.22697
subsetSize=30 subsetCount=50 setCover: ------- setCover4:23.39054