Solution
I used https://stackoverflow.com/a/15174569/61903 (credits to @vpekar) to calculate the cosine similarity of two strings as the base similarity measure. All strings are put into a list. An index i runs from 0 over the whole list, and for each i a position p iterates from i+1 to the end of the list. Among those positions I pick the one whose string has the maximum cosine value with list[i]. Both strings of that pair are then added to an out list so they are not taken into account in later similarity calculations, and the pair is appended to the result list together with its cosine value; the data structure for this is VectorResult.
Afterwards the result list is sorted by the cosine value. We now have unique string pairs in descending order of cosine, i.e. similarity. HTH.
import re
import math
import timeit
from collections import Counter
# Pre-compiled pattern matching runs of word characters (`\w+`); used to split text into tokens.
WORD = re.compile(r'\w+')
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term-count mappings.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from term to numeric weight.

    Returns:
        The dot product over the shared terms divided by the product of the
        two Euclidean norms, as a float. ``0.0`` is returned when that
        product is zero (for instance when either mapping is empty).
    """
    shared_terms = set(vec1) & set(vec2)
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    squares1 = sum(vec1[term] ** 2 for term in vec1)
    squares2 = sum(vec2[term] ** 2 for term in vec2)
    magnitude = math.sqrt(squares1) * math.sqrt(squares2)

    # Guard against division by zero for empty / all-zero vectors.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0
def text_to_vector(text):
    """Convert *text* into a bag-of-words ``Counter``.

    The text is split into tokens with the module-level ``WORD`` pattern and
    each distinct token is mapped to the number of times it occurs.
    """
    return Counter(WORD.findall(text))
class VectorResult(object):
    """A pair of texts together with their cosine similarity.

    Instances compare and sort by ``cosine`` only; ``text_1`` and ``text_2``
    take no part in equality or ordering. Comparing against an object that
    is not a ``VectorResult`` returns ``NotImplemented`` so that Python can
    fall back to its default handling instead of failing with an
    ``AttributeError``.

    Attributes:
        cosine: Similarity score of the two texts.
        text_1: First text of the pair.
        text_2: Second text of the pair.
    """

    def __init__(self, cosine, text_1, text_2):
        self.cosine = cosine
        self.text_1 = text_1
        self.text_2 = text_2

    def __repr__(self):
        return "{}(cosine={!r}, text_1={!r}, text_2={!r})".format(
            type(self).__name__, self.cosine, self.text_1, self.text_2
        )

    def __eq__(self, other):
        if not isinstance(other, VectorResult):
            return NotImplemented
        return self.cosine == other.cosine

    def __le__(self, other):
        if not isinstance(other, VectorResult):
            return NotImplemented
        return self.cosine <= other.cosine

    def __ge__(self, other):
        if not isinstance(other, VectorResult):
            return NotImplemented
        return self.cosine >= other.cosine

    def __lt__(self, other):
        if not isinstance(other, VectorResult):
            return NotImplemented
        return self.cosine < other.cosine

    def __gt__(self, other):
        if not isinstance(other, VectorResult):
            return NotImplemented
        return self.cosine > other.cosine
def main():
    """Greedily pair the lines of ``data.txt`` by cosine similarity and print them.

    Every line of ``data.txt`` is read (trailing newlines are kept, exactly as
    ``readlines`` returns them). Walking the lines in order, each line that
    has not been paired yet is matched with the later, still unpaired line
    that has the highest cosine similarity strictly greater than 0; the first
    such line wins on ties. Both texts of a pair are then excluded from all
    further matching. Exclusion is by text value, so further identical copies
    of an already paired text are skipped as well.

    The pairs are printed from highest to lowest similarity, followed by the
    number of lines read and the elapsed wall-clock time.
    """
    start = timeit.default_timer()
    with open('data.txt', 'r') as f:
        texts = f.readlines()

    # Tokenise each line once up front rather than once per compared pair.
    vectors = [text_to_vector(text) for text in texts]

    cosmap = []
    # Texts that already belong to a pair; a set keeps the lookup O(1).
    used = set()
    for i, text in enumerate(texts):
        # `used` only changes between outer iterations, so this check does
        # not need to be repeated for every candidate partner.
        if text in used:
            continue

        best = None
        max_cosine = 0.0
        for p in range(i + 1, len(texts)):
            candidate = texts[p]
            if candidate in used:
                continue
            cosine = get_cosine(vectors[i], vectors[p])
            # Strict comparison: pairs with zero similarity are never kept
            # and the earliest candidate wins when scores are equal.
            if cosine > max_cosine:
                best = VectorResult(cosine, text, candidate)
                max_cosine = cosine

        if best is not None:
            used.update((best.text_1, best.text_2))
            cosmap.append(best)

    # Ascending stable sort followed by reversal keeps the original output
    # order for pairs that share the same cosine value.
    for item in reversed(sorted(cosmap)):
        print(item.cosine, item.text_1, item.text_2)
    end = timeit.default_timer()
    print("Similarity Sorting of {} strings lasted {} s.".format(len(texts), end - start))


if __name__ == '__main__':
    main()
Results
I used your sample addresses at http://pastebin.com/hySkZ4Pn as test data:
1.0000000000000002 NO 15& 16 1ST FLOOR,2ND MAIN ROAD,KHB COLONY,GANDINAGAR YELAHANKA
NO 15& 16 1ST FLOOR,2ND MAIN ROAD,KHB COLONY,GANDINAGAR YELAHANKA
1.0 # 51/3 AGRAHARA YELAHANKA
#51/3 AGRAHARA YELAHANKA
0.9999999999999999 # C M C ROAD,YALAHANKA
# C M C ROAD,YALAHANKA
0.8728715609439696 # 1002/B B B ROAD,YELAHANKA
0,B B ROAD,YELAHANKA
0.8432740427115678 # LAKSHMI COMPLEX C M C ROAD,YALAHANKA
# SRI LAKSHMAN COMPLEX C M C ROAD,YALAHANKA
0.8333333333333335 # 85/1 B B M P OFFICE ROAD,KOGILU YELAHANKA
#85/1 B B M P OFFICE NEAR KOGILU YALAHANKA
0.8249579113843053 # 689 3RD A CROSS SHESHADRIPURAM CALLEGE OPP YELAHANKA
# 715 3RD CROSS A SECTUR SHESHADRIPURAM CALLEGE OPP YELAHANKA
0.8249579113843053 # 10 RAMAIAIA COMPLEX B B ROAD,YALAHANKA
# JAMATI COMPLEX B B ROAD,YALAHANKA
[ SNIPPED ]
Similarity Sorting of 702 strings lasted 8.955146235887025 s.