The code below, from a good Samaritan, works great in English: it can find a string of text in a large document and report a confidence score for how well it matches.
However, I can't figure out how to get it working with Thai characters.
#!/usr/bin/python
from difflib import SequenceMatcher as SM
from nltk.util import ngrams
import codecs
# Fuzzy-search the haystack file for the run of words that best matches the
# needle text, then print the best similarity ratio followed by that run.
#
# Both files are decoded to unicode as they are read. With plain open() on
# Python 2 the contents are byte strings, and joining their tokens with
# u" " below triggers an implicit ASCII decode that raises
# UnicodeDecodeError on any non-ASCII text (e.g. Thai).
# NOTE(review): assumes both files are saved as UTF-8 -- confirm.
with codecs.open('mainEN.txt', 'r', encoding='utf-8') as hay_file:
    hay = hay_file.read()
with codecs.open('searchEN.txt', 'r', encoding='utf-8') as needle_file:
    needle = needle_file.read()

# Windows are measured in whitespace-separated tokens.
# NOTE(review): Thai is normally written without spaces between words, so
# split() will return long unsegmented runs rather than words; a word
# segmenter may be needed for useful window sizes -- confirm against real
# data.
needle_length = len(needle.split())

# Each candidate window is 20% longer (in tokens) than the needle.
window_size = needle_length + int(.2 * needle_length)

max_sim_val = 0
max_sim_string = u""
for ngram in ngrams(hay.split(), window_size):
    hay_ngram = u" ".join(ngram)
    similarity = SM(None, hay_ngram, needle).ratio()
    # Strict '>' keeps the earliest window when several tie for best.
    if similarity > max_sim_val:
        max_sim_val = similarity
        max_sim_string = hay_ngram

# Single formatted argument so the line parses on Python 2 and Python 3 and
# prints "<ratio> <best window>" either way.
print(u"%s %s" % (max_sim_val, max_sim_string))