#-*- coding:utf-8 -*-
""" A basic lexicographic Similarity algorithm ...part1
@author: WhiZTiM (Timothy Onogu)
@copyright: (c) January, 2013
@NOTE: This algorithm is provided as is, in the hope
that it will be useful; I will not be responsible
for any failures that may arise.
@contact: whiztim@whiztim.com
"""
def split_on_delimeters(string, delimeter=":;. \"!,$()-_+=~'`"):
    """Break ``string`` into tokens at every character found in ``delimeter``.

    string    -- the text to tokenize.
    delimeter -- the characters that separate tokens; each character is
                 tested individually with ``in``.

    Returns the list of non-empty tokens in order of appearance. Runs of
    consecutive delimiters never produce empty tokens.
    """
    tokens = []
    current = []  # characters of the token currently being collected
    for char in string:
        if char in delimeter:
            # A delimiter closes the running token, if there is one.
            if current:
                tokens.append("".join(current))
                current = []
        else:
            current.append(char)
    # The input may end without a trailing delimiter; flush what is left.
    if current:
        tokens.append("".join(current))
    return tokens
def returnbigrams(string):
    """Return the overlapping two-item slices (bigrams) of ``string``.

    The result holds ``len(string) - 1`` slices, taken at every start
    position from left to right; inputs shorter than two items give an
    empty list.
    """
    bigrams = []
    for start in range(len(string) - 1):
        end = start + 2
        bigrams.append(string[start:end])
    return bigrams
def word_similarity(word1, word2, case_sensitive=False):
    """Return, as a percentage, how similar ``word1`` is to ``word2``.

    The score is the Sorensen-Dice coefficient over character bigrams:
    twice the number of bigrams the two words share (repeated bigrams are
    matched one-for-one), divided by the total number of bigrams in both
    words, times 100.

    word1, word2   -- the strings to compare.
    case_sensitive -- when False (the default) both words are lower-cased
                      before they are compared.

    Returns a float between 0.0 and 100.0. When neither word is long
    enough to contain a bigram, the result is 100.0 if the words are equal
    and 0.0 otherwise, rather than a ZeroDivisionError.
    """
    from collections import Counter

    if not case_sensitive:
        word1 = word1.lower()
        word2 = word2.lower()
    pairs_word1 = returnbigrams(word1)
    pairs_word2 = returnbigrams(word2)
    total = len(pairs_word1) + len(pairs_word2)
    if total == 0:
        # No bigrams on either side: the ratio below would divide by zero,
        # so fall back to a plain equality check.
        return 100.0 if word1 == word2 else 0.0
    # Multiset intersection: a bigram counts as shared only as many times
    # as it occurs in both words.
    shared = sum((Counter(pairs_word1) & Counter(pairs_word2)).values())
    # Every shared bigram is found in both words, hence the factor of 2.
    return 2.0 * shared / total * 100.0
# Demo: for every word of the longer sentence, record the best similarity
# score it achieves against any word of the other sentence.
sent1 = "Getting higher on rank oh"
sent2 = "Getting on higher rank"
list1 = split_on_delimeters(sent1)
list2 = split_on_delimeters(sent2)

# Walk the longer word list and compare against the other one; when both
# lists have the same length, list2 is the one walked.
if len(list1) > len(list2):
    longer, shorter = list1, list2
else:
    longer, shorter = list2, list1

list1_ratios = [
    max(word_similarity(x, y) for y in shorter)
    for x in longer
]
# Parenthesized print gives the same output on Python 2 and Python 3.
print(list1_ratios)