#-*- coding:utf-8 -*-
""" A basic lexicographic Similarity algorithm ...part1
@author: WhiZTiM (Timothy Onogu)
@copyright: (c) January, 2013
@NOTE: This algorithm is provided as is, in the hope
that it will be useful; I will not be responsible
for any failures that may arise.
@contact: whiztim@whiztim.com
"""
from __future__ import division
def split_on_delimeters(string, delimeter=":;. \"!,$()-_+=~'`"):
    """Split ``string`` at every character that occurs in ``delimeter``.

    Consecutive delimiters, and delimiters at either end of ``string``,
    never produce empty tokens.

    Args:
        string: the text to split.
        delimeter: collection of delimiter characters; each character of
            ``string`` is tested against it with ``in``.

    Returns:
        A list of the non-empty runs of non-delimiter characters, in the
        order they appear in ``string``.
    """
    tokens = []
    current = []  # characters of the token currently being collected
    for char in string:
        if char not in delimeter:
            current.append(char)
        elif current:
            # A delimiter closes the pending token; empty tokens are skipped.
            tokens.append("".join(current))
            current = []
    if current:
        # The text ended without a trailing delimiter: flush the last token.
        tokens.append("".join(current))
    return tokens
def returnbigrams(string):
    """Return the overlapping two-item slices (bigrams) of ``string``.

    The result lists ``string[0:2]``, ``string[1:3]`` and so on, in order.
    An input with fewer than two items yields an empty list.
    """
    bigrams = []
    # ``end`` is the exclusive end index of each two-item window.
    for end in range(2, len(string) + 1):
        bigrams.append(string[end - 2:end])
    return bigrams
def word_similarity(word1, word2, case_sensitive=False):
    """Return, as a percentage, how similar ``word1`` is to ``word2``.

    The score is ``2 * shared / total * 100`` where ``shared`` is the number
    of character bigrams the two words have in common (each bigram of one
    word can be matched with at most one bigram of the other) and ``total``
    is the combined number of bigrams of both words.

    Args:
        word1: first word.
        word2: second word.
        case_sensitive: when false (the default) both words are lower-cased
            before they are compared.

    Returns:
        A float between 0.0 and 100.0. When neither word is long enough to
        contain a bigram, the result is 100.0 if the words are equal and
        0.0 otherwise (this case used to raise ZeroDivisionError).
    """
    # Imported here so the function does not depend on a file-level import.
    from collections import Counter

    if not case_sensitive:
        word1 = word1.lower()
        word2 = word2.lower()

    pairs_word1 = returnbigrams(word1)
    pairs_word2 = returnbigrams(word2)
    total = len(pairs_word1) + len(pairs_word2)
    if total == 0:
        # Both words are shorter than two characters, so there is nothing
        # to divide by; fall back to plain equality.
        return 100.0 if word1 == word2 else 0.0

    # Multiset intersection: a bigram is counted as often as it occurs in
    # BOTH words, which is what pairing-and-removing matches one by one does.
    shared = sum((Counter(pairs_word1) & Counter(pairs_word2)).values())
    # Each shared bigram counts twice because it is present in both words.
    return 2.0 * shared / total * 100.0
def get_weight(word1, words1):
    """Return the ratio ``word1 / words1`` rounded to three decimal places.

    The script in this file passes the length of one word as ``word1`` and
    the summed length of all words as ``words1``.
    """
    share = word1 / words1
    return round(share, 3)
# Demo: score how similar two sample sentences are, word by word.
sent1 = "Getting higher on rank oh"
sent2 = "Getting on higher rank"
list1 = split_on_delimeters(sent1)
list2 = split_on_delimeters(sent2)

# The sentence with more words is the reference (ties go to list2). Its words
# provide BOTH the similarity entries and the weights, so the two lists always
# line up index by index. Previously the weights were always taken from list1,
# even when the similarities had been computed for the words of list2.
if len(list1) > len(list2):
    reference_words, other_words = list1, list2
else:
    reference_words, other_words = list2, list1

# For every reference word keep its best match among the other sentence's
# words. `or [0.0]` stops max() from raising ValueError when the other
# sentence contains no words at all.
list1_ratios = [
    max([word_similarity(ref, other) for other in other_words] or [0.0])
    for ref in reference_words
]
print(str(list1_ratios) + " - Similarity list")

#Getting weight list
# sum1 is the total number of characters in the reference words; each word's
# weight is its share of that total.
sum1 = sum(len(word) for word in reference_words)
weight_list = [get_weight(len(word), sum1) for word in reference_words]
# The "LiST 1" wording in the output is kept unchanged from the original text.
print(str(weight_list) + " - Weight List of words in LiST 1")

#Getting weighted similarity
weighted_similarity_list = [
    round(weight * ratio, 3)
    for weight, ratio in zip(weight_list, list1_ratios)
]
print(str(weighted_similarity_list) + " - Weighted similarity List of words in LiST 1")

#Getting similarity
similarity = sum(weighted_similarity_list)
print(str(similarity) + " - Similarity")