#-*- coding:utf-8 -*-

""" A basic lexicographic Similarity algorithm ...part1
    @author: WhiZTiM (Timothy Onogu)
	@copyright: (c) January, 2013
	@NOTE: This algorithm is provided as is, in the hope
		that it will be useful; the author is not responsible
		for any failures that may arise.
	@contact: whiztim@whiztim.com
"""
from __future__ import division

def split_on_delimeters(string, delimeter=":;. \"!,$()-_+=~'`"):
    """Break ``string`` into pieces at every character found in ``delimeter``.

    Runs of consecutive delimiters are collapsed, so the returned list
    never contains an empty string. Pieces keep their original order.
    """
    pieces = []
    current = ""
    for char in string:
        if char in delimeter:
            # A delimiter closes the piece being built, if there is one.
            if current != "":
                pieces.append(current)
                current = ""
        else:
            current = current + char
    # The loop only flushes on a delimiter, so flush the trailing piece here.
    if current != "":
        pieces.append(current)
    return pieces
def returnbigrams(string):
    """Return the overlapping two-item slices of ``string``, left to right.

    A sequence with fewer than two items yields an empty list.
    """
    bigrams = []
    for start in range(len(string) - 1):
        bigrams.append(string[start:start + 2])
    return bigrams
    
def word_similarity(word1, word2, case_sensitive=False):
    """Return, as a percentage, how similar ``word1`` is to ``word2``.

    The score is the Dice coefficient over character bigrams:
    ``2 * shared_bigrams / (bigrams_in_word1 + bigrams_in_word2) * 100``.
    Each bigram occurrence in ``word2`` can be matched at most once.

    word1, word2   -- the strings to compare.
    case_sensitive -- when False (the default) both words are lower-cased
                      before comparison.

    Returns a float between 0.0 and 100.0.
    """
    if not case_sensitive:
        word1 = word1.lower()
        word2 = word2.lower()
    pairs_word1 = returnbigrams(word1)
    pairs_word2 = returnbigrams(word2)
    total = len(pairs_word1) + len(pairs_word2)

    if total == 0:
        # Neither word has a bigram (both are shorter than two characters),
        # so the ratio below would divide by zero. Fall back to plain
        # equality: identical words are a full match, anything else is none.
        return 100.0 if word1 == word2 else 0.0

    shared = 0.0
    for pair in pairs_word1:
        if pair in pairs_word2:
            shared += 2.0               # counted once for each word
            pairs_word2.remove(pair)    # consume it so it cannot match twice

    return shared / total * 100.0
    
def get_weight(word1, words1):
    """Return ``word1 / words1`` rounded to three decimal places.

    Relies on true division (``from __future__ import division`` at the
    top of this file) so integer arguments produce a fractional result.
    """
    return round(word1 / words1, 3)
    
    
# Demo: weighted similarity between two sentences.
sent1 = "Getting higher on rank oh"
sent2 = "Getting on higher rank"

list1 = split_on_delimeters(sent1)
list2 = split_on_delimeters(sent2)

# The longer sentence is the reference: every one of its words is scored
# against the other sentence and weighted by its share of the reference's
# total word length. Ties go to list2, as before. Ratios, weights and the
# length sum must all come from the SAME list, otherwise they do not line
# up index by index and the weights do not add up to 1.
if len(list1) > len(list2):
    reference_words, candidate_words, reference_label = list1, list2, "LiST 1"
else:
    reference_words, candidate_words, reference_label = list2, list1, "LiST 2"

# Best match (highest word similarity) for each reference word.
list1_ratios = []
for x in reference_words:
    list2_ratios = [word_similarity(x, y) for y in candidate_words]
    # An empty candidate sentence matches nothing; max() of an empty list
    # would raise ValueError.
    list1_ratios.append(max(list2_ratios) if list2_ratios else 0.0)

print(str(list1_ratios) + " - Similarity list")

# Getting weight list: each word's length relative to the total length of
# all reference words.
sum1 = sum(len(x) for x in reference_words)
weight_list = [get_weight(len(x), sum1) for x in reference_words]
print(str(weight_list) + " - Weight List of words in " + reference_label)

# Getting weighted similarity
weighted_similarity_list = [
    round(weight * ratio, 3)
    for weight, ratio in zip(weight_list, list1_ratios)
]
print(str(weighted_similarity_list)
      + " - Weighted similarity  List of words in " + reference_label)

# Getting similarity
similarity = sum(weighted_similarity_list)
print(str(similarity) + " - Similarity")





