fork download
  1. #-*- coding:utf-8 -*-
  2.  
  3. """ A basic lexicographic Similarity algorithm ...part1
  4. @author: WhiZTiM (Timothy Onogu)
  5. @copyright: (c) January, 2013
  6. @NOTE: This algorithm is provided as is, in the hope
  7. that it will be useful; I will not be responsible
  8. for any failures that may arise.
  9. @contact: whiztim@whiztim.com
  10. """
  11. from __future__ import division
  12.  
  13. def split_on_delimeters(string, delimeter=":;. \"!,$()-_+=~'`"):
  14. """splits string upon the occurence of any character in delimeter"""
  15. rtn = []
  16. placeHolder = ""
  17. for x in string:
  18. if(not x in delimeter): #if current iter is not a delimeter
  19. placeHolder = placeHolder + x
  20. continue
  21. if(placeHolder != ""): #Not allowed to append empty string
  22. rtn.append(placeHolder)
  23. placeHolder = ""
  24. continue
  25. if(placeHolder != ""): #Add the last item that is not catered for in the "for-loop"
  26. rtn.append(placeHolder)
  27. return rtn
  28. def returnbigrams(string):
  29. """This function returns bigrams"""
  30. return [string[n:n+2] for n in range(len(string) - 1)]
  31.  
  32. def word_similarity(word1, word2, case_sensitive=False):
  33. """This function, returns in percentage, how similar the string 'word1' is to 'word2'
  34. >..>case_sensitive for considering cases"""
  35. if(not case_sensitive):
  36. word1 = word1.lower()
  37. word2 = word2.lower()
  38. pairs_word1 = returnbigrams(word1)
  39. pairs_word2 = returnbigrams(word2)
  40. t = len(pairs_word1) + len(pairs_word2)
  41.  
  42. sb = 0
  43. for x in pairs_word1:
  44. for y in pairs_word2:
  45. if(x == y): #bigrams match
  46. sb += 2.0 #add (1+1=2).. since its found in both.
  47. pairs_word2.remove(y) #we do not need it again
  48. break #break inner loop
  49.  
  50. similarity = (sb / t * 100.0)
  51. return similarity
  52.  
  53. def get_weight(word1, words1):
  54. w = round((word1 / words1), 3)
  55. return w
  56.  
  57.  
  58. sent1= "Getting higher on rank oh"
  59. sent2= "Getting on higher rank"
  60.  
  61. list1 = split_on_delimeters(sent1)
  62. list2 = split_on_delimeters(sent2)
  63. list1_ratios=[]
  64. weight_list=[] #new array
  65. weighted_similarity_list=[] #new array
  66. sum1 = 0 #new array - used to get sum of words for weight
  67.  
  68. if len(list1) > len(list2):
  69. for x in list1:
  70. list2_ratios=[]
  71. sum1+=len(x) #sum of words is incremented
  72. for y in list2:
  73. list2_ratios.append(word_similarity(x,y))
  74. list1_ratios.append(max(list2_ratios))
  75.  
  76.  
  77. else:
  78. for x in list2:
  79. list2_ratios=[]
  80. sum1+=len(x) #sum of words is incremented
  81. for y in list1:
  82. list2_ratios.append(word_similarity(x,y))
  83. list1_ratios.append(max(list2_ratios))
  84.  
  85. print str(list1_ratios) + " - Similarity list"
  86.  
  87. #Getting weight list
  88. for x in list1:
  89. weight_list.append( get_weight(len(x), sum1))
  90. print str(weight_list) + " - Weight List of words in LiST 1"
  91.  
  92. #Getting weighted similarity
  93. for x in range(len(weight_list)):
  94. weighted_similarity_list.append( round((weight_list[x] * list1_ratios[x]),3))
  95. print str(weighted_similarity_list) + " - Weighted similarity List of words in LiST 1"
  96.  
  97. #Getting similarity
  98. similarity = sum(weighted_similarity_list)
  99. print str(similarity) + " - Similarity"
  100.  
  101.  
  102.  
  103.  
  104.  
  105.  
Success #stdin #stdout 0.08s 10840KB
stdin
Standard input is empty
stdout
[100.0, 100.0, 100.0, 100.0, 0.0] - Similarity list
[0.333, 0.286, 0.095, 0.19, 0.095] - Weight List of words in LiST 1
[33.3, 28.6, 9.5, 19.0, 0.0] - Weighted similarity  List of words in LiST 1
90.4 - Similarity