fork download
  1. #-*- coding:utf-8 -*-
  2.  
  3. """ A basic lexicographic Similarity algorithm ...part1
  4. @author: WhiZTiM (Timothy Onogu)
  5. @copyright: (c) January, 2013
  6. @NOTE: This algorithm is provided as is, in the hope
  7. that it will be useful; I will not be responsible
  8. for any failures that may arise.
  9. @contact: whiztim@whiztim.com
  10. """
  11.  
  12. def split_on_delimeters(string, delimeter=":;. \"!,$()-_+=~'`"):
  13. """splits string upon the occurence of any character in delimeter"""
  14. rtn = []
  15. placeHolder = ""
  16. for x in string:
  17. if(not x in delimeter): #if current iter is not a delimeter
  18. placeHolder = placeHolder + x
  19. continue
  20. if(placeHolder != ""): #Not allowed to append empty string
  21. rtn.append(placeHolder)
  22. placeHolder = ""
  23. continue
  24. if(placeHolder != ""): #Add the last item that is not catered for in the "for-loop"
  25. rtn.append(placeHolder)
  26. return rtn
  27.  
  28. def returnbigrams(string):
  29. """This function returns bigrams"""
  30. return [string[n:n+2] for n in range(len(string) - 1)]
  31.  
  32. def word_similarity(word1, word2, case_sensitive=False):
  33. """This function, returns in percentage, how similar the string 'word1' is to 'word2'
  34. >..>case_sensitive for considering cases"""
  35. if(not case_sensitive):
  36. word1 = word1.lower()
  37. word2 = word2.lower()
  38. pairs_word1 = returnbigrams(word1)
  39. pairs_word2 = returnbigrams(word2)
  40. t = len(pairs_word1) + len(pairs_word2)
  41.  
  42. sb = 0
  43. for x in pairs_word1:
  44. for y in pairs_word2:
  45. if(x == y): #bigrams match
  46. sb += 2.0 #add (1+1=2).. since its found in both.
  47. pairs_word2.remove(y) #we do not need it again
  48. break #break inner loop
  49.  
  50. similarity = (sb / t * 100.0)
  51. return similarity
  52.  
  53. sent1= "Getting higher on rank oh"
  54. sent2= "Getting on higher rank"
  55.  
  56. list1 = split_on_delimeters(sent1)
  57. list2 = split_on_delimeters(sent2)
  58. list1_ratios=[]
  59.  
  60. if len(list1) > len(list2):
  61. for x in list1:
  62. list2_ratios=[]
  63. for y in list2:
  64. list2_ratios.append(word_similarity(x,y))
  65. list1_ratios.append(max(list2_ratios))
  66. print list1_ratios
  67. else:
  68. for x in list2:
  69. list2_ratios=[]
  70. for y in list1:
  71. list2_ratios.append(word_similarity(x,y))
  72. list1_ratios.append(max(list2_ratios))
  73. print list1_ratios
  74.  
  75.  
  76.  
Success #stdin #stdout 0.09s 10824KB
stdin
Standard input is empty
stdout
[100.0, 100.0, 100.0, 100.0, 0.0]