fork download
  1. #coding=utf-8 # 全文utf-8编码
  2. import sys
  3.  
  4. def apriori(D, minSup):
  5.  
  6. '''频繁项集用keys表示,
  7. key表示项集中的某一项,
  8. cutKeys表示经过剪枝步的某k项集。
  9. C表示某k项集的每一项在事务数据库D中的支持计数
  10. '''
  11.  
  12. C1 = {}
  13. for T in D:
  14. for I in T:
  15. if I in C1:
  16. C1[I] += 1
  17. else:
  18. C1[I] = 1
  19.  
  20. print C1
  21. _keys1 = C1.keys()
  22.  
  23. keys1 = []
  24. for i in _keys1:
  25. keys1.append([i])
  26.  
  27. n = len(D)
  28. cutKeys1 = []
  29. for k in keys1[:]:
  30. if C1[k[0]]*1.0/n >= minSup:
  31. cutKeys1.append(k)
  32.  
  33. cutKeys1.sort()
  34.  
  35.  
  36. keys = cutKeys1
  37. all_keys = []
  38. while keys != []:
  39. C = getC(D, keys)
  40. cutKeys = getCutKeys(keys, C, minSup, len(D))
  41. for key in cutKeys:
  42. all_keys.append(key)
  43. keys = aproiri_gen(cutKeys)
  44.  
  45. return all_keys
  46.  
  47. def getC(D, keys):
  48. '''对keys中的每一个key进行计数'''
  49. C = []
  50. for key in keys:
  51. c = 0
  52. for T in D:
  53. have = True
  54. for k in key:
  55. if k not in T:
  56. have = False
  57. if have:
  58. c += 1
  59. C.append(c)
  60. return C
  61.  
  62. def getCutKeys(keys, C, minSup, length):
  63. '''剪枝步'''
  64. for i, key in enumerate(keys):
  65. if float(C[i]) / length < minSup:
  66. keys.remove(key)
  67. return keys
  68.  
  69.  
  70.  
  71. def keyInT(key, T):
  72. '''判断项key是否在数据库中某一元组T中'''
  73. for k in key:
  74. if k not in T:
  75. return False
  76. return True
  77.  
  78.  
  79. def aproiri_gen(keys1):
  80. '''连接步'''
  81. keys2 = []
  82. for k1 in keys1:
  83. for k2 in keys1:
  84. if k1 != k2:
  85. key = []
  86. for k in k1:
  87. if k not in key:
  88. key.append(k)
  89. for k in k2:
  90. if k not in key:
  91. key.append(k)
  92. key.sort()
  93. if key not in keys2:
  94. keys2.append(key)
  95.  
  96. return keys2
  97.  
  98.  
  99.  
  100.  
  101. D = [['A','B','C','D'],['B','C','E'],['A','B','C','E'],['B','D','E'],['A','B','C','D']]
  102. F = apriori(D, 0.7)
  103. print '\nfrequent itemset:\n', F
  104.  
Success #stdin #stdout 0.01s 7176KB
stdin
Standard input is empty
stdout
{'A': 3, 'C': 4, 'B': 5, 'E': 3, 'D': 3}

frequent itemset:
[['B'], ['C'], ['B', 'C']]