#coding=utf-8 # 全文utf-8编码
import sys
def apriori(D, minSup):
'''频繁项集用keys表示,
key表示项集中的某一项,
cutKeys表示经过剪枝步的某k项集。
C表示某k项集的每一项在事务数据库D中的支持计数
'''
C1 = {}
for T in D:
for I in T:
if I in C1:
C1[I] += 1
else:
C1[I] = 1
print C1
_keys1 = C1.keys()
keys1 = []
for i in _keys1:
keys1.append([i])
n = len(D)
cutKeys1 = []
for k in keys1[:]:
if C1[k[0]]*1.0/n >= minSup:
cutKeys1.append(k)
cutKeys1.sort()
keys = cutKeys1
all_keys = []
while keys != []:
C = getC(D, keys)
cutKeys = getCutKeys(keys, C, minSup, len(D))
for key in cutKeys:
all_keys.append(key)
keys = aproiri_gen(cutKeys)
return all_keys
def getC(D, keys):
'''对keys中的每一个key进行计数'''
C = []
for key in keys:
c = 0
for T in D:
have = True
for k in key:
if k not in T:
have = False
if have:
c += 1
C.append(c)
return C
def getCutKeys(keys, C, minSup, length):
'''剪枝步'''
for i, key in enumerate(keys):
if float(C[i]) / length < minSup:
keys.remove(key)
return keys
def keyInT(key, T):
'''判断项key是否在数据库中某一元组T中'''
for k in key:
if k not in T:
return False
return True
def aproiri_gen(keys1):
'''连接步'''
keys2 = []
for k1 in keys1:
for k2 in keys1:
if k1 != k2:
key = []
for k in k1:
if k not in key:
key.append(k)
for k in k2:
if k not in key:
key.append(k)
key.sort()
if key not in keys2:
keys2.append(key)
return keys2
D = [['A','B','C','D'],['B','C','E'],['A','B','C','E'],['B','D','E'],['A','B','C','D']]
F = apriori(D, 0.7)
print '\nfrequent itemset:\n', F
I2NvZGluZz11dGYtOCAgICAgICAgICAgICAgICAgICAgICAgICMg5YWo5paHdXRmLTjnvJbnoIEKaW1wb3J0IHN5cwoKZGVmIGFwcmlvcmkoRCwgbWluU3VwKToKCQoJJycn6aKR57mB6aG56ZuG55Soa2V5c+ihqOekuu+8jAoJa2V56KGo56S66aG56ZuG5Lit55qE5p+Q5LiA6aG577yMCgljdXRLZXlz6KGo56S657uP6L+H5Ymq5p6d5q2l55qE5p+Qa+mhuembhuOAggoJQ+ihqOekuuafkGvpobnpm4bnmoTmr4/kuIDpobnlnKjkuovliqHmlbDmja7lupNE5Lit55qE5pSv5oyB6K6h5pWwCgknJycKCglDMSA9IHt9Cglmb3IgVCBpbiBEOgoJCWZvciBJIGluIFQ6CgkJCWlmIEkgaW4gQzE6CgkJCQlDMVtJXSArPSAxCgkJCWVsc2U6CgkJCQlDMVtJXSA9IDEKCglwcmludCBDMQoJX2tleXMxID0gQzEua2V5cygpCgoJa2V5czEgPSBbXQoJZm9yIGkgaW4gX2tleXMxOgoJCWtleXMxLmFwcGVuZChbaV0pCgoJbiA9IGxlbihEKQoJY3V0S2V5czEgPSBbXQoJZm9yIGsgaW4ga2V5czFbOl06CgkJaWYgQzFba1swXV0qMS4wL24gPj0gbWluU3VwOgoJCQljdXRLZXlzMS5hcHBlbmQoaykKCQoJY3V0S2V5czEuc29ydCgpCgoKCWtleXMgPSBjdXRLZXlzMQoJYWxsX2tleXMgPSBbXQoJd2hpbGUga2V5cyAhPSBbXToKCQlDID0gZ2V0QyhELCBrZXlzKQoJCWN1dEtleXMgPSBnZXRDdXRLZXlzKGtleXMsIEMsIG1pblN1cCwgbGVuKEQpKQoJCWZvciBrZXkgaW4gY3V0S2V5czoKCQkJYWxsX2tleXMuYXBwZW5kKGtleSkKCQlrZXlzID0gYXByb2lyaV9nZW4oY3V0S2V5cykKCglyZXR1cm4gYWxsX2tleXMKCmRlZiBnZXRDKEQsIGtleXMpOgoJJycn5a+5a2V5c+S4reeahOavj+S4gOS4qmtleei/m+ihjOiuoeaVsCcnJwoJQyA9IFtdCglmb3Iga2V5IGluIGtleXM6CgkJYyA9IDAKCQlmb3IgVCBpbiBEOgoJCQloYXZlID0gVHJ1ZQoJCQlmb3IgayBpbiBrZXk6CgkJCQlpZiBrIG5vdCBpbiBUOgoJCQkJCWhhdmUgPSBGYWxzZQoJCQlpZiBoYXZlOgoJCQkJYyArPSAxCgkJQy5hcHBlbmQoYykKCXJldHVybiBDCgpkZWYgZ2V0Q3V0S2V5cyhrZXlzLCBDLCBtaW5TdXAsIGxlbmd0aCk6CgknJyfliarmnp3mraUnJycKCWZvciBpLCBrZXkgaW4gZW51bWVyYXRlKGtleXMpOgoJCWlmIGZsb2F0KENbaV0pIC8gbGVuZ3RoIDwgbWluU3VwOgoJCQlrZXlzLnJlbW92ZShrZXkpCglyZXR1cm4ga2V5cwoKCgpkZWYga2V5SW5UKGtleSwgVCk6CgknJyfliKTmlq3poblrZXnmmK/lkKblnKjmlbDmja7lupPkuK3mn5DkuIDlhYPnu4RU5LitJycnCglmb3IgayBpbiBrZXk6CgkJaWYgayBub3QgaW4gVDoKCQkJcmV0dXJuIEZhbHNlCglyZXR1cm4gVHJ1ZQoKCmRlZiBhcHJvaXJpX2dlbihrZXlzMSk6CgknJyfov57mjqXmraUnJycKCWtleXMyID0gW10KCWZvciBrMSBpbiBrZXlzMToKCQlmb3IgazIgaW4ga2V5czE6CgkJCWlmIGsxICE9IGsyOgoJCQkJa2V5ID0gW10KCQkJCWZvciBrIGluIGsxOgoJCQkJCWlmIGsgbm90IGluIGtleToKCQkJCQkJa2V5LmFwcGVuZChrKQoJCQkJZm9yIGsgaW4gazI6CgkJCQkJaWYgayBub3QgaW4ga2V5OgoJCQkJCQlrZXkuYXBwZW5kKGspCgkJCQlrZXkuc29ydCgpCgkJCQlpZiBrZXkgbm90IGluIGtleXMyOgoJCQkJCWtleXMyLmFwcGVuZChrZXkpCgkJCQoJcmV0dXJuIGtleXMyCgoKCgpEID0gW1snQScsJ0InLCdDJywnRCddLFsnQicsJ0MnLCdFJ10sWydBJywnQicsJ0MnLCdFJ10sWydCJywnRCcsJ0UnXSxbJ0EnLCdCJywnQycsJ0QnXV0KRiA9IGFwcmlvcmkoRCwgMC43KQpwcmludCAnXG5mcmVxdWVudCBpdGVtc2V0OlxuJywgRgo=