tree = [{'left':-1,'right':-1,'poison':0,'sets':[]}]
mingain = 0.4
def entropy(sets):
if len(sets)==0:return 0.0
from math import log
log2 = lambda x:log(x)/log(2)
ent = 0.0
result = [0 for i in xrange(2)]
for s in sets:
result[s[19]] += 1
for i in xrange(2):
p = float(result[i])/len(sets)
if p != 0:ent -= p*log2(p)
return ent
# 分散
#def variance(sets):
# if len(sets)==0:return 0
# data = [float(s[len(s)-1])for s in sets]
# mean = sum(data)/len(data)
# var = sum([(d-mean)**2 for d in data])/len(data)
# return var
def divide(sets,index,criteria):
l = []
r = []
for s in sets:
if s[index] in criteria : l.append(s)
else : r.append(s)
return (l,r)
def build(n,sets):
tree[n]['sets']=sets
if len(sets)==0:return
best_gain = 0.0
best_criteria = {'left':-1,'right':-1,'poison':0,'sets':sets}
cur = entropy(sets)
for i in xrange(19):
criteria = {'left':-1,'right':-1,'index':i,'set':[]}
a = [[0 for j in xrange(2)]for k in xrange(9)]
for s in sets:
a[s[i]][s[19]] += 1
for j in xrange(9):
if a[j][0] > a[j][1]:
criteria['set'].append(j)
l,r = divide(sets,criteria['index'],criteria['set'])
p = float(len(l)/len(sets))
gain = cur - (entropy(l)*p+entropy(r)*(1.0-p))
if(gain > best_gain ):
best_gain = gain
best_criteria = criteria
if best_gain<=0.0:return
tree[n]=best_criteria
tree[n]['left']=len(tree)
tree.append({'left':-1,'right':-1,'poison':0,'sets':l})
tree[n]['right']=len(tree)
tree.append({'left':-1,'right':-1,'poison':1,'sets':r})
l,r = divide(sets,tree[n]['index'],tree[n]['set'])
build(tree[n]['left'],l)
build(tree[n]['right'],r)
return
def classify(n,sets):
if tree[n]['left']==-1:return tree[n]['poison']
if sets[tree[n]['index']] in tree[n]['set']:return classify(tree[n]['left'],sets)
else : return classify(tree[n]['right'],sets)
def printtree(n,indent=''):
if 'poison' in tree[n]:
print tree[n]['poison']
else:
print str(tree[n]['index'])+':'+str(tree[n]['set'])+'? '
print indent+'T->',
printtree(tree[n]['left'],indent+' ')
print indent+'F->',
printtree(tree[n]['right'],indent+' ')
if __name__ == '__main__':
f = open('learn.txt','r')
learn = []
for line in f:
learn.append(map(int,line.split(',')))
f.close()
build(0,learn)
t = open('test.txt','r')
test = []
for line in t:
test.append(map(int,line.split(',')))
t.close()
printtree(0)
for i in test:
print classify(0,i)
dHJlZSA9IFt7J2xlZnQnOi0xLCdyaWdodCc6LTEsJ3BvaXNvbic6MCwnc2V0cyc6W119XQoKbWluZ2FpbiA9IDAuNAoKZGVmIGVudHJvcHkoc2V0cyk6CglpZiBsZW4oc2V0cyk9PTA6cmV0dXJuIDAuMAoJZnJvbSBtYXRoIGltcG9ydCBsb2cKCWxvZzIgPSBsYW1iZGEgeDpsb2coeCkvbG9nKDIpCgllbnQgPSAwLjAKCXJlc3VsdCA9IFswIGZvciBpIGluIHhyYW5nZSgyKV0KCWZvciBzIGluIHNldHM6CgkJcmVzdWx0W3NbMTldXSArPSAxCglmb3IgaSBpbiB4cmFuZ2UoMik6CgkJcCA9IGZsb2F0KHJlc3VsdFtpXSkvbGVuKHNldHMpCgkJaWYgcCAhPSAwOmVudCAtPSBwKmxvZzIocCkKCXJldHVybiBlbnQKIyDliIbmlaMKI2RlZiB2YXJpYW5jZShzZXRzKToKIwlpZiBsZW4oc2V0cyk9PTA6cmV0dXJuIDAKIwlkYXRhID0gW2Zsb2F0KHNbbGVuKHMpLTFdKWZvciBzIGluIHNldHNdCiMJbWVhbiA9IHN1bShkYXRhKS9sZW4oZGF0YSkKIwl2YXIgPSBzdW0oWyhkLW1lYW4pKioyIGZvciBkIGluIGRhdGFdKS9sZW4oZGF0YSkKIwlyZXR1cm4gdmFyCgpkZWYgZGl2aWRlKHNldHMsaW5kZXgsY3JpdGVyaWEpOgoJbCA9IFtdCglyID0gW10KCWZvciBzIGluIHNldHM6CgkJaWYgc1tpbmRleF0gaW4gY3JpdGVyaWEgOiBsLmFwcGVuZChzKQoJCWVsc2UgOiByLmFwcGVuZChzKQoJcmV0dXJuIChsLHIpCgpkZWYgYnVpbGQobixzZXRzKToKCgl0cmVlW25dWydzZXRzJ109c2V0cwoJaWYgbGVuKHNldHMpPT0wOnJldHVybgoKCWJlc3RfZ2FpbiA9IDAuMAoJYmVzdF9jcml0ZXJpYSA9IHsnbGVmdCc6LTEsJ3JpZ2h0JzotMSwncG9pc29uJzowLCdzZXRzJzpzZXRzfQoJY3VyID0gZW50cm9weShzZXRzKQoKCWZvciBpIGluIHhyYW5nZSgxOSk6CgkJY3JpdGVyaWEgPSB7J2xlZnQnOi0xLCdyaWdodCc6LTEsJ2luZGV4JzppLCdzZXQnOltdfQoJCWEgPSBbWzAgZm9yIGogaW4geHJhbmdlKDIpXWZvciBrIGluIHhyYW5nZSg5KV0KCQlmb3IgcyBpbiBzZXRzOgoJCQlhW3NbaV1dW3NbMTldXSArPSAxCgkJZm9yIGogaW4geHJhbmdlKDkpOgoJCQlpZiBhW2pdWzBdID4gYVtqXVsxXToKCQkJCWNyaXRlcmlhWydzZXQnXS5hcHBlbmQoaikKCQlsLHIgPSBkaXZpZGUoc2V0cyxjcml0ZXJpYVsnaW5kZXgnXSxjcml0ZXJpYVsnc2V0J10pCgkJcCA9IGZsb2F0KGxlbihsKS9sZW4oc2V0cykpCgkJZ2FpbiA9IGN1ciAtIChlbnRyb3B5KGwpKnArZW50cm9weShyKSooMS4wLXApKQoJCWlmKGdhaW4gPiBiZXN0X2dhaW4gKToKCQkJYmVzdF9nYWluID0gZ2FpbgoJCQliZXN0X2NyaXRlcmlhID0gY3JpdGVyaWEKCglpZiBiZXN0X2dhaW48PTAuMDpyZXR1cm4KCgl0cmVlW25dPWJlc3RfY3JpdGVyaWEKCXRyZWVbbl1bJ2xlZnQnXT1sZW4odHJlZSkKCXRyZWUuYXBwZW5kKHsnbGVmdCc6LTEsJ3JpZ2h0JzotMSwncG9pc29uJzowLCdzZXRzJzpsfSkKCXRyZWVbbl1bJ3JpZ2h0J109bGVuKHRyZWUpCgl0cmVlLmFwcGVuZCh7J2xlZnQnOi0xLCdyaWdodCc6LTEsJ3BvaXNvbic6MSwnc2V0cyc6cn0pCglsLHIgPSBkaXZpZGUoc2V0cyx0cmVlW25dWydpbmRleCddLHRyZWVbbl1bJ3NldCddKQoJYnVpbGQodHJlZVtuXVsnbGVmdCddLGwpCglidWlsZCh0cmVlW25dWydyaWdodCddLHIpCglyZXR1cm4KCmRlZiBjbGFzc2lmeShuLHNldHMpOgoJaWYgdHJlZVtuXVsnbGVmdCddPT0tMTpyZXR1cm4gdHJlZVtuXVsncG9pc29uJ10KCWlmIHNldHNbdHJlZVtuXVsnaW5kZXgnXV0gaW4gdHJlZVtuXVsnc2V0J106cmV0dXJuIGNsYXNzaWZ5KHRyZWVbbl1bJ2xlZnQnXSxzZXRzKQoJZWxzZSA6IHJldHVybiBjbGFzc2lmeSh0cmVlW25dWydyaWdodCddLHNldHMpCgpkZWYgcHJpbnR0cmVlKG4saW5kZW50PScnKToKCWlmICdwb2lzb24nIGluIHRyZWVbbl06CgkJcHJpbnQgdHJlZVtuXVsncG9pc29uJ10KCWVsc2U6CgkJcHJpbnQgc3RyKHRyZWVbbl1bJ2luZGV4J10pKyc6JytzdHIodHJlZVtuXVsnc2V0J10pKyc/ICcKCQlwcmludCBpbmRlbnQrJ1QtPicsCgkJcHJpbnR0cmVlKHRyZWVbbl1bJ2xlZnQnXSxpbmRlbnQrJyAnKQoJCXByaW50IGluZGVudCsnRi0+JywKCQlwcmludHRyZWUodHJlZVtuXVsncmlnaHQnXSxpbmRlbnQrJyAnKQoKaWYgX19uYW1lX18gPT0gJ19fbWFpbl9fJzoKCglmID0gb3BlbignbGVhcm4udHh0JywncicpCglsZWFybiA9IFtdCglmb3IgbGluZSBpbiBmOgoJCWxlYXJuLmFwcGVuZChtYXAoaW50LGxpbmUuc3BsaXQoJywnKSkpCglmLmNsb3NlKCkKCglidWlsZCgwLGxlYXJuKQoKCXQgPSBvcGVuKCd0ZXN0LnR4dCcsJ3InKQoJdGVzdCA9IFtdCglmb3IgbGluZSBpbiB0OgoJCXRlc3QuYXBwZW5kKG1hcChpbnQsbGluZS5zcGxpdCgnLCcpKSkKCXQuY2xvc2UoKQoKCXByaW50dHJlZSgwKQoKCWZvciBpIGluIHRlc3Q6CgkJcHJpbnQgY2xhc3NpZnkoMCxpKQo=