/**
* 功能: ID3算法
* 语言: C++
* 作者: 刘永康
* 版本: 1.0
* 时间: 2012.12.3
*/
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <utility>
#include <cmath>
int attribute_quantity; // total number of attributes (columns) per example
int target_attribute_num; // 0-based index of the target (classification) attribute
std::vector<std::string> attributes; // names of all attributes, in column order
int example_quantity; // number of training examples
std::vector<std::vector<std::string> > examples; // all training examples; one row of attribute values each
const std::string yes("yes"); // positive target label
const std::string no("no"); // negative target label
// One node of the decision tree.
struct node {
std::string attribute; // attribute tested at this node, or "yes"/"no" at a leaf
std::string decision_attribute; // parent-attribute value that led to this node ("" at the root)
std::vector<node> child; // one child per observed value of `attribute`
};
/**
* 寻找最佳分类属性,也是ID3算法的核心部分
* cur_examples存放所要训练样例的索引值
* attributes_num存放待分类属性的索引值
*/
int FindBestAttribute(std::vector<int> &cur_examples, std::vector<int> &attributes_num)
{
int positive, negative, n;
double entropy;
//统计正样例和负样例的数量
positive = 0;
negative = 0;
n = cur_examples.size();
for (int i = 0; i != n; ++i) {
if (examples[cur_examples[i]][4].compare(yes)) {
++negative;
} else {
++positive;
}
}
//训练样例的熵值
entropy = -(positive * 1.0 / n) * log(positive * 1.0 / n) / log(2) - (negative * 1.0 / n) * log(negative * 1.0 / n) / log(2);
std::vector<double> gain(attributes_num.size(), entropy);
for (int i = 0; i != attributes_num.size(); ++i) {
int cur_positive, cur_negative, cur_n;
double cur_entropy;
std::map<std::string, int> values;
//找出该属性的所有取值
for (int j = 0; j != n; ++j) {
std::map<std::string, int>::iterator iter = values.find(examples[cur_examples[j]][attributes_num[i]]);
if (iter != values.end()) {
iter->second++;
} else {
values.insert(std::make_pair(examples[cur_examples[j]][attributes_num[i]], 1));
}
}
//计算该属性的信息增益
for (std::map<std::string, int>::iterator iter = values.begin(); iter != values.end(); ++iter) {
//统计该取值下正样例和负样例的数量
cur_positive = 0;
cur_negative = 0;
cur_n = iter->second;
for (int j = 0; j != n; ++j) {
if (examples[cur_examples[j]][attributes_num[i]] == iter->first) {
if (examples[cur_examples[j]][4].compare(yes)) {
++cur_negative;
} else {
++cur_positive;
}
}
}
//该取值的熵值
if (cur_positive && cur_negative) {
cur_entropy = -(cur_positive * 1.0 / cur_n) * log(cur_positive * 1.0 / cur_n) / log(2) - (cur_negative * 1.0 / cur_n) * log(cur_negative * 1.0 / cur_n) / log(2);
} else {
cur_entropy = 0;
}
//信息增益的计算方法
gain[i] -= cur_entropy * cur_n / n;
}
}
//寻找信息增益最大的属性,即为最佳分类属性
int max;
max = 0;
for (int i = 0; i != attributes_num.size(); ++i) {
if (gain[i] > gain[max]) {
max = i;
}
}
return max;
}
/**
* 构建决策树
* root为要建立的结点
* cur_examples存放所要训练样例的索引值
* attributes_num存放待分类属性的索引值
*/
/**
 * Recursively build the decision tree with the ID3 algorithm.
 *
 * root: node to fill in.
 * cur_examples: indices of the training examples that reach this node.
 * attributes_num: indices of the attributes still available for splitting.
 */
void DecisionTreeBuild(node &root, std::vector<int> &cur_examples, std::vector<int> &attributes_num)
{
    // Count positive and negative examples at this node.
    int positive = 0;
    int negative = 0;
    for (int i = 0; i != (int)cur_examples.size(); ++i) {
        // Fixed: use target_attribute_num rather than the hard-coded column 4,
        // matching the target column read in main().
        if (examples[cur_examples[i]][target_attribute_num].compare(yes)) {
            ++negative;
        } else {
            ++positive;
        }
    }
    // All examples positive: this node is a "yes" leaf.
    if (positive == (int)cur_examples.size()) {
        root.attribute = yes;
        return;
    }
    // All examples negative: this node is a "no" leaf.
    if (negative == (int)cur_examples.size()) {
        root.attribute = no;
        return;
    }
    // No attributes left to split on: label with the majority target value.
    if (attributes_num.size() == 0) {
        root.attribute = (positive > negative) ? yes : no;
        return;
    }
    // Split on the attribute with the highest information gain.
    int best_attribute = FindBestAttribute(cur_examples, attributes_num);
    root.attribute = attributes[attributes_num[best_attribute]];
    // Enumerate every value the chosen attribute takes among these examples.
    std::map<std::string, int> values;
    for (int i = 0; i != (int)cur_examples.size(); ++i) {
        std::map<std::string, int>::iterator iter = values.find(examples[cur_examples[i]][attributes_num[best_attribute]]);
        if (iter != values.end()) {
            iter->second++;
        } else {
            values.insert(std::make_pair(examples[cur_examples[i]][attributes_num[best_attribute]], 1));
        }
    }
    // Recurse once per attribute value.
    for (std::map<std::string, int>::iterator iter = values.begin(); iter != values.end(); ++iter) {
        // Examples whose chosen attribute equals this value.
        std::vector<int> new_examples;
        for (int i = 0; i != (int)cur_examples.size(); ++i) {
            if (examples[cur_examples[i]][attributes_num[best_attribute]] == iter->first) {
                new_examples.push_back(cur_examples[i]);
            }
        }
        // Remaining attributes (the chosen one is consumed).
        std::vector<int> new_attributes_num;
        for (int i = 0; i != (int)attributes_num.size(); ++i) {
            if (i != best_attribute) {
                new_attributes_num.push_back(attributes_num[i]);
            }
        }
        node new_node;
        new_node.decision_attribute = iter->first;
        if (new_examples.size()) {
            // Examples reach this branch: classify them recursively.
            DecisionTreeBuild(new_node, new_examples, new_attributes_num);
        } else {
            // No examples for this value: fall back to the parent's majority label.
            new_node.attribute = (positive > negative) ? yes : no;
        }
        root.child.push_back(new_node);
    }
}
//打印决策树
/**
 * Print the decision tree as an indented outline, one tab per depth level.
 * Each non-root node shows the branch value (decision_attribute) on one line
 * and, indented one more level, the attribute it tests (or a "yes"/"no" leaf).
 */
void DecisionTreePrint(node &root, int n)
{
    const std::string pad(n, '\t');
    if (!root.decision_attribute.empty()) {
        // Branch value first, then the node's own attribute one level deeper.
        std::cout << pad << root.decision_attribute << std::endl;
        std::cout << pad << '\t' << root.attribute << std::endl;
    } else {
        // Root node: no incoming branch value to show.
        std::cout << pad << root.attribute << std::endl;
    }
    for (std::vector<node>::size_type i = 0; i < root.child.size(); ++i) {
        DecisionTreePrint(root.child[i], n + 1);
    }
}
int main()
{
std::string cur_s;
std::vector<int> attributes_num;
std::vector<int> cur_examples;
node root;
std::cin >> attribute_quantity;
std::cin >> target_attribute_num;
target_attribute_num--;
for (int i = 0; i != attribute_quantity; ++i) {
std::cin >> cur_s;
attributes.push_back(cur_s);
}
std::cin >> example_quantity;
for (int i = 0; i != example_quantity; ++i) {
std::vector<std::string> cur_v;
for (int j = 0; j != attribute_quantity; ++j) {
std::cin >> cur_s;
cur_v.push_back(cur_s);
}
examples.push_back(cur_v);
}
for (int i = 0; i != attribute_quantity; ++i) {
if (i != target_attribute_num) {
attributes_num.push_back(i);
}
}
for (int i = 0; i != example_quantity; ++i) {
cur_examples.push_back(i);
}
DecisionTreeBuild(root, cur_examples, attributes_num);
DecisionTreePrint(root, 0);
return 0;
}