zoukankan      html  css  js  c++  java
  • 数据挖掘算法实现

    学习了数据挖掘这门课,但对其中的算法只是略有了解,并没有动手实现过,所以在这里试着把每个算法都实现一遍……

    1、决策树之ID3

    下表记录了不同天气条件下是否去打网球的情况(首行为表头,以 end 行结束输入),要求根据该表用程序输出决策树。

    Day Outlook Temperature Humidity Wind PlayTennis
    1 Sunny Hot High Weak no
    2 Sunny Hot High Strong no
    3 Overcast Hot High Weak yes
    4 Rainy Mild High Weak yes
    5 Rainy Cool Normal Weak yes
    6 Rainy Cool Normal Strong no
    7 Overcast Cool Normal Strong yes
    8 Sunny Mild High Weak no
    9 Sunny Cool Normal Weak yes
    10 Rainy Mild Normal Weak yes
    11 Sunny Mild Normal Strong yes
    12 Overcast Mild High Strong yes
    13 Overcast Hot Normal Weak yes
    14 Rainy Mild High Strong no
    end

    下面是ID3的部分程序,尚未写完(main 函数目前仍是空的),以后再慢慢补充。

      1 #include <iostream>
      2 #include <string>
      3 #include <cstring>
      4 #include <vector>
      5 #include <list>
      6 #include <map>
      7 #include <algorithm>
      8 #include <cstdlib>
      9 #include <cstdio>
     10 #include <cmath>
     11 
     12 using namespace std;
     13 
     // One vertex of the decision tree, stored in the global `node` array.
     //   next - indices (into that array) of this vertex's children
     //   attr - attribute value on the branch that leads to this vertex
     //   ans  - class label when the vertex is a leaf; empty otherwise
     // The implicit default constructor already leaves every member empty,
     // so no user-written constructor is needed.
     struct Node
     {
         vector<int> next;
         string attr;
         string ans;
     };
     22 
     23 const string yes = "yes"; // label text of the positive class
     24 const string no = "no"; // label text of the negative class
     25 const int attribute_name_size = 6; // number of whitespace-separated fields input() reads per row
     26 vector< vector<string> > data; // table filled by input(); columns: day weather temperature humidity wind play_or_not
     27 Node node[1000]; // fixed-capacity node storage; tree edges are indices into this array
     28 int cnt_of_node = 0; // index of the last node handed out; work() pre-increments it, so slot 0 is left for the caller's root
     29 
     30 void input()
     31 {
     32     string str;
     33     vector<string> tmp;
     34     while (cin >> str && str != "end")
     35     {
     36         tmp.push_back(str);
     37         for (int i = 0; i < attribute_name_size-1; ++i)
     38         {
     39             cin >> str;
     40             tmp.push_back(str);            
     41         }
     42         data.push_back(tmp);
     43         tmp.clear();
     44     }
     45 }
     46 
     // Shannon entropy (base 2) of the values found in the column whose header
     // cell equals `element`.
     //
     // vec[0] is the header row; vec[1..] are the data rows.
     // Returns -1 when `vec` is empty, and 0 when no header cell matches or
     // there are no data rows.
     double calcEntropy(vector<vector<string> >& vec, string element)
     {
         if (vec.empty()) return -1;

         const vector<string>& header = vec[0];
         map<string, int> tally;
         for (size_t col = 0; col != header.size(); ++col)
         {
             if (header[col] != element) continue;
             for (size_t row = 1; row < vec.size(); ++row)
                 ++tally[vec[row][col]];
         }

         const double total = vec.size() - 1; // number of data rows
         double entropy = 0;
         for (const auto& entry : tally)
         {
             const double p = entry.second / total;
             entropy -= p * log2(p);
         }
         return entropy;
     }
     66 
     // Expected entropy of the class label after splitting on column `idx`:
     //
     //     sum over values v of column idx:  |S_v| / |S| * H(labels in S_v)
     //
     // vec[0] is the header row, vec[1..] are data rows, and the last column
     // holds the class label. Any number of distinct labels is supported.
     //
     // Returns -1 when there are no data rows or `idx` is not a valid column.
     //
     // All ratios are computed in floating point and the per-group terms are
     // summed. Only labels that actually occur in a group are visited, so
     // log2 is never evaluated at 0 and a pure group contributes exactly 0.
     double calcInfo(vector<vector<string> >& vec, int idx)
     {
         if (vec.size() <= 1) return -1;
         const int len = static_cast<int>(vec[0].size());
         if (idx < 0 || idx >= len) return -1;

         // attribute value -> (label -> number of rows)
         map<string, map<string, int> > groups;
         for (size_t row = 1; row < vec.size(); ++row)
             ++groups[vec[row][idx]][vec[row][len-1]];

         const double total = static_cast<double>(vec.size() - 1);
         double ans = 0;
         for (map<string, map<string, int> >::const_iterator g = groups.begin(); g != groups.end(); ++g)
         {
             const map<string, int>& labels = g->second;

             int group_size = 0;
             for (map<string, int>::const_iterator l = labels.begin(); l != labels.end(); ++l)
                 group_size += l->second;

             double group_entropy = 0;
             for (map<string, int>::const_iterator l = labels.begin(); l != labels.end(); ++l)
             {
                 const double p = static_cast<double>(l->second) / group_size;
                 group_entropy -= p * log2(p);
             }

             ans += group_size / total * group_entropy;
         }
         return ans;
     }
     89 
     90 int findBestAttribute(vector<vector<string> >& tmp)
     91 {
     92     if (tmp.size() <= 1) return -1;
     93     int len = tmp[0].size();
     94     string result = tmp[0][len-1];
     95     vector<double> v;
     96     double info_result = calcEntropy(tmp, result);
     97     for (int i = 0; i < len; ++i)
     98         v.push_back(calcInfo(tmp, i));
     99     double max_info_gain = 0;
    100     int idx = 0;
    101     for (int i = 0; i < v.size(); ++i)
    102     {
    103         if (info_result-v[i] > max_info_gain)
    104             max_info_gain = info_result-v[idx=i];
    105     }
    106     return idx;
    107 }
    108 
    109 void work(vector< vector<string> >& source, int now_node_num)
    110 {
    111     int idx = 0;
    112     idx = findBestAttribute(source);
    113     vector<int> vis(source.size(), 0);
    114     vector<string> attribute_tmp;
    115     for (int i = 0; i < source[0].size(); ++i)
    116         if (i != idx) attribute_tmp.push_back(source[0][i]);
    117     int len = source[0].size();
    118     for (int i = 1; i < source.size(); ++i)
    119     {
    120         if (vis[i]) continue;
    121         map<string, int> mp;
    122         for (int j = i; j < source.size(); ++j)
    123         {
    124             if (source[j][idx] == source[i][idx])
    125             {
    126                 mp[source[j][len-1]]++;
    127                 vis[j] = 1;
    128             }
    129         }
    130         node[now_node_num].next.push_back(++cnt_of_node);
    131         node[cnt_of_node].attr = source[i][idx];
    132         if (mp.size() == 1)
    133         {
    134             node[cnt_of_node].ans = source[i][len-1];    
    135             node[cnt_of_node].next.clear();
    136         }
    137         else
    138         {
    139             vector<vector<string> > vs;
    140             for (int j = 0; j < source.size(); ++j)
    141             {
    142                 vector<string> tmp;
    143                 for (int k = 0; k < source[0].size(); ++k)
    144                 {
    145                     if (k == idx) continue;
    146                     tmp.push_back(source[j][k]);    
    147                 }
    148                 vs.push_back(tmp);
    149             }
    150             work(vs, cnt_of_node);
    151         }
    152     }
    153 }
    154 
    155 void outputSourceData()
    156 {
    157     for (int i = 0; i < data.size(); ++i)
    158     {
    159         for (int j = 0; j < data[i].size(); ++j)
    160             cout << data[i][j] << '	';
    161         cout << endl;
    162     }
    163 }
    164 
    165 int main() // program entry point; currently an empty stub
    166 {
    167     // NOTE(review): nothing is invoked here yet - input(), work() and outputSourceData() are never called
    168     return 0;
    169 }
    ID3
  • 相关阅读:
    HDOJ2066 一个人的旅行 floyd
    手动添加数据源时DataGridViewComboBoxCell值出问题解决方法
    可伸缩的Form窗体!
    SharpMap项目Web控件学习!
    MVC和MVP的初步理解
    ArcEngine编辑功能(五)
    胡言乱语:实体具有继承关系的空间数据库设计方法?
    WinForm单例窗体的实现
    4. 模板模式和建造者模式
    Oracle笔记(0):在Win2008系统上安装Oracle11g实践
  • 原文地址:https://www.cnblogs.com/JustForCS/p/4885233.html
Copyright © 2011-2022 走看看