zoukankan      html  css  js  c++  java
  • 数据挖掘算法实现

    学习了数据挖掘这门课,但对其中的算法只是略有了解,并没有动手实现过,所以在这里试着把每个算法都实现一遍。

    1、决策树之ID3

    下表记录了在不同气候条件下是否去打球的情况,要求根据该表用程序输出决策树。

    Day Outlook Temperature Humidity Wind PlayTennis
    1 Sunny Hot High Weak no
    2 Sunny Hot High Strong no
    3 Overcast Hot High Weak yes
    4 Rainy Mild High Weak yes
    5 Rainy Cool Normal Weak yes
    6 Rainy Cool Normal Strong no
    7 Overcast Cool Normal Strong yes
    8 Sunny Mild High Weak no
    9 Sunny Cool Normal Weak yes
    10 Rainy Mild Normal Weak yes
    11 Sunny Mild Normal Strong yes
    12 Overcast Mild High Strong yes
    13 Overcast Hot Normal Weak yes
    14 Rainy Mild High Strong no
    end

    下面是ID3的部分程序,还没有写完,慢慢再补。

      1 #include <iostream>
      2 #include <string>
      3 #include <cstring>
      4 #include <vector>
      5 #include <list>
      6 #include <map>
      7 #include <algorithm>
      8 #include <cstdlib>
      9 #include <cstdio>
     10 #include <cmath>
     11 
     12 using namespace std;
     13 
     /// One node of the decision tree; nodes live in the global `node` array
     /// and refer to each other by array index.
     /// All members default-construct to empty, so no constructor is needed.
     struct Node
     {
         std::vector<int> next;  // indices (into the node array) of this node's children
         std::string attr;       // attribute value of the branch that leads to this node
         std::string ans;        // class label when the node is a leaf, otherwise empty
     };
     22 
     23 const string yes = "yes";
     24 const string no = "no";
     25 const int attribute_name_size = 6;
     26 vector< vector<string> > data; //day weather temperature humidity wind play_or_not
     27 Node node[1000];
     28 int cnt_of_node = 0;
     29 
     30 void input()
     31 {
     32     string str;
     33     vector<string> tmp;
     34     while (cin >> str && str != "end")
     35     {
     36         tmp.push_back(str);
     37         for (int i = 0; i < attribute_name_size-1; ++i)
     38         {
     39             cin >> str;
     40             tmp.push_back(str);            
     41         }
     42         data.push_back(tmp);
     43         tmp.clear();
     44     }
     45 }
     46 
     /**
      * Base-2 entropy of the values found in the column(s) whose header
      * equals `element`.
      *
      * @param vec      table whose row 0 is the header and rows 1.. are samples
      * @param element  header name of the column to measure
      * @return -1 if `vec` has no rows at all; otherwise the entropy, which is
      *         0 when no header matches or when there are no sample rows.
      */
     double calcEntropy(std::vector<std::vector<std::string> >& vec, std::string element)
     {
         if (vec.empty()) return -1;

         // Tally how often each distinct value occurs in the matching column.
         std::map<std::string, int> tally;
         const std::vector<std::string>& header = vec[0];
         for (std::size_t col = 0; col < header.size(); ++col)
         {
             if (header[col] != element) continue;
             for (std::size_t row = 1; row < vec.size(); ++row)
                 ++tally[vec[row][col]];
         }

         // The header row is not a sample.
         const double samples = vec.size() - 1;

         double entropy = 0;
         std::map<std::string, int>::const_iterator it;
         for (it = tally.begin(); it != tally.end(); ++it)
         {
             const double p = it->second / samples;
             entropy -= p * std::log2(p);
         }
         return entropy;
     }
     66 
     /**
      * Expected class entropy after splitting the table on column `idx`:
      * the sum over each distinct value v of that column of
      * (|S_v| / |S|) * H(S_v), where H is the base-2 entropy of the class
      * labels (last column) within the rows having value v.
      *
      * The earlier version used integer division throughout (producing 0 or
      * NaN), overwrote the result for every value instead of accumulating
      * it, and had the sign flipped. Labels are now counted generically, so
      * any set of class labels works, not only "yes"/"no".
      *
      * @param vec  table whose row 0 is the header and rows 1.. are samples;
      *             the last column holds the class label
      * @param idx  index of the attribute column to split on
      * @return -1 if there are no sample rows or `idx` is out of range;
      *         otherwise the weighted entropy (always >= 0).
      */
     double calcInfo(std::vector<std::vector<std::string> >& vec, int idx)
     {
         if (vec.size() <= 1) return -1;
         const int len = static_cast<int>(vec[0].size());
         if (idx < 0 || idx >= len) return -1;

         // counts[attribute value][class label] = number of matching rows
         std::map<std::string, std::map<std::string, int> > counts;
         for (std::size_t row = 1; row < vec.size(); ++row)
             ++counts[vec[row][idx]][vec[row][len - 1]];

         const double total = static_cast<double>(vec.size() - 1);
         double info = 0;
         std::map<std::string, std::map<std::string, int> >::const_iterator group;
         for (group = counts.begin(); group != counts.end(); ++group)
         {
             const std::map<std::string, int>& labels = group->second;
             std::map<std::string, int>::const_iterator label;

             int subset_size = 0;
             for (label = labels.begin(); label != labels.end(); ++label)
                 subset_size += label->second;

             // Every stored count is >= 1, so p > 0 and log2(p) is finite.
             double subset_entropy = 0;
             for (label = labels.begin(); label != labels.end(); ++label)
             {
                 const double p = static_cast<double>(label->second) / subset_size;
                 subset_entropy -= p * std::log2(p);
             }

             info += (subset_size / total) * subset_entropy;
         }
         return info;
     }
     89 
     90 int findBestAttribute(vector<vector<string> >& tmp)
     91 {
     92     if (tmp.size() <= 1) return -1;
     93     int len = tmp[0].size();
     94     string result = tmp[0][len-1];
     95     vector<double> v;
     96     double info_result = calcEntropy(tmp, result);
     97     for (int i = 0; i < len; ++i)
     98         v.push_back(calcInfo(tmp, i));
     99     double max_info_gain = 0;
    100     int idx = 0;
    101     for (int i = 0; i < v.size(); ++i)
    102     {
    103         if (info_result-v[i] > max_info_gain)
    104             max_info_gain = info_result-v[idx=i];
    105     }
    106     return idx;
    107 }
    108 
    109 void work(vector< vector<string> >& source, int now_node_num)
    110 {
    111     int idx = 0;
    112     idx = findBestAttribute(source);
    113     vector<int> vis(source.size(), 0);
    114     vector<string> attribute_tmp;
    115     for (int i = 0; i < source[0].size(); ++i)
    116         if (i != idx) attribute_tmp.push_back(source[0][i]);
    117     int len = source[0].size();
    118     for (int i = 1; i < source.size(); ++i)
    119     {
    120         if (vis[i]) continue;
    121         map<string, int> mp;
    122         for (int j = i; j < source.size(); ++j)
    123         {
    124             if (source[j][idx] == source[i][idx])
    125             {
    126                 mp[source[j][len-1]]++;
    127                 vis[j] = 1;
    128             }
    129         }
    130         node[now_node_num].next.push_back(++cnt_of_node);
    131         node[cnt_of_node].attr = source[i][idx];
    132         if (mp.size() == 1)
    133         {
    134             node[cnt_of_node].ans = source[i][len-1];    
    135             node[cnt_of_node].next.clear();
    136         }
    137         else
    138         {
    139             vector<vector<string> > vs;
    140             for (int j = 0; j < source.size(); ++j)
    141             {
    142                 vector<string> tmp;
    143                 for (int k = 0; k < source[0].size(); ++k)
    144                 {
    145                     if (k == idx) continue;
    146                     tmp.push_back(source[j][k]);    
    147                 }
    148                 vs.push_back(tmp);
    149             }
    150             work(vs, cnt_of_node);
    151         }
    152     }
    153 }
    154 
    155 void outputSourceData()
    156 {
    157     for (int i = 0; i < data.size(); ++i)
    158     {
    159         for (int j = 0; j < data[i].size(); ++j)
    160             cout << data[i][j] << '	';
    161         cout << endl;
    162     }
    163 }
    164 
    // Program entry point. Still a placeholder: it does no work and
    // reports success immediately.
    int main()
    {
        return EXIT_SUCCESS;
    }
    ID3
  • 相关阅读:
    Java静态类
    【Java TCP/IP Socket】深入剖析socket——TCP套接字的生命周期
    【Java TCP/IP Socket】深入剖析socket——TCP通信中由于底层队列填满而造成的死锁问题(含代码)
    【Java TCP/IP Socket】深入剖析socket——数据传输的底层实现
    【Java TCP/IP Socket】基于NIO的TCP通信(含代码)
    【Java TCP/IP Socket】Java NIO Socket VS 标准IO Socket
    【Java TCP/IP Socket】TCP Socket通信中由read返回值造成的的死锁问题(含代码)
    数据结构课后练习题(练习三)7-5 Tree Traversals Again (25 分)
    快速排序详解(lomuto划分快排,hoare划分快排,classic经典快排,dualpivot双轴快排源码)
    Java多线程(一)——线程基础和锁锁锁
  • 原文地址:https://www.cnblogs.com/JustForCS/p/4885233.html
Copyright © 2011-2022 走看看