zoukankan      html  css  js  c++  java
  • c++ 数据预处理(数据去噪,归一化)

    正态分布3σ原则:把落在均值±3倍标准差(σ)之外的点视为噪声数据并予以排除。

    归一化,将数据经过处理之后限定到一定的范围内,一般都会将数据限定到[0,1]。

    #include <iostream>
    #include <string>
    #include <vector>
    #include <algorithm>
    #include <numeric>
    #include <cmath>
    #include <fstream>
    #include <sstream>

    /**
     * Reads a whitespace-separated table of values from a text file.
     *
     * Every line becomes one row. Values are extracted from the line with
     * operator>> until extraction fails, so a line holding only whitespace
     * yields an empty row. Reading stops at end of file or at the first
     * completely empty line, whichever comes first; anything after an empty
     * line is not read. If the file cannot be opened, lines_feat is left empty.
     *
     * @param filename   path of the file to read
     * @param lines_feat output table; cleared first, then one vector is
     *                   appended per line read
     */
    template <class DataType>
    void ReadDataFromFile(const std::string &filename, std::vector<std::vector<DataType> > &lines_feat) {
      lines_feat.clear();

      std::ifstream vm_info(filename.c_str());
      std::string line;

      // Drive the loop with the stream state returned by getline rather than
      // polling eof(): eof() only turns true after a read has already failed,
      // and it never turns true for a stream that failed to open.
      while (std::getline(vm_info, line) && !line.empty()) {
        std::istringstream tokens(line);
        std::vector<DataType> row;
        DataType value;

        while (tokens >> value) {
          row.push_back(value);
        }
        lines_feat.push_back(row);
      }
    }

    /**
     * Writes a 2D vector to std::cout.
     *
     * Prints a header containing the number of rows, then every element of
     * every row followed by a single space, one extra space after each row,
     * and finally a closing banner.
     *
     * @param vv the table to print; it is only read
     */
    template <class DataType>
    void Display2DVector(std::vector<std::vector<DataType> > &vv) {
      typedef typename std::vector<std::vector<DataType> >::const_iterator RowIter;

      const size_t row_count = vv.size();
      std::cout << "the total rows of 2d vector_data: " << row_count << " ";

      for (RowIter row = vv.begin(); row != vv.end(); ++row) {
        const std::vector<DataType> &cells = *row;
        for (size_t col = 0; col < cells.size(); ++col) {
          std::cout << cells[col] << " ";
        }
        // Row separator.
        std::cout << " ";
      }

      std::cout << "--------the end of the Display2DVector()-------- ";
    }

    /**
     * Removes noise with the three-sigma rule and min-max normalizes the rest,
     * column by column, printing the results to std::cout.
     *
     * For each of the first column_count columns the population mean and
     * standard deviation are computed; a value is kept when it lies strictly
     * inside (mean - 3*sigma, mean + 3*sigma). Output is written in two passes
     * over the columns:
     *   1. per column: every kept value followed by " ", then one extra " ";
     *   2. per column: (value - min) / (max - min) for every kept value, in
     *      the same layout, where min/max are taken over the kept values.
     *
     * Edge cases:
     *   - Rows with fewer than column_count values are ignored entirely.
     *   - If no usable row remains (or column_count is 0) nothing is printed.
     *   - A column whose values are all identical (sigma == 0) keeps all of
     *     its values instead of rejecting every one of them.
     *   - When max == min for a column, its normalized values are printed as 0
     *     rather than dividing by zero.
     *
     * @param vv           input table; it is only read
     * @param column_count number of leading columns to process (default 3)
     */
    template <class DataType>
    void ProcessVector(std::vector<std::vector<DataType> > &vv, size_t column_count = 3) {
      // Indices of the rows wide enough to supply every processed column.
      std::vector<size_t> sample_rows;
      sample_rows.reserve(vv.size());
      for (size_t i = 0; i < vv.size(); ++i) {
        if (vv[i].size() >= column_count) {
          sample_rows.push_back(i);
        }
      }
      if (column_count == 0 || sample_rows.empty()) {
        return;
      }
      const double sample_count = static_cast<double>(sample_rows.size());

      // Kept values stay in DataType so pass 1 prints them in their original
      // format; min/max are tracked as double for the normalization.
      std::vector<std::vector<DataType> > kept(column_count);
      std::vector<double> min_value(column_count, 0.0);
      std::vector<double> max_value(column_count, 0.0);

      // Pass 1: statistics, outlier rejection, and printing of the kept values.
      for (size_t j = 0; j < column_count; ++j) {
        // Accumulate in double so fractional data is not truncated.
        double sum = 0.0;
        for (size_t k = 0; k < sample_rows.size(); ++k) {
          sum += static_cast<double>(vv[sample_rows[k]][j]);
        }
        const double mean = sum / sample_count;

        double squared_deviation = 0.0;
        for (size_t k = 0; k < sample_rows.size(); ++k) {
          const double deviation = static_cast<double>(vv[sample_rows[k]][j]) - mean;
          squared_deviation += deviation * deviation;
        }
        const double sigma = std::sqrt(squared_deviation / sample_count);

        const double lower = mean - 3 * sigma;
        const double upper = mean + 3 * sigma;
        for (size_t k = 0; k < sample_rows.size(); ++k) {
          const DataType &value = vv[sample_rows[k]][j];
          // With sigma == 0 every value equals the mean; the strict bounds
          // would otherwise reject the whole column.
          const bool inlier = (sigma == 0.0) || (value > lower && value < upper);
          if (!inlier) {
            continue;
          }

          const double as_double = static_cast<double>(value);
          if (kept[j].empty() || as_double < min_value[j]) {
            min_value[j] = as_double;
          }
          if (kept[j].empty() || as_double > max_value[j]) {
            max_value[j] = as_double;
          }
          std::cout << value << " ";
          kept[j].push_back(value);
        }
        std::cout << " ";
      }

      // Pass 2: min-max normalization of the kept values.
      for (size_t j = 0; j < column_count; ++j) {
        const double range = max_value[j] - min_value[j];
        for (size_t k = 0; k < kept[j].size(); ++k) {
          const double normalized =
              (range == 0.0) ? 0.0 : (static_cast<double>(kept[j][k]) - min_value[j]) / range;
          std::cout << normalized << " ";
        }
        std::cout << " ";
      }
    }

    int main() {
      // Whitespace-separated integer table, one sample per line.
      std::string input_path("vm.data");
      std::vector<std::vector<int> > samples;

      // Step 1: load the file into a 2D vector.
      ReadDataFromFile(input_path, samples);

      // Step 2: echo the raw data.
      Display2DVector(samples);

      // Step 3: drop outliers and print the normalized values.
      ProcessVector(samples);

      std::cout << "--------The end of main()-------- ";
      return 0;
    }

    源数据如下(cat vm.data):

    19 26 63
    13 62 65
    16 69 15
    14 56 17
    19 6 15
    11 42 15
    18 58 36
    12 77 33
    10 75 47
    15 54 70
    10017 1421077 4196

  • 相关阅读:
    解决hadoop中 bin/hadoop fs -ls ls: `.': No such file or directory问题
    ERROR namenode.NameNode: Failed to start namenode. java.lang.IllegalArgument
    org.springframework.beans.factory.BeanDefinitionStoreException: Failed to read candidate component class: file [/Users/lonecloud/tomcat/apache-tomcat-7.0.70 2/webapps/myproject/WEB-INF/classes/cn/lone
    创建Maven web工程不能解析EL表达式的解决办法
    mac中的myeclipse的控制台中文乱码问题解决办法
    Java采用内部构造器Builder模式进行对类进行构建
    java定时器的使用(Timer)
    传统的线程技术
    线程的理解
    Ibatis学习总结7--SqlMapClient 执行 SQL 语句
  • 原文地址:https://www.cnblogs.com/donggongdechen/p/9510077.html
Copyright © 2011-2022 走看看