zoukankan      html  css  js  c++  java
  • SAS | 数据EDA及代码

    先构造一个数据集,代码如下:

    /* Assign the permanent library used by the EDA steps that follow.
       The directory must already exist; adjust the path to your machine.
       (The backslashes are required: without them the "\t" of "\test"
       degenerates into a literal tab inside the path.) */
    libname zdata "D:\data\python_file\test";

    /* Build the sample transactions table.
       The rows are written to BOTH WORK.Transactions and zdata.Transactions,
       because later steps read the table under each of those names.
       Fields in the inline data are separated by plain blanks: list input
       does not treat tab characters as delimiters by default. */
    data Transactions zdata.Transactions;
        informat TransDate yymmdd10.;   /* read dates written as yyyy-mm-dd   */
        format   TransDate yymmdd10.;   /* display dates the same way         */
        input CustomerID TransDate Amount AccountType $;
        datalines;
    1  2008-01-01   114.56  Savings
    1  2008-02-01    56.78  Checking
    1  2008-03-01   359.31  Savings
    1  2008-01-04    89.56  Checking
    1  2008-05-01  1000.00  Savings
    1  2008-08-01  1200.00  Checking
    2  2008-01-02   122.51  Savings
    2  2008-12-01    42.07  Checking
    2  2008-11-01   146.30  Savings
    2  2008-07-13  1254.48  Checking
    2  2008-02-11   400.00  Savings
    2  2008-01-01   500.00  Checking
    ;
    run;
    

    然后对数据集进行EDA分析,过程和代码如下:

    /* 1. Table-level overview: print the descriptor information
          of zdata.Transactions. */
    proc contents
        data = zdata.Transactions;
    run;
    
    /* 2. Row-count check */
    /* Compares the total number of rows (count1) with the number of distinct
       CustomerID values (count2); the two are equal only when CustomerID is
       unique per row. The result is stored in table a. */
    proc sql;
        create table a as
            select count(*)                   as count1,
                   count(distinct CustomerID) as count2
              from zdata.Transactions;
    quit;
    
    /* 3. Nominal variable statistics */
    /* Frequency counts for the nominal variables of the dataset:
       three one-way tables followed by one cross-tabulation
       (CustomerID by AccountType). */
    proc freq data=zdata.Transactions;
        /* yyq6. buckets the dates by year and quarter; swap in a month-level
           date format to tabulate by month instead. */
        format TransDate yyq6.;
        tables CustomerID AccountType TransDate CustomerID*AccountType;
    run;
    
    /* 4. Numeric variable statistics */
    /* Both steps use BY CustomerID, which needs the input ordered by
       CustomerID. The sample rows are entered in that order; sort first
       if the data is loaded from anywhere else. */

    /* Printed report: count, missing count, percentiles, range, mean and
       standard deviation of Amount for each customer. */
    proc means data=zdata.Transactions N NMISS P1 P10 P25 P50 P75 P90 MIN MAX MEAN STD;
        var Amount;
        by CustomerID;
    run;

    /* Same grouping, but written to dataset d instead of being printed
       (NOPRINT): Amount1 = sum of Amount, Amount2 = minimum of Amount. */
    proc means data=zdata.Transactions noprint;
        by CustomerID;
        output out=d SUM(Amount)=Amount1 MIN(Amount)=Amount2;
    run;
    
    /* 5. Sorting (both steps reorder zdata.Transactions in place) */

    /* Order by CustomerID; ascending is the default direction. */
    proc sort
        data = zdata.Transactions;
        by CustomerID;
    run;

    /* Order by Amount ascending, then by CustomerID descending:
       DESCENDING applies only to the variable that follows it. */
    proc sort
        data = zdata.Transactions;
        by Amount
           descending CustomerID;
    run;
    
    /* 6. SET: reads datasets row by row (listing several datasets would
          stack them vertically). Here a single table is copied, keeping
          only the CustomerID and Amount columns. */
    data zdata.Transactions_11;
        set zdata.Transactions(keep=CustomerID Amount);
    run;
    
    /* 7. MERGE: joining datasets side by side */

    /* Keep only the rows of b1 that belong to customer 1.
       NOTE(review): b1 is not created anywhere in the visible script; it
       must already exist in WORK before this step runs - confirm which
       table was intended. */
    data h1;
        set b1;
        if not (CustomerID = 1) then delete;
    run;

    /* Both inputs are sorted by the key first: a MERGE only needs sorted
       input when it is combined with a BY statement, as it is below. */
    proc sort data=Transactions; by CustomerID; run;
    proc sort data=h1;           by CustomerID; run;

    /* Match-merge the two tables on CustomerID into h. */
    data h;
        merge Transactions
              h1;
        by CustomerID;
    run;
    
    /* Note: a MERGE behaves like a SQL left join / right join only when it
       is restricted with a condition on the IN= flags. */
    data i;
        merge Transactions(in=a)
              h1(in=b);
        by CustomerID;
        /* drop every BY group that has no contribution from h1,
           i.e. keep only the ids present in h1 */
        if not b then delete;
    run;
    
    /* SQL counterpart of the restricted MERGE (rebuilds table i).
       NOTE(review): the two forms match when CustomerID is unique in at
       least one of the tables; with repeated keys on both sides the join
       pairs every combination, which a data-step MERGE does not - verify
       against the actual contents of h1. */
    proc sql;
        create table i as
            select *
              from h1 as tb1
                   left join Transactions as tb2
                     on tb1.CustomerID = tb2.CustomerID;
    quit;
    
    /* 8. Export a dataset */
    /* Writes table i to a CSV file, overwriting an existing file (REPLACE).
       The leading part of the file name is a placeholder for a real
       directory - replace it before running. The backslash before
       "test.csv" is required; without it the name contains a literal tab. */
    proc export data=i
        outfile='路径\test.csv'
        dbms=csv
        replace;
    run;
    
    
  • 相关阅读:
    Tomcat6.0 sqlServer2000 配置连接池操作
    SQL GROUP BY 实例
    Java 获取当前系统时间 格式:yyyyMMdd HH:mm:ss
    银行科技与业务融合之道
    银行IT部门科技管理流程管控工作发展之路
    银行科技管理工作优化提升之我见
    事务脚本的缺点以及领域模型的优点
    异常的分级分类与处理策略
    软件高性能的思考
    软件行业的一个发展推力就是不断提高用来构造软件的基础元素,也就是所谓的编程模型
  • 原文地址:https://www.cnblogs.com/1k-yang/p/12551030.html
Copyright © 2011-2022 走看看