zoukankan      html  css  js  c++  java
  • python数据分析-数据处理

    数据导入:

     导入csv
    # Load a CSV file into a DataFrame.
    from pandas import read_csv

    df = read_csv(filepath_or_buffer='D://PA//4.1//1.csv')
    导入文本,要转成UTF-8无BOM格式:
    # Load a delimited text file into a DataFrame.
    from pandas import read_table

    df = read_table(filepath_or_buffer='D://PA//4.1//2.txt')
    导入Excel
    # Load the first sheet of an Excel workbook into a DataFrame.
    from pandas import read_excel

    df = read_excel(io='C:/PA/4.1/3.xlsx')
    View Code

    数据导出:

    数据的导出:

    # Build a small DataFrame and write it to CSV twice:
    # first with the row index, then without it.
    from pandas import DataFrame

    df = DataFrame(
        {
            'age': [21, 22, 23],
            'name': ['KEN', 'John', 'JIMI'],
        }
    )

    # Default export: the row index is written as the first column.
    df.to_csv("c:/PA/4.1/df.csv")

    # Omit the index column from the output file.
    df.to_csv("c:/PA/4.1/df.csv", index=False)
    View Code

    重复值处理:

    # Remove fully duplicated rows, keeping the first occurrence of each.
    from pandas import read_csv

    df = read_csv(filepath_or_buffer='C:/PA/4.1/data.csv')

    # drop_duplicates returns a new frame; df itself is left untouched.
    newDF = df.drop_duplicates()
    View Code

    缺失值处理:

    # Remove every row that contains at least one missing value.
    from pandas import read_csv

    df = read_csv(filepath_or_buffer='C:/PA/4.4/data.csv')

    # dropna returns a new frame; df itself is left untouched.
    newDF = df.dropna()
    View Code

    空格值处理:

    # Trim leading and trailing whitespace from the "name" column.
    from pandas import read_csv

    df = read_csv(filepath_or_buffer='C:/PA/4.5/data.csv')

    # Compute the cleaned column first, then write it back into the frame.
    newDF = df["name"].str.strip()
    df["name"] = newDF
    View Code

    字段抽取:

    astype(str) 转换成字符型数据,以便于处理。

    # Cut a phone-number column into three fixed-width pieces.
    from pandas import read_csv

    df = read_csv(filepath_or_buffer='C:/PA/4.6/data.csv')

    # The .str accessor only works on strings, so convert the column first.
    df["tel"] = df["tel"].astype(str)

    # Characters 0-2, 3-6 and 7-10 of each value, respectively.
    bands = df["tel"].str[0:3]
    areas = df["tel"].str[3:7]
    numbs = df["tel"].str[7:11]
    View Code

    字段拆分:

    # Split the "name" column into two columns at the first space.
    from pandas import read_csv

    df = read_csv("C:/PA/4.7/data.csv")

    # n=1 limits the split to the first space; expand=True returns a DataFrame
    # with one column per piece. Both must be passed by keyword: pandas 2.0+
    # accepts only `pat` positionally and raises TypeError otherwise.
    newDF = df["name"].str.split(" ", n=1, expand=True)
    newDF.columns = ["band", "name"]
    View Code

    记录抽取:

    # Examples of selecting rows with boolean masks.
    import pandas
    from pandas import read_csv

    df = read_csv("C:/PA/4.8/data.csv", sep="|")

    # Rows with more than 1000 comments.
    df[df["comments"] > 1000]
    # Rows whose comment count lies in [1000, 10000] (both ends inclusive).
    df[df["comments"].between(1000, 10000)]
    # Rows whose title is missing.
    df[pandas.isnull(df["title"])]
    # Rows whose title contains the given text; missing titles count as no match.
    df[df["title"].str.contains("台电", na=False)]
    # Same range filter as above, written as two combined comparisons.
    df[(df["comments"] >= 1000) & (df["comments"] <= 10000)]
    View Code

    随机抽样:

    # Pick three rows at random (with replacement).
    import numpy
    from pandas import read_csv

    df = read_csv("C:/PA/4.9/data.csv")

    # Draw positions from the whole frame instead of a hard-coded 0..9 range,
    # which ignored later rows and failed on frames with fewer than 10 rows.
    r = numpy.random.randint(0, len(df), 3)
    # Use positional indexing so the lookup does not depend on index labels.
    # (df.sample(n=3) is the built-in alternative for sampling without
    # replacement.)
    df.iloc[r, :]
    View Code

    记录合并 :

    # Stack three files with the same layout into one DataFrame.
    import pandas
    from pandas import read_csv

    # Read the three pipe-separated parts in order.
    df1, df2, df3 = (
        read_csv(part, sep="|")
        for part in (
            "C:/PA/4.10/data1.csv",
            "C:/PA/4.10/data2.csv",
            "C:/PA/4.10/data3.csv",
        )
    )

    # Rows are appended in list order; each part keeps its own index labels.
    df = pandas.concat([df1, df2, df3])
    View Code

    字段合并:

    # Concatenate three columns into a single phone-number string.
    from pandas import read_csv

    df = read_csv("C:/PA/4.11/data.csv", sep=" ", names=['band', 'area', 'num'])

    # "+" only concatenates when every column holds strings, so convert first.
    df = df.astype(str)

    tel = df['band'] + df['area'] + df['num']
    View Code

    字段匹配:

    # Join item details with prices on their shared "id" column.
    import pandas
    from pandas import read_csv

    item = read_csv("C:/PA/4.12/data1.csv", sep="|", names=["id", "comments", "title"])

    # NOTE(review): this reads the same data1.csv as `item` but with different
    # column names -- presumably a copy-paste slip for a separate prices file;
    # confirm the intended filename.
    prices = read_csv("C:/PA/4.12/data1.csv", sep="|", names=["id", "oldprice", "newprice"])

    # Default inner join: only ids present in both frames are kept.
    itemprices = pandas.merge(item, prices, on="id")
    View Code

    简单计算:

    # Derive a "sum" column as price times quantity.
    import pandas
    from pandas import read_csv

    df = read_csv("C:/PA/4.13/data.csv", sep="|")

    # Element-wise product of the two columns, row by row.
    result = df["price"] * df["num"]
    df["sum"] = result
    View Code

    数据标准化:

    # Min-max normalisation: rescale "score" linearly by its min and max.
    import pandas
    from pandas import read_csv

    df = read_csv("C:/PA/4.14/data.csv")

    # (value - min) / (max - min)
    scale = (df["score"] - df["score"].min()) / (
        df["score"].max() - df["score"].min()
    )
    View Code

    数据分组:

    # Bucket the "cost" column into fixed-width ranges with pandas.cut.
    import pandas
    from pandas import read_csv

    # Forward slashes avoid backslash escapes: in "C:\PA\4.15\data.csv" the
    # "\4" is an octal escape (U+0004), so the original path was corrupted.
    df = read_csv("C:/PA/4.15/data.csv", sep='|')

    # Outer edges are padded by 1 so the smallest and largest values fall
    # inside a bin. NOTE(review): pandas.cut requires strictly increasing
    # edges, so this assumes the data spans below 20 and above 100 -- confirm.
    bins = [min(df.cost) - 1, 20, 40, 60, 80, 100, max(df.cost) + 1]

    labels = ['20以下', '20到40', '40到60', '60到80', '80到100', '100以上']

    # Default: intervals are closed on the right, (a, b].
    pandas.cut(df.cost, bins)

    # right=False: intervals are closed on the left, [a, b).
    pandas.cut(df.cost, bins, right=False)

    # Same left-closed bins, shown with readable labels instead of intervals.
    pandas.cut(df.cost, bins, right=False, labels=labels)
    View Code

    日期转换:

    # Parse a text date column into datetime values.
    import pandas
    from pandas import read_csv
    from pandas import to_datetime

    # Forward slashes avoid backslash escapes: in "C:\PA\4.16\data.csv" the
    # "\4" is an octal escape (U+0004), so the original path was corrupted.
    df = read_csv("C:/PA/4.16/data.csv", encoding="utf-8")

    # The explicit format tells the parser the text is year/month/day.
    df_dt = to_datetime(df["注册时间"], format="%Y/%m/%d")
    View Code

    日期格式化:

    # Parse a text date column, then render it back as day-month-year text.
    import pandas
    from pandas import read_csv
    from pandas import to_datetime

    # Forward slashes avoid backslash escapes: in "C:\PA\4.16\data.csv" the
    # "\4" is an octal escape (U+0004), so the original path was corrupted.
    df = read_csv("C:/PA/4.16/data.csv", encoding="utf-8")
    df_dt = to_datetime(df["注册时间"], format="%Y/%m/%d")

    # The original called the misspelled, never-imported name `datatime`
    # inside apply(), which raises NameError. The vectorised .dt accessor
    # formats the whole column without any extra import.
    df_dt_str = df_dt.dt.strftime("%d-%m-%Y")
    View Code

    日期抽取:

    # Parse a text date column and read its individual components.
    import pandas
    from pandas import read_csv
    from pandas import to_datetime

    # Forward slashes avoid backslash escapes: in "C:\PA\4.18\data.csv" the
    # "\4" is an octal escape (U+0004), so the original path was corrupted.
    df = read_csv("C:/PA/4.18/data.csv", encoding="utf-8")
    df_dt = to_datetime(df["注册时间"], format="%Y/%m/%d")

    # Each .dt attribute yields a Series with that component for every row.
    df_dt.dt.year
    df_dt.dt.second
    df_dt.dt.minute
    df_dt.dt.hour
    df_dt.dt.day
    df_dt.dt.month
    # Day of the week as an integer, Monday=0 through Sunday=6.
    df_dt.dt.weekday
    View Code
  • 相关阅读:
    VS中的路径宏
    Eigen3
    Python3.6 import源文件与编译文件的关系
    使用C语言扩展Python3
    mysql.connector 事务总结
    C++ -- STL泛型编程(一)之vector
    JSP -- include指令与include动作的区别
    Rails -- 关于Migration
    ruby -- 进阶学习(八)自定义方法route配置
    ruby -- 进阶学习(七)strong parameters之permitted.has_key
  • 原文地址:https://www.cnblogs.com/qiuyuyu/p/9144034.html
Copyright © 2011-2022 走看看