zoukankan      html  css  js  c++  java
  • python数据预处理

    缺失值处理

    import pandas as pda
    import numpy as npy
    import matplotlib.pylab as pyl
    # data=pda.read_excel("D:/taobao2.xls")
    def index(data):
        """Treat zero prices as missing and fill every missing cell.

        Args:
            data: Row-oriented table (for example a list of lists). The first
                row holds the column names and the remaining rows hold the
                records. A column named ``价格`` (price) must be present.

        Returns:
            The cleaned ``DataFrame``. Prices equal to 0 are converted to
            missing values, then every missing cell in every column is
            replaced with the mean of the remaining prices. The frame is also
            printed before and after the zero-to-missing step, followed by
            the total number of cells that were filled.
        """
        data = pda.DataFrame(data[1:], columns=data[0])
        print(data)

        # A price of 0 means "no price recorded". Series.mask builds a new
        # column instead of writing through data["价格"][...]: that chained
        # form may assign into a temporary copy and leave `data` untouched.
        data["价格"] = data["价格"].mask(data["价格"] == 0)
        print(data)

        # Computed once; mean() skips missing values, and inserting the mean
        # itself does not move it, so recomputing per cell is unnecessary.
        fill_value = data["价格"].mean()

        filled = 0
        for column in data.columns:
            missing = data[column].isnull()
            missing_count = int(missing.sum())
            if missing_count:
                # NOTE(review): non-price columns are also filled with the
                # mean price, as before — confirm this is intended.
                data.loc[missing, column] = fill_value
                filled += missing_count
        print(filled)
        return data
      
    # Script entry point: when run directly, pass the result of
    # nosupervision_read_data() to index().
    # NOTE(review): nosupervision_read_data is neither defined nor imported in
    # the visible code — confirm the runtime environment provides it.
    if __name__ == "__main__":
      data = nosupervision_read_data()
      index(data)
    

    数据离散化处理

    #离散化
    #连续型数据离散化
    #等宽离散化
    import pandas as pda
    import numpy as npy
    import matplotlib.pylab as pyl
    # data=pda.read_excel("D:/taobao2.xls")
    def index(data):
        """Discretize the values of the third column in two ways.

        Args:
            data: Row-oriented table (for example a list of lists). The first
                row holds the column names and the remaining rows hold the
                records. The column at position 2 must be numeric
                (assumed to be the price — TODO confirm against the caller).

        Returns:
            A tuple ``(equal_width, fixed_bins)`` of pandas ``Categorical``
            objects, both aligned with the values sorted in ascending order:
            ``equal_width`` uses five equal-width bins, ``fixed_bins`` uses
            the edges 0 / 50 / 100 / maximum. With ``pandas.cut`` defaults the
            intervals are right-closed, so values <= 0 fall outside
            ``fixed_bins`` and come back as missing.
        """
        data = pda.DataFrame(data[1:], columns=data[0])

        # Work on a sorted numeric copy. Sorting a slice of `data.values` in
        # place can reorder one column of the frame independently of the
        # others (when .values is a view) or fail on a read-only view.
        raw_price = pda.to_numeric(data.iloc[:, 2]).to_numpy(dtype=float)
        price = npy.sort(raw_price)
        print(price)

        # Equal-width discretization into five bins.
        k = 5
        c1 = pda.cut(price, k, labels=["太便宜", "便宜", "适中", "贵", "太贵"])
        print(c1)

        # Discretization with explicitly specified intervals. Bin edges must
        # be strictly increasing, so when the largest value does not exceed
        # 100 (or is NaN) the top edge becomes open-ended instead.
        top_edge = price.max()
        if not top_edge > 100:
            top_edge = npy.inf
        k = [0, 50, 100, top_edge]
        print(k)
        c2 = pda.cut(price, k, labels=["非常便宜", "适中", "贵"])
        print(c2)
        return c1, c2
    # Script entry point: when run directly, pass the result of
    # nosupervision_read_data() to index().
    # NOTE(review): nosupervision_read_data is neither defined nor imported in
    # the visible code — confirm the runtime environment provides it.
    if __name__ == "__main__":
       data = nosupervision_read_data()
       index(data)
    

    数据集成处理

    # -*- coding:utf-8 -*-
    # Data integration (this snippet concatenates row slices; it does not handle outliers)
    import pandas as pda
    import numpy as npy
    def index(data):
        """Concatenate the first two ten-row slices of the table.

        Args:
            data: Row-oriented table (for example a list of lists). The first
                row holds the column names and the remaining rows hold the
                records.

        Returns:
            A dict with the single key ``'data_数据集成'`` mapping to the
            combined rows (records 0-9 followed by records 10-19) as a list
            of lists. The two slices, the combined table and the dict are
            printed as well.
        """
        frame = pda.DataFrame(data[1:], columns=data[0])
        records = frame.values

        # Data integration: take two consecutive slices and stack them.
        first_part = records[:10]
        second_part = records[10:20]
        combined = npy.concatenate((first_part, second_part))

        # The result must be handed back as a dict.
        output = {'data_数据集成': pda.DataFrame(combined).values.tolist()}

        for part in (first_part, second_part, combined):
            print(pda.DataFrame(part))
        print(output)
        return output
    # Script entry point: when run directly, pass the result of
    # nosupervision_read_data() to index().
    # NOTE(review): nosupervision_read_data is neither defined nor imported in
    # the visible code — confirm the runtime environment provides it.
    if __name__ == "__main__":
       data = nosupervision_read_data()
       index(data)
    

      

  • 相关阅读:
    python3爬虫 -----新浪微博(m)-------评论爬取
    hdu 5585
    Atcoder 092
    python3糗事爬取-------------------糗事百科
    python3爬虫 -----爬取职位招聘信息-------from腾讯社会招聘
    python3电影详细信息爬取-------------------电影天堂
    Python词云分析
    合并排序
    活动安排问题
    你好,2019!
  • 原文地址:https://www.cnblogs.com/wei23/p/10890609.html
Copyright © 2011-2022 走看看