zoukankan      html  css  js  c++  java
  • 吴裕雄 数据挖掘与分析案例实战(4)——python数据处理工具:Pandas

    # 导入模块
    import pandas as pd
    import numpy as np

    # 构造序列
    gdp1 = pd.Series([2.8,3.01,8.99,8.59,5.18])
    print(gdp1)
    # 取出gdp1中的第一、第四和第五个元素
    print('行号风格的序列: ',gdp1[[0,3,4]])
    # 数学函数--取对数
    print('通过numpy函数: ',np.log(gdp1))
    # 平均gdp
    print('通过numpy函数: ',np.mean(gdp1))
    print('通过序列的方法: ',gdp1.mean())

    gdp2 = pd.Series({'北京':2.8,'上海':3.01,'广东':8.99,'江苏':8.59,'浙江':5.18})
    print(gdp2)
    # 取出gdp2中的第一、第四和第五个元素
    print('行名称风格的序列: ',gdp2[[0,3,4]])
    # 取出gdp2中上海、江苏和浙江的GDP值
    print('行名称风格的序列: ',gdp2[['上海','江苏','浙江']])

    gdp3 = pd.Series(np.array((2.8,3.01,8.99,8.59,5.18)))
    print(gdp3)

    # 构造数据框
    df1 = pd.DataFrame([['张三',23,'男'],['李四',27,'女'],['王二',26,'女']])
    print('嵌套列表构造数据框: ',df1)

    df2 = pd.DataFrame({'姓名':['张三','李四','王二'],'年龄':[23,27,26],'性别':['男','女','女']})
    print('字典构造数据框: ',df2)

    df3 = pd.DataFrame(np.array([['张三',23,'男'],['李四',27,'女'],['王二',26,'女']]))
    print('二维数组构造数据框: ',df3)

    # 读取文本文件中的数据
    user_income = pd.read_table(r'F:\python_Data_analysis_and_mining\05\data_test01.txt', sep = ',',
    parse_dates={'birthday':[0,1,2]},skiprows=2, skipfooter=3,
    comment='#', encoding='utf8', thousands='&')
    print(user_income)

    child_cloth = pd.read_excel(io = r'F:\python_Data_analysis_and_mining\05\data_test02.xlsx', header = None,
    names = ['Prod_Id','Prod_Name','Prod_Color','Prod_Price'], converters = {0:str})
    print(child_cloth)

    # 读取电子表格数据
    pd.read_excel(io = r'C:UsersAdministratorDesktopdata_test02.xlsx', header = None,
    names = ['Prod_Id','Prod_Name','Prod_Color','Prod_Price'])

    # 导入模块
    import pymysql
    # 连接MySQL数据库
    conn = pymysql.connect(host='localhost', user='root', password='1q2w3e4r',
    database='test', port=3306, charset='utf8')
    # 读取数据
    user = pd.read_sql('select * from topy', conn)
    # 关闭连接
    conn.close()
    # 数据输出
    user

    # 导入第三方模块
    import pymssql
    # 连接SQL Server数据库
    connect = pymssql.connect(server = 'localhost', user = '', password = '',
    database = 'train', charset = 'utf8')
    # 读取数据
    data = pd.read_sql("select * from sec_buildings where direction = '朝南'", con=connect)
    # 关闭连接
    connect.close()
    # 数据输出
    data.head()

    import numpy as np
    import pandas as pd

    # 数据类型转换及描述统计
    # 数据读取
    sec_cars = pd.read_table(r'F:\python_Data_analysis_and_mining\05\sec_cars.csv', sep = ',')
    # 预览数据的前五行
    print(sec_cars.head())
    # 查看数据的行列数
    print('数据集的行列数: ',sec_cars.shape)
    # 查看数据集每个变量的数据类型
    print('各变量的数据类型: ',sec_cars.dtypes)
    # 修改二手车上牌时间的数据类型
    sec_cars.Boarding_time = pd.to_datetime(sec_cars.Boarding_time, format = '%Y年%m月')
    # 预览数据的前五行
    print(sec_cars.head())
    # 修改二手车新车价格的数据类型
    # sec_cars.New_price = sec_cars.New_price.str[:-1].astype('float')
    # 重新查看各变量数据类型
    print(sec_cars.dtypes)
    # 数据的描述性统计
    print(sec_cars.describe())
    # 数据的形状特征
    # 挑出所有数值型变量
    num_variables = sec_cars.columns[sec_cars.dtypes !='object'][1:]
    print(num_variables)

    # 自定义函数,计算偏度和峰度
    def skew_kurt(x):
    skewness = x.skew()
    kurtsis = x.kurt()
    # 返回偏度值和峰度值
    return pd.Series([skewness,kurtsis], index = ['Skew','Kurt'])

    # 运用apply方法
    print(sec_cars[num_variables].apply(func = skew_kurt, axis = 0))
    # 离散型变量的统计描述
    print()
    print(sec_cars.describe(include = ['object']))

    # 离散变量频次统计
    Freq = sec_cars.Discharge.value_counts()
    print(Freq)
    print(sec_cars.shape)
    print(sec_cars.shape[0])
    Freq_ratio = Freq/sec_cars.shape[0]
    print(Freq_ratio)

    Freq_df = pd.DataFrame({'Freq':Freq,'Freq_ratio':Freq_ratio})
    print(Freq_df.head())
    # 将行索引重设为变量
    Freq_df.reset_index(inplace = True)
    print(Freq_df.head())

    # 数据读入
    df = pd.read_excel(r'F:\python_Data_analysis_and_mining\05\data_test03.xlsx')
    # 各变量数据类型
    print(df.dtypes)
    # 将birthday变量转换为日期型
    df.birthday = pd.to_datetime(df.birthday, format = '%Y/%m/%d')
    print(df.birthday)
    # 将手机号转换为字符串
    df.tel = df.tel.astype('str')
    print(df.dtypes)
    print(df.birthday.dt.year)
    print(df.start_work.dt.year)
    # 新增年龄和工龄两列
    df['age'] = pd.datetime.today().year - df.birthday.dt.year
    df['workage'] = pd.datetime.today().year - df.start_work.dt.year
    print(df.head())
    # 将手机号中间四位隐藏起来
    df.tel = df.tel.apply(func = lambda x : x.replace(x[3:7], '****'))
    print(df.head())
    # 取出邮箱的域名
    df['email_domain'] = df.email.apply(func = lambda x : x.split('@')[1])
    print(df.head())
    # 取出用户的专业信息
    df['profession'] = df.other.str.findall('专业:(.*?),')
    print(df.head())
    # 去除birthday、start_work和other变量
    df.drop(['birthday','start_work','other'], axis = 1, inplace = True)
    print(df.head())

    # 常用日期处理方法
    dates = pd.to_datetime(pd.Series(['1989-8-18 13:14:55','1995-2-16']), format = '%Y-%m-%d %H:%M:%S')
    print('返回日期值: ',dates.dt.date)
    print('返回季度: ',dates.dt.quarter)
    print('返回几点钟: ',dates.dt.hour)
    print('返回年中的天: ',dates.dt.dayofyear)
    print('返回年中的周: ',dates.dt.weekofyear)
    print('返回星期几的名称: ',dates.dt.weekday_name)
    print('返回月份的天数: ',dates.dt.days_in_month)

    # 数据清洗
    # 数据读入
    df = pd.read_excel(r'F:\python_Data_analysis_and_mining\05\data_test04.xlsx')
    print(df)
    # 重复观测的检测
    print('数据集中是否存在重复观测: ',any(df.duplicated()))
    # 删除重复项
    df.drop_duplicates(inplace = True)
    print(df)

    # 数据读入
    df = pd.read_excel(r'F:\python_Data_analysis_and_mining\05\data_test05.xlsx')
    print(df)
    # 缺失观测的检测
    print('数据集中是否存在缺失值: ',any(df.isnull()))
    # 删除法之记录删除
    df.dropna()
    # print(df)
    # 删除法之变量删除
    df.drop('age', axis = 1)
    # print(df)
    # 替换法之前向替换
    df.fillna(method = 'ffill',inplace = True)
    # print(df)
    # 替换法之后向替换
    df.fillna(method = 'bfill',inplace = True)
    # print(df)
    # 替换法之常数替换
    df.fillna(value = 0)
    # print(df)
    # 替换法之统计值替换
    df = df.fillna(value = {'gender':df.gender.mode()[0], 'age':df.age.mean(), 'income':df.income.median()})
    print(df)

    # 数据读入
    sunspots = pd.read_table(r'F:\python_Data_analysis_and_mining5\sunspots.csv', sep = ',')
    print(sunspots.shape)
    # 异常值检测之标准差法
    xbar = sunspots.counts.mean()
    print(xbar)
    xstd = sunspots.counts.std()
    print(xstd)
    print('标准差法异常值上限检测: ',any(sunspots.counts > xbar + 2 * xstd))
    print('标准差法异常值下限检测: ',any(sunspots.counts < xbar - 2 * xstd))
    # 异常值检测之箱线图法
    Q1 = sunspots.counts.quantile(q = 0.25)
    print(Q1)
    Q3 = sunspots.counts.quantile(q = 0.75)
    print(Q3)
    IQR = Q3 - Q1
    print(IQR)
    print('箱线图法异常值上限检测: ',any(sunspots.counts > Q3 + 1.5 * IQR))
    print('箱线图法异常值下限检测: ',any(sunspots.counts < Q1 - 1.5 * IQR))

    # 导入绘图模块
    import matplotlib.pyplot as plt
    # 设置绘图风格
    plt.style.use('ggplot')
    # 绘制直方图
    sunspots.counts.plot(kind = 'hist', bins = 30, normed = True)
    # 绘制核密度图
    sunspots.counts.plot(kind = 'kde')
    # 图形展现
    plt.show()
    # 替换法处理异常值
    print('异常值替换前的数据统计特征: ',sunspots.counts.describe())
    # 箱线图中的异常值判别上限
    UL = Q3 + 1.5 * IQR
    print('判别异常值的上限临界值: ',UL)
    # 从数据中找出低于判别上限的最大值
    replace_value = sunspots.counts[sunspots.counts < UL].max()
    print('用以替换异常值的数据: ',replace_value)
    # 替换超过判别上限异常值
    sunspots.counts[sunspots.counts > UL] = replace_value
    print('异常值替换后的数据统计特征: ',sunspots.counts.describe())

    df1 = pd.DataFrame({'name':['张三','李四','王二','丁一','李五'],
    'gender':['男','女','女','女','男'],
    'age':[23,26,22,25,27]}, columns = ['name','gender','age'])
    print(df1)
    # 取出数据集的中间三行(即所有女性),并且返回姓名和年龄两列
    a = df1.iloc[1:4,[0,2]]
    print(a)
    b = df1.loc[1:3, ['name','age']]
    print(b)
    c = df1.ix[1:3,[0,2]]
    print(c)
    # 将员工的姓名用作行标签
    df2 = df1.set_index('name')
    print(df2)
    # 取出数据集的中间三行
    a = df2.iloc[1:4,:]
    print(a)
    b = df2.loc[['李四','王二','丁一'],:]
    print(b)
    c = df2.ix[1:4,:]
    print(c)
    # 使用筛选条件,取出所有男性的姓名和年龄
    # df1.iloc[df1.gender == '男',]
    a = df1.loc[df1.gender == '男',['name','age']]
    print(a)
    b = df1.ix[df1.gender == '男',['name','age']]
    print(b)

    # 数据读取
    diamonds = pd.read_table(r'F:\python_Data_analysis_and_mining\05\diamonds.csv', sep = ',')
    print(diamonds.shape)
    print(diamonds.head())
    # 单个分组变量的均值统计
    a = pd.pivot_table(data = diamonds, index = 'color', values = 'price', margins = True, margins_name = '总计')
    print(a)
    # 两个分组变量的列联表
    # 导入numpy模块
    import numpy as np

    b = pd.pivot_table(data = diamonds, index = 'clarity', columns = 'cut', values = 'carat',
    aggfunc = np.size,margins = True, margins_name = '总计')
    print(b)

    # 构造数据集df1和df2
    df1 = pd.DataFrame({'name':['张三','李四','王二'], 'age':[21,25,22], 'gender':['男','女','男']})
    print(df1)
    df2 = pd.DataFrame({'name':['丁一','赵五'], 'age':[23,22], 'gender':['女','女']},)
    print(df2)
    # 数据集的纵向合并
    a = pd.concat([df1,df2], keys = ['df1','df2'])
    print(a)
    # 如果df2数据集中的“姓名变量为Name”
    df2 = pd.DataFrame({'Name':['丁一','赵五'], 'age':[23,22], 'gender':['女','女']})
    # 数据集的纵向合并
    b = pd.concat([df1,df2])
    print(b)

    # 构造数据集
    df3 = pd.DataFrame({'id':[1,2,3,4,5],'name':['张三','李四','王二','丁一','赵五'],'age':[27,24,25,23,25],'gender':['男','男','男','女','女']})
    print(df3)
    df4 = pd.DataFrame({'Id':[1,2,2,4,4,4,5],'kemu':['科目1','科目1','科目2','科目1','科目2','科目3','科目1'],'score':[83,81,87,75,86,74,88]})
    print(df4)
    df5 = pd.DataFrame({'id':[1,3,5],'name':['张三','王二','赵五'],'income':[13500,18000,15000]})
    print(df5)
    # 三表的数据连接
    # 首先df3和df4连接
    merge1 = pd.merge(left = df3, right = df4, how = 'left', left_on='id', right_on='Id')
    print(merge1)
    # 再将连接结果与df5连接
    merge2 = pd.merge(left = merge1, right = df5, how = 'left')
    print(merge2)

    # 数据读取
    diamonds = pd.read_table(r'F:\python_Data_analysis_and_mining\05\diamonds.csv', sep = ',')
    print(diamonds.shape)
    print(diamonds.head())
    # 通过groupby方法,指定分组变量
    grouped = diamonds.groupby(by = ['color','cut'])
    print(grouped)
    # 对分组变量进行统计汇总
    # 通过groupby方法,指定分组变量
    grouped = diamonds.groupby(by = ['color','cut'])
    print(grouped)
    # 对分组变量进行统计汇总
    result = grouped.aggregate({'color':np.size, 'carat':np.min, 'price':np.mean, 'face_width':np.max})
    # 调整变量名的顺序
    result = pd.DataFrame(result, columns=['color','carat','price','face_width'])
    # 数据集重命名
    result.rename(columns={'color':'counts','carat':'min_weight','price':'avg_price','face_width':'max_face_width'}, inplace=True)
    # 将行索引变量数据框的变量
    result.reset_index(inplace=True)
    result

  • 相关阅读:
    滚动播放文字或者图片信息效果
    电话号码
    星星
    圆角android
    JavaSE 基于AWT的记事本实现
    JavaSE 匿名类(Anonymouse Inner Class)实现
    老子《道德经》第四十四章
    linux 源码安装最新git
    sama5d3 xplained 编译文件系统(buildroot)添加nfs支持
    sama5d3 xplained 挂载主机NFS共享目录
  • 原文地址:https://www.cnblogs.com/tszr/p/10059199.html
Copyright © 2011-2022 走看看