import pandas as pd #pandas: loading and manipulating the tabular data
import matplotlib.pyplot as plt #matplotlib plotting interface
from pylab import *
# `mpl` is presumably provided by the wildcard pylab import above.
# Use the SimHei font for sans-serif text (presumably so that Chinese
# labels render; requires the font to be installed - TODO confirm).
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Turn off the Unicode minus glyph on axes.
mpl.rcParams['axes.unicode_minus'] = False
%matplotlib inline
数据读取与索引
# Load the review dataset into a DataFrame and preview its first rows.
bra = pd.read_csv('data/bra.csv')
bra.head()
data:image/s3,"s3://crabby-images/f1f4d/f1f4d03294bb51b42efc87fa5ae745a2a20f4431" alt=""
选取列
# Attribute-style access to the `content` column.
bra.content
data:image/s3,"s3://crabby-images/8d81b/8d81be721c2fc6d3455a3bff2668960a79ea34e6" alt=""
# Select several columns by passing a list of names; preview the first rows.
bra[['creationTime','productColor']].head()
data:image/s3,"s3://crabby-images/629ce/629ce1e630e3d76dd6a3e9aae49c3bdf3b47e9a5" alt=""
选择行
# Row slice: positions 1 through 5 (the end of the slice is exclusive).
bra[1:6]
data:image/s3,"s3://crabby-images/bb4cb/bb4cba29fd933ad7075102989059288d9abb4a2a" alt=""
选择行和列
# DataFrame.ix was removed in pandas 1.0. On this frame (integer row index
# from read_csv, string column names) it picked rows 2 and 3 and the columns
# at positions 1 and 3, which is exactly what .iloc does here.
bra.iloc[[2,3],[1,3]]
data:image/s3,"s3://crabby-images/faea1/faea149902cf997018f6dd432dff68d4b19bcaae" alt=""
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based
# replacement. The row slice 1:5 includes both endpoints.
bra.loc[1:5,['productColor']]
data:image/s3,"s3://crabby-images/e175f/e175f2b9bfd6567420e93603f05c207d85bf6d97" alt=""
bra.iloc[[2,3],[1,3]] #iloc: select by integer position (rows 2 and 3, columns 1 and 3)
data:image/s3,"s3://crabby-images/17bd0/17bd028426fe5ee87ee34fecdefd2fad6b410074" alt=""
bra.loc[1:5,['content','creationTime','productSize']] #loc: select by label (row labels 1 to 5 inclusive, three named columns)
data:image/s3,"s3://crabby-images/34f21/34f219d2993dc3f2cb14e3144e11929bf78be4b1" alt=""
# loc also accepts a label slice for columns; both endpoints are included.
bra.loc[1:5,'content':'userClientShow']
data:image/s3,"s3://crabby-images/e6a1a/e6a1adcc99ad5723377303b5dc8367e6dc1ffe22" alt=""
数据预处理
缺失值
bra.describe() #overview of the data distribution: per-column summary statistics such as count, mean and quantiles
data:image/s3,"s3://crabby-images/3b962/3b96260e2af4d7bca0018e85ea91218ab70e1081" alt=""
bra['userClientShow'].unique() #distinct values that occur in the userClientShow column
data:image/s3,"s3://crabby-images/06cce/06cce05ea759e931285a2690536f0e84c5232278" alt=""
bra['userClientShow'].isnull().sum() #number of missing values before filling
data:image/s3,"s3://crabby-images/0dd88/0dd8847ee2a431f6964bc4165c257e053b4ed752" alt=""
# Replace missing values with the placeholder '不详' ("unknown").
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on bra['userClientShow']: that form is chained
# assignment, which recent pandas versions warn about and which does not
# modify `bra` once Copy-on-Write is enabled.
bra['userClientShow'] = bra['userClientShow'].fillna('不详')
bra['userClientShow'].isnull().sum() #number of missing values after filling
data:image/s3,"s3://crabby-images/566f4/566f4a79de95a003f4dae1e55a4bb4de8efbdd6f" alt=""
新增列
bra.dtypes #inspect the dtype of every column
data:image/s3,"s3://crabby-images/79613/7961392dd9ae2815be2320ba2d4d67f00f539064" alt=""
bra['creationTime'] = pd.to_datetime(bra['creationTime']) #parse the column into datetime values
# Re-check the dtypes to confirm the conversion.
bra.dtypes
data:image/s3,"s3://crabby-images/cc241/cc241c7fe3740ba3db2b40a4787d8a338ae45f2b" alt=""
# New `hour` column: the hour of day of each creationTime. The vectorized
# .dt accessor replaces a Python-level loop over every timestamp; it
# requires creationTime to already hold datetime values.
bra['hour'] = bra['creationTime'].dt.hour
bra
data:image/s3,"s3://crabby-images/eec8b/eec8ba52d7f84f4bac680cef8649675590d12b5a" alt=""
字符串操作
bra.productSize.unique() #distinct values of productSize
data:image/s3,"s3://crabby-images/ef2f9/ef2f95195d45213e245ba8044d49e6d3005d2e38" alt=""
# Take the first run of ASCII letters found in each productSize value.
cup = bra.productSize.str.findall('[a-zA-Z]+').str[0]
# Rewrite letter sizes as cup letters. The order matters: 'L' becomes 'C'
# first, so an original 'XL' has turned into 'XC' by the time the last
# replacement maps it to 'D'.
bra['cup'] = (
    cup.str.replace('M','B')
       .str.replace('L','C')
       .str.replace('XC','D')
)
bra.head()
data:image/s3,"s3://crabby-images/03006/03006ab82de7b5951b2e66319bf31fdfc9b10ae2" alt=""
bra['cup'].unique() #distinct values of the derived cup column
data:image/s3,"s3://crabby-images/bcd2b/bcd2b2591727c59c92d50a7fd4bb581d6617a61c" alt=""
数据转换
bra.productColor.unique() #distinct values of productColor
data:image/s3,"s3://crabby-images/ddcb2/ddcb2d2ba48e2404b5ce47c18c62690cfbf7f86f" alt=""
def getColor(s):
    """Map a raw product-color description to a canonical color name.

    The description is scanned for a fixed, ordered list of color
    characters; the first one found decides the result, so a value that
    contains several of them resolves to the earliest entry in the list.

    Args:
        s: The raw color description. Usually a string; any other value
            (for example the float NaN of a missing cell) is passed
            through untouched instead of raising ``TypeError``.

    Returns:
        The canonical color name when a known color character is present,
        otherwise ``s`` unchanged.
    """
    # `'x' in s` only works on strings; let missing/non-text values through.
    if not isinstance(s, str):
        return s

    # Order is significant: earlier entries win when several match.
    keyword_to_color = (
        ('黑', '黑色'),
        ('肤', '肤色'),
        ('蓝', '蓝色'),
        ('红', '红色'),
        ('紫', '紫色'),
        ('白', '白色'),
        ('粉', '粉色'),
        ('灰', '灰色'),
        ('绿', '绿色'),
        ('青', '青色'),
    )
    for keyword, color in keyword_to_color:
        if keyword in s:
            return color
    return s
bra['color'] = bra['productColor'].map(getColor) #apply getColor to every productColor value and store the result in the new color column
bra
data:image/s3,"s3://crabby-images/5e8bd/5e8bd1ca2d78efd045a31089810068b2968ffb09" alt=""
bra.color.unique() #distinct values of the color column
data:image/s3,"s3://crabby-images/f4b2a/f4b2a2bdad9a5cc1f3e975f2836e39e8a6db1e75" alt=""
数据可视化
# Small hand-written sample series used to demonstrate plotting.
x = [1991,1992,1993,1994,1995,1996,1997]
y = [23,56,38,29,34,56,92]
plt.plot(x,y) #draw y against x as a line with plot()
data:image/s3,"s3://crabby-images/2c64a/2c64a5316427b2531717ca659138cbb2edca152c" alt=""
plt.figure(figsize=(8,6),dpi=80) #create a new figure with an explicit size and dpi via figure()
plt.plot(x,y)
data:image/s3,"s3://crabby-images/9c0b3/9c0b3de50f8fd7c5848bf50e3a869324c52ed367" alt=""
hour = bra.groupby('hour')['hour'].count() #number of rows for each distinct hour value (a count per group, not a sort)
hour
data:image/s3,"s3://crabby-images/0bf32/0bf3298490afd2302244368610713855866cb4a3" alt=""
plt.xlim(0,25) #x-axis range 0 to 25
plt.plot(hour,linestyle='solid',color='royalblue',marker='8') #solid royal-blue line with a marker at every point
data:image/s3,"s3://crabby-images/4f7e1/4f7e1e0a7c391f29a95699262a91ed49cd11efda" alt=""
cup_style = bra.groupby('cup')['cup'].count() #number of rows for each distinct cup value
cup_style
data:image/s3,"s3://crabby-images/1c58a/1c58a818c2df6bfa5fb44ae498cbbdd55d5b48bc" alt=""
# Bar chart of the per-cup counts held in cup_style.
plt.figure(figsize=(8,6),dpi=80)
labels = list(cup_style.index)
plt.xlabel('cup') #x-axis: cup
plt.ylabel('count') #y-axis: number of rows
plt.bar(range(len(labels)),cup_style,color='royalblue',alpha=0.7) #alpha sets the transparency
plt.xticks(range(len(labels)),labels,fontsize=12)
plt.grid(color='#95a5a6',linestyle='--',linewidth=1,axis='y',alpha=0.6)
plt.legend(['user-count'])
# Write each count just above its bar. The loop variables are deliberately
# not called x and y, so the module-level x/y sample lists used by the
# earlier line-plot cells are not overwritten.
for bar_pos, bar_count in enumerate(cup_style):
    plt.text(bar_pos,bar_count,bar_count,ha='center',va='bottom')
data:image/s3,"s3://crabby-images/bd114/bd114627c4b21b271533dcf42ce50b134d31d9ea" alt=""
color_style = bra.groupby('color')['color'].count() #number of rows for each distinct color value
color_style
data:image/s3,"s3://crabby-images/59635/59635ffe1daea6885b1352b0b17c4ff519cd9ed7" alt=""
# Horizontal bar chart of the per-color counts held in color_style.
labels = list(color_style.index)
positions = range(len(labels))
# One fill color per bar, in the order of the bars.
colors = ['brown','orange','gray','white','pink','purple','red','green','wheat','blue','gold','springgreen','black']

plt.figure(figsize=(8,6),dpi=80)
plt.subplot(facecolor='gainsboro',alpha=0.2)
plt.barh(positions,color_style,color=colors,alpha=1)
plt.yticks(positions,labels,fontsize=12)
plt.title('Color Distribution') #chart title
plt.xlabel('count') #x-axis: number of rows
plt.ylabel('color') #y-axis: color
plt.grid(color='#95a5a6',linestyle='--',linewidth=1,axis='x',alpha=0.4)
data:image/s3,"s3://crabby-images/3d99d/3d99def714575490ab4891da700fe42d93583494" alt=""
# Show the first 30 rows of the frame, including the derived columns.
bra.head(30)
data:image/s3,"s3://crabby-images/16c0f/16c0fcdb2f66b578deb2ff02f077ccd6b12b91fb" alt=""