什么是数据分析:
- 数据分析是用适当的方法对收集来的大量数据进行分析,帮助人们作出判断,以便采取适当行动。
- 将数据进行可视化,更直观的呈现
- 使数据更加客观,更具说服力
matplotlib学习
1,什么是matplotlib
Matplotlib是最流行的Python底层绘图库,主要用来绘制数据可视化图表。它可以让数据更加直观地呈现,让数据更加客观、更具说服力。学习爬虫之后,往往需要处理大量的数据,因此学习数据分析是必不可少的。
http://www.360doc.com/content/20/0808/01/71093473_929113091.shtml
2,matplotlib基本要点
3,matplotlib的散点图,直方图,柱状图
4,更多的画图工具
matplotlib 折线图案例
案例一:使用折线图绘制假设一天中每隔两个小时的气温分别是[15,13,14.5,17,20,25,26,26,27,22,18,15]度。
from matplotlib import pyplot as plt # 导入pyplot
# Case 1: temperature sampled every two hours over one day, drawn as a line chart.
hours = range(2, 26, 2)  # sampling times: 2, 4, ..., 24
temperatures = [15, 13, 14.5, 17, 20, 25, 26, 26, 27, 22, 18, 15]  # one reading per sampling time

# figsize is the (width, height) of the figure in inches; dpi is its resolution.
fig = plt.figure(figsize=(20, 8), dpi=80)

plt.plot(hours, temperatures)  # line chart of temperature against time
plt.xticks(hours)  # put one x tick on every sampling time

# To write the chart to disk, call plt.savefig("气温.png") before plt.show();
# a vector format such as .svg stays sharp when zoomed.
plt.show()  # display the figure
from matplotlib import pyplot as plt # 导入pyplot
# Case 1 again, this time with hand-tuned tick marks on both axes.
hours = range(2, 26, 2)  # sampling times: 2, 4, ..., 24
temperatures = [15, 13, 14.5, 17, 20, 25, 26, 26, 27, 22, 18, 15]  # one reading per sampling time

# figsize is the (width, height) of the figure in inches; dpi is its resolution.
fig = plt.figure(figsize=(20, 8), dpi=80)
plt.plot(hours, temperatures)

# Candidate x ticks at half-hour resolution: 2.0, 2.5, ..., 24.0.
half_hour_marks = [n / 2 for n in range(4, 49)]
# Showing every candidate is too dense, so keep every third one via a slice step.
plt.xticks(half_hour_marks[::3])

# One y tick per whole degree between the lowest and the highest reading.
lowest, highest = min(temperatures), max(temperatures)
plt.yticks(range(lowest, highest + 1))

# To write the chart to disk, call plt.savefig("气温.png") before plt.show();
# a vector format such as .svg stays sharp when zoomed.
plt.show()  # display the figure
案例二:随机生成一个气温度数,然后绘制10点到12点每一分钟的气温。
from matplotlib import pyplot as plt # 导入pyplot
import random #导入 random随机数模块
# Case 2: one random temperature for each of the 120 minutes from 10:00 to 12:00.
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

minutes = range(0, 120)  # x positions, one per minute
temperatures = [random.randint(20, 35) for _ in range(120)]  # random reading per minute
plt.plot(minutes, temperatures)

# One text label per minute: 60 labels for the 10 o'clock hour, then 60 for 11 o'clock.
minute_labels = ["10点{}分".format(m) for m in range(60)]
minute_labels.extend("11点{}分".format(m) for m in range(60))

# Positions and labels are thinned with the same step so they stay paired one-to-one.
tick_positions = list(minutes)[::3]
plt.xticks(tick_positions, minute_labels[::3], rotation=90)  # rotation turns the label text by that angle
plt.show()
from matplotlib import pyplot as plt # 导入pyplot
import random #导入 random随机数模块
from matplotlib import font_manager
# Case 2 with an explicit font so the Chinese tick labels are not rendered as garbled text.
# The path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

minutes = range(0, 120)  # x positions, one per minute
temperatures = [random.randint(20, 35) for _ in range(120)]  # random reading per minute
plt.plot(minutes, temperatures)

# One text label per minute: 60 labels for the 10 o'clock hour, then 60 for 11 o'clock.
minute_labels = ["10点{}分".format(m) for m in range(60)]
minute_labels.extend("11点{}分".format(m) for m in range(60))

# Positions and labels are thinned with the same step so they stay paired one-to-one.
tick_positions = list(minutes)[::3]
plt.xticks(
    tick_positions,
    minute_labels[::3],
    rotation=270,  # rotation turns the label text by that angle
    fontproperties=my_font,
)
plt.show()
from matplotlib import pyplot as plt # 导入pyplot
import random #导入 random随机数模块
from matplotlib import font_manager
# Case 2, final version: Chinese font plus axis labels and a title.
# The path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

minutes = range(0, 120)  # x positions, one per minute
temperatures = [random.randint(20, 35) for _ in range(120)]  # random reading per minute
plt.plot(minutes, temperatures)

# One text label per minute: 60 labels for the 10 o'clock hour, then 60 for 11 o'clock.
minute_labels = ["10点{}分".format(m) for m in range(60)]
minute_labels.extend("11点{}分".format(m) for m in range(60))

# Positions and labels are thinned with the same step so they stay paired one-to-one.
tick_positions = list(minutes)[::3]
plt.xticks(
    tick_positions,
    minute_labels[::3],
    rotation=270,  # rotation turns the label text by that angle
    fontproperties=my_font,
)

# Descriptive text; each call needs the font so the Chinese characters render.
plt.xlabel("时间", fontproperties=my_font)
plt.ylabel("温度 单位(℃)", fontproperties=my_font)
plt.title("10点到12点每分钟的气温变化情况", fontproperties=my_font)
plt.show()
案例三:假设小明是30岁,统计出小明从11岁到30岁每年交往的女朋友数量如列表a,请绘制出该数据的折线图。
a = [1,0,1,1,2,4,3,2,3,4,4,5,6,5,4,3,3,1,1,1]
要求:
y轴表示交往女朋友个数
x轴表示其岁数如11岁,12岁,13岁等
from matplotlib import pyplot as plt # 导入pyplot
from matplotlib import font_manager
# Case 3: yearly counts for ages 11 through 30, drawn as a line chart.
ages = range(11, 31)
counts = [1, 0, 1, 1, 2, 4, 3, 2, 3, 4, 4, 5, 6, 5, 4, 3, 3, 1, 1, 1]  # one value per age

# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

plt.plot(ages, counts)

# One y tick per whole number between the smallest and largest count.
plt.yticks(range(min(counts), max(counts) + 1))

# Label every x tick with the age followed by the "岁" suffix.
age_labels = ["{}岁".format(age) for age in ages]
plt.xticks(list(ages), age_labels, fontproperties=my_font)

plt.xlabel("小明年龄", fontproperties=my_font)
plt.ylabel("女朋友个数", fontproperties=my_font)
plt.title("小明11岁到30岁女朋友个数详情", fontproperties=my_font)
plt.show()
from matplotlib import pyplot as plt # 导入pyplot
from matplotlib import font_manager
# Case 3 with a fixed y-axis range and a background grid.
ages = range(11, 31)
counts = [1, 0, 1, 1, 2, 4, 3, 2, 3, 4, 4, 5, 6, 5, 4, 3, 3, 1, 1, 1]  # one value per age

# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

plt.plot(ages, counts)
plt.yticks(range(0, 9))  # fixed y ticks 0..8 instead of deriving them from the data

# Label every x tick with the age followed by the "岁" suffix.
age_labels = ["{}岁".format(age) for age in ages]
plt.xticks(list(ages), age_labels, fontproperties=my_font)

plt.xlabel("小明年龄", fontproperties=my_font)
plt.ylabel("女朋友个数", fontproperties=my_font)
plt.title("小明11岁到30岁女朋友个数详情", fontproperties=my_font)

# Draw grid lines; alpha sets how opaque they are (lower is fainter).
plt.grid(alpha=0.4)
plt.show()
案例四:假设小a和小b都是30岁,请在同一个图中绘制出a和b从11岁到30岁各自交往的女朋友个数的折线图。
a = [1,0,1,1,2,4,3,2,3,4,4,5,6,5,4,3,3,1,1,1]
b = [1,0,3,1,2,2,3,3,2,1,2,1,1,1,1,1,1,1,1,1]
from matplotlib import pyplot as plt # 导入pyplot
from matplotlib import font_manager
# Case 4: two series over the same ages, drawn in a single figure for comparison.
x = range(11, 31)  # ages 11..30
a = [1, 0, 1, 1, 2, 4, 3, 2, 3, 4, 4, 5, 6, 5, 4, 3, 3, 1, 1, 1]  # series for "a", one value per age
b = [1, 0, 3, 1, 2, 2, 3, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # series for "b", one value per age

# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

# Calling plot twice on the same figure overlays both lines.
plt.plot(x, a)
plt.plot(x, b)
plt.yticks(range(0, 9))  # fixed y ticks 0..8

# Label every x tick with the age followed by the "岁" suffix.
age_labels = ["{}岁".format(age) for age in x]
plt.xticks(list(x), age_labels, fontproperties=my_font)

plt.xlabel("年龄", fontproperties=my_font)
plt.ylabel("女朋友个数", fontproperties=my_font)
# Title typo fixed: "队比" -> "对比" (comparison).
plt.title("a和b从11岁到30岁女朋友个数对比详情", fontproperties=my_font)

# Draw grid lines; alpha sets how opaque they are (lower is fainter).
plt.grid(alpha=0.4)
plt.show()
图例位置,线条颜色,线条风格
from matplotlib import pyplot as plt # 导入pyplot
from matplotlib import font_manager
# Case 4 with a legend, custom line colours and a custom line style.
x = range(11, 31)  # ages 11..30
a = [1, 0, 1, 1, 2, 4, 3, 2, 3, 4, 4, 5, 6, 5, 4, 3, 3, 1, 1, 1]  # series for "a", one value per age
b = [1, 0, 3, 1, 2, 2, 3, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # series for "b", one value per age

# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

# label is the text shown in the legend, color sets the line colour
# (a colour name or a hex string), linestyle sets the stroke pattern.
plt.plot(x, a, label="小a", color="orange", linestyle="--")
plt.plot(x, b, label="小b", color="#FFB6C1")
plt.yticks(range(0, 9))  # fixed y ticks 0..8

# Label every x tick with the age followed by the "岁" suffix.
age_labels = ["{}岁".format(age) for age in x]
plt.xticks(list(x), age_labels, fontproperties=my_font)

plt.xlabel("年龄", fontproperties=my_font)
plt.ylabel("女朋友个数", fontproperties=my_font)
# Title typo fixed: "队比" -> "对比" (comparison).
plt.title("a和b从11岁到30岁女朋友个数对比详情", fontproperties=my_font)

# Draw grid lines; alpha sets how opaque they are (lower is fainter).
plt.grid(alpha=0.4)
# Add the legend; legend takes the font through prop, and loc chooses its position.
plt.legend(prop=my_font, loc="upper left")
plt.show()
matplotlib-散点图学习
案例一:
假设通过爬虫获取到了北京2020年3月份和10月份每天白天的最高气温(分别位于列表a,b),那么此时如何寻找出气温随时间(天)变化的某种规律呢?
a = [11,17,16,11,12,11,12,6,6,7,8,9,12,15,14,17,18,21,16,17,20,14,15,15,15,19,21,22,22,22,23]
b = [26,26,28,19,21,17,16,19,18,20,20,19,22,23,17,20,21,20,22,15,11,15,5,13,17,10,11,13,12,16,6]
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Scatter plot comparing daily highs for March and October in one figure.
# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

march_temps = [11, 17, 16, 11, 12, 11, 12, 6, 6, 7, 8, 9, 12, 15, 14, 17, 18, 21, 16, 17, 20, 14, 15, 15, 15, 19, 21, 22, 22, 22, 23]
october_temps = [26, 26, 28, 19, 21, 17, 16, 19, 18, 20, 20, 19, 22, 23, 17, 20, 21, 20, 22, 15, 11, 15, 5, 13, 17, 10, 11, 13, 12, 16, 6]

fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

# March occupies x positions 1..31; October is shifted to 51..81 so the
# two clouds of points sit side by side instead of overlapping.
march_positions = range(1, 32)
october_positions = range(51, 82)

# scatter() replaces plot(); that call is the only difference from the line-chart examples.
plt.scatter(march_positions, march_temps, label="3月份")
plt.scatter(october_positions, october_temps, label="10月份")

# Build date labels for both months; the October offset of 50 is removed to recover the day number.
all_positions = list(march_positions) + list(october_positions)
date_labels = ["3月{}日".format(day) for day in march_positions]
date_labels.extend("10月{}日".format(pos - 50) for pos in october_positions)
# Show every third tick, rotated so neighbouring labels do not collide.
plt.xticks(all_positions[::3], date_labels[::3], fontproperties=my_font, rotation=45)

# Add the legend; legend takes the font through prop, and loc chooses its position.
plt.legend(prop=my_font, loc="upper left")

# Descriptive text for the axes and the figure.
plt.xlabel("时间", fontproperties=my_font)
plt.ylabel("温度", fontproperties=my_font)
plt.title("北京2020年3,10月份每天白天的最高气温对比", fontproperties=my_font)
plt.show()
绘制条形图
案例一:假设获取到2017年内地电影(列表a)和电影票房数据(列表b),那么如何更加直观展示该数据?
a = ["战狼2","速度与激情8","功夫瑜伽","西游伏妖篇","变形金刚5:最后的骑士","摔跤吧!爸爸","加勒比海盗5:死无对证","金刚:骷髅岛","极限特工:终极回归","生化危机6:终章","乘风破浪","神偷奶爸3","智取威虎山","大闹天竺","金刚狼3:殊死一战","蜘蛛侠:英雄归来","悟空传","银河护卫队2","情圣","新木乃伊"]
b = [56.01,26.94,17.53,16.49,15.45,12.96,11.8,11.61,11.28,11.12,10.49,10.3,8.75,7.55,7.32,6.99,6.88,6.86,6.58,6.23] # 单位:亿
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Vertical bar chart: one bar per film, bar height is the box-office figure.
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution
# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

# Film titles. Long titles carry an explicit "\n" so the tick label wraps onto
# two lines. The literals were previously split across physical source lines,
# which is a syntax error for a quoted string.
a = [
    "战狼2", "速度与激情8", "功夫瑜伽", "西游伏妖篇", "变形金刚5\n:最后的骑士",
    "摔跤吧!爸爸", "加勒比海盗5\n:死无对证", "金刚:骷髅岛", "极限特工\n:终极回归",
    "生化危机6\n:终章", "乘风破浪", "神偷奶爸3", "智取威虎山", "大闹天竺",
    "金刚狼3\n:殊死一战", "蜘蛛侠\n:英雄归来", "悟空传", "银河护卫队2", "情圣", "新木乃伊",
]
# Box office per film, same order as `a`; unit: 亿 (hundred million).
b = [56.01, 26.94, 17.53, 16.49, 15.45, 12.96, 11.8, 11.61, 11.28, 11.12,
     10.49, 10.3, 8.75, 7.55, 7.32, 6.99, 6.88, 6.86, 6.58, 6.23]

# Bars are placed at integer positions 0..len(a)-1; width sets the bar width.
plt.bar(range(len(a)), b, width=0.3)
# Replace the integer positions with the film titles, tilted to reduce overlap.
plt.xticks(range(len(a)), a, fontproperties=my_font, rotation=45)

plt.xlabel("电影名称", fontproperties=my_font)
plt.ylabel("票房", fontproperties=my_font)
plt.title("2017年内地电影和电影票房数据", fontproperties=my_font)
plt.show()  # a single call is enough; the duplicated second call was removed
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Horizontal bar chart: one bar per film, bar length is the box-office figure.
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution
# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

# Film titles. Long titles carry an explicit "\n" so the tick label wraps onto
# two lines. The literals were previously split across physical source lines,
# which is a syntax error for a quoted string.
a = [
    "战狼2", "速度与激情8", "功夫瑜伽", "西游伏妖篇", "变形金刚5\n:最后的骑士",
    "摔跤吧!爸爸", "加勒比海盗5\n:死无对证", "金刚:骷髅岛", "极限特工\n:终极回归",
    "生化危机6\n:终章", "乘风破浪", "神偷奶爸3", "智取威虎山", "大闹天竺",
    "金刚狼3\n:殊死一战", "蜘蛛侠\n:英雄归来", "悟空传", "银河护卫队2", "情圣", "新木乃伊",
]
# Box office per film, same order as `a`; unit: 亿 (hundred million).
b = [56.01, 26.94, 17.53, 16.49, 15.45, 12.96, 11.8, 11.61, 11.28, 11.12,
     10.49, 10.3, 8.75, 7.55, 7.32, 6.99, 6.88, 6.86, 6.58, 6.23]

# barh draws horizontal bars; the bar thickness is given by height (not width).
plt.barh(range(len(a)), b, height=0.3, color="#33FF00")
# With horizontal bars the film titles belong on the y axis.
plt.yticks(range(len(a)), a, fontproperties=my_font)

plt.xlabel("票房(亿)", fontproperties=my_font)
plt.ylabel("电影名称", fontproperties=my_font)
plt.title("2017年内地电影和电影票房数据", fontproperties=my_font)
plt.grid(alpha=0.3)  # faint grid lines; alpha sets their opacity
plt.show()
案例二:假设已知a中电影分别在 2017-09-14(b_14),2017-9-15(b_15),2017-9-16(b_16)三天的票房数据,为了展示列表中电影本身的票房以及同其他电影的数据对比情况,应该如何更加直观的呈现该票房数据?
a = ["猩球崛起3:终极之战","敦刻尔克","蜘蛛侠:英雄归来","战狼2"]
b_14 = [2358,399,2358,362]
b_15 = [12357,156,2045,168]
b_16 = [15764,312,4497,319]
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Grouped bar chart: three bars (one per day) side by side for each film.
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution
# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

# Film titles; "郭刻尔克" was a typo for "敦刻尔克".
a = ["猩球崛起3:终极之战", "敦刻尔克", "蜘蛛侠:英雄归来", "战狼2"]
# Box office per film for each of the three days, same order as `a`.
b_14 = [2358, 399, 2358, 362]
b_15 = [12357, 156, 2045, 168]
b_16 = [15764, 312, 4497, 319]

bar_width = 0.2
# Each later day is shifted right by one bar width so the bars of one film touch
# instead of being drawn on top of each other.
x_14 = list(range(len(a)))
x_15 = [i + bar_width for i in x_14]
x_16 = [i + bar_width * 2 for i in x_14]

# Use x_14 for the first series so all three calls follow the same pattern.
plt.bar(x_14, b_14, width=bar_width)
plt.bar(x_15, b_15, width=bar_width)
plt.bar(x_16, b_16, width=bar_width)
plt.show()
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Grouped bar chart with a legend and film titles on the x axis.
# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

a = ["猩球崛起3:终极之战", "敦刻尔克", "蜘蛛侠:英雄归来", "战狼2"]
# Box office per film for each of the three days, same order as `a`.
b_14 = [2358, 399, 2358, 362]
b_15 = [12357, 156, 2045, 168]
b_16 = [15764, 312, 4497, 319]

bar_width = 0.2
# Each later day is shifted right by one bar width so the three bars of a film sit side by side.
x_14 = list(range(len(a)))
x_15 = [pos + bar_width for pos in x_14]
x_16 = [pos + bar_width * 2 for pos in x_14]

# label is the text the legend shows for each day's series.
plt.bar(range(len(a)), b_14, width=bar_width, label="9月14日")
plt.bar(x_15, b_15, width=bar_width, label="9月15日")
plt.bar(x_16, b_16, width=bar_width, label="9月16日")

# Show the legend, using the Chinese-capable font.
plt.legend(prop=my_font)

# Put each film title at the middle bar's position, which centres it under its group.
plt.xticks(x_15, a, fontproperties=my_font)
plt.show()
绘制直方图
案例一:
假设你获取了100部电影的时长(列表a中),希望统计出这些电影时长的分布状态(比如时长为100分钟到120分钟的电影数量,出现的频率)等信息。
a= [131,98,125,131,124,139,131,117,128,108,135,138,131,102,107,114,119,128,121,142,127,130,124,101,110,116,117,110,128,128,115,99,136,126,134,95,138,117,111,78,132,124,113,150,110,117,86,95,144,105,126,130,126,130,126,116,123,106,112,138,123,86,101,99,136,123,117,119,105,137,123,128,125,104,109,134,77,88,99,100,121,95,92,123,128,111,135,132,124,150,151,98,75,72,77,82,100,98,99,100]
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Histogram of film durations with a target bin width of 5.
# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution

a = [131,98,125,131,124,139,131,117,128,108,135,138,131,102,107,114,119,128,121,142,127,130,124,101,110,116,117,110,128,128,115,99,136,126,134,95,138,117,111,78,132,124,113,150,110,117,86,95,144,105,126,130,126,130,126,116,123,106,112,138,123,86,101,99,136,123,117,119,105,137,123,128,125,104,109,134,77,88,99,100,121,95,92,123,128,111,135,132,124,150,151,98,75,72,77,82,100,98,99,100]

# Number of bins = data range floor-divided by the bin width.
# NOTE: the range here is 151 - 72 = 79, which is not a multiple of 5, so the
# 15 equal bins are slightly wider than 5 and do not line up with the ticks below.
d = 5  # bin width
shortest, longest = min(a), max(a)
num_bins = (longest - shortest) // d
plt.hist(a, num_bins)

# x ticks every d units, from the minimum up to and past the maximum.
plt.xticks(range(shortest, longest + d, d))
plt.grid()  # draw grid lines
plt.show()
我们发现图形出现了偏移,这是因为极差(max(a)-min(a)=79)无法被组距5整除,导致x轴刻度与各组的边界对不上。
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Histogram of film durations with bin width 1, normalised to a density.
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution
# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

a= [131,98,125,131,124,139,131,117,128,108,135,138,131,102,107,114,119,128,121,142,127,130,124,101,110,116,117,110,128,128,115,99,136,126,134,95,138,117,111,78,132,124,113,150,110,117,86,95,144,105,126,130,126,130,126,116,123,106,112,138,123,86,101,99,136,123,117,119,105,137,123,128,125,104,109,134,77,88,99,100,121,95,92,123,128,111,135,132,124,150,151,98,75,72,77,82,100,98,99,100]

# Number of bins = data range floor-divided by the bin width.
# A width of 1 always divides the integer range evenly, so bin edges fall on whole numbers.
d = 1  # bin width
num_bins = (max(a) - min(a)) // d
# density=True plots a normalised density instead of raw counts.
plt.hist(a, num_bins, density=True)

# x ticks on every bin edge. The stop value is max(a) + d because range()
# excludes its stop; with max(a) alone the right-most edge got no tick.
plt.xticks(range(min(a), max(a) + d, d))
plt.grid()  # draw grid lines
plt.show()
from matplotlib import pyplot as plt
from matplotlib import font_manager
# Bar chart used as a histogram for data that has already been grouped.
fig = plt.figure(figsize=(20, 8), dpi=80)  # figure size in inches and resolution
# Font for Chinese text; the path points at a font file under the Windows fonts directory.
my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/msyhl.ttc')

# interval[i] is the left edge of group i and width[i] its span; presumably
# quantity[i] is the count that falls into that group (confirm against the data source).
# The eighth edge was 25, a typo for 35: edges must increase, and 30 + 5 = 35.
interval = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 60, 90]
width = [5, 5, 5, 5, 5, 5, 5, 5, 5, 15, 30, 60]
quantity = [836, 2737, 3723, 3926, 3596, 1438, 3273, 642, 824, 613, 215, 47]

# width=1 makes neighbouring bars touch, which gives the histogram look.
plt.bar(range(len(quantity)), quantity, width=1)

# Bars are centred on 0..11, so their edges sit at -0.5, 0.5, ..., 11.5.
# Label those edges with the group boundaries, ending with the right edge of
# the last group, instead of leaving bare bar indices on the axis.
edge_positions = [i - 0.5 for i in range(len(quantity) + 1)]
edge_labels = interval + [interval[-1] + width[-1]]
plt.xticks(edge_positions, edge_labels)
plt.show()