zoukankan      html  css  js  c++  java
  • 机器学习之路--Matplotlib

    1.绘制折线图

    pandas 提供了 datetime 数据类型,可以用 pd.to_datetime 将格式不规范的日期统一转换为 xxxx-xx-xx 的形式

    import pandas as pd
    import numpy as np
    # Read the CSV, then swap the DATE column for its parsed datetime version.
    a = pd.read_csv('UNRATE.csv')
    a = a.assign(DATE=pd.to_datetime(a['DATE']))
    # Show the first 12 rows.
    print(a.iloc[:12])

    折线图

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    # Load the data and keep only the first 12 rows for plotting.
    a = pd.read_csv('UNRATE.csv')
    b = a.iloc[:12]
    x_values = b['DATE']
    y_values = b['VALUE']
    plt.plot(x_values, y_values)  # x data first, then y data
    plt.show()

    这样就能绘制出一个折线图了

    如果横坐标写不下怎么办?我们可以将文字竖着写或者指定一个角度

    plt.xticks(rotation=45)  # rotate the x tick labels by 45 degrees

    一般情况下要写横坐标与纵坐标要表达什么,还有标题

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt      
    # Load the file and keep its first 12 rows.
    a = pd.read_csv('UNRATE.csv')
    b = a.iloc[:12]
    # Draw the line: DATE on the x axis, VALUE on the y axis.
    plt.plot(b['DATE'], b['VALUE'])
    # Turn the x tick labels to 90 degrees.
    plt.xticks(rotation=90)
    # Title and axis captions.
    plt.title('Monthly Unemployment Trends, 1948')
    plt.xlabel('Month')
    plt.ylabel('Unemployment Rate')
    plt.show()

    输出:

    # Load the data here so the snippet runs on its own: the .dt accessor
    # below needs DATE to be a datetime column.
    unrate = pd.read_csv('UNRATE.csv')
    unrate['DATE'] = pd.to_datetime(unrate['DATE'])
    # Month number taken from each date (assigned once; the original
    # repeated this line).
    unrate['MONTH'] = unrate['DATE'].dt.month
    fig = plt.figure(figsize=(6,3))          # figure size (width, height)

    # Draw two consecutive 12-row slices on the same axes so they can be
    # compared; c sets the line colour.
    plt.plot(unrate[0:12]['MONTH'], unrate[0:12]['VALUE'], c='red')
    plt.plot(unrate[12:24]['MONTH'], unrate[12:24]['VALUE'], c='blue')
    plt.show()
    fig = plt.figure(figsize=(10,6))
    colors = ['red', 'blue', 'green', 'orange', 'black']
    # Five lines on one figure: each colour draws the next 12-row slice.
    for block_no, color in enumerate(colors):
        first_row = block_no * 12
        subset = unrate[first_row:first_row + 12]
        plt.plot(subset['MONTH'], subset['VALUE'], c=color)
    plt.show()
    fig = plt.figure(figsize=(10,6))
    colors = ['red', 'blue', 'green', 'orange', 'black']
    for offset, color in enumerate(colors):
        rows = slice(offset * 12, (offset + 1) * 12)
        subset = unrate[rows]
        # The label (1948 + offset) is what the legend shows for this line.
        plt.plot(subset['MONTH'], subset['VALUE'], c=color, label=str(1948 + offset))
    # legend() adds the legend; loc='best' lets matplotlib choose the spot.
    # Other loc values are listed in help(plt.legend).
    plt.legend(loc='best')
    plt.show()

    输出:

    最终版:

    fig = plt.figure(figsize=(10,6))
    colors = ['red', 'blue', 'green', 'orange', 'black']
    # Pair each colour with a legend label starting at 1948 and draw the
    # matching 12-row slice of the data.
    for year, color in zip(range(1948, 1948 + len(colors)), colors):
        first_row = (year - 1948) * 12
        subset = unrate[first_row:first_row + 12]
        plt.plot(subset['MONTH'], subset['VALUE'], c=color, label=str(year))
    # Figure title and axis captions.
    plt.title('Monthly Unemployment Trends, 1948-1952')
    plt.xlabel('Month, Integer')
    plt.ylabel('Unemployment Rate, Percent')
    # Pin the legend to the upper-left corner.
    plt.legend(loc='upper left')
    plt.show()

    输出:

    3、条形图与散点图

    import pandas as pd
    import numpy as np
    from numpy import arange
    import matplotlib.pyplot as plt
    reviews = pd.read_csv('fandango_scores.csv')
    cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
    norm_reviews = reviews[cols]
    num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']

    # Bar heights: the rating columns of row 0.
    # .ix has been removed from pandas; .loc performs the same label lookup.
    bar_heights = norm_reviews.loc[0, num_cols].values
    # Five bars one unit apart, the first one starting 0.75 from the origin.
    bar_positions = arange(5) + 0.75
    fig, ax = plt.subplots()
    # 0.5 is the bar width. align='edge' makes each position the left edge
    # of its bar; newer matplotlib would otherwise centre the bars on it.
    ax.bar(bar_positions, bar_heights, 0.5, align='edge')
    plt.show()
    num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
    # .ix has been removed from pandas; .loc performs the same label lookup.
    bar_heights = norm_reviews.loc[0, num_cols].values
    bar_positions = arange(5) + 0.75
    tick_positions = range(1,6)
    fig, ax = plt.subplots()

    # Bars start at 0.75, 1.75, ... and are 0.5 wide, so with align='edge'
    # their centres land on the tick positions 1..5. Newer matplotlib
    # centres bars by default, which would shift them off the ticks.
    ax.bar(bar_positions, bar_heights, 0.5, align='edge')
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(num_cols, rotation=45)

    ax.set_xlabel('Rating Source')     # x-axis caption
    ax.set_ylabel('Average Rating')     # y-axis caption
    ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')   # title
    plt.show()

    输出:

    当然,也可以将柱形图变为横着的

    import matplotlib.pyplot as plt
    from numpy import arange
    num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']

    # Bar lengths: the rating columns of row 0.
    # .ix has been removed from pandas; .loc performs the same label lookup.
    bar_widths = norm_reviews.loc[0, num_cols].values
    bar_positions = arange(5) + 0.75
    tick_positions = range(1,6)
    fig, ax = plt.subplots()
    # The only change from the vertical chart is bar -> barh.
    # align='edge' keeps the 0.5-thick bars centred on ticks 1..5; newer
    # matplotlib would otherwise centre them on 0.75, 1.75, ...
    ax.barh(bar_positions, bar_widths, 0.5, align='edge')

    ax.set_yticks(tick_positions)
    ax.set_yticklabels(num_cols)
    ax.set_ylabel('Rating Source')
    ax.set_xlabel('Average Rating')
    ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
    plt.show()

    输出:

     散点图:

    fig, ax = plt.subplots()
    # scatter draws one point per row: Fandango score on x, RT score on y.
    fandango_scores = norm_reviews['Fandango_Ratingvalue']
    rt_scores = norm_reviews['RT_user_norm']
    ax.scatter(fandango_scores, rt_scores)
    ax.set_ylabel('Rotten Tomatoes')
    ax.set_xlabel('Fandango')
    plt.show()

    输出:

    画两个散点图:

    fig = plt.figure(figsize=(5,10))
    ax1 = fig.add_subplot(2,1,1)
    ax2 = fig.add_subplot(2,1,2)
    # Each entry: (axes, x column, x caption, y column, y caption).
    # The second panel is the first one with the two axes swapped.
    panels = [
        (ax1, 'Fandango_Ratingvalue', 'Fandango', 'RT_user_norm', 'Rotten Tomatoes'),
        (ax2, 'RT_user_norm', 'Rotten Tomatoes', 'Fandango_Ratingvalue', 'Fandango'),
    ]
    for axes, x_col, x_caption, y_col, y_caption in panels:
        axes.scatter(norm_reviews[x_col], norm_reviews[y_col])
        axes.set_xlabel(x_caption)
        axes.set_ylabel(y_caption)
    plt.show()

    输出:

    用fig设置参数,ax做实际画图的操作

    4、直方图(柱形图)与盒图

    求数据的频数,并可视化

    import pandas as pd
    import numpy as np
    from numpy import arange
    import matplotlib.pyplot as plt
    reviews = pd.read_csv('fandango_scores.csv')
    cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
    norm_reviews = reviews[cols]
    # Print the first five rows.
    print(norm_reviews.head(5))

    # Count how often each rating value occurs, then order by rating value.
    fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts().sort_index()
    imdb_distribution = norm_reviews['IMDB_norm'].value_counts().sort_index()

    # Each printed row is "value  count", e.g. "4.3  6" if 4.3 occurs 6 times.
    print(fandango_distribution)
    print(imdb_distribution)

    fig, ax = plt.subplots()
    # Histogram of the Fandango ratings with the default binning.
    # Variants: add bins=20 for twenty bars, or range=(4, 5) together with
    # bins=20 to restrict the x interval that is binned.
    ax.hist(norm_reviews['Fandango_Ratingvalue'])
    plt.show()
    import pandas as pd
    import numpy as np
    from numpy import arange
    import matplotlib.pyplot as plt
    reviews = pd.read_csv('fandango_scores.csv')
    cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
    norm_reviews = reviews[cols]

    # Four histograms stacked vertically in one tall figure.
    fig = plt.figure(figsize=(5,20))
    ax1 = fig.add_subplot(4,1,1)
    ax2 = fig.add_subplot(4,1,2)
    ax3 = fig.add_subplot(4,1,3)
    ax4 = fig.add_subplot(4,1,4)

    # (axes, column to bin, panel title)
    panels = [
        (ax1, 'Fandango_Ratingvalue', 'Distribution of Fandango Ratings'),
        (ax2, 'RT_user_norm', 'Distribution of Rotten Tomatoes Ratings'),
        (ax3, 'Metacritic_user_nom', 'Distribution of Metacritic Ratings'),
        (ax4, 'IMDB_norm', 'Distribution of IMDB Ratings'),
    ]
    for axes, column, title in panels:
        # Same 20 bins over 0..5 and the same y limits on every panel.
        axes.hist(norm_reviews[column], bins=20, range=(0, 5))
        axes.set_title(title)
        axes.set_ylim(0, 50)

    plt.show()

    输出:(在ml里run一下,太长了)

    盒图(箱线图/四分位图,用于观察中位数和四分位数):

    import pandas as pd
    import numpy as np
    from numpy import arange
    import matplotlib.pyplot as plt
    reviews = pd.read_csv('fandango_scores.csv')
    cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
    norm_reviews = reviews[cols]
    fig, ax = plt.subplots()
    # One box for the Rotten Tomatoes user scores, y axis fixed to 0..5.
    rt_scores = norm_reviews['RT_user_norm']
    ax.boxplot(rt_scores)
    ax.set_ylim(0, 5)
    ax.set_xticklabels(['Rotten Tomatoes'])
    plt.show()

    输出:

    这样,就可以清晰的看到中位数的位置以及大致的数据区间

    也可以在一张图上放入多张盒图,这样就可以区分各个属性的特征了

    import pandas as pd
    import numpy as np
    from numpy import arange
    import matplotlib.pyplot as plt
    reviews = pd.read_csv('fandango_scores.csv')
    cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
    norm_reviews = reviews[cols]
    num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
    fig, ax = plt.subplots()
    # Passing the 2-D array draws one box per column.
    score_matrix = norm_reviews[num_cols].values
    ax.boxplot(score_matrix)
    ax.set_ylim(0,5)
    # Label each box with its column name, written vertically.
    ax.set_xticklabels(num_cols, rotation=90)
    plt.show()

    输出:

     5、闲的蛋疼系列:

    可以将坐标轴去掉:

    # Hide every axis border line; only the spine objects are needed here.
    for spine in ax.spines.values():
        spine.set_visible(False)

    可以去掉坐标轴上的刻度线(小锯齿):

    # Use booleans: current matplotlib treats the string "off" as truthy,
    # which would leave the tick marks switched on.
    ax.tick_params(bottom=False, top=False, left=False, right=False)

    6、最后的一些方法

    *****一般在作图时,为了让图表达得清晰,尽量把各个子图排在一行或两行内

    fig = plt.figure(figsize=(12, 12))   # adjust figsize to tune the overall figure size

    在作图时的颜色可以用自己定义的颜色

    #Color
    import pandas as pd
    import matplotlib.pyplot as plt
    
    women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')
    major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']

    # Custom colours are RGB tuples with each channel scaled to 0..1.
    cb_dark_blue = (0/255, 107/255, 164/255)
    cb_orange = (255/255, 128/255, 14/255)

    fig = plt.figure(figsize=(12, 12))

    # One subplot per category in a 2x2 grid.
    for sp in range(0,4):
        ax = fig.add_subplot(2,2,sp+1)
        # The color for each line is assigned here; the second line is
        # plotted as 100 minus the first.
        ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c=cb_dark_blue, label='Women')
        ax.plot(women_degrees['Year'], 100-women_degrees[major_cats[sp]], c=cb_orange, label='Men')
        # Hide the axis border lines.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0,100)
        ax.set_title(major_cats[sp])
        # Use booleans: current matplotlib treats the string "off" as
        # truthy, which would leave the tick marks switched on.
        ax.tick_params(bottom=False, top=False, left=False, right=False)

    # pyplot's legend() acts on the current axes, i.e. the last subplot.
    plt.legend(loc='upper right')
    plt.show()

    如果要改变线的宽度,可以在 plot 中加上 linewidth 参数:

    # linewidth sets how thick the plotted line is. Both calls sit at the
    # same indentation so the fragment can be pasted as one unit.
    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c=cb_dark_blue, label='Women', linewidth=10)
    ax.plot(women_degrees['Year'], 100-women_degrees[major_cats[sp]], c=cb_orange, label='Men', linewidth=10)

    最后附上此例的完整版(其中包含在图中指定坐标处标注文字的方法):

    import pandas as pd
    import numpy as np
    from numpy import arange
    import matplotlib.pyplot as plt
    women_degrees = pd.read_csv('percent-bachelors-degrees-women-usa.csv')
    major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
    stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']
    # Custom colours are RGB tuples with each channel scaled to 0..1.
    cb_dark_blue = (0/255, 107/255, 164/255)
    cb_orange = (255/255, 128/255, 14/255)
    fig = plt.figure(figsize=(18, 3))

    # Six subplots side by side in one row, one per entry of stem_cats.
    for sp in range(0, 6):
        ax = fig.add_subplot(1, 6, sp + 1)
        ax.plot(women_degrees['Year'], women_degrees[stem_cats[sp]], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[stem_cats[sp]], c=cb_orange, label='Men', linewidth=3)
        # Hide the axis border lines.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(stem_cats[sp])
        # Use booleans: current matplotlib treats the string "off" as
        # truthy, which would leave the tick marks switched on.
        ax.tick_params(bottom=False, top=False, left=False, right=False)
    # pyplot's legend() acts on the current axes, i.e. the last subplot.
    plt.legend(loc='upper right')
    plt.show()
    fig = plt.figure(figsize=(18, 3))

    # Six subplots side by side in one row, one per entry of stem_cats.
    for sp in range(0, 6):
        ax = fig.add_subplot(1, 6, sp + 1)
        ax.plot(women_degrees['Year'], women_degrees[stem_cats[sp]], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[stem_cats[sp]], c=cb_orange, label='Men', linewidth=3)
        # Hide the axis border lines.
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(stem_cats[sp])
        # Use booleans: current matplotlib treats the string "off" as
        # truthy, which would leave the tick marks switched on.
        ax.tick_params(bottom=False, top=False, left=False, right=False)

        # Only the first and last panels get text labels in place of a
        # legend; ax.text(x, y, s) writes s at data coordinates (x, y).
        if sp == 0:
            ax.text(2005, 87, 'Men')
            ax.text(2002, 8, 'Women')
        elif sp == 5:
            ax.text(2005, 62, 'Men')
            ax.text(2001, 35, 'Women')
    plt.show()

    输出:


     

  • 相关阅读:
    ssm批量删除
    java.sql.SQLException: Value '0000-00-00 00:00:00' can not be represented as java.sql.Timestamp
    springMVC中的日期格式的转化
    常用(二)
    ssm上传文件
    redis命令
    redis的下载与安装(linux版)
    解决Maven项目pom.xml文件报xxx argetclassesMETA-INFMANIFEST.MF (系统找不到指定的路径)问题
    flex布局元素操作详情
    彩色小球的重现以及下雪效果的实现
  • 原文地址:https://www.cnblogs.com/ggnbnb/p/9823276.html
Copyright © 2011-2022 走看看