zoukankan      html  css  js  c++  java
  • python数据分析与可视化

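    # Data visualization and analysis
    # Plotting with matplotlib in Python
    # NOTE: the tutorial code that follows is wrapped in a triple-quoted string, so it is kept for reference only and is not executed.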
    '''
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    import numpy as np
    import warnings
    warnings.filterwarnings("ignore")
    plt.style.use("classic")
    #正常的二维函数图像
    x=np.linspace(0,10,100)
    y=np.sin(x)
    y1=np.cos(x)
    plt.plot(x,y,"-")
    plt.plot(x,y1,"--")
    plt.show()
    #多子图绘制
    plt.figure()
    plt.subplot(2,1,1) #绘制两行一列的图像,第几个图
    plt.plot(x,y)
    plt.subplot(2,1,2)
    plt.plot(x,y1)
    plt.show()
    #子图绘制的第二种方式
    fig,ax=plt.subplots(2)
    ax[0].plot(x,y)
    ax[1].plot(x,y1)
    plt.show()
    #底图的风格展示
    plt.style.use("seaborn-whitegrid")
    fig=plt.figure()
    ax=plt.axes()
    x=np.linspace(0,10,100)
    ax.plot(x,np.sin(x))
    plt.show()
    #颜色调整
    plt.plot(x,np.sin(x-0),color="blue")
    plt.plot(x,np.sin(x-1),color="g")
    plt.plot(x,np.sin(x-2),color="0.75")
    plt.plot(x,np.sin(x-3),color="#FFDD44")
    plt.plot(x,np.sin(x-4),color=(1.0,0.2,0.3))
    plt.plot(x,np.sin(x-5),color="chartreuse")
    plt.show()
    #线条的样式
    plt.figure()
    plt.plot(x,x+0,linestyle="solid")
    plt.plot(x,x+1,linestyle="dashed")
    plt.plot(x,x+2,linestyle="dashdot")
    plt.plot(x,x+3,linestyle="dotted")
    plt.plot(x,x+4,linestyle="-")
    plt.plot(x,x+5,linestyle="--")
    plt.plot(x,x+6,linestyle="-.")
    plt.plot(x,x+7,linestyle=":")
    plt.show()

    #不同的标记展示
    rng=np.random.RandomState(0)
    for marker in ["o",".",",","x","+","v","^","<",">","s","d"]:
    plt.plot(rng.rand(5),rng.rand(5),marker,label="marker='{}'".format(marker))
    plt.legend(numpoints=1)
    plt.xlim(0,1.8)
    plt.show()
    #绘制散点图
    x=np.linspace(0,10,20)
    plt.scatter(x,np.sin(x))
    plt.show()
    #绘制直方图
    data=np.random.randn(1000)
    plt.hist(data,color="g")
    plt.show()
    data=np.random.randn(1000)
    plt.hist(data,bins=30,normed=True,alpha=0.5,histtype="stepfilled",color="steelblue",edgecolor="none")
    plt.show()
    x1=np.random.normal(0,0.8,1000)
    x2=np.random.normal(-2,1,1000)
    x3=np.random.normal(3,2,1000)
    kwargs=dict(bins=40,normed=True,alpha=0.3,histtype="stepfilled")
    plt.hist(x1,**kwargs)
    plt.hist(x2,**kwargs)
    plt.hist(x3,**kwargs)
    plt.show()
    #柱状图绘制
    #箱式图(离散变量对连续变量的关系)

    #python中的seaborn绘制图像
    import seaborn as sns
    import pandas as pd
    df_iris=pd.read_csv('D:Byrbt2018StudyPython机器学习全流程项目实战精讲配套课件第五讲 数据分析与可视化iris.csv')
    fig,axes=plt.subplots(2)
    sns.distplot(df_iris["petal length"],ax=axes[0],kde=True,rug=True)#加轴虚的直方图带拟合线
    sns.kdeplot(df_iris["petal length"],ax=axes[1],shade=True)#拟合图
    plt.show()
    #四种直方图形式
    sns.set(palette="muted",color_codes=True)
    rs=np.random.RandomState(10)
    d=rs.normal(size=100)
    f,axes=plt.subplots(2,2,figsize=(7,7),sharex=True)
    sns.distplot(d,kde=False,color="b",ax=axes[0,0])
    sns.distplot(d,hist=False,rug=True,color="r",ax=axes[0,1])
    sns.distplot(d,hist=False,color="g",kde_kws={"shade":True},ax=axes[1,0])
    sns.distplot(d,color="m",ax=axes[1,1])
    plt.show()
    #绘制箱式图-不同类别鸢尾花的叶片宽度分布
    sns.boxplot(x=df_iris["class"],y=df_iris["sepal width"])
    plt.show()
    #图矩阵
    sns.set()
    sns.pairplot(df_iris,hue="class")
    plt.show()
    '''
    #1-3 招聘数据的探索性数据分析
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    # Load the job-posting table (lagou_preprocessed.csv, read as GBK).
    # NOTE(review): the path has no separators after "D:" -- they look lost
    # (possibly stripped backslashes); confirm the real location before running.
    data=pd.read_csv('D:Byrbt2018StudyPython机器学习全流程项目实战精讲配套课件第五讲 数据分析与可视化lagou_preprocessed.csv',encoding="gbk")
    print(data.head())
    # Basic structure of the table. DataFrame.info() writes its report to
    # stdout itself and returns None, so it is called directly; wrapping it in
    # print() only appended a stray "None" line.
    data.info()
    # Summary statistics of the numeric columns.
    print(data.describe())

    # Target variable: salary.
    # Summary statistics of the (continuous) target.
    print(data["salary"].describe())
    # Histogram of the target to inspect its value distribution.
    plt.hist(data["salary"])
    plt.show()
    # Same distribution drawn with seaborn.
    import warnings
    # Library warnings are silenced from here on for the rest of the script.
    warnings.filterwarnings("ignore")
    # NOTE(review): distplot is deprecated in recent seaborn releases and the
    # filter above hides that warning -- confirm the installed version still
    # provides it.
    sns.distplot(data["salary"])
    plt.show()
    # Skewness and kurtosis of the target, computed by pandas.
    from scipy import stats
    print("Skewness:%f" % data["salary"].skew())
    print("Kurtosis: %f" % data["salary"].kurt())

    # Categorical variables: print how often each category occurs.
    cols=["city","education","position_name","size","stage","work_year"]
    for col in cols:
        print(data[col].value_counts())
    # Cities kept as their own category; every other city is folded into a
    # single "其他" ("other") bucket.
    city=["北京","上海","广州","深圳","杭州","成都","南京","武汉"]
    # A boolean mask through .loc writes into `data` itself in one step. The
    # previous per-row `data["city"][i]=...` was chained indexing (it may write
    # to a temporary copy) and used the enumerate() counter as an index label.
    data.loc[~data["city"].isin(city),"city"]="其他"
    print(data["city"].value_counts())
    # Fix text rendering in the plots by pointing matplotlib's sans-serif
    # family at SimHei.
    # NOTE(review): the star import is kept so no name it exports disappears,
    # but nothing visible in this script needs it besides `mpl`, which is now
    # imported explicitly; consider dropping the star import.
    from pylab import *
    import matplotlib as mpl
    mpl.rcParams["font.sans-serif"]=["SimHei"]
    # Box plot of salary against each categorical feature, one figure per
    # feature, in this order: city, education, work experience, company stage,
    # company size, position name.
    for feature in ["city","education","work_year","stage","size","position_name"]:
        sns.boxplot(x=data[feature],y=data["salary"])
        plt.show()
    # Industry column: when a row holds several comma-separated industries,
    # keep only the first one. Splitting a value that has no comma yields the
    # value itself, so one vectorized split covers both cases; missing values
    # stay missing instead of raising.
    data["industry"]=data["industry"].str.split(",").str[0]
    print(data["industry"].value_counts())
    # Industries kept as their own category; everything else becomes "其他"
    # ("other").
    indus=["移动互联网","金融","数据服务","电子商务","企业服务","医疗健康","O2O","硬件","信息安全","教育"]
    # Boolean-mask assignment through .loc updates `data` directly, replacing
    # the per-row chained-indexing writes.
    data.loc[~data["industry"].isin(indus),"industry"]="其他"
    print(data["industry"].value_counts())
    # Box plot of salary per industry.
    sns.boxplot(x=data["industry"],y=data["salary"])
    plt.show()

    # Free-text feature: concatenate the "advantage" text of every posting
    # into one string. The original comment plans a jieba segmentation and
    # word-cloud display of it; that code follows in the file inside a
    # triple-quoted string and does not run.
    # Missing entries are dropped and values coerced to str so the join below
    # cannot fail on non-string cells.
    ADV=data["advantage"].dropna().astype(str).tolist()
    ADv_text="".join(ADV)
    print(ADv_text)
    '''import jieba
    result=jieba.cut(ADv_text)
    print("切分结果:"+",".join(result))
    #加入一些jieba库中没有的词汇
    jieba.suggest_freq(("五险一金"),True)
    jieba.suggest_freq(("六险一金"),True)
    jieba.suggest_freq(("带薪年假"),True)
    jieba.suggest_freq(("年度旅游"),True)
    jieba.suggest_freq(("氛围好"),True)
    jieba.suggest_freq(("技术大牛"),True)
    jieba.suggest_freq(("免费三餐"),True)
    jieba.suggest_freq(("租房补贴"),True)
    jieba.suggest_freq(("大数据"),True)
    jieba.suggest_freq(("精英团队"),True)
    jieba.suggest_freq(("晋升空间大"),True)
    result=jieba.cut(ADv_text)
    print("切分结果:"+",".join(result))

    from wordcloud import WordCloud
    wordcloud = WordCloud(font_path="MSYH.TTF",background_color="black").generate(ADv_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    '''




  • 相关阅读:
    P2176 [USACO14FEB]路障Roadblock
    洛谷 P1187 3D模型
    洛谷 P2777 [AHOI2016初中组]自行车比赛
    洛谷P2896 [USACO08FEB]一起吃饭Eating Together
    洛谷P2983 [USACO10FEB]购买巧克力Chocolate Buying
    洛谷 P2858 [USACO06FEB]奶牛零食Treats for the Cows
    Restaurant
    OR in Matrix
    poj 3321Apple Tree
    Codeforces Round #204 (Div. 2) C. Jeff and Rounding
  • 原文地址:https://www.cnblogs.com/Yanjy-OnlyOne/p/12569071.html
Copyright © 2011-2022 走看看