zoukankan      html  css  js  c++  java
  • 乐高天猫旗舰店数据分析

    乐高天猫旗舰店数据分析

    01 导入模块

    # 导入模块
    import pandas as pd
    import numpy as np
    import jieba 
    import time
    import stylecloud
    from IPython.display import Image
    from pyecharts.charts import Bar,Line,Map,Page,Pie
    from pyecharts import options as opts
    from pyecharts.globals import SymbolType
    

    02 读取数据

    # Load the Tmall LEGO flagship-store export into a DataFrame.
    # The path is written as a raw string with explicit separators; without
    # them Windows treats it as one oddly named file relative to drive F:.
    # NOTE(review): directory layout is taken from the stop-word path used by
    # get_cut_words in this same notebook - confirm the CSV lives alongside it.
    df_tm=pd.read_csv(r'F:\Python数据分析课程\python数据处理\Pandas练习\数据分析项目练习\legao3225\天猫乐高旗舰店数据.csv')
    df_tm.head()
    

    image-20201013213321493

    # Inspect column dtypes and non-null counts before cleaning
    df_tm.info()
    

    image-20201013213359790

    1. 重复值处理
    2. age_range:暂不处理
    3. price:价格处理/类型转换
    4. sales_num:类型转换
    5. color_cat:暂不处理
    # Remove fully duplicated rows, modifying df_tm in place
    df_tm.drop_duplicates(inplace=True)
    # Price normalisation
    def transform_price(x):
        """Collapse a price range such as ``'100-200'`` to its midpoint.

        Args:
            x: Raw price cell. A string containing ``'-'`` is treated as a
                ``low-high`` range; anything else is passed through untouched
                so the caller can cast the whole column afterwards.

        Returns:
            The midpoint of the range as a float when ``x`` is a range
            string, otherwise ``x`` unchanged.

        Raises:
            ValueError: If either side of the range is not a valid number.
        """
        # Non-string cells (e.g. NaN or already numeric values) cannot be
        # ranges, and `'-' in x` would raise TypeError on them.
        if isinstance(x, str) and '-' in x:
            parts = x.split('-')
            low, high = float(parts[0]), float(parts[1])
            # The representative price is the mean of the two bounds; the
            # half-difference would shrink '100-200' to 50 instead of 150.
            return (low + high) / 2
        return x
    # Run every price cell through transform_price, then cast the column to float
    df_tm['price']=df_tm['price'].apply(transform_price).astype('float')
    # Cells holding the placeholder '无' are replaced with the fixed value 200,
    # after which the whole column is cast to int
    df_tm['sales_num']=df_tm['sales_num'].replace('无',200).astype('int')
    df_tm.head()
    

    image-20201013213638447

    # Strip store boilerplate from the titles. The pattern is a regex
    # alternation, so regex=True has to be explicit: pandas 2.0 switched the
    # default to literal matching, under which nothing would be removed.
    df_tm['title']=df_tm.title.str.replace('乐高旗舰店|官网|2020年','',regex=True)
    # Revenue per listing = units sold * unit price
    df_tm['sales_volumn']=df_tm['sales_num']*df_tm['price']
    df_tm.head()
    

    image-20201013213712577

    # Print the column dtypes and non-null counts of df_tm again
    df_tm.info()
    

    image-20201013213754967

    # Strip store boilerplate from the titles. The pattern is a regex
    # alternation, so regex=True has to be explicit: pandas 2.0 switched the
    # default to literal matching, under which nothing would be removed.
    df_tm['title']=df_tm.title.str.replace('乐高旗舰店|官网|2020年','',regex=True)
    # Revenue per listing = units sold * unit price
    df_tm['sales_volumn']=df_tm['sales_num']*df_tm['price']

    df_tm.head()
    

    image-20201013213847695

    # Sum sales per title and keep the ten largest totals
    rank_top10=(
        df_tm.groupby('title')['sales_num']
        .sum()
        .sort_values(ascending=False)
        .head(10)
    )
    rank_top10
    

    image-20201013213918575

    # Ten best-selling listings, re-sorted ascending by sales_num
    rank_top10=(
        df_tm.sort_values('sales_num',ascending=False)
        .head(10)
        .loc[:,['title','sales_num']]
        .sort_values('sales_num')
    )
    rank_top10
    

    image-20201013213945319

    # Titles become the category axis, sales counts the bar lengths
    x_data=rank_top10['title'].tolist()
    y_data=rank_top10['sales_num'].tolist()

    # Horizontal bar chart (axes swapped) with value labels to the right of each bar
    bar1=(
        Bar()
        .add_xaxis(x_data)
        .add_yaxis('',y_data)
        .set_global_opts(title_opts=opts.TitleOpts(title='乐高旗舰店月销量排名Top10商品'))
        .set_series_opts(label_opts=opts.LabelOpts(position='right'))
        .reversal_axis()
    )
    bar1.render_notebook()
    

    image-20201013214008235

    # Bin edges in yuan, aligned one-to-one with the labels below so that each
    # band really contains the prices its label advertises (the previous edges
    # 0/200/400/... put e.g. a 150-yuan item under '0~50元').
    # NOTE(review): 9469 is kept from the original as the top edge - presumably
    # the maximum price in the data set; prices above it would fall outside
    # every bin.
    cut_bins=[0,50,100,200,300,500,1000,9469]
    cut_labels=['0~50元','50~100元','100~200元','200~300元','300~500元','500~1000元','1000元以上']

    # pd.cut builds right-closed intervals by default: (0, 50], (50, 100], ...
    price_cut=pd.cut(df_tm['price'],bins=cut_bins,labels=cut_labels)
    # Number of listings in each price band
    price_num=price_cut.value_counts()
    price_num
    

    image-20201013214040206

    # Bar chart of listing counts per price band. Labels and counts are read
    # from cut_labels / price_num instead of being retyped by hand, so the
    # chart cannot drift away from the computed data.
    band_counts=[int(price_num[label]) for label in cut_labels]
    bar2=Bar()
    bar2.add_xaxis(cut_labels)
    bar2.add_yaxis('',band_counts)
    bar2.set_global_opts(title_opts=opts.TitleOpts(title='乐高旗舰店不同价格区间商品数量'),
                        # Colour scale tops out at the tallest bar
                        visualmap_opts=opts.VisualMapOpts(max_=max(band_counts))
    )
    bar2.render_notebook()
    

    image-20201013214101827

    # Attach the price band to each row, then total the revenue per band
    df_tm['price_cut']=price_cut
    cut_purchase=(
        df_tm.groupby('price_cut')['sales_volumn']
        .sum()
    )
    cut_purchase
    

    image-20201013214129511

    # [label, revenue] pairs, one per price band
    band_names=cut_purchase.index.tolist()
    band_totals=cut_purchase.values.tolist()
    data_pair=list(map(list,zip(band_names,band_totals)))
    # Ring chart of each band's share of revenue, legend stacked on the left
    piel=(
        Pie()
        .add('',data_pair,radius=['35%','60%'])
        .set_global_opts(
            title_opts=opts.TitleOpts(title='不同价格区间的销售额整体表现'),
            legend_opts=opts.LegendOpts(orient='vertical',pos_top='15%',pos_left='2%'),
        )
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{d}%"))
        .set_colors(['#EF9050','#3B7BA9','#6FB27C','#FFAF34','#D7BFD7','#00BFFE','#7FFFAA'])
    )
    piel.render_notebook()
    

    image-20201013214203193

    def get_cut_words(content_series, stop_words_path=r"F:\Python数据分析课程\python数据处理\Pandas练习\数据分析项目练习\legao3225\cn_stopwords.txt"):
        """Segment a Series of texts with jieba and filter out stop words.

        Args:
            content_series: pandas Series of strings. All entries are joined
                with '。' into one text before segmentation.
            stop_words_path: UTF-8 text file with one stop word per line.
                Defaults to the location originally hard-coded here (now a raw
                string, so the backslashes are no longer invalid escapes).

        Returns:
            A list of tokens that are at least two characters long and do not
            appear in the stop-word file.

        Raises:
            OSError: If the stop-word file cannot be opened.
        """
        # Load the stop-word list; a set keeps the per-token membership test
        # constant-time instead of scanning a list for every token.
        with open(stop_words_path,'r',encoding='utf-8') as f:
            stop_words={line.strip() for line in f}
        # Add domain terms to jieba's dictionary before cutting
        for word in ('乐高','悟空小侠','大颗粒','小颗粒'):
            jieba.add_word(word)

        # Segment the concatenated text in jieba's default (non-full) mode
        word_num=jieba.lcut(content_series.str.cat(sep='。'),cut_all=False)
        # Keep tokens of two or more characters that are not stop words
        return [w for w in word_num if len(w)>=2 and w not in stop_words]
    
    # Tokenise every product title and preview the first six tokens
    text=get_cut_words(df_tm['title'])
    text[0:6]
    

    image-20201013214244120

    # Tokenise every product title and preview the first six tokens
    text=get_cut_words(content_series=df_tm['title'])
    text[:6]

    # Render a gamepad-shaped word cloud from the title tokens and display it.
    # The font path is written with explicit separators; without them it
    # names a single non-existent file relative to drive F:.
    # NOTE(review): directory layout is taken from the stop-word path used by
    # get_cut_words - confirm simhei.ttf sits in the same folder.
    stylecloud.gen_stylecloud(
        text=' '.join(text),
        collocations=False,
        font_path=r'F:\Python数据分析课程\python数据处理\Pandas练习\数据分析项目练习\legao3225\simhei.ttf',
        icon_name='fas fa-gamepad',
        size=768,
        output_name='乐高旗舰店商品标题词云图.png'
    )
    Image(filename='乐高旗舰店商品标题词云图.png')
    

    image-20201013214309818

  • 相关阅读:
    Failed to parse PID from file /run/nginx.pid: Invalid argument
    Ubuntu16.04环境下bashrc文件位置
    virtualenvwrapper.sh报错: There was a problem running the initialization hooks.解决
    pip安装virtualenvwrapper报错的解决办法
    争鸣|函数性质的综合应用辨析
    总结|静雅斋之2020高考备考回顾总结
    2020年全国卷Ⅱ卷文科数学图片版
    奇怪|说好的求最大值变成了求最小值
    探究|平面向量探究题
    平面向量错误收集
  • 原文地址:https://www.cnblogs.com/James-221/p/13811581.html
Copyright © 2011-2022 走看看