zoukankan      html  css  js  c++  java
  • 80万商城运营大数据分析

    80万商城运营大数据分析

    • 用户行为分析:日访问量、小时访问量、不同行为类型访问量
    • 获客分析
    • 用户留存分析
    • 复购分析
    • 转化漏斗分析
    • 商品竞争力分析:性别、城市、用户浏览、购买、收藏、商品类目分析

    一:导入数据

    # 导入常用的库:
    import pandas as pd 
    import numpy as np
    import matplotlib.pyplot as plt
    import datetime
    
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    
    # columns = ['user_id','goods_id','cat','behavior','time']
    df = pd.read_table("E:/yizhiamumu/behavior.txt")
    df.info()
    
    

    打印结果

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 757565 entries, 0 to 757564
    Data columns (total 10 columns):
    user_id     757565 non-null int64
    goods_id    757565 non-null int64
    cat         757565 non-null int64
    behavior    757565 non-null object
    time        757565 non-null int64
    sex         757565 non-null int64
    addr        757565 non-null object
    device      757565 non-null object
    price       757565 non-null float64
    amount      757565 non-null int64
    dtypes: float64(1), int64(6), object(3)
    memory usage: 57.8+ MB
    
    观察数据
    df.head()
    

    打印结果

    	    user_id	goods_id	cat	behavior	time	sex	addr	device	price	amount
    0	1	5002615	2520377	pv	1574911385	0	成都	Redmi Note8Pro	0.0	0
    1	1	2734026	4145813	pv	1574914184	0	成都	Redmi Note8Pro	0.0	0
    2	1	5002615	2520377	pv	1574916273	0	成都	Redmi Note8Pro	0.0	0
    3	1	3239041	2355072	pv	1574927664	0	成都	Redmi Note8Pro	0.0	0
    4	1	4615417	4145813	pv	1574942864	0	成都	Redmi Note8Pro	0.0	0
    

    二:数据清洗

    # 数据清廷
    df.drop_duplicates(inplace=True)
    
    # 时间戳处理
    df['time1'] = df['time'].apply(lambda x:datetime.datetime.fromtimestamp(x))
    df['month'] = df['time'].apply(lambda x:int(datetime.datetime.fromtimestamp(x).strftime("%Y%m")))
    df['date'] = df['time'].apply(lambda x:datetime.datetime.fromtimestamp(x).strftime("%Y%m%d"))
    df['hour'] = df['time1'].apply(lambda x:x.hour)
    
    # 销售金额计算
    df['money'] = df['price'] * df['amount']
    
    # 缺失值处理
    df.isnull().sum()
    
    df.describe()
    

    打印结果

    	    user_id	goods_id	cat	time	sex	price	amount	month	hour	money
    count	757565.000000	7.575650e+05	7.575650e+05	7.575650e+05	757565.000000	757565.000000	757565.000000	757565.000000	757565.000000	757565.000000
    mean	5583.487645	2.582329e+06	2.716578e+06	1.575162e+09	0.432664	0.955333	0.047983	201911.560921	14.953452	2.383729
    std	3184.742636	1.487073e+06	1.465113e+06	1.494278e+05	0.495445	7.915329	0.376215	0.496275	6.063847	21.677706
    min	1.000000	2.900000e+01	2.171000e+03	1.574870e+09	0.000000	0.000000	0.000000	201911.000000	0.000000	0.000000
    25%	2824.000000	1.304276e+06	1.354236e+06	1.575031e+09	0.000000	0.000000	0.000000	201911.000000	11.000000	0.000000
    50%	5671.000000	2.582878e+06	2.735466e+06	1.575181e+09	0.000000	0.000000	0.000000	201912.000000	16.000000	0.000000
    75%	8310.000000	3.861863e+06	4.145813e+06	1.575291e+09	1.000000	0.000000	0.000000	201912.000000	20.000000	0.000000
    max	11079.000000	5.163006e+06	5.161669e+06	1.575389e+09	1.000000	99.900000	4.000000	201912.000000	23.000000	399.600000
    

    三:数据可视化

    1 每天PV UV 走势分析

    # 数据可视化
    # 1 每天PV UV 走势分析
    all_puv = pd.pivot_table(df, index=['date'],values='user_id',aggfunc='count')
    uv = df[['user_id','date']].drop_duplicates()['date'].value_counts()
    all_puv = all_puv.join(uv)
    all_puv.columns = ['pv','uv']
    all_puv['avg_pv'] = all_puv['pv']/all_puv['uv']
    all_puv
    

    打印结果

    	    pv	uv	avg_pv
    date			
    20191128	106877	7610	14.044284
    20191129	111426	7811	14.265267
    20191130	114328	7874	14.519685
    20191201	121457	8097	15.000247
    20191202	153545	10552	14.551270
    20191203	149932	10530	14.238557
    
    x=all_puv.index
    fig,axes = plt.subplots(1,3,figsize=(18,3))
    axes[0].plot(x,all_puv['pv'], color='r', marker='o')
    axes[1].plot(x,all_puv['uv'], color='g', marker='s')
    axes[2].plot(x,all_puv['avg_pv'], color='b', marker='d')
    
    axes[0].set_title('pv')
    axes[1].set_title('uv')
    axes[2].set_title('avg_pv')
    
    plt.show()
    
    

    打印结果 1

    2 用户行为 pv 分析

    # 2 用户行为 pv 分析
    
    pv = pd.pivot_table(df, index=['date'], columns=['behavior'], values='user_id', aggfunc='count')
    
    pv['all'] = pv.sum(axis=1)
    
    plt.figure(figsize=(12,5))
    plt.plot(pv.index, pv['all'])
    plt.plot(pv.index, pv['pv'],color='r')
    plt.plot(pv.index, pv['cart'],color='y')
    plt.plot(pv.index, pv['buy'], color='c')
    plt.plot(pv.index, pv['fav'], color='b')
    
    plt.xlabel("日期", fontsize=20)
    plt.ylabel('pv', fontsize=20)
    plt.title("用户行为pv 分析", fontsize=24)
    plt.legend(["all","pv","cart","buy","fav"], loc = 'upper left', fontsize=20)
    plt.show()
    
    

    打印结果

    2

    3 访问高峰分析

    # 3 访问高峰分析
    hour_pv = df['hour'].value_counts().reset_index().rename(columns={'index':'hour', 'hour':'pv'})
    
    plt.figure(figsize=(12,4))
    plt.bar(hour_pv['hour'], hour_pv['pv'],color='r')
    plt.xlabel("小时", fontsize=20)
    plt.ylabel("访问量pv", fontsize=20)
    plt.title("每小时用户访问量数据分析", fontsize = 24)
    plt.show()
    
    

    3

    4 新用户分析

    # 4 新用户分析
    new_user= df[['user_id','date']].groupby('user_id').min()['date'].value_counts().reset_index()
    new_user.columns = ['date', 'new_user']
    
    plt.figure(figsize=(10,6))
    x,y = new_user['date'], new_user['new_user']
    plt.bar(x,y,width=0.6,color='r')
    for a, b in zip(x,y):
        plt.text(a,b + 0.05, '%.0f' % b, ha='center', va='bottom', fontsize=20)
    plt.xlabel("日期", fontsize=16)
    plt.ylabel("新用户数", fontsize=16)
    plt.title("每日新增用户数", fontsize=20)
    plt.show()
    

    4

    5 用户留存分析

    # 5 用户留存分析
    # 建立 n 日留存率计算模型,数据传入用户id 和登录日期
    # n 为n 日留存, 不传入start_date 和 n 时,则计算所有留存
    
    def cal_retention(df, start_date='20190101',n=0):
        if n>0:
            new_user = df[['user_id','date']].groupby('user_id').min().reset_index()
            date2 = datetime.datetime.strptime(start_date, '%Y%m%d')+datetime.timedelta(n)
            end_date = datetime.datetime.strftime(date2,'%Y%m%d')
    
            start_user = set(new_user[new_user.date==start_date].user_id)
            end_user = set(df[df.date==end_date].user_id)
            user = start_user&end_user
            
            return [start_date, end_date, len(start_user),len(user),round(len(user)/len(start_user),4)]
        
        else:
            new_user = df[['user_id',"date"]].groupby('user_id').min().reset_index()
            date_source = new_user.date.unique()
            date_source.sort()
            
            result1 = []
            flag =0
            
            for start_date in date_source:
                start_user = set(new_user[new_user.date == start_date].user_id)
                for end_date in date_source[flag:]:
                    end_user = set(df[df.date==end_date].user_id)
                    user = start_user&end_user
                    result1.append([start_date,end_date,len(start_user),len(user),round(len(user)/len(start_user),4)])
                flag = flag+1
           return pd.DataFrame(result1, columns=['开始日期','留存日期','新用户数','留存人数','留存率'])
        
    # 调用cal_retention 函数计算留存
    # cal_retention(df[['user_id','date']],'20191128',3)
    
    retention = cal_retention(df[['user_id','date']])
    
    # 留存人数展示
    pd.pivot_table(retention, index=['开始日期'],columns=['留存日期'],values='留存人数',aggfunc='sum',fill_value=0)
    
    # 留存率展示
    pd.pivot_table(retention, index=['开始日期'],columns=['留存日期'],values='留存率',aggfunc='sum',fill_value=0)
    

    5

    6

    6 复购率计算

    # 6 复购率计算
    
    data_buy = df[df.behavior=='buy'][['user_id','date']].drop_duplicates()['user_id'].value_counts().reset_index()
    data_buy.columns = ['user_id','num']
    
    re_buy_rate = round(len(data_buy[data_buy.num>=2])/len(data_buy),4)
    print("复购率为:",round(re_buy_rate*100,2),"%")
    
    # 购买总人数
    buy_user = len(data_buy)
    
    # 网联次数的人数分布
    buy_freq = data_buy.num.value_counts().reset_index()
    buy_freq.columns = [['购买次数','人数']]
    buy_freq['人数占比'] = round(buy_freq['人数']/buy_user,4)
    buy_freq
    
    

    打印结果

        复购率为: 42.06 %
        购买次数	人数	人数占比
    0	1	3540	0.5794
    1	2	1714	0.2805
    2	3	575	0.0941
    3	4	210	0.0344
    4	5	57	0.0093
    5	6	14	0.0023
    

    7 商品TOP 分析

    7.1 商品销售TOP

    # 7.1 商品销售TOP
    
    buy_top = df[df.behavior=='buy']['goods_id'].value_counts().head(10)
    
    from pyecharts.charts import Bar
    from pyecharts import options as opts
    
    bar = Bar()
    bar.add_xaxis(buy_top.index.tolist())
    bar.add_yaxis("商品销售TOP",buy_top.values.tolist(),color='red')
    bar.set_global_opts(title_opts=opts.TitleOpts(title="商品销售TOP"))
    
    bar.render_notebook()
    

    7

    7.2商品浏览TOP

    # 7.2商品浏览TOP
    pv_top = df[df.behavior=='pv']['goods_id'].value_counts().head(10)
    
    from pyecharts.charts import Bar
    from pyecharts import options as opts
    
    bar = Bar()
    bar.add_xaxis(pv_top.index.tolist())
    bar.add_yaxis("商品浏览TOP",pv_top.values.tolist(),color='red')
    bar.set_global_opts(title_opts=opts.TitleOpts(title="商品浏览TOP"))
    
    bar.render_notebook()
    
    

    72

    7.3商品收藏 TOP

    # 7.3商品收藏 TOP
    fav_top = df[df.behavior=='fav']['goods_id'].value_counts().head(10)
    
    from pyecharts.charts import Bar
    from pyecharts import options as opts
    
    bar = Bar()
    bar.add_xaxis(fav_top.index.tolist())
    bar.add_yaxis("商品收藏TOP",fav_top.values.tolist(),color='red')
    bar.set_global_opts(title_opts=opts.TitleOpts(title="商品收藏TOP"))
    
    bar.render_notebook()
    
    

    73

    8 城市购买竞争力TOP

    # 8 城市购买竞争力TOP
    city_top = df[df.behavior=='buy'][['addr','money']].groupby('addr').sum().sort_values('money',ascending=False)
    
    
    bar = Bar()
    bar.add_xaxis(city_top.index.tolist())
    bar.add_yaxis("城市购买竞争力TOP",[round(a,2) for a in city_top.money], color='red')
    bar.set_global_opts(title_opts=opts.TitleOpts(title="城市购买竞争力TOP"))
    
    bar.render_notebook()
    
    

    8

    9 不同性别购买力分析

    # 9 不同性别购买力情况  0-女,1-男
    
    sex_top = df[df.behavior=='buy'][['sex','money']].groupby('sex').sum()
    
    from pyecharts.charts import Pie
    
    pie = Pie()
    pie.add("",[list(z) for z in zip(['女','男'], [round(a,2) for a in city_top.money])])
    pie.set_global_opts(title_opts=opts.TitleOpts(title="不同性别购买力分析"))
    pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{c} ({d}%)"))
    pie.render_notebook()
    
    

    9

    10商品类目竞争力分析

    # 10商品类目竞争力分析 
    
    cat_top = df[df.behavior=='buy']['cat'].value_counts().head(10)
    bar = Bar()
    bar.add_xaxis(cat_top.index.tolist())
    bar.add_yaxis("商品类目竞争力分析",cat_top.values.tolist())
    bar.render_notebook()
    

    10

    11 总体转化漏斗分析

    # 11 总体转化漏斗分析
    from pyecharts.charts import Funnel
    from pyecharts import options as opts
    data_behavior = df[df.behavior!="fav"]['behavior'].value_counts().reset_index().rename(columns={"index":"环节","behavior":"人数"})
    
    # 单一环节的转化率
    t1 = np.array(data_behavior['人数'][1:])
    t2 = np.array(data_behavior['人数'][0:-1])
    single_convs = t1/t2
    single_convs = list(single_convs)
    single_convs.insert(0,1)
    
    # 总体转化率
    flag = data_behavior['人数'][0]
    data_behavior['总体转化率'] = data_behavior['人数']/flag
    
    attrs = [a+":  "+str(round(b,2))+"%" for a,b in zip(data_behavior['环节'],data_behavior['总体转化率']*100)]
    attr_value = [round(a,2) for a in data_behavior['总体转化率'] * 100]
    
    funnel = Funnel()
    funnel.add("商品",[list(z) for z in zip(attrs, attr_value)], label_opts = opts.LabelOpts(position="inside"))
    funnel.set_global_opts(title_opts=opts.TitleOpts(title="总体转化漏斗数据分析"))
    funnel.render_notebook()

    11

    12 RFM 用户分层模型

    # RFM 模型打分规则
    
    def recency(x):
        if x<=2:
            return 5
        elif x==3:
            return 4
        elif x==4:
            return 3
        elif x==5:
            return 2
        elif x>=6:
            return 1
    
    def frequency(x):
        if x>=8:
            return 5
        elif (x>=6)&(x<8):
            return 4
        elif (x>=4)&(x<6):
            return 3
        elif (x>=2)&(x<4):
            return 2
        elif (x>=0)&(x<2):
            return 1
    
    def monetary(x):
        if x>=300:
            return 5
        elif (x>=200)&(x<300):
            return 4
        elif (x>100)&(x<200):
            return 3
        elif (x>50)&(x<100):
            return 2
        elif (x>=0)&(x<50):
            return 1
    
    
    # RFM 用户分层
    RFM_date = df[df.behavior == 'buy'][['user_id','date']].groupby("user_id").max()
    RFM_F = df[df.behavior == 'buy'][['user_id','behavior']].groupby('user_id').count()
    RFM_M = df[df.behavior == 'buy'][['user_id','money']].groupby('user_id').mean()
    RFM = RFM_date.join(RFM_F).join(RFM_M)
    
    # 用户价值分层
    end_date = datetime.datetime.strptime('20191205',"%Y%m%d")
    # 时间间隔天数计算
    RFM['days'] = RFM['date'].apply(lambda x:(end_date-datetime.datetime.strptime(x,"%Y%m%d")).days)
    RFM = RFM[['days','behavior','money']]
    RFM.columns = ['间隔天数','消费频次','消费金额']
    
    RFM.head()
    

    打印结果:

        间隔天数	消费频次	消费金额
    user_id			
    2	3	7	186.742857
    4	5	4	142.600000
    16	4	2	84.650000
    17	4	1	62.700000
    20	4	1	37.900000
    

    RFM数据建模:

    # RFM 数据处理
    
    RFM['R_S'] = RFM['间隔天数'].apply(recency)
    RFM['F_S'] = RFM['消费频次'].apply(frequency)
    RFM['M_S'] = RFM['消费金额'].apply(monetary)
    RFM['RFM'] = RFM.apply(lambda x:int(x.R_S * 100 + x.F_S * 10 + x.M_S),axis=1)
    RFM.head()
    
    

    打印结果

    	间隔天数	消费频次	消费金额	R_S	F_S	M_S RFM 
    user_id						
    2	3	7	186.742857	4	4	3.0  443
    4	5	4	142.600000	2	3	3.0  233
    16	4	2	84.650000	3	2	2.0  322
    17	4	1	62.700000	3	1	2.0 312 
    20	4	1	37.900000	3	1	1.0  311
    
    

    欢迎关注:一只阿木木

  • 相关阅读:
    Oracle中Lpad函数和Rpad函数的用法
    SQL中on条件与where条件的区别
    安装sqlServer
    springboot注解加深
    springcloud父项工程pom
    雪花算法
    docker 创建mysql容器
    docker入门
    restful接口规范(安全与幂等)
    mysql
  • 原文地址:https://www.cnblogs.com/yizhiamumu/p/12688948.html
Copyright © 2011-2022 走看看