zoukankan      html  css  js  c++  java
  • 优惠券预测——数据探索2

    # Delimiter between the two numbers of a "threshold:reduction" coupon string.
    separator=':'
    # Compute the discount rate so that "spend X, save Y" coupons and plain
    # rate coupons end up on the same scale.
    # A 'null' discount_rate generally means no coupon was used, in which case
    # the rate would arguably be 1; the sentinel -1 is returned instead.
    def get_discount_rate(s):
        """Convert a raw coupon value into a discount rate.

        Args:
            s: Raw value; it is converted with ``str`` before parsing.

        Returns:
            -1 when the value is the string 'null'; the value itself as a
            float when it contains no ``separator``; otherwise
            ``1 - second / first`` computed from the first two fields.
        """
        text = str(s)
        if text == 'null':
            return -1
        parts = text.split(separator)
        if len(parts) != 1:
            # "threshold:reduction" form: turn it into an equivalent rate.
            return 1.0 - float(parts[1]) / float(parts[0])
        return float(parts[0])
    
    # Flag whether the coupon is a full-reduction ("spend X, save Y") promotion.
    def get_if_fd(s):
        """Return 1 if ``str(s)`` contains ``separator``, otherwise 0."""
        pieces = str(s).split(separator)
        return 0 if len(pieces) == 1 else 1
            
    # Extract the spend threshold of a full-reduction coupon.
    def get_full_value(s):
        """Return the first field of a ``separator``-delimited value as an int.

        Returns -1 when ``str(s)`` contains no ``separator``.
        """
        pieces = str(s).split(separator)
        if len(pieces) == 1:
            # No separator, so there is no threshold to report.
            return -1
        return int(pieces[0])
            
    # Extract the reduction amount of a full-reduction coupon.
    def get_reduction_value(s):
        """Return the second field of a ``separator``-delimited value as an int.

        Returns -1 when ``str(s)`` contains no ``separator``.
        """
        pieces = str(s).split(separator)
        return int(pieces[1]) if len(pieces) > 1 else -1
    
    
    # Extract the month.
    def get_month(s):
        """Return the month part of a ``YYYYMMDD`` value, or -1 for 'null'.

        Args:
            s: Date value; it is converted with ``str`` first, so integer
                dates such as ``20160215`` are accepted as well as strings.

        Returns:
            ``int`` of characters 4-5 of the string, or -1 when the value is
            the string 'null'.
        """
        s = str(s)
        # Compare the whole string: indexing with s[0] yields one character,
        # which can never equal 'null', so the missing-value case used to
        # fall through to int('') and raise ValueError.
        if s == 'null':
            return -1
        return int(s[4:6])
    
    # Extract the day of the month.
    def get_day(s):
        """Return the day part of a ``YYYYMMDD`` value, or -1 for 'null'.

        Args:
            s: Date value; it is converted with ``str`` first, so integer
                dates such as ``20160215`` are accepted as well as strings.

        Returns:
            ``int`` of characters 6-7 of the string, or -1 when the value is
            the string 'null'.
        """
        s = str(s)
        # Compare the whole string: indexing with s[0] yields one character,
        # which can never equal 'null', so the missing-value case used to
        # fall through to int('') and raise ValueError.
        if s == 'null':
            return -1
        return int(s[6:8])
        
    # Compute the gap in days; the input has the form Date:Date_received.
    def get_day_gap(s):
        """Return the number of days from the second date to the first.

        Args:
            s: Two ``YYYYMMDD`` fields joined by ``separator``.

        Returns:
            -1 when either field is the string 'null'; otherwise
            ``(first_date - second_date).days``.
        """
        fields = s.split(separator)
        if fields[0] == 'null' or fields[1] == 'null':
            return -1

        def to_date(text):
            # Slice a YYYYMMDD string into year, month and day.
            return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))

        return (to_date(fields[0]) - to_date(fields[1])).days
    
    # Derive the label; the input has the form Date:Date_received.
    def get_label(s):
        """Map a ``Date:Date_received`` string to a label.

        Args:
            s: Two ``YYYYMMDD`` fields joined by ``separator``.

        Returns:
            0 when the first field is 'null'; -1 when the second field is
            'null'; 1 when the first date is at most 15 days after the
            second; -1 otherwise.
        """
        fields = s.split(separator)
        if fields[0] == 'null':
            return 0
        if fields[1] == 'null':
            return -1

        def to_date(text):
            # Slice a YYYYMMDD string into year, month and day.
            return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))

        gap = (to_date(fields[0]) - to_date(fields[1])).days
        return 1 if gap <= 15 else -1
    def add_feature(df):
        """Add coupon-derived columns to ``df`` in place and return it.

        Adds ``if_fd``, ``full_value`` and ``reduction_value`` from the raw
        ``discount_rate`` column, then replaces ``discount_rate`` with its
        numeric rate and converts ``distance`` to int, mapping 'null' to -1.
        """
        raw_coupon = df['discount_rate']
        # discount_rate comes last: the other three are parsed from its raw text.
        derived_columns = (
            ('if_fd', get_if_fd),
            ('full_value', get_full_value),
            ('reduction_value', get_reduction_value),
            ('discount_rate', get_discount_rate),
        )
        for column, extractor in derived_columns:
            df[column] = raw_coupon.apply(extractor)
        distance = df['distance'].replace('null', -1)
        df['distance'] = distance.astype(int)
        # Month features from date_received / date (via get_month) are
        # currently not generated.
        return df
        
    def add_label(df):
        """Add ``day_gap`` and ``label`` columns to ``df`` in place and return it.

        The ``date`` and ``date_received`` columns are joined into one
        string per row, which get_label and get_day_gap then parse.
        """
        # Join with the shared ``separator`` rather than a hard-coded ':' so
        # the string always matches what get_label / get_day_gap split on.
        df['day_gap']=df['date'].astype('str') + separator +  df['date_received'].astype('str')
        df['label']=df['day_gap'].apply(get_label)
        # day_gap held the joined string until now; replace it with the gap.
        df['day_gap']=df['day_gap'].apply(get_day_gap)
        return df
    # Work on copies so the source frames do not have to be re-read from file
    # while debugging.
    # Training data gets features and labels; test data gets features only.
    dftrain = add_label(add_feature(off_train.copy()))
    dftest = add_feature(off_test.copy())
    # Data analysis
    # NOTE(review): the bare expressions below only show their result when run
    # as notebook cells; in a plain script their values are discarded.
    dftrain.head()
    
    dftrain.describe()
    
    # Share of each distance value in the training set; distance >= 0 drops
    # the rows where 'null' was mapped to -1 by add_feature.
    dftrain[dftrain.distance>=0]['distance'].value_counts()/dftrain[dftrain.distance>=0]['distance'].count()
    
    # Same distance distribution for the test set.
    dftest[dftest.distance>=0]['distance'].value_counts()/dftest[dftest.distance>=0]['distance'].count()
    
    # Distance distribution restricted to training rows with label 0 or 1
    # (label >= 0 drops the rows get_label marked -1).
    dftrain[(dftrain.label>=0)&(dftrain.distance>=0)]['distance'].value_counts()/dftrain[(dftrain.label>=0)&(dftrain.distance>=0)]['distance'].count()
    print ('Offline 训练集满减情况')
    # Proportion of full-reduction (if_fd == 1) versus other rows, training set.
    dftrain.if_fd.value_counts()/dftrain.if_fd.count()
    print ('测试集满减情况')
    # Same proportion for the test set.
    dftest.if_fd.value_counts()/dftest.if_fd.count()
    # Box plots to inspect the distributions.
    # Both use training rows with label >= 0 and a non-sentinel (>= 0) value.
    fig = plt.figure(figsize=(4, 6))  # set the figure width and height
    sns.boxplot(dftrain[(dftrain.label>=0)&(dftrain.distance>=0)]['distance'],orient="v", width=0.5)
    fig = plt.figure(figsize=(4, 6))  # set the figure width and height
    sns.boxplot(dftrain[(dftrain.label>=0)&(dftrain.discount_rate>=0)]['discount_rate'],orient="v", width=0.5)
    # Histograms and Q-Q plots
    # NOTE(review): `stats` is presumably scipy.stats; its import is not
    # visible here, so confirm.
    # First figure: distance, histogram on the left, probability plot on the right.
    plt.figure(figsize=(10,5))
    
    ax=plt.subplot(1,2,1)
    sns.distplot(dftrain[(dftrain.label>=0)&(dftrain.distance>=0)]['distance'],fit=stats.norm)
    ax=plt.subplot(1,2,2)
    res = stats.probplot(dftrain[(dftrain.label>=0)&(dftrain.distance>=0)]['distance'], plot=plt)
    # Second figure: the same pair of plots for discount_rate.
    plt.figure(figsize=(10,5))
    
    ax=plt.subplot(1,2,1)
    sns.distplot(dftrain[(dftrain.label>=0)&(dftrain.discount_rate>=0)]['discount_rate'],fit=stats.norm)
    ax=plt.subplot(1,2,2)
    res = stats.probplot(dftrain[(dftrain.label>=0)&(dftrain.discount_rate>=0)]['discount_rate'], plot=plt)
    # Compare train and test distributions.
    # Each group draws the training KDE in red (rows with label >= 0) and the
    # test KDE in blue, after dropping the -1 sentinel values of the feature.
    # NOTE(review): no plt.figure() call separates the four groups, and `ax` is
    # rebound to the return value of ax.legend() at the end of each one. If
    # this runs as one script rather than as separate notebook cells, the
    # groups may all draw onto the same axes. Confirm the intended layout.
    # discount_rate
    ax = sns.kdeplot(dftrain[(dftrain.label>=0)&(dftrain.discount_rate>=0)]['discount_rate'], color="Red", shade=True)
    ax = sns.kdeplot(dftest[(dftest.discount_rate>=0)]['discount_rate'], color="Blue", shade=True)
    ax.set_xlabel('discount_rate')
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train","test"])
    # distance
    ax = sns.kdeplot(dftrain[(dftrain.label>=0)&(dftrain.distance>=0)]['distance'], color="Red", shade=True)
    ax = sns.kdeplot(dftest[(dftest.distance>=0)]['distance'], color="Blue", shade=True)
    ax.set_xlabel('distance')
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train","test"])
    # full_value
    ax = sns.kdeplot(dftrain[(dftrain.label>=0)&(dftrain.full_value>=0)]['full_value'], color="Red", shade=True)
    ax = sns.kdeplot(dftest[(dftest.full_value>=0)]['full_value'], color="Blue", shade=True)
    ax.set_xlabel('full_value')
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train","test"])
    # reduction_value
    ax = sns.kdeplot(dftrain[(dftrain.label>=0)&(dftrain.reduction_value>=0)]['reduction_value'], color="Red", shade=True)
    ax = sns.kdeplot(dftest[(dftest.reduction_value>=0)]['reduction_value'], color="Blue", shade=True)
    ax.set_xlabel('reduction_value')
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train","test"])
    # Visualise the linear relationship between each feature and the label.
    # fcols / frows are assigned but not used in the code visible here; the
    # subplot grid is passed as literals.
    fcols = 2
    frows = 1
    # distance: regression plot against label on the left, distribution of
    # distance on the right; rows with label < 0 or distance < 0 are excluded.
    plt.figure(figsize=(8,4))
    ax=plt.subplot(1,2,1)
    sns.regplot(x='distance', y='label', data=dftrain[(dftrain.label>=0)&(dftrain.distance>=0)][['distance','label']], ax=ax, 
                scatter_kws={'marker':'.','s':3,'alpha':0.3},
                line_kws={'color':'k'});
    plt.xlabel('distance')
    plt.ylabel('label')
    ax=plt.subplot(1,2,2)
    sns.distplot(dftrain[(dftrain.label>=0)&(dftrain.distance>=0)]['distance'].dropna())
    plt.xlabel('distance')
    plt.show()
    fcols = 2
    frows = 1
    # discount_rate: the same pair of plots, excluding rows with label < 0 or
    # discount_rate < 0.
    plt.figure(figsize=(8,4))
    ax=plt.subplot(1,2,1)
    sns.regplot(x='discount_rate', y='label', data=dftrain[(dftrain.label>=0)&(dftrain.discount_rate>=0)][['discount_rate','label']], ax=ax, 
                scatter_kws={'marker':'.','s':3,'alpha':0.3},
                line_kws={'color':'k'});
    plt.xlabel('discount_rate')
    plt.ylabel('label')
    ax=plt.subplot(1,2,2)
    sns.distplot(dftrain[(dftrain.label>=0)&(dftrain.discount_rate>=0)]['discount_rate'].dropna())
    plt.xlabel('discount_rate')
    plt.show()
  • 相关阅读:
    [Linux]Linux常用命令: zip/unzip 压缩和解压缩命令
    [ORACLE] REHL7.5 下oracle 19.3 安装
    [ORACLE] oracle table export exp/imp
    [linux] linux信号
    [SAP HANA]SAP HANA 内存管理详解
    [SAP HANA]SAP HANA的系统限制
    [SAP HANA]SAP HANA的组件
    [SAP HANA] SAP HANA的架构
    [SAP HANA] 如何设定HANA数据库存的类型 生产/测试/开发/定制
    [SAP HANA] SAP HANA连接不上, 连接超限 保留管理员连接
  • 原文地址:https://www.cnblogs.com/Cookie-Jing/p/14714904.html
Copyright © 2011-2022 走看看