zoukankan      html  css  js  c++  java
  • 优惠券预测——数据探索2

    # Delimiter used inside composite string fields such as "150:20"
    separator = ':'


    # Compute a discount rate so that "spend X, save Y" coupons and direct
    # discount coupons end up on one common scale.
    def get_discount_rate(s):
        """Convert a raw coupon description into a numeric discount rate.

        * ``'null'`` (typically a row where no coupon was used) yields the
          sentinel ``-1``; it is not mapped to a rate of 1.
        * A value without the separator is taken as the rate itself.
        * ``"full:reduction"`` yields ``1 - reduction / full``.
        """
        text = str(s)
        if text == 'null':
            return -1
        parts = text.split(separator)
        if len(parts) != 1:
            threshold, reduction = parts[0], parts[1]
            return 1.0 - float(reduction) / float(threshold)
        return float(parts[0])
    
    # Flag whether a coupon is a full-reduction promotion ("spend X, save Y")
    def get_if_fd(s):
        """Return 1 when the value contains the separator, otherwise 0."""
        piece_count = len(str(s).split(separator))
        return 0 if piece_count == 1 else 1
            
    # Extract the spending threshold of a full-reduction coupon
    def get_full_value(s):
        """Return the part before the separator as an int.

        Values without a separator (plain rates, ``'null'``) yield ``-1``.
        """
        parts = str(s).split(separator)
        if len(parts) != 1:
            return int(parts[0])
        return -1
            
    # Extract the amount taken off by a full-reduction coupon
    def get_reduction_value(s):
        """Return the part after the first separator as an int.

        Values without a separator (plain rates, ``'null'``) yield ``-1``.
        """
        parts = str(s).split(separator)
        if len(parts) != 1:
            return int(parts[1])
        return -1
    
    
    # Extract the month from a date value
    def get_month(s):
        """Return the month encoded in a ``YYYYMMDD`` date value.

        The value is converted with ``str()`` first, so integer dates are
        accepted as well as strings.

        Returns:
            ``-1`` when the value is ``'null'``, otherwise characters 4-5
            of the value parsed as an int.

        Raises:
            ValueError: if the value is neither ``'null'`` nor has digits at
                positions 4-5.
        """
        s = str(s)
        # Compare the whole value: indexing with s[0] only yields the first
        # character, which can never equal 'null'.
        if s == 'null':
            return -1
        return int(s[4:6])
    
    # Extract the day of the month from a date value
    def get_day(s):
        """Return the day of month encoded in a ``YYYYMMDD`` date value.

        The value is converted with ``str()`` first, so integer dates are
        accepted as well as strings.

        Returns:
            ``-1`` when the value is ``'null'``, otherwise characters 6-7
            of the value parsed as an int.

        Raises:
            ValueError: if the value is neither ``'null'`` nor has digits at
                positions 6-7.
        """
        s = str(s)
        # Compare the whole value: indexing with s[0] only yields the first
        # character, which can never equal 'null'.
        if s == 'null':
            return -1
        return int(s[6:8])
        
    # Day gap between two dates; the input is the string "Date:Date_received"
    def get_day_gap(s):
        """Return ``Date - Date_received`` in days.

        Both halves are expected in ``YYYYMMDD`` form. When either half is
        ``'null'`` the sentinel ``-1`` is returned.
        """
        fields = s.split(separator)
        if fields[0] == 'null' or fields[1] == 'null':
            return -1
        used, received = fields[0], fields[1]
        used_on = date(int(used[0:4]), int(used[4:6]), int(used[6:8]))
        received_on = date(int(received[0:4]), int(received[4:6]), int(received[6:8]))
        return (used_on - received_on).days
    
    # Derive the label; the input is the string "Date:Date_received"
    def get_label(s):
        """Map a ``"Date:Date_received"`` pair to a label.

        * ``Date`` is ``'null'``                      -> ``0``
        * ``Date_received`` is ``'null'``             -> ``-1``
        * ``Date - Date_received`` is at most 15 days -> ``1``
        * any larger gap                              -> ``-1``
        """
        fields = s.split(separator)
        if fields[0] == 'null':
            return 0
        if fields[1] == 'null':
            return -1
        used, received = fields[0], fields[1]
        used_on = date(int(used[0:4]), int(used[4:6]), int(used[6:8]))
        received_on = date(int(received[0:4]), int(received[4:6]), int(received[6:8]))
        gap_in_days = (used_on - received_on).days
        return 1 if gap_in_days <= 15 else -1
    def add_feature(df):
        """Add coupon-derived columns to ``df`` in place and return it.

        From the raw ``discount_rate`` column this creates ``if_fd``,
        ``full_value`` and ``reduction_value`` and then overwrites
        ``discount_rate`` with its numeric rate. ``distance`` is converted to
        int with ``'null'`` replaced by ``-1``.
        """
        # Keep a handle on the raw strings: the column is overwritten last.
        raw_discount = df['discount_rate']
        derived_columns = (
            ('if_fd', get_if_fd),
            ('full_value', get_full_value),
            ('reduction_value', get_reduction_value),
            ('discount_rate', get_discount_rate),
        )
        for column, extractor in derived_columns:
            df[column] = raw_discount.apply(extractor)

        df['distance'] = df['distance'].replace('null', -1).astype(int)
        # Month features (get_month on 'date_received' / 'date') are
        # currently not generated.
        return df
        
    def add_label(df):
        """Add ``day_gap`` and ``label`` columns to ``df`` in place and return it.

        The ``date`` and ``date_received`` columns are joined into one
        ``"Date<separator>Date_received"`` string per row, which is the input
        format ``get_day_gap`` and ``get_label`` parse.
        """
        # Join with the shared ``separator`` (not a literal ':') so this stays
        # in sync with the helpers, which split on ``separator``.
        date_pair = df['date'].astype('str') + separator + df['date_received'].astype('str')
        # 'day_gap' is assigned before 'label' to keep the column order.
        df['day_gap'] = date_pair.apply(get_day_gap)
        df['label'] = date_pair.apply(get_label)
        return df
    # Work on copies so the source frames do not have to be re-read from disk
    # while debugging.
    dftrain, dftest = off_train.copy(), off_test.copy()
    # Training data gets features and labels; test data gets features only.
    dftrain = add_label(add_feature(dftrain))
    dftest = add_feature(dftest)
    # Data analysis
    # NOTE(review): the bare expressions below only show a result in an
    # interactive session / notebook cell.
    dftrain.head()

    dftrain.describe()

    # Relative frequency of each distance value, ignoring the -1 placeholder
    train_distance = dftrain[dftrain.distance >= 0]['distance']
    train_distance.value_counts() / train_distance.count()

    test_distance = dftest[dftest.distance >= 0]['distance']
    test_distance.value_counts() / test_distance.count()

    # Same ratio, restricted to training rows whose label is 0 or 1
    labeled_distance = dftrain[(dftrain.label >= 0) & (dftrain.distance >= 0)]['distance']
    labeled_distance.value_counts() / labeled_distance.count()
    print('Offline 训练集满减情况')
    dftrain.if_fd.value_counts() / dftrain.if_fd.count()
    print('测试集满减情况')
    dftest.if_fd.value_counts() / dftest.if_fd.count()
    # Box plots of the distributions, one figure per feature
    for column in ('distance', 'discount_rate'):
        # Labeled training rows with a non-negative value for this feature
        values = dftrain[(dftrain.label >= 0) & (dftrain[column] >= 0)][column]
        fig = plt.figure(figsize=(4, 6))  # width and height of the figure
        sns.boxplot(values, orient="v", width=0.5)
    # Histogram (with fitted normal curve) and Q-Q plot, side by side,
    # one figure per feature
    for column in ('distance', 'discount_rate'):
        # Labeled training rows with a non-negative value for this feature
        values = dftrain[(dftrain.label >= 0) & (dftrain[column] >= 0)][column]
        plt.figure(figsize=(10, 5))

        ax = plt.subplot(1, 2, 1)
        sns.distplot(values, fit=stats.norm)
        ax = plt.subplot(1, 2, 2)
        res = stats.probplot(values, plot=plt)
    # Compare the train and test distribution of each feature.
    # Each comparison opens its own figure; without that, every KDE curve is
    # drawn onto whatever axes happen to be current (the preceding Q-Q
    # subplot), so the four comparisons pile up and overwrite each other's
    # x-label.
    for feature in ('discount_rate', 'distance', 'full_value', 'reduction_value'):
        # Train: labeled rows only; both sides drop negative placeholder values.
        train_values = dftrain[(dftrain.label >= 0) & (dftrain[feature] >= 0)][feature]
        test_values = dftest[dftest[feature] >= 0][feature]

        plt.figure()
        ax = sns.kdeplot(train_values, color="Red", shade=True)
        ax = sns.kdeplot(test_values, color="Blue", shade=True)
        ax.set_xlabel(feature)
        ax.set_ylabel("Frequency")
        ax.legend(["train", "test"])
    # Visualise the linear relationship between each feature and the label:
    # regression plot on the left, feature distribution on the right.
    fcols = 2
    frows = 1
    for column in ('distance', 'discount_rate'):
        # Labeled training rows with a non-negative value for this feature
        subset = dftrain[(dftrain.label >= 0) & (dftrain[column] >= 0)]

        plt.figure(figsize=(8, 4))
        ax = plt.subplot(1, 2, 1)
        sns.regplot(
            x=column,
            y='label',
            data=subset[[column, 'label']],
            ax=ax,
            scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
            line_kws={'color': 'k'},
        )
        plt.xlabel(column)
        plt.ylabel('label')

        ax = plt.subplot(1, 2, 2)
        sns.distplot(subset[column].dropna())
        plt.xlabel(column)
        plt.show()
  • 相关阅读:
    xls与csv文件的区别
    青音,经典爱情语录
    win7用户账户自动登录方法汇总
    How to using Procedure found Lead Blocker
    FTS(3) BSD 库函数手册 遍历文件夹(二)
    FTS(3) BSD 库函数手册 遍历文件夹(一)
    DisplayMetrics类 获取手机显示屏的基本信息 包括尺寸、密度、字体缩放等信息
    About App Distribution 关于应用发布
    FTS(3) 遍历文件夹实例
    OpenCV 2.1.0 with Visual Studio 2008
  • 原文地址:https://www.cnblogs.com/Cookie-Jing/p/14714904.html
Copyright © 2011-2022 走看看