zoukankan      html  css  js  c++  java
  • pandas 实现rfm模型

    import pandas as pd
    import numpy as np
    
    
    df = pd.read_csv('./zue_164466.csv')
    
    df['ptdate'] = pd.to_datetime(df['ptdate'],format='%Y-%m-%d')
    df['dateDiff'] = pd.to_datetime('today')-df['ptdate']
    df['dateDiff'] = df['dateDiff'].dt.days
    
    R_Agg = df.groupby(by=['user_email','product_name'])['dateDiff'].agg({'RecencyAgg': np.min})
    
    F_Agg = df.groupby(by=['user_email','product_name'])['ptdate'].agg({'FrequencyAgg': np.size})
    
    M_Agg = df.groupby(by=['user_email','product_name'])['totalcost'].agg({'MonetaryAgg': np.sum})
    
    aggData = R_Agg.join(F_Agg).join(M_Agg)
    
    bins = aggData.RecencyAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1],interpolation='nearest')
    bins[0] = 0
    labels = [5, 4, 3, 2, 1]
    R_S = pd.cut(aggData.RecencyAgg, bins, labels=labels)
    
    #
    bins = aggData.FrequencyAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1],interpolation='nearest')
    bins[0] = 0
    labels = [1, 2, 3, 4, 5]
    F_S = pd.cut(aggData.FrequencyAgg, bins, labels=labels)
    
    
    bins = aggData.MonetaryAgg.quantile(q=[0, 0.2, 0.4, 0.6, 0.8, 1],interpolation='nearest')
    bins[0] = 0
    labels = [1, 2, 3, 4, 5]
    M_S = pd.cut(aggData.MonetaryAgg,bins, labels=labels)
    
    
    aggData['R_S']=R_S
    aggData['F_S']=F_S
    aggData['M_S']=M_S
    
    
    aggData['RFM'] = 100*R_S.astype(int) + 10*F_S.astype(int) + 1*M_S.astype(int)
    
    
    bins = aggData.RFM.quantile(q=[0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1],interpolation='nearest')
    bins[0] = 0
    labels = [1, 2, 3, 4, 5, 6, 7, 8]
    aggData['level'] = pd.cut(aggData.RFM, bins, labels=labels)
    
    aggData = aggData.reset_index()
    
    fe = aggData.sort_values(['level', 'RFM'], ascending=[0, 0])
    
    dd=aggData.groupby(by=['level'])['user_email','product_name'].agg({'size':np.size})
    
    print(fe.head())
    fe.to_csv('./rfm_data.csv',index=False)
    print("---------------")
    print(dd)

    # -*- coding: utf-8 -*-
    
    '''
    描述:案例-基于RFM的用户价值度模型
    程序输入:sales.csv
    程序输出:RFM得分数据写本地文件sales_rfm_score.csv和数据表(sales_rfm_score)
    '''
    # 导入库
    import time  # 导入时间库
    import numpy as np  # 导入numpy库
    import pandas as pd  # 导入pandas库
    import mysql.connector  # 导入mysql连接库
    
    # 读取数据
    dtypes = {'ORDERDATE': object, 'ORDERID': object, 'AMOUNTINFO': np.float32}  # 设置每列数据类型
    raw_data = pd.read_csv('sales.csv', dtype=dtypes, index_col='USERID')  # 读取数据文件
    
    # 数据审查和校验
    # 数据概览
    print ('Data Overview:')
    print (raw_data.head(4))  # 打印原始数据前4条
    print ('-' * 30)
    print ('Data DESC:')
    print (raw_data.describe())  # 打印原始数据基本描述性信息
    print ('-' * 60)
    
    # 缺失值审查
    na_cols = raw_data.isnull().any(axis=0)  # 查看每一列是否具有缺失值
    print ('NA Cols:')
    print (na_cols)  # 查看具有缺失值的列
    print ('-' * 30)
    na_lines = raw_data.isnull().any(axis=1)  # 查看每一行是否具有缺失值
    print ('NA Recors:')
    print ('Total number of NA lines is: {0}'.format(na_lines.sum()))  # 查看具有缺失值的行总记录数
    print (raw_data[na_lines])  # 只查看具有缺失值的行信息
    print ('-' * 60)
    
    # 数据异常、格式转换和处理
    # 异常值处理
    sales_data = raw_data.dropna()  # 丢弃带有缺失值的行记录
    sales_data = sales_data[sales_data['AMOUNTINFO'] > 1]  # 丢弃订单金额<=1的记录
    
    # 日期格式转换
    sales_data['ORDERDATE'] = pd.to_datetime(sales_data['ORDERDATE'], format='%Y-%m-%d')  # 将字符串转换为日期格式
    print ('Raw Dtypes:')
    print (sales_data.dtypes)  # 打印输出数据框所有列的数据类型
    print ('-' * 60)
    
    # 数据转换
    recency_value = sales_data['ORDERDATE'].groupby(sales_data.index).max()  # 计算原始最近一次订单时间
    frequency_value = sales_data['ORDERDATE'].groupby(sales_data.index).count()  # 计算原始订单频率
    monetary_value = sales_data['AMOUNTINFO'].groupby(sales_data.index).sum()  # 计算原始订单总金额
    
    # 计算RFM得分
    # 分别计算R、F、M得分
    deadline_date = pd.datetime(2017, 01, 01)  # 指定一个时间节点,用于计算其他时间与该时间的距离
    r_interval = (deadline_date - recency_value).dt.days  # 计算R间隔
    r_score = pd.cut(r_interval, 5, labels=[5, 4, 3, 2, 1])  # 计算R得分
    f_score = pd.cut(frequency_value, 5, labels=[1, 2, 3, 4, 5])  # 计算F得分
    m_score = pd.cut(monetary_value, 5, labels=[1, 2, 3, 4, 5])  # 计算M得分
    
    # R、F、M数据合并
    rfm_list = [r_score, f_score, m_score]  # 将r、f、m三个维度组成列表
    rfm_cols = ['r_score', 'f_score', 'm_score']  # 设置r、f、m三个维度列名
    rfm_pd = pd.DataFrame(np.array(rfm_list).transpose(), dtype=np.int32, columns=rfm_cols,
                          index=frequency_value.index)  # 建立r、f、m数据框
    print ('RFM Score Overview:')
    print (rfm_pd.head(4))
    print ('-' * 60)
    
    # 计算RFM总得分
    # 方法一:加权得分
    rfm_pd['rfm_wscore'] = rfm_pd['r_score'] * 0.6 + rfm_pd['f_score'] * 0.3 + rfm_pd['m_score'] * 0.1
    # 方法二:RFM组合
    rfm_pd_tmp = rfm_pd.copy()
    rfm_pd_tmp['r_score'] = rfm_pd_tmp['r_score'].astype('string')
    rfm_pd_tmp['f_score'] = rfm_pd_tmp['f_score'].astype('string')
    rfm_pd_tmp['m_score'] = rfm_pd_tmp['m_score'].astype('string')
    rfm_pd['rfm_comb'] = rfm_pd_tmp['r_score'].str.cat(rfm_pd_tmp['f_score']).str.cat(rfm_pd_tmp['m_score'])
    
    # 打印输出和保存结果
    # 打印结果
    print ('Final RFM Scores Overview:')
    print (rfm_pd.head(4))  # 打印数据前4项结果
    print ('-' * 30)
    print ('Final RFM Scores DESC:')
    print (rfm_pd.describe())
    
    # 保存RFM得分到本地文件
    rfm_pd.to_csv('sales_rfm_score.csv')  # 保存数据为csv
    
    # 保存RFM得分到MySQL数据库
    # 设置要写库的数据库连接信息
    table_name = 'sales_rfm_score'  # 要写库的表名
    # 数据库基本信息
    config = {'host': '127.0.0.1',  # 默认127.0.0.1
              'user': 'root',  # 用户名
              'password': '123456',  # 密码
              'port': 3306,  # 端口,默认为3306
              'database': 'python_data',  # 数据库名称
              'charset': 'gb2312'  # 字符编码
              }
    con = mysql.connector.connect(**config)  # 建立mysql连接
    cursor = con.cursor()  # 获得游标
    # 查找数据库是否存在目标表,如果没有则新建
    cursor.execute("show tables")  #
    table_object = cursor.fetchall()  # 通过fetchall方法获得所有数据
    table_list = []  # 创建库列表
    for t in table_object:  # 循环读出所有库
        table_list.append(t[0])  # 每个每个库追加到列表
    if not table_name in table_list:  # 如果目标表没有创建
        cursor.execute('''
        CREATE TABLE %s (
        userid               VARCHAR(20),
        r_score               int(2),
        f_score              int(2),
        m_score              int(2),
        rfm_wscore              DECIMAL(10,2),
        rfm_comb              VARCHAR(10),
        insert_date              VARCHAR(20)
        )ENGINE=InnoDB DEFAULT CHARSET=gb2312
        ''' % table_name)  # 创建新表
    # 将数据写入数据库
    user_id = rfm_pd.index  # 索引列
    rfm_wscore = rfm_pd['rfm_wscore']  # RFM加权得分列
    rfm_comb = rfm_pd['rfm_comb']  # RFM组合得分列
    timestamp = time.strftime('%Y-%m-%d', time.localtime(time.time()))  # 写库日期
    print ('Begin to insert data into table {0}...'.format(table_name))  # 输出开始写库的提示信息
    for i in range(rfm_pd.shape[0]):  # 设置循环次数并依次循环
        insert_sql = "INSERT INTO `%s` VALUES ('%s',%s,%s,%s,%s,'%s','%s')" % 
                     (table_name, user_id[i], r_score.iloc[i], f_score.iloc[i], m_score.iloc[i], rfm_wscore.iloc[i],
                      rfm_comb.iloc[i], timestamp)  # 写库SQL依据
        cursor.execute(insert_sql)  # 执行SQL语句,execute函数里面要用双引号
        con.commit()  # 提交命令
    cursor.close()  # 关闭游标
    con.close()  # 关闭数据库连接
    print ('Finish inserting, total records is: %d' % (i + 1))  # 打印写库结果

     第三版

    import pandas as pd
    import numpy as np
    
    
    df = pd.read_excel('./zue_edu.xlsx',sheet_name='Q2未消费的',index_col='user_id')
    df['ptdate'] = pd.to_datetime(df['ptdate'],format='%Y-%m-%d')
    
    recency_value = df['ptdate'].groupby(df['product_name']).max()  # 计算原始最近一次订单时间
    r_interval = (pd.to_datetime('2019-05-06',format='%Y-%m-%d') - recency_value).dt.days  # 计算R间隔
    frequency_value = df['ptdate'].groupby([df['user_email'],df['product_name']]).count()  # 计算原始订单频率
    monetary_value = df['cost'].groupby(df['product_name']).sum()  # 计算原始订单总金额
    frequency_value = frequency_value.groupby('product_name').max()
    
    data_list = [recency_value,r_interval, frequency_value, monetary_value]
    data_pd = pd.DataFrame(np.array(data_list).transpose(), columns=['上次消费时间','已停投天数', '有消费天数', '消费金额'] ,
                          index=frequency_value.index)  # 建立原始数据框
    
    
    # 计算RFM得分
    # 分别计算R、F、M得分
    r_score = pd.cut(r_interval, 5, labels=[5, 4, 3, 2, 1])  # 计算R得分
    f_score = pd.cut(frequency_value, 5, labels=[1, 2, 3, 4, 5])  # 计算F得分
    m_score = pd.cut(monetary_value, 5, labels=[1, 2, 3, 4, 5])  # 计算M得分
    
    # R、F、M数据合并
    rfm_list = [r_score, f_score, m_score]  # 将r、f、m三个维度组成列表
    rfm_cols = ['r_score', 'f_score', 'm_score']  # 设置r、f、m三个维度列名
    rfm_pd = pd.DataFrame(np.array(rfm_list).transpose(), dtype=np.int32, columns=rfm_cols,
                          index=frequency_value.index)  # 建立r、f、m数据框
    
    # 计算RFM总得分
    # 方法一:加权得分
    rfm_pd['rfm_wscore'] = rfm_pd['r_score'] * 0.6 + rfm_pd['f_score'] * 0.3 + rfm_pd['m_score'] * 0.1
    
    # 方法二:RFM组合
    rfm_pd_tmp = rfm_pd.copy()
    rfm_pd_tmp['r_score'] = rfm_pd_tmp['r_score'].astype('str')
    rfm_pd_tmp['f_score'] = rfm_pd_tmp['f_score'].astype('str')
    rfm_pd_tmp['m_score'] = rfm_pd_tmp['m_score'].astype('str')
    rfm_pd['rfm_comb'] = rfm_pd_tmp['r_score'].str.cat(rfm_pd_tmp['f_score']).str.cat(rfm_pd_tmp['m_score'])
    
    rfm_pd['rfm_comb'] = rfm_pd['rfm_comb'].astype('int')
    
    rfm_pd['最近级别'] = rfm_pd['r_score'].replace({5:'',4:'',3:'一般',2:'',1:'非常差'})
    rfm_pd['频次'] = rfm_pd['f_score'].replace({5:'',4:'',3:'一般',2:'',1:'非常差'})
    rfm_pd['金额'] = rfm_pd['m_score'].replace({5:'',4:'',3:'一般',2:'',1:'非常差'})
    
    bins = rfm_pd.rfm_wscore.quantile(q=[0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1],interpolation='nearest')
    bins[0] = 0  #起点为0
    labels = [1, 2, 3, 4, 5, 6, 7, 8]
    rfm_pd['level'] = pd.cut(rfm_pd.rfm_wscore, bins, labels=labels)
    
    rfm_pd = rfm_pd.reset_index()
    data_pd = data_pd.reset_index()
    fe = rfm_pd.sort_values('rfm_comb', ascending=0)
    fina = fe.merge(data_pd,on='product_name')
    
    # 打印结果
    print(fina.head())
    fina.to_excel('./rfm_edu_Q2non.xlsx',index=False)
  • 相关阅读:
    LVM扩容根分区
    vmware vsphere client 创建虚拟机
    虚拟机三种磁盘置备方式
    【转】VMWare vCenter 6.0安装配置
    【转】在VMware中为Linux系统安装VM-Tools的详解教程
    【转】虚拟化(五):vsphere高可用群集与容错
    【转】虚拟化(四):vsphere高可用功能前提-共享存储搭建
    虚拟化(三) -vsphere套件的安装注意及使用
    虚拟化(二)-VMware workstation
    虚拟化(一) -VMware产品介绍
  • 原文地址:https://www.cnblogs.com/Erick-L/p/10755956.html
Copyright © 2011-2022 走看看