zoukankan      html  css  js  c++  java
  • 数据挖掘实践(36):实战--高潜用户购买画像(五)模型设计

    5 模型设计

    #!/usr/bin/env python
    # -*- coding: UTF-8 -*-
    import sys
    import pandas as pd
    import numpy as np
    import xgboost as xgb
    from sklearn.model_selection import train_test_split
    import operator
    from matplotlib import pylab as plt
    from datetime import datetime
    import time
    from sklearn.model_selection import GridSearchCV
    # Load train_set.csv (path is relative to the current working directory) into a DataFrame.
    data = pd.read_csv('train_set.csv')
    # Notebook-style inspection: first rows, then the column index (its output is shown below).
    data.head()
    data.columns
    Index(['user_id', 'sku_id', 'cate', 'action_before_3_1.0_x',
           'action_before_3_2.0_x', 'action_before_3_3.0_x',
           'action_before_3_4.0_x', 'action_before_3_5.0_x',
           'action_before_3_6.0_x', 'action_before_3_1.0_y',
           ...
           'cate_action_5_mean', 'cate_action_6_mean', 'has_bad_comment',
           'bad_comment_rate', 'comment_num_0', 'comment_num_1', 'comment_num_2',
           'comment_num_3', 'comment_num_4', 'label'],
          dtype='object', length=251)
    # Boolean mask over the columns that is True only for the 'label' column.
    label_mask = data.columns == 'label'
    # Features are every column except 'label'; the target keeps only 'label'
    # (still as a one-column DataFrame, not a Series).
    data_x = data.loc[:, ~label_mask]
    data_y = data.loc[:, label_mask]
    data_x.head()
    # Hold out 20% of the rows; random_state is fixed so the split is reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        data_x, data_y, test_size=0.2, random_state=0)
    x_test.shape
    (2924, 250)
    # Number of hold-out rows used for validation; the remaining rows stay as the test set.
    n_val = 1500

    # .copy() gives every split its own data: the frames are mutated further down
    # (columns deleted, prediction columns added), and doing that on a bare iloc
    # slice acts on a view of the hold-out frame and triggers SettingWithCopyWarning.
    x_val = x_test.iloc[:n_val, :].copy()
    y_val = y_test.iloc[:n_val, :].copy()

    x_test = x_test.iloc[n_val:, :].copy()
    y_test = y_test.iloc[n_val:, :].copy()
    print(x_val.shape)
    print(x_test.shape)
    (1500, 250)
    (1424, 250)
    # Remove the identifier columns from the training and validation features so
    # they are not part of the matrices handed to xgboost below.
    del x_train['user_id']
    del x_train['sku_id']
    
    del x_val['user_id']
    del x_val['sku_id']
    
    x_train.head()
    # Wrap features and labels in xgboost's DMatrix container.
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dvalid = xgb.DMatrix(x_val, label=y_val)
    # Booster parameters. 'n_estimators' is read back below as the number of
    # boosting rounds.
    # NOTE(review): 'n_estimators' and 'silent' are also passed through to
    # xgb.train inside the parameter list; whether the installed xgboost version
    # accepts, ignores or warns about them should be confirmed.
    param = {'n_estimators': 4000, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0, 
                 'colsample_bytree': 0.8, 'scale_pos_weight':10, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic',
                 'eval_metric':'auc'}
    num_round = param['n_estimators']
    
    plst = param.items()
    # Both sets are evaluated every round. Per the xgboost documentation the last
    # entry of the list ('eval') is the one used for early stopping.
    evallist = [(dtrain, 'train'), (dvalid, 'eval')]
    # Train for at most num_round rounds, stopping early after 10 rounds without improvement.
    bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10)
    # Persist the trained booster to 'bst.model' in the working directory.
    bst.save_model('bst.model')
    print (bst.attributes())
    {'best_iteration': '198', 'best_msg': '[198]	train-auc:0.989114	eval-auc:0.97177', 'best_score': '0.97177'}
    def create_feature_map(features):
        """Write a feature-map file named ``xgb.fmap`` in the working directory.

        One tab-separated line is written per feature: its zero-based position
        in ``features``, the feature name, and the literal marker ``q``.
        Any existing ``xgb.fmap`` is overwritten.

        Args:
            features: iterable of feature names, in column order.
        """
        # The context manager closes the file even if a write fails.
        with open(r'xgb.fmap', 'w') as outfile:
            for i, feat in enumerate(features):
                outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    
    
    # Column names of the training frame, in order, become the feature-map entries.
    features = x_train.columns.tolist()
    create_feature_map(features)
    def feature_importance(bst_xgb):
        """Export normalised feature scores of a booster to a dated CSV file.

        The scores come from ``bst_xgb.get_fscore`` using the ``xgb.fmap``
        feature map, are sorted in descending order and divided by their sum.
        The table (columns ``feature`` and ``fscore``, plus the DataFrame
        index) is written to ``feature_importance_<MM-DD>.csv`` in the working
        directory, where ``<MM-DD>`` is today's month and day.

        Args:
            bst_xgb: object exposing ``get_fscore(fmap=...)`` that returns a
                mapping of feature name to score.

        Returns:
            The name of the CSV file that was written, so callers do not have
            to rebuild (or hard-code) the date-dependent name.
        """
        scores = bst_xgb.get_fscore(fmap=r'xgb.fmap')
        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

        df = pd.DataFrame(ranked, columns=['feature', 'fscore'])
        # Normalise so the scores sum to 1.
        df['fscore'] = df['fscore'] / df['fscore'].sum()
        file_name = 'feature_importance_' + datetime.now().strftime('%m-%d') + '.csv'
        df.to_csv(file_name)
        return file_name
    
    feature_importance(bst)
    # feature_importance names its output after today's month-day, so build the
    # name the same way here instead of hard-coding a single date (which only
    # works when the notebook is run on that day).
    fi_path = 'feature_importance_' + str(datetime.now().date())[5:] + '.csv'
    fi = pd.read_csv(fi_path)
    fi.sort_values("fscore", inplace=True, ascending=False)
    fi.head()

    x_test.head()
    # Keep a copy of the identifier columns; user_id and sku_id are attached
    # back onto x_test once predictions have been made.
    users = x_test[['user_id', 'sku_id', 'cate']].copy()
    # Remove the ids so the test features match the columns used for training.
    del x_test['user_id']
    del x_test['sku_id']
    x_test_DMatrix = xgb.DMatrix(x_test)
    # Predict with the trees up to the best early-stopping iteration.
    # NOTE(review): ntree_limit / best_ntree_limit depend on the xgboost
    # version — confirm they exist in the installed release.
    y_pred = bst.predict(x_test_DMatrix, ntree_limit=bst.best_ntree_limit)
    # Store the raw model output alongside the features (presumably a
    # probability, given objective 'binary:logistic'); it is turned into a
    # 0/1 label afterwards.
    x_test['pred_label'] = y_pred
    x_test.head()
    def label(column, threshold=0.5):
        """Binarise the ``'pred_label'`` entry of one record in place.

        Despite the parameter name, the function is applied row-wise
        (``DataFrame.apply(label, axis=1)``), so ``column`` is a single row.

        Args:
            column: mapping-like row (e.g. a pandas Series) with a
                ``'pred_label'`` score.
            threshold: scores strictly greater than this become 1, all others
                0. Defaults to 0.5.

        Returns:
            The same ``column`` object with ``'pred_label'`` replaced by 0 or 1.
        """
        column['pred_label'] = 1 if column['pred_label'] > threshold else 0
        return column
    # Turn every row's predicted score into a 0/1 label.
    x_test = x_test.apply(label, axis=1)
    x_test.head()
    # Attach the ground truth and the identifiers that were set aside earlier.
    x_test['true_label'] = y_test
    x_test.head()
    x_test['user_id'] = users['user_id']
    x_test['sku_id'] = users['sku_id']
    x_test.head()
    # All users who actually purchased
    bought_rows = x_test[x_test['true_label'] == 1]
    all_user_set = bought_rows['user_id'].unique()
    print(len(all_user_set))
    # All users predicted to purchase
    predicted_rows = x_test[x_test['pred_label'] == 1]
    all_user_test_set = predicted_rows['user_id'].unique()
    print(len(all_user_test_set))
    # Predicted (user, item) pairs encoded as "<user_id>-<sku_id>" strings
    predicted_pairs = predicted_rows['user_id'].map(str) + '-' + predicted_rows['sku_id'].map(str)
    all_user_test_item_pair = np.array(predicted_pairs)
    print(len(all_user_test_item_pair))
    126
    224
    243
    # User-level evaluation: how many predicted buyers really bought.
    # A set gives O(1) membership tests instead of scanning the array per user.
    actual_users = set(all_user_set)
    pos = sum(1 for user_id in all_user_test_set if user_id in actual_users)
    neg = len(all_user_test_set) - pos
    # Share of predicted buyers that are real buyers; 0.0 when nothing was
    # predicted positive, instead of raising ZeroDivisionError.
    all_user_acc = 1.0 * pos / (pos + neg) if (pos + neg) else 0.0
    # Share of real buyers that were predicted; 0.0 when there are no real buyers.
    all_user_recall = 1.0 * pos / len(all_user_set) if len(all_user_set) else 0.0
    print ('所有用户中预测购买用户的准确率为 ' + str(all_user_acc))
    print ('所有用户中预测购买用户的召回率' + str(all_user_recall))
    所有用户中预测购买用户的准确率为 0.5357142857142857
    所有用户中预测购买用户的召回率0.9523809523809523
    # All actual (user, item) purchase pairs, encoded as "<user_id>-<sku_id>" strings
    bought_rows = x_test[x_test['true_label'] == 1]
    all_user_item_pair = bought_rows['user_id'].map(str) + '-' + bought_rows['sku_id'].map(str)
    all_user_item_pair = np.array(all_user_item_pair)
    # A set gives O(1) membership tests instead of scanning the array per pair.
    actual_pairs = set(all_user_item_pair)
    pos = sum(1 for user_item_pair in all_user_test_item_pair if user_item_pair in actual_pairs)
    neg = len(all_user_test_item_pair) - pos
    # Guard the denominators so an empty prediction or truth set yields 0.0
    # rather than a ZeroDivisionError.
    all_item_acc = 1.0 * pos / (pos + neg) if (pos + neg) else 0.0
    all_item_recall = 1.0 * pos / len(all_user_item_pair) if len(all_user_item_pair) else 0.0
    print ('所有用户中预测购买商品的准确率为 ' + str(all_item_acc))
    print ('所有用户中预测购买商品的召回率' + str(all_item_recall))
    # Weighted F-scores at user level (F11) and item level (F12); each falls
    # back to 0.0 when its two inputs are both zero.
    f11_denominator = 5.0 * all_user_recall + all_user_acc
    f12_denominator = 2.0 * all_item_recall + 3 * all_item_acc
    F11 = 6.0 * all_user_recall * all_user_acc / f11_denominator if f11_denominator else 0.0
    F12 = 5.0 * all_item_acc * all_item_recall / f12_denominator if f12_denominator else 0.0
    # Final score: 40% user-level, 60% item-level.
    score = 0.4 * F11 + 0.6 * F12
    print ('F11=' + str(F11))
    print ('F12=' + str(F12))
    print ('score=' + str(score))
    所有用户中预测购买商品的准确率为 0.5679012345679012
    所有用户中预测购买商品的召回率0.9583333333333334
    F11=0.5778491171749598
    F12=0.7516339869281046
    score=0.6821200390268466
  • 相关阅读:
    Python 安装Twisted 提示python version 2.7 required,which was not found in the registry
    Openfire Strophe开发中文乱码问题
    css div 垂直居中
    How to create custom methods for use in spring security expression language annotations
    How to check “hasRole” in Java Code with Spring Security?
    Android 显示/隐藏 应用图标
    Android 当媒体变更后,通知其他应用重新扫描
    文件上传那些事儿
    专题:点滴Javascript
    主流动画实现方式总结
  • 原文地址:https://www.cnblogs.com/qiu-hua/p/14400909.html
Copyright © 2011-2022 走看看