zoukankan      html  css  js  c++  java
  • 通过移动设备行为数据预测性别年龄

    1. 通过行为习惯对移动用户人口属性(年龄+性别)进行预测。

    2. 数据集包含约20万用户的数据,分成12组,同时提供了用户行为属性,如:手机品牌、型号、APP的类型等。

    3. 使用对数损失(logloss)作为模型的评价指标。

    main.py

      1 # -*- coding: utf-8 -*-
      2 
      3 
      4 import pandas as pd
      5 import os
      6 from pd_tools import split_train_test, get_part_data
      7 import numpy as np
      8 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
      9 from sklearn.preprocessing import StandardScaler
     10 from sklearn.linear_model import LogisticRegression
     11 from sklearn import svm
     12 from sklearn.decomposition import PCA
     13 from ml_tools import get_best_model
     14 from sklearn.metrics import log_loss
     15 from sklearn.feature_selection import VarianceThreshold
     16 
     17 # Dataset variable declarations: the data directory and the CSV file names inside it
     18 dataset_path = './dataset'  # every file name below is joined onto this directory
     19 gender_age_filename = 'gender_age.csv'  # full labelled table, only read when is_first_run is True
     20 phone_brand_device_model_filename = 'phone_brand_device_model.csv'
     21 events_filename = 'events.csv'
     22 app_events_filename = 'app_events.csv'
     23 app_labels_filename = 'app_labels.csv'  # not loaded by run_main (the read is commented out)
     24 label_categories_filename = 'label_categories.csv'  # not loaded by run_main
     25 
     26 train_gender_age_filename = 'gender_age_train.csv'  # written by the split step, then read back
     27 test_gender_age_filename = 'gender_age_test.csv'  # written by the split step, then read back
     28 
     29 is_first_run = False  # set to True to (re)create the train/test CSV files before loading them
     30 
     31 
     def _one_hot_device_feature(device_info, source_col, code_col, df_train, df_test):
         """
             Label-encode ``device_info[source_col]`` and one-hot encode it for both splits.

             The integer codes are stored in ``code_col`` on ``device_info`` and then
             copied onto ``df_train`` / ``df_test`` (pandas aligns the assignment on
             the frames' indexes).  Both encoders are fitted on every row of
             ``device_info`` so the train and test matrices share one column layout.

             Returns the pair ``(train_matrix, test_matrix)`` produced by
             ``OneHotEncoder.transform``.
         """
         label_encoder = LabelEncoder()
         device_info[code_col] = label_encoder.fit_transform(device_info[source_col].values)
         df_train[code_col] = device_info[code_col]
         df_test[code_col] = device_info[code_col]

         onehot_encoder = OneHotEncoder()
         onehot_encoder.fit(device_info[code_col].values.reshape(-1, 1))
         tr_feat = onehot_encoder.transform(df_train[code_col].values.reshape(-1, 1))
         te_feat = onehot_encoder.transform(df_test[code_col].values.reshape(-1, 1))
         return tr_feat, te_feat


     def run_main():
         """
             Main function: build features, train two models and print their log loss.

             Steps:
               1. Only when ``is_first_run`` is true: split gender_age.csv into the
                  train/test CSV files with ``split_train_test``.
               2. Load the train/test labels, the phone brand/model table and the
                  event tables, keeping ``percent`` of every group for the experiment.
               3. Build one-hot brand and brand+model features plus two app-usage
                  counts, then standardise, variance-filter and PCA-reduce them.
               4. Encode the ``group`` column as integer class labels.
               5. Grid-search a logistic regression and an RBF-kernel SVM.
               6. Print the log loss of both models on the test split.
         """
         if is_first_run:
             # 1. Split the full labelled table into train and test sets
             print('分割数据集')
             all_gender_age = pd.read_csv(os.path.join(dataset_path, gender_age_filename))
             df_train, df_test = split_train_test(all_gender_age)
             # Show how many rows of each group ended up in each split
             print('训练集中各类的数据个数:', df_train.groupby('group').size())
             print('测试集中各类的数据个数:', df_test.groupby('group').size())

             # Persist the split so later runs can skip this step
             df_train.to_csv(os.path.join(dataset_path, train_gender_age_filename),
                             index=False)
             df_test.to_csv(os.path.join(dataset_path, test_gender_age_filename),
                            index=False)

         # 2. Load the data
         print('加载数据')
         gender_age_train = pd.read_csv(os.path.join(dataset_path, train_gender_age_filename),
                                        index_col='device_id')
         gender_age_test = pd.read_csv(os.path.join(dataset_path, test_gender_age_filename),
                                       index_col='device_id')

         # Only use a fraction of every group to keep the experiment fast
         percent = 0.1
         gender_age_train = get_part_data(gender_age_train, percent=percent)
         gender_age_test = get_part_data(gender_age_test, percent=percent)

         phone_brand_device_model = pd.read_csv(
             os.path.join(dataset_path, phone_brand_device_model_filename))
         # Keep one row per device so the device_id index is unique
         phone_brand_device_model = (
             phone_brand_device_model.drop_duplicates('device_id').set_index('device_id')
         )

         events = pd.read_csv(os.path.join(dataset_path, events_filename),
                              usecols=['device_id', 'event_id'], index_col='event_id')
         app_events = pd.read_csv(os.path.join(dataset_path, app_events_filename),
                                  usecols=['event_id', 'app_id'])
         # app_labels.csv / label_categories.csv are not used by the current features.

         # 3. Feature engineering
         # 3.1 Phone brand: label-encode, then one-hot encode
         tr_brand_feat, te_brand_feat = _one_hot_device_feature(
             phone_brand_device_model, 'phone_brand', 'brand_label_code',
             gender_age_train, gender_age_test)
         print('[手机品牌]特征维度:', tr_brand_feat.shape[1])

         # 3.2 Phone model: concatenate brand and model strings, since the
         # category is the (brand, model) combination, then encode as in 3.1
         phone_brand_device_model['brand_model'] = (
             phone_brand_device_model['phone_brand'].str.cat(
                 phone_brand_device_model['device_model'])
         )
         tr_model_feat, te_model_feat = _one_hot_device_feature(
             phone_brand_device_model, 'brand_model', 'brand_model_label_code',
             gender_age_train, gender_age_test)
         print('[手机型号]特征维度:', tr_model_feat.shape[1])

         # 3.3 App usage features: attach the device_id of each event to its app rows
         device_app = app_events.merge(events, how='left', left_on='event_id', right_index=True)
         apps_by_device = device_app['app_id'].groupby(device_app['device_id'])
         # Total number of app rows recorded per device
         n_run_s = apps_by_device.size()
         # Number of distinct apps per device
         n_app_s = apps_by_device.nunique()

         for df_part in (gender_age_train, gender_age_test):
             df_part['n_run'] = n_run_s
             df_part['n_app'] = n_app_s
             # Devices without any app event get 0.  The result is assigned back
             # rather than using fillna(inplace=True) on a column selection, which
             # is chained assignment and may only modify a temporary copy.
             df_part['n_run'] = df_part['n_run'].fillna(0)
             df_part['n_app'] = df_part['n_app'].fillna(0)

         tr_run_feat = gender_age_train['n_run'].values.reshape(-1, 1)
         tr_app_feat = gender_age_train['n_app'].values.reshape(-1, 1)

         te_run_feat = gender_age_test['n_run'].values.reshape(-1, 1)
         te_app_feat = gender_age_test['n_app'].values.reshape(-1, 1)

         # 3.4 Stack all features column-wise into one dense matrix per split
         tr_feat = np.hstack((tr_brand_feat.toarray(), tr_model_feat.toarray(),
                              tr_run_feat, tr_app_feat))
         te_feat = np.hstack((te_brand_feat.toarray(), te_model_feat.toarray(),
                              te_run_feat, te_app_feat))
         print('特征提取结束')
         print('每个样本特征维度:', tr_feat.shape[1])

         # 3.5 Feature scaling (statistics are fitted on the training split only)
         scaler = StandardScaler()
         tr_feat_scaled = scaler.fit_transform(tr_feat)
         te_feat_scaled = scaler.transform(te_feat)

         # 3.6 Feature selection
         # NOTE(review): after StandardScaler every non-constant column should
         # have variance ~1, so this threshold presumably only removes constant
         # columns - confirm whether selection was meant to run before scaling.
         sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
         tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled)
         te_feat_scaled_sel = sel.transform(te_feat_scaled)

         # 3.7 PCA dimensionality reduction, n_components=0.95
         pca = PCA(n_components=0.95)
         tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel)
         te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel)
         print('特征处理结束')
         print('处理后每个样本特征维度:', tr_feat_scaled_sel_pca.shape[1])

         # 4. Encode the group names as integer class labels
         group_label_encoder = LabelEncoder()
         group_label_encoder.fit(gender_age_train['group'].values)
         y_train = group_label_encoder.transform(gender_age_train['group'].values)
         y_test = group_label_encoder.transform(gender_age_test['group'].values)

         # 5. Train the models
         # 5.1 Logistic regression
         print('训练逻辑回归模型...')
         lr_param_grid = [
             {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}
         ]
         lr_model = LogisticRegression()
         best_lr_model = get_best_model(lr_model,
                                        tr_feat_scaled_sel_pca, y_train,
                                        lr_param_grid, cv=3)
         y_pred_lr = best_lr_model.predict_proba(te_feat_scaled_sel_pca)

         # 5.2 SVM
         print('训练SVM模型...')
         svm_param_grid = [
             {'C': [1e-2, 1e-1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
         ]

         # probability=True is required so the SVM can output class probabilities
         svm_model = svm.SVC(probability=True)
         best_svm_model = get_best_model(svm_model,
                                         tr_feat_scaled_sel_pca, y_train,
                                         svm_param_grid, cv=3)
         y_pred_svm = best_svm_model.predict_proba(te_feat_scaled_sel_pca)

         # 6. Report the results.  The probability columns follow each model's
         # classes_, so pass them explicitly: log_loss would otherwise raise when
         # the test subset happens to miss one of the training classes.
         print('逻辑回归模型 logloss:',
               log_loss(y_test, y_pred_lr, labels=best_lr_model.classes_))
         print('SVM logloss:',
               log_loss(y_test, y_pred_svm, labels=best_svm_model.classes_))
    199 
    200 if __name__ == '__main__':  # run the pipeline only when executed as a script
    201     run_main()

    ml_tools.py

     1 # -*- coding: utf-8 -*-
     2 
     3 from sklearn.model_selection import GridSearchCV
     4 
     5 
     def get_best_model(model, X_train, y_train, params, cv=5, scoring=None):
         """
             Pick the best hyper-parameters for ``model`` with a cross-validated grid search.

             model:   estimator instance handed to ``GridSearchCV``
             X_train: training feature matrix
             y_train: training labels
             params:  parameter grid (dict or list of dicts) to search
             cv:      number of cross-validation folds, 5 by default
             scoring: optional metric used to rank the candidates, e.g.
                      ``'neg_log_loss'`` when the final evaluation is log loss;
                      ``None`` (the default) keeps the estimator's own ``score``

             Returns the estimator refitted on the whole training set with the
             best parameter combination (``best_estimator_``).
         """
         clf = GridSearchCV(model, params, cv=cv, scoring=scoring)
         clf.fit(X_train, y_train)
         return clf.best_estimator_

    pd_tools.py

     1 # -*- coding: utf-8 -*-
     2 
     3 import pandas as pd
     4 import math
     5 
     6 
     def split_train_test(df_data, size=0.8):
         """
             Split ``df_data`` into a training and a test DataFrame, group by group.

             Each value of the ``group`` column is split on its own so that every
             class keeps the same train/test proportion.  For simplicity the first
             ``floor(n * size)`` rows of a group (in their original order) go to the
             training set and the remaining rows go to the test set; no shuffling
             is performed.

             df_data: DataFrame with a ``group`` column
             size:    fraction of every group assigned to the training set

             Returns ``(df_train, df_test)``.  Both have the same columns as
             ``df_data`` and a fresh 0..n-1 index; the old index is dropped so it
             does not leak into the result as extra ``index`` / ``level_0`` columns.
         """
         train_parts = []
         test_parts = []

         # unique() keeps the groups in order of first appearance
         for label in df_data['group'].unique():
             df_w_label = df_data[df_data['group'] == label]

             # Number of rows of this group that go to the training set
             split_line_no = math.floor(df_w_label.shape[0] * size)
             train_parts.append(df_w_label.iloc[:split_line_no, :])
             test_parts.append(df_w_label.iloc[split_line_no:, :])

         if not train_parts:
             # No groups at all: pd.concat would raise on an empty list
             return df_data.iloc[:0].copy(), df_data.iloc[:0].copy()

         # DataFrame.append was removed in pandas 2.0; concatenate once instead
         df_train = pd.concat(train_parts).reset_index(drop=True)
         df_test = pd.concat(test_parts).reset_index(drop=True)
         return df_train, df_test
    39 
    40 
    def get_part_data(df_data, percent=1):
        """
            Take the leading ``percent`` fraction of every group in ``df_data``.

            The rows are grouped by the ``group`` column; for each group the first
            ``floor(n * percent)`` rows are kept, in their original order and with
            their original index.  Groups appear in the result in the order
            ``groupby`` yields them.

            df_data: DataFrame with a ``group`` column
            percent: fraction of each group to keep, 1 (everything) by default

            Returns a new DataFrame with the same columns as ``df_data``.
        """
        parts = [
            group.iloc[:math.floor(group.shape[0] * percent), :]
            for _, group in df_data.groupby('group')
        ]
        if not parts:
            # No groups at all: pd.concat would raise on an empty list
            return df_data.iloc[:0].copy()

        # DataFrame.append was removed in pandas 2.0; concatenate once instead
        return pd.concat(parts)

    dataset下载地址
    链接:http://pan.baidu.com/s/1dE7D0bf
    密码:yapd
  • 相关阅读:
    奥运圣火在家乡传递
    Please stop reinventing the wheel (请不要重复发明轮子)
    使用IDispatch::Invoke函数在C++中调用C#实现的托管类库方法
    To invoke and to begin invoke, that is a question.
    XML和JSON(JavaScript Object Notation)
    Cloud Computing Is a Big Whiteboard
    TRIE Data Structure
    ASP.NET AJAX UpdatePanel 控件实现剖析
    分布式计算、网格计算和云计算
    系统架构设计师考试大纲(2009版)
  • 原文地址:https://www.cnblogs.com/chengchengaqin/p/9655717.html
Copyright © 2011-2022 走看看