zoukankan      html  css  js  c++  java
  • 天池nlp新人赛_task1

    赛题解读:
    https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.6.6406111aIKCSLV&postId=118252
    赛题报名:
    https://tianchi.aliyun.com/competition/entrance/531810/introduction
    注意下提交时间和提交次数。

    第一阶段(7月15日-9月7日)每天提供2次的评测机会,提交后将进行实时评测;排行榜每小时更新,按照评测指标得分从高到低排序;(排行榜将选择选手在本阶段的历史最优成绩进行排名展示,不做最终排名计算)
    第二阶段(9月7日~9月8日)系统将在7日11:00提供新测试数据,并清空排行榜进行重新排名,参赛团队需要再次下载数据文件,每天提供2次的评测机会,提交后将进行实时评测;排行榜每小时更新

    因为是学习的心态,所以想把常用的方案都尝试一下。
    今天实现思路一,并分析其中的问题。
    思路一:TF-IDF + 机器学习分类器
    TF-IDF。TF表示词条在文本中出现的概率。一般会归一化。

    IDF是总文件数目除以包含该词语的文件的数目,再取对数。如果包含词条t的文档越少,则IDF越大,说明词条具有很好的类别区分能力。

    为了避免分母为0,分母+1。

    提取tf_idf代码:

    def tf_idf(contents):
        """Vectorize raw documents into TF-IDF features.

        contents: iterable of document strings.
        Returns a sparse matrix of TF-IDF weights (docs x vocabulary).
        """
        # min_df given as a float: drop terms present in fewer than this
        # fraction of documents (prunes extremely rare tokens).
        term_counts = CountVectorizer(min_df=1e-5).fit_transform(contents)
        return TfidfTransformer().fit_transform(term_counts)
    

    lgbm

    # LightGBM classifier on the TF-IDF features.
    import lightgbm as lgb
    from sklearn.model_selection import GridSearchCV

    # Hold out 20% of the training data for validation; random_state pins the split.
    X_train_, X_val_, y_train_, y_val_ = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=2020)

    # Candidate grid for hyper-parameter tuning — enable together with the
    # GridSearchCV lines further below.
    # cv_params = { 'n_estimators':[100,150,200,250,300],
    # 'num_leaves':[15,20,25,30,35,40,45,50],
    # 'max_depth':[3,4,5,6,7,8,9],
    # 'min_data_in_leaf':[18,19,20,21,22],
    # 'min_sum_hessian_in_leaf':[0.001,0.002],
    # 'feature_fraction':[0.6,0.7,0.8,0.9,1.0],
    # 'bagging_fraction':[0.6,0.7,0.8,0.9,1.0],
    # 'bagging_freq':[2,4,6,8,10],
    # 'lambda_l1':[1e-3,1e-2,0.0,0.1,0.2,0.3,0.4,0.5],
    # 'lambda_l2':[1e-3,1e-2,0.0,0.1,0.2,0.3,0.4,0.5],
    # 'learning_rate':[0.01,0.02,0.05,0.07,0.09,0.1,0.15,0.2]
    # }

    model = lgb.LGBMClassifier(
        boosting='gbdt',
        objective='multiclass',  # 'binary' for two classes, 'multiclass' here, 'regression' for regression
        num_class=14,
        metrics='multi_logloss',
        n_estimators=100,
        num_leaves=30,           # pair with max_depth: keep <= 2**max_depth or the tree overfits;
                                 # when tuning num_leaves alone, set max_depth=-1 (unlimited depth)
        max_depth=5,
        min_data_in_leaf=15,
        min_sum_hessian_in_leaf=0.005,  # BUG FIX: was misspelled 'min_sum_hession_in_leaf',
                                        # so LightGBM silently ignored the setting
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        lambda_l1=0.1,
        lambda_l2=0.1,
        learning_rate=0.1,
    )
    # optimized_lgb = GridSearchCV(estimator=model, param_grid=cv_params, scoring='f1', cv=3, verbose=20, n_jobs=-1)
    # optimized_lgb.fit(X_train, y_train)
    # y_test_preds = optimized_lgb.predict(X_val)  # BUG FIX: GridSearchCV has no 'predicted' method

    # best_model = optimized_lgb.best_estimator_
    # best_model.fit(X_train, y_train, val_set=None, eval_metric='f1', early_stopping_rounds=100)
    # print(best_model.feature_importances_)
    # best_params = optimized_lgb.best_params_
    # best_score = optimized_lgb.best_score_
    # y_test_preds = best_model.predict(X_test)
    model.fit(X_train, y_train)
    y_test_preds = model.predict(X_test)
    

    xgboost

    # xgb
    from xgboost import XGBClassifier
    class XGB():
        """Thin wrapper around XGBClassifier: train, evaluate, and grid-search."""

        def __init__(self, X_df, y_df):
            # Training features and labels, reused by train() and grid().
            self.X = X_df
            self.y = y_df

        def train(self, param):
            """Fit an XGBClassifier with `param` and print train accuracy / macro-F1."""
            self.model = XGBClassifier(**param)
            self.model.fit(self.X, self.y, eval_set=[(self.X, self.y)],
                           eval_metric=['mlogloss'],
                           early_stopping_rounds=10,  # stop after 10 rounds without improvement
                           verbose=True
                          )

            # Model evaluation on the training data.
            train_result = self.model.predict(self.X)
            train_acc = accuracy_score(self.y, train_result)
            # BUG FIX: f1_score expects predicted labels; the original passed
            # predict_proba output, which raises/mis-scores for multiclass.
            train_f1 = f1_score(self.y, train_result, average='macro')

            # Label fixed too: the second metric is macro-F1, not AUC.
            print("Train acc: %.2f%% Train f1: %.2f" % (train_acc*100.0, train_f1))

        def test(self, X_test, y_test):
            """Print accuracy and macro-F1 on a held-out set."""
            result = self.model.predict(X_test)
            acc = accuracy_score(y_test, result)
            # BUG FIX: score the hard predictions, not the probability matrix.
            f1 = f1_score(y_test, result, average='macro')

            print("acc: %.2f%% F1_score: %.2f%%" % (acc*100.0, f1))

        def grid(self, param_grid):
            """Grid-search over `param_grid`; print and return (best_params, best_score)."""
            self.param_grid = param_grid
            xgb_model = XGBClassifier(nthread=20)
            clf = GridSearchCV(xgb_model, self.param_grid, scoring='f1_macro', cv=2, verbose=1)
            clf.fit(self.X, self.y)
            print("Best score: %f using parms: %s" % (clf.best_score_, clf.best_params_))
            return clf.best_params_, clf.best_score_
        
        
    # Hyper-parameters for the XGBoost run (sklearn-API names).
    param = {'learning_rate': 0.05,   # xgb's "eta"
             'objective': 'multi:softmax',
             'n_jobs': 16,
             'n_estimators': 300,     # number of boosted trees
             'max_depth': 10,
             'gamma': 0.5,            # min loss reduction to split; larger => more conservative
             'reg_alpha': 0,          # L1 regularization on weights
             'reg_lambda': 2,         # L2 regularization on weights
             'min_child_weight': 1,   # minimum child weight per leaf
             'subsample': 0.8,        # sample 80% of rows when building each tree
             'random_state': 1,       # reproducibility seed
            }

    # 80/20 split, then train and evaluate through the XGB wrapper.
    X_train_, X_val_, y_train_, y_val_ = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=2020)
    model = XGB(X_train_, y_train_)
    model.train(param)
    model.test(X_val_, y_val_)
    

    贝叶斯

    # Multinomial Naive Bayes baseline; alpha is the additive-smoothing term.
    NB_model = MultinomialNB(alpha=0.01).fit(X_train, y_train)
    Y_val_preds = NB_model.predict(X_val)
    macro_f1 = f1_score(y_val, Y_val_preds, average='macro')
    print(macro_f1)
    

    LR

    # Logistic regression; LogisticRegressionCV tunes the regularization
    # strength internally with 5-fold cross-validation.
    from sklearn.linear_model import LogisticRegressionCV
    lr_model = LogisticRegressionCV(
        solver='newton-cg',
        multi_class='multinomial',
        cv=5,
        n_jobs=-1,
        verbose=True,
    ).fit(X_train, y_train)
    # Y_val_preds = lr_model.predict(X_val)
    # print(f1_score(y_val, Y_val_preds, average='macro'))
    

    SVM

    # Support-vector classifier with a linear kernel, scored by macro-F1.
    from sklearn.svm import SVC
    svm_model = SVC(kernel="linear", verbose=True).fit(X_train, y_train)
    Y_val_preds = svm_model.predict(X_val)
    macro_f1 = f1_score(y_val, Y_val_preds, average='macro')
    print(macro_f1)
    

    KNN

    # KNN: sweep the neighbourhood size and report macro-F1 for each value.
    from sklearn.neighbors import KNeighborsClassifier
    for n_neighbors in range(1, 15):
        knn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn_model.fit(X_train, y_train)

        Y_val_preds = knn_model.predict(X_val)
        # BUG FIX: the original formatted an undefined name 'n_neighbors'
        # (the loop variable was 'x'), raising NameError on the first iteration.
        print("n_neighbors = {}".format(n_neighbors), f1_score(y_val, Y_val_preds, average='macro'))
    

    TF-IDF优点是简单快速。
    缺点:

    • 没有考虑特征词的位置因素对文本的区分度
    • 一些生僻字出现得很少,往往会被误认为文档关键词
    • 简单的TF-IDF只考虑特征词与包含它的文本数之间的关系,忽略了特征项在不同类别间的分布情况。
  • 相关阅读:
    EF 配置(SqlServer,Mysql)
    mysql sql优化
    非root用户安装、配置mysql
    使用spring jdbc遇到的一个性能问题
    mac 修改 vim 配色
    logstash 监控日志文件时应对日志文件名改变的原理
    java Atomic compareAndSet部分原理分析
    实现进程单例的一些想法
    java String、String.concat和StringBuilder性能对比
    Elasticsearch 动态修改replica配置、增删replica
  • 原文地址:https://www.cnblogs.com/zuotongbin/p/13358216.html
Copyright © 2011-2022 走看看