zoukankan      html  css  js  c++  java
  • 美团店铺评价语言处理以及分类(tfidf,SVM,决策树,随机森林,Knn,ensemble)

    import pandas as pd
    import numpy as np
    import  matplotlib.pyplot as  plt
    import time
    
    # Load the review spreadsheet and keep only the review text and its star score.
    wanted_columns = ["comment", "star"]
    df = pd.read_excel("all_data_meituan.xlsx")[wanted_columns]
    df.head()
    
    comment star
    0 还行吧,建议不要排队那个烤鸭和羊肉串,因为烤肉时间本来就不够,排那个要半小时,然后再回来吃烤... 40
    1 去过好几次了 东西还是老样子 没增添什么新花样 环境倒是挺不错 离我们这也挺近 味道还可以 ... 40
    2 一个字:好!!! #羊肉串# #五花肉# #牛舌# #很好吃# #鸡软骨# #拌菜# #抄河... 50
    3 第一次来吃,之前看过好多推荐说这个好吃,真的抱了好大希望,排队的人挺多的,想吃得趁早来啊。还... 20
    4 羊肉串真的不太好吃,那种说膻不膻说臭不臭的味。烤鸭还行,大虾没少吃,也就到那吃大虾了,吃完了... 30
    df.shape
    
    (17400, 2)
    
    # Binary label: a star score above 30 is positive (1), anything else negative (0).
    df['sentiment'] = df['star'].gt(30).astype('int64')
    # Remove repeated reviews (exact duplicate rows), then rows with missing values.
    df = df.drop_duplicates().dropna()
    
    # NOTE(review): every review is stacked three times here, *before* the
    # train/test split, so copies of the same comment land on both sides of the
    # split and the reported test accuracies are optimistic. The replication is
    # kept so the recorded outputs stay valid; if oversampling is really wanted,
    # replicate only the training portion after splitting.
    replication = 3
    # ignore_index=True yields a fresh 0..n-1 index. The previous bare
    # `X.reset_index` only referenced the method without calling it, so the
    # duplicated index labels were never actually reset.
    X = pd.concat([df[['comment']]] * replication, ignore_index=True)
    y = pd.concat([df.sentiment] * replication, ignore_index=True)
    X.shape
    
    (3138, 1)
    
    import jieba
    def chinese_word_cut(mytext):
        """Segment ``mytext`` with jieba and return the tokens joined by single spaces."""
        tokens = jieba.cut(mytext)
        return " ".join(tokens)
    X['cut_comment']=X["comment"].apply(chinese_word_cut)
    X['cut_comment'].head()
    
    Building prefix dict from the default dictionary ...
    Loading model from cache C:\Users\FRED-H~1\AppData\Local\Temp\jieba.cache
    Loading model cost 0.651 seconds.
    Prefix dict has been built succesfully.
    
    
    
    
    
    0    还行 吧 , 建议 不要 排队 那个 烤鸭 和 羊肉串 , 因为 烤肉 时间 本来 就 不够...
    1    去过 好 几次 了   东西 还是 老 样子   没 增添 什么 新花样   环境 倒 是 ...
    2    一个 字 : 好 ! ! !   # 羊肉串 #   # 五花肉 #   # 牛舌 #   ...
    3    第一次 来 吃 , 之前 看过 好多 推荐 说 这个 好吃 , 真的 抱 了 好 大 希望 ...
    4    羊肉串 真的 不太 好吃 , 那种 说 膻 不 膻 说 臭 不 臭 的 味 。 烤鸭 还 行...
    Name: cut_comment, dtype: object
    
    from sklearn.model_selection import  train_test_split
    X_train,X_test,y_train,y_test= train_test_split(X,y,random_state=42,test_size=0.25)
    
    def get_custom_stopwords(stop_words_file):
        """Read a stop-word list from a text file containing one word per line.

        Args:
            stop_words_file: Path of a UTF-8 encoded text file.

        Returns:
            list[str]: The stop words in file order, with surrounding
            whitespace stripped. Blank lines are skipped.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # "utf-8-sig" reads plain UTF-8 as well, but also discards a leading
        # byte-order mark; with plain "utf-8" the BOM would stay glued to the
        # first stop word, which would then never match a token.
        with open(stop_words_file, encoding="utf-8-sig") as f:
            stripped_lines = (line.strip() for line in f)
            # Skip blank lines so the list carries no empty-string entries.
            return [word for word in stripped_lines if word]
    
    stop_words_file = "stopwords.txt"
    stopwords = get_custom_stopwords(stop_words_file)
    stopwords[-10:]
    
    ['100', '01', '02', '03', '04', '05', '06', '07', '08', '09']
    
    from sklearn.feature_extraction.text import  CountVectorizer
    vect=CountVectorizer()
    vect
    
    CountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), preprocessor=None, stop_words=None,
            strip_accents=None, token_pattern='(?u)\b\w\w+\b',
            tokenizer=None, vocabulary=None)
    
    vect.fit_transform(X_train["cut_comment"])
    
    <2353x1965 sparse matrix of type '<class 'numpy.int64'>'
    	with 20491 stored elements in Compressed Sparse Row format>
    
    vect.fit_transform(X_train["cut_comment"]).toarray().shape
    
    (2353, 1965)
    
    # pd.DataFrame(vect.fit_transform(X_train["cut_comment"]).toarray(),columns=vect.get_feature_names()).iloc[:10,:22]
    # print(vect.get_feature_names())
    # #  Feature dimension is 1965 (see the shape above), which is not very large (no stop words applied yet)
    
    # The pattern must be a raw string: in a normal literal "\b" is the
    # backspace character, so the regex would never match a word boundary.
    # Tokens have to start with a non-digit word character, which keeps purely
    # numeric tokens out of the vocabulary.
    # Stop words are passed as a plain list: recent scikit-learn releases reject
    # other collection types, and older ones accept a list as well.
    vect = CountVectorizer(token_pattern=r'(?u)\b[^\d\W]\w+\b', stop_words=list(stopwords))
    term_counts = vect.fit_transform(X_train['cut_comment'])
    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn; use whichever this installation provides.
    feature_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    pd.DataFrame(term_counts.toarray(), columns=feature_names()).head()
    # The recorded run produced 1691 columns once digit-leading tokens and stop words were excluded.
    # max_df = 0.8 # drop terms that occur in more than this share of documents (too common).
    # min_df = 3 # drop terms that occur in fewer than this many documents (too rare).
    
    amazing happy ktv pm2 一万个 一个多 一个月 一串 一人 一件 ... 麻烦 麻酱 黄喉 黄桃 黄花鱼 黄金 黑乎乎 黑椒 黑胡椒 齐全
    0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
    1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
    2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
    3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
    4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

    5 rows × 1691 columns

    from sklearn.pipeline import make_pipeline
    from sklearn.svm import SVC
    from sklearn import  metrics
    # Baseline: an SVC with default hyper-parameters on raw term counts.
    # NOTE(review): the recorded confusion matrix for this run has an all-zero
    # first column, i.e. every test review was predicted as class 1.
    svc_cl=SVC()
    # Chain the vectorizer and the classifier so they are fit and applied together.
    pipe=make_pipeline(vect,svc_cl)
    pipe.fit(X_train.cut_comment, y_train)
    
    Pipeline(memory=None,
         steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), preprocessor=None,
            stop_words=...,
      max_iter=-1, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False))])
    
    y_pred = pipe.predict(X_test.cut_comment)
    metrics.accuracy_score(y_test,y_pred)
    
    0.6318471337579618
    
    metrics.confusion_matrix(y_test,y_pred)
    
    array([[  0, 289],
           [  0, 496]], dtype=int64)
    

    支持向量机分类

    from sklearn.svm import SVC
    # Same default-SVC pipeline as the baseline cell, fit and scored in one go.
    svc_cl=SVC() # instantiate
    pipe=make_pipeline(vect,svc_cl)
    pipe.fit(X_train.cut_comment, y_train)
    y_pred = pipe.predict(X_test.cut_comment)
    # Share of test reviews whose predicted label equals the true label.
    metrics.accuracy_score(y_test,y_pred)
    
    0.6318471337579618
    

    支持向量机 网格搜索

    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    from sklearn.pipeline import  Pipeline
    # svc=SVC(random_state=1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.feature_extraction.text import TfidfTransformer
    # Pipeline under search: term counts -> tf-idf weighting -> SVC.
    tfidf = TfidfTransformer()
    svc = SVC()
    pipeline_steps = [
        ("scl", vect),
        ('tfidf', tfidf),
        ("clf", svc),
    ]
    pipe_svc = Pipeline(pipeline_steps)

    # One logarithmic range is shared by C (linear kernel) and gamma (rbf kernel).
    para_range = [1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10, 100, 1000]
    linear_grid = {'clf__C': para_range, 'clf__kernel': ['linear']}
    rbf_grid = {'clf__gamma': para_range, 'clf__kernel': ['rbf']}
    para_grid = [linear_grid, rbf_grid]

    # 10-fold cross-validated search over both sub-grids, using all CPU cores.
    gs = GridSearchCV(
        estimator=pipe_svc,
        param_grid=para_grid,
        cv=10,
        n_jobs=-1,
    )

    gs.fit(X_train.cut_comment, y_train)
    
    GridSearchCV(cv=10, error_score='raise',
           estimator=Pipeline(memory=None,
         steps=[('scl', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), preprocessor=None,
            stop_words=frozenset({'...,
      max_iter=-1, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False))]),
           fit_params=None, iid=True, n_jobs=-1,
           param_grid=[{'clf__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000], 'clf__kernel': ['linear']}, {'clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100, 1000], 'clf__kernel': ['rbf']}],
           pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
           scoring=None, verbose=0)
    
    # NOTE(review): the recorded GridSearchCV repr shows refit=True, so gs.fit
    # has already refit best_estimator_ on the full training set; this call
    # repeats that fit on the same data.
    gs.best_estimator_.fit(X_train.cut_comment,y_train)
    
    Pipeline(memory=None,
         steps=[('scl', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), preprocessor=None,
            stop_words=frozenset({'...,
      max_iter=-1, probability=False, random_state=None, shrinking=True,
      tol=0.001, verbose=False))])
    
    y_pred = gs.best_estimator_.predict(X_test.cut_comment)
    metrics.accuracy_score(y_test,y_pred)
    
    0.9503184713375796
    

    近邻法(K 近邻,KNN)

    from sklearn.neighbors import  KNeighborsClassifier
    # k-nearest-neighbours with k=5; Minkowski distance with p=2 is the Euclidean distance.
    knn=KNeighborsClassifier(n_neighbors=5,p=2,metric='minkowski')
    # Raw term counts from `vect` are fed to the classifier directly (no tf-idf step).
    pipe=make_pipeline(vect,knn)
    pipe.fit(X_train.cut_comment, y_train)
    
    Pipeline(memory=None,
         steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
            dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
            lowercase=True, max_df=1.0, max_features=None, min_df=1,
            ngram_range=(1, 1), preprocessor=None,
            stop_words=...owski',
               metric_params=None, n_jobs=1, n_neighbors=5, p=2,
               weights='uniform'))])
    
    y_pred = pipe.predict(X_test.cut_comment)
    metrics.accuracy_score(y_test,y_pred)
    
    0.7070063694267515
    
    metrics.confusion_matrix(y_test,y_pred)
    
    array([[ 87, 202],
           [ 28, 468]], dtype=int64)
    

    决策树

    from sklearn.tree import DecisionTreeClassifier
    # Decision tree that splits on entropy; random_state is fixed for repeatable runs.
    tree=DecisionTreeClassifier(criterion='entropy',random_state=1)
    
    # Vectorize and classify in a single pipeline, then score on the held-out reviews.
    pipe=make_pipeline(vect,tree)
    pipe.fit(X_train.cut_comment, y_train)
    y_pred = pipe.predict(X_test.cut_comment)
    metrics.accuracy_score(y_test,y_pred)
    
    0.9388535031847134
    
    metrics.confusion_matrix(y_test,y_pred)
    
    array([[256,  33],
           [ 15, 481]], dtype=int64)
    

    随机森林

    
    from sklearn.ensemble import RandomForestClassifier
    # Random forest that splits on entropy, trained with two parallel jobs; fixed seed.
    forest=RandomForestClassifier(criterion='entropy',random_state=1,n_jobs=2)
    pipe=make_pipeline(vect,forest)
    pipe.fit(X_train.cut_comment, y_train)
    y_pred = pipe.predict(X_test.cut_comment)
    metrics.accuracy_score(y_test,y_pred)
    # Adding a tf-idf step actually lowered the accuracy from 96.5 to 95.0.
    
    0.9656050955414013
    
    metrics.confusion_matrix(y_test,y_pred)
    
    array([[265,  24],
           [  3, 493]], dtype=int64)
    

    bagging方法

    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    # Base learner: an entropy-based decision tree with a fixed seed.
    tree = DecisionTreeClassifier(criterion='entropy', random_state=1)
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, while the
    # first positional slot means the same thing in both.
    bag = BaggingClassifier(
        tree,
        n_estimators=10,           # ten bootstrap-trained trees
        max_samples=1.0,           # each tree draws as many samples as the training set
        max_features=1.0,          # each tree sees all features
        bootstrap=True,            # samples drawn with replacement
        bootstrap_features=False,  # features drawn without replacement
        n_jobs=1,
        random_state=1,
    )
    # Term counts -> tf-idf weighting -> bagged trees.
    pipe = make_pipeline(vect, tfidf, bag)
    pipe.fit(X_train.cut_comment, y_train)
    y_pred = pipe.predict(X_test.cut_comment)
    metrics.accuracy_score(y_test, y_pred)  # 93.2% without the tf-idf step; 95.5% with it
    
    0.9554140127388535
    
    metrics.confusion_matrix(y_test,y_pred)
    
    array([[260,  29],
           [  6, 490]], dtype=int64)
  • 相关阅读:
    2018牛客网暑期ACM多校训练营(第九场)A -Circulant Matrix(FWT)
    ZOJ
    BZOJ 4318 OSU!(概率DP)
    POJ
    POJ
    Linux安装及管理程序
    Linux目录及文件管理
    linux账号管理操作
    linux系统命令总结
    linux目录及文件管理操作
  • 原文地址:https://www.cnblogs.com/onemorepoint/p/9678446.html
Copyright © 2011-2022 走看看