zoukankan      html  css  js  c++  java
  • 文本情感分析

     1 import pandas as pd
     2 import numpy as np
     3 
     4 # 数据读取
     5 df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
     6 
     7 # Sample the data to speed up computation
     8 # Comment out this line to match with lecture
     9 df = df.sample(frac=0.1, random_state=10)
    10 
    11 df.head()

     1 # Drop missing values
     2 df.dropna(inplace=True)
     3 
     4 # Remove any 'neutral' ratings equal to 3
     5 df = df[df['Rating'] != 3]
     6 
     7 # Encode 4s and 5s as 1 (rated positively)
     8 # Encode 1s and 2s as 0 (rated poorly)
     9 df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
    10 df.head(10)

    1 # Most ratings are positive
    2 df['Positively Rated'].mean()
    0.7471776686078667

    1 from sklearn.model_selection import train_test_split
    2 
    3 # Split data into training and test sets
    4 X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
    5                                                     df['Positively Rated'], 
    6                                                     random_state=0)
    7 print('X_train first entry:\n\n', X_train.iloc[0])
    8 print('\n\nX_train shape: ', X_train.shape)
    X_train first entry:
    
     Everything about it is awesome!
    
    
    X_train shape:  (23052,)

    CountVectorizer

    1 from sklearn.feature_extraction.text import CountVectorizer
    2 
    3 # Fit the CountVectorizer to the training data
    4 vect = CountVectorizer().fit(X_train)
    5 vect.get_feature_names()[::2000]
    ['00',
     'arroja',
     'comapañias',
     'dvds',
     'golden',
     'lands',
     'oil',
     'razonable',
     'smallsliver',
     'tweak']
    1 len(vect.get_feature_names())
    19601

    1 # transform the documents in the training data to a document-term matrix
    2 X_train_vectorized = vect.transform(X_train)
    3 
    4 X_train_vectorized
    <23052x19601 sparse matrix of type '<class 'numpy.int64'>'
    	with 613289 stored elements in Compressed Sparse Row format>

    1 from sklearn.linear_model import LogisticRegression
    2 
    3 # Train the model
    4 model = LogisticRegression()
    5 model.fit(X_train_vectorized, y_train)
    LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
              intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
              penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
              verbose=0, warm_start=False)

    1 from sklearn.metrics import roc_auc_score
    2 
    3 # Predict the transformed test documents
    4 predictions = model.predict(vect.transform(X_test))
    5 
    6 print('AUC: ', roc_auc_score(y_test, predictions))
    AUC:  0.897433277667

     1 # get the feature names as numpy array
     2 feature_names = np.array(vect.get_feature_names())
     3 
     4 # Sort the coefficients from the model
     5 sorted_coef_index = model.coef_[0].argsort()
     6 
     7 # Find the 10 smallest and 10 largest coefficients
     8 # The 10 largest coefficients are being indexed using [:-11:-1] 
     9 # so the list returned is in order of largest to smallest
    10 print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
    11 print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
    Smallest Coefs:
    ['worst' 'terrible' 'slow' 'junk' 'poor' 'sucks' 'horrible' 'useless'
     'waste' 'disappointed']
    
    Largest Coefs: 
    ['excelent' 'excelente' 'excellent' 'perfectly' 'love' 'perfect' 'exactly'
     'great' 'best' 'awesome']


    TF-IDF
    1 from sklearn.feature_extraction.text import TfidfVectorizer
    2 
    3 # Fit the TfidfVectorizer to the training data specifying a minimum document frequency of 5
    4 vect = TfidfVectorizer(min_df=5).fit(X_train)
    5 len(vect.get_feature_names())
    5442

    1 X_train_vectorized = vect.transform(X_train)
    2 
    3 model = LogisticRegression()
    4 model.fit(X_train_vectorized, y_train)
    5 
    6 predictions = model.predict(vect.transform(X_test))
    7 
    8 print('AUC: ', roc_auc_score(y_test, predictions))
    AUC:  0.889951006492
    
     
    1 feature_names = np.array(vect.get_feature_names())
    2 
    3 sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()
    4 
    5 print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
    6 print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))
    Smallest tfidf:
    ['61' 'printer' 'approach' 'adjustment' 'consequences' 'length' 'emailing'
     'degrees' 'handsfree' 'chipset']
    
    Largest tfidf: 
    ['unlocked' 'handy' 'useless' 'cheat' 'up' 'original' 'exelent' 'exelente'
     'exellent' 'satisfied']

    1 sorted_coef_index = model.coef_[0].argsort()
    2 
    3 print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
    4 print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
    Smallest Coefs:
    ['not' 'slow' 'disappointed' 'worst' 'terrible' 'never' 'return' 'doesn'
     'horrible' 'waste']
    
    Largest Coefs: 
    ['great' 'love' 'excellent' 'good' 'best' 'perfect' 'price' 'awesome' 'far'
     'perfectly']
    
     
    1 # These reviews are treated the same by our current model
    2 print(model.predict(vect.transform(['not an issue, phone is working',
    3                                     'an issue, phone is not working'])))
    [0 0]


    n-grams(语音识别中常用模型)

    1 # Fit the CountVectorizer to the training data specifying a minimum 
    2 # document frequency of 5 and extracting 1-grams and 2-grams
    3 vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
    4 
    5 X_train_vectorized = vect.transform(X_train)
    6 
    7 len(vect.get_feature_names())
    29072

    1 model = LogisticRegression()
    2 model.fit(X_train_vectorized, y_train)
    3 
    4 predictions = model.predict(vect.transform(X_test))
    5 
    6 print('AUC: ', roc_auc_score(y_test, predictions))
    AUC:  0.91106617946

    1 feature_names = np.array(vect.get_feature_names())
    2 
    3 sorted_coef_index = model.coef_[0].argsort()
    4 
    5 print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
    6 print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))
    Smallest Coefs:
    ['no good' 'junk' 'poor' 'slow' 'worst' 'broken' 'not good' 'terrible'
     'defective' 'horrible']
    
    Largest Coefs: 
    ['excellent' 'excelente' 'excelent' 'perfect' 'great' 'love' 'awesome'
     'no problems' 'good' 'best']

    1 # These reviews are now correctly identified
    2 print(model.predict(vect.transform(['not an issue, phone is working',
    3                                     'an issue, phone is not working'])))
    [1 0]
     


  • 相关阅读:
    HTTP Continuation or nonHTTP traffic 数据包
    linuxTcp IP协议栈源码阅读笔记(转)
    使用Windows命令行启动服务
    数据库集群
    ShellExecute
    oracle 中数据库完全导入导出:cmd命令行模式
    理解ORACLE数据库字符集
    asp.net 编码设置
    ShellExecute与ShellExecuteEx的用法
    C++用位运算实现循环移位
  • 原文地址:https://www.cnblogs.com/zhengzhe/p/8573202.html
Copyright © 2011-2022 走看看