"""Train and evaluate a naive Bayes sentiment classifier on Chinese reviews.

Pipeline: load the labelled reviews from ``./data.csv``, encode the labels
as integers, segment every review with jieba, build bag-of-words counts
(minus the stop words listed in ``./stopwords.txt``), fit ``MultinomialNB``
on the first ``TRAIN_SIZE`` rows and report predictions and accuracy on the
remaining rows.
"""
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Number of leading rows used for training; every later row is test data.
TRAIN_SIZE = 10

# Integer codes for the textual labels: positive review -> 0, negative -> 1.
LABEL_CODES = {'好评': 0, '差评': 1}

# Load the data.
# 'ansi' is a Windows-only codec alias, so it raises LookupError elsewhere.
# NOTE(review): 'gbk' assumes the CSV was saved on a Simplified-Chinese
# Windows system (ANSI code page 936) -- confirm against the real file.
data = pd.read_csv("./data.csv", encoding='gbk')
print("data: ", data)
print("data 的列名: ", data.columns)

# Convert the target column to integers. Values without an entry in
# LABEL_CODES are passed through unchanged, so astype() still raises on
# anything that is not integer-like. The column is assigned directly
# (not through ``data.loc[:, col] = ...``) because pandas >= 2.0 performs
# that form in place and would keep the object dtype, which in turn makes
# scikit-learn reject the labels.
data["评价"] = data["评价"].map(lambda label: LABEL_CODES.get(label, label)).astype('int')

# Segment each review with jieba (accurate mode) and join the tokens with
# commas so CountVectorizer can split them again.
# NOTE(review): the column key '内容 ' carries a trailing space -- presumably
# that matches the CSV header; verify against the printed column names.
content_list = [
    ",".join(jieba.cut(text, cut_all=False))
    for text in data.loc[:, '内容 ']
]
print(content_list)

# Load the stop words: one per line, de-duplicated, blank lines ignored.
with open("./stopwords.txt", encoding='utf-8') as f:
    stop_words = list({line.strip() for line in f if line.strip()})

# Count token occurrences, ignoring the stop words.
con_vec = CountVectorizer(stop_words=stop_words)
X = con_vec.fit_transform(content_list)
feature = X.toarray()

# Vocabulary learned by the vectorizer. get_feature_names() was removed in
# scikit-learn 1.2; fall back to it only on releases that predate
# get_feature_names_out().
if hasattr(con_vec, "get_feature_names_out"):
    names = con_vec.get_feature_names_out()
else:
    names = con_vec.get_feature_names()

# Combine features and target into one matrix: every column except the
# last is a feature, the last column is the target.
new_data = np.concatenate(
    (feature, data.loc[:, '评价'].values.reshape((-1, 1))),
    axis=1,
)
print(new_data)
print(new_data.shape)

# Fail early with a clear message instead of an opaque scikit-learn error
# when there are no rows left to evaluate on.
if new_data.shape[0] <= TRAIN_SIZE:
    raise ValueError(
        "need more than %d rows to leave a non-empty test set, got %d"
        % (TRAIN_SIZE, new_data.shape[0])
    )

# Split into training and test sets.
train_data = new_data[:TRAIN_SIZE, :]
test_data = new_data[TRAIN_SIZE:, :]

# Split each set into features and target.
x_train = train_data[:, :-1]
y_train = train_data[:, -1]
x_test = test_data[:, :-1]
y_test = test_data[:, -1]

# Multinomial naive Bayes; alpha is the Laplace smoothing coefficient.
nb = MultinomialNB(alpha=1.0)
# Fit on the training rows.
nb.fit(x_train, y_train)
# Predict the held-out rows.
y_predict = nb.predict(x_test)
# Accuracy on the held-out rows.
score = nb.score(x_test, y_test)

print("*" * 80)
print("y_predict : ", y_predict)
print("准确率: ", score)