"""Naive Bayes text classification helpers.

Contains a toy "abusive post" example, a spam e-mail hold-out test and an
RSS-feed comparison.  Documents are represented as word-presence or
word-count vectors over a vocabulary list.
"""
# NOTE: the star import is kept only so that code relying on the NumPy names
# this module has always re-exported keeps working.  Everything below uses the
# explicit ``np.`` prefix, because the star import shadows builtins (``sum``,
# and on recent NumPy releases also ``min``/``max``).
from numpy import *  # noqa: F401,F403
import operator
import re
from collections import Counter

import numpy as np

try:
    # Not referenced by any function in this module; callers use it to fetch
    # the feeds handed to localWords()/getTopwords().  Optional so the rest of
    # the module stays importable when it is not installed.
    import feedparser  # noqa: F401
except ImportError:
    feedparser = None

# Splits text on runs of non-word characters.
_TOKEN_SPLIT_RE = re.compile(r'\W+')


def loadDataSet():
    """Return the toy corpus.

    :return: ``(posts, labels)`` where ``posts`` is a list of tokenised posts
        and ``labels[i]`` is 1 for an abusive post and 0 otherwise.
    """
    posting_list = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    class_vec = [0, 1, 0, 1, 0, 1]
    return posting_list, class_vec


def createVocabList(data_set):
    """Return a list of the distinct words found in all documents.

    :param data_set: iterable of documents, each an iterable of words
    :return: list of unique words (order is unspecified)
    """
    return list(set().union(*data_set))


def _first_index_map(vocab_list):
    """Map each word to the index of its first occurrence in ``vocab_list``."""
    index_of = {}
    for position, word in enumerate(vocab_list):
        # setdefault keeps the first index, matching list.index() semantics
        index_of.setdefault(word, position)
    return index_of


def setOfWords2Vec(vocab_list, input_set):
    """Build a word-presence (0/1) vector for one document.

    Words that are not in the vocabulary are reported on stdout and skipped.

    :param vocab_list: vocabulary; position ``i`` of the result refers to
        ``vocab_list[i]``
    :param input_set: words of the document
    :return: list of 0/1 flags with ``len(vocab_list)`` entries
    """
    index_of = _first_index_map(vocab_list)
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        position = index_of.get(word)
        if position is None:
            print('the word : %s is not in my vocabulary!' % word)
        else:
            return_vec[position] = 1
    return return_vec


def trainNB0(train_matrix, train_category):
    """Train the Naive Bayes model.

    Word counts start at 1 and the per-class totals at 2.0, so a word never
    seen in a class does not get probability zero; log-probabilities are
    returned so they can be summed by classifyNB().

    :param train_matrix: document matrix, one row per document and one column
        per vocabulary word
    :param train_category: label per document; 1 marks class 1, any other
        value is counted as class 0
    :return: ``(p0_vect, p1_vect, p_abusive)`` - per-word log-probabilities
        for class 0 and class 1, and the fraction of class-1 documents
    :raises ValueError: if ``train_matrix`` contains no documents
    """
    train_matrix = np.asarray(train_matrix)
    train_category = np.asarray(train_category)
    num_train_docs = len(train_matrix)
    if num_train_docs == 0:
        raise ValueError('train_matrix must contain at least one document')

    p_abusive = np.sum(train_category) / float(num_train_docs)

    is_class1 = train_category == 1
    class1_docs = train_matrix[is_class1]
    class0_docs = train_matrix[~is_class1]

    p1_num = 1.0 + class1_docs.sum(axis=0)
    p1_denom = 2.0 + class1_docs.sum()
    p0_num = 1.0 + class0_docs.sum(axis=0)
    p0_denom = 2.0 + class0_docs.sum()

    p1_vect = np.log(p1_num / p1_denom)
    p0_vect = np.log(p0_num / p0_denom)
    return p0_vect, p1_vect, p_abusive


def classifyNB(vec2_classify, p0_vec, p1_vec, p_class1):
    """Classify one document vector.

    :param vec2_classify: word vector of the document
    :param p0_vec: per-word log-probabilities for class 0
    :param p1_vec: per-word log-probabilities for class 1
    :param p_class1: prior probability of class 1
    :return: 1 if the class-1 log-score is strictly larger, else 0
    """
    p1 = np.sum(vec2_classify * p1_vec) + np.log(p_class1)
    p0 = np.sum(vec2_classify * p0_vec) + np.log(1 - p_class1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Train on the toy corpus and print the class of two sample posts."""
    posts, labels = loadDataSet()
    vocab = createVocabList(posts)
    train_mat = [setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = trainNB0(np.array(train_mat), np.array(labels))

    for test_entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        this_doc = np.array(setOfWords2Vec(vocab, test_entry))
        print(test_entry, 'classified as:', classifyNB(this_doc, p0V, p1V, pAb))


def bagOfWords2Vec(vocab_list, input_set):
    """Build a word-count vector for one document.

    Words that are not in the vocabulary are ignored.

    :param vocab_list: vocabulary; position ``i`` of the result refers to
        ``vocab_list[i]``
    :param input_set: words of the document
    :return: list of occurrence counts with ``len(vocab_list)`` entries
    """
    index_of = _first_index_map(vocab_list)
    return_vec = [0] * len(vocab_list)
    for word in input_set:
        position = index_of.get(word)
        if position is not None:
            return_vec[position] += 1
    return return_vec


def textParse(big_string):
    """Split text into lower-case tokens.

    The text is split on runs of non-word characters and tokens of length 1
    or less are dropped.

    :param big_string: raw text
    :return: list of lower-cased token strings
    """
    return [tok.lower() for tok in _TOKEN_SPLIT_RE.split(big_string)
            if len(tok) > 1]


def _read_tokens(path):
    """Read the text file at ``path`` and return its tokens."""
    with open(path) as handle:
        return textParse(handle.read())


def _split_train_test(num_docs, num_test):
    """Randomly split ``range(num_docs)`` into (training, test) index lists."""
    if not 0 < num_test < num_docs:
        raise ValueError(
            'need 0 < num_test < number of documents, got num_test=%d for '
            '%d documents' % (num_test, num_docs))
    training_set = list(range(num_docs))
    test_set = []
    for _ in range(num_test):
        # int(uniform(...)) rather than randint so that seeded runs draw the
        # same indices as earlier versions of this module.
        rand_index = int(np.random.uniform(0, len(training_set)))
        # pop() removes the chosen index so a document is never picked twice
        test_set.append(training_set.pop(rand_index))
    return training_set, test_set


def _train_and_evaluate(vocab_list, doc_list, class_list, num_test):
    """Hold out ``num_test`` random documents, train on the rest and test.

    Returns ``(p0_vec, p1_vec, error_rate)``.
    """
    training_set, test_set = _split_train_test(len(doc_list), num_test)
    train_mat = [bagOfWords2Vec(vocab_list, doc_list[i]) for i in training_set]
    train_classes = [class_list[i] for i in training_set]
    p0_vec, p1_vec, p_class1 = trainNB0(np.array(train_mat),
                                        np.array(train_classes))
    error_count = 0
    for doc_index in test_set:
        word_vector = np.array(bagOfWords2Vec(vocab_list, doc_list[doc_index]))
        predicted = classifyNB(word_vector, p0_vec, p1_vec, p_class1)
        if predicted != class_list[doc_index]:
            error_count += 1
    return p0_vec, p1_vec, float(error_count) / len(test_set)


def spamTest():
    """Run a hold-out test of the spam classifier and print the error rate.

    Reads files 1..25 from each of the two e-mail locations (50 documents),
    holds out 10 random documents for testing and trains on the other 40.
    """
    doc_list = []
    class_list = []
    full_text = []
    # NOTE(review): the backslash in these paths only acts as a directory
    # separator on Windows - confirm the intended layout before porting.
    for i in range(1, 26):
        word_list = _read_tokens(r'emailspam\%d.txt' % i)
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)  # spam
        print(i)
        word_list = _read_tokens(r'emailham\%d.txt' % i)
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)  # ham
    vocab_list = createVocabList(doc_list)
    _, _, error_rate = _train_and_evaluate(vocab_list, doc_list, class_list, 10)
    print('the error rate is : ', error_rate)


def calcMostFreq(vocab_list, full_text):
    """Return the 30 most frequent vocabulary words.

    :param vocab_list: words to rank
    :param full_text: flat list of every token in the corpus
    :return: up to 30 ``(word, count)`` pairs, highest count first; words with
        equal counts keep their ``vocab_list`` order
    """
    counts = Counter(full_text)  # one pass instead of list.count() per word
    freq_dict = {token: counts[token] for token in vocab_list}
    sorted_freq = sorted(freq_dict.items(), key=operator.itemgetter(1),
                         reverse=True)
    return sorted_freq[:30]


def localWords(feed1, feed0, num_test=20):
    """Train and test a classifier that tells two RSS feeds apart.

    Entries are paired up to the length of the shorter feed; ``feed1`` entries
    are labelled 1 and ``feed0`` entries 0.  The 30 most frequent words are
    removed from the vocabulary before training.  The hold-out error rate is
    printed.

    :param feed1: object whose ``entries`` items provide a ``'summary'`` text
    :param feed0: same, for the other class
    :param num_test: number of documents held out for testing
    :return: ``(vocab_list, p0v, p1v)`` - the reduced vocabulary and the
        per-word log-probabilities for class 0 and class 1
    :raises ValueError: if ``num_test`` is not smaller than the number of
        paired documents (or is not positive)
    """
    doc_list = []
    class_list = []
    full_text = []
    # zip() stops at the shorter feed, so both classes get the same number of
    # documents without an explicit min().
    for entry1, entry0 in zip(feed1.entries, feed0.entries):
        for entry, label in ((entry1, 1), (entry0, 0)):
            word_list = textParse(entry['summary'])
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(label)

    vocab_list = createVocabList(doc_list)
    frequent = {word for word, _ in calcMostFreq(vocab_list, full_text)}
    vocab_list = [word for word in vocab_list if word not in frequent]

    p0v, p1v, error_rate = _train_and_evaluate(vocab_list, doc_list,
                                               class_list, num_test)
    print('the error rate is : ', error_rate)
    return vocab_list, p0v, p1v


def getTopwords(nf, sf, threshold=-6.0):
    """Print the most indicative words of each feed.

    Trains via localWords(nf, sf) and prints, for each class, the words whose
    log-probability exceeds ``threshold``, most probable first.

    :param nf: feed treated as class 1 (printed under the ``ny`` banner)
    :param sf: feed treated as class 0 (printed under the ``sf`` banner)
    :param threshold: minimum log-probability for a word to be listed
    """
    vocab_list, p0v, p1v = localWords(nf, sf)
    top_ny = []
    top_sf = []
    for word, log_p0, log_p1 in zip(vocab_list, p0v, p1v):
        if log_p0 > threshold:
            top_sf.append((word, log_p0))
        if log_p1 > threshold:
            top_ny.append((word, log_p1))

    print('sf**sf**sf**sf**sf**')
    for item in sorted(top_sf, key=operator.itemgetter(1), reverse=True):
        print(item[0])
    print('ny**ny**ny**ny**ny**')
    for item in sorted(top_ny, key=operator.itemgetter(1), reverse=True):
        print(item[0])