  • Naive Bayes
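
    The listing below implements a Naive Bayes text classifier three ways:
    a toy model that flags abusive posts, a spam filter trained on 25 spam
    and 25 ham emails, and a model that contrasts the wording of two RSS
    feeds. In every case a document is assigned the class c that maximizes
    p(c|w) ∝ p(c) · ∏ p(w_i|c), evaluated in log space to avoid underflow.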

    from numpy import *
    import re
    import operator
    import feedparser
    
    
    def loadDataSet():
        # Toy training data: six tokenized posts and their labels.
        posting_list = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
        class_vec = [0, 1, 0, 1, 0, 1]  # labels: 1 = abusive, 0 = normal
        return posting_list, class_vec
    
    
    def createVocabList(data_set):  # collect the unique words across all documents
        vocab_set = set([])  # empty set
        for document in data_set:
            vocab_set = vocab_set | set(document)  # set union
        return list(vocab_set)
    
    
    def setOfWords2Vec(vocab_list, input_set):  # set-of-words model: flag each word as present (1) or absent (0)
        return_vec = [0] * len(vocab_list)  # vector of zeros, one slot per vocabulary word
        for word in input_set:
            if word in vocab_list:
                return_vec[vocab_list.index(word)] = 1  # vocab_list has no duplicates
            else:
                print('the word : %s is not in my vocabulary!' % word)
        return return_vec
    
    
    def trainNB0(train_matrix, train_category):
        '''
        Naive Bayes training function.
        :param train_matrix: matrix of document word vectors
        :param train_category: vector of class labels, one per document
        :return: log conditional probability vectors for class 0 and class 1,
                 plus the prior probability of class 1
        '''
        num_train_docs = len(train_matrix)  # number of rows (documents)
        num_words = len(train_matrix[0])  # number of columns (vocabulary size)
        p_abusive = sum(train_category) / float(num_train_docs)  # prior probability of an abusive document
        p0_num = ones(num_words)  # initialize counts to 1 (Laplace smoothing,
        p1_num = ones(num_words)  # so no word ever gets a zero probability)
        p0_denom = 2.0
        p1_denom = 2.0
        for i in range(num_train_docs):
            if train_category[i] == 1:
                p1_num += train_matrix[i]  # element-wise vector addition
                p1_denom += sum(train_matrix[i])  # total words seen in class-1 documents
            else:
                p0_num += train_matrix[i]
                p0_denom += sum(train_matrix[i])
        p1_vect = log(p1_num / p1_denom)  # logs guard against underflow when
        p0_vect = log(p0_num / p0_denom)  # many small probabilities multiply
        return p0_vect, p1_vect, p_abusive
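
    # Worked example with the toy data: 3 of the 6 posts are abusive, so
    # p_abusive = 0.5. The three abusive posts contain 19 words in total and
    # 'stupid' appears 3 times, so its smoothed estimate is
    # (1 + 3) / (2 + 19) = 4/21 before the log is taken.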
    
    
    def classifyNB(vec2_classify, p0_vec, p1_vec, p_class1):
        # Compare log posteriors: log p(c) plus the sum of log p(w_i|c) over
        # the document's words (the naive independence assumption).
        p1 = sum(vec2_classify * p1_vec) + log(p_class1)
        p0 = sum(vec2_classify * p0_vec) + log(1 - p_class1)
        if p1 > p0:
            return 1
        else:
            return 0
    
    
    def testingNB():
        list_oposts, list_classes = loadDataSet()  # load the toy posts and labels
        my_vocab_list = createVocabList(list_oposts)  # deduplicated vocabulary
        train_mat = []
        for postin_doc in list_oposts:
            train_mat.append(setOfWords2Vec(my_vocab_list, postin_doc))  # one 0/1 vector per post
        p0V, p1V, pAb = trainNB0(array(train_mat), array(list_classes))
        test_entry = ['love', 'my', 'dalmation']
        this_doc = array(setOfWords2Vec(my_vocab_list, test_entry))
        print(test_entry, 'classified as:', classifyNB(this_doc, p0V, p1V, pAb))
        test_entry = ['stupid', 'garbage']
        this_doc = array(setOfWords2Vec(my_vocab_list, test_entry))
        print(test_entry, 'classified as:', classifyNB(this_doc, p0V, p1V, pAb))
    
    
    def bagOfWords2Vec(vocab_list, input_set):  # bag-of-words model: count occurrences of each word
        return_vec = [0] * len(vocab_list)  # vector of zeros, one slot per vocabulary word
        for word in input_set:
            if word in vocab_list:
                return_vec[vocab_list.index(word)] += 1  # vocab_list has no duplicates
        return return_vec
    
    
    def textParse(big_string):  # split raw text into lowercase tokens
        list_of_tokens = re.split(r'\W+', big_string)  # split on runs of non-word characters
        return [tok.lower() for tok in list_of_tokens if len(tok) > 1]
    
    
    def spamTest():
        doc_list = []
        class_list = []
        full_text = []
        for i in range(1, 26):  # 25 files per folder, hence range(1, 26)
            word_list = textParse(open('email/spam/%d.txt' % i).read())  # parse one file into a token list
            doc_list.append(word_list)  # list of documents
            full_text.extend(word_list)  # flat list of every token
            class_list.append(1)  # spam
            word_list = textParse(open('email/ham/%d.txt' % i).read())
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(0)
        vocab_list = createVocabList(doc_list)  # deduplicated vocabulary
        training_set = list(range(50))  # 50 emails in total
        test_set = []
        for i in range(10):  # hold out 10 random emails for testing
            rand_index = int(random.uniform(0, len(training_set)))  # numpy's random, via the star import
            test_set.append(training_set[rand_index])
            del training_set[rand_index]  # remove the chosen index so no email is picked twice
        train_mat = []
        train_classes = []
        for doc_index in training_set:  # train on the remaining 40
            train_mat.append(bagOfWords2Vec(vocab_list, doc_list[doc_index]))  # word-count vector per training email
            train_classes.append(class_list[doc_index])
        p0V, p1V, p_spam = trainNB0(array(train_mat), array(train_classes))
        error_count = 0
        for doc_index in test_set:
            word_vector = bagOfWords2Vec(vocab_list, doc_list[doc_index])
            if classifyNB(array(word_vector), p0V, p1V, p_spam) != class_list[doc_index]:
                error_count += 1
        print('the error rate is : ', float(error_count) / len(test_set))
    
    
    def calcMostFreq(vocab_list, full_text):  # return the 30 most frequent words
        freq_dict = {}
        for token in vocab_list:
            freq_dict[token] = full_text.count(token)
        sorted_freq = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_freq[:30]
    
    def localWords(feed1, feed0):
        # Same pipeline as spamTest, but the two classes are two RSS feeds.
        doc_list = []
        class_list = []
        full_text = []
        min_len = min(len(feed1.entries), len(feed0.entries))
        for i in range(min_len):
            word_list = textParse(feed1.entries[i]['summary'])
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(1)  # class 1: feed1
            word_list = textParse(feed0.entries[i]['summary'])
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(0)  # class 0: feed0
        vocab_list = createVocabList(doc_list)  # deduplicated vocabulary
        top30_words = calcMostFreq(vocab_list, full_text)
        for pair_w in top30_words:  # drop the 30 most frequent words; they behave like stop words
            if pair_w[0] in vocab_list:
                vocab_list.remove(pair_w[0])
        training_set = list(range(2 * min_len))
        test_set = []
        for i in range(20):  # hold out 20 random entries for testing
            rand_index = int(random.uniform(0, len(training_set)))
            test_set.append(training_set[rand_index])
            del training_set[rand_index]
        train_mat = []
        train_classes = []
        for doc_index in training_set:
            train_mat.append(bagOfWords2Vec(vocab_list, doc_list[doc_index]))
            train_classes.append(class_list[doc_index])
        p0v, p1v, p_spam = trainNB0(array(train_mat), array(train_classes))
        error_count = 0
        for doc_index in test_set:
            word_vector = bagOfWords2Vec(vocab_list, doc_list[doc_index])
            if classifyNB(array(word_vector), p0v, p1v, p_spam) != class_list[doc_index]:
                error_count += 1
        print('the error rate is : ', float(error_count) / len(test_set))
        return vocab_list, p0v, p1v
    
    
    def getTopwords(nf, sf):
        # Print the words whose log conditional probability exceeds -6.0 for
        # each feed, most probable first.
        vocab_list, p0v, p1v = localWords(nf, sf)
        top_ny = []
        top_sf = []
        for i in range(len(p0v)):
            if p0v[i] > -6.0:
                top_sf.append((vocab_list[i], p0v[i]))
            if p1v[i] > -6.0:
                top_ny.append((vocab_list[i], p1v[i]))
        sorted_sf = sorted(top_sf, key=lambda pair: pair[1], reverse=True)
        print('sf**sf**sf**sf**sf**')
        for item in sorted_sf:
            print(item[0])
        sorted_ny = sorted(top_ny, key=lambda pair: pair[1], reverse=True)
        print('ny**ny**ny**ny**ny**')
        for item in sorted_ny:
            print(item[0])
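

    # Usage sketch (not in the original post): one way to exercise the three
    # entry points above. The two RSS URLs are placeholders, since the feeds
    # used in the original exercise are no longer served, and spamTest()
    # expects email/spam and email/ham folders holding 25 text files each.
    if __name__ == '__main__':
        testingNB()  # toy abusive-comment demo
        spamTest()   # spam filter with a random 10-email hold-out
        feed1 = feedparser.parse('https://example.com/feed1.rss')  # placeholder URL
        feed0 = feedparser.parse('https://example.com/feed0.rss')  # placeholder URL
        getTopwords(feed1, feed0)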
    

      

  • Original post: https://www.cnblogs.com/luck-L/p/9168548.html