  • Naive Bayes: predicting featured posts in Tieba

    First, I crawled the post data from our university's Tieba forum; each post carries a label saying whether it is a featured (精品) post or not.

    Using only the post titles, I built a classifier that separates featured posts from ordinary ones. The error rate is around 10%.

    Word segmentation is done with jieba. I did not filter out stop words or punctuation, because punctuation itself can be an important feature for telling featured posts from ordinary ones. More features could also be added, for example the number of audio clips, videos, and images on the first page, and some of the important features could even be expanded into several dimensions; classification would probably improve. Still, using titles alone already reaches about 90% accuracy, which shows Naive Bayes works quite well.
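
    As a small illustration (the sample title below is made up, not from the crawled data), tokenizing a title with jieba while keeping punctuation looks like this:

      # Sketch: tokenize one title with jieba; punctuation is deliberately not filtered out.
      # The sample title is hypothetical.
      import jieba

      title = "【精品】校园风景照片+视频分享!"
      tokens = list(jieba.cut(title, cut_all=False))  # precise mode
      print(tokens)  # punctuation such as "【", "】", "!" stays in the token list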

    The training algorithm is Naive Bayes; for testing, I again used the hold-out method.
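
    For reference, the decision rule implemented in classifyNB further down is the usual log-posterior comparison: a title with word counts n_i is scored for each class c (1 = featured, 0 = ordinary) as

      score(c) = \log P(c) + \sum_i n_i \log P(w_i \mid c)

    and the class with the larger score is chosen; the word probabilities P(w_i | c) are estimated in trainNB0 with Laplace smoothing.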

    First, the crawler code:

      __author__ = 'Oscar_Yang'
      # -*- coding: utf-8 -*-
      """
      Goals of this script:
      1. Crawl the list pages and post titles
      2. Extract keyword counts
      """
      # Import the required modules.
      import re, requests, json, random, time, jieba, pymongo
      import urllib.request
      from bs4 import BeautifulSoup

      """Connect to MongoDB"""
      client = pymongo.MongoClient("localhost", 27017)
      db_tieba = client["db_tieba"]
      # sheet_tieba_ysu_good = db_tieba_ysu_good["sheet_tieba_ysu_good"]
      # sheet_tieba_dq = db_tieba_dq["sheet_tieba_dq_test"]
      # sheet_tieba_dbdx = db_tieba["sheet_tieba_dbdx"]
      sheet_tieba = db_tieba["sheet_tieba_ysu_914"]

      """Proxy setup (left commented out)"""
      # resp = requests.get("http://tor1024.com/static/proxy_pool.txt")
      # ips_txt = resp.text.strip().split("\n")
      # # print(ips_txt)
      # ips = []
      # for i in ips_txt:
      #     try:
      #         k = json.loads(i)
      #         ips.append(k)
      #     except Exception as e:
      #         print(e)

      header = {
          "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
          "Cookie": 'XXX'
      }


      # Build the list-page (base) URLs.
      def get_base_urls():
          urls = ["http://tieba.baidu.com/f?kw=%E7%87%95%E5%B1%B1%E5%A4%A7%E5%AD%A6&ie=utf-8&pn={}".format(str(i)) for i in range(0, 383400, 50)]
          return urls


      def get_base_good_urls():
          urls = ["http://tieba.baidu.com/f?kw=%E9%87%8C%E4%BB%81%E5%AD%A6%E9%99%A2&ie=utf-8&pn={}".format(str(i)) for i in range(0, 10000, 50)]
          return urls


      # Get the time of the last reply on a post's final page.
      def get_last_reply_time(detail_url):
          web_data = requests.get(detail_url)
          web_data.encoding = "utf-8"
          soup = BeautifulSoup(web_data.text, "lxml")
          detail_page_num = soup.find_all(class_="red")[1].text
          detail_page_last_url = detail_url + "?pn={}".format(detail_page_num)
          web_data1 = requests.get(detail_page_last_url)
          web_data1.encoding = "utf-8"
          soup1 = BeautifulSoup(web_data1.text, "lxml")
          last_post_time_data_0 = soup1.find_all(class_="l_post j_l_post l_post_bright  ")
          last_post_time_data = last_post_time_data_0[len(last_post_time_data_0) - 1].get("data-field")
          last_reply_post_time = last_post_time_data[int(last_post_time_data.find("date")) + 7:int(last_post_time_data.find("vote_crypt")) - 3]
          return last_reply_post_time


      # Scrape one post's detail page and store the record in MongoDB.
      def get_detail_data(detail_url):
          res = requests.get(detail_url)
          res.encoding = "utf-8"
          soup = BeautifulSoup(res.text, "lxml")
          post_time_data = soup.find_all(class_='l_post j_l_post l_post_bright noborder ')[0].get("data-field")
          author_dengji = soup.select("div.d_author > ul > li.l_badge > div > a > div.d_badge_lv")[0].text
          author_lable = soup.select("div.d_badge_title")[0].text
          reply_nus = soup.find_all(class_="red")[0].text
          title = soup.select("div.core_title.core_title_theme_bright > h1")[0].text
          author_name = soup.select("div.d_author > ul > li.d_name > a")[0].text
          authon_contents = soup.find_all(class_="d_post_content j_d_post_content  clearfix")
          jingyan_scores = post_time_data[int(post_time_data.find("cur_score")) + 11:int(post_time_data.find("bawu")) - 2]
          post_time = post_time_data[int(post_time_data.find("date")) + 7:int(post_time_data.find("vote_crypt")) - 3]
          phone_is = post_time_data[int(post_time_data.find("open_type")) + 12:int(post_time_data.find("date")) - 2]
          sex_is = post_time_data[int(post_time_data.find("user_sex")) + 10:int(post_time_data.find("user_sex")) + 11]
          pic_num_firstpage = soup.find_all(class_="BDE_Image")
          voice_num = soup.find_all(class_="voice_player_inner")
          video_num = soup.find_all(class_="BDE_Flash")
          detail_page_num = soup.find_all(class_="red")[1].text
          contents = []
          for i in range(len(authon_contents)):
              contents.append(authon_contents[i].text.strip())
          data = {
              "nick_name": author_name,
              "post_time": post_time,
              "vehicle": phone_is,
              "level": author_dengji,
              "honored_name": author_lable,
              "reply_num": reply_nus,
              "title": title,
              "sex": sex_is,
              "jingyan_scores": jingyan_scores,
              "pic_num": len(pic_num_firstpage),
              "video_num": len(video_num),
              "voice_num": len(voice_num),
              "detail_page_num": detail_page_num,
              "contents": contents,
              "tiezi_url": detail_url
          }
          # print(data)
          # sheet_tieba_ysu_good.insert_one(data)
          # sheet_tieba_dbdx.insert_one(data)
          # sheet_tieba_dq.insert_one(data)
          sheet_tieba.insert_one(data)
          # data1file(data)


      # Append a record to a local text file.
      def data1file(s):
          path = r"C:\Users\Oscar\Desktop\数据.txt"
          file = open(path, "a", encoding="utf-8")
          file.write("\n")
          file.write(str(s))
          file.close()


      # Collect the detail-page links from one list page.
      def get_detail_urls(url):
          detail_links = []
          res = requests.get(url)
          res.encoding = "utf-8"
          soup = BeautifulSoup(res.text, 'lxml')
          link_tags = soup.select("#thread_list div.threadlist_title.pull_left.j_th_tit > a")
          for link_tag in link_tags:
              detail_links.append("http://tieba.baidu.com/" + link_tag.get("href"))
          return detail_links
          # print(detail_links)


      # Scrape summary data (title, reply count, author, link) from a list page.
      def get_data(url):
          web_data = requests.get(url)
          web_data.encoding = "utf-8"
          soup = BeautifulSoup(web_data.text, "lxml")
          titles = soup.select('#thread_list div.threadlist_title.pull_left.j_th_tit > a')
          reply_nums = soup.select("div > div.col2_left.j_threadlist_li_left > span")
          zhurens = soup.select("div.threadlist_author.pull_right > span.tb_icon_author > span.frs-author-name-wrap > a")
          link_tags = soup.select('#thread_list div.threadlist_title.pull_left.j_th_tit > a')
          # time.sleep(random.randint(1, 2))
          for title, reply_num, link_tag, zhuren in zip(titles, reply_nums, link_tags, zhurens):
              data = {
                  "标题": title.get_text(),          # title
                  "回复数": reply_num.text,          # reply count
                  "主人": zhuren.get_text(),         # author
                  "原文链接": "http://tieba.baidu.com/" + link_tag.get("href")  # link
              }
              print(data)


      # Count word frequencies in a local text file with jieba.
      def get_counts():
          f = open(r'C:\Users\Oscar\Desktop\1.txt', 'r', encoding='utf-8')
          sentence = f.read()
          # words = jieba.cut(sentence, cut_all=True)
          # words = jieba.cut(sentence, cut_all=False)
          words = jieba.cut_for_search(sentence)
          tf = {}
          for word in words:
              print(word)
              word = ''.join(word.split())
              if word in tf:
                  tf[word] += 1
              else:
                  tf[word] = 1
          return tf


      def top_counts_sorted(tf, n=50):
          value_key_pairs = sorted([(count, tz) for tz, count in tf.items()], reverse=True)
          print(value_key_pairs[:n])


      # top_counts_sorted(get_counts())
      if __name__ == '__main__':
          count = 0
          for base_url in get_base_urls():
              count = count + 1
              detail_urls = get_detail_urls(base_url)
              for detail_url in detail_urls:
                  try:
                      get_detail_data(detail_url)
                  except Exception as e:
                      print(e)
                      # pass
              # time.sleep(random.randint(1, 3))
              print("完成了第{}页的抓取".format(count))  # "finished crawling page N"

    The crawled data was stored in MongoDB, and I eventually exported it to txt. The titles of featured posts and of ordinary posts went into two separate txt files (0.txt and 1.txt).
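
    The export step itself is not shown here; a minimal sketch of dumping the stored titles to a text file, one title per line, could look like the following (the database and collection names are taken from the crawler above, while the output file name and the query are assumptions):

      # Sketch: export post titles from MongoDB to a txt file, one title per line.
      # "db_tieba" / "sheet_tieba_ysu_914" come from the crawler above; the output
      # file name is arbitrary, and featured vs. ordinary posts would be exported
      # from whichever collections they were stored in.
      import pymongo

      client = pymongo.MongoClient("localhost", 27017)
      collection = client["db_tieba"]["sheet_tieba_ysu_914"]

      with open("0.txt", "w", encoding="utf-8") as f:
          for doc in collection.find({}, {"title": 1}):
              f.write(doc.get("title", "").strip() + "\n")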

    Next comes the Naive Bayes part. It is not wrapped in a class; a handful of functions is enough: a file-processing function, a vocabulary-building function, a training function, a classification function, and a main function.

      import random
      import jieba
      from numpy import array, log, ones


      def text_split(textlist):
          word_cut = jieba.cut(textlist, cut_all=False)  # precise mode; returns an iterable generator
          word_list = list(word_cut)  # convert the generator to a list; each token is a unicode string
          return word_list


      # Build the vocabulary list
      def createVocabList(dataSet):
          vocabSet = set([])  # start with an empty set
          for document in dataSet:
              vocabSet = vocabSet | set(document)  # union of the two sets
          return list(vocabSet)


      def trainNB0(trainMatrix, trainCategory):
          numTrainDocs = len(trainMatrix)  # number of rows in the training matrix
          numWords = len(trainMatrix[0])  # vocabulary size, i.e. number of columns
          pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior probability of class 1
          p0Num = ones(numWords)
          p1Num = ones(numWords)  # initialize counts to ones (Laplace smoothing)
          p0Denom = 2.0
          p1Denom = 2.0  # initialize denominators to 2.0
          for i in range(numTrainDocs):
              if trainCategory[i] == 1:
                  p1Num += trainMatrix[i]
                  p1Denom += sum(trainMatrix[i])
              else:
                  p0Num += trainMatrix[i]
                  p0Denom += sum(trainMatrix[i])
          p1Vect = log(p1Num / p1Denom)  # take logs to avoid underflow
          p0Vect = log(p0Num / p0Denom)
          return p0Vect, p1Vect, pAbusive
          # Returns the prior pAbusive and, for each class, the (log) probability of every word
          # (the probabilities are estimated as smoothed frequencies)


      def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
          # p1Vec already holds element-wise logs, so log(a*b) = log(a) + log(b) lets us sum instead of multiply.
          # pClass1 is the prior; p1 and p0 are the (unnormalized) log posteriors, and the larger one decides the class.
          p1 = sum(vec2Classify * p1Vec) + log(pClass1)
          p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
          if p1 > p0:
              return 1
          else:
              return 0


      # Bag-of-words model: a word is counted as many times as it appears
      def bagOfWords2VecMN(vocabList, inputSet):
          returnVec = [0] * len(vocabList)  # initialize the count vector
          for word in inputSet:
              if word in vocabList:
                  returnVec[vocabList.index(word)] += 1
          return returnVec


      def spamTest():
          """
          Turn the texts into matrices: build the document matrix and the class-label list.
          Note: one text had an encoding/decoding problem, so I simply filtered it out.
          """
          docList = []; classList = []; fullText = []
          path = r"C:\Users\Oscar\Desktop\data1.txt"
          with open(path, encoding="utf8") as f:
              i = 0
              while i < 1046:
                  a = f.readline()
                  word_list = text_split(a)
                  docList.append(word_list)
                  classList.append(1)
                  fullText.extend(word_list)
                  i = i + 1
          path = r"C:\Users\Oscar\Desktop\data.txt"
          with open(path, encoding="utf8") as f:
              i = 0
              while i < 1546:
                  a = f.readline()
                  word_list = text_split(a)
                  docList.append(word_list)
                  classList.append(0)
                  fullText.extend(word_list)
                  i = i + 1
          vocabList = createVocabList(docList)  # build the vocabulary

          trainingSet = list(range(len(docList)))
          testSet = []  # randomly split into test and training sets (hold-out validation)
          for i in range(10):  # hold out 10 documents for testing; the rest form the training set
              randIndex = int(random.uniform(0, len(trainingSet)))
              testSet.append(trainingSet[randIndex])
              trainingSet.pop(randIndex)

          trainMat = []
          trainClasses = []
          for docIndex in trainingSet:
              trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
              trainClasses.append(classList[docIndex])
          p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

          errorCount = 0
          for docIndex in testSet:  # classify the held-out items
              wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
              if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
                  errorCount += 1
                  print("分类错误的帖子是", docList[docIndex], "正确的类应该是", classList[docIndex])  # misclassified post and its true class
          print('错误率为: ', float(errorCount) / len(testSet))  # error rate
          # return vocabList, fullText


      if __name__ == '__main__':
          spamTest()
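
    Once trained, the same functions can score a single new title. A small hypothetical helper (not part of the original code, and assuming spamTest is modified to return vocabList, p0V, p1V, and pSpam instead of only printing the error rate) might look like this:

      # Hypothetical helper: classify one new title with an already trained model.
      # vocabList, p0V, p1V, pSpam are assumed to come from a spamTest() variant
      # that returns them.
      def classify_title(title, vocabList, p0V, p1V, pSpam):
          word_vector = bagOfWords2VecMN(vocabList, text_split(title))
          return classifyNB(array(word_vector), p0V, p1V, pSpam)  # 1 = featured, 0 = ordinary

      # Example call (the title string is made up):
      # print(classify_title("【精华】新生报到攻略", vocabList, p0V, p1V, pSpam))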

  • Original article: https://www.cnblogs.com/coskaka/p/6028764.html