# -*- coding: utf-8 -*-
"""Scrape Baidu Tieba threads into MongoDB and count keywords.

Goals of this script:
1. Crawl the forum list pages and the threads they link to.
2. Compute keyword frequencies from collected text.
"""
__author__ = 'Oscar_Yang'

# Import the required modules.
import json
import random
import re
import time
import urllib.request
from collections import Counter

import jieba
import pymongo
import requests
from bs4 import BeautifulSoup

# Connect to MongoDB. Change the collection name below to store another
# forum's threads in a different collection.
client = pymongo.MongoClient("localhost", 27017)
db_tieba = client["db_tieba"]
sheet_tieba = db_tieba["sheet_tieba_ysu_914"]

# Proxy setup (disabled in the original script, kept for reference).
# resp = requests.get("http://tor1024.com/static/proxy_pool.txt")
# ips_txt = resp.text.strip().split(" ")
# ips = []
# for i in ips_txt:
#     try:
#         k = json.loads(i)
#         ips.append(k)
#     except Exception as e:
#         print(e)

# NOTE(review): this header dict is defined but never passed to any request in
# this script; confirm whether requests were meant to send it.
header = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
    "Cookie": 'XXX'
}

# Seconds to wait for a response; without a timeout a stalled connection
# would block the crawl indefinitely.
REQUEST_TIMEOUT = 10


def _fetch_soup(url):
    """Download *url* and parse the body as UTF-8 HTML using the lxml parser."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "lxml")


def _slice_between(text, start_key, start_offset, end_key, end_offset):
    """Return text[find(start_key) + start_offset : find(end_key) + end_offset].

    Used to cut single values out of a post's raw ``data-field`` attribute.
    NOTE(review): if a key is missing, ``str.find`` returns -1 and the slice
    silently yields an unrelated substring.
    """
    begin = text.find(start_key) + start_offset
    end = text.find(end_key) + end_offset
    return text[begin:end]


def _paged_urls(template, stop, step=50):
    """Fill *template* with every ``pn`` offset in ``range(0, stop, step)``."""
    return [template.format(str(offset)) for offset in range(0, stop, step)]


# Functions that build the list-page URLs.

def get_base_urls():
    """Return the list-page URLs for the forum keyword 燕山大学 (pn = 0, 50, ... < 383400)."""
    return _paged_urls(
        "http://tieba.baidu.com/f?kw=%E7%87%95%E5%B1%B1%E5%A4%A7%E5%AD%A6&ie=utf-8&pn={}",
        383400,
    )


def get_base_good_urls():
    """Return the list-page URLs for the forum keyword 里仁学院 (pn = 0, 50, ... < 10000)."""
    return _paged_urls(
        "http://tieba.baidu.com/f?kw=%E9%87%8C%E4%BB%81%E5%AD%A6%E9%99%A2&ie=utf-8&pn={}",
        10000,
    )


def get_last_reply_time(detail_url):
    """Return the date string of the last post on the last page of a thread.

    The page count is read from the second ``class="red"`` element of the
    thread page; the last page is then fetched and the date is sliced out of
    the final post's ``data-field`` attribute.

    The original returned the whole raw ``data-field`` string and discarded
    the extracted date; the extracted date is returned now.
    """
    soup = _fetch_soup(detail_url)
    detail_page_num = soup.find_all(class_="red")[1].text
    last_page_url = detail_url + "?pn={}".format(detail_page_num)
    last_page = _fetch_soup(last_page_url)
    posts = last_page.find_all(class_="l_post j_l_post l_post_bright ")
    last_post_field = posts[-1].get("data-field")
    return _slice_between(last_post_field, "date", 7, "vote_crypt", -3)


def get_detail_data(detail_url):
    """Scrape one thread page and insert a summary document into ``sheet_tieba``.

    Raises IndexError when an expected element is missing from the page; the
    main loop catches and prints such errors per thread.
    """
    soup = _fetch_soup(detail_url)

    # The first post carries author and post metadata in its data-field attribute.
    first_post_field = soup.find_all(
        class_='l_post j_l_post l_post_bright noborder ')[0].get("data-field")
    red_tags = soup.find_all(class_="red")  # [0] = reply count, [1] = page count
    content_tags = soup.find_all(class_="d_post_content j_d_post_content clearfix")

    data = {
        "nick_name": soup.select("div.d_author > ul > li.d_name > a")[0].text,
        "post_time": _slice_between(first_post_field, "date", 7, "vote_crypt", -3),
        "vehicle": _slice_between(first_post_field, "open_type", 12, "date", -2),
        "level": soup.select(
            "div.d_author > ul > li.l_badge > div > a > div.d_badge_lv")[0].text,
        "honored_name": soup.select("div.d_badge_title")[0].text,
        "reply_num": red_tags[0].text,
        "title": soup.select("div.core_title.core_title_theme_bright > h1")[0].text,
        # A single character located 10 characters after the "user_sex" key.
        "sex": _slice_between(first_post_field, "user_sex", 10, "user_sex", 11),
        "jingyan_scores": _slice_between(first_post_field, "cur_score", 11, "bawu", -2),
        "pic_num": len(soup.find_all(class_="BDE_Image")),
        "video_num": len(soup.find_all(class_="BDE_Flash")),
        "voice_num": len(soup.find_all(class_="voice_player_inner")),
        "detail_page_num": red_tags[1].text,
        "contents": [tag.text.strip() for tag in content_tags],
        "tiezi_url": detail_url
    }
    sheet_tieba.insert_one(data)


def data1file(s):
    """Append ``str(s)``, preceded by a single space, to the data text file."""
    # NOTE(review): the path looks like it lost its backslashes - confirm.
    path = r"C:UsersOscarDesktop数据.txt"
    with open(path, "a", encoding="utf-8") as file:
        file.write(" ")
        file.write(str(s))


def get_detail_urls(url):
    """Return the thread URLs linked from the list page at *url*."""
    soup = _fetch_soup(url)
    link_tags = soup.select("#thread_list div.threadlist_title.pull_left.j_th_tit > a")
    return ["http://tieba.baidu.com/" + link_tag.get("href") for link_tag in link_tags]


# Fetch list-page data.
def get_data(url):
    """Print title, reply count, author and link for each thread on a list page."""
    soup = _fetch_soup(url)
    # The same anchor elements supply both the title text and the link.
    title_links = soup.select('#thread_list div.threadlist_title.pull_left.j_th_tit > a')
    reply_nums = soup.select("div > div.col2_left.j_threadlist_li_left > span")
    zhurens = soup.select(
        "div.threadlist_author.pull_right > span.tb_icon_author > span.frs-author-name-wrap > a")
    for title_link, reply_num, zhuren in zip(title_links, reply_nums, zhurens):
        data = {
            "标题": title_link.get_text(),
            "回复数": reply_num.text,
            "主人": zhuren.get_text(),
            "原文链接": "http://tieba.baidu.com/" + title_link.get("href")
        }
        print(data)


def get_counts():
    """Segment the text file with jieba (search mode) and count each word.

    Every segmented word is printed, stripped of internal whitespace and
    counted. Returns a dict mapping word -> number of occurrences.
    """
    # NOTE(review): the path looks like it lost its backslashes - confirm.
    with open(r'C:UsersOscarDesktop1.txt', 'r', encoding='utf-8') as f:
        sentence = f.read()
    tf = Counter()
    for word in jieba.cut_for_search(sentence):
        print(word)
        tf[''.join(word.split())] += 1
    return dict(tf)


def top_counts_sorted(tf, n=50):
    """Print the *n* largest ``(count, word)`` pairs of *tf*, in descending order."""
    value_key_pairs = sorted(((count, tz) for tz, count in tf.items()), reverse=True)
    print(value_key_pairs[:n])


def main():
    """Crawl every list page and store each linked thread in MongoDB."""
    for count, base_url in enumerate(get_base_urls(), start=1):
        try:
            detail_urls = get_detail_urls(base_url)
        except requests.RequestException as e:
            # One unreachable list page must not abort the whole crawl.
            print(e)
            continue
        for detail_url in detail_urls:
            # Best effort: report a failing thread and move on to the next one.
            try:
                get_detail_data(detail_url)
            except Exception as e:
                print(e)
        print("完成了第{}页的抓取".format(count))


# Keyword counting can be run separately with: top_counts_sorted(get_counts())
if __name__ == '__main__':
    main()
# 下面是朴素贝叶斯部分。代码没有封装成类,只用了几个函数:文件处理函数、创建词汇表函数、训练函数、分类函数和主函数。
import itertools
import random

import numpy as np


def text_split(textlist):
    """Segment the string *textlist* into a list of words with jieba.

    Uses jieba's accurate mode (``cut_all=False``) and materialises the
    generator it returns into a list.
    """
    # Imported here so the numeric helpers below stay importable without jieba.
    import jieba

    return list(jieba.cut(textlist, cut_all=False))


# Build the vocabulary.
def createVocabList(dataSet):
    """Return a list of the distinct words found across all documents in *dataSet*.

    The order of the returned list is the arbitrary iteration order of a set.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet.update(document)
    return list(vocabSet)


def trainNB0(trainMatrix, trainCategory):
    """Train a two-class naive Bayes model from word-count vectors.

    Args:
        trainMatrix: one row of word counts per training document.
        trainCategory: one label per row; 1 is class 1, anything else class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log probabilities for
        class 0 and class 1, and the prior ``sum(trainCategory) / rows``.

    Raises:
        ValueError: if *trainMatrix* has no rows.
    """
    counts = np.asarray(trainMatrix, dtype=float)
    labels = np.asarray(trainCategory)
    numTrainDocs = len(counts)
    if numTrainDocs == 0:
        raise ValueError("trainMatrix must contain at least one document")

    pAbusive = float(labels.sum()) / float(numTrainDocs)  # class prior
    in_class1 = labels == 1
    class1_rows = counts[in_class1]
    class0_rows = counts[~in_class1]

    # Numerators start at 1 and denominators at 2.0 so that a word never seen
    # in a class does not end up as log(0).
    p1Num = 1.0 + class1_rows.sum(axis=0)
    p0Num = 1.0 + class0_rows.sum(axis=0)
    p1Denom = 2.0 + class1_rows.sum()
    p0Denom = 2.0 + class0_rows.sum()

    # Logs turn the product of probabilities into a sum (avoids underflow).
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Return 1 if the class-1 log score exceeds the class-0 log score, else 0.

    Each score is the sum of word counts times that class's log probabilities
    plus the log prior (``pClass1`` for class 1, ``1 - pClass1`` for class 0).
    """
    vec = np.asarray(vec2Classify)
    p1 = np.sum(vec * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


# Bag-of-words model: a word that appears k times is counted k times.
def bagOfWords2VecMN(vocabList, inputSet):
    """Return a count vector over *vocabList* for the words in *inputSet*.

    Words that are not in the vocabulary are ignored.
    """
    # Map each word to its first position once, rather than scanning the
    # vocabulary list for every input word.
    position = {}
    for idx, word in enumerate(vocabList):
        position.setdefault(word, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = position.get(word)
        if idx is not None:
            returnVec[idx] += 1
    return returnVec


def _load_documents(path, limit):
    """Read at most *limit* lines from *path* and segment each into a word list."""
    with open(path, encoding="utf8") as f:
        return [text_split(line) for line in itertools.islice(f, limit)]


def spamTest(test_size=10):
    """Train and evaluate the classifier with a random hold-out split.

    Loads up to 1046 class-1 lines and up to 1546 class-0 lines, holds out
    *test_size* randomly chosen documents, trains on the rest, prints every
    misclassified document and the error rate, and returns the error rate.

    Raises:
        ValueError: if *test_size* is below 1 or no documents remain for
            training.
    """
    # NOTE(review): both paths look like they lost their backslashes - confirm.
    class1_docs = _load_documents(r"C:UsersOscarDesktopdata1.txt", 1046)
    class0_docs = _load_documents(r"C:UsersOscarDesktopdata .txt", 1546)
    docList = class1_docs + class0_docs
    classList = [1] * len(class1_docs) + [0] * len(class0_docs)

    if test_size < 1:
        raise ValueError("test_size must be at least 1")
    if len(docList) <= test_size:
        raise ValueError("need more documents than test_size to train")

    vocabList = createVocabList(docList)  # build the vocabulary

    # Randomly split all loaded documents into a test set and a training set.
    testSet = random.sample(range(len(docList)), test_size)
    held_out = set(testSet)
    trainingSet = [idx for idx in range(len(docList)) if idx not in held_out]

    trainMat = [bagOfWords2VecMN(vocabList, docList[idx]) for idx in trainingSet]
    trainClasses = [classList[idx] for idx in trainingSet]
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:  # classify the held-out documents
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("分类错误的帖子是", docList[docIndex], "正确的类应该是", classList[docIndex])
    error_rate = float(errorCount) / len(testSet)
    print('错误率为: ', error_rate)
    return error_rate


if __name__ == '__main__':
    spamTest()