This time we use the requests library and BeautifulSoup to collect user information from the hot posts on Qiushibaike (糗事百科). Since the site's anti-scraping measures are not very strict and no login is required to fetch the data, the task is fairly simple.
The idea: first request the hot posts on the homepage to obtain each user's profile link, then request the profile page and parse it with BeautifulSoup to extract the basic user information. A minimal illustration of the link-extraction step follows; the full script comes after it.
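To show what the first step does, here is a small sketch of picking out user profile links with BeautifulSoup. The sample HTML below is only an approximation of the structure the script relies on (a div with class "author clearfix" wrapping an a tag); it is not the site's actual markup:

from bs4 import BeautifulSoup

# Rough, made-up sample of one post's author block (for illustration only).
sample_html = '''
<div class="author clearfix">
    <a href="/users/12345678/">SomeUser</a>
</div>
<div class="author clearfix">
    <span>Anonymous post, no profile link</span>
</div>
'''

soup = BeautifulSoup(sample_html, 'lxml')
for node in soup.find_all('div', class_='author clearfix'):
    item = node.find('a')
    if item is not None:  # anonymous posts have no <a> tag and are skipped
        print('https://www.qiushibaike.com' + item.get('href'))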
Code:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
import urllib3
import pymongo
from bs4 import BeautifulSoup

urllib3.disable_warnings()

class Qsbk():
    def __init__(self, url):
        self.url = url  # link to one page of hot posts on the Qiushibaike homepage
        self.base_url = 'https://www.qiushibaike.com'  # used to build absolute user links
        self.user_info = {}  # holds one user's information
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }  # request headers
        self.proxies = {
            'http': '192.168.105.71:80',
            'https': '192.168.105.71:80'
        }  # proxy settings
        self.file = open('qsbk.json', 'a', encoding='utf-8')  # append user records to a JSON text file

    def get_data(self):
        try:
            # request the hot-posts page
            response = requests.get(url=self.url, headers=self.headers, proxies=self.proxies, verify=False).text
            soup = BeautifulSoup(response, 'lxml')
            node_list = soup.find_all('div', class_='author clearfix')
            for node in node_list:
                item = node.find('a')
                if item is not None:  # anonymous posts have no user link and must be skipped
                    link = item.get('href')  # relative link to the user's profile page
                    user_link = self.base_url + link
                    # print(user_link)
                    self.parse_data(user_link)  # request the profile page and extract the user info
        except Exception as e:
            print(e)

    def parse_data(self, user_link):
        try:
            result = requests.get(user_link, headers=self.headers, proxies=self.proxies, verify=False).content.decode()
            soup = BeautifulSoup(result, 'lxml')
            status = soup.find('h3').get_text()
            if '当前用户已关闭糗百个人动态' in status:  # skip users who have closed their profile
                pass
            else:
                username = soup.find('h2').get_text()
                self.user_info['用户名'] = username
                node_list = soup.find_all('div', class_="user-statis user-block")
                # each field comes back as text like "粉丝数:111", so split on ":" once
                # and keep the number; the remaining fields are handled the same way
                fans = node_list[0].select('ul > li')[0].get_text().split(':', 1)[-1]
                concerns = node_list[0].select('ul > li')[1].get_text().split(':', 1)[-1]
                comments = node_list[0].select('ul > li')[3].get_text().split(':', 1)[-1]
                coins = node_list[0].select('ul > li')[4].get_text().split(':', 1)[-1]
                marriage = node_list[1].select('ul > li')[0].get_text().split(':', 1)[-1]
                job = node_list[1].select('ul > li')[2].get_text().split(':', 1)[-1]
                web_age = node_list[1].select('ul > li')[4].get_text().split(':', 1)[-1]
                self.user_info['粉丝数'] = fans
                self.user_info['关注数'] = concerns
                self.user_info['评论'] = comments
                self.user_info['笑脸'] = coins
                self.user_info['婚姻'] = marriage
                self.user_info['职业'] = job
                self.user_info['糗龄'] = web_age
                # print(self.user_info)
                self.save_json()  # save the record to the JSON file
                self.save_mongo()  # save the record to MongoDB
        except Exception as e:
            print(e)

    def save_json(self):
        content = json.dumps(self.user_info, ensure_ascii=False) + '\n'  # one record per line
        self.file.write(content)

    def save_mongo(self):
        try:
            self.client = pymongo.MongoClient(host='localhost', port=27017)  # MongoDB default port is 27017
            self.col = self.client['qsbk']['qsbk']
            self.col.insert_one(dict(self.user_info))  # insert a copy so the generated _id does not pollute user_info
        except Exception as e:
            print(e)

    def close(self):
        self.file.close()  # close the JSON file after the page has been processed

if __name__ == '__main__':
    for i in range(1, 3):
        url = 'https://www.qiushibaike.com/8hr/page/{}/'.format(i)
        # print(url)
        qsbk = Qsbk(url)
        qsbk.get_data()
        qsbk.close()
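After a run, the stored records can be checked straight from MongoDB. This is a minimal sketch, assuming a local MongoDB instance on the default port 27017 and the same qsbk database and collection names used in save_mongo (count_documents requires pymongo 3.7 or newer):

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
col = client['qsbk']['qsbk']
print(col.count_documents({}))   # how many user records were stored
for doc in col.find().limit(5):  # show the first few records
    print(doc)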