import re
import time

import pymysql
import requests


class bdspider:
    """Scrape thread titles from a Baidu Tieba forum and store them in MySQL."""

    def __init__(self, tiebaName, pages_Num):
        """
        :param tiebaName: forum (tieba) name, inserted into the ``kw=`` query parameter
        :param pages_Num: number of listing pages to fetch (Baidu shows 50 threads per page)
        """
        self.tiebaName = tiebaName
        self.pages_Num = pages_Num
        # Baidu paginates with pn=0, 50, 100, ... — filled in by .format() later.
        self.base_url = 'https://tieba.baidu.com/f?kw=' + tiebaName + '&ie=utf-8&pn={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        }

    # Build the link for every listing page.
    def getlink(self):
        """Return the list of listing-page URLs to crawl."""
        return [self.base_url.format(page * 50) for page in range(self.pages_Num)]

    # Fetch one listing page.
    def get_pagesinfo(self, url):
        """Download one listing page and return its parsed thread titles."""
        response = requests.get(url=url, headers=self.headers)
        # Fail loudly on HTTP errors instead of silently parsing an error page.
        response.raise_for_status()
        return self.parse_pageInfo(response.content.decode('utf-8'))

    # Parse the page HTML.
    def parse_pageInfo(self, html):
        """Extract all thread titles from the listing-page HTML.

        :param html: decoded page source
        :return: list of title strings (may be empty)
        """
        pattern = re.compile(
            r'<li class=" j_thread_list clearfix".*?<a rel="noreferrer".*?title="(.*?)".*?</a>',
            re.S,
        )
        return re.findall(pattern, html)

    # Main crawl loop.
    def run(self):
        """Crawl every listing page, printing and persisting the titles found."""
        for url in self.getlink():
            time.sleep(1)  # be polite: throttle requests to one page per second
            page_Info = self.get_pagesinfo(url)
            print(page_Info)
            self.save_to_mysql(page_Info)

    # Persist the scraped data.
    def save_to_mysql(self, page_Info):
        """Insert the scraped titles into the ``title`` table of the ``baidu`` database.

        :param page_Info: list of title strings to insert
        """
        conn = pymysql.connect(host='localhost', user='root', passwd='root123',
                               db='baidu', port=3306)
        try:
            with conn.cursor() as cursor:
                # Parameterized query: the previous str.format() interpolation
                # was vulnerable to SQL injection via quotes in scraped titles.
                cursor.executemany(
                    "insert into title(title) values(%s)",
                    [(title,) for title in page_Info],
                )
            # Single commit for the whole batch instead of one per row.
            conn.commit()
        finally:
            # Always release the connection, even if an insert fails.
            conn.close()


if __name__ == "__main__":
    spider = bdspider("lol", 5)
    spider.run()