爬虫思路如下:
1.向起始url发起get请求得到响应
2.从(1)的响应中使用正则表达式提取每个贴吧标题和对应的URL,发送请求,获取响应
3.在(2)的响应中使用正则表达式提取每个img的URL,发送请求,获取响应。
4.将(3)的响应内容保存为图片
5.从(1)的响应中使用正则表达式提取下一页的URL,如果能提取到,则重复上述步骤;如果提取不到,则爬虫结束
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/8/24 22:28
# @Author  : Lhtester
# @Site    :
# @File    : tieba.py
# @Software: PyCharm
"""Baidu Tieba image spider.

Flow: fetch the listing page for a keyword, extract every post's title and
URL, fetch each post, extract user-uploaded image URLs and save the images,
then follow the "next page" link until page 10 or no link remains.
"""
import requests
import re
import time
import random


class TiebaSpider:
    """Tieba spider: saves post titles and user-uploaded images for a keyword."""

    # Hard page limit so the crawl always terminates.
    MAX_PAGES = 10

    def __init__(self):
        # Keyword is asked interactively; it becomes the `kw` query parameter.
        self.kw = input('关键字》')
        self.base_url = 'https://tieba.baidu.com/f'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/91.0.4472.164 Safari/537.36"
        }
        self.page_num = 1
        self.title = ''   # title of the post currently being processed

    def parse_text(self, url, params=None):
        """GET `url` and return the decoded response body (str)."""
        # Sleep 1-5 s so the request rate does not trip anti-scraping checks.
        time.sleep(random.randint(1, 5))
        req = requests.get(url, headers=self.headers, params=params)
        return req.text

    def parse_byte(self, url, params=None):
        """GET `url` and return the raw response body (bytes), e.g. image data."""
        time.sleep(random.random() * 2)
        req = requests.get(url, headers=self.headers, params=params)
        print('req.text:', req.content)
        return req.content

    def page(self, content):
        """Parse one listing page: crawl every post on it, then follow pagination.

        Recurses into itself with the next page's HTML until MAX_PAGES is
        reached or no "next page" link is found.
        """
        print('第{}页爬取中。。。'.format(self.page_num))
        self.page_num += 1
        # Each match is (relative post URL, title attribute, anchor text) —
        # three capture groups, hence three loop targets below.
        url_title = re.findall(
            r'<a rel="noreferrer" href="(/p/\d+?)" title="(.+?)" target="_blank" class="j_th_tit ">(.+?)</a>',
            content
        )
        for url, title, _anchor_text in url_title:
            self.title = title
            self.detail('https://tieba.baidu.com/' + url)
            # Save the title after its images so a saved title implies a
            # fully processed post.
            self.save_title()
        next_url = re.findall(r'<a href="(.*?)" .*?>下一页></a>', content)
        # page_num was already incremented, so "> MAX_PAGES" stops after
        # exactly MAX_PAGES pages (the original `== 10` stopped after 9).
        if self.page_num > self.MAX_PAGES:
            print('爬虫到第10页,结束')
        elif next_url:
            print('next_url:', next_url)
            next_url = 'https:' + next_url[0]   # links are protocol-relative
            content = self.parse_text(url=next_url)
            self.page(content)
        else:
            print('爬虫结束')

    def detail(self, url):
        """Fetch one post and save every user-uploaded image in it.

        `pic_type` is required in the pattern because posts contain many
        <img> tags; only user uploads (class BDE_Image) carry it, which
        filters out Baidu's own UI images.
        """
        content = self.parse_text(url=url)
        urls = re.findall(
            r'<img class="BDE_Image" pic_type="(\d)" .*?src="(.*?)" .*?>',
            content
        )
        for _pic_type, img_url in urls:
            self.sava_img(url=img_url)

    def save_title(self):
        """Append the current post's title to the per-keyword title file."""
        print('开始帖子的标题')
        with open('../image/tieba_{}.txt'.format(self.kw), 'a', encoding='utf-8') as file:
            file.write(self.title)
            file.write('\n')

    @staticmethod
    def _sanitize_filename(name):
        """Strip characters Windows forbids in file names: \\ / : * ? " < > |"""
        return re.sub(r'[\\/:*?"<>|]', '', name)

    def sava_img(self, url):
        """Download the image at `url` and save it under ../image/.

        The file name is "<post title>_<last URL segment>", sanitized for
        Windows (which rejects \\ / : * ? " < > | in file names).
        """
        content = self.parse_byte(url=url)
        image_name = '{}_{}'.format(self.title, url[url.rfind('/') + 1:])
        image_path = '../image/{}'.format(self._sanitize_filename(image_name))
        print('开始保存图片')
        with open(image_path, 'wb') as file:
            file.write(content)

    def start(self):
        """Entry point: fetch page 1 of the keyword search and start crawling."""
        print('爬虫开始')
        content = self.parse_text(
            url=self.base_url,
            params={'kw': self.kw, 'ie': 'utf-8', 'fr': 'search'}
        )
        self.page(content)


if __name__ == '__main__':
    spider = TiebaSpider()
    spider.start()