本文介绍两种爬取方式:
1.正则表达式
2.bs4解析HTML
以下为正则表达式爬虫,面向对象封装后的代码如下:
import os
import re  # regex-based HTML extraction
import urllib.request  # downloads images to disk


class GetJpg(object):
    """Scrape image URLs/titles from budejie list pages via regex and save them locally."""

    # Download directory.  NOTE(review): the original text lost its path
    # separators ("E:Python爬图片"); restored as a Windows path — confirm.
    SAVE_DIR = r'E:\Python爬图片'

    def __init__(self, start_urls):
        # start_urls: iterable of list-page URLs to crawl, one page per URL.
        self.start_urls = start_urls

    def get_response(self, url):
        """Return the response body (text) of *url*."""
        import requests  # function-local: third-party dep, only needed when fetching
        return requests.get(url).text

    def get_content(self, html):
        """Return every per-image <div class="j-r-list-c"> chunk found in *html*."""
        reg = re.compile(r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.S)
        return re.findall(reg, html)

    def get_jpg_url(self, content):
        """Return all image URLs (data-original attributes) inside *content*."""
        reg = r'data-original="(.*?)"'
        return re.findall(reg, content)

    def get_jpg_name(self, content):
        """Return all image titles (anchor text of /detail-XXXXXXXX.html links)."""
        reg = re.compile(r'<a href="/detail-.{8}.html">(.*?)</a>')
        return re.findall(reg, content)

    def download_jpg(self, src_url, path, index):
        """Download *src_url* to SAVE_DIR/<path>.<index>, skipping existing files.

        *path* is the image title (all whitespace removed); *index* is the file
        extension taken from the URL.
        """
        path = ''.join(path.split())  # strip whitespace so the file name is valid
        path = os.path.join(self.SAVE_DIR, '{name}.{index}'.format(name=path, index=index))
        if not os.path.exists(path):
            urllib.request.urlretrieve(src_url, path)  # 下载图片
            print('OK!!!')
        else:
            print('文件已存在')

    def get_url_name(self, start_url):
        """Crawl one list page: extract each image chunk and download its first image."""
        content = self.get_content(self.get_response(start_url))
        for chunk in content:
            jpg_url = self.get_jpg_url(chunk)
            if not jpg_url:
                continue
            jpg_name = self.get_jpg_name(chunk)
            index = jpg_url[0].split('.')[-1]  # the URL's extension becomes the suffix
            try:
                self.download_jpg(jpg_url[0], jpg_name[0], index)
            except (OSError, IndexError, ValueError) as err:
                # OSError covers urllib download failures; IndexError covers a
                # missing title.  Was a bare `except: continue`, which hid bugs.
                print('下载失败:', err)
                continue

    def main(self):
        """Crawl every start URL in order."""
        # A plain loop, not a list comprehension: this is pure side effects.
        for start_url in self.start_urls:
            self.get_url_name(start_url)


if __name__ == '__main__':
    start_urls = ['http://www.budejie.com/{id}'.format(id=i) for i in range(1, 10)]
    jpg = GetJpg(start_urls)  # 实例化一个对象
    jpg.main()
以下为使用bs4爬取的代码:
import re
import urllib.request


def get_urls(img_girl):
    """
    :param img_girl: iterable of <img> tags
    :return: list of each tag's src attribute (None when absent)
    """
    return [girl.get('src') for girl in img_girl]


def get_img_name(img_girl):
    """
    :param img_girl: iterable of <img> tags
    :return: list of each tag's title attribute (None when absent)
    """
    return [girl.get('title') for girl in img_girl]


def get_img_resource(url):
    """
    :param url: page URL
    :return: all <img> tags found in the page's HTML source
    """
    from bs4 import BeautifulSoup  # function-local: third-party dep, only needed when fetching
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
        # NOTE(review): header name/value carry stray spaces — almost certainly
        # meant 'Accept-Language': 'zh-CN,zh;q=0.8'.  Kept verbatim.
        'Accept - Language': 'zh - CN, zh;q = 0.8'
    }  # 添加请求头部,模拟浏览器
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req, timeout=20)
    content = res.read()  # raw HTML bytes
    soup = BeautifulSoup(content, 'html.parser')
    return soup.find_all('img')


def main(url):
    """
    下载保存图片
    :param url: 网站url
    """
    # Fetch the page ONCE (the original called get_img_resource twice,
    # doubling the network traffic per page).
    img_tags = get_img_resource(url)
    urls = get_urls(img_tags)
    names = get_img_name(img_tags)
    # enumerate keeps url/name pairs aligned; urls.index(src_url) mis-paired
    # them whenever two images shared the same URL.
    for x, src_url in enumerate(urls, start=1):
        # r'\W' (not literal 'W'): strip characters illegal in file names.
        # `or ''` guards against a missing title attribute (None).
        path = ''.join(re.split(r'\W', names[x - 1] or ''))
        # NOTE(review): path separators were lost in the source text
        # ("E:Python爬图片BS4"); restored as a Windows path — confirm.
        path = r'E:\Python爬图片BS4\{name}_{index}.jpg'.format(name=path, index=x)
        urllib.request.urlretrieve(src_url, path)
        print('OK')


if __name__ == "__main__":
    # `i`, not `id`: the original comprehension shadowed the builtin `id`.
    urls = ['https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=i)
            for i in range(1, 10)]
    for url in urls:  # plain loop: side effects, not a list comprehension
        main(url)
bs4面向对象封装后代码:
import re
import urllib.request


class GetWebImg(object):
    """Download every <img> on one page; *index* tags the page number in saved file names."""

    def __init__(self, url, index):
        self.url = url      # page URL to crawl
        self.index = index  # 1-based page number, embedded in file names

    def get_urls(self, img_girl):
        """
        :param img_girl: iterable of <img> tags
        :return: list of each tag's src attribute (None when absent)
        """
        return [girl.get('src') for girl in img_girl]

    def get_img_name(self, img_girl):
        """
        :param img_girl: iterable of <img> tags
        :return: list of each tag's title attribute (None when absent)
        """
        return [girl.get('title') for girl in img_girl]

    def get_img_resource(self, url):
        """
        :param url: page URL
        :return: all <img> tags found in the page's HTML source
        """
        from bs4 import BeautifulSoup  # function-local: third-party dep, only needed when fetching
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
            # NOTE(review): stray spaces — almost certainly meant
            # 'Accept-Language': 'zh-CN,zh;q=0.8'.  Kept verbatim.
            'Accept - Language': 'zh - CN, zh;q = 0.8'
        }  # 添加请求头部,模拟浏览器
        req = urllib.request.Request(url, headers=headers)
        res = urllib.request.urlopen(req, timeout=20)
        soup = BeautifulSoup(res.read(), 'html.parser')
        return soup.find_all('img')

    def main(self):
        """下载保存 self.url 页面上的全部图片."""
        # Fetch the page ONCE (the original called get_img_resource twice,
        # doubling the network traffic per page).
        img_tags = self.get_img_resource(self.url)
        url_list = self.get_urls(img_tags)
        name_list = self.get_img_name(img_tags)
        # enumerate keeps url/name pairs aligned; url_list.index(src_url)
        # mis-paired them whenever two images shared the same URL.
        for x, src_url in enumerate(url_list, start=1):
            # r'\W' (not literal 'W'): strip characters illegal in file names;
            # `or ''` guards against a missing title attribute (None).
            clean = ''.join(re.split(r'\W', name_list[x - 1] or ''))
            # NOTE(review): path separators were lost in the source text
            # ("E:Python爬图片BS4"); restored as a Windows path — confirm.
            path = r'E:\Python爬图片BS4\{name}_{index}_{id}.jpg'.format(
                name=clean, index=self.index, id=x)
            urllib.request.urlretrieve(src_url, path)
            print('第{index}页第{id}张图片下载OK'.format(index=self.index, id=x))


if __name__ == "__main__":
    # `i`, not `id`: the original comprehension shadowed the builtin `id`.
    urls = ['https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=i)
            for i in range(1, 10)]
    for page, url in enumerate(urls, start=1):
        GetWebImg(url, page).main()
运行结果: