zoukankan      html  css  js  c++  java
  • python网络爬虫抓取网站图片

    本文介绍两种爬取方式:

    1.正则表达式

    2.bs4解析HTML

    以下为正则表达式爬虫,面向对象封装后的代码如下:

    import urllib.request  # 用于下载图片
    import os
    import requests  # 发送http请求
    import re   # 正则表达式匹配
    
    
    class GetJpg(object):
        """Regex-based image scraper: fetch pages, extract image URLs and
        titles with regular expressions, and save each image to disk."""

        def __init__(self, start_urls):
            # List of page URLs to crawl.
            self.start_urls = start_urls

        def get_response(self, url):
            """Return the HTML body of *url* as text."""
            return requests.get(url).text

        def get_content(self, html):
            """Return every <div class="j-r-list-c"> block (one per image) in *html*."""
            reg = re.compile(r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.S)
            return re.findall(reg, html)

        def get_jpg_url(self, content):
            """Return all image URLs (``data-original`` attribute values) in *content*."""
            reg = r'data-original="(.*?)"'
            return re.findall(reg, content)

        def get_jpg_name(self, content):
            """Return all image titles (anchor text of detail links) in *content*."""
            reg = re.compile(r'<a href="/detail-.{8}.html">(.*?)</a>')
            return re.findall(reg, content)

        def download_jpg(self, src_url, path, index):
            """Download *src_url*, naming the file from *path* (title) and *index* (extension)."""
            path = ''.join(path.split())  # remove all whitespace from the title
            # BUGFIX: the original literal had no directory separators
            # ('E:Python爬图片{name}.{index}'); restore the backslashes so files
            # land in the intended folder instead of the drive's working dir.
            path = 'E:\\Python爬图片\\{name}.{index}'.format(name=path, index=index)
            if not os.path.exists(path):
                urllib.request.urlretrieve(src_url, path)  # fetch and save the image
                print('OK!!!')
            else:
                print('文件已存在')

        def get_url_name(self, start_url):
            """Download every image found on one page (best-effort per image)."""
            content = self.get_content(self.get_response(start_url))
            for block in content:
                jpg_url = self.get_jpg_url(block)
                if not jpg_url:
                    continue
                jpg_name = self.get_jpg_name(block)
                ext = jpg_url[0].split('.')[-1]  # file extension taken from the URL
                try:
                    self.download_jpg(jpg_url[0], jpg_name[0], ext)
                except Exception:  # BUGFIX: was a bare except; keep best-effort, skip failures
                    continue

        def main(self):
            """Crawl every start URL in turn."""
            # Plain loop instead of a list comprehension: we only want the
            # side effects, not a throwaway list of Nones.
            for start_url in self.start_urls:
                self.get_url_name(start_url)
    
    
    if __name__ == '__main__':
        # Crawl pages http://www.budejie.com/1 through /9.
        page_urls = ['http://www.budejie.com/{id}'.format(id=page) for page in range(1, 10)]
        spider = GetJpg(page_urls)  # build the scraper over all pages
        spider.main()

    以下为使用bs4爬取的代码:

    from bs4 import BeautifulSoup
    import urllib.request
    import re
    
    
    def get_urls(img_girl):
        '''
        :param img_girl: iterable of <img> tag elements
        :return: the ``src`` attribute of every tag, in order
        '''
        return [tag.get('src') for tag in img_girl]
    
    
    def get_img_name(img_girl):
        '''
        :param img_girl: iterable of <img> tag elements
        :return: the ``title`` attribute of every tag, in order
        '''
        titles = []
        for tag in img_girl:
            titles.append(tag.get('title'))
        return titles
    
    
    def get_img_resource(url):
        '''
        Fetch *url* and return all <img> tag elements from its HTML source.

        :param url: page URL to fetch
        :return: list of <img> tag elements parsed from the page
        '''
        # BUGFIX: HTTP header names/values must not contain stray spaces —
        # 'Accept - Language' is not a valid header name and would be ignored
        # (or rejected) by servers.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
                   'Accept-Language': 'zh-CN,zh;q=0.8'
        }  # mimic a real browser so the site serves the normal page
        req = urllib.request.Request(url, headers=headers)  # build the request
        res = urllib.request.urlopen(req, timeout=20)  # send it (20 s timeout)
        content = res.read()  # raw HTML bytes of the response
        soup = BeautifulSoup(content, 'html.parser')  # parse the HTML source
        img_girl = soup.find_all('img')  # every <img> element in the page
        return img_girl
    
    
    def main(url):
        '''
        Download and save every image found at *url*.
        :param url: page URL to crawl
        '''
        # Fetch the page ONCE and derive both lists from the same tag set
        # (the original called get_img_resource twice, doubling the network cost).
        img_tags = get_img_resource(url)
        urls = get_urls(img_tags)
        names = get_img_name(img_tags)
        # enumerate() replaces urls.index(src_url), which was O(n) per image
        # and returns the wrong title when two images share a URL.
        for x, src_url in enumerate(urls, start=1):
            # BUGFIX: r'\W' (non-word characters), not literal 'W' — strips
            # characters that are illegal in a Windows file name.
            path_l = re.split(r'\W', names[x - 1])
            path = ''.join(path_l)
            # BUGFIX: restore the lost backslash separators in the save path.
            path = 'E:\\Python爬图片BS4\\{name}_{index}.jpg'.format(name=path, index=x)
            urllib.request.urlretrieve(src_url, path)
            print('OK')
    
    if __name__ == "__main__":
        # Crawl gallery pages with offsets 1..9.
        for offset in range(1, 10):
            page = 'https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=offset)
            main(page)

     bs4面向对象封装后代码:

    from bs4 import BeautifulSoup
    import urllib.request
    import re
    
    
    class GetWebImg(object):
        """BeautifulSoup-based image scraper for a single page: collect every
        <img> tag, then download each image using its title as the file name."""

        def __init__(self, url, index):
            # Page URL to crawl and its 1-based page number (used in file names).
            self.url = url
            self.index = index

        def get_urls(self, img_girl):
            '''
            :param img_girl: iterable of <img> tag elements
            :return: the ``src`` attribute of every tag, in order
            '''
            return [girl.get('src') for girl in img_girl]

        def get_img_name(self, img_girl):
            '''
            :param img_girl: iterable of <img> tag elements
            :return: the ``title`` attribute of every tag, in order
            '''
            return [girl.get('title') for girl in img_girl]

        def get_img_resource(self, url):
            '''
            Fetch *url* and return all <img> tag elements from its HTML source.

            :param url: page URL to fetch
            :return: list of <img> tag elements parsed from the page
            '''
            # BUGFIX: 'Accept - Language' is not a valid HTTP header name;
            # spaces inside the name/value were stray.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
                'Accept-Language': 'zh-CN,zh;q=0.8'
                }  # mimic a real browser so the site serves the normal page
            req = urllib.request.Request(url, headers=headers)  # build the request
            res = urllib.request.urlopen(req, timeout=20)  # send it (20 s timeout)
            content = res.read()  # raw HTML bytes of the response
            soup = BeautifulSoup(content, 'html.parser')  # parse the HTML source
            return soup.find_all('img')  # every <img> element in the page

        def main(self):
            '''
            Download and save every image found at ``self.url``.
            '''
            # Fetch the page ONCE (the original fetched it twice).
            img_tags = self.get_img_resource(self.url)
            url_list = self.get_urls(img_tags)
            name_list = self.get_img_name(img_tags)
            # enumerate() replaces url_list.index(src_url), which was O(n) per
            # image and wrong when two images share the same URL.
            for x, src_url in enumerate(url_list, start=1):
                # BUGFIX: r'\W' (non-word chars), not literal 'W' — strips
                # characters illegal in Windows file names.
                path = ''.join(re.split(r'\W', name_list[x - 1]))
                # BUGFIX: restore the lost backslash separators in the save path.
                path = 'E:\\Python爬图片BS4\\{name}_{index}_{id}.jpg'.format(name=path, index=self.index, id=x)
                urllib.request.urlretrieve(src_url, path)
                print('第{index}页第{id}张图片下载OK'.format(index=self.index, id=x))
    
    
    if __name__ == "__main__":
        # Page number and gallery offset advance together: offsets 1..9 map
        # to page indices 1..9.
        base = 'https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'
        for page_no, offset in enumerate(range(1, 10), start=1):
            scraper = GetWebImg(base.format(i=offset), page_no)
            scraper.main()

    运行结果:

  • 相关阅读:
    从SmartObject中读取数据
    将数据存储到SmartObject中
    浅识K2 blackpearl中SmartObject
    .net基础
    字符编码
    PowerDesigner教程系列
    Asp.net 2.0(C#)图片存储到数据库和从数据库读取显示
    按回车提交问题:
    .NET设计模式(2):单件模式(Singleton Pattern)
    常用正则表达式收集
  • 原文地址:https://www.cnblogs.com/wolfshining/p/9013906.html
Copyright © 2011-2022 走看看