zoukankan      html  css  js  c++  java
  • python: 爬取[博海拾贝]图片脚本

    练手代码,聊作备忘:

    # encoding: utf-8
    # from __future__ import unicode_literals
    
    import urllib
    import urllib2
    import re
    import os
    import time
    from threading import Thread
    
    class BhsbSpider(object):
        """Crawl the listing pages of bh.sb and download the images of each post.

        Each post gets its own sub-folder below ``_ROOT_DIR``; every image is
        saved under the caption of the paragraph that precedes it.

        The class targets Python 2 (it relies on ``urllib2`` and
        ``urllib.urlretrieve`` imported at file level), but is written in syntax
        that also parses on Python 3.
        """

        _url = r'https://bh.sb/post/category/main/'
        _page_count = 0
        _page_index = 0

        # Folder (relative to the working directory) that receives all downloads.
        _ROOT_DIR = '博海拾贝'
        # Only used to number the posts in the progress output.
        # NOTE(review): assumes a listing page shows 20 posts - confirm.
        _POSTS_PER_PAGE = 20
        # Seconds to wait for the server before a page request is abandoned.
        _TIMEOUT = 30
        _USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.5.1000 Chrome/39.0.2146.0 Safari/537.36'

        # Post link on a listing page -> (url, title); the "[博海拾贝NNNN]" tag
        # in front of the title is matched but not captured.
        _POST_RE = re.compile(
            r'(?s)<h2><a\s+href="(?P<url>[^"]+).*?>\[博海拾贝\d+\](?P<title>[^<]+)')
        # Caption paragraph followed by an image paragraph -> (caption, image url).
        _IMAGE_RE = re.compile(
            r'(?s)<p>(?P<title>[^<]+).*?<p><img\s+src="(?P<image>[^"]+)"')
        # Characters that are not allowed in file names on common file systems.
        _UNSAFE_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')

        def __init__(self, url, page_count = 0):
            """Remember the listing URL and page budget; create the root folder.

            :param url: base URL of the category listing.
            :param page_count: number of listing pages to crawl.
            """
            self._url = url
            self._page_count = page_count
            self._ensure_dir(self._to_text(self._ROOT_DIR))

        @staticmethod
        def _to_text(value):
            """Return *value* as text, decoding UTF-8 byte strings."""
            if isinstance(value, bytes):
                return value.decode('utf-8')
            return value

        @staticmethod
        def _ensure_dir(path):
            """Create *path* if needed; tolerate another thread creating it first."""
            try:
                os.makedirs(path)
            except OSError:
                if not os.path.isdir(path):
                    raise

        @classmethod
        def _safe_name(cls, name):
            """Turn scraped text into a usable file or folder name."""
            cleaned = cls._UNSAFE_RE.sub('_', cls._to_text(name)).strip()
            return cleaned or 'untitled'

        @classmethod
        def _image_path(cls, folder, caption, img_url, suffix=''):
            """Build the target path of an image inside *folder*.

            The extension is taken from the URL path, ignoring any query string;
            *suffix* is inserted before the extension to disambiguate duplicates.
            """
            ext = os.path.splitext(cls._to_text(img_url).split('?', 1)[0])[1]
            return os.path.join(cls._to_text(folder),
                                cls._safe_name(caption) + suffix + ext)

        def _page_url(self, index):
            """Return the URL of listing page *index* below the configured base URL."""
            return '%s/page/%d' % (self._url.rstrip('/'), index)

        def spider(self):
            """Crawl listing pages 1.._page_count, continuing after the last one done."""
            while self._page_index < self._page_count:
                self._page_index += 1
                self.do_spider(self._page_url(self._page_index))

        def do_spider(self, url):
            """Fetch one listing page and start a download thread per post found."""
            html = self.get_html(url)
            if html is None:
                # get_html already reported the failure; skip this page.
                return
            for i, (post_url, title) in enumerate(self._POST_RE.findall(html)):
                number = (self._page_index - 1) * self._POSTS_PER_PAGE + i + 1
                print('%d. url: %s, title: %s' % (number, post_url, title))
                # Crawl each post in its own thread.
                Thread(target=self.download, args=(post_url, title)).start()
                # Pause between posts to keep the request rate low.
                time.sleep(2)

        def download(self, url, title):
            """Download every captioned image of the post at *url*.

            :param url: address of the post page.
            :param title: post title, used as the name of the target folder.
            """
            folder = os.path.join(self._to_text(self._ROOT_DIR),
                                  self._safe_name(title))
            self._ensure_dir(folder)
            html = self.get_html(url)
            if html is None:
                return
            used = set()
            for i, (caption, img_url) in enumerate(self._IMAGE_RE.findall(html)):
                img_filename = self._image_path(folder, caption, img_url)
                if img_filename in used:
                    # Same caption twice in one post: keep both images.
                    img_filename = self._image_path(
                        folder, caption, img_url, '_%d' % i)
                used.add(img_filename)
                print('download %s ...' % img_filename)
                if not os.path.exists(img_filename):
                    Thread(target=self._fetch_image,
                           args=(img_url, img_filename)).start()
                    time.sleep(1)

        def _fetch_image(self, img_url, img_filename):
            """Save one image; report a failed transfer instead of dying silently."""
            try:
                urllib.urlretrieve(img_url, img_filename)
            except IOError as ex:
                print('download %s error, ex=%s' % (img_url, ex))

        def get_html(self, url):
            """Return the raw body of *url*, or None when the request fails.

            Failures are reported on stdout and never raised, so one bad page
            does not stop the crawl.
            """
            try:
                if not isinstance(url, bytes):
                    url = url.encode('utf-8')
                req = urllib2.Request(url)
                req.add_header('User-Agent', self._USER_AGENT)
                page = urllib2.urlopen(req, timeout=self._TIMEOUT)
                try:
                    return page.read()
                finally:
                    page.close()
            except Exception as ex:
                print('get url_%s html error, ex=%s' % (url, ex))
                return None
    
    
    if __name__ == '__main__':
        # Script entry point: crawl the first 10 listing pages of the main category.
        start_url = r'https://bh.sb/post/category/main/'
        BhsbSpider(start_url, page_count=10).spider()

    未及细测试,其间有图片丢失情况。结果如下图示:

  • 相关阅读:
    MySQL-Linux升级MySQL
    查看linux 版本
    mysql 密码找回方法
    CentOS7.6利用systemctl添加自定义系统服务
    centos7.6下定时监测MySQL进程终止后自动重启的方法
    Linux实操篇-Linux磁盘分区、挂载
    阿里云centos7.6下MongoDB安装和配置
    Linux中文件权限 chmod、u+x、u、r、w、x分别代表什么
    ABP 发布以后nlog4.NET写入不到日志文件里
    Android studio gradle 下载很缓慢的解决方法,gradle版本不对
  • 原文地址:https://www.cnblogs.com/crwy/p/10623378.html
Copyright © 2011-2022 走看看