zoukankan      html  css  js  c++  java
  • python: 爬取[博海拾贝]图片脚本

    练手代码,聊作备忘:

    # encoding: utf-8
    # from __future__ import unicode_literals
    
    import urllib
    import urllib2
    import re
    import os
    import time
    from threading import Thread
    
    class BhsbSpider(object):
        """Crawl bh.sb listing pages and download the images of every post.

        The class targets Python 2: pages are fetched with ``urllib2`` and
        images with ``urllib.urlretrieve``.  Every post found on a listing page
        gets its own sub-folder below ``_root_folder``; each image of the post
        is stored there, named after the text paragraph that precedes it.
        """

        # Folder (relative to the working directory) receiving all downloads.
        _root_folder = u'博海拾贝'
        # Listing URL used when the constructor receives an empty one.
        _url = r'https://bh.sb/post/category/main/'
        _page_count = 0
        _page_index = 0
        # Only used to number the posts in the progress output.
        _posts_per_page = 20
        # Seconds to wait for a page before giving up on it.
        _timeout = 30
        _user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.5.1000 Chrome/39.0.2146.0 Safari/537.36'

        # Link + title of a post on a listing page, e.g.
        # <h2><a href="...">[博海拾贝0327]Some title</a>
        _post_pattern = re.compile(
            u'(?s)<h2><a\\s+href="(?P<url>[^"]+).*?>\\[博海拾贝\\d+\\](?P<title>[^<]+)')
        # A text paragraph followed by the next image paragraph inside a post.
        _image_pattern = re.compile(
            u'(?s)<p>(?P<title>[^<]+).*?<p><img\\s+src="(?P<image>[^"]+)"')
        # Characters that are not allowed in file names on common file systems.
        _unsafe_chars = re.compile(u'[\\\\/:*?"<>|]+')

        def __init__(self, url, page_count = 0):
            """Remember the listing URL and page budget; create the root folder.

            url        -- listing URL; ``/page/<n>`` is appended for each page.
            page_count -- number of listing pages ``spider`` will walk.
            """
            self._base_url = url or self._url
            self._url = self._base_url
            self._page_count = page_count
            self._page_index = 0
            self._ensure_dir(self._root_folder)

        def spider(self):
            """Process listing pages 1.._page_count one after another."""
            base = self._base_url.rstrip('/')
            while self._page_index < self._page_count:
                self._page_index += 1
                self._url = '%s/page/%d' % (base, self._page_index)
                self.do_spider(self._url)

        def do_spider(self, url):
            """Fetch one listing page and start a download thread per post."""
            html = self.get_html(url)
            if html is None:
                # get_html already reported the failure; skip this page.
                return
            first = (self._page_index - 1) * self._posts_per_page + 1
            for number, (post_url, title) in enumerate(self._parse_posts(html), first):
                self._log('%d. url: %s, title: %s' % (number, post_url, title))
                # One worker thread per post; the pause throttles the requests.
                Thread(target=self.download, args=(post_url, title)).start()
                time.sleep(2)

        def download(self, url, title):
            """Download every image of the post at *url* into a folder named *title*."""
            folder_name = self._safe_name(self._to_text(title)) or u'untitled'
            folder = os.path.join(self._root_folder, folder_name)
            self._ensure_dir(folder)
            html = self.get_html(url)
            if html is None:
                return
            used_names = set()
            for index, (img_title, img_url) in enumerate(self._parse_images(html), 1):
                name = self._safe_name(img_title) or u'image'
                if name in used_names:
                    # Two images share a caption: keep both instead of
                    # silently dropping the second one.
                    name = u'%s_%d' % (name, index)
                used_names.add(name)
                # Ignore a query string when picking the file extension.
                extension = os.path.splitext(img_url.split('?', 1)[0])[1]
                img_filename = os.path.join(folder, name + extension)
                self._log('download %s ...' % img_filename)
                if not os.path.exists(img_filename):
                    Thread(target=self._fetch_image, args=(img_url, img_filename)).start()
                    time.sleep(1)

        def get_html(self, url):
            """Return the raw body of *url*, or None when the request fails."""
            url = self._url_bytes(url)
            try:
                req = urllib2.Request(url)
                req.add_header('User-Agent', self._user_agent)
                page = urllib2.urlopen(req, timeout=self._timeout)
                try:
                    return page.read()
                finally:
                    page.close()
            except Exception as ex:
                # Best effort: one broken page must not stop the whole crawl.
                self._log('get url_%s html error, ex=%s' % (url, ex))
                return None

        def _fetch_image(self, img_url, img_filename):
            """Store one image; report a failure instead of dying silently in the thread."""
            raw_url = self._url_bytes(img_url)
            try:
                urllib.urlretrieve(raw_url, img_filename)
            except Exception as ex:
                self._log('download %s error, ex=%s' % (raw_url, ex))
                # Remove a partial file so that a later run retries the image.
                if os.path.exists(img_filename):
                    try:
                        os.remove(img_filename)
                    except OSError:
                        pass

        @classmethod
        def _parse_posts(cls, html):
            """Return the (url, title) pairs of the posts linked on a listing page."""
            return cls._post_pattern.findall(cls._to_text(html))

        @classmethod
        def _parse_images(cls, html):
            """Return the (caption, image url) pairs found in a post page."""
            return cls._image_pattern.findall(cls._to_text(html))

        @classmethod
        def _safe_name(cls, name):
            """Turn *name* into something usable as a file or folder name."""
            collapsed = u' '.join(name.split())
            return cls._unsafe_chars.sub(u'_', collapsed).strip(u' .')

        @staticmethod
        def _to_text(value):
            """Decode UTF-8 bytes to text; text is returned unchanged."""
            if isinstance(value, bytes):
                return value.decode('utf-8', 'replace')
            return value

        @staticmethod
        def _url_bytes(url):
            """Return *url* as a UTF-8 byte string, the form urllib/urllib2 take on Python 2."""
            if isinstance(url, bytes):
                return url
            return url.encode('utf-8')

        @staticmethod
        def _ensure_dir(path):
            """Create *path* if needed; safe when several threads race for it."""
            try:
                os.makedirs(path)
            except OSError:
                if not os.path.isdir(path):
                    raise

        @staticmethod
        def _log(message):
            """Print *message*, falling back to its repr if the console cannot encode it."""
            try:
                print(message)
            except UnicodeError:
                print(repr(message))
    
    
    if __name__ == '__main__':
        # Script entry point: walk the first ten listing pages of the
        # main category and download what they link to.
        start_url = r'https://bh.sb/post/category/main/'
        BhsbSpider(start_url, 10).spider()

    未及细测试,其间有图片丢失情况。结果如下图示:

  • 相关阅读:
    Constructor构造方法
    overload重载
    static关键字
    this关键字
    继承
    ORACLE数据库 常用命令和Sql常用语句
    常见单词
    L贪心基础
    J贪心
    K贪心
  • 原文地址:https://www.cnblogs.com/crwy/p/10623378.html
Copyright © 2011-2022 走看看