zoukankan      html  css  js  c++  java
  • python: 爬取[博海拾贝]图片脚本

    练手代码,聊作备忘:

    # encoding: utf-8
    # from __future__ import unicode_literals
    
    import urllib
    import urllib2
    import re
    import os
    import time
    from threading import Thread
    
    class BhsbSpider(object):
        """Crawl the bh.sb listing pages and download the images of every post.

        Written for Python 2: fetching relies on ``urllib2`` and
        ``urllib.urlretrieve``, and page content is kept as UTF-8 byte strings
        that are decoded to unicode only when used as file-system paths.
        """

        _url = r'https://bh.sb/post/category/main/'
        _base_url = r'https://bh.sb/post/category/main/'
        _page_count = 0
        _page_index = 0
        # Assumed number of posts per listing page; only used to number the
        # entries printed by do_spider().
        _posts_per_page = 20
        # Root download folder, created in the current working directory.
        _root_folder = '博海拾贝'

        # A post link on a listing page: <h2><a href="URL" ...>[博海拾贝NNNN]TITLE
        _post_re = re.compile(
            r'(?s)<h2><a\s+href="(?P<url>[^"]+).*?>\[博海拾贝\d+\](?P<title>[^<]+)')
        # A caption paragraph followed by an image paragraph inside a post.
        _image_re = re.compile(
            r'(?s)<p>(?P<title>[^<]+).*?<p><img\s+src="(?P<image>[^"]+)"')
        # Characters that cannot appear in a file or folder name.
        _unsafe_re = re.compile(r'[\\/:*?"<>|\r\n\t]')

        def __init__(self, url, page_count=0):
            """Remember the crawl settings and create the root download folder.

            :param url: base URL of the category; page N is fetched from
                ``<url>/page/N``.
            :param page_count: number of listing pages to crawl.
            """
            self._url = url
            self._base_url = url
            self._page_count = page_count
            folder = self._root_folder.decode('utf-8')
            if not os.path.exists(folder):
                os.mkdir(folder)

        def spider(self):
            """Crawl listing pages 1..page_count, one after another."""
            while self._page_index < self._page_count:
                self._page_index += 1
                self._url = self._page_url(self._page_index)
                self.do_spider(self._url)

        def do_spider(self, url):
            """Fetch one listing page and start a download thread per post.

            :param url: URL of the listing page.
            """
            html = self.get_html(url)
            if not html:
                # get_html() already reported the failure; nothing to parse.
                return
            first = (self._page_index - 1) * self._posts_per_page + 1
            for number, (post_url, title) in enumerate(self._parse_posts(html), first):
                print('%d. url: %s, title: %s' % (number, post_url, title))
                # Crawl each post page in its own thread.
                Thread(target=self.download, args=(post_url, title)).start()
                time.sleep(2)

        def download(self, url, title):
            """Download every captioned image of one post into its own folder.

            :param url: URL of the post page.
            :param title: post title, used (sanitised) as the folder name.
            """
            # os.path.join keeps the path valid on every platform instead of
            # hard-coding a Windows separator.
            folder_bytes = os.path.join(self._root_folder, self._safe_name(title))
            folder = folder_bytes.decode('utf-8')
            if not os.path.exists(folder):
                os.mkdir(folder)
            html = self.get_html(url)
            if not html:
                return
            for img_title, img_url in self._parse_images(html):
                # Drop any query string so it does not end up in the extension.
                ext = os.path.splitext(img_url.split('?', 1)[0])[1]
                img_filename = os.path.join(
                    folder_bytes, self._safe_name(img_title) + ext).decode('utf-8')
                print('download %s ...' % img_filename)
                # Files that already exist are not fetched again.
                if not os.path.exists(img_filename):
                    Thread(target=urllib.urlretrieve, args=(img_url, img_filename)).start()
                    time.sleep(1)

        def get_html(self, url):
            """Return the raw body of ``url``, or ``None`` if the request fails.

            Failures are printed rather than raised so that one bad page does
            not stop the whole crawl.
            """
            try:
                # Only unicode needs encoding; encoding a byte string would
                # first decode it as ASCII and fail on non-ASCII URLs.
                if not isinstance(url, bytes):
                    url = url.encode('utf-8')
                req = urllib2.Request(url)
                req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.9.5.1000 Chrome/39.0.2146.0 Safari/537.36')
                page = urllib2.urlopen(req)
                try:
                    return page.read()
                finally:
                    page.close()
            except Exception as ex:
                print('get url_%s html error, ex=%s' % (url, ex))
                return None

        def _page_url(self, index):
            """Return the URL of listing page ``index`` below the base URL."""
            return '%s/page/%d' % (self._base_url.rstrip('/'), index)

        @classmethod
        def _parse_posts(cls, html):
            """Return the ``(url, title)`` pairs of the posts on a listing page."""
            return cls._post_re.findall(html)

        @classmethod
        def _parse_images(cls, html):
            """Return the ``(caption, image_url)`` pairs found in a post page."""
            return cls._image_re.findall(html)

        @classmethod
        def _safe_name(cls, name):
            """Return ``name`` stripped, with unsafe path characters replaced by ``_``."""
            return cls._unsafe_re.sub('_', name.strip())
    
    
    if __name__ == '__main__':
        # Script entry point: build a spider for the main category with a page
        # count of 10 and start crawling.
        BhsbSpider(r'https://bh.sb/post/category/main/', 10).spider()

    未及细测,其间有图片丢失的情况。结果如下图所示:

  • 相关阅读:
    SoapUI 使用笔记
    git 使用笔记(二)
    git 使用笔记(一)
    jquery 拓展
    hdu 1024 Max Sum Plus Plus (DP)
    hdu 2602 Bone Collector (01背包)
    hdu 1688 Sightseeing (最短路径)
    hdu 3191 How Many Paths Are There (次短路径数)
    hdu 2722 Here We Go(relians) Again (最短路径)
    hdu 1596 find the safest road (最短路径)
  • 原文地址:https://www.cnblogs.com/crwy/p/10623378.html
Copyright © 2011-2022 走看看