  • A simple crawler example

    Goal: crawl 300 journal-paper pages from CNKI Space (知网空间).

    The reference links in section 8 explain the underlying principles in detail and include a tutorial.

    1. Scheduler: controls the overall crawling flow (a minimal gevent demo follows the code below).

       spider_main.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'spider_main - scheduler'
    # Monkey-patch the standard library first so the blocking IO in urllib2
    # (used by the downloader) becomes cooperative under gevent.
    from gevent import monkey; monkey.patch_all()
    import gevent
    
    import url_manager
    import html_downloader
    import html_parser
    import html_outputer
    
    class SpiderMain(object):
        """docstring for SpiderMain"""
        def __init__(self):
            super(SpiderMain, self).__init__()
            self.urls = url_manager.UrlManager()
            self.downloader = html_downloader.HtmlDownloader()
            self.parser = html_parser.HtmlParser()
            self.outputer = html_outputer.HtmlOutputer()
            self.count = 1 # number of pages crawled so far
    
        def gevent_01(self):
            sum_ge = 1
            temp = []
            while sum_ge <= 50: # spawn 50 coroutines to crawl concurrently (very handy)
                temp.append(gevent.spawn(self.craw, sum_ge))
                sum_ge = sum_ge + 1
            gevent.joinall(temp)
    
        def craw(self, n):
            while True:
                if self.count > 300: return # stop after 300 pages
                if self.urls.has_new_url():
                    try:
                        new_url = self.urls.get_new_url()
                        html_cont = self.downloader.downloader(new_url)
                        new_urls, new_data = self.parser.parser(new_url, html_cont)
                        self.urls.add_new_urls(new_urls)
                        if new_data is None: continue # page has no target content, move on to the next URL
                        self.outputer.collect_data(new_data)
                        print 'coroutine %d' % n
                        print 'craw %d : %s' % (self.count, new_url)
                        self.count = self.count + 1 # one more page crawled
                    except Exception as e:
                        print 'craw failed: %s' % e
                else:
                    gevent.sleep(0) # no new URL to crawl yet; yield control to another coroutine
    
    if __name__ == '__main__':
        root_url = 'http://www.cnki.com.cn/index.htm'
        obj_spider = SpiderMain()
        obj_spider.urls.add_new_url(root_url)
        obj_spider.gevent_01()
        obj_spider.outputer.output_html()
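
    The scheduler relies on three gevent calls: gevent.spawn to create a coroutine, gevent.joinall to wait for all of them, and gevent.sleep(0) to hand control to another coroutine. Below is a minimal standalone demo of that pattern; it is illustrative only and not part of the crawler itself.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Minimal demo of the gevent pattern used by the scheduler above.
    import gevent
    
    def worker(n):
        for i in range(3):
            print 'coroutine %d, step %d' % (n, i)
            gevent.sleep(0) # yield so the other coroutines get a turn
    
    if __name__ == '__main__':
        jobs = [gevent.spawn(worker, n) for n in range(1, 4)] # create 3 coroutines
        gevent.joinall(jobs) # block until every coroutine has finished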

    2. URL manager (database, Redis cache, or in-memory): manages the set of URLs waiting to be crawled and the set of URLs already crawled, so pages are not crawled repeatedly or in a loop (a Redis-backed sketch follows the code below).

        url_manager.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'url_manager - URL manager'
    
    class UrlManager(object):
        """docstring for UrlManager"""
        def __init__(self):
            super(UrlManager, self).__init__()
            self.new_urls = set() # URLs waiting to be crawled
            self.old_urls = set() # URLs already crawled
    
        def add_new_url(self, url):
            # add a single URL, skipping ones we have already seen
            if url is None: return
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)
    
        def add_new_urls(self, urls):
            if urls is None or len(urls) == 0: return
            for url in urls:
                self.add_new_url(url)
    
        def get_new_url(self):
            # pop a pending URL and remember it as crawled
            new_url = self.new_urls.pop()
            self.old_urls.add(new_url)
            return new_url
    
        def has_new_url(self):
            return len(self.new_urls) != 0
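
    The heading above notes that a URL manager can also live in a database or a Redis cache rather than in in-memory sets. A rough sketch of a Redis-backed variant is shown below; it is not part of the original project, the class name RedisUrlManager and the key names are invented for this example, and it assumes a local Redis server plus the third-party redis package.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Hypothetical Redis-backed URL manager (illustrative sketch only).
    import redis
    
    class RedisUrlManager(object):
        def __init__(self, host='localhost', port=6379):
            self.r = redis.StrictRedis(host=host, port=port)
            self.new_key = 'spider:new_urls' # set of URLs waiting to be crawled
            self.old_key = 'spider:old_urls' # set of URLs already crawled
    
        def add_new_url(self, url):
            if url is None: return
            # only queue URLs that are in neither set yet
            if not self.r.sismember(self.new_key, url) and not self.r.sismember(self.old_key, url):
                self.r.sadd(self.new_key, url)
    
        def add_new_urls(self, urls):
            if not urls: return
            for url in urls:
                self.add_new_url(url)
    
        def get_new_url(self):
            new_url = self.r.spop(self.new_key) # pop an arbitrary pending URL
            if new_url is not None:
                self.r.sadd(self.old_key, new_url)
            return new_url
    
        def has_new_url(self):
            return self.r.scard(self.new_key) != 0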

    3. Page downloader (the standard library's urllib2, or the third-party requests): a tool that downloads the page behind a URL to the local machine (a requests-based sketch follows the code below).

        html_downloader.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'html_downloader - page downloader'
    import urllib2
    
    class HtmlDownloader(object):
        """docstring for html_downloader"""
        def __init__(self):
            super(HtmlDownloader, self).__init__()
    
        def downloader(self, url):
            if url is None: return None
            response = urllib2.urlopen(url) # simplest, most direct way to download (see the reference links)
            if response.getcode() != 200: return None
            return response.read()
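
    The heading above also mentions the third-party requests library as an alternative to urllib2. A minimal sketch of such a downloader, keeping the same downloader(url) interface, is shown below; it is not part of the original code and assumes requests is installed.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Illustrative alternative downloader built on the third-party requests library.
    import requests
    
    class RequestsDownloader(object):
        def downloader(self, url):
            if url is None: return None
            try:
                # a timeout keeps a coroutine from hanging forever on a dead server
                response = requests.get(url, timeout=10)
            except requests.RequestException:
                return None
            if response.status_code != 200: return None
            return response.content # raw bytes, like urllib2's response.read()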

    4. Page parser (regular expressions, lxml, the built-in html.parser, or the third-party BeautifulSoup): a tool that extracts the valuable data from a page (an lxml-based sketch follows the code below).

       html_parser.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'html_parser - page parser'
    from bs4 import BeautifulSoup
    import re
    import urlparse
    
    class HtmlParser(object):
        """docstring for html_parser"""
        def __init__(self):
            super(HtmlParser, self).__init__()
    
        def __get_new_urls__(self, page_url, soup):
            new_urls = set()
    
            links_01 = soup.find_all('a', href=re.compile(r'/Journal/')) # links to journal pages
            for link in links_01:
                new_url = link['href']
                new_full_url = urlparse.urljoin(page_url, new_url) # resolve relative href against the page URL
                new_urls.add(new_full_url)
    
            links = soup.find_all('a', class_='zt_name', href=re.compile(r'/Article/')) # links to article pages
            for link in links:
                new_url = link['href']
                new_full_url = urlparse.urljoin(page_url, new_url)
                new_urls.add(new_full_url)
            return new_urls
    
        def __get_new_data__(self,page_url,soup):
            res_data = {}
    
            title_node = soup.find('h1',class_='xx_title')
            if title_node is None: return
            res_data['title'] = title_node.get_text()
    
            summary_node = soup.find('div',class_='xx_font')
            res_data['summary'] = summary_node.get_text()
    
            res_data['url'] = page_url
    
            return res_data
    
        def parser(self, page_url, html_cont):
            if page_url is None or html_cont is None: return
    
            soup = BeautifulSoup(html_cont,'html.parser',from_encoding='utf-8')
            new_urls = self.__get_new_urls__(page_url,soup)
            new_data = self.__get_new_data__(page_url,soup)
            return new_urls, new_data
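
    The heading above lists regular expressions, lxml and the built-in html.parser as alternatives to BeautifulSoup. As a rough illustration (not part of the original project), the same extraction could be done with lxml and XPath; the sketch below assumes the third-party lxml package and the same page structure (h1.xx_title, div.xx_font, a.zt_name).

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Illustrative lxml/XPath version of the parser logic (sketch only).
    import urlparse
    from lxml import html
    
    def parse_with_lxml(page_url, html_cont):
        tree = html.fromstring(html_cont)
    
        # journal and article links, resolved to absolute URLs
        hrefs = tree.xpath('//a[contains(@href, "/Journal/")]/@href')
        hrefs += tree.xpath('//a[@class="zt_name"][contains(@href, "/Article/")]/@href')
        new_urls = set(urlparse.urljoin(page_url, h) for h in hrefs)
    
        # title and summary, if the page has them
        titles = tree.xpath('//h1[@class="xx_title"]/text()')
        summaries = tree.xpath('//div[@class="xx_font"]//text()')
        new_data = None
        if titles:
            new_data = {'url': page_url,
                        'title': titles[0],
                        'summary': ''.join(summaries)}
        return new_urls, new_data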

    5. Outputer: writes out the collected data.

        html_outputer.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'html_outputer - outputer'
    
    class HtmlOutputer(object):
        """docstring for html_outputer"""
        def __init__(self):
            super(HtmlOutputer, self).__init__()
            self.datas = []
    
        def collect_data(self, data):
            if data is None: return
            self.datas.append(data)
    
        def output_html(self):
            fout = open('output.html','w')
            fout.write('<html>')
            fout.write('<head><meta charset="utf-8"></head>')
            fout.write('<body>')
            fout.write('<table>')
    
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'].encode('utf-8'))
                fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
                fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
                fout.write('</tr>')
    
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
            fout.close()

    6. Problems

        With several coroutines crawling concurrently, the time spent waiting on network IO (the downloader fetching a page) is hidden, but there is no telling which coroutine finishes first and which finishes last, so the pages end up being crawled in no particular order. That should not be much of a problem.
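
        If a stable order ever matters, one possible workaround (not in the original code) is to tag each record with the value of the crawl counter when it is collected and sort on that tag before writing the report. The 'order' key below is hypothetical, invented for this sketch.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # Standalone sketch: records collected out of order can be sorted by a crawl
    # index before output. The 'order' key is hypothetical, not in the project.
    datas = [
        {'order': 3, 'title': 'third page crawled'},
        {'order': 1, 'title': 'first page crawled'},
        {'order': 2, 'title': 'second page crawled'},
    ]
    
    for data in sorted(datas, key=lambda d: d['order']):
        print 'craw %d : %s' % (data['order'], data['title'])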

    7. Demo

    8. References

    https://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000/001407503089986d175822da68d4d6685fbe849a0e0ca35000

    https://www.imooc.com/learn/563
