  • A simple spider example

    Goal of this example: crawl 300 journal-paper pages from CNKI Space (知网空间).

    The reference links at the end explain the underlying principles in detail and include a tutorial.

    1. Scheduler: controls the overall crawl flow

       spider_main.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'spider_main - scheduler'
    from gevent import monkey; monkey.patch_all() # patch the standard library first, so urllib2 becomes cooperative
    import gevent
    import url_manager
    import html_downloader
    import html_parser
    import html_outputer
    
    class SpiderMain(object):
        """docstring for SpiderMain"""
        def __init__(self):
            super(SpiderMain, self).__init__()
            self.urls = url_manager.UrlManager()
            self.downloader = html_downloader.HtmlDownloader()
            self.parser = html_parser.HtmlParser()
            self.outputer = html_outputer.HtmlOutputer()
            self.count = 1 # number of pages crawled so far
    
        def gevent_01(self):
            sum_ge = 1
            temp = []
            while sum_ge <= 50: # spawn 50 coroutines to crawl concurrently (works very well)
                temp.append(gevent.spawn(self.craw, sum_ge))
                sum_ge = sum_ge + 1
            gevent.joinall(temp)
    
        def craw(self, n):
            while True:
                if self.count > 300: return # stop after 300 pages
                if self.urls.has_new_url():
                    try:
                        new_url = self.urls.get_new_url()
                        html_cont = self.downloader.downloader(new_url)
                        new_urls, new_data = self.parser.parser(new_url,html_cont)
                        self.urls.add_new_urls(new_urls)
                        if new_data is None: continue # nothing useful on this page, move on to the next one
                        self.outputer.collect_data(new_data)
                        print 'coroutine %d ' % n
                        print 'craw %d : %s' % (self.count,new_url)
                        self.count = self.count + 1 # one more page crawled
                    except Exception as e:
                        print 'craw failed'
                else:
                    gevent.sleep(0) # no new URL to crawl yet, yield control to another coroutine
    
    if __name__ == '__main__':
        root_url = 'http://www.cnki.com.cn/index.htm'
        obj_spider = SpiderMain()
        obj_spider.urls.add_new_url(root_url)
        obj_spider.gevent_01()
        obj_spider.outputer.output_html()

    2. URL manager (database, Redis cache, or in-memory): manages the set of URLs waiting to be crawled and the set already crawled, to prevent duplicate and circular crawling (a Redis-backed sketch follows the in-memory implementation below)

        url_manager.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'url_manager - URL manager'
    
    class UrlManager(object):
        """docstring for UrlManager"""
        def __init__(self):
            super(UrlManager, self).__init__()
            self.new_urls = set()
            self.old_urls = set()
    
        def add_new_url(self,url):
            if url is None: return
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)
    
        def add_new_urls(self,urls):
            if urls is None or len(urls) == 0: return
            for url in urls:
                self.add_new_url(url)
    
        def get_new_url(self):
            new_url = self.new_urls.pop()
            self.old_urls.add(new_url)
            return new_url
    
        def has_new_url(self):
            return len(self.new_urls) != 0
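
    The heading above notes that the two URL sets could also live in a database or a Redis cache instead of in memory. Below is a minimal sketch of a Redis-backed variant, assuming a local Redis server and the third-party redis package; the class and key names (RedisUrlManager, spider:new_urls, spider:old_urls) are made up for illustration.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'redis_url_manager - Redis-backed URL manager (sketch)'
    import redis

    class RedisUrlManager(object):
        """Same interface as UrlManager, but keeps both URL sets in Redis."""
        def __init__(self, host='localhost', port=6379):
            self.r = redis.StrictRedis(host=host, port=port)
            self.new_key = 'spider:new_urls' # URLs waiting to be crawled
            self.old_key = 'spider:old_urls' # URLs already crawled

        def add_new_url(self, url):
            if url is None: return
            # only queue a URL that is in neither set
            if not self.r.sismember(self.new_key, url) and not self.r.sismember(self.old_key, url):
                self.r.sadd(self.new_key, url)

        def add_new_urls(self, urls):
            if urls is None: return
            for url in urls:
                self.add_new_url(url)

        def get_new_url(self):
            new_url = self.r.spop(self.new_key) # pop an arbitrary member, like set.pop()
            if new_url is not None:
                self.r.sadd(self.old_key, new_url)
            return new_url

        def has_new_url(self):
            return self.r.scard(self.new_key) != 0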

    3. Page downloader (the standard-library urllib2, or the third-party requests): fetches the page behind a URL and brings its content into the program (a requests-based sketch follows the urllib2 version below)

        html_downloader.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'html_downloader - page downloader'
    import urllib2
    
    class HtmlDownloader(object):
        """docstring for html_downloader"""
        def __init__(self):
            super(HtmlDownloader, self).__init__()
    
        def downloader(self,url):
            if url is None: return None
            response = urllib2.urlopen(url) # the most direct way to download (see the reference links)
            if response.getcode() != 200: return None
            return response.read()
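
    The heading mentions the third-party requests library as an alternative to urllib2. A minimal sketch of the same downloader written with requests might look like the following; requests is not used anywhere else in this example, and the User-Agent header is just a common precaution for sites that reject the default client string.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'html_downloader_requests - alternative downloader built on requests (sketch)'
    import requests

    class HtmlDownloaderRequests(object):
        """Same interface as HtmlDownloader, but uses requests instead of urllib2."""
        def downloader(self, url):
            if url is None: return None
            headers = {'User-Agent': 'Mozilla/5.0'} # pretend to be a browser
            try:
                response = requests.get(url, headers=headers, timeout=10)
            except requests.RequestException:
                return None
            if response.status_code != 200: return None
            return response.content # raw bytes, matching what urllib2's read() returns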

    4. Page parser (regular expressions, lxml, the built-in html.parser, or the third-party BeautifulSoup): extracts the valuable data and the next links from a downloaded page (an lxml-based sketch follows the BeautifulSoup version below)

       html_parser.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'html_parser - page parser'
    from bs4 import BeautifulSoup
    import re
    import urlparse
    
    class HtmlParser(object):
        """docstring for html_parser"""
        def __init__(self):
            super(HtmlParser, self).__init__()
    
        def __get_new_urls__(self, page_url, soup):
            new_urls = set()
    
            links_01 = soup.find_all('a',href=re.compile(r'/Journal/')) # links to journal pages
            for link in links_01:
                new_url = link['href']
                new_full_url = urlparse.urljoin(page_url, new_url) # resolve the relative link against the page URL
                new_urls.add(new_full_url)
    
            links = soup.find_all('a',class_='zt_name',href=re.compile(r'/Article/')) # links to article pages
            for link in links:
                new_url = link['href']
                new_full_url = urlparse.urljoin(page_url, new_url) # resolve the relative link against the page URL
                new_urls.add(new_full_url)
            return new_urls
    
        def __get_new_data__(self,page_url,soup):
            res_data = {}
    
            title_node = soup.find('h1',class_='xx_title')
            if title_node is None: return
            res_data['title'] = title_node.get_text()
    
            summary_node = soup.find('div',class_='xx_font')
            if summary_node is None: return # page has no abstract block
            res_data['summary'] = summary_node.get_text()
    
            res_data['url'] = page_url
    
            return res_data
    
        def parser(self, page_url, html_cont):
            if page_url is None or html_cont is None: return
    
            soup = BeautifulSoup(html_cont,'html.parser',from_encoding='utf-8')
            new_urls = self.__get_new_urls__(page_url,soup)
            new_data = self.__get_new_data__(page_url,soup)
            return new_urls, new_data
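
    The heading lists regular expressions, lxml and the built-in html.parser as alternatives to BeautifulSoup. For comparison, here is a rough sketch of the same two link extractions written with lxml XPath (lxml is not used in the rest of the example; the function name get_new_urls_lxml is made up):

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'html_parser_lxml - link extraction with lxml XPath (sketch)'
    import urlparse
    from lxml import html

    def get_new_urls_lxml(page_url, html_cont):
        tree = html.fromstring(html_cont)
        new_urls = set()
        # journal links: <a href="/Journal/...">
        for href in tree.xpath('//a[contains(@href, "/Journal/")]/@href'):
            new_urls.add(urlparse.urljoin(page_url, href))
        # article links: <a class="zt_name" href="/Article/...">
        for href in tree.xpath('//a[@class="zt_name"][contains(@href, "/Article/")]/@href'):
            new_urls.add(urlparse.urljoin(page_url, href))
        return new_urls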

    5. Outputer: collects the parsed data and writes it out

        html_outputer.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'html_outputer - outputer'
    
    class HtmlOutputer(object):
        """docstring for html_outputer"""
        def __init__(self):
            super(HtmlOutputer, self).__init__()
            self.datas = []
    
        def collect_data(self, data):
            if data is None: return
            self.datas.append(data)
    
        def output_html(self):
            fout = open('output.html','w')
            fout.write('<html>')
            fout.write('<head><meta charset="utf-8"></head>')
            fout.write('<body>')
            fout.write('<table>')
    
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'].encode('utf-8'))
                fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
                fout.write('<td>%s</td>' % data['summary'].encode('utf-8'))
                fout.write('</tr>')
    
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
            fout.close()
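
    If an HTML table is not the most convenient format, the same collected records can just as easily be written as tab-separated text. A minimal sketch of such an alternative method for HtmlOutputer (output_txt is a made-up name, not part of the original code):

        def output_txt(self):
            # one tab-separated line per collected record: url, title, summary
            fout = open('output.txt','w')
            for data in self.datas:
                fout.write('%s\t%s\t%s\n' % (data['url'].encode('utf-8'),
                                             data['title'].encode('utf-8'),
                                             data['summary'].encode('utf-8')))
            fout.close()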

    6. Issues

        Using multiple coroutines for concurrency solves the problem of waiting on network I/O (the downloader fetching a page), but there is no telling which coroutine finishes first and which finishes last, so the pages end up being crawled in no particular order; for this example that should not matter much.
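
        The nondeterministic order is easy to reproduce in isolation: whenever a coroutine blocks (on network I/O, or on gevent.sleep in the tiny demo below), gevent schedules another runnable coroutine, so completion order depends on how long each one waits rather than on spawn order. A standalone sketch, independent of the spider code:

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    'gevent_order_demo - coroutines do not finish in the order they were spawned'
    import random
    import gevent

    def task(n):
        gevent.sleep(random.random()) # stand-in for waiting on network I/O
        print 'task %d finished' % n

    if __name__ == '__main__':
        jobs = [gevent.spawn(task, n) for n in range(5)]
        gevent.joinall(jobs) # finish order follows the random sleeps, not the spawn order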

    7. Demo

    8. References

    https://www.liaoxuefeng.com/wiki/001374738125095c955c1e6d8bb493182103fac9270762a000/001407503089986d175822da68d4d6685fbe849a0e0ca35000

    https://www.imooc.com/learn/563
