  • A simple Python crawler: scraping the Baidu Baike Python entry pages

    Target analysis:
    Target: the Baidu Baike Python entry and its related entry pages - title and summary

    Entry page: https://baike.baidu.com/item/Python/407313

    URL format:
    - entry page URL: /item/xxxx

    Data format:
    - title:
    <dd class="lemmaWgt-lemmaTitle-title"><h1>***</h1></dd>

    - summary:
    <div class="lemma-summary">***</div>

    Page encoding: utf-8
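
    Since the entry links on a page are site-relative (/item/xxxx), each one has
    to be joined with the page URL before it can be fetched. A minimal sketch of
    that join (the /item/Java path is a made-up example, not taken from the page):

    # coding:utf-8
    # Sketch: resolve a relative entry link against the entry page URL.
    import urlparse

    page_url = "https://baike.baidu.com/item/Python/407313"
    print urlparse.urljoin(page_url, "/item/Java")
    # -> https://baike.baidu.com/item/Java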

    Crawler main entry file

    spider_main.py

    # coding:utf-8
    import url_manager
    import html_downloader
    import html_parser
    import html_outputer
    
    
    class SpiderMain(object):
        def __init__(self):
            # url manager
            self.urls = url_manager.UrlManager()
            # downloader
            self.downloader = html_downloader.HtmlDownloader()
            # parser
            self.parser = html_parser.HtmlParser()
            # output controller
            self.outputer = html_outputer.HtmlOutputer()
    
        def craw(self, root_url):
            # track how many urls have been crawled so far
            count = 1
            self.urls.add_new_url(root_url)
            # keep looping as long as there are urls left to crawl
            while self.urls.has_new_url():
                try:
                    new_url = self.urls.get_new_url()
                    print 'craw %d : %s' % (count, new_url)
                    # download the page behind the url
                    html_cont = self.downloader.download(new_url)
                    # parse the page for new urls and the entry's data
                    new_urls, new_data = self.parser.parse(new_url, html_cont)
                    # queue the new urls and collect the data
                    self.urls.add_new_urls(new_urls)
                    self.outputer.collect_data(new_data)

                    # stop after 10 pages for a quick test; raise the cap
                    # (e.g. to 1000) for a full crawl
                    if count >= 10:
                        break
                    count = count + 1
                except Exception as e:
                    print 'craw failed'
                    print e

            # write everything that was collected to the output page
            self.outputer.output_html()
    
    
    if __name__ == "__main__":
        root_url = "https://baike.baidu.com/item/Python/407313"
        obj_spider = SpiderMain()
        obj_spider.craw(root_url)
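
    Run it with "python spider_main.py" under Python 2 (the code relies on
    urllib2, urlparse and print statements). With the cap at 10 the crawl stops
    after ten pages and writes the collected entries to output.html.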

    URL manager

    url_manager.py

    # coding:utf-8
    
    
    class UrlManager(object):
        def __init__(self):
            # urls still waiting to be crawled
            self.new_urls = set()
            # urls that have already been crawled
            self.old_urls = set()
    
        def add_new_url(self, url):
            if url is None:
                return
            # add the url only if it is neither queued nor already crawled
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)
    
        def add_new_urls(self, urls):
            if urls is None or len(urls) == 0:
                return
            for url in urls:
                self.add_new_url(url)
    
        def has_new_url(self):
            return len(self.new_urls) != 0
    
        def get_new_url(self):
            new_url = self.new_urls.pop()
            self.old_urls.add(new_url)
            return new_url
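
    A quick interactive check of the de-duplication logic (a sketch; assumes
    url_manager.py is importable from the current directory):

    # coding:utf-8
    # Sketch: UrlManager never hands out the same url twice.
    from url_manager import UrlManager

    manager = UrlManager()
    manager.add_new_url("https://baike.baidu.com/item/Python/407313")
    manager.add_new_url("https://baike.baidu.com/item/Python/407313")  # duplicate, ignored
    print manager.has_new_url()  # True
    print manager.get_new_url()  # https://baike.baidu.com/item/Python/407313
    # the url is now in old_urls, so re-adding it is ignored as well
    manager.add_new_url("https://baike.baidu.com/item/Python/407313")
    print manager.has_new_url()  # False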

    Page downloader

    html_downloader.py

    # coding:utf-8
    
    import urllib2
    
    
    class HtmlDownloader(object):
        def download(self, url):
            if url is None:
                return None
    
            response = urllib2.urlopen(url)
    
            if response.getcode() != 200:
                return None
    
            return response.read()
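
    The downloader above relies on urllib2's defaults. In practice Baidu Baike
    may reject the default urllib2 User-Agent, and urlopen can block for a long
    time without a timeout; a hardened variant (a sketch, not part of the
    original post) might look like this:

    # coding:utf-8
    import urllib2


    class HtmlDownloader(object):
        def download(self, url):
            if url is None:
                return None
            # send a browser-like User-Agent (any common UA string should do)
            request = urllib2.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
            })
            try:
                # give up after 10 seconds instead of blocking indefinitely
                response = urllib2.urlopen(request, timeout=10)
            except urllib2.URLError:
                return None
            if response.getcode() != 200:
                return None
            return response.read()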

    Page parser

    html_parser.py

    # coding:utf-8
    
    from bs4 import BeautifulSoup
    import re
    import urlparse
    
    
    class HtmlParser(object):
    
        def _get_new_urls(self, page_url, soup):
            # collect every entry url linked from this page
            links = soup.find_all('a', href=re.compile(r"/item/.*"))
            new_urls = set()
            for link in links:
                new_url = link['href']
                # resolve the relative /item/... link against the current page url
                new_full_url = urlparse.urljoin(page_url, new_url)
                new_urls.add(new_full_url)

            return new_urls
    
        def _get_new_data(self, page_url, soup):
            res_data = {}
            # url
            res_data['url'] = page_url
    
            # <dd class="lemmaWgt-lemmaTitle-title">
            # <h1>Python</h1>
            title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find("h1")
            res_data['title'] = title_node.get_text()
            # <div class="lemma-summary" label-module="lemmaSummary">
            summary_node = soup.find('div', class_="lemma-summary")
            res_data['summary'] = summary_node.get_text()
    
            return res_data
    
        def parse(self, page_url, html_cont):
            if page_url is None or html_cont is None:
                return
    
            soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    
            new_urls = self._get_new_urls(page_url, soup)
            new_data = self._get_new_data(page_url, soup)
    
            return new_urls, new_data
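
    The parser can be exercised offline against a small inline fragment that
    mirrors the structures listed under "Data format" above (a sketch; assumes
    html_parser.py is importable):

    # coding:utf-8
    # Sketch: feed HtmlParser a hand-written fragment instead of a live page.
    from html_parser import HtmlParser

    html = '''
    <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
    <div class="lemma-summary">Python is a programming language.</div>
    <a href="/item/Guido">Guido</a>
    '''
    urls, data = HtmlParser().parse("https://baike.baidu.com/item/Python/407313", html)
    print urls                     # set([u'https://baike.baidu.com/item/Guido'])
    print data['title']            # Python
    print data['summary'].strip()  # Python is a programming language.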

    Page outputer

    html_outputer.py

    # coding:utf-8
    
    class HtmlOutputer(object):
        def __init__(self):
            self.datas = []
    
        def collect_data(self, data):
            if data is None:
                return
            self.datas.append(data)
    
        # Python 2's default codec is ascii, so the unicode title and summary
        # are encoded to utf-8 explicitly before being written out
        def output_html(self):
            fout = open('output.html', 'w')
    
            fout.write("<html>")
            fout.write("<body>")
            fout.write("<table>")
    
            for data in self.datas:
                fout.write("<tr>")
                fout.write("<td>%s</td>" % data['url'])
                fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
                fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
                fout.write("</tr>")
    
            fout.write("/table")
            fout.write("/body")
            fout.write("/html")

    Run the code:

    Result page:
