zoukankan      html  css  js  c++  java
  • python爬虫—爬取百度百科数据

    爬虫框架:开发平台 centos6.7,根据慕课网爬虫教程编写代码,爬取百度百科词条的 url、标题和内容

    分为4个模块和1个启动脚本:
    html_downloader.py 下载器
    html_outputer.py 将爬取到的数据生成html的模块
    html_parser.py 解析器,提取有用数据
    url_manager.py url管理器
    spider_main.py 爬虫启动代码
     
    spider_main.py
     1 #!/usr/bin/python
     2 #-*- coding: utf8 -*-
     3 
     4 import html_downloader
     5 import html_outputer
     6 import html_parser
     7 import url_manager
     8 
     9 class SpiderMain(object):
    10     def __init__(self):
    11         #初始化url管理器
    12         self.urls = url_manager.UrlManager()
    13         #初始化url下载器
    14         self.downloader = html_downloader.HtmlDownloader()
    15         #初始化url解析器
    16         self.parser = html_parser.HtmlParser()
    17         #初始化url输出
    18         self.outputer = html_outputer.HtmlOutputer()
    19 
    20     def craw(self, root_url):
    21         count = 1
    22         #url管理器中添加一个new url
    23         self.urls.add_new_url(root_url)
    24         #判断是否有新的URL 开始爬取
    25         while self.urls.has_new_url():
    26             try:
    27                 #得到新的url
    28                 new_url = self.urls.get_new_url()
    29                 print 'craw %d : %s' % (count, new_url)
    30                 #下载新的url的数据
    31                 html_cont = self.downloader.download(new_url)
    32                 #解析出来url的内容和地址
    33                 new_urls, new_data = self.parser.parse(new_url, html_cont)
    34                 #新的url补充进url管理器
    35                 self.urls.add_new_urls(new_urls)
    36                 #输出数据
    37                 self.outputer.collect_data(new_data)
    38 
    39                 if count == 1001:
    40                     break
    41                 count = count + 1
    42                 print count
    43             except:

    html_parser.py

     1 #!/usr/bin/python
     2 #-*- coding:utf8 -*-
     3 
     4 from bs4 import BeautifulSoup
     5 import re
     6 import urlparse
     7 
     class HtmlParser(object):
         """Extract follow-up links and entry data from a Baidu Baike page.

         ``parse`` is the public entry point; the ``_get_*`` helpers each work
         on an already-built BeautifulSoup tree.
         """

         # Entry links have the form "/view/<digits>.htm". The digit class and
         # the literal dot must be escaped; without the backslashes the
         # pattern matches runs of the letter "d" and any character before
         # "htm". Compiled once here instead of on every page.
         _LINK_PATTERN = re.compile(r"/view/\d+\.htm")

         def _get_new_urls(self, page_url, soup):
             """Return the set of absolute entry URLs linked from the page.

             Relative hrefs are resolved against ``page_url``.
             """
             new_urls = set()
             for link in soup.find_all('a', href=self._LINK_PATTERN):
                 new_urls.add(urlparse.urljoin(page_url, link['href']))
             return new_urls

         def _get_new_data(self, page_url, soup):
             """Return a dict with the page's 'url', 'title' and 'summary'.

             Raises:
                 AttributeError: if the title or summary node is not present
                     in the tree (``soup.find`` yields None in that case).
             """
             res_data = {}

             res_data['url'] = page_url

             title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
             res_data['title'] = title_node.get_text()

             summary_node = soup.find('div', class_="lemma-summary")
             res_data['summary'] = summary_node.get_text()
             # Progress output kept from the original listing.
             print(res_data['summary'])

             return res_data

         def parse(self, page_url, html_cont):
             """Parse one downloaded page.

             Args:
                 page_url: URL the content was fetched from.
                 html_cont: raw page content, decoded as UTF-8 by the parser.

             Returns:
                 A ``(new_urls, new_data)`` pair. When either argument is
                 None the pair is ``(set(), None)`` so that callers which
                 unpack two values keep working.
             """
             if page_url is None or html_cont is None:
                 return set(), None

             soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
             new_urls = self._get_new_urls(page_url, soup)
             new_data = self._get_new_data(page_url, soup)
             # Progress output kept from the original listing.
             print(new_data)
             return new_urls, new_data
    47         

    html_outputer.py

     1 #!/usr/bin/python 
     2 #-*- coding:utf8 -*-
     3 
     class HtmlOutputer(object):
         """Collect crawled records and render them as one HTML table."""

         def __init__(self):
             # Records in collection order; output_html reads the 'url',
             # 'title' and 'summary' keys of each one.
             self.datas = []

         def collect_data(self, data):
             """Remember one record; ``None`` is silently ignored."""
             if data is None:
                 return
             self.datas.append(data)

         @staticmethod
         def _to_cell(value):
             """Return ``value`` as HTML-escaped text for a table cell.

             Byte strings are decoded as UTF-8 first so that byte and text
             values can be mixed in one record.
             """
             if isinstance(value, bytes):
                 text = value.decode('utf-8')
             else:
                 text = u'%s' % (value,)
             # '&' must be replaced first so the entities added below are
             # not escaped a second time.
             return (text.replace(u'&', u'&amp;')
                         .replace(u'<', u'&lt;')
                         .replace(u'>', u'&gt;'))

         def output_html(self):
             """Write every collected record to ``output.html`` as UTF-8.

             The file is created in the current working directory and
             overwritten if it already exists. Cell contents come from
             crawled pages, so they are escaped before being embedded.
             """
             import io  # local import: this module has no import section

             parts = [
                 u"<html>",
                 u"<head>",
                 u'<meta charset="utf-8">',
                 u"</head>",
                 u"<body>",
                 u"<table>",
             ]
             for data in self.datas:
                 parts.append(u"<tr>")
                 for key in ('url', 'title', 'summary'):
                     parts.append(u"<td>%s</td>" % self._to_cell(data[key]))
                 parts.append(u"</tr>")
             parts.extend([u"</table>", u"</body>", u"</html>"])

             # The with-block closes the file even if the write fails.
             with io.open('output.html', 'w', encoding='utf-8') as fout:
                 fout.write(u"".join(parts))

    html_downloader.py

     1 #!/usr/bin/python
     2 #-*- coding:utf8 -*-
     3 
     4 import urllib2
     5 
     class HtmlDownloader(object):
         """Fetch the raw content of a page with urllib2."""

         def download(self, url):
             """Return the response body for ``url``.

             Args:
                 url: address to fetch; ``None`` short-circuits to ``None``.

             Returns:
                 The body as returned by ``response.read()``, or ``None``
                 when ``url`` is None or the status code is not 200.

             Errors raised by ``urllib2.urlopen`` are not caught here and
             propagate to the caller.
             """
             if url is None:
                 return None

             response = urllib2.urlopen(url)
             try:
                 if response.getcode() != 200:
                     print('请求失败')
                     return None
                 return response.read()
             finally:
                 # Close the response on every path, including the early
                 # return and a failing read(), so the connection is freed.
                 response.close()

     url_manager.py

     1 #!/usr/bin/python
     2 #-*- coding:utf8 -*-
     3 
     class UrlManager(object):
         """Keep track of URLs waiting to be crawled and URLs already issued."""

         def __init__(self):
             # URLs queued for crawling.
             self.new_urls = set()
             # URLs already handed out by get_new_url.
             self.old_urls = set()

         def add_new_url(self, url):
             """Queue ``url`` unless it is None, already queued, or already issued."""
             if url is None:
                 return
             if url not in self.new_urls and url not in self.old_urls:
                 self.new_urls.add(url)

         def has_new_url(self):
             """Return True if at least one URL is queued, otherwise False.

             A notice is printed when the queue is empty.
             """
             if self.new_urls:
                 return True
             print('没有新的url')
             return False

         def get_new_url(self):
             """Remove an arbitrary queued URL, mark it as issued and return it.

             Raises:
                 KeyError: if no URL is queued (``set.pop`` on an empty set).
             """
             new_url = self.new_urls.pop()
             self.old_urls.add(new_url)
             return new_url

         def add_new_urls(self, urls):
             """Queue every URL in ``urls``; None or an empty collection is a no-op."""
             if not urls:
                 return
             for url in urls:
                 self.add_new_url(url)
    34         
  • 相关阅读:
    1、搭建CICD平台
    Jackson 的 基本用法
    多种方式实现动态替换Android默认桌面Launcher
    Springboot2.x整合Redis以及连接哨兵模式/集群模式
    Redis哨兵(Sentinel)模式
    redis的哨兵模式(redis有密码)
    【数据结构】——LCT(link cut tree)
    征战蓝桥 —— 2017年第八届 —— C/C++A组第3题——魔方状态
    征战蓝桥 —— 2017年第八届 —— C/C++A组第4题——方格分割
    征战蓝桥 —— 2017年第八届 —— C/C++A组第4题——方格分割
  • 原文地址:https://www.cnblogs.com/flex-/p/5540921.html
Copyright © 2011-2022 走看看