爬虫框架:开发平台 CentOS 6.7,根据慕课网爬虫教程编写代码,爬取百度百科的 url、标题、内容
分为4个模块:html_downloader.py 下载器
html_outputer.py 爬取数据生成html模块
html_parser.py 解析器,获取有用数据
url_manager.py url管理器
spider_main.py 爬虫启动代码
spider_main.py
1 #!/usr/bin/python 2 #-*- coding: utf8 -*- 3 4 import html_downloader 5 import html_outputer 6 import html_parser 7 import url_manager 8 9 class SpiderMain(object): 10 def __init__(self): 11 #初始化url管理器 12 self.urls = url_manager.UrlManager() 13 #初始化url下载器 14 self.downloader = html_downloader.HtmlDownloader() 15 #初始化url解析器 16 self.parser = html_parser.HtmlParser() 17 #初始化url输出 18 self.outputer = html_outputer.HtmlOutputer() 19 20 def craw(self, root_url): 21 count = 1 22 #url管理器中添加一个new url 23 self.urls.add_new_url(root_url) 24 #判断是否有新的URL 开始爬取 25 while self.urls.has_new_url(): 26 try: 27 #得到新的url 28 new_url = self.urls.get_new_url() 29 print 'craw %d : %s' % (count, new_url) 30 #下载新的url的数据 31 html_cont = self.downloader.download(new_url) 32 #解析出来url的内容和地址 33 new_urls, new_data = self.parser.parse(new_url, html_cont) 34 #新的url补充进url管理器 35 self.urls.add_new_urls(new_urls) 36 #输出数据 37 self.outputer.collect_data(new_data) 38 39 if count == 1001: 40 break 41 count = count + 1 42 print count 43 except:
html_parser.py
#!/usr/bin/python
# -*- coding: utf8 -*-

import re

try:
    import urlparse  # Python 2
except ImportError:  # Python 3: the same functions live in urllib.parse
    import urllib.parse as urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Parser: pull follow-up entry links and the title/summary out of a page."""

    # Entry links have the form /view/<digits>.htm. The digit class and the
    # literal dot must be escaped; without the backslashes the pattern matches
    # runs of the letter "d" instead of numbers.
    _ENTRY_LINK = re.compile(r"/view/\d+\.htm")

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute entry URLs linked from the page.

        Args:
            page_url: URL of the page being parsed; relative hrefs are
                resolved against it.
            soup: parsed document exposing ``find_all``.
        """
        new_urls = set()
        for link in soup.find_all('a', href=self._ENTRY_LINK):
            new_urls.add(urlparse.urljoin(page_url, link['href']))
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Return a dict with the page's ``url``, ``title`` and ``summary``.

        ``title`` and ``summary`` default to an empty string when the
        expected node is absent, so a page with a different layout no longer
        aborts parsing with an AttributeError on None.
        """
        res_data = {'url': page_url, 'title': u'', 'summary': u''}

        title_box = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        title_node = title_box.find('h1') if title_box is not None else None
        if title_node is not None:
            res_data['title'] = title_node.get_text()

        summary_node = soup.find('div', class_="lemma-summary")
        if summary_node is not None:
            res_data['summary'] = summary_node.get_text()
        print(res_data['summary'])

        return res_data

    def parse(self, page_url, html_cont):
        """Parse one downloaded page.

        Args:
            page_url: URL the content was fetched from.
            html_cont: raw page content; decoded as UTF-8.

        Returns:
            A ``(new_urls, new_data)`` pair. When either argument is None the
            pair is ``(set(), None)`` so that callers which unpack the result
            keep working instead of failing on a bare None.
        """
        if page_url is None or html_cont is None:
            return set(), None

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        print(new_data)
        return new_urls, new_data
html_outputer.py
#!/usr/bin/python
# -*- coding: utf8 -*-

import io
from xml.sax.saxutils import escape


class HtmlOutputer(object):
    """Collect parsed page records and write them out as one HTML table."""

    def __init__(self):
        # Records in the order they were collected; each is a mapping with
        # 'url', 'title' and 'summary' keys.
        self.datas = []

    def collect_data(self, data):
        """Remember one record; None is ignored."""
        if data is None:
            return
        self.datas.append(data)

    @staticmethod
    def _cell(value):
        """Return ``value`` as HTML-escaped text suitable for a table cell."""
        if value is None:
            value = u''
        elif isinstance(value, bytes):
            # Byte strings are assumed to be UTF-8; undecodable bytes are
            # replaced rather than aborting the whole report.
            value = value.decode('utf-8', 'replace')
        elif not isinstance(value, type(u'')):
            value = u'%s' % (value,)
        # The text comes from scraped pages, so &, < and > must not reach the
        # output as markup.
        return escape(value)

    def output_html(self, path='output.html'):
        """Write every collected record to ``path`` as a UTF-8 HTML table.

        Args:
            path: destination file; defaults to ``output.html`` in the
                current working directory.
        """
        # io.open gives the same text-mode, explicit-encoding behaviour on
        # Python 2 and 3, and the with-block closes the file even if a write
        # or a missing key raises.
        with io.open(path, 'w', encoding='utf-8') as fout:
            fout.write(u"<html>")
            fout.write(u"<head>")
            fout.write(u'<meta charset="utf-8">')
            fout.write(u"</head>")
            fout.write(u"<body>")
            fout.write(u"<table>")

            for data in self.datas:
                fout.write(u"<tr>")
                for key in ('url', 'title', 'summary'):
                    fout.write(u"<td>%s</td>" % self._cell(data[key]))
                fout.write(u"</tr>")

            fout.write(u"</table>")
            fout.write(u"</body>")
            fout.write(u"</html>")
html_downloader.py
#!/usr/bin/python
# -*- coding: utf8 -*-

from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2


class HtmlDownloader(object):
    """Downloader: fetch the raw body of a page."""

    def download(self, url, timeout=None):
        """Return the response body for ``url``.

        Args:
            url: address to fetch; None yields None without any request.
            timeout: optional timeout in seconds passed to ``urlopen``. When
                left as None the library's default timeout is used.

        Returns:
            The bytes read from the response, or None when ``url`` is None or
            the response status is not 200.

        Errors raised by ``urlopen`` or ``read`` are not caught here and
        propagate to the caller.
        """
        if url is None:
            return None

        if timeout is None:
            response = urllib2.urlopen(url)
        else:
            response = urllib2.urlopen(url, timeout=timeout)

        # closing() releases the connection on every exit path, including an
        # exception raised while reading.
        with closing(response):
            if response.getcode() != 200:
                print('请求失败')
                return None
            return response.read()
url_manager.py
#!/usr/bin/python
# -*- coding: utf8 -*-


class UrlManager(object):
    """URL manager: tracks which URLs are pending and which were handed out."""

    def __init__(self):
        # URLs queued but not yet returned by get_new_url().
        self.new_urls = set()
        # URLs that get_new_url() has already returned.
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue ``url`` unless it is None, already queued, or already handed out."""
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        """Return True when at least one URL is pending, otherwise False.

        When nothing is pending a notice is also printed, as before. The
        result is always a real bool rather than a count or an implicit None.
        """
        if self.new_urls:
            return True
        print('没有新的url')
        return False

    def get_new_url(self):
        """Remove one pending URL, record it as handed out, and return it.

        The set gives no ordering guarantee, so which URL comes back is
        arbitrary.

        Raises:
            KeyError: if no URL is pending.
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, urls):
        """Queue every URL in ``urls``; None or an empty collection is a no-op."""
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)