  • A simple little crawler

    After two weeks of learning Python, I tried writing a crawler as an exercise: starting from the Baidu Baike entry for the keyword "Python", it follows related links and collects the title and summary of 100 pages.

    The project structure is as follows (five modules in one package):

    spider_main.py      # entry point: the crawl scheduler
    url_manager.py      # tracks visited and to-be-visited URLs
    html_downloader.py  # fetches pages
    html_parser.py      # extracts new links plus each page's title and summary
    html_outputer.py    # writes the results to an HTML table

    The source code is as follows:

    html_downloader.py

    # coding:utf-8
    import urllib2
    
    class HtmlDownloader(object):
    	def download(self, url):
    		# Return the raw page content, or None if the fetch fails.
    		if url is None:
    			return None
    		response = urllib2.urlopen(url)
    		if response.getcode() != 200:
    			return None
    		return response.read()
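
    As written, download has no timeout, so one stalled connection can hang the whole crawl. A more defensive variant might look like the sketch below; the User-Agent header and the 10-second timeout are my own choices, not part of the original project.

    # coding:utf-8
    # Sketch of a more defensive downloader; not part of the original project.
    import urllib2

    def download_with_timeout(url, timeout=10):
    	# Return the raw page bytes, or None on any failure.
    	if url is None:
    		return None
    	# Some sites answer differently to the default Python-urllib agent,
    	# so present an ordinary browser User-Agent.
    	request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    	try:
    		response = urllib2.urlopen(request, timeout=timeout)
    	except urllib2.URLError:
    		return None
    	if response.getcode() != 200:
    		return None
    	return response.read()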

    html_outputer.py

    # coding:utf-8
    
    class HtmlOutputer(object):
    	def __init__(self):
    		self.datas = []
    
    	def collect_data(self, new_data):
    		if new_data is None:
    			return
    		self.datas.append(new_data)
    
    	def output_html(self):
    		# Render the collected records as a simple HTML table.
    		fout = open("spider_output.html", "w")
    		try:
    			fout.write("<html><head><meta charset='UTF-8'></head><body><table>")
    			for data in self.datas:
    				fout.write("<tr>")
    				fout.write("<td>%s</td>" % data["url"])
    				# title/summary are unicode; encode before writing to a byte file
    				fout.write("<td>%s</td>" % data["title"].encode("utf-8"))
    				fout.write("<td>%s</td>" % data["summary"].encode("utf-8"))
    				fout.write("</tr>")
    			fout.write("</table></body></html>")
    		except IOError as e:
    			print "output_html error: %s" % e
    		finally:
    			fout.close()
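
    A note on the .encode("utf-8") calls above: BeautifulSoup hands back unicode strings, while a file opened with plain open() expects byte strings in Python 2. An alternative is to write through codecs.open so the encoding lives in one place; a minimal sketch, where write_rows is a hypothetical helper rather than part of the project:

    # coding:utf-8
    # Sketch: let codecs.open do the encoding instead of per-field .encode().
    import codecs

    def write_rows(datas, path="spider_output.html"):
    	with codecs.open(path, "w", encoding="utf-8") as fout:
    		fout.write(u"<html><head><meta charset='UTF-8'></head><body><table>")
    		for data in datas:
    			fout.write(u"<tr><td>%s</td><td>%s</td><td>%s</td></tr>"
    				% (data["url"], data["title"], data["summary"]))
    		fout.write(u"</table></body></html>")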
    

    html_parser.py

    # coding:utf-8
    import re
    import urlparse
    
    from bs4 import BeautifulSoup
    
    
    class HtmlParser(object):
    	def parse(self, page_url, html_cont):
    		# Return (set of new urls, data dict); on bad input return empty
    		# results so the caller can always unpack the pair safely.
    		if page_url is None or html_cont is None:
    			return set(), None
    		soup = BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
    		new_urls = self.get_new_urls(page_url, soup)
    		new_data = self.get_new_data(page_url, soup)
    		return new_urls, new_data
    
    	def get_new_urls(self, page_url, soup):
    		new_urls = set()
    		links = soup.find_all("a", href=re.compile(r"/item/"))
    		for link in links:
    			new_url = link["href"]
    			new_full_link = urlparse.urljoin(page_url, new_url)
    			new_urls.add(new_full_link)
    		return new_urls
    
    	def get_new_data(self, page_url, soup):
    		res_data = {}
    		res_data["url"] = page_url
    		'''
    		<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
    		'''
    		title_node = soup.find("dd", class_="lemmaWgt-lemmaTitle-title").find("h1")
    		res_data["title"] = title_node.get_text()
    		'''
    		<div class="lemma-summary" label-module="lemmaSummary"></div>
    		'''
    		summary_node = soup.find("div", class_="lemma-summary")
    		res_data["summary"] = summary_node.get_text()
    		return res_data
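
    To see what the parser extracts without touching the network, it can be fed a hand-written snippet that mimics Baidu Baike's markup. A quick offline test; the snippet and the expected output are my own illustration, assuming the class names used above:

    # coding:utf-8
    # Offline smoke test for HtmlParser; not part of the original project.
    from html_parser import HtmlParser

    snippet = '''
    <html><body>
    <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
    <div class="lemma-summary" label-module="lemmaSummary">A language.</div>
    <a href="/item/Guido">Guido</a>
    </body></html>
    '''

    parser = HtmlParser()
    new_urls, new_data = parser.parse("https://baike.baidu.com/item/Python", snippet)
    print new_urls           # set(['https://baike.baidu.com/item/Guido'])
    print new_data["title"]  # Python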
    

    url_manager.py

    # coding:utf-8
    
    class UrlManager(object):
    	'''
    	Maintains two URL sets: URLs already visited and URLs still to be crawled.
    	'''
    
    	def __init__(self):
    		self.new_urls = set()
    		self.old_urls = set()
    
    	def add_new_url(self, url):
    		if url is None:
    			return
    		if url not in self.old_urls and url not in self.new_urls:
    			self.new_urls.add(url)
    
    	def add_new_urls(self, new_urls):
    		if new_urls is None or len(new_urls) == 0:
    			return
    		for url in new_urls:
    			# Route through add_new_url so visited URLs are not re-queued.
    			self.add_new_url(url)
    
    	def has_new_url(self):
    		return len(self.new_urls) != 0
    
    	def get_new_url(self):
    		new_url = self.new_urls.pop()
    		self.old_urls.add(new_url)
    		return new_url
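
    The manager's contract can be checked by hand: once get_new_url hands a URL out, it sits in old_urls and is silently dropped if added again. A short demo sketch:

    # coding:utf-8
    # Demonstrates UrlManager deduplication; not part of the original project.
    from url_manager import UrlManager

    manager = UrlManager()
    manager.add_new_url("https://baike.baidu.com/item/Python")
    url = manager.get_new_url()       # moves the url into old_urls
    manager.add_new_url(url)          # ignored: already visited
    manager.add_new_urls(set([url]))  # filtered for the same reason
    print manager.has_new_url()       # False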
    

    spider_main.py

    # coding:utf-8
    import html_downloader
    import html_outputer
    import html_parser
    import url_manager
    
    
    class SpiderMan(object):
    	def __init__(self):
    		self.urls = url_manager.UrlManager()
    		self.downloader = html_downloader.HtmlDownloader()
    		self.parser = html_parser.HtmlParser()
    		self.outputer = html_outputer.HtmlOutputer()
    
    	def craw(self, root_url):
    		count = 1
    		self.urls.add_new_url(root_url)
    		while self.urls.has_new_url():
    			new_url = self.urls.get_new_url()
    			print "crawling page %d: %s" % (count, new_url)
    			# A page that fails to download or parse should not kill the
    			# whole crawl, so guard each iteration.
    			try:
    				html_cont = self.downloader.download(new_url)
    				new_urls, new_data = self.parser.parse(new_url, html_cont)
    				self.urls.add_new_urls(new_urls)
    				self.outputer.collect_data(new_data)
    			except Exception as e:
    				print "craw failed for %s: %s" % (new_url, e)
    			count += 1
    			if count > 100:
    				break
    		self.outputer.output_html()
    
    
    if __name__ == "__main__":
    	root_url = "https://baike.baidu.com/item/Python/407313?fr=aladdin"
    	obj_spider = SpiderMan()
    	obj_spider.craw(root_url)
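
    One hundred back-to-back requests may get the crawler throttled. Because craw reads the downloader from self.downloader, a politeness delay can be added without touching SpiderMan itself; in the sketch below, PoliteDownloader and the half-second delay are my own additions:

    # coding:utf-8
    # Sketch: add a politeness delay by swapping in a wrapped downloader.
    import time

    from html_downloader import HtmlDownloader
    from spider_main import SpiderMan

    class PoliteDownloader(HtmlDownloader):
    	def download(self, url):
    		time.sleep(0.5)  # wait half a second before each request
    		return HtmlDownloader.download(self, url)

    if __name__ == "__main__":
    	spider = SpiderMan()
    	spider.downloader = PoliteDownloader()
    	spider.craw("https://baike.baidu.com/item/Python/407313?fr=aladdin")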
    

    Run result: the crawler prints a progress line for each page and writes the collected titles and summaries to spider_output.html.



  • Original post: https://www.cnblogs.com/jasonhaven/p/7355001.html