  • A small crawler for Maizi Academy (maiziedu.com) teacher profiles

    Task: crawl the teacher information (name, title, and introduction) from the specified Maizi Academy page and save it to a database.

    1. Page analysis:
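    The page-analysis screenshot is not reproduced here, so the structure the crawler relies on is sketched instead: the teacher slider sits in a div with id 'sliderPlay', and each teacher card carries three <p> elements with the classes "first" (name), "second" (title) and "third" (introduction). The sample markup below is an assumption reconstructed from the regular expressions used in main.py further down, not copied from the live maiziedu.com page:

    #!/usr/bin/env python
    #coding:utf-8
    import re

    # Hypothetical fragment mirroring what the patterns in main.py expect to find.
    sample_html = """
    <div class='sliderPlay' id='sliderPlay'>
        <p class="first">Zhang San</p>
        <p class="second">Senior Lecturer</p>
        <p class="third">Ten years of Python development and teaching experience.</p>
    <div id='btnBox' class='btnBox'>
    """

    # Cut out the slider block, then pull the three fields from it.
    block = re.search(r"<div class='sliderPlay' id='sliderPlay'>[\s\S]*div id='btnBox' class='btnBox'>", sample_html).group()
    names = re.findall(r'<p class="first">\s*(.+)\s*</p>', block)
    titles = re.findall(r'<p class="second">\s*(.+)\s*</p>', block)
    intros = re.findall(r'<p class="third">\s*(.+)\s*</p>', block)
    print(names, titles, intros)   # ['Zhang San'] ['Senior Lecturer'] ['Ten years ...']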

     

    2. Code:

    mydb.py:

    #!/usr/bin/env python
    #coding:utf-8

    '''
    Database operations (MySQLdb driver; on Python 3 the mysqlclient package provides this module).
    '''
    import MySQLdb as db
    
    class DBHelper():
    	def __init__(self,tableName):
    		self.tableName=tableName
    		try:
    			self.conn=db.connect(
    				        host='localhost',
    				        port = 3306,
    				        user='root',
    				        passwd='root',
    				        db ='pythondb',
    				        charset='utf8'
    			        	)
    			self.cursor=self.conn.cursor()
    		except Exception as e:
    			print(e)
    	def createTable(self,pros,types):
    		sql='create table '+self.tableName+'('
    		for i in range(len(pros)):
    			if i==0:
    				sql+=pros[i]+' '+types[i]
    			else:
    				sql+=','+pros[i]+' '+types[i]
    		sql+=')'
    		self.cursor.execute(sql)
    	def insert(self,sql):
    		try:
    			print(sql)
    			self.cursor.execute(sql)
    			print('insert successfully!')
    		except Exception as e:
    			print('insert failed!')
    			self.conn.rollback()
    	def delete(self,sql):
    		try:
    			print(sql)
    			self.cursor.execute(sql)
    			print('delete successfully!')
    		except Exception as e:
    			print('delete failed!')
    			self.conn.rollback()
    	def queryBySql(self,sql):
    		return self.cursor.execute(sql)
    	def queryAll(self):
    		self.cursor.execute('select * from '+self.tableName)
    		# fetch all rows as a list of tuples
    		results = self.cursor.fetchall()
    		return results
    	def close(self):
    		self.cursor.close()
    		self.conn.commit()
    		self.conn.close()
    
    
    if __name__=='__main__':
    	print('test mydb DBHelper')
    	helper=DBHelper('teacher')
    	# pros=['name','title','production']
    	# types=['varchar(20)','varchar(50)','varchar(200)']
    	# helper.createTable(pros,types)
    	sql='insert into teacher values("李希","成都莫比乌斯科技创始人","精通Windows及Linux系统平台的运维、大型分布式架构网站的部署和管理,具有15年资深IT从业经验。")'
    	helper.insert(sql)
    	for x in helper.queryAll():
    		print(x)
    	helper.close()
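
    DBHelper.insert() above (and the SQL assembled in main.py below) builds statements by string concatenation, so a scraped field containing a double quote breaks the statement, and the approach is open to SQL injection. A minimal sketch of a parameterized alternative using MySQLdb placeholders; the function name insert_row is hypothetical and not part of the original code:

    #!/usr/bin/env python
    #coding:utf-8
    import MySQLdb as db

    def insert_row(conn, table, values):
        # Let the driver escape the values instead of splicing them into the SQL text.
        # (The table name cannot be parameterized, so it is still concatenated.)
        placeholders = ','.join(['%s'] * len(values))
        cursor = conn.cursor()
        cursor.execute('insert into ' + table + ' values(' + placeholders + ')', values)
        conn.commit()

    # Usage, mirroring the test block above:
    # conn = db.connect(host='localhost', port=3306, user='root', passwd='root', db='pythondb', charset='utf8')
    # insert_row(conn, 'teacher', ('李希', '成都莫比乌斯科技创始人', '...'))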
    

    mymodel.py:

    #!/usr/bin/env python
    #coding:utf-8
    
    class Teacher():
    	def __init__(self,name,title,production):
    		self._name=name
    		self._title=title
    		self._production=production
    	def get_name(self):
    		return self._name
    	def set_name(self,value):
    		self._name=value
    	def get_title(self):
    		return self._title
    	def set_title(self,value):
    		self._title=value
    	def get_production(self):
    		return self._production
    	def set_production(self,value):
    		self._production=value
    	def __str__(self):
    		return 'name ='+self.name+',title ='+self.title+',production ='+self.production
    	name=property(get_name,set_name)
    	title=property(get_title,set_title)
    	production=property(get_production,set_production)
    
    
    if __name__=='__main__':
    	print('test mymodel Teacher')
    	p=Teacher('a','t','p')
    	print(p)
    	p.name='aa'
    	p.title='tt'
    	p.production='pp'
    	print(p)
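
    The getter/setter pairs wired up with property() above work, but the same model is usually written with the @property decorator. An equivalent sketch (only the name field is shown; title and production follow the same pattern):

    class Teacher:
        def __init__(self, name, title, production):
            self._name = name
            self._title = title
            self._production = production

        @property
        def name(self):
            return self._name

        @name.setter
        def name(self, value):
            self._name = value

        def __str__(self):
            return 'name=%s,title=%s,production=%s' % (self._name, self._title, self._production)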
    

    main.py:

    #!/usr/bin/env python
    #coding:utf-8
    import mydb,mymodel
    from urllib import request
    import re
    class SpiderMan:
    	def __init__(self,url):
    		self.url=url
    		self.dbhelper=mydb.DBHelper('teacher')
    	def crawl(self):
    		#pattern
    		pattern_div=r"<div class='sliderPlay' id='sliderPlay'>[\s\S]*div id='btnBox' class='btnBox'>"
    		pattern_name=r'<p class="first">\s*(.+)\s*</p>'
    		pattern_title=r'<p class="second">\s*(.+)\s*</p>'
    		pattern_production=r'<p class="third">\s*(.+)\s*</p>'
    
    		#request
    		headers={
    		"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36",
    		'Host':'www.maiziedu.com',
    		'Referer':'http://www.maiziedu.com'
    		}
    		req=request.Request(self.url,headers=headers)
    		
    		#response
    		resp=request.urlopen(req)
    		html=resp.read().decode('utf-8')
    
    		#analysis
    		html_div=re.search(pattern_div,html).group()
    
    		name_list=re.findall(pattern_name,html_div)
    		title_list=re.findall(pattern_title,html_div)
    		production_list=re.findall(pattern_production,html_div)
    		
    		# print("name_list:")
    		# print(name_list)
    		# print("title_list:")
    		# print(title_list)
    		# print("production_list:")
    		# print(production_list)
    		
    		#save
    		for i in range(len(name_list)):
    			name=name_list[i]
    			title=title_list[i]
    			production=production_list[i]
    			sql='insert into '+self.dbhelper.tableName+' values('
    			sql+='"'+name+'"'+','+'"'+title+'"'+','+'"'+production+'"'
    			sql+=')'
    			self.dbhelper.insert(sql)
    
    		#close
    		self.dbhelper.close()
    
    
    if __name__=='__main__':
    	url='http://www.maiziedu.com/line/python/'
    	spider=SpiderMan(url)
    	spider.crawl()
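
    Before the first run, the pythondb database must exist on the local MySQL server and the teacher table has to be created (the createTable call is commented out in mydb.py). A one-time setup sketch reusing the column definitions from those commented-out lines:

    #!/usr/bin/env python
    #coding:utf-8
    import mydb

    helper = mydb.DBHelper('teacher')
    pros = ['name', 'title', 'production']
    types = ['varchar(20)', 'varchar(50)', 'varchar(200)']
    helper.createTable(pros, types)   # creates teacher(name varchar(20), title varchar(50), production varchar(200))
    helper.close()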
    

    3. Run result:
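
    The result screenshot is omitted here; the rows written by the crawler can be checked with the queryAll() helper from mydb.py, for example:

    #!/usr/bin/env python
    #coding:utf-8
    import mydb

    helper = mydb.DBHelper('teacher')
    for row in helper.queryAll():   # each row is a (name, title, production) tuple
        print(row)
    helper.close()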

     

  • Original article: https://www.cnblogs.com/jasonhaven/p/7420023.html