zoukankan      html  css  js  c++  java
  • 爬虫再探实战(一)——爬取智联招聘职位信息

      本人呢,算是学统计的,就想着爬一下智联的统计岗位信息,嗯,岗位很强势。。。

      这里用了requests,bs4进行抓取与解析,数据存入MySQL数据库。代码比较乱,先凑合着看,有时间再整理吧。。。

    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    import datetime
    import MySQLdb
    
    # Timestamp taken once when the script starts. Nothing in the visible code
    # reads this name (save_to_sql takes its own timestamp per call).
    now = datetime.datetime.now()
    
    # Fetch one result page, parse it, and hand it over for extraction.
    def get_save_url(url, headers, timeout=10):
        """Download *url*, parse it and pass the result to the extractor.

        Args:
            url: Address of the search-result page to fetch.
            headers: HTTP headers sent with the request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the whole run.

        Raises:
            requests.exceptions.RequestException: If the request fails, times
                out, or the server answers with a 4xx/5xx status.

        Side effects:
            Rebinds the module-level names ``soup`` (parsed page) and ``html``
            (raw page text), and reads the module-level ``pagenum`` that the
            driver loop sets before calling this function.
        """
        # Must come before any assignment: Python 3 rejects a ``global``
        # statement that follows a binding of the same name in the function.
        global soup, html

        # Announce the page before the work starts, not after it is finished.
        print("正在抓取第"+pagenum+"页的职位信息")

        response = requests.get(url, headers=headers, timeout=timeout)
        # Do not parse (and store) an error page as if it were a result page.
        response.raise_for_status()
        if response.status_code == 200:
            print("网页" + url + "已经打开")

        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        get_post_fact_city_num(soup, html)
    	
    
    
    # Extract the job fields from one parsed result page and store them.
    def get_post_fact_city_num(soupx,htmlx):
        """Collect the posting fields found on one page and save them.

        Args:
            soupx: Parsed page; must provide ``findAll`` the way the
                BeautifulSoup object passed by ``get_save_url`` does.
            htmlx: Raw HTML text of the same page, used for the regex-based
                extraction of the publication time.

        The six resulting lists (job title, company, city, headcount,
        industry label, publication time) are forwarded to ``save_to_sql``.
        """
        regex = "__ga__fullResult(.*)postname_clicksfullresult(.*)postnames_001"

        # Titles, companies and cities keep only every other match ([::2]);
        # presumably each posting appears twice in the markup -- TODO confirm.
        l_post_name = [
            node.get_text()
            for node in soupx.findAll("a", {"class":re.compile(regex)})[::2]
        ]
        l_fact_name = [
            node.get_text()
            for node in soupx.findAll("p", {"class":"searchResultCompanyname"})[::2]
        ]
        l_city_name = [
            node.get_text()
            for node in soupx.findAll("em", {"class":"searchResultJobCityval"})[::2]
        ]

        # Headcount and industry label use every match.
        l_num = [
            node.get_text()
            for node in soupx.findAll("em", {"class":"searchResultJobPeopnum"})
        ]
        l_lab = [
            node.get_text()
            for node in soupx.findAll("p", {"class":"searchResultCompanyIndustry"})
        ]

        # The publication time is pulled from the raw text with a regex.
        time_regex = "<span>发布时间:<em></em>(.*)</span>"
        l_post_time = re.findall(time_regex, htmlx)

        save_to_sql(l_post_name,l_fact_name,l_city_name,l_num,l_lab,l_post_time)
    
    # Request headers carrying a desktop Chrome User-Agent string.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
    }

    # The page range can be set to anything within what the search results cover.
    # A single page address looks like:
    #   http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_2_0
    urls = [
        "http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_{}_0".format(page)
        for page in range(2, 20)
    ]

    # Database connection and cursor shared by the rest of the script.
    db = MySQLdb.connect(
        host="localhost",
        user="root",
        passwd="这里是你的密码,我的不给看",
        db="test",
        use_unicode=True,
        charset="utf8",
    )
    cursor = db.cursor()
    
    # Keep the column widths generous; some company descriptions are fairly long.
    # IF NOT EXISTS lets the script be re-run: a plain CREATE TABLE fails once
    # the table has been created by an earlier run.
    sqlxx = """CREATE TABLE IF NOT EXISTS zhilian_tongji(
            post_name VARCHAR(100),
            fact_name VARCHAR(100),
            city_name VARCHAR(20),
            num VARCHAR(50),
            lab VARCHAR(200),
            post_time VARCHAR(50),
            now_time VARCHAR(50)
            ) """

    cursor.execute(sqlxx)
    
    def save_to_sql(l_post_name,l_fact_name,l_city_name,l_num,l_lab,l_post_time):
        """Insert one row per scraped posting into ``zhilian_tongji``.

        Args:
            l_post_name: Job titles.
            l_fact_name: Company names.
            l_city_name: City names.
            l_num: Headcount strings.
            l_lab: Industry labels.
            l_post_time: Publication times.

        The lists are combined position by position. If they differ in length,
        only as many rows as the shortest list allows are stored and a warning
        is printed, instead of failing with IndexError part-way through.

        All rows of one call are written in a single transaction: they are
        committed together, and rolled back together if any insert fails.

        Raises:
            Exception: Whatever the database driver raises; re-raised after
                the rollback.
        """
        now_time = datetime.datetime.now()
        sql = """INSERT INTO zhilian_tongji
           SET post_name=%s, fact_name=%s, city_name=%s, num=%s, lab=%s, post_time=%s, now_time=%s"""

        columns = (l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time)
        # zip stops at the shortest list, so no index can run out of range.
        rows = [fields + (now_time,) for fields in zip(*columns)]
        if len(set(len(column) for column in columns)) > 1:
            print("警告: 各字段数量不一致,只保存前 " + str(len(rows)) + " 条")

        try:
            if rows:
                cursor.executemany(sql, rows)
            # One commit per page instead of one per row.
            db.commit()
        except Exception:
            # Leave no half-written page behind.
            db.rollback()
            raise
        print("抓取成功,已存入数据库!")
    
    # Scrape every page in turn; a page that fails is reported and skipped.
    try:
        for url in urls:
            # The page number is the second-to-last "_"-separated field of the
            # URL. It stays a module-level name because get_save_url reads it.
            pagenum = url.split("_")[-2]
            # Pause between requests.
            time.sleep(1)
            try:
                get_save_url(url=url,headers=headers)
            except Exception as exc:
                # Catch Exception only, so Ctrl-C and SystemExit still stop the
                # run, and show the cause instead of hiding it.
                print("第 "+str(pagenum)+" 失败..." + repr(exc))
    finally:
        # Close the connection even when the run is interrupted.
        db.close()
    print("大功告成!!!")
    

      代码输出结果如下。

      数据库查询结果如下。

  • 相关阅读:
    事件处理之二:点击事件监听器的五种写法 分类: H1_ANDROID 2013-09-11 10:32 4262人阅读 评论(1) 收藏
    如何解决安卓SDK无法下载Package的问题 分类: H1_ANDROID 2013-09-09 10:26 1199人阅读 评论(0) 收藏
    adb常用命令 分类: H1_ANDROID 2013-09-08 15:22 510人阅读 评论(0) 收藏
    用IBM WebSphere DataStage进行数据整合: 第 1 部分 分类: H2_ORACLE 2013-08-23 11:20 688人阅读 评论(0) 收藏
    三大主流ETL工具选型 分类: H2_ORACLE 2013-08-23 11:17 426人阅读 评论(0) 收藏
    ETL概述 分类: H2_ORACLE 2013-08-23 10:36 344人阅读 评论(0) 收藏
    POI操作Excel常用方法总结 分类: B1_JAVA 2013-08-23 10:01 349人阅读 评论(0) 收藏
    段的创建表user_segments 分类: H2_ORACLE 2013-08-10 11:13 714人阅读 评论(0) 收藏
    让android项目支持boost 支持c++11
    unity中全屏背景图缩放
  • 原文地址:https://www.cnblogs.com/buzhizhitong/p/5657301.html
Copyright © 2011-2022 走看看