  • Web Scraping in Practice (1): Scraping Zhilian Zhaopin Job Listings

      I more or less studied statistics, so I figured I would scrape the statistics job listings on Zhilian Zhaopin. The postings are, well, impressive...

      This uses requests and bs4 for fetching and parsing, with the data stored in a MySQL database. The code is a bit messy; bear with it for now, and I will tidy it up when I have time...
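
      The script depends on requests, BeautifulSoup (from the bs4 package, used here with the lxml parser) and MySQLdb; on Python 3 the MySQLdb module is provided by the mysqlclient package. A typical setup, assuming a pip-based environment:

    pip install requests beautifulsoup4 lxml mysqlclient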

    import requests
    from bs4 import BeautifulSoup
    import re
    import time
    import datetime
    import MySQLdb
    
    # Fetch a results page, build the soup, and hand it off to the parser
    def get_save_url(url, headers):
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            print("Page " + url + " opened")
        print("Scraping the job listings on page " + pagenum)
        soup = BeautifulSoup(response.text, 'lxml')
        # Pass the soup and the raw HTML as arguments instead of globals
        get_post_fact_city_num(soup, response.text)
    	
    
    
    # Parse one results page: extract post names, companies, cities,
    # headcounts, industries, and posting times, then save them
    def get_post_fact_city_num(soupx, htmlx):
        l_post_name = []
        l_fact_name = []
        l_city_name = []
        l_num = []
        l_lab = []
        l_post_time = []

        regex = "__ga__fullResult(.*)postname_clicksfullresult(.*)postnames_001"
        posts = soupx.findAll("a", {"class": re.compile(regex)})
        # Each listing appears twice in the markup, so take every other match
        for post in posts[::2]:
            l_post_name.append(post.get_text())

        facts = soupx.findAll("p", {"class": "searchResultCompanyname"})
        for fact in facts[::2]:
            l_fact_name.append(fact.get_text())

        cities = soupx.findAll("em", {"class": "searchResultJobCityval"})
        for city in cities[::2]:
            l_city_name.append(city.get_text())

        nums = soupx.findAll("em", {"class": "searchResultJobPeopnum"})
        for num in nums:
            l_num.append(num.get_text())

        labs = soupx.findAll("p", {"class": "searchResultCompanyIndustry"})
        for lab in labs:
            l_lab.append(lab.get_text())

        # The posting time is pulled straight out of the raw HTML with a regex
        time_regex = "<span>发布时间:<em></em>(.*)</span>"
        for t in re.findall(time_regex, htmlx):
            l_post_time.append(t)

        save_to_sql(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time)
    
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"}
    
    # The page range can be set to anything covered by the search results
    urls = ["http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_{}_0".format(str(x)) for x in range(2, 20)]
    #url = "http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_2_0"
    db = MySQLdb.connect(host="localhost", user="root", passwd="your password here, not showing mine", db="test", use_unicode=True, charset="utf8")
    cursor = db.cursor()
    
    # Keep the column sizes generous; some company descriptions run long...
    sqlxx = """CREATE TABLE IF NOT EXISTS zhilian_tongji(
            post_name VARCHAR(100),
            fact_name VARCHAR(100),
            city_name VARCHAR(20),
            num VARCHAR(50),
            lab VARCHAR(200),
            post_time VARCHAR(50),
            now_time VARCHAR(50)
            ) """

    cursor.execute(sqlxx)
    
    def save_to_sql(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time):
        now_time = datetime.datetime.now()
        sql = """INSERT INTO zhilian_tongji
           SET post_name=%s, fact_name=%s, city_name=%s, num=%s, lab=%s, post_time=%s, now_time=%s"""
        for x in range(len(l_post_name)):
            cursor.execute(sql, (l_post_name[x], l_fact_name[x], l_city_name[x], l_num[x], l_lab[x], l_post_time[x], now_time))
        # One commit per page instead of one per row
        db.commit()
        print("Scraped successfully and saved to the database!")
    
    for url in urls:
        try:
            time.sleep(1)  # be polite to the server
            pagenum = url.split("_")[-2]
            get_save_url(url=url, headers=headers)
        except Exception:
            print("Page " + str(pagenum) + " failed...")
    db.close()
    print("All done!!!")
    

      The console output of the code is shown below.
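
      Reconstructed from the print statements in the script (illustrative, not a captured log), a successful page produces roughly:

    Page http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_2_0 opened
    Scraping the job listings on page 2
    Scraped successfully and saved to the database!
    Page http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_3_0 opened
    ...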

      The database query results are shown below.
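
      To spot-check what landed in the table, a short query through the same MySQLdb connection works (a minimal sketch reusing the connection settings from the script; substitute your own password):

    import MySQLdb

    db = MySQLdb.connect(host="localhost", user="root", passwd="your password here",
            db="test", use_unicode=True, charset="utf8")
    cursor = db.cursor()
    # Pull a few rows to confirm the inserts landed
    cursor.execute("SELECT post_name, fact_name, city_name, post_time FROM zhilian_tongji LIMIT 5")
    for row in cursor.fetchall():
        print(row)
    db.close()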

  • Original post: https://www.cnblogs.com/buzhizhitong/p/5657301.html