使用 BeautifulSoup4 解析器，将招聘网页上的招聘单位名称提取并存储下来。其他信息可用类似的方式爬取。
# -*- coding:utf-8 -*-
"""Scrape employer names from the CSDN job-search page into a JSON file."""
import contextlib
import io
import json  # results are stored in JSON format

from bs4 import BeautifulSoup

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the same Request/urlopen names.
    import urllib.request as urllib2

JOB_SEARCH_URL = 'https://job.csdn.net/search/index'
OUTPUT_PATH = 'csdnJob.json'
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/54.0.2840.99 Safari/537.36"
)


def csdn(url=JOB_SEARCH_URL, output_path=OUTPUT_PATH):
    """Fetch a job-listing page and save the employer names it contains.

    The page is downloaded with a browser-like User-Agent header, parsed
    with BeautifulSoup's ``html.parser``, and the text of the first
    ``a.enterprise_name`` link inside every
    ``div[class="position_list clearfix"]`` element is collected. The
    result is written as a JSON list of ``{"name": ...}`` objects encoded
    as UTF-8.

    Args:
        url: Address of the listing page to download.
        output_path: Path of the JSON file to (over)write.

    Returns:
        None. The outcome is the file written at ``output_path``.
    """
    request = urllib2.Request(url, headers={"User-Agent": USER_AGENT})
    # closing() guarantees the connection is released even if read() fails
    # (the Python 2 response object is not itself a context manager).
    with contextlib.closing(urllib2.urlopen(request)) as response:
        html = response.read()

    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    # CSS attribute selector: the class attribute must match this exact string.
    containers = soup.select('div[class="position_list clearfix"]')
    # Diagnostic output kept from the original script.
    print("----")
    print(containers)

    items = []
    for site in containers:
        print(site)
        anchors = site.select('a[class="enterprise_name"]')
        if not anchors:
            # A container without an employer link is skipped rather than
            # aborting the whole run with an IndexError.
            continue
        items.append({'name': anchors[0].get_text()})

    # ensure_ascii=False keeps non-ASCII characters readable instead of
    # emitting \uXXXX escapes; the file layer encodes the text as UTF-8.
    line = json.dumps(items, ensure_ascii=False)
    if isinstance(line, bytes):
        # Python 2 returns a byte string when the payload is pure ASCII
        # (e.g. "[]"), but a text-mode io file only accepts unicode.
        line = line.decode('utf-8')

    # The file is opened only after scraping succeeded, so a failed run
    # does not leave a truncated file behind; "with" closes it on any error.
    with io.open(output_path, 'w', encoding='utf-8') as output:
        output.write(line)


if __name__ == "__main__":
    csdn()
效果: