```python
# coding:utf-8
import json
import re
import time

import requests
from lxml import etree


class Lagou(object):
    def __init__(self):
        # Initial URL: the Ajax endpoint behind the job-listing page
        self.url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
        # Lagou requires a logged-in cookie: log in, then copy the full cookie
        # string from the browser's dev tools. Referer is the listing page the
        # request is supposed to come from (it can also be requested separately).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
            'Cookie': '<paste the full cookie obtained after logging in>',
            'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
        }
        self.pattern = re.compile(r'"positionId":(\d+)')  # regex to pull position ids out of the listing JSON
        self.base_url = 'https://www.lagou.com/jobs/{}.html'
        self.file = open('lagou.json', 'w', encoding='utf-8')

    def get_post_data(self, page=1):
        """Fetch the JSON data of one listing page."""
        print('requesting listing page ----')
        post_data = {
            'first': 'true',
            'pn': page,      # page number
            'kd': 'python',  # search keyword; change as needed
        }
        response = requests.post(self.url, headers=self.headers, data=post_data)
        print('got the listing-page response')
        return response.content.decode()

    def get_page(self, url):
        """Fetch the body of one detail page."""
        response = requests.get(url, headers=self.headers)  # detail pages are plain GET
        return response.content

    def parse_url(self, data):
        """Parse the listing-page data and build detail-page URLs from the ids."""
        print('parsing listing page for position ids')
        id_list = self.pattern.findall(data)
        url_list = [self.base_url.format(id_) for id_ in id_list]
        print('ids collected')
        return url_list

    def _first(self, html, path):
        """Return the first xpath match, or None if there is no match."""
        result = html.xpath(path)
        return result[0] if result else None

    def parse_detail_data(self, str_data):
        """Parse one detail page."""
        print('parsing detail page')
        html = etree.HTML(str_data)
        data = {}
        data['name'] = self._first(html, '//div/span[@class="name"]/text()')
        data['salary'] = self._first(html, '//span[@class="salary"]/text()')
        temp = self._first(html, '//dd[@class="job_request"]/p[1]/span[2]/text()')
        data['city'] = temp.replace('/', '').strip() if temp else None
        data['company'] = self._first(html, '//div[@class="company"]/text()')
        temp = self._first(html, '//dd/p[1]/span[4]/text()')
        data['education'] = temp.replace('/', '').strip() if temp else None
        data['job_type'] = self._first(html, '//dd/p[1]/span[5]/text()')
        data['advantage'] = self._first(html, '//dd[@class="job-advantage"]/p/text()')
        desc_list = html.xpath('//dd[@class="job_bt"]/div/p/text()')
        data['responsibilities'] = ''.join(desc_list).replace('\xa0', '')
        return data

    def parse_detail(self, url_list):
        """Fetch and parse every detail page of one listing page."""
        print('fetching detail pages')
        data_list = []
        for url in url_list:
            str_data = self.get_page(url)
            data_list.append(self.parse_detail_data(str_data))
            time.sleep(1)  # pause between requests to avoid being blocked
        print('done')
        return data_list

    def save_data(self, data_list):
        """Persist the scraped data."""
        print('saving data')
        for data in data_list:
            # one JSON object per line (JSON Lines), so the file stays parseable
            self.file.write(json.dumps(data, ensure_ascii=False) + '\n')

    def run(self):
        """Main crawl loop."""
        for page in range(1, 10):                    # paginate
            data = self.get_post_data(page)          # listing data requires login; POST is marginally safer
            url_list = self.parse_url(data)          # ids needed for the detail pages
            data_list = self.parse_detail(url_list)  # detail data of one listing page
            self.save_data(data_list)                # persist

    def __del__(self):
        print('all data saved')
        self.file.close()  # close the output file


if __name__ == '__main__':
    lagou = Lagou()
    lagou.run()
```
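Since `get_post_data` already returns a JSON string, the position ids could also be read with `json.loads` instead of a regex, which survives formatting changes in the payload. A minimal sketch; the `content → positionResult → result` key path is an assumption about the Ajax response shape, not something the code above guarantees:

```python
import json


def extract_position_ids(raw_json):
    """Pull positionId values out of a listing-page payload.

    The content/positionResult/result keys are assumptions about the
    Ajax response shape; adjust them if the payload differs.
    """
    payload = json.loads(raw_json)
    jobs = payload.get('content', {}).get('positionResult', {}).get('result', [])
    return [job['positionId'] for job in jobs if 'positionId' in job]
```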
A simple and fairly rough crawler for scraping job-detail pages from Lagou. It mainly uses requests for HTTP. The full data is only reachable after logging in, so the cookie from a logged-in session is required. The listing endpoint is best requested with POST, even though that is only marginally safer. For reference only.
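If pasting the raw `Cookie` header becomes unwieldy, the same logged-in state can be carried by a `requests.Session`, which also reuses headers and the connection across the listing and detail requests. A minimal sketch; the cookie names below are placeholders for whatever your browser shows after login, not Lagou's guaranteed names:

```python
import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
})
# Placeholder cookies: copy the real name/value pairs from dev tools after logging in
session.cookies.update({
    'JSESSIONID': '<value from dev tools>',
    'user_trace_token': '<value from dev tools>',
})
response = session.post(
    'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0',
    data={'first': 'true', 'pn': 1, 'kd': 'python'},
)
print(response.status_code)
```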