zoukankan      html  css  js  c++  java
  • 关于拉勾网的爬虫

      1 # coding:utf-8
      2 import json
      3 import re
      4 from lxml import etree
      5 import requests
      6 import time
      9 class Lagou(object):
     10     def __init__(self):
     11         # 构建初始url
     12         self.url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
     13         # 构建请求头,lg需要设置登录cookie,登录后浏览器检查工具获得即可,referer为跳转网页,也可以独立请求跳转
     14         self.headers = {
     15             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
     16             'Cookie': '此处代码为登录后获取的cookie,全部粘贴进来即可',
     17             'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
     18         }
     19         self.pattern = re.compile(r'"positionId":(d+)')  # 构建正则匹配用于获取urlId
     20         self.base_url = 'https://www.lagou.com/jobs/{}.html'
     21         self.file = open('lagou.json', 'w')
     23     def get_post_data(self, page=1):
     24         """获取列表页json数据"""
     25         print('正在登陆----')
     26         post_data = {
     27             'first': 'true',
     28             'pn': page,  # 页数
     29             'kd': 'python', # 此处可以动态修改
     30         }
     31         response = requests.post(self.url, headers=self.headers, data=post_data)
     32         print('获取得列表页响应')
     33         return response.content.decode()
     35     def get_page(self, url):
     36         """获取详情页响应数据"""
     37         response = requests.post(url, headers=self.headers)
     38         return response.content
     40     def parse_url(self, data):
     41         """解析列表页数据获取urlid"""
     42         print('开始解析列表页数据,获取id')
     43         id_list = self.pattern.findall(data)
     44         url_list = []
     45         for id in id_list:
     46             url_list.append(self.base_url.format(id))
     47         print('id获取完毕')
     48         return url_list
     50     def parse_detail_data(self, str_data):
     51         """解析详情页数据"""
     52         print('正在获取详情页数据')
     53         html = etree.HTML(str_data)
     54         data = {}
     55         data['name'] = html.xpath('//div/span[@class="name"]/text()')[0] if len(html.xpath('//div/span[@class="name"]/text()'))>0 else None
     56         data['salary'] = html.xpath('//span[@class="salary"]/text()')[0] if len(html.xpath('//span[@class="salary"]/text()'))>0 else None
     57         temp = html.xpath('//dd[@class="job_request"]/p[1]/span[2]/text()')[0] if len(html.xpath('//dd[@class="job_request"]/p[1]/span[2]/text()'))>0 else None
     58         data['city'] = temp.replace('/', '').strip()
     59         data['company'] = html.xpath('//div[@class="company"]/text()')[0] if len(html.xpath('//div[@class="company"]/text()'))>0 else None
     60         temp = html.xpath('//dd/p[1]/span[4]/text()')[0] if len(html.xpath('//div[@class="company"]/text()'))>0 else None
     61         data['education'] = temp.replace('/', '').strip()
     62         data['job_type'] = html.xpath('//dd/p[1]/span[5]/text()')[0] if len(html.xpath('//div[@class="company"]/text()'))>0 else None
     63         data['anvantage'] = html.xpath('//dd[@class="job-advantage"]/p/text()')[0] if len(html.xpath('//div[@class="company"]/text()'))>0 else None
     64         desc_list = html.xpath('//dd[@class="job_bt"]/div/p/text()')
     65         temp = ''
     66         for desc in desc_list:
     67             temp += desc
     68         data['responsibilities'] = temp.replace('xa0', '')
     69         return data
     71     def parse_detail(self, url_list):
     72         """获取单页详情页数据列表"""
     73         print('开始拼装详情页url')
     74         data_list = []
     75         for url in url_list:
     76             str_data = self.get_page(url)
     77             # print(str_data.decode())
     78             data_list.append(self.parse_detail_data(str_data))
     79             # print(str)
     80         print('获取完毕')
     81         return data_list
     83     def save_data(self, data_list):
     84         """保存数据模块"""
     85         print('开始保存数据')
     86         for data in data_list:
     87             str_data = json.dumps(data, ensure_ascii=False) + ',
    '  # 将python字典转换为json字符串
     88             self.file.write(str_data)
     90     def run(self):
     91         """爬虫运行逻辑模块"""
     92         for page in range(1, 10):  # 翻页
     93             data = self.get_post_data(page)  # lg数据需要登陆爬取,使用post稍微安全些,大概吧
     94             url_list = self.parse_url(data)  # 获取详情页所需id
     95             data_list = self.parse_detail(url_list)  # 获取单页详情页数据列表
     96             # print(data_list)  # debug
     97             self.save_data(data_list)  # 保存数据
     99     def __del__(self):
    100         print('数据保存完毕')
    101         self.file.close()  # 关闭文件
    104 if __name__ == '__main__':
    105     lagou = Lagou()
    106     lagou.run()


  • 相关阅读:
    Cannot resolve com.born:xxx_dao:unknown
    The valid characters are defined in RFC 7230 and RFC 3986错误
    web项目资源路径找不到 | Failed to load resource: the server responded with a status of 404 () | Uncaught ReferenceError: $ is not defined
    PUT http://localhost.......jpg returned a response status of 409
    IDEA 1099端口被占用
    An invalid character [32] was present in the Cookie value
    HTTP Status 405 HTTP method GET is not supported by this URL
    IDEA Tomcat运行报错Failed to start component [StandardEngine[Catalina].StandardHost[localhost].
  • 原文地址:https://www.cnblogs.com/qiukujun/p/one_simple_spider.html
Copyright © 2011-2022 走看看