zoukankan      html  css  js  c++  java
  • 数据关键词招聘职位爬虫

      1 # -*- coding:utf-8 -*-
      2 # Author:Sure Feng
      3 
      4 from selenium import webdriver
      5 from lxml import etree
      6 import time
      7 import json
      8 import openpyxl
      9 
     10 class LaGou(object):
     11     # 定义浏览器地址
     12     # time = time.time()
     13     driver_path = r'E:surewarechromedriver.exe'
     14     def __init__(self):
     15         # 创建一个浏览器
     16         self.drive = webdriver.Chrome(executable_path=LaGou.driver_path)
     17         # 定义主页url
     18         self.url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE?city=%E5%B9%BF%E5%B7%9E'
     19         self.positions = []
     20 
     21     def request_detail_page(self,url):
     22         '''请求详情页面'''
     23 
     24         # 创建新窗口打开详情页面
     25         self.drive.execute_script("window.open('%s')" % url)
     26         # 切换到详情页面
     27         self.drive.switch_to.window(self.drive.window_handles[1])
     28         # 获取详情页数据
     29         source = self.drive.page_source
     30         # 解析页面,获取具体数据
     31         self.parse_detial_page(source)
     32         # 关闭详情页面
     33         self.drive.close()
     34         # 切换到主页面
     35         self.drive.switch_to.window(self.drive.window_handles[0])
     36 
     37     def parse_detial_page(self, source):
     38         '''解析页面,获取具体数据'''
     39         html = etree.HTML(source)
     40         info_list = []
     41         # xpath解析html,获取具体数据
     42         position_id = html.xpath("//a[@class='send-CV-btn s-send-btn fr']/@data-position-id")
     43         position_web = "https://www.lagou.com/jobs/{}.html".format(position_id[0] if len(position_id) > 0 else None)
     44         info_list.append(position_web)
     45         position_name = html.xpath("//div[@class='job-name']/@title")
     46         position_name = position_name[0].strip() if len(position_name) > 0 else None
     47         info_list.append(position_name)
     48         salary = html.xpath("//dd[@class='job_request']/p/span[@class='salary']/text()")
     49         salary = salary[0].strip() if len(salary) > 0 else None
     50         info_list.append(salary)
     51         job_year = html.xpath("//dd[@class='job_request']/p/span[3]/text()")
     52         job_year = job_year[0].replace("/","").strip() if len(job_year) > 0 else None
     53         info_list.append(job_year)
     54         grade = html.xpath("//dd[@class='job_request']/p/span[4]/text()")
     55         grade = grade[0].replace("/","").strip() if len(grade) > 0 else None
     56         info_list.append(grade)
     57         publish_time = html.xpath("//p[@class='publish_time']/text()")
     58         publish_time = publish_time[0].replace("xa0 发布于拉勾网","").strip() if len(publish_time) > 0 else None
     59         info_list.append(publish_time)
     60         company_name = html.xpath("//img[@class='b2']/@alt")
     61         company_name = company_name[0] if len(company_name) > 0 else None
     62         info_list.append(company_name)
     63         company = html.xpath("//h2[@class='fl']/text()")
     64         company = company[0].strip() if len(company) > 0 else None
     65         info_list.append(company)
     66         job_advantage = html.xpath("//dd[@class='job-advantage']/p/text()")
     67         job_advantage = job_advantage[0].strip() if len(job_advantage) > 0 else None
     68         info_list.append(job_advantage)
     69         job_detail = html.xpath("//div[@class='job-detail']//text()")
     70         job_detail = str(job_detail).replace(" ","").strip() if len(job_detail) > 0 else None
     71         info_list.append(job_detail)
     72         work_addr = html.xpath("//div[@class='work_addr']/a[2]/text()")
     73         work_addr = work_addr[0].strip() if len(work_addr) > 0 else None
     74         info_list.append(work_addr)
     75         work_addr_detail = html.xpath("//div[@class='work_addr']//text()")
     76         work_addr_detail = work_addr_detail[-3].strip() if len(work_addr_detail) > 0 else None
     77         info_list.append(work_addr_detail)
     78         position_label_clearfix = html.xpath("//ul[@class='position-label clearfix']/li[@class='labels']//text()")
     79         position_label_clearfix = str(position_label_clearfix).strip() if len(position_label_clearfix) > 0 else None
     80         info_list.append(position_label_clearfix)
     81         c_feature = html.xpath("//ul[@class='c_feature']/li/text()")
     82         zone = c_feature[1].strip() if len(c_feature) > 0 else None
     83         info_list.append(zone)
     84         development = html.xpath("//i[@class='icon-glyph-trend']/../text()")
     85         development = development[1].strip() if len(development) > 0 else None
     86         info_list.append(development)
     87         people_num = html.xpath("//i[@class='icon-glyph-figure']/../text()")
     88         people_num = people_num[1].strip() if len(people_num) > 0 else None
     89         info_list.append(people_num)
     90         Investment_institution = html.xpath("//p[@class='financeOrg']/text()")
     91         Investment_institution = Investment_institution[0].strip() if len(Investment_institution) > 0 else None
     92         info_list.append(Investment_institution)
     93 
     94         # 将具体数据保存至dict中
     95         # info_dict = {
     96         #     'company':company,
     97         #     'position_name':position_name,
     98         #     'salary':salary,
     99         #     'job_year':job_year,
    100         #     'grade':grade,
    101         #     'publish_time':publish_time,
    102         #     'zone':zone,
    103         #     'job_advantage':job_advantage,
    104         #     'job_detail':job_detail,
    105         # }
    106         # 将dict保存至职位列表中
    107         # self.positions.append(info_dict)
    108         self.positions.append(info_list)
    109         # with open("lagou.json",encoding="utf-8",mode="a") as f:
    110         #     f.write(json.dumps(info_dict,ensure_ascii=False,indent=2))
    111         #     f.write("
    ")
    112         print(str(self.positions).encode('GBK','ignore').decode('GBk') )
    113 
    114     def parse_list_page(self, source):
    115         '''解析列表页'''
    116         html = etree.HTML(source)
    117         # 获取详情url列表
    118         links = html.xpath("//a[@class='position_link']/@href")
    119         for link in links:
    120             print(link)
    121             # 请求详情页面
    122             self.request_detail_page(link)
    123             time.sleep(1)
    124             # break
    125 
    126     def save_excel(self, list):
    127         wb = openpyxl.Workbook()
    128         now_time = time.time()
    129         ws = wb.create_sheet("lagou" + str(now_time))
    130         title = ["position_web","position_name","salary","job_year","grade","publish_time","company","company_name","job_advantage","job_detail","work_addr","work_addr_detail","position_label_clearfix","zone","development","people_num","Investment_institution"]
    131         ws.append(title)
    132         for li in list:
    133             ws.append(li)
    134 
    135         wb.save("lagou.xls")
    136 
    137 
    138     def run(self):
    139          # 打开主页
    140          self.drive.get(self.url)
    141          # num = 0
    142          while True:
    143              # num += 1
    144              # if num == 2:
    145              #     break
    146              # 解析主页,获取详情页url
    147              source = self.drive.page_source
    148              # 解析列表页
    149              self.parse_list_page(source)
    150              # 跳转下一页
    151              next_btn = self.drive.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
    152              if "pager_next pager_next_disabled" in next_btn.get_attribute("class"):
    153                  break
    154              else:
    155                  next_btn.click()
    156              time.sleep(3)
    157          # 保存数据到Excel中
    158          self.save_excel(self.positions)
    159 
    160 
    161 
    162 if __name__ == '__main__':
    163     # 创建爬虫对象
    164     spider = LaGou()
    165     # 调用run()执行爬虫
    166     spider.run()
  • 相关阅读:
    🍖流程控制之if判断
    🍖Python开发入门之变量
    🍖Python入门之基本数据类型
    sql事务和存储过程 【转】
    《设计模式基于C#的工程化实现及扩展》 Security Design Pattern 系列 4 角色模式(Role Pattern) 【转】
    C#中ToString格式大全 【转】
    事务日志 (SQL Server)【转】
    批处理启动、停止SQL服务 【转】
    关于HttpContext.Current.Request获取值 【转】
    C#中HashTable的用法 【转】
  • 原文地址:https://www.cnblogs.com/sure-feng/p/10204229.html
Copyright © 2011-2022 走看看