zoukankan      html  css  js  c++  java
  • 拉勾网爬虫（Lagou job-listing spider）

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import json
    import re
    import time

    import lxml.html
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    from redis_cache import RedisCache


    class LagouSpider(object):
        """Crawl Python job postings from lagou.com with a Selenium-driven Chrome.

        Walks every page of the search-result list, opens each posting's
        detail page in a new tab, extracts the fields, and caches the
        resulting dict in Redis keyed by the detail-page URL.
        """

        # Matches whitespace and the "/" separators lagou puts around field
        # text.  NOTE: the original pattern was r"[s/]" — a typo that deleted
        # every literal letter "s" instead of whitespace; fixed to r"[\s/]".
        _FIELD_JUNK_RE = re.compile(r"[\s/]")

        def __init__(self):
            # Launch a Chrome browser controlled by Selenium.
            self.driver = webdriver.Chrome()
            # Search-result list for "python" jobs.
            self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
            # URL of the detail page currently being parsed; doubles as the
            # Redis cache key in parse_datail_page().
            self.detail_url = None

        @classmethod
        def _clean(cls, text):
            """Strip whitespace and '/' separators from a scraped field."""
            return cls._FIELD_JUNK_RE.sub("", text)

        def run(self):
            """Open the list page and iterate result pages until the
            "next" pager button is disabled."""
            self.driver.get(self.url)
            while True:
                # Snapshot the current page's HTML for parsing.
                source = self.driver.page_source
                # Wait for the pager to render so the page is fully loaded.
                WebDriverWait(driver=self.driver, timeout=10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//div[@class="pager_container"]/span[last()]')
                    )
                )
                self.parse_list_page(source)
                try:
                    # Selenium 4 API; find_element_by_xpath was removed.
                    next_btn = self.driver.find_element(
                        By.XPATH, '//div[@class="pager_container"]/span[last()]'
                    )
                    if "pager_next_disabled" in next_btn.get_attribute("class"):
                        break  # last page reached
                    next_btn.click()
                except Exception:
                    # Narrowed from a bare `except:` (which also swallowed
                    # KeyboardInterrupt).  Best-effort: dump the page that
                    # failed and keep crawling.
                    print(source)
                time.sleep(1)

        def parse_list_page(self, source):
            """Extract every detail-page link from a result-list page and
            fetch each one.

            :param source: HTML of a search-result list page.
            :return: None
            """
            html = lxml.html.fromstring(source)
            # One <a class="position_link"> per posting.
            links = html.xpath('//a[@class="position_link"]/@href')
            for link in links:
                self.detail_url = link
                self.requests_detail_page(link)
                time.sleep(1)  # throttle to avoid anti-scraping blocks

        def requests_detail_page(self, url):
            """Open *url* in a new tab, parse it, then close the tab and
            return focus to the list page.

            :param url: absolute URL of a job-detail page.
            :return: None
            """
            self.driver.execute_script("window.open('%s')" % url)
            self.driver.switch_to.window(self.driver.window_handles[1])
            # Wait until the job title has rendered.
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="job-name"]//span[@class="name"]')
                )
            )
            source = self.driver.page_source
            self.parse_datail_page(source)
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])

        def parse_datail_page(self, source):
            """Parse one job-detail page and cache the fields in Redis.

            NOTE(review): method name keeps the original "datail" typo so any
            external callers are not broken.

            :param source: HTML of a job-detail page.
            :return: None
            """
            html = lxml.html.fromstring(source)

            job_name = html.xpath('//div[@class="job-name"]//span[@class="name"]/text()')[0]
            job_salary = html.xpath('//dd[@class="job_request"]/p//span[1]/text()')[0]
            job_city = self._clean(html.xpath('//dd[@class="job_request"]/p//span[2]/text()')[0])
            experience = self._clean(html.xpath('//dd[@class="job_request"]/p//span[3]/text()')[0].strip())
            education = self._clean(html.xpath('//dd[@class="job_request"]/p//span[4]/text()')[0])
            job_time = html.xpath('//dd[@class="job_request"]/p//span[5]/text()')[0]
            job_advantage = html.xpath('//dd[@class="job-advantage"]/p/text()')[0]
            desc = "".join(html.xpath('//dd[@class="job_bt"]//text()')).strip()
            job_address = "".join(html.xpath('//div[@class="work_addr"]//text()'))
            # Last 4 chars are the trailing "查看地图" link text; drop them.
            job_address = self._clean(job_address)[0:-4]

            position = {
                'job_name': job_name,
                'job_salary': job_salary,
                'job_city': job_city,
                'experience': experience,
                'education': education,
                'job_advantage': job_advantage,
                'desc': desc,
                'job_address': job_address,
                'job_time': job_time,
            }

            # Cache by detail URL; read back to confirm the round trip.
            rc = RedisCache()
            rc[self.detail_url] = position
            position_print = json.loads(rc[self.detail_url])
            print(self.detail_url)
            print(position_print)
            print('=' * 40)


    if __name__ == '__main__':
        # Entry point: build the spider and start crawling.
        LagouSpider().run()
  • 相关阅读:
    LeetCode周赛#206
    CET-6备考丨词组、佳句积累
    界面设计9.24第一次课
    图像超分辨率重建
    OpenGL和计算机图形学初步认识
    OpenGL装gult库
    安装java
    vs2019配置Opengl
    最长上升子序列(最长递增子序列)LIS
    c++科学计数法 、long long的范围
  • 原文地址:https://www.cnblogs.com/wenjiangtao/p/10963633.html
Copyright © 2011-2022 走看看