zoukankan      html  css  js  c++  java
  • selenium 爬boss

    # NOTE: the original author flagged this script as having problems ("有问题").
    
    from selenium import webdriver
    import time
    from lxml import etree
    
    
    
    class LagouSpider(object):
        """Scrape job postings from a zhipin.com search-result page with Selenium.

        Despite the class name, the start URL points at www.zhipin.com.  For each
        job in the list page the spider records title, salary and company, then
        opens the job's detail page in a separate browser window to read the
        description text.

        Attributes:
            driver: the Chrome WebDriver used for all page loads.  The spider
                never quits it; the caller owns its lifetime.
            url: the list page that ``run`` loads.
            positions: one dict per scraped job (title, salary, company,
                desc_text), in list-page order.
            position_dict: the record of the job currently (or most recently)
                being processed.
            detail_url_list: every detail-page URL found on the list page.
        """

        # NOTE(review): this path looks as if its backslashes were lost
        # (perhaps "G:\Crawler and Data\chromedriver.exe") -- confirm before use.
        driver_path = r"G:Crawler and Datachromedriver.exe"

        def __init__(self):
            # NOTE(review): newer Selenium releases replace ``executable_path``
            # with a Service object -- confirm against the installed version.
            self.driver = webdriver.Chrome(executable_path=self.driver_path)
            self.url = "https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position="
            self.positions = []
            self.position_dict = {}
            self.detail_url_list = []

        @staticmethod
        def _first(nodes, default=""):
            """Return the first item of an XPath result list, or *default* if empty."""
            return nodes[0] if nodes else default

        def run(self):
            """Load the list page and scrape every job found in it."""
            self.driver.get(self.url)
            # page_source holds the rendered page, including each job's link.
            source = self.driver.page_source
            self.parse_list_page(source)

        def parse_list_page(self, source):
            """Extract each job from the list-page HTML and fetch its detail page.

            Args:
                source: HTML text of the search-result page.

            List items without a detail link are skipped; any other missing
            field is stored as an empty string instead of raising IndexError.
            """
            tree = etree.HTML(source)

            li_list = tree.xpath("//div[@class='job-box']/div[@class='job-list']/ul/li")
            for li in li_list:
                href = self._first(
                    li.xpath('.//div[@class="info-primary"]/h3/a/@href'), None
                )
                if href is None:
                    # Nothing to follow for this item.
                    continue

                detail_url = "https://www.zhipin.com" + href
                print(detail_url)
                self.detail_url_list.append(detail_url)

                # A fresh dict per job, so records in self.positions do not
                # alias (and overwrite) one another.
                self.position_dict = {
                    "title": self._first(li.xpath(
                        './/div[@class="info-primary"]/h3/a/div[@class="job-title"]/text()'
                    )),
                    "salary": self._first(li.xpath(
                        './/div[@class="info-primary"]/h3/a/span[@class="red"]/text()'
                    )),
                    "company": self._first(li.xpath(
                        './/div[@class="info-company"]//h3/a/text()'
                    )),
                }

                self.detail_page(detail_url)
                self.positions.append(self.position_dict)

        def detail_page(self, url):
            """Open *url* in a new window and store its description text.

            Only the given URL is visited.  The text is written to
            ``self.position_dict['desc_text']`` (empty string when the
            description node is not found) and also returned.

            Args:
                url: absolute URL of one job's detail page.

            Returns:
                The stripped description text.
            """
            list_window = self.driver.current_window_handle
            # Open a blank window and navigate with get(): this avoids building
            # JavaScript from the URL and lets the driver handle the page load
            # before page_source is read.
            self.driver.execute_script("window.open('about:blank')")
            self.driver.switch_to.window(self.driver.window_handles[-1])
            try:
                self.driver.get(url)
                tree = etree.HTML(self.driver.page_source)
                desc = tree.xpath("//div[@id='main']/div[3]/div/div[2]/div[2]/div[1]/div")
                # string() collects all text beneath the node, nested tags included.
                desc_text = desc[0].xpath('string()').strip() if desc else ""
                self.position_dict['desc_text'] = desc_text
                print(self.position_dict)
                time.sleep(2)  # pause between detail pages
            finally:
                # Always close the detail window and return to the list window,
                # even if loading or parsing failed.
                self.driver.close()
                self.driver.switch_to.window(list_window)
            return desc_text
    
    
    if __name__ == '__main__':
        # Script entry point: build the spider, scrape, then shut the browser down.
        spider = LagouSpider()
        try:
            spider.run()
        finally:
            # Quit the WebDriver even when scraping fails, so no browser or
            # driver process is left running.
            spider.driver.quit()
  • 相关阅读:
    MySQL 优化
    Log4j2 中format增加自定义的参数
    MySQL 索引
    Linux中top和free命令详解(转)
    JAVA面试题
    Servlet3.0的可插拔功能
    开放通用Api,总有你喜欢的
    Git常用命令
    支付宝无法回调或者回调后验签失败
    Promise
  • 原文地址:https://www.cnblogs.com/kenD/p/11192654.html
Copyright © 2011-2022 走看看