zoukankan      html  css  js  c++  java
  • 使用 Selenium 爬取 BOSS 直聘(zhipin.com)职位信息

    # 有问题
    
    from selenium import webdriver
    import time
    from lxml import etree
    
    
    
    class LagouSpider(object):
        """Crawl the python job listing on zhipin.com with a Selenium-driven Chrome.

        ``run`` loads the search result page, ``parse_list_page`` extracts the
        title, salary, company and detail link of every listed job, and
        ``detail_page`` opens each detail link in a temporary browser tab to
        read the job description.  One dict per job is collected in
        ``self.positions``.
        """

        # NOTE(review): this path contains no separators, so it looks like the
        # backslashes were lost when the snippet was copied -- confirm the real
        # location of chromedriver.exe before running.
        driver_path = r"G:Crawler and Datachromedriver.exe"

        def __init__(self):
            """Start Chrome and initialise the containers that hold crawl results."""
            # NOTE(review): ``executable_path`` is deprecated in Selenium 4 --
            # confirm the installed Selenium version still accepts it.
            self.driver = webdriver.Chrome(executable_path=self.driver_path)
            self.url = "https://www.zhipin.com/job_detail/?query=python&city=101010100&industry=&position="
            self.positions = []         # one dict per successfully parsed job
            self.position_dict = {}     # the job currently being processed
            self.detail_url_list = []   # absolute detail-page URLs seen so far

        @staticmethod
        def _first(nodes, default=""):
            """Return the first XPath result in *nodes*, or *default* if it is empty."""
            return nodes[0] if nodes else default

        def run(self):
            """Load the listing page, crawl every job on it, then quit the browser."""
            try:
                # Open the search result page.
                self.driver.get(self.url)
                # page_source holds the rendered page, including every job link.
                self.parse_list_page(self.driver.page_source)
            finally:
                # Always release the browser and the chromedriver process.
                self.driver.quit()

        def parse_list_page(self, source):
            """Extract every job from the listing HTML and crawl its detail page.

            :param source: HTML text of the listing page.

            List items without a detail link are skipped; a missing title,
            salary or company is stored as an empty string instead of raising.
            """
            tree = etree.HTML(source)
            if tree is None:
                # Nothing parseable was returned (e.g. an empty page).
                return

            li_list = tree.xpath("//div[@class='job-box']/div[@class='job-list']/ul/li")
            for li in li_list:
                href = self._first(li.xpath('.//div[@class="info-primary"]/h3/a/@href'), None)
                if href is None:
                    # Not a regular job entry -- there is nothing to follow.
                    continue

                detail_url = "https://www.zhipin.com" + href
                print(detail_url)
                self.detail_url_list.append(detail_url)

                # Build a fresh dict per job so earlier results are not overwritten.
                self.position_dict = {
                    "title": self._first(
                        li.xpath('.//div[@class="info-primary"]/h3/a/div[@class="job-title"]/text()')),
                    "salary": self._first(
                        li.xpath('.//div[@class="info-primary"]/h3/a/span[@class="red"]/text()')),
                    "company": self._first(
                        li.xpath('.//div[@class="info-company"]//h3/a/text()')),
                }

                self.detail_page(detail_url)
                self.positions.append(self.position_dict)

        def detail_page(self, url):
            """Open *url* in a new tab and store its description in ``position_dict``.

            :param url: absolute URL of a single job detail page.

            Only the given URL is visited.  The tab is closed and focus is
            returned to the listing window even if parsing fails.  When the
            description node cannot be found, ``desc_text`` is set to ``""``.
            """
            list_window = self.driver.current_window_handle
            # Pass the URL as a script argument instead of formatting it into
            # the JavaScript source, so quotes in the URL cannot break the script.
            self.driver.execute_script("window.open(arguments[0])", url)
            # The tab that was just opened is the last handle.
            self.driver.switch_to.window(self.driver.window_handles[-1])
            try:
                tree = etree.HTML(self.driver.page_source)
                desc = []
                if tree is not None:
                    desc = tree.xpath("//div[@id='main']/div[3]/div/div[2]/div[2]/div[1]/div")
                # string() collects all text below the node, nested tags included.
                desc_text = desc[0].xpath('string()').strip() if desc else ""
                self.position_dict['desc_text'] = desc_text
                print(self.position_dict)
                # Pause between detail pages to throttle the requests.
                time.sleep(2)
            finally:
                self.driver.close()  # close the detail tab
                self.driver.switch_to.window(list_window)  # back to the listing window
    
    
    if __name__ == '__main__':
        # Script entry point: create the spider and start crawling immediately.
        LagouSpider().run()
  • 相关阅读:
    「UVA12293」 Box Game
    「CF803C」 Maximal GCD
    「CF525D」Arthur and Walls
    「CF442C」 Artem and Array
    LeetCode lcci 16.03 交点
    LeetCode 1305 两棵二叉搜索树中的所有元素
    LeetCode 1040 移动石子直到连续 II
    LeetCode 664 奇怪的打印机
    iOS UIPageViewController系统方法崩溃修复
    LeetCode 334 递增的三元子序列
  • 原文地址:https://www.cnblogs.com/kenD/p/11192654.html
Copyright © 2011-2022 走看看