zoukankan      html  css  js  c++  java
  • 拉勾网 selenium 模拟 爬取

    #encoding: utf-8
    
    from selenium import webdriver
    from selenium.webdriver.support.ui import Select,WebDriverWait
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from lxml import etree
    import time
    import re
    import csv
    
    class LagouSpider(object):
        # chromedriver的绝对路径
        driver_path ='/Users/mac126/chromedriver'
        def __init__(self):
            # 初始化一个driver,并且指定chromedriver的路径
            self.driver = webdriver.Chrome(executable_path=self.driver_path)
            self.positions = []
            self.fp = open('lagou.csv','a',encoding='utf-8',newline='')
            self.writer = csv.DictWriter(self.fp,['title','salary','city','work_years','education',"company_website",'desc','acquire','origin_url'])
            self.writer.writeheader()
    
        def run(self):
            #运行
            url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='#路径
            self.driver.get(url)#获取路径
            while True:#死循环
                WebDriverWait(driver=self.driver,timeout=10).until(
                    EC.presence_of_element_located((By.XPATH,"//span[contains(@class,'pager_next')]"))
                )
                resource = self.driver.page_source
                self.parse_list_page(resource)
                next_btn = self.driver.find_element_by_xpath("//span[contains(@class,'pager_next')]")
                if "pager_next_disabled" in next_btn.get_attribute('class'):
                    break
                next_btn.click()
                time.sleep(1)
    
        def parse_list_page(self,resource):
            html = etree.HTML(resource)
            links = html.xpath("//a[@class='position_link']/@href")
            for link in links:
                self.parse_detail_page(link)
                time.sleep(1)
    
        def parse_detail_page(self,url):
         
            self.driver.execute_script("window.open('"+url+"')")
            self.driver.switch_to.window(self.driver.window_handles[1])
            WebDriverWait(self.driver,timeout=10).until(
                EC.presence_of_element_located((By.XPATH,"//dd[@class='job_bt']"))
            )
            resource = self.driver.page_source
            html = etree.HTML(resource)
            title = html.xpath("//span[@class='name']/text()")[0]
            company = html.xpath("//h2[@class='fl']/text()")[0].strip()
            job_request_span = html.xpath("//dd[@class='job_request']//span")
            salary = job_request_span[0].xpath(".//text()")[0]
            salary = salary.strip()
            city = job_request_span[1].xpath(".//text()")[0]
            city = re.sub(r"[/s]","",city)
            work_years = job_request_span[2].xpath(".//text()")[0]
            work_years = re.sub(r"[/s]","",work_years)
            education = job_request_span[3].xpath(".//text()")[0]
            education = re.sub(r"[/s]","",education)
            company_website = html.xpath("//ul[@class='c_feature']/li[last()]/a/@href")[0]
            position_desc = "".join(html.xpath("//dd[@class='job_bt']/div//text()"))
            position = {
                'title': title,
                'city': city,
                'salary': salary,
                'company': company,
                'company_website': company_website,
                'education': education,
                'work_years': work_years,
                'desc': position_desc,
                'origin_url': url
            }
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.write_position(position)
    
        def write_position(self,position):
            if len(self.positions) >= 100:
                self.writer.writerows(self.positions)
                self.positions.clear()
            self.positions.append(position)
            print(position)
    
    def main():
        spider = LagouSpider()
        spider.run()
    
    if __name__ == '__main__':
        main()

  • 相关阅读:
    How to Set up Cplex Dev Environment under Linux
    矿大linux下拨号上网的一个方法——利用NetworkManager
    4月4日在写协议分析器中遇到的问题:对类指针的直接赋值中遇到的问题
    新篇章的开始
    纠结N久,还是开通了博客园,希望跟大家多多交流吧
    SQL去重distinct方法解析
    接口测试基础知识介绍
    Appium是什么
    appium环境配置
    appium操作
  • 原文地址:https://www.cnblogs.com/liangliangzz/p/10252824.html
Copyright © 2011-2022 走看看