zoukankan      html  css  js  c++  java
  • 抓取腾讯招聘python岗位

    # -*- coding: utf-8 -*-
    """
    Created on Mon Dec 23 17:55:06 2019

    @author: Dell
    """
    import re
    import time
    import requests
    from lxml import etree
    
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    
    
    HEADERS = {
        # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Referer': 'https://careers.tencent.com/',
        'Accept': 'application/json, text/javascript, */*; q=0.01'
    }
    
    def _first_text(node, path, default=""):
        """Return the first result of evaluating ``path`` on ``node``, or ``default`` if none."""
        matches = node.xpath(path)
        return matches[0] if matches else default


    def parse(url, timeout=10):
        """Open ``url`` in Chrome and extract the job postings listed on the page.

        Args:
            url: Address of one search-result page.
            timeout: Maximum number of seconds to wait for the first posting
                card to appear before parsing whatever has been rendered.

        Returns:
            A list of dicts with the keys ``'title'``, ``'address'`` and
            ``'require'``, one per ``<div class="recruit-list">`` element.
            A field that is missing from a card is returned as ``""``
            instead of raising ``IndexError``.
        """
        # Local import: the file's top-level imports do not include this name.
        from selenium.common.exceptions import TimeoutException

        card_xpath = "//div[@class='recruit-list']"

        driver = webdriver.Chrome()
        try:
            driver.get(url)
            try:
                # The listing is presumably rendered client-side, so give it
                # time to show up before taking the page snapshot.
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((By.XPATH, card_xpath))
                )
            except TimeoutException:
                # No card appeared in time (e.g. a page with no results);
                # fall through and parse the page as it is.
                pass
            text = driver.page_source
        finally:
            # quit() ends the whole WebDriver session, and the ``finally``
            # guarantees the browser is released even when loading fails.
            driver.quit()

        # Parse the page snapshot.
        html = etree.HTML(text)

        pos_infos = []
        for div in html.xpath(card_xpath):
            pos_infos.append({
                'title': _first_text(div, "./a/h4/text()"),  # job title
                'address': _first_text(div, "./a/p/span[2]/text()"),  # work location
                'require': _first_text(div, "./a/p[@class='recruit-text']/text()"),  # requirements
            })
        return pos_infos
    
    def save(list):
        """Append every item of ``list`` to ``tencent.txt``, one item per line.

        Each item is converted with ``str()`` and terminated with a newline.
        The file is opened in append mode with UTF-8 encoding, so repeated
        calls accumulate lines; an empty ``list`` writes nothing.

        Args:
            list: Iterable of items to write. The name shadows the built-in
                ``list`` but is kept so existing callers are unaffected.
        """
        with open("tencent.txt", "a", encoding="utf-8") as f:
            # The line terminator must be the escape sequence "\n"; a raw line
            # break inside the quotes is an unterminated string literal.
            f.writelines(str(line) + "\n" for line in list)
                
        
    
    if __name__ == "__main__":
        # Script entry point: walk result pages 1..69 of the keyword search,
        # persist each page's postings, then echo them to the console.
        url_template = "https://careers.tencent.com/search.html?index={}&keyword=python"
        page_no = 1
        while page_no < 70:
            positions = parse(url_template.format(page_no))

            save(positions)
            for position in positions:
                print(position)
            # Progress marker followed by a 50-dash separator.
            print("第%s页解析完成" % page_no, "-" * 50)
            page_no += 1
    
    
  • 相关阅读:
    GitHub常用 库
    App性能优化
    iOS App性能优化
    UIButton图片与文字位置调整
    Mac常用目录
    js数字转金额,ajax调用接口,后台返回html(完整页面),打开新窗口并写入html
    js坑 把数字型的字符串默认为数字 把前面的0给去掉了("001")
    url跳转路径参数获取
    常用正则表达式,手机号,邮箱,网址
    Js获取操作系统版本 && 获得浏览器版本
  • 原文地址:https://www.cnblogs.com/zxfei/p/12088112.html
Copyright © 2011-2022 走看看