zoukankan      html  css  js  c++  java
  • 爬虫(六)-在前程无忧投简历

     在前程无忧上投简历发现有个竞争力分析,似乎是按简历来的,想要根据评分投简历

    抓取关键字搜索结果的综合竞争力得分和评语,筛选投简历

     

    获取cookie

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    # chrome_options.add_argument('--headless')
    from time import sleep
    import re
    from lxml import etree
    import requests
    import os
    import json
    
    # Start Chrome through the local chromedriver binary.  The path is written
    # as a raw string so the Windows backslashes survive verbatim; without them
    # "D:pythonchromedriver.exe" is a drive-relative name, not the intended file.
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:\python\chromedriver.exe')
    # Desktop-browser User-Agent sent with the requests-based calls in this script.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}

    def get_cookie():
        """Log in to 51job interactively and save the browser cookies to disk.

        Drives the module-level ``driver``: opens the login page, asks on stdin
        for a phone number and then for the SMS code, submits the form and
        writes ``driver.get_cookies()`` as JSON to ``cookie.json`` in the
        current working directory.  The browser window is closed afterwards.
        """
        driver.get("https://login.51job.com/login.php?loginway=1&lang=c&url=")
        sleep(2)  # give the login page time to render

        phone_number = input("输入手机号:")
        login_field = driver.find_element_by_id("loginname")
        login_field.send_keys(phone_number)
        # presumably "btn7" is the "send SMS code" button -- verify against the page
        driver.find_element_by_id("btn7").click()
        sleep(1)

        sms_code = input("输入短信:")
        code_field = driver.find_element_by_id("phonecode")
        code_field.send_keys(sms_code)
        driver.find_element_by_id("login_btn").click()
        sleep(2)  # wait for the login request to finish before reading cookies

        saved_cookies = driver.get_cookies()
        with open("cookie.json", "w") as cookie_file:
            json.dump(saved_cookies, cookie_file)
        driver.close()

    用 webdriver 登录并获取 cookie,再把 cookie 写入文件;取消 `--headless` 那一行的注释即可使用无头模式

    前程无忧的手机短信验证码一天只能发送三条,所以把 cookie 保存下来,供之后登录使用

    #搜索职位获得页码
    def get_job():
        """Ask for a job keyword and return the URL of every search-result page.

        Reads the keyword from stdin, fetches the first result page through the
        module-level ``session``/``headers``, takes the first number found in
        the first ``<span class="td">`` text as the total page count, and builds
        one URL per page by replacing the page number in front of ``.html?``.

        Returns:
            list[str]: result-page URLs for pages ``1 .. total_page``.

        Raises:
            IndexError: if the page has no ``span.td`` text or that text
                contains no number.
        """
        job = input("输入职位:")
        url = f"https://search.51job.com/list/020000,000000,0000,00,1,99,{job},2,1.html?lang=c&stype=1&postchannel=0000&workyear=02&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=5&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        r = session.get(url, headers=headers)
        r.encoding = r.apparent_encoding
        tree = etree.HTML(r.text)
        x = tree.xpath('//span[@class="td"]/text()')[0]
        # Raw string with an escaped \d: the plain "(d+)" pattern only matches
        # literal letter "d" characters and never yields the page count.
        total_page = int(re.findall(r"(\d+)", x)[0])
        # The page number is the digits directly before ".html?"; matching the
        # "?" as well keeps the substitution on that one spot of the URL.
        href = [
            re.sub(r"\d+\.html\?", f"{i}.html?", url)
            for i in range(1, total_page + 1)
        ]
        return href

    职位结果的页码

     

     

    #职位id
    def get_job_code(url):
        """Return the job ids linked from one search-result page.

        Fetches ``url`` through the module-level ``session`` and ``headers``,
        collects the ``href`` of every ``div.el > p > span > a`` link and
        extracts the digits that directly precede ``.html`` in each of them.

        Args:
            url: URL of a search-result page.

        Returns:
            list[str]: job ids in page order (duplicates are kept).
        """
        r = session.get(url, headers=headers)
        tree = etree.HTML(r.text)
        links = tree.xpath('//div[@class="el"]/p/span/a/@href')
        job_id = []
        for link in links:
            # \d and \. must be escaped: the plain "/(d+).html" pattern looks
            # for literal "d" characters and therefore never matches an id.
            job_id.extend(re.findall(r"/(\d+)\.html", link))
        return job_id

    获取职位id

    把竞争力分析页面 URL 里的 jobid 参数换成上一步得到的职位 id,就能打开对应职位的分析页面

    #匹配的信息
    def get_info(job_id):
        """Fetch the competitiveness-analysis page of a job and summarise it.

        Requests the analysis page for ``job_id`` through the module-level
        ``session`` and ``headers`` and reads three parts of it: the first text
        node of ``div.warn.w1``, the non-blank texts of ``div.lf`` and the
        non-blank texts of ``ul.rt``.

        Args:
            job_id: 51job job id, as returned by ``get_job_code``.

        Returns:
            dict: ``"公司"`` and ``"职位"`` (2nd and 1st ``div.lf`` text),
            ``"匹配度"`` (the ``div.warn.w1`` text), one entry whose key is the
            4th ``ul.rt`` text and whose value is the 3rd, ``"链接"`` (the job
            page URL) and ``"_id"`` (``job_id``).

        Raises:
            IndexError: if the page does not contain the expected elements.
        """
        analysis_url = f"https://i.51job.com/userset/bounce_window_redirect.php?jobid={job_id}&redirect_type=2"
        response = session.get(analysis_url, headers=headers)
        response.encoding = response.apparent_encoding
        page = etree.HTML(response.text)

        def _non_blank_texts(xpath):
            # Strip every text node selected by `xpath` and drop the empty ones.
            stripped = (text.strip() for text in page.xpath(xpath))
            return [text for text in stripped if text]

        pingjia = page.xpath('//div[@class="warn w1"]//text()')[0].strip()
        gongsi = _non_blank_texts('//div[@class="lf"]//text()')
        fenshu = _non_blank_texts('//ul[@class="rt"]//text()')
        job_url = f"https://jobs.51job.com/shanghai/{job_id}.html?s=03&t=0"
        return {
            "公司": gongsi[1],
            "职位": gongsi[0],
            "匹配度": pingjia,
            fenshu[3]: fenshu[2],
            "链接": job_url,
            "_id": job_id,
        }

    主程序

    # Log in with the saved cookies: run the interactive login only when no
    # cookie file exists yet, then copy every cookie into a requests session.
    if not os.path.exists("cookie.json"):
        get_cookie()
    # `with` closes the file even if the JSON turns out to be malformed.
    with open("cookie.json", "r") as f:
        cookies = json.load(f)
    session = requests.Session()
    for cookie in cookies:
        session.cookies.set(cookie['name'], cookie['value'])

    检查 cookie 文件是否存在,不存在就先登录并保存

    # Collect the job ids of every result page into one flat list.
    code = []
    for page_url in get_job():
        code.extend(get_job_code(page_url))

    职位id添加到列表

     #存入Mongodb
    import pymongo
    # NOTE(review): connects to a hard-coded host without credentials --
    # confirm this is intended before running it anywhere else.
    client = pymongo.MongoClient("47.102.109.190", 27017)
    db = client["job_zhu"]
    job_info = db["job_info"]
    for i in code:
        try:
            # The job id is the document _id, so a hit means this job was
            # already scraped and the analysis page need not be fetched again.
            if not job_info.find_one({"_id": i}):
                info = get_info(i)
                job_info.insert_one(info)
                print(info)
                print("插入成功")
        except Exception as exc:
            # Report the one id that failed and why, then carry on with the
            # rest, instead of dumping the whole id list on every error.
            print(i, exc)

    用 MongoDB 保存结果,并用职位 id 去重

    吃完饭已经有8000个职位了,找到127个匹配度好的投递

    投递:

    一个登陆状态的点击动作,可以用selenium

    # Apply to every stored job whose rating text contains "排名很好" and whose
    # score is at least "80".  Mongo operators must be quoted string keys.
    # NOTE(review): scores are stored as strings, so "$gte" compares them
    # lexicographically ("100" sorts before "80") -- confirm that is acceptable.
    for i in job_info.find({"匹配度": {"$regex": "排名很好"}, "综合竞争力得分": {"$gte": "80"}}):
        print(i)
        try:
            # Open the job page through its stored link; `i` itself is the
            # whole document, not a URL.
            driver.get(i["链接"])
            driver.find_element_by_id("app_ck").click()
            sleep(2)
        except Exception as exc:
            # Best effort: report the job that could not be applied to and
            # continue with the next one.
            print(i["_id"], exc)

    用 cookie 登录后循环投递,筛选条件通过查询数据库表来实现

     

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    from time import sleep
    import re
    from lxml import etree
    import requests
    import os
    import json


    # Start Chrome through the local chromedriver binary.  The path is written
    # as a raw string so the Windows backslashes survive verbatim; without them
    # "D:pythonchromedriver.exe" is a drive-relative name, not the intended file.
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:\python\chromedriver.exe')
    # Desktop-browser User-Agent sent with the requests-based calls in this script.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}
    # Open a 51job search-result page in the browser before anything else runs.
    driver.get("https://search.51job.com/list/020000,000000,0000,00,1,99,%2B,2,1.html?lang=c&stype=1&postchannel=0000&workyear=01%2C02&cotype=99&degreefrom=03%2C04&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=5&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=")


    #获取cookie
    def get_cookie():
        """Log in to 51job interactively and save the browser cookies to disk.

        Drives the module-level ``driver``: opens the login page, asks on stdin
        for a phone number and then for the SMS code, submits the form and
        writes ``driver.get_cookies()`` as JSON to ``cookie.json`` in the
        current working directory.  The browser window is closed afterwards.
        """
        driver.get("https://login.51job.com/login.php?loginway=1&lang=c&url=")
        sleep(2)  # give the login page time to render

        phone_number = input("输入手机号:")
        login_field = driver.find_element_by_id("loginname")
        login_field.send_keys(phone_number)
        # presumably "btn7" is the "send SMS code" button -- verify against the page
        driver.find_element_by_id("btn7").click()
        sleep(1)

        sms_code = input("输入短信:")
        code_field = driver.find_element_by_id("phonecode")
        code_field.send_keys(sms_code)
        driver.find_element_by_id("login_btn").click()
        sleep(2)  # wait for the login request to finish before reading cookies

        saved_cookies = driver.get_cookies()
        with open("cookie.json", "w") as cookie_file:
            json.dump(saved_cookies, cookie_file)
        driver.close()


    #搜索和页码
    def get_job():
        """Ask for a job keyword and return the URL of every search-result page.

        Reads the keyword from stdin, fetches the first result page through the
        module-level ``session``/``headers``, takes the first number found in
        the first ``<span class="td">`` text as the total page count, and builds
        one URL per page by replacing the page number in front of ``.html?``.

        Returns:
            list[str]: result-page URLs for pages ``1 .. total_page``.

        Raises:
            IndexError: if the page has no ``span.td`` text or that text
                contains no number.
        """
        job = input("输入职位:")
        url = f"https://search.51job.com/list/020000,000000,0000,00,1,99,{job},2,1.html?lang=c&stype=1&postchannel=0000&workyear=02&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=5&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        r = session.get(url, headers=headers)
        r.encoding = r.apparent_encoding
        tree = etree.HTML(r.text)
        x = tree.xpath('//span[@class="td"]/text()')[0]
        # Raw string with an escaped \d: the plain "(d+)" pattern only matches
        # literal letter "d" characters and never yields the page count.
        total_page = int(re.findall(r"(\d+)", x)[0])
        # The page number is the digits directly before ".html?"; matching the
        # "?" as well keeps the substitution on that one spot of the URL.
        href = [
            re.sub(r"\d+\.html\?", f"{i}.html?", url)
            for i in range(1, total_page + 1)
        ]
        return href


    #职位id
    def get_job_code(url):
        """Return the job ids linked from one search-result page.

        Fetches ``url`` through the module-level ``session`` and ``headers``,
        collects the ``href`` of every ``div.el > p > span > a`` link and
        extracts the digits that directly precede ``.html`` in each of them.

        Args:
            url: URL of a search-result page.

        Returns:
            list[str]: job ids in page order (duplicates are kept).
        """
        r = session.get(url, headers=headers)
        tree = etree.HTML(r.text)
        links = tree.xpath('//div[@class="el"]/p/span/a/@href')
        job_id = []
        for link in links:
            # \d and \. must be escaped: the plain "/(d+).html" pattern looks
            # for literal "d" characters and therefore never matches an id.
            job_id.extend(re.findall(r"/(\d+)\.html", link))
        return job_id


    #匹配信息
    def get_info(job_id):
        """Fetch the competitiveness-analysis page of a job and summarise it.

        Requests the analysis page for ``job_id`` through the module-level
        ``session`` and ``headers`` and reads three parts of it: the first text
        node of ``div.warn.w1``, the non-blank texts of ``div.lf`` and the
        non-blank texts of ``ul.rt``.

        Args:
            job_id: 51job job id, as returned by ``get_job_code``.

        Returns:
            dict: ``"公司"`` and ``"职位"`` (2nd and 1st ``div.lf`` text),
            ``"匹配度"`` (the ``div.warn.w1`` text), one entry whose key is the
            4th ``ul.rt`` text and whose value is the 3rd, ``"链接"`` (the job
            page URL) and ``"_id"`` (``job_id``).

        Raises:
            IndexError: if the page does not contain the expected elements.
        """
        analysis_url = f"https://i.51job.com/userset/bounce_window_redirect.php?jobid={job_id}&redirect_type=2"
        response = session.get(analysis_url, headers=headers)
        response.encoding = response.apparent_encoding
        page = etree.HTML(response.text)

        def _non_blank_texts(xpath):
            # Strip every text node selected by `xpath` and drop the empty ones.
            stripped = (text.strip() for text in page.xpath(xpath))
            return [text for text in stripped if text]

        pingjia = page.xpath('//div[@class="warn w1"]//text()')[0].strip()
        gongsi = _non_blank_texts('//div[@class="lf"]//text()')
        fenshu = _non_blank_texts('//ul[@class="rt"]//text()')
        job_url = f"https://jobs.51job.com/shanghai/{job_id}.html?s=03&t=0"
        return {
            "公司": gongsi[1],
            "职位": gongsi[0],
            "匹配度": pingjia,
            fenshu[3]: fenshu[2],
            "链接": job_url,
            "_id": job_id,
        }


    # Log in with the saved cookies: run the interactive login only when no
    # cookie file exists yet, then copy every cookie into a requests session.
    if not os.path.exists("cookie.json"):
        get_cookie()
    # `with` closes the file even if the JSON turns out to be malformed.
    with open("cookie.json", "r") as f:
        cookies = json.load(f)
    session = requests.Session()
    for cookie in cookies:
        session.cookies.set(cookie['name'], cookie['value'])


    # Collect the job ids of every result page into one flat list.
    code = []
    for page_url in get_job():
        code.extend(get_job_code(page_url))


    #存入Mongo
    import pymongo
    # NOTE(review): connects to a hard-coded host without credentials --
    # confirm this is intended before running it anywhere else.
    client = pymongo.MongoClient("47.102.109.190", 27017)
    db = client["job_zhu"]
    job_info = db["job_info"]
    for i in code:
        try:
            # The job id is the document _id, so a hit means this job was
            # already scraped and the analysis page need not be fetched again.
            if not job_info.find_one({"_id": i}):
                info = get_info(i)
                job_info.insert_one(info)
                print(info)
                print("插入成功")
        except Exception as exc:
            # Report the one id that failed and why, then carry on with the
            # rest, instead of dumping the whole id list on every error.
            print(i, exc)
  • 相关阅读:
    jdk1.8安装教程
    实现负载均衡的两种方式
    Java中ArrayList和LinkedList区别
    做一个英译中的命令行工具
    在 IDEA 中运行 Spark 程序报错:Multiple sources found for text.......please specify the fully qualified class name
    Android消息处理:EventBus、BroadCast和Handler-优缺点比较(转)
    C#流总结(文件流、内存流、网络流、BufferedStream、StreamReader/StreamWriter、TextReader/TextWriter、转载)
    Android相关知识
    物联网相关知识
    Mqtt相关知识
  • 原文地址:https://www.cnblogs.com/shenyiyangle/p/10822688.html
Copyright © 2011-2022 走看看