zoukankan      html  css  js  c++  java
  • 爬虫练习二(拉勾网投递简历)

    一、流程分析

     第一步:访问登陆页,拿到X_Anti_Forge_Token,X_Anti_Forge_Code
     1、请求url:https://passport.lagou.com/login/login.html
     2、请求方法:GET
     3、请求头: User-agent
     
     第二步:登陆
     1、请求url:https://passport.lagou.com/login/login.json
     2、请求方法:POST
     3、请求头:
        cookie
        User-agent
        Referer:https://passport.lagou.com/login/login.html
        X-Anit-Forge-Code:53165984
        X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
        X-Requested-With:XMLHttpRequest
     4、请求体:
     isValidate:true
     username:18611453110
     password:70621c64832c4d4d66a47be6150b4a8e
     request_form_verifyCode:''
    submit:''
    
    第三步:授权
    1.请求url:https://passport.lagou.com/grantServiceTicket/grant.html
    2.请求方法:GET
    3.请求头:User-agent
               Referer:https://passport.lagou.com/login/login.html
    
    第四步:验证(访问简历页,确认登陆成功)
    1.请求url:https://www.lagou.com/resume/myresume.html
    2.请求方法:GET
    3.请求头:User-agent
    
    第五步:筛选职位信息
    1.请求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
    2.请求方法:GET
    3.请求头:User-Agent
    4.请求参数:
       # gj:3年及以下
       # px:default
       # yx:25k-50k
       # city:北京
    
    第六步:请求职位列表(ajax接口)
    #请求url:https://www.lagou.com/jobs/positionAjax.json
    #请求方法:POST
    #请求头
    #    Referer
    #    User-Agent
    #请求体:
        # first:true
        # pn:1
        # kd:java高级开发
    #请求参数
    # params={
    #      'gj': '3年及以下',
    #      'px': 'default',
    #      'yx': '25k-50k',
    #      'city': '北京',
    #     'needAddtionalResult':False,
    #     'isSchoolJob':0
    # }
    
     第七步:访问详情页,拿到X_Anti_Forge_Token,X_Anti_Forge_Code
        # 请求url:详情页地址
        # 请求方式:GET
        # 请求头:User-Agent

     二、代码实现

    import requests
    import re
    from urllib.parse import urlencode
    # Log in to lagou.com in four requests that share one session, so cookies
    # set by one response are sent with the next request.
    session = requests.session()

    # Browser User-Agent sent with every login-flow request.
    # NOTE(review): the trailing "Name" looks like a paste artifact; kept unchanged.
    user_agent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name"

    # Step 1: fetch the login page, which embeds the anti-forgery code and token.
    r1 = session.get(
        "https://passport.lagou.com/login/login.html",
        headers={"User-Agent": user_agent},
    )

    # Presumably the login page assigns the values in the same
    # "X_Anti_Forge_* = '...'" form that the job detail pages are parsed with
    # further down; whitespace around "=" is tolerated either way.
    code_match = re.search(r"X_Anti_Forge_Code\s*=\s*'(.*?)'", r1.text, re.S)
    token_match = re.search(r"X_Anti_Forge_Token\s*=\s*'(.*?)'", r1.text, re.S)
    if code_match is None or token_match is None:
        raise RuntimeError(
            "X_Anti_Forge_Code / X_Anti_Forge_Token not found on the login page"
        )
    # Header values must be plain strings, so keep the captured text rather
    # than a list of matches.
    X_Anit_Forge_Code = code_match.group(1)
    X_Anit_Forge_Token = token_match.group(1)

    # Step 2: submit the login form as an XMLHttpRequest.
    # NOTE(review): the credentials are hard-coded; the password value looks
    # like a pre-computed hash rather than plaintext -- confirm before reuse.
    r2 = session.post(
        "https://passport.lagou.com/login/login.json",
        headers={
            "User-Agent": user_agent,
            "Referer": "https://passport.lagou.com/login/login.html",
            "X-Anit-Forge-Code": X_Anit_Forge_Code,
            "X-Anit-Forge-Token": X_Anit_Forge_Token,
            "X-Requested-With": "XMLHttpRequest",
        },
        data={
            "isValidate": True,
            "username": "18611453110",
            "password": "70621c64832c4d4d66a47be6150b4a8e",
            "request_form_verifyCode": "",
            "submit": "",
        },
    )

    # Step 3: request the service-ticket grant page (the "authorize" step).
    r3 = session.get(
        "https://passport.lagou.com/grantServiceTicket/grant.html",
        headers={
            "User-Agent": user_agent,
            "Referer": "https://passport.lagou.com/login/login.html",
        },
    )

    # Step 4: load the resume page and check that it mentions the account name.
    r4 = session.get(
        "https://www.lagou.com/resume/myresume.html",
        headers={"User-Agent": user_agent},
    )

    # Prints True when the username appears in the resume page.
    print('18611453110' in r4.text)
    
    # ============================
    # res = urlencode({"k":"java高级开发"},encoding="utf-8").split("=")[-1]
    # url = "https://www.lagou.com/jobs/list_"+res
    # r5 =session.get(url,
    #             headers={
    #                     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
    #                 },
    #              params={
    #                      'gj': '3年及以下',
    #                      'px': 'default',
    #                      'yx': '25k-50k',
    #                      'city': '北京'
    #                 }
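    #          ) # Following the usual routine (a plain GET of the listing page) returns no data, because the page loads its results through an AJAX request; hence the r6 approach below.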
    # Query the job-list AJAX endpoint directly.
    # The keyword is sent as the "kd" form field and, percent-encoded, as part
    # of the Referer URL of the listing page.
    keyword = "java高级开发"
    # urlencode yields "k=<encoded>"; keep only the encoded value.
    res = urlencode({"k": keyword}, encoding="utf-8").split("=")[-1]
    url = "https://www.lagou.com/jobs/list_" + res
    r6 = session.post(
        # Endpoint name is "positionAjax" (the earlier "postionAjax" was a typo).
        'https://www.lagou.com/jobs/positionAjax.json',
        headers={
            'Referer': url,
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
        },
        data={
            "first": True,
            "pn": 1,
            "kd": keyword,
        },
        # Filter keys follow the request parameters listed in the walkthrough:
        # gj (experience), px (sort order), yx (salary range), city.
        params={
            "gj": "3年及以下",
            "px": "default",
            "yx": "15k-25k",
            "city": "北京",
            "needAddtionalResult": False,
            "isSchoolJob": 0,
        },
    )
    # Kept for ad-hoc inspection of the search response, e.g. pprint(r6.json()).
    from pprint import pprint

    # For every job in the search result: print a summary, open the detail
    # page to read its anti-forgery values, then post the resume delivery.

    # Compiled once; the same two patterns are applied to every detail page.
    token_re = re.compile("X_Anti_Forge_Token = '(.*?)'", re.S)
    code_re = re.compile("X_Anti_Forge_Code = '(.*?)'", re.S)
    detail_user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

    job_list = r6.json()['content']['positionResult']['result']
    for job in job_list:
        positionId = job['positionId']
        company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
        companyShortName = job['companyShortName']
        positionName = job['positionName']
        salary = job['salary']
        print('''
        详情连接:%s
        公司名:%s
        职位名:%s
        薪资:%s
        ''' %(company_link,companyShortName,positionName,salary))

        # The detail page carries the code/token pair required by the delivery call.
        r7 = session.get(company_link, headers={'User-Agent': detail_user_agent})
        token_match = token_re.search(r7.text)
        code_match = code_re.search(r7.text)
        if token_match is None or code_match is None:
            # Skip this position instead of aborting the whole run.
            print('%s 投递失败: 详情页未找到 X_Anti_Forge_Token/X_Anti_Forge_Code' % (companyShortName,))
            continue
        X_Anti_Forge_Token = token_match.group(1)
        X_Anti_Forge_Code = code_match.group(1)

        deliver_resp = session.post(
            'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
            headers={
                'User-Agent': detail_user_agent,
                'Referer': company_link,
                'X-Anit-Forge-Code': X_Anti_Forge_Code,
                'X-Anit-Forge-Token': X_Anti_Forge_Token,
                'X-Requested-With': 'XMLHttpRequest',
            },
            data={
                'positionId': positionId,
                'type': 1,
                'force': True,
            },
        )
        # Only report success when the HTTP request itself succeeded.
        # NOTE(review): the response body may carry a more precise result;
        # its schema is not visible here.
        if deliver_resp.ok:
            print('%s 投递成功' %(companyShortName))
        else:
            print('%s 投递失败 (HTTP %s)' % (companyShortName, deliver_resp.status_code))
    View Code
  • 相关阅读:
    对于GetBuffer() 与 ReleaseBuffer() 的一些分析
    _tmain与main,winMain,wmain收藏
    【引用】常用字符串长度计算函数
    Invalid URI
    Cannot obtain the schema rowset "DBSCHEMA_TABLES_INFO" for OLE DB provider "SQLNCLI10" for linked server "DB1".
    Penang Industrial Zone
    Create Raid 1 and Raid 10 in one server
    Time zone BOGUS not found in registry
    'xxx_Forms' is not a valid Application Database or User 'sa' does not have sufficient permissions to check
    Syteline Goods Receiving Note Report
  • 原文地址:https://www.cnblogs.com/moning/p/8306493.html
Copyright © 2011-2022 走看看