zoukankan      html  css  js  c++  java
  • 爬取拉勾网示例

    爬取需求分析

    # Step 1: open the login page to obtain X_Anti_Forge_Token and X_Anti_Forge_Code.
    # 1. Request URL: https://passport.lagou.com/login/login.html
    # 2. Request method: GET
    # 3. Request headers:
    #    User-Agent
    login_page_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    r1 = session.get('https://passport.lagou.com/login/login.html', headers=login_page_headers)

    # Each value is captured from an assignment of the form  NAME = '...'  in
    # the response text; [0] keeps the first capture.
    login_page_text = r1.text
    X_Anti_Forge_Token = re.compile("X_Anti_Forge_Token = '(.*?)'", re.S).findall(login_page_text)[0]
    X_Anti_Forge_Code = re.compile("X_Anti_Forge_Code = '(.*?)'", re.S).findall(login_page_text)[0]
    
    # Step 2: log in.
    # 1. Request URL: https://passport.lagou.com/login/login.json
    # 2. Request method: POST
    # 3. Request headers:
    #    cookie
    #    User-Agent
    #    Referer: https://passport.lagou.com/login/login.html
    #    X-Anit-Forge-Code: 53165984
    #    X-Anit-Forge-Token: 3b6a2f62-80f0-428b-8efb-ef72fc100d78
    #    X-Requested-With: XMLHttpRequest
    # 4. Request body:
    #    isValidate: true
    #    username: 18611453110
    #    password: 70621c64832c4d4d66a47be6150b4a8e
    #    request_form_verifyCode: ''
    #    submit: ''
    login_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        # Header names are spelled "Anit" on purpose here; the values come
        # from the X_Anti_Forge_* variables extracted in step 1.
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
    }
    login_form = {
        "isValidate": True,
        'username': '18611453110',
        'password': '70621c64832c4d4d66a47be6150b4a8e',
        'request_form_verifyCode': '',
        'submit': '',
    }
    r2 = session.post(
        'https://passport.lagou.com/login/login.json',
        headers=login_headers,
        data=login_form,
    )
    
    # Step 3: authorize (request the service-ticket grant page).
    # 1. Request URL: https://passport.lagou.com/grantServiceTicket/grant.html
    # 2. Request method: GET
    # 3. Request headers:
    #    User-Agent
    #    Referer: https://passport.lagou.com/login/login.html
    grant_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
    }
    r3 = session.get(
        'https://passport.lagou.com/grantServiceTicket/grant.html',
        headers=grant_headers,
    )
    
    
    # Step 4: verify the login by requesting the "my resume" page through the
    # same session.
    resume_page_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    r4 = session.get('https://www.lagou.com/resume/myresume.html', headers=resume_page_headers)
    
    # 第五步:筛选职位信息
    # 请求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
    # 请求方法:GET
    # 请求头:
    # User-Agent
    # 请求参数:
    # gj:3年及以下
    # px:default
    # yx:25k-50k
    # city:北京
    
    
    #第六步,详细的筛选出职位信息条件
    #请求参数
    # params={
    #      'gj': '3年及以下',
    #      'px': 'default',
    #      'yx': '25k-50k',
    #      'city': '北京',
    #     'needAddtionalResult':False,
    #     'isSchoolJob':0
    # }
    
    #第七步:访问详情页,拿到X_Anti_Forge_Token,X_Anti_Forge_Code
        # 请求url:详情页地址
        # 请求方式:GET
        # 请求头:User-Agent
        r7=session.get(company_link,
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                    }
                    )
        X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
        X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]
    
    
    
        # Step 8: deliver (submit) the resume for this position.
        # Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
        # Request method: POST
        # Request headers:
        #     Referer: the detail-page address
        #     User-Agent
        #     X-Anit-Forge-Code: 53165984
        #     X-Anit-Forge-Token: 3b6a2f62-80f0-428b-8efb-ef72fc100d78
        #     X-Requested-With: XMLHttpRequest
        # Request body:
        #     positionId: the position ID
        #     type: 1
        #     force: true
        deliver_response = session.post(
            'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                'Referer': company_link,
                'X-Anit-Forge-Code': X_Anti_Forge_Code,
                'X-Anit-Forge-Token': X_Anti_Forge_Token,
                'X-Requested-With': 'XMLHttpRequest',
            },
            data={
                'positionId': positionId,
                'type': 1,
                'force': True,
            },
        )
        # Only claim success when the request did not fail at the HTTP level.
        # NOTE(review): the JSON body is not inspected here; the site may still
        # report an application-level failure with a 200 status - confirm.
        if deliver_response.ok:
            print('%s 投递成功' %(companyShortName))
        else:
            print('%s 投递失败 (HTTP %s)' % (companyShortName, deliver_response.status_code))
    

     

    import requests
    import re
    from urllib.parse import urlencode
    # One Session is shared by every request below so that cookies set during
    # login are sent with the later requests.
    session = requests.Session()

    # Step 1: fetch the login page; the login POST below echoes back two
    # values extracted from it as request headers.
    r1 = session.get(
        "https://passport.lagou.com/login/login.html",
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
        },
    )
    # The patterns target assignments of the form  X_Anti_Forge_Code = '...'
    # ("Anti", with spaces around "=").  The previous patterns searched for
    # "X_Anit_... ='" and could not match that form, and without [0] a list
    # (not a string) ended up being passed as a header value.
    # The variable names keep the "Anit" spelling because the login request
    # that follows reads them under exactly these names.
    X_Anit_Forge_Code = re.findall(r"X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
    X_Anit_Forge_Token = re.findall(r"X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    # Step 2: submit the login form, sending the values extracted from the
    # login page as the X-Anit-Forge-* headers.
    login_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
        "Referer": "https://passport.lagou.com/login/login.html",
        "X-Anit-Forge-Code": X_Anit_Forge_Code,
        "X-Anit-Forge-Token": X_Anit_Forge_Token,
        "X-Requested-With": "XMLHttpRequest",
    }
    # NOTE(review): credentials are hard-coded in the form below.
    login_form = {
        "isValidate": True,
        'username': '18611453110',
        'password': '70621c64832c4d4d66a47be6150b4a8e',
        'request_form_verifyCode': '',
        'submit': '',
    }
    r2 = session.post(
        "https://passport.lagou.com/login/login.json",
        headers=login_headers,
        data=login_form,
    )
    # Step 3: request the service-ticket grant page, with the login page as
    # Referer.
    grant_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
        'Referer': 'https://passport.lagou.com/login/login.html',
    }
    r3 = session.get(
        "https://passport.lagou.com/grantServiceTicket/grant.html",
        headers=grant_headers,
    )

    # Step 4: fetch the resume page through the same session.
    resume_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
    }
    r4 = session.get('https://www.lagou.com/resume/myresume.html', headers=resume_headers)

    # Prints True when the account's phone number appears in the page text,
    # which is used here as the "logged in" check.
    phone_on_page = '18611453110' in r4.text
    print(phone_on_page)
    
    # ============================
    # res = urlencode({"k":"java高级开发"},encoding="utf-8").split("=")[-1]
    # url = "https://www.lagou.com/jobs/list_"+res
    # r5 =session.get(url,
    #             headers={
    #                     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
    #                 },
    #              params={
    #                      'gj': '3年及以下',
    #                      'px': 'default',
    #                      'yx': '25k-50k',
    #                      'city': '北京'
    #                 }
    #          ) #按照套路进行,结果取不到值,因为人家发的是ajax请求获取的数据,所以选择了r6的方式
    # Step 6: query the job list through the AJAX endpoint (the plain list
    # page does not carry the data itself).
    keyword = "java高级开发"
    # urlencode() yields "k=<percent-encoded keyword>"; only the encoded value
    # is needed to build the list-page URL that is sent as Referer.
    res = urlencode({"k": keyword}, encoding="utf-8").split("=")[-1]
    url = "https://www.lagou.com/jobs/list_" + res
    r6 = session.post(
        # NOTE(review): endpoint is spelled "positionAjax.json"; the earlier
        # "postionAjax.json" looked like a typo - confirm against the
        # browser's network trace.
        'https://www.lagou.com/jobs/positionAjax.json',
        headers={
            'Referer': url,
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name",
        },
        data={
            "first": True,
            "pn": 1,
            "kd": keyword,
        },
        # Filter names follow the request parameters listed for the list page
        # ("gj", "px", "yx", "city", "needAddtionalResult", "isSchoolJob").
        # The previous "gx" and "needAddtionResult" keys did not match them.
        params={
            "gj": "3年及以下",
            "px": "default",
            "yx": "15k-25k",
            "city": "北京",
            "needAddtionalResult": False,
            "isSchoolJob": 0,
        },
    )
    from pprint import pprint
    # print(r6.json())
    # Steps 7-8: for every position in the search result, open its detail
    # page, extract the per-page anti-forgery values, and submit the resume.
    positions = r6.json()['content']['positionResult']['result']

    # Loop-invariant pieces are built once.
    detail_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    token_pattern = re.compile(r"X_Anti_Forge_Token = '(.*?)'", re.S)
    code_pattern = re.compile(r"X_Anti_Forge_Code = '(.*?)'", re.S)

    for position in positions:
        positionId = position['positionId']
        company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
        companyShortName = position['companyShortName']
        positionName = position['positionName']
        salary = position['salary']
        print('''
        详情连接:%s
        公司名:%s
        职位名:%s
        薪资:%s
        ''' %(company_link,companyShortName,positionName,salary))

        r7 = session.get(company_link, headers=detail_headers)
        token_match = token_pattern.search(r7.text)
        code_match = code_pattern.search(r7.text)
        if token_match is None or code_match is None:
            # Previously an IndexError here aborted every remaining position;
            # skip just this one instead.
            print('%s 投递失败: 详情页未找到 X_Anti_Forge_Token/X_Anti_Forge_Code' % companyShortName)
            continue
        X_Anti_Forge_Token = token_match.group(1)
        X_Anti_Forge_Code = code_match.group(1)

        deliver_response = session.post(
            'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                'Referer': company_link,
                'X-Anit-Forge-Code': X_Anti_Forge_Code,
                'X-Anit-Forge-Token': X_Anti_Forge_Token,
                'X-Requested-With': 'XMLHttpRequest',
            },
            data={
                'positionId': positionId,
                'type': 1,
                'force': True,
            },
        )
        # Only claim success when the request did not fail at the HTTP level.
        # NOTE(review): the JSON body is not inspected; the site may still
        # report an application-level failure with a 200 status - confirm.
        if deliver_response.ok:
            print('%s 投递成功' %(companyShortName))
        else:
            print('%s 投递失败 (HTTP %s)' % (companyShortName, deliver_response.status_code))
    代码示例
  • 相关阅读:
    计算机漏洞安全相关的概念POC 、EXP 、VUL 、CVE 、0DAY
    开始使用kali的一些小问题:菜鸟瞎折腾
    nmap参数详解(罗列一下)
    安装kali之后必做的几件小事
    Debian下virtualBox增强功能出错
    ArcGIS Engine 基础功能(一)
    sublime 配置简单的python环境
    解决 ‘Could not fetch URL https://pypi.python.org’的问题
    golang基础语法学习
    大象盒子技术栈
  • 原文地址:https://www.cnblogs.com/shaojiafeng/p/8310306.html
Copyright © 2011-2022 走看看