zoukankan      html  css  js  c++  java
  • 第七篇:爬虫实战—2、投递拉勾网简历

    爬虫投递简历小示例

    一、流程分析

    复制代码
    第一步:获取登录页,获取X_Anti_Forge_Token,X_Anti_Forge_Code
        1、请求url:https://passport.lagou.com/login/login.html
        2、请求方式:get
        3、请求头:
               - cookie:用session处理了
               - User-Agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name
    第二步:登录
        1、请求url:https://passport.lagou.com/login/login.json
        2、请求方式:post
        3、请求头:
            cookie
            User-agent
            Referer:https://passport.lagou.com/login/login.html
            X-Anit-Forge-Code:53165984
            X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
            X-Requested-With:XMLHttpRequest
        4、请求体
            isValidate:true
            username:18611453110
            password:70621c64832c4d4d66a47be6150b4a8e
            request_form_verifyCode:''
            submit:''
    第三步:授权
            1、请求url:https://passport.lagou.com/grantServiceTicket/grant.html
            2、请求方法:GET
            3、请求头:
               User-agent
               Referer:https://passport.lagou.com/login/login.html
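    第四步:验证
        1、请求url:https://www.lagou.com/resume/myresume.html
        2、请求方式:GET
        3、请求头:User-Agent
        4、验证方式:响应页面中包含登录用户名,即说明登录成功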
    第五步:筛选职位信息(注:职位数据是通过ajax请求加载的,直接GET该页面取不到结果;代码实现中改为POST职位列表的ajax接口,并将本页url作为Referer)
        请求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
        请求方法:GET
        请求头:
            User-Agent
        请求参数:
            gj:3年及以下
            px:default
            yx:25k-50k
            city:北京
    第六步:访问详情页,拿到X_Anti_Forge_Token,X_Anti_Forge_Code
        请求url:详情页地址
        请求方式:GET
        请求头:User-Agent
    第七步:投递简历
        请求url:https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
        请求方式:POST
        请求头:
            Referer:详情页地址
            User-agent
            X-Anit-Forge-Code:53165984
            X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
            X-Requested-With:XMLHttpRequest
        请求体:
        positionId:职位ID
        type:1
        force:true
    复制代码

    二、代码实现

      """Log in to lagou.com, search for positions and submit a resume to each.

      The script runs top to bottom as one flow on a single ``requests`` session
      (so cookies carry over between steps):

      1. GET the login page and scrape the anti-forgery code/token from it.
      2. POST the credentials together with that code/token.
      3. GET the service-ticket grant page.
      4. GET the resume page and print whether the username appears in it.
      5. POST the position-search ajax endpoint.
      6. For every returned position: GET its detail page, scrape a fresh
         code/token pair and POST the resume delivery request.
      """
      import re
      from pprint import pprint  # kept for inspecting search_response.json() while debugging
      from urllib.parse import urlencode

      import requests

      # User-Agent sent on the login, grant, resume and search requests.
      USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36Name"
      # User-Agent sent on the job-detail and delivery requests.
      DETAIL_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

      LOGIN_PAGE_URL = "https://passport.lagou.com/login/login.html"

      # NOTE(review): credentials are hard-coded; move them to environment
      # variables or a config file before reusing this script.
      USERNAME = '18611453110'
      PASSWORD = '70621c64832c4d4d66a47be6150b4a8e'

      SEARCH_KEYWORD = "java高级开发"

      # Both spellings ("Anti" / "Anit") and any spacing around "=" are accepted,
      # because the two scraping sites in the original script disagreed on them.
      _FORGE_CODE_RE = re.compile(r"X_An(?:ti|it)_Forge_Code\s*=\s*'(.*?)'", re.S)
      _FORGE_TOKEN_RE = re.compile(r"X_An(?:ti|it)_Forge_Token\s*=\s*'(.*?)'", re.S)


      def _extract_forge_pair(html):
          """Return ``(code, token)`` scraped from *html*, or ``None`` if either is missing."""
          code_match = _FORGE_CODE_RE.search(html)
          token_match = _FORGE_TOKEN_RE.search(html)
          if code_match is None or token_match is None:
              return None
          return code_match.group(1), token_match.group(1)


      session = requests.session()

      # Step 1: load the login page and scrape the anti-forgery values from it.
      login_page = session.get(LOGIN_PAGE_URL, headers={"User-Agent": USER_AGENT})
      login_pair = _extract_forge_pair(login_page.text)
      if login_pair is None:
          raise RuntimeError("anti-forgery code/token not found on the login page")
      # Header values must be plain strings, so unpack the single matched values
      # instead of passing the list that re.findall returns.
      login_code, login_token = login_pair

      # Step 2: log in.
      session.post(
          "https://passport.lagou.com/login/login.json",
          headers={
              "User-Agent": USER_AGENT,
              "Referer": LOGIN_PAGE_URL,
              "X-Anit-Forge-Code": login_code,
              "X-Anit-Forge-Token": login_token,
              "X-Requested-With": "XMLHttpRequest",
          },
          data={
              "isValidate": True,
              'username': USERNAME,
              'password': PASSWORD,
              'request_form_verifyCode': '',
              'submit': '',
          },
      )

      # Step 3: request the service-ticket grant page.
      session.get(
          "https://passport.lagou.com/grantServiceTicket/grant.html",
          headers={
              "User-Agent": USER_AGENT,
              'Referer': LOGIN_PAGE_URL,
          },
      )

      # Step 4: verify the login -- prints True when the username shows up on the
      # resume page.
      resume_page = session.get(
          'https://www.lagou.com/resume/myresume.html',
          headers={"User-Agent": USER_AGENT},
      )
      print(USERNAME in resume_page.text)

      # Step 5: search positions. A plain GET of the listing page returns no
      # results because the page loads them through an ajax request, so the ajax
      # endpoint is called directly with the listing page as Referer.
      encoded_keyword = urlencode({"k": SEARCH_KEYWORD}, encoding="utf-8").split("=")[-1]
      listing_url = "https://www.lagou.com/jobs/list_" + encoded_keyword
      search_response = session.post(
          # NOTE(review): the original spelled this "postionAjax.json"; corrected to
          # "positionAjax.json" -- confirm against the request shown in browser devtools.
          'https://www.lagou.com/jobs/positionAjax.json',
          headers={
              'Referer': listing_url,
              "User-Agent": USER_AGENT,
          },
          data={
              "first": True,
              "pn": 1,
              "kd": SEARCH_KEYWORD,
          },
          params={
              "gj": "3年及以下",
              # Was "gx"; the request description and the earlier GET draft both use "px".
              "px": "default",
              "yx": "15k-25k",
              "city": "北京",
              # NOTE(review): the original sent "needAddtionResult"; the site's parameter
              # appears to be "needAddtionalResult" -- confirm in browser devtools.
              "needAddtionalResult": False,
              "isSchoolJob": 0,
          },
      )
      positions = search_response.json()['content']['positionResult']['result']

      # Steps 6 and 7: visit each detail page for fresh anti-forgery values, then
      # deliver the resume.
      for position in positions:
          position_id = position['positionId']
          detail_url = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=position_id)
          company_name = position['companyShortName']
          print(
              '\n    详情连接:%s\n    公司名:%s\n    职位名:%s\n    薪资:%s\n    '
              % (detail_url, company_name, position['positionName'], position['salary'])
          )

          detail_page = session.get(detail_url, headers={'User-Agent': DETAIL_USER_AGENT})
          detail_pair = _extract_forge_pair(detail_page.text)
          if detail_pair is None:
              # One page without tokens should not abort the remaining deliveries.
              print('%s 跳过:详情页未找到 X_Anti_Forge_Code/Token' % (company_name))
              continue
          detail_code, detail_token = detail_pair

          delivery = session.post(
              'https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
              headers={
                  'User-Agent': DETAIL_USER_AGENT,
                  'Referer': detail_url,
                  'X-Anit-Forge-Code': detail_code,
                  'X-Anit-Forge-Token': detail_token,
                  'X-Requested-With': 'XMLHttpRequest',
              },
              data={
                  'positionId': position_id,
                  'type': 1,
                  'force': True,
              },
          )
          # NOTE(review): only the HTTP status is checked; the JSON body may still
          # report a business-level failure -- inspect delivery.json() to confirm.
          if delivery.ok:
              print('%s 投递成功' % (company_name))
          else:
              print('%s 投递失败(HTTP %s)' % (company_name, delivery.status_code))
    View Code
  • 相关阅读:
    HDU
    HDU
    POJ
    HDU
    HDU
    POJ
    HDU
    FZU
    LightOJ 1030 Discovering Gold 数学期望计算
    POJ 3061 Subsequence 二分查找
  • 原文地址:https://www.cnblogs.com/mqhpy/p/11370788.html
Copyright © 2011-2022 走看看