zoukankan      html  css  js  c++  java
  • 爬虫(猫眼电影+校花网+github+今日头条+拉钩)

    Requests+正则表达式爬取猫眼TOP100榜电影信息

    MARK:将信息写入文件解决乱码方法,开启进程池秒爬。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    import requests
    from requests.exceptions import RequestException
    import re
    import json
    from multiprocessing import Pool
     
     
    def get_one_page(url):
        """Fetch ``url`` and return its body text.

        Returns:
            The response text when the server answers with HTTP 200,
            otherwise None (also when requests raises RequestException).
        """
        try:
            reply = requests.get(url)
            return reply.text if reply.status_code == 200 else None
        except RequestException:
            return None
     
     
    def parse_one_page(html):
        """Extract the movie entries from one board page.

        Args:
            html: HTML text of the page; None or an empty string yields nothing.

        Yields:
            dict: one entry per matched ``<dd>`` element with the keys
            '排行', '图片', '电影', '演员', '上映信息' and '评分'.
        """
        if not html:
            return
        # Raw strings keep the backslash of \d intact for the regex engine.
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
            r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
            r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
        for item in pattern.findall(html):
            yield {
                '排行': item[0],
                '图片': item[1],
                '电影': item[2],
                # Drops a 3-character prefix (presumably the "主演:" label).
                '演员': item[3].strip()[3:],
                # Drops a 5-character prefix (presumably the "上映时间:" label).
                '上映信息': item[4].strip()[5:],
                # The score is split into an integer part and a fraction part.
                '评分': item[5] + item[6],
            }
     
     
    def write_to_file(content):
        """Append ``content`` to result.txt as one JSON document per line.

        The file is opened as UTF-8 and ``ensure_ascii=False`` keeps
        non-ASCII text readable instead of writing \\uXXXX escapes.
        """
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
     
     
    def main(offset):
        """Crawl one board page, print every entry and append it to the result file.

        Args:
            offset: value appended to the ``offset`` query parameter of the board URL.
        """
        url = 'http://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        if html is None:
            # get_one_page reports every failure as None; skip the page
            # instead of letting the regex search raise TypeError.
            return
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)
     
     
    if __name__ == '__main__':
        # Sequential alternative: call main(i * 10) for i in range(10).
        pool = Pool()  # process pool: the ten pages are crawled in parallel
        pool.map(main, [i * 10 for i in range(10)])
        # Release the worker processes once every page has been handled.
        pool.close()
        pool.join()

    Requests+正则表达式爬取校花网视频

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    import requests
    import re
    import os
     
     
    def get_page(url):
        """Fetch ``url`` and return its decoded text.

        The encoding is taken from the body (``apparent_encoding``) rather
        than from the response headers.

        Returns:
            The page text, or None when the request fails or the server
            answers with an error status (a message is printed in that case).
        """
        try:
            response = requests.get(url)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException:
            # Only network/HTTP failures are expected here; anything else
            # (e.g. KeyboardInterrupt) should not be swallowed.
            print("爬取失败")
            return None
     
     
    def get_url(html):
        """Yield the link that follows each ``class="items"`` marker in ``html``.

        Links that do not start with 'http' are prefixed with the site root.
        """
        link_re = re.compile('class="items".*?href="(.*?)"', re.S)
        for href in link_re.findall(html):
            yield href if href.startswith('http') else 'http://www.xiaohuar.com' + href
     
     
    def get_detail_url(detail_content):
        """Yield every ``src`` found after an ``id="media"`` marker that ends in '.mp4'."""
        media_re = re.compile('id="media".*?src="(.*?)"', re.S)
        for src in media_re.findall(detail_content):
            if src and src.endswith('.mp4'):
                yield src
     
     
    def download(url):
        """Download ``url`` into the D://movie2// folder unless it is already there.

        The file name is the last '/'-separated segment of ``url``. Failures
        are reported on stdout and never raised.
        """
        root = "D://movie2//"
        path = root + url.split('/')[-1]
        # Write to a temporary name first so an interrupted download is not
        # mistaken for a finished file on the next run.
        partial_path = path + '.part'
        try:
            os.makedirs(root, exist_ok=True)
            if os.path.exists(path):
                print("文件已存在")
                return
            # stream=True keeps the whole video out of memory.
            response = requests.get(url, stream=True)
            # Do not save an error page under a .mp4 name.
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                # iter_content() defaults to 1-byte chunks; use a real buffer size.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
            os.replace(partial_path, path)
            print("文件保存成功")
        except (requests.RequestException, OSError):
            print("下载失败")
     
     
    def main(page_num):
        """Crawl one list page and download every .mp4 found on its detail pages.

        Args:
            page_num: number substituted into the list-page URL.
        """
        url = 'http://www.xiaohuar.com/list-3-{0}.html'.format(page_num)
        html = get_page(url)
        if not html:
            # get_page returns None on failure; nothing to parse.
            return
        for url in get_url(html):
            detail_content = get_page(url)
            if not detail_content:
                # Skip detail pages that could not be fetched.
                continue
            for detail_url in get_detail_url(detail_content):
                download(detail_url)
     
     
    if __name__ == '__main__':
        # Crawl list pages 0 through 29, one after another.
        for page_num in range(0, 30):
            main(page_num)

    Requests+PyQuery模拟登陆github

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    import requests
    from pyquery import PyQuery
     
    LOGIN_URL = 'https://github.com/login'
    SESSION_URL = 'https://github.com/session'

    # One session is used for both requests so cookies set by the login page
    # are sent again with the form POST.
    session = requests.session()
    response = session.get(LOGIN_URL)
    text = PyQuery(response.text)
    # The login form contains a hidden input whose value must be posted back
    # as 'authenticity_token'.
    authenticity_token = text('#login > form > div:nth-child(1) > input[type="hidden"]:nth-child(2)').attr('value')
    # NOTE(review): the credentials are hard-coded in source; load them from the
    # environment or a prompt before using this outside a demo.
    data = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': authenticity_token,
        'login': 'lcgsmile@qq.com',
        'password': 'lcg@pwd.',
    }
    response = session.post(SESSION_URL, data=data)
    print(response.status_code)  # 200

    分析Ajax请求并抓取今日头条街拍美图 

    配置文件config.py

    1
    2
    3
    4
    5
    6
    7
    # MongoDB host handed to pymongo.MongoClient by the spider.
    MONGO_URL = 'localhost'
    # Database name, used by the spider as client[MONGO_DB].
    MONGO_DB = 'toutiao'
    # Collection name, used by the spider as db[MONGO_TABLE].
    MONGO_TABLE = 'toutiao'
     
    # The spider crawls offsets x * 20 for x in GROUP_START..GROUP_END (inclusive).
    GROUP_START = 1
    GROUP_END = 20
    # Search keyword sent to the index request.
    KEYWORD = '街拍'

    主爬虫文件

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    import json
    import os
    from urllib.parse import urlencode
    import pymongo
    import requests
    from bs4 import BeautifulSoup
    from requests.exceptions import ConnectionError
    import re
    from multiprocessing import Pool
    from hashlib import md5
    from json.decoder import JSONDecodeError
    from config import *
     
    # Module-level MongoDB handles shared by save_to_mongo.
    client = pymongo.MongoClient(MONGO_URL, connect=False)  # connect=False because the crawl runs in multiple processes
    db = client[MONGO_DB]
     
     
    def get_page_index(offset, keyword):
        """Fetch one page of the search index.

        Args:
            offset: value sent as the ``offset`` query parameter.
            keyword: value sent as the ``keyword`` query parameter.

        Returns:
            The response text on HTTP 200, otherwise None (also on
            connection errors, after printing a message).
        """
        data = {
            'autoload': 'true',
            'count': 20,
            'cur_tab': 3,
            'format': 'json',
            'keyword': keyword,
            'offset': offset,
        }
        params = urlencode(data)  # turn the dict into a URL query string
        base = 'http://www.toutiao.com/search_content/'
        url = base + '?' + params
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except ConnectionError:
            print('Error occurred')
            return None
     
     
    def download_image(url):
        """Download one image and pass its bytes to save_image.

        Always returns None; non-200 responses and connection errors are
        ignored silently.
        """
        print('Downloading', url)
        try:
            reply = requests.get(url)
            if reply.status_code != 200:
                return None
            save_image(reply.content)
        except ConnectionError:
            pass
        return None
     
     
    def save_image(content):
        """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless that file exists.

        Naming the file after the MD5 digest of its content prevents storing
        the same image twice. The target path is printed before writing.
        """
        digest = md5(content).hexdigest()
        file_path = os.getcwd() + '/' + digest + '.jpg'
        print(file_path)
        if os.path.exists(file_path):
            return
        with open(file_path, 'wb') as f:
            f.write(content)
     
     
    def parse_page_index(text):
        """Yield the article URLs listed in an index-page JSON response.

        Args:
            text: JSON text of the index response; None, empty or invalid
                JSON yields nothing.

        Yields:
            str: the ``article_url`` of every entry under ``data`` that has one.
        """
        if not text:
            # The index fetch reports failures as None.
            return
        try:
            data = json.loads(text)  # JSON string -> dict
        except JSONDecodeError:
            return
        if not isinstance(data, dict):
            return
        # 'data' may be missing or null; treat both as an empty list.
        for item in data.get('data') or []:
            article_url = item.get('article_url')
            if article_url:
                # Entries without a URL cannot be fetched by the caller.
                yield article_url
     
     
    def get_page_detail(url):
        """Request a detail page.

        Returns:
            The response text on HTTP 200, otherwise None (also on
            connection errors, after printing a message).
        """
        try:
            reply = requests.get(url)
        except ConnectionError:
            print('Error occurred')
            return None
        if reply.status_code != 200:
            return None
        return reply.text
     
     
    def parse_page_detail(html, url):
        """Parse a detail page, download its gallery images and describe them.

        Args:
            html: HTML text of the detail page; None or empty returns None.
            url: address of the page, copied into the result.

        Returns:
            dict with 'title', 'url' and 'images' (list of image URLs), or
            None when the page carries no usable gallery data.
        """
        if not html:
            # The detail fetch reports failures as None.
            return None
        soup = BeautifulSoup(html, 'lxml')
        result = soup.select('title')
        title = result[0].get_text() if result else ''
        # The gallery is embedded in the page script as gallery: JSON.parse("...").
        images_pattern = re.compile(r'gallery: JSON\.parse\("(.*)"\)', re.S)
        result = re.search(images_pattern, html)
        if not result:
            return None
        try:
            # Strip the backslashes that escape the embedded JSON string.
            data = json.loads(result.group(1).replace('\\', ''))
        except JSONDecodeError:
            return None
        if not isinstance(data, dict) or 'sub_images' not in data:
            return None
        sub_images = data.get('sub_images') or []
        images = [item.get('url') for item in sub_images]
        for image in images:
            download_image(image)
        return {
            'title': title,
            'url': url,
            'images': images
        }
     
     
    def save_to_mongo(result):
        """Insert ``result`` into the MONGO_TABLE collection.

        Returns:
            True when the document was stored, otherwise False.
        """
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported call and exposes the new document id.
        if db[MONGO_TABLE].insert_one(result).inserted_id:
            print('Successfully Saved to Mongo', result)
            return True
        return False
     
     
    def main(offset):
        """Crawl one index page and store every gallery found on its detail pages.

        Args:
            offset: index offset passed to get_page_index.
        """
        text = get_page_index(offset, KEYWORD)
        if not text:
            # The index request failed; nothing to parse.
            return
        for url in parse_page_index(text):
            if not url:
                # Index entries may lack an article URL.
                continue
            html = get_page_detail(url)
            if not html:
                # Skip detail pages that could not be fetched.
                continue
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)
     
     
    if __name__ == '__main__':
        pool = Pool()
        # One task per index offset: 20 results per page, groups
        # GROUP_START..GROUP_END inclusive.
        groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
        pool.map(main, groups)
        pool.close()
        pool.join()

    拉勾网自动投递简历

    import requests
    import re
    
    # 1. ============================================ Authentication flow
    # NOTE(review): the account name and password hash below are hard-coded in
    # source; load them from the environment before using this outside a demo.
    # A single session carries the cookies through every step.
    session = requests.session()
    # Step 1:
    # URL: https://passport.lagou.com/login/login.html
    # Method: GET
    # Headers: only User-Agent
    
    r1 = session.get('https://passport.lagou.com/login/login.html',
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     },
                     )
    
    # The login page embeds both anti-forgery values as quoted assignments;
    # [0] raises IndexError when the page does not contain them.
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
    # print(X_Anti_Forge_Code)
    # print(X_Anti_Forge_Token)
    
    
    # Step 2:
    # 1. URL: https://passport.lagou.com/login/login.json
    # 2. Method: POST
    # 3. Headers:
    #   Referer: https://passport.lagou.com/login/login.html
    #   User-Agent:
    #   X-Anit-Forge-Code
    #   X-Anit-Forge-Token
    #   X-Requested-With
    # 4. Body:
    # isValidate: true
    # username: 1111111111
    # password: 70621c64832c4d4d66a47be6150b4a8e  (stands for the plaintext password alex3714)
    session.post('https://passport.lagou.com/login/login.json',
                 headers={
                     'Referer': 'https://passport.lagou.com/login/login.html',
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'isValidate': True,
                     'username': '18611453110',
                     'password': '70621c64832c4d4d66a47be6150b4a8e'
                 }
                 )
    
    # Step 3:
    # 1. URL: https://passport.lagou.com/grantServiceTicket/grant.html
    # 2. Method: GET
    # 3. Headers:
    #   Referer: https://passport.lagou.com/login/login.html
    #   User-Agent:
    
    session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                headers={
                    'Referer': 'https://passport.lagou.com/login/login.html',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                }
                )
    
    # Verification: fetch the resume page with the logged-in session.
    response = session.get('https://www.lagou.com/resume/myresume.html',
                           headers={
                               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                           }
                           )
    
    # The login worked if the account name shows up in the page:
    # print('18611453110' in response.text)
    
    
    
    # 2. ============================================ Fetch job listings
    # 1. URL: https://www.lagou.com/jobs/positionAjax.json
    # 2. Method: POST
    #   Query parameters:
    #     gj: 3年及以下 (three years or less)
    #     xl: 不要求 (no requirement)
    #     jd: 不需要融资 (no financing needed)
    #     hy: 移动互联网 (mobile internet)
    #     px: default
    #     yx: 15k-25k
    #     city: 全国 (nationwide)
    # 3. Headers:
    # User-Agent
    # Referer: https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
    # X-Anit-Forge-Code: 0
    # X-Anit-Forge-Token: None
    # X-Requested-With: XMLHttpRequest
    
    # 4. Body:
    # first: true
    # pn: 1
    # kd: python数据分析
    # NOTE(review): the request below sends neither this body nor the
    # X-Anit-Forge-* / X-Requested-With headers listed above — confirm intended.
    
    from urllib.parse import urlencode
    
    # urlencode produces 'kw=<percent-encoded keyword>'; keep only the encoded
    # value to build the Referer URL.
    params = {'kw': 'python数据分析'}
    res = urlencode(params).split('=')[-1]
    url = 'https://www.lagou.com/jobs/list_' + res
    # print(url)
    
    
    response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                            params={
                                # Disabled filters (see the parameter list above):
                                # 'gj': '3年及以下',
                                # 'xl': '不要求',
                                # 'jd': '不需要融资',
                                # 'hy': '移动互联网',
                                'px': 'default',
                                'yx': '15k-25k',
                                'city': '北京',
                                'district': '海淀区',
    
                            },
                            headers={
                                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                                'Referer': url,
    
                            })
    
    # print(response.status_code)
    # Raises KeyError if the JSON lacks the content/positionResult/result path.
    result = response.json()['content']['positionResult']['result']
    # Browser User-Agent sent with every request of the apply loop.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    for company_info in result:
        fullname = company_info['companyFullName']
        emp_num = company_info['companySize']
        salary = company_info['salary']
        workyear = company_info['workYear']
        positionName = company_info['positionName']
        positionId = company_info['positionId']
        detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

        print(detail_url)
        print(fullname)
        print(emp_num)
        print(salary)
        print(workyear)
        print(positionName)
        print(positionId)
        print()

        # 3. ============================================ Apply to the position
        # Step 1: GET the detail page (only a User-Agent header) to read the
        # anti-forgery values embedded in it.
        r1 = session.get(detail_url, headers={'User-Agent': user_agent})

        tokens = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)
        codes = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)
        if not tokens or not codes:
            # Without both values the POST cannot be built; skip this position
            # instead of aborting the whole run with IndexError.
            print('投递失败', detail_url)
            continue
        X_Anti_Forge_Token = tokens[0]
        X_Anti_Forge_Code = codes[0]

        # Step 2: submit the resume.
        # URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
        # Method: POST
        # Headers: User-Agent, Referer (the detail page), X-Anit-Forge-Code,
        #          X-Anit-Forge-Token, X-Requested-With
        # Body: positionId, type=1, force=True
        r2 = session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                          headers={
                              'User-Agent': user_agent,
                              'Referer': detail_url,
                              'X-Anit-Forge-Code': X_Anti_Forge_Code,
                              'X-Anit-Forge-Token': X_Anti_Forge_Token,
                              'X-Requested-With': 'XMLHttpRequest'
                          },
                          data={
                              'positionId': positionId,
                              'type': 1,
                              'force': True
                          })

        # Only report success when the server accepted the request.
        # NOTE(review): the JSON body may carry a more precise result; its
        # schema is not visible here — confirm and check it as well.
        if r2.status_code == 200:
            print('投递成功', detail_url)
        else:
            print('投递失败', detail_url)
    
    lagou
    复制代码
    import requests
    import re
    
    # 1. ============================================ Authentication flow
    # NOTE(review): the account name and password hash below are hard-coded in
    # source; load them from the environment before using this outside a demo.
    # A single session carries the cookies through every step.
    session = requests.session()
    # Step 1:
    # URL: https://passport.lagou.com/login/login.html
    # Method: GET
    # Headers: only User-Agent
    
    r1 = session.get('https://passport.lagou.com/login/login.html',
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     },
                     )
    
    # The login page embeds both anti-forgery values as quoted assignments;
    # [0] raises IndexError when the page does not contain them.
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
    # print(X_Anti_Forge_Code)
    # print(X_Anti_Forge_Token)
    
    
    # Step 2:
    # 1. URL: https://passport.lagou.com/login/login.json
    # 2. Method: POST
    # 3. Headers:
    #   Referer: https://passport.lagou.com/login/login.html
    #   User-Agent:
    #   X-Anit-Forge-Code
    #   X-Anit-Forge-Token
    #   X-Requested-With
    # 4. Body:
    # isValidate: true
    # username: 1111111111
    # password: 70621c64832c4d4d66a47be6150b4a8e  (stands for the plaintext password alex3714)
    session.post('https://passport.lagou.com/login/login.json',
                 headers={
                     'Referer': 'https://passport.lagou.com/login/login.html',
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'isValidate': True,
                     'username': '18611453110',
                     'password': '70621c64832c4d4d66a47be6150b4a8e'
                 }
                 )
    
    # Step 3:
    # 1. URL: https://passport.lagou.com/grantServiceTicket/grant.html
    # 2. Method: GET
    # 3. Headers:
    #   Referer: https://passport.lagou.com/login/login.html
    #   User-Agent:
    
    session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                headers={
                    'Referer': 'https://passport.lagou.com/login/login.html',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                }
                )
    
    # Verification: fetch the resume page with the logged-in session.
    response = session.get('https://www.lagou.com/resume/myresume.html',
                           headers={
                               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                           }
                           )
    
    # The login worked if the account name shows up in the page:
    # print('18611453110' in response.text)
    
    
    
    # 2. ============================================ Fetch job listings
    # 1. URL: https://www.lagou.com/jobs/positionAjax.json
    # 2. Method: POST
    #   Query parameters:
    #     gj: 3年及以下 (three years or less)
    #     xl: 不要求 (no requirement)
    #     jd: 不需要融资 (no financing needed)
    #     hy: 移动互联网 (mobile internet)
    #     px: default
    #     yx: 15k-25k
    #     city: 全国 (nationwide)
    # 3. Headers:
    # User-Agent
    # Referer: https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
    # X-Anit-Forge-Code: 0
    # X-Anit-Forge-Token: None
    # X-Requested-With: XMLHttpRequest
    
    # 4. Body:
    # first: true
    # pn: 1
    # kd: python数据分析
    # NOTE(review): the request below sends neither this body nor the
    # X-Anit-Forge-* / X-Requested-With headers listed above — confirm intended.
    
    from urllib.parse import urlencode
    
    # urlencode produces 'kw=<percent-encoded keyword>'; keep only the encoded
    # value to build the Referer URL.
    params = {'kw': 'python数据分析'}
    res = urlencode(params).split('=')[-1]
    url = 'https://www.lagou.com/jobs/list_' + res
    # print(url)
    
    
    response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                            params={
                                # Disabled filters (see the parameter list above):
                                # 'gj': '3年及以下',
                                # 'xl': '不要求',
                                # 'jd': '不需要融资',
                                # 'hy': '移动互联网',
                                'px': 'default',
                                'yx': '15k-25k',
                                'city': '北京',
                                'district': '海淀区',
    
                            },
                            headers={
                                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                                'Referer': url,
    
                            })
    
    # print(response.status_code)
    # Raises KeyError if the JSON lacks the content/positionResult/result path.
    result = response.json()['content']['positionResult']['result']
    # Browser User-Agent sent with every request of the apply loop.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    for company_info in result:
        fullname = company_info['companyFullName']
        emp_num = company_info['companySize']
        salary = company_info['salary']
        workyear = company_info['workYear']
        positionName = company_info['positionName']
        positionId = company_info['positionId']
        detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

        print(detail_url)
        print(fullname)
        print(emp_num)
        print(salary)
        print(workyear)
        print(positionName)
        print(positionId)
        print()

        # 3. ============================================ Apply to the position
        # Step 1: GET the detail page (only a User-Agent header) to read the
        # anti-forgery values embedded in it.
        r1 = session.get(detail_url, headers={'User-Agent': user_agent})

        tokens = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)
        codes = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)
        if not tokens or not codes:
            # Without both values the POST cannot be built; skip this position
            # instead of aborting the whole run with IndexError.
            print('投递失败', detail_url)
            continue
        X_Anti_Forge_Token = tokens[0]
        X_Anti_Forge_Code = codes[0]

        # Step 2: submit the resume.
        # URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
        # Method: POST
        # Headers: User-Agent, Referer (the detail page), X-Anit-Forge-Code,
        #          X-Anit-Forge-Token, X-Requested-With
        # Body: positionId, type=1, force=True
        r2 = session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                          headers={
                              'User-Agent': user_agent,
                              'Referer': detail_url,
                              'X-Anit-Forge-Code': X_Anti_Forge_Code,
                              'X-Anit-Forge-Token': X_Anti_Forge_Token,
                              'X-Requested-With': 'XMLHttpRequest'
                          },
                          data={
                              'positionId': positionId,
                              'type': 1,
                              'force': True
                          })

        # Only report success when the server accepted the request.
        # NOTE(review): the JSON body may carry a more precise result; its
        # schema is not visible here — confirm and check it as well.
        if r2.status_code == 200:
            print('投递成功', detail_url)
        else:
            print('投递失败', detail_url)
    import requests
    import re
    
    # 1. ============================================ Authentication flow
    # NOTE(review): the account name and password hash below are hard-coded in
    # source; load them from the environment before using this outside a demo.
    # A single session carries the cookies through every step.
    session = requests.session()
    # Step 1:
    # URL: https://passport.lagou.com/login/login.html
    # Method: GET
    # Headers: only User-Agent
    
    r1 = session.get('https://passport.lagou.com/login/login.html',
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     },
                     )
    
    # The login page embeds both anti-forgery values as quoted assignments;
    # [0] raises IndexError when the page does not contain them.
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
    # print(X_Anti_Forge_Code)
    # print(X_Anti_Forge_Token)
    
    
    # Step 2:
    # 1. URL: https://passport.lagou.com/login/login.json
    # 2. Method: POST
    # 3. Headers:
    #   Referer: https://passport.lagou.com/login/login.html
    #   User-Agent:
    #   X-Anit-Forge-Code
    #   X-Anit-Forge-Token
    #   X-Requested-With
    # 4. Body:
    # isValidate: true
    # username: 1111111111
    # password: 70621c64832c4d4d66a47be6150b4a8e  (stands for the plaintext password alex3714)
    session.post('https://passport.lagou.com/login/login.json',
                 headers={
                     'Referer': 'https://passport.lagou.com/login/login.html',
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'isValidate': True,
                     'username': '18611453110',
                     'password': '70621c64832c4d4d66a47be6150b4a8e'
                 }
                 )
    
    # Step 3:
    # 1. URL: https://passport.lagou.com/grantServiceTicket/grant.html
    # 2. Method: GET
    # 3. Headers:
    #   Referer: https://passport.lagou.com/login/login.html
    #   User-Agent:
    
    session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                headers={
                    'Referer': 'https://passport.lagou.com/login/login.html',
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                }
                )
    
    # Verification: fetch the resume page with the logged-in session.
    response = session.get('https://www.lagou.com/resume/myresume.html',
                           headers={
                               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                           }
                           )
    
    # The login worked if the account name shows up in the page:
    # print('18611453110' in response.text)
    
    
    
    # 2. ============================================ Fetch job listings
    # 1. URL: https://www.lagou.com/jobs/positionAjax.json
    # 2. Method: POST
    #   Query parameters:
    #     gj: 3年及以下 (three years or less)
    #     xl: 不要求 (no requirement)
    #     jd: 不需要融资 (no financing needed)
    #     hy: 移动互联网 (mobile internet)
    #     px: default
    #     yx: 15k-25k
    #     city: 全国 (nationwide)
    # 3. Headers:
    # User-Agent
    # Referer: https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
    # X-Anit-Forge-Code: 0
    # X-Anit-Forge-Token: None
    # X-Requested-With: XMLHttpRequest
    
    # 4. Body:
    # first: true
    # pn: 1
    # kd: python数据分析
    # NOTE(review): the request below sends neither this body nor the
    # X-Anit-Forge-* / X-Requested-With headers listed above — confirm intended.
    
    from urllib.parse import urlencode
    
    # urlencode produces 'kw=<percent-encoded keyword>'; keep only the encoded
    # value to build the Referer URL.
    params = {'kw': 'python数据分析'}
    res = urlencode(params).split('=')[-1]
    url = 'https://www.lagou.com/jobs/list_' + res
    # print(url)
    
    
    response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                            params={
                                # Disabled filters (see the parameter list above):
                                # 'gj': '3年及以下',
                                # 'xl': '不要求',
                                # 'jd': '不需要融资',
                                # 'hy': '移动互联网',
                                'px': 'default',
                                'yx': '15k-25k',
                                'city': '北京',
                                'district': '海淀区',
    
                            },
                            headers={
                                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                                'Referer': url,
    
                            })
    
    # print(response.status_code)
    # Raises KeyError if the JSON lacks the content/positionResult/result path.
    result = response.json()['content']['positionResult']['result']
    # Browser User-Agent sent with every request of the apply loop.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
    for company_info in result:
        fullname = company_info['companyFullName']
        emp_num = company_info['companySize']
        salary = company_info['salary']
        workyear = company_info['workYear']
        positionName = company_info['positionName']
        positionId = company_info['positionId']
        detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

        print(detail_url)
        print(fullname)
        print(emp_num)
        print(salary)
        print(workyear)
        print(positionName)
        print(positionId)
        print()

        # 3. ============================================ Apply to the position
        # Step 1: GET the detail page (only a User-Agent header) to read the
        # anti-forgery values embedded in it.
        r1 = session.get(detail_url, headers={'User-Agent': user_agent})

        tokens = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)
        codes = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)
        if not tokens or not codes:
            # Without both values the POST cannot be built; skip this position
            # instead of aborting the whole run with IndexError.
            print('投递失败', detail_url)
            continue
        X_Anti_Forge_Token = tokens[0]
        X_Anti_Forge_Code = codes[0]

        # Step 2: submit the resume.
        # URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
        # Method: POST
        # Headers: User-Agent, Referer (the detail page), X-Anit-Forge-Code,
        #          X-Anit-Forge-Token, X-Requested-With
        # Body: positionId, type=1, force=True
        r2 = session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                          headers={
                              'User-Agent': user_agent,
                              'Referer': detail_url,
                              'X-Anit-Forge-Code': X_Anti_Forge_Code,
                              'X-Anit-Forge-Token': X_Anti_Forge_Token,
                              'X-Requested-With': 'XMLHttpRequest'
                          },
                          data={
                              'positionId': positionId,
                              'type': 1,
                              'force': True
                          })

        # Only report success when the server accepted the request.
        # NOTE(review): the JSON body may carry a more precise result; its
        # schema is not visible here — confirm and check it as well.
        if r2.status_code == 200:
            print('投递成功', detail_url)
        else:
            print('投递失败', detail_url)
    
    lagou
  • 相关阅读:
    Keras -- 词向量 CNN
    KUDU实时分析
    Keras
    Kalfka
    Linux云
    管理KUDU
    列存储与行存储
    算法笔记 第5章 入门篇(3) --数学问题 学习笔记
    算法笔记 上机训练实战指南 第4章 入门篇(2) --算法初步 4.6two pointers 学习笔记
    算法笔记 上机训练实战指南 第4章 入门篇(2) --算法初步 4.4贪心 学习笔记
  • 原文地址:https://www.cnblogs.com/yunlongaimeng/p/9802151.html
Copyright © 2011-2022 走看看