  • Scraping job listings from a recruitment site with XPath

    Scraping job listings from the recruitment site (Lagou's mobile site):

    import datetime
    import json
    import random
    import time

    import pandas as pd
    import pymongo
    import requests
    from lxml import etree
    
    # Compute the time-dependent values used in the cookies below
    now = datetime.datetime.now()
    timeStamp = int(now.timestamp() * 1000)
    geshi = "%Y%m%d%H%M%S"
    time1 = datetime.datetime.strftime(now, geshi)
    
    
    # Set up MongoDB
    client = pymongo.MongoClient('localhost')
    # Database name
    db = client['lagou']
    # Collection names
    data_name = 'lagouData'
    detail = 'detailData'
    
    # Constants
    CITY = '广州'
    # Position name to search for
    POSITION_NAME = '数据挖掘'
    # Total number of index pages to crawl
    PAGE_SUM = 10
    # Number of positions returned per page
    PAGE_SIZE = 15
    
    # span[position()>3] selects the spans after the third one under the p tag:
    # //dd[@class='job_request']/p/span[position()>3]
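    # A hedged illustration (the markup is invented for the example, not taken from the real page):
    # given <p><span>a</span><span>b</span><span>c</span><span>d</span></p>,
    # //p/span[position()>3]/text() returns ['d'], i.e. only the fourth span onwards.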
    
    # URL template for the index (search) page
    index_url = 'https://m.lagou.com/search.json?city={}&positionName={}&pageNo={}&pageSize={}'
    # URL template for the detail page
    detail_url = 'https://m.lagou.com/jobs/{}.html'
    # Pool of mobile User-Agent strings to rotate through
    user_agents = [
        "Mozilla/5.0 (iPhone 84; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.8.0 Mobile/14G60 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1",
        "Mozilla/5.0 (Linux; Android 7.0; STF-AL10 Build/HUAWEISTF-AL10; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043508 Safari/537.36 V1_AND_SQ_7.2.0_730_YYB_D QQ/7.2.0.3270 NetType/4G WebP/0.3.0 Pixel/1080",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) Mobile/14G60 MicroMessenger/6.5.18 NetType/WIFI Language/en",
        "Mozilla/5.0 (Linux; Android 5.1.1; vivo Xplay5A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/48.0.2564.116 Mobile Safari/537.36 T7/9.3 baiduboxapp/9.3.0.10 (Baidu; P1 5.1.1)",
        "Mozilla/5.0 (Linux; U; Android 7.0; zh-cn; STF-AL00 Build/HUAWEISTF-AL00) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.9 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 6.0; LEX626 Build/HEXCNFN5902606111S) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/35.0.1916.138 Mobile Safari/537.36 T7/7.4 baiduboxapp/8.3.1 (Baidu; P1 6.0)",
        "Mozilla/5.0 (iPhone 92; CPU iPhone OS 10_3_2 like Mac OS X) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.7.2 Mobile/14F89 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1",
        "Mozilla/5.0 (Linux; U; Android 7.0; zh-CN; ZUK Z2121 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.6.8.952 Mobile Safari/537.36"]
    
    
    # Crawl the index (search result) pages
    def index_fn():
        user_agent = random.choice(user_agents)
        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "m.lagou.com",
            # The time-dependent cookie fields are filled in from timeStamp and time1
            "Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(
                timeStamp=timeStamp, time=time1),
            "Referer": "https://m.lagou.com/search.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        # pageNo on m.lagou.com appears to be 1-based, so start from page 1
        for i in range(1, PAGE_SUM + 1):
            # A sample free proxy; requests expects lowercase scheme keys and full proxy URLs,
            # and the index URL is https, so the 'https' entry is the one that matters here
            proxies = {'http': 'http://171.37.164.78:8123',
                       'https': 'http://171.37.164.78:8123'}
            response = requests.get(index_url.format(CITY, POSITION_NAME, i, PAGE_SIZE),
                                    headers=headers, proxies=proxies).content.decode()
            content = json.loads(response)
            # print('content', content)
            if content:
                try:
                    result = content['content']['data']['page']['result']
                    for item in result:
                        data = {
                            'positionId': item['positionId'],
                            'positionName': item['positionName'],
                            'city': item['city'],
                            'createTime': item['createTime'],
                            # salary is assumed to be present in the search.json result;
                            # it fills the salary column of the CSV below
                            'salary': item['salary'],
                            'companyId': item['companyId'],
                            'companyLogo': item['companyLogo'],
                            'companyName': item['companyName'],
                            'companyFullName': item['companyFullName'],
                        }
                        time.sleep(0.5)
                        # db['lagouData'].insert(data)
                        yield data
                except Exception as e:
                    print('Error while parsing an index page:', e)
            else:
                time.sleep(10)
                print('Empty response; waiting before the next request')
    
    
    # Crawl a job's detail page
    def detail_d(positionId):
        # Pick a random User-Agent from the pool
        user_agent = random.choice(user_agents)
        headers = {
            "Accept": "application/json",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "m.lagou.com",
            # The time-dependent cookie fields are filled in from timeStamp and time1
            "Cookie": "_ga=GA1.2.841469794.1541152606; user_trace_token=20181102175657-a2701865-de85-11e8-8368-525400f775ce; LGUID=20181102175657-a2701fbd-de85-11e8-8368-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; _gid=GA1.2.311675459.1542615716; _ga=GA1.3.841469794.1541152606; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542634073,1542634080,1542634122,1542634128; JSESSIONID=ABAAABAAAGCABCC1B87E5C12282CECED77A736D4CD7FA8A; X_HTTP_TOKEN=aae2d9e96d6a68f72d98ab409a933460; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%2C%22%24device_id%22%3A%221672c5c65c01c7-0e8e56366a6cce-3a3a5c0e-2073600-1672c5c65c3bf%22%7D; sajssdk_2015_cross_new_user=1; _gat=1; LGSID=20181119231628-167f7db1-ec0e-11e8-a76a-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fm.lagou.com%2Fsearch.html; PRE_LAND=https%3A%2F%2Fm.lagou.com%2Fjobs%2F5219979.html; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6={timeStamp}; LGRID={time}-1c458fde-ec0e-11e8-895f-5254005c3644".format(
                timeStamp=timeStamp, time=time1),
            "Referer": "https://m.lagou.com/search.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }
        response = requests.get(detail_url.format(positionId), headers=headers).content.decode()
    
        xml = etree.HTML(response)
        title = xml.xpath('''//div[@class='postitle']/h2/text()''')
        job_details = xml.xpath('''//div[@class='detail']/div[1]//span/span/text()''')
        job_detail = str(job_details).replace(r'\n', '').replace(' ', '')
        work_detail = xml.xpath('''//div[@class='content']//p/text()''')
        # company_img reuses the work-description XPath; point it at the logo's
        # own <img> node if the company image is actually needed
        company_img = xml.xpath('''//div[@class='content']//p/text()''')
        company_infors = xml.xpath(
            '''//div[@class='company activeable']/div/div/h2/text()|//div[@class='dleft']/p/text()''')
        company_infor = str(company_infors).strip().replace(r'\n', '').replace(' ', '')
        detail_datas = {
            'title': title,
            'job_detail': job_detail,
            'work_detail': work_detail,
            'company_img': company_img,
            'company_infor': company_infor
        }
        return detail_datas
    
    
    # Save one position's index data and detail data to MongoDB
    def save_to_mongodb(data, detail_datas, positionId):
        # insert_one is the current pymongo API (insert() was removed in pymongo 4)
        db[data_name].insert_one(data)
        db[detail].insert_one(detail_datas)
        print('Saved to MongoDB:', positionId)
    
    
    # Save the index data as a CSV file
    def save_to_csv():
        item_list = []
        for item in index_fn():
            item_list.append(item)
            print('index item', item)
        # item_list is a list of dicts such as
        # {'positionId': 4102483, 'positionName': '数据挖掘工程师', 'city': '广州', ...}
        datas = pd.DataFrame(item_list, columns=["positionId", "positionName", "city", "createTime", "salary", "companyId",
                                                 "companyLogo", "companyName", "companyFullName"])

        datas.to_csv('./static/lagou.csv')
        print('Saved CSV file')
    
    
    def run():
        # Crawl the index pages once and save them as a CSV file
        save_to_csv()
        # Crawl again, fetch each position's detail page and store both records in MongoDB
        for item in index_fn():
            positionId = item['positionId']
            print('positionId', positionId)
            # Call the detail-page function
            detail_datas = detail_d(positionId)
            # Save this position's index data and detail data to MongoDB
            save_to_mongodb(item, detail_datas, positionId)
    
    
    if __name__ == '__main__':
        run()

    Measures against anti-crawler defenses:

    1. Rotating the User-Agent

    2. Using different IP proxies

    3. Varying the time-dependent values in the cookie (a combined sketch follows this list)
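
    A minimal sketch combining the three measures in one request helper (hypothetical code, not part of the original script; the cookie field names and the proxy list are placeholders):

    import datetime
    import random

    import requests

    def fetch_with_rotation(url, user_agents, proxies_pool):
        # 1. rotate the User-Agent
        headers = {"User-Agent": random.choice(user_agents)}
        # 3. regenerate the time-dependent cookie values on every call
        now = datetime.datetime.now()
        headers["Cookie"] = "Hm_lpvt_placeholder={ts}; LGRID={t}-placeholder".format(
            ts=int(now.timestamp() * 1000),
            t=now.strftime("%Y%m%d%H%M%S"))
        # 2. pick a random proxy, e.g. {'https': 'http://171.37.164.78:8123'}
        proxy = random.choice(proxies_pool)
        return requests.get(url, headers=headers, proxies=proxy, timeout=10)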

    The following code fetches free IP proxies from the web and verifies whether each one is usable:

    import telnetlib
    import time

    import requests
    from lxml import etree

    def get_ip():
        headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        url = "http://www.xicidaili.com/nn/"
        res = requests.get(url,headers=headers)
        content = res.content.decode()
    
        # Use lxml's etree so the HTML can be queried with XPath
        xml = etree.HTML(content)
        # XPath expressions used below:
        # //tr[@class='odd']//td[2]//text()  -> IP address
        # //tr[@class='odd']//td[3]//text()  -> port
        # //tr[@class='odd']//td[6]//text()  -> type (HTTP/HTTPS)
        ip_list = xml.xpath("//tr[@class='odd']//td[2]//text()")
        port_list = xml.xpath("//tr[@class='odd']//td[3]//text()")
        type_list = xml.xpath("//tr[@class='odd']//td[6]//text()")
        if len(ip_list) != 0:
            for ip, port, ip_type in zip(ip_list, port_list, type_list):
                # requests expects lowercase scheme keys and full proxy URLs
                proxies = {
                    ip_type.lower(): "{}://{}:{}".format(ip_type.lower(), ip, port)
                }
                try:
                    # Quick reachability check (telnetlib was removed in Python 3.13;
                    # socket.create_connection works there instead)
                    telnetlib.Telnet(ip, port=port, timeout=10)
                except Exception:
                    print("Proxy not usable: {}".format(proxies))
                else:
                    print("Proxy usable: {}".format(proxies))
                    yield proxies
            # Recurse with yield from so the new generator is actually consumed
            yield from get_ip()
        else:
            time.sleep(1)
            yield from get_ip()
    
    
    
    
    
    if __name__ == '__main__':
        for proxy in get_ip():
            print('Got a usable proxy:', proxy)
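
    A hedged sketch of plugging these verified proxies into a crawler (get_with_proxy is an invented helper; it assumes the get_ip generator above and a url/headers pair like the ones in the main script):

    import requests

    def get_with_proxy(url, headers):
        # Try each verified proxy until one request succeeds
        for proxy in get_ip():
            try:
                return requests.get(url, headers=headers, proxies=proxy, timeout=10)
            except requests.RequestException:
                # The proxy may have died between the telnet check and this request
                continue
        return None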