zoukankan      html  css  js  c++  java
  • 拉勾网数据抓取

    import json
    import re
    import time
    
    import requests
    import multiprocessing
    
    
    class HandleLaGou():
        """Scraper for Python job postings on lagou.com, walked city by city.

        One ``requests.Session`` is shared by every request so that cookies set
        by the search page are sent along with the follow-up JSON requests.
        """

        def __init__(self):
            # Use a session so cookies are kept between requests.
            self.lagou_session = requests.Session()
            self.header = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
            }
            # Filled in by handle_city(); a list, matching what findall() stores.
            self.city_list = []

        def handle_city(self):
            """Fetch the all-cities page and store the city names in ``self.city_list``."""
            city_search = re.compile(r'zhaopin/">(.*?)</a>')
            city_url = 'https://www.lagou.com/jobs/allCity.html'
            city_result = self.handle_request(method="GET", url=city_url)
            # Pull the city names out of the page with the regular expression.
            self.city_list = city_search.findall(city_result)
            # Discard the cookies obtained while loading the city list.
            self.lagou_session.cookies.clear()

        def handle_city_job(self, city):
            """Print every job returned for the keyword "python" in ``city``.

            Loads the search page to read the total page count, then POSTs to
            the JSON endpoint once per page and prints each job dict, followed
            by the page count.

            Args:
                city: City name as it appears in ``self.city_list``.

            Returns:
                None. Returns early, printing nothing, when the search page
                contains no page-count element (no postings for the city).
            """
            first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % city
            first_response = self.handle_request(method="GET", url=first_request_url)
            # The escapes matter: \s matches the space inside the class
            # attribute and \d+ the digits of the page count.
            total_page_search = re.compile(r'class="span\stotalNum">(\d+)</span>')
            total_page_match = total_page_search.search(first_response)
            if total_page_match is None:
                # No page counter on the page: there are no postings to fetch.
                return
            total_page = total_page_match.group(1)

            page_url = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=%s&needAddtionalResult=false" % city
            referer_url = 'https://www.lagou.com/jobs/list_python?&px=default&city=%s' % city
            # The Referer has to be sent as encoded bytes because the city name
            # may contain non-ASCII characters.
            self.header['Referer'] = referer_url.encode()

            for i in range(1, int(total_page) + 1):
                data = {
                    "pn": i,
                    "kd": "python"
                }
                response = self.handle_request("POST", page_url, data=data, info=city)
                lagou_data = json.loads(response)
                job_list = lagou_data['content']['positionResult']['result']
                for job in job_list:
                    print(job)
            print(total_page)

        def handle_request(self, method, url, data=None, info=None):
            """Send a request through the proxy, retrying until a usable body arrives.

            A request is retried (after refreshing cookies and sleeping 10
            seconds) when it raises a ``requests`` error or when the response
            text contains '频繁' (presumably the site's "too frequent" notice).

            Args:
                method: "GET" or "POST".
                url: Target URL.
                data: Form data for POST requests.
                info: City name used to rebuild cookies before a retry; when
                    None the cookies are only cleared.

            Returns:
                The response body decoded as UTF-8 text.

            Raises:
                ValueError: If ``method`` is neither "GET" nor "POST".
            """
            if method not in ("GET", "POST"):
                raise ValueError("unsupported HTTP method: %r" % (method,))

            # Route the traffic through the Abuyun proxy (placeholder credentials).
            proxyinfo = "http://%s:%s@%s:%s"%('阿布云账号','阿布云密码','阿布云host','阿布云port')
            proxy = {
                "http": proxyinfo,
                "https": proxyinfo
            }
            while True:
                try:
                    if method == "GET":
                        response = self.lagou_session.get(url=url, headers=self.header, proxies=proxy, timeout=6)
                    else:
                        response = self.lagou_session.post(url=url, headers=self.header, data=data, proxies=proxy, timeout=6)
                except requests.RequestException:
                    # Network/proxy failure: rebuild cookies, back off, retry.
                    self._refresh_cookies(info, proxy)
                    time.sleep(10)
                    continue
                response.encoding = 'utf-8'
                if '频繁' in response.text:
                    print("频繁")
                    # Clear the cookies and fetch fresh ones before retrying.
                    self._refresh_cookies(info, proxy)
                    time.sleep(10)
                    continue
                return response.text

        def _refresh_cookies(self, info, proxy):
            """Clear the session cookies and, if a city is known, reload its search page.

            The reload is a single best-effort GET rather than a call back into
            handle_request(), so a dead network cannot cause unbounded recursion;
            the caller's retry loop handles repeated failures.
            """
            self.lagou_session.cookies.clear()
            if info is None:
                return
            first_request_url = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput=" % info
            try:
                self.lagou_session.get(url=first_request_url, headers=self.header, proxies=proxy, timeout=6)
            except requests.RequestException:
                # Ignored on purpose: the caller sleeps and retries anyway.
                pass
    
    
    if __name__ == '__main__':
        # Script entry point: collect the city list, then scrape each city in
        # a small pool of worker processes.
        lagou = HandleLaGou()
        # Fetch every city into lagou.city_list.
        lagou.handle_city()
        # Use multiple processes to speed up the crawl.
        pool = multiprocessing.Pool(2)

        for city in lagou.city_list:
            # Without an error_callback an exception raised inside a worker is
            # stored on the discarded AsyncResult and never seen; report it.
            pool.apply_async(
                lagou.handle_city_job,
                args=(city,),
                error_callback=lambda exc, city=city: print("crawl failed for %s: %r" % (city, exc)),
            )

        pool.close()
        pool.join()
  • 相关阅读:
    如何阅读一个Web项目 【转载】
    线程的状态与基本操作
    java多线程通信方式之一:wait/notify
    synchronized的简单理解
    每月IT摘录201807
    springmvc 请求无法到达controller,出现404
    android开发 java与c# 兼容AES加密
    android 开发不能创建目录
    mysql存储过程出现OUT or INOUT argument 10 for routine
    android退出登陆后,清空之前所有的activity,进入登陆主界面
  • 原文地址:https://www.cnblogs.com/Erick-L/p/11348119.html
Copyright © 2011-2022 走看看