zoukankan      html  css  js  c++  java
  • 爬取猎聘大数据岗位相关信息--Python

    在猎聘网站搜索“大数据”关键字,结果最多只能显示100页;本脚本爬取这100页中的职位相关信息,以便后续做分析。

    __author__ = 'Fred Zhao'
    
    import requests
    from bs4 import BeautifulSoup
    import os
    import csv
    
    class JobSearch:
        """Scrape Liepin search-result pages and append the job rows to a CSV file.

        Each result page is fetched with ``requests``, parsed with
        ``BeautifulSoup`` and reduced to rows of
        ``[position, job_condition, job_time, company]`` which are appended to
        ``job.csv`` inside a ``job_data`` directory next to this script.
        """

        # Seconds to wait for the server before giving up on a request, so a
        # single stalled connection cannot hang the whole crawl.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
            # The page number is appended to this URL (see get_detail).
            self.base_url = 'https://www.liepin.com/zhaopin/?ckid=c1a868fa8b83aa5b&fromSearchBtn=2&init=-1&sfrom=click-pc_homepage-centre_searchbox-search_new&degradeFlag=0&key=大数据&headckid=c1a868fa8b83aa5b&d_pageSize=40&siTag=LGV-fc5u_67LtFjetF6ACg~fA9rXquZc5IkJpXC-Ycixw&d_headId=8e7325814e7ed9919787ee3fe85e1c94&d_ckId=8e7325814e7ed9919787ee3fe85e1c94&d_sfrom=search_fp&d_curPage=99&curPage='
            # Must be absolute: makedir() changes the working directory, so a
            # relative (or empty) dirname would resolve differently on the next
            # call and create job_data/job_data/... nested directories.
            self.base_path = os.path.dirname(os.path.abspath(__file__))

        def makedir(self, name):
            """Ensure ``<base_path>/<name>`` exists and make it the working directory.

            Args:
                name: Directory name, relative to ``self.base_path``.
            """
            path = os.path.join(self.base_path, name)
            if os.path.exists(path):
                print('OK!The file is existed. You do not need create a new one.')
            else:
                # exist_ok guards against the directory appearing between the
                # check above and the creation here.
                os.makedirs(path, exist_ok=True)
                print("File has been created.")
            os.chdir(path)

        def request(self, url):
            """Issue a GET request for *url* with the configured headers.

            Args:
                url: Absolute URL to fetch.

            Returns:
                The ``requests.Response`` object.
            """
            return requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)

        @staticmethod
        def _parse_item(item):
            """Extract one job row from a result ``<li>`` element.

            Returns:
                ``[position, job_condition, job_time, company]``, or ``None``
                when the element does not have the expected job-card structure.
            """
            main = item.find('div', class_='sojob-item-main clearfix')
            if main is None:
                return None
            try:
                job_info = main.find('div', class_='job-info')
                position = job_info.find('h3').get('title')
                paragraphs = job_info.find_all('p')
                job_condition = paragraphs[0].get('title')
                job_time = paragraphs[1].find('time').get('title')
                company_info = main.find('div', class_='company-info')
                company = company_info.find(
                    'p', class_='company-name').find('a').get('title')
            except (AttributeError, IndexError):
                # A find() returned None or a <p> was missing: this <li> is
                # not a complete job card, so it is skipped instead of
                # aborting the whole page.
                return None
            return [position, job_condition, job_time, company]

        def get_detail(self, page):
            """Fetch one result page, print its job rows and append them to the CSV.

            Args:
                page: Page number to append to ``self.base_url``; a string or
                    anything ``str()`` can convert (e.g. an int).
            """
            r = self.request(self.base_url + str(page))
            ul = BeautifulSoup(r.text, 'lxml').find('ul', class_='sojob-list')
            if ul is None:
                # No result list in the response; nothing to save for this page.
                print('No job list found on page %s, skipped.' % page)
                return
            self.makedir('job_data')
            rows = []
            for item in ul.find_all('li'):
                row = self._parse_item(item)
                if row is None:
                    continue
                for field in row:
                    print(field)
                rows.append(row)
            self.save_to_csv(rows)

        def save_to_csv(self, rows):
            """Append *rows* to ``job.csv`` in the current working directory.

            Args:
                rows: Iterable of row sequences to write.
            """
            # newline='' is required by the csv module (otherwise blank lines
            # appear between rows on Windows); utf-8 keeps Chinese text intact
            # regardless of the platform's default encoding.
            with open('job.csv', 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerows(rows)
    
    if __name__ == '__main__':
        # Walk result pages 0..99 in order, passing each page number to
        # get_detail() as a string.
        searcher = JobSearch()
        for page_no in map(str, range(100)):
            searcher.get_detail(page_no)
    
  • 相关阅读:
    NetworkInterface网速监测
    动态编译
    JSON C# Class Generator
    Cookie中的HttpOnly
    webapi session
    没有为扩展名“.html”注册的生成提供程序
    转 C# 使用openssl
    openssl jia adress
    扩展JS
    bootstrap 模态
  • 原文地址:https://www.cnblogs.com/fredkeke/p/9409560.html
Copyright © 2011-2022 走看看