  • Scraping Liepin job listings with requests + bs4

    Purpose:

    Practice writing a crawler.

    Crawling tips:

    1. When you first pick up the bs4 library, the official tutorial focuses on the find and find_all methods.

    2. If you already know CSS selector or XPath syntax, those two are actually more flexible; a short comparison sketch follows this list.
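    To illustrate point 2, here is a minimal sketch comparing the two query styles on a made-up HTML fragment (not Liepin's real markup):

    from bs4 import BeautifulSoup

    # hypothetical fragment, only for comparing the two query styles
    html = '<ul class="sojob-list"><li><h3><a href="/a/1.shtml">Job A</a></h3></li></ul>'
    soup = BeautifulSoup(html, 'html.parser')

    # find/find_all style: one call per level, attributes as keyword arguments
    links = soup.find('ul', class_='sojob-list').find_all('a')

    # CSS-selector style: the whole path in a single expression
    links_css = soup.select('ul.sojob-list li h3 > a')

    print([a.get('href') for a in links])      # ['/a/1.shtml']
    print([a.get('href') for a in links_css])  # ['/a/1.shtml']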

    Code:

    import requests
    import time
    from bs4 import BeautifulSoup
    from pprint import pprint
    from openpyxl import Workbook
    
    
    class LiePin(object):
    
        def __init__(self,job):
            self.url = "https://www.liepin.com/zhaopin/?"
            self.job = job
            self.dict_list = []
    
    
        def get_html(self,url,data=None):
            """
            Fetch a page. The first search page needs the query keyword passed in as params.
            """
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'
            }
    
            try:
                response = requests.get(url, params=data, headers=headers)
                print(response.url)
                if response.status_code == 200:
                    return response.content
            except Exception:
                print("Request failed, retrying")
                # retry with the same query parameters and propagate the result
                return self.get_html(url, data)
                
    
        def get_soup(self,html):
            try:
                soup = BeautifulSoup(html, 'lxml')
            except:
                soup = BeautifulSoup(html, 'html.parser')
            
            return soup
    
    
        def handle_one_page(self,soup):
            job_result = soup.select('.sojob-result ul.sojob-list')[0]   # soup.select() returns a list
            job_lists = job_result.select('li')
    
            for each_item in job_lists:
                print('*'*100)
                self.dict_list.append(self.handle_one_item(each_item))
                
    
        def handle_one_item(self,each_item):
            
            # Job-related fields
            jobname = each_item.select('div[class="job-info"] h3')[0].get_text().strip()
            job_href = each_item.select('div[class="job-info"] h3>a')[0].get('href')
            # Some hrefs are relative, e.g. /a/20486217.shtml; in that case prepend https://www.liepin.com
            if job_href.startswith('/a'):
                job_href = 'https://www.liepin.com' + job_href
            ability_info = self.get_detail_ability_info(job_href)
            job_info = each_item.select('div[class="job-info"]>p')[0].get_text().strip().split("\n")
            money = job_info[0]
            site = job_info[1]
            edu = job_info[2]
            exp = job_info[3]
            date = each_item.select('div[class="job-info"] .time-info time')[0].get_text()
    
            # Company-related fields
            company_info = each_item.select('div[class="company-info nohover"]>p')
            company_name = company_info[0].select('a')[0].get_text().strip()
            # company_class = company_info[1].select('span')[0].get_text().strip()
            company_class = company_info[1].get_text().strip()
            company_welfare = [item.get_text().strip() for item in company_info[-1].select('span') if item ]
            company_welfare = " ".join(company_welfare)
    
            job_info_dict = {
                'Job title': jobname,
                'Salary': money,
                'Location': site,
                'Education': edu,
                'Experience': exp,
                'Company name': company_name,
                'Company type': company_class,
                'Company benefits': company_welfare,
                'Detail link': job_href,
                'Requirements': ability_info,
                'Posting date': date
            }
    
            pprint(job_info_dict)
            return job_info_dict
    
    
        def get_detail_ability_info(self,url):
    """获取工作职能要求""" html
    = self.get_html(url) soup = self.get_soup(html) info_str = soup.select('.job-description>div')[0].get_text() return info_str def is_next_page(self,soup): print('33[32m================== 开始打印下一页 ======================33[0m') next_href = soup.select('.pagerbar>a')[-2].get('href') if next_href.startswith('/zhaopin'): next_url = 'https://www.liepin.com' + next_href return next_url print('达到最后一页了') return None def start(self): """ 先解析第一页,在解析后续页面 """ data = { 'key': self.job } firt_html = self.get_html(self.url, data) soup = self.get_soup(firt_html) self.handle_one_page(soup) while self.is_next_page(soup): url = self.is_next_page(soup) html = self.get_html(url) soup = self.get_soup(html) self.handle_one_page(soup) time.sleep(5) self.out_csv() def out_csv(self): wb = Workbook() ws = wb.create_sheet('招聘信息', index=0) for item_dict in self.dict_list: values = [value for value in item_dict.values()] ws.append(values) wb.save('猎聘_' + self.job + '.xlsx' ) if __name__ == "__main__": demo = LiePin('生物信息') demo.start()
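    One thing to note about out_csv: it appends only the dict values, so the saved .xlsx has no header row. A minimal standalone sketch (with made-up rows, not tied to the class above) of writing the keys as a header first:

    from openpyxl import Workbook

    # hypothetical rows shaped like the dicts collected in dict_list
    rows = [{'Job title': 'Engineer', 'Salary': '15-25k'}]

    wb = Workbook()
    ws = wb.create_sheet('job listings', index=0)
    ws.append(list(rows[0].keys()))       # header row taken from the dict keys
    for row in rows:
        ws.append(list(row.values()))     # one data row per posting
    wb.save('demo.xlsx')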
  • Original post: https://www.cnblogs.com/lmt921108/p/13025504.html