  • Scraping job-site data with Python

    # -*- coding: utf-8 -*-
    
    # Crawler: scrapes job listings
    from bs4 import BeautifulSoup
    from lxml import etree
    from selenium import webdriver
    import time
    from pymongo import MongoClient
    
    
    class WorkSpider:
        def __init__(self):
            self.client = MongoClient('mongodb://localhost:27017/')
            self.zfdb = self.client.zfdb
            # self.zfdb.authenticate("mongodbUser", "yourPassword")  # uncomment if auth is enabled
    
        # List of cities to crawl
        def getCity(self):
            return [
                "全国",
                "北京",
                "郑州",
                #"上海",
                #"深圳",
                #"广州",
            ]
    
        # List of languages/keywords to crawl
        def getLanguage(self):
            return [
                 "Java",
                 "Python",
                # "C",
                # "机器学习",
                # "图像识别",
                # "自然语言处理",
                # "区块链",
                # "精准推荐",
                # "Node.js",
                # "Go",
                # "Hadoop",
                # "Php",
                # ".NET",
                # "Android",
                # "iOS",
                # "web前端",
            ]
    
        # Observation: Lagou's URL varies with language and city as follows
        def getUrl(self, language, city):
            url = "https://www.lagou.com/jobs/list_" + language + "?px=default&city=" + city
            return url
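            # Illustrative example (values assumed): getUrl("Python", "北京") yields
            #   https://www.lagou.com/jobs/list_Python?px=default&city=北京
            # The browser percent-encodes the Chinese city name when navigating; if the
            # URL is ever fetched directly (e.g. with requests), quote it first with
            # urllib.parse.quote.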
    
        # For one city, build the URL list covering every language in the list
        def getCityUrl(self, city):
            urlList = []
            for language in self.getLanguage():
                urlList.append(self.getUrl(language, city))
            return urlList
    
        # For one language, build the URL list covering every city in the list
        def getLanguageUrl(self, language):
            urlList = []
            for city in self.getCity():
                urlList.append(self.getUrl(language, city))
            return urlList
    
        # Placeholder: per-page scraping is handled inline in main()
        def getOnePageData(self):
            pass
    
        # Document structure stored in MongoDB
        def getRentMsg(self, name, company, welfare, salaryMin, salaryMid, salaryMax, experience, education, companyType,
                       companyLevel, companySize):
            return {
                "name": name,  # job title (e.g. "python工程师")
                "company": company,  # company name (e.g. "xxx有限公司")
                "welfare": welfare,  # benefits (meal allowance, afternoon tea, paid annual leave)
                "salaryMin": salaryMin,  # salary lower bound (9k)
                "salaryMid": salaryMid,  # salary midpoint, (9k + 15k) / 2
                "salaryMax": salaryMax,  # salary upper bound (15k)
                "experience": experience,  # work experience (e.g. 3-5 years)
                "education": education,  # education level (e.g. bachelor's)
                "companyType": companyType,  # company sector (mobile internet / infosec)
                "companyLevel": companyLevel,  # funding stage (e.g. listed company)
                "companySize": companySize,  # headcount (150-500)
            }
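
        # An illustrative document as stored (values invented for the example):
        # {"name": "Python工程师", "company": "xx科技有限公司", "welfare": "弹性工作,带薪年假",
        #  "salaryMin": 9, "salaryMid": 12.0, "salaryMax": 15, "experience": "经验3-5年",
        #  "education": "本科", "companyType": "移动互联网", "companyLevel": "B轮",
        #  "companySize": "150-500人"}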
    
    
        # Fetch the page source and walk the paginated listing
        # language => programming language / keyword
        # city => city
        # collectionType => True: MongoDB collection named after the language; False: named after the city
        def main(self, language, city, collectionType):
            print(" Crawling language => " + language + "  city => " + city)
            url = self.getUrl(language, city)
            print(" 当前爬取的路径为 => " + url )
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--start-maximized')  # 最大化运行(全屏窗口),不设置,取元素会报错
            chrome_options.add_argument('--disable-infobars')  # 禁用浏览器正在被自动化程序控制的提示
            chrome_options.add_argument('--incognito')  # 隐身模式(无痕模式)
            #chrome_options.add_argument('--headless')  # 浏览器不提供可视化页面
            browser = webdriver.Chrome(executable_path = "chromedriver",options=chrome_options)
            #browser = webdriver.Chrome("chromedriver")
            browser.get(url)
            browser.implicitly_wait(10)
            for i in range(30):
                selector = etree.HTML(browser.page_source)  # snapshot of the current page source
                self.getItemData(selector, language, city, collectionType)  # parse before paginating, so the last page is not skipped
                print('Page {} scraped'.format(i + 1))
                soup = BeautifulSoup(browser.page_source, "html.parser")
                span = soup.find("div", attrs={"class": "pager_container"}).find("span", attrs={"action": "next"})
                classArr = span['class']  # e.g. ['pager_next', 'pager_next_disabled'] on the last page
                if "pager_next_disabled" in classArr:  # membership test is safer than indexing classArr[1]
                    print("Reached the last page; crawl finished")
                    break
                else:
                    print("Next page available; continuing")
                    browser.find_element_by_xpath('//span[@class="pager_is_current"]/following-sibling::span').click()  # click "next page"
                time.sleep(5)
            browser.close()
    
        # Parse each listing item and insert it into the database
        def getItemData(self, selector, language, city, collectionType):
            items = selector.xpath('//*[@id="s_position_list"]/ul/li')
            for item in items:
                try:
                    name = item.xpath('div[1]/div[1]/div[1]/a/h3/text()')[0]
                    company = item.xpath('div[1]/div[2]/div[1]/a/text()')[0]
                    welfare = item.xpath('div[2]/div[2]/text()')[0]
                    salaryArray = item.xpath('div[1]/div[1]/div[2]/div/span/text()')[0].strip().split("-")  # e.g. "9k-15k" -> ["9k", "15k"]
                    salaryMin = salaryArray[0][:-1]  # strip the trailing "k" -> "9"
                    salaryMax = salaryArray[1][:-1]  # strip the trailing "k" -> "15"
                    salaryMid = (int(salaryMin) + int(salaryMax)) / 2
                    educationArray = item.xpath('div[1]/div[1]/div[2]/div//text()')[3].strip().split("/")
                    education = educationArray[0].strip()
                    experience = educationArray[1].strip()
                    companyMsgArray = item.xpath('div[1]/div[2]/div[2]/text()')[0].strip().split("/")
                    companyType = companyMsgArray[0].strip()
                    companyLevel = companyMsgArray[1].strip()
                    companySize = companyMsgArray[2].strip()
    
                    data = self.getRentMsg(
                        name,
                        company,
                        welfare,
                        int(salaryMin),
                        salaryMid,
                        int(salaryMax),
                        experience,
                        education,
                        companyType,
                        companyLevel,
                        companySize
                    )
                    if collectionType:
                        self.zfdb["z_" + language].insert_one(data)
                    else:
                        self.zfdb["z_" + city].insert_one(data)
    
                    print(data)
                except Exception:
                    print("=======  exception while parsing item  =======")
                    continue
    
    
    
    
    spider = WorkSpider()  # job spider
    for language in spider.getLanguage():
        for city in spider.getCity():
            spider.main(language, city, True)
            time.sleep(5)
    Spider.py

    That's the complete crawling code, adapted from source found on GitHub.

    The main steps are:

    1. Assemble the URL

    2. Scrape the data with Selenium

    3. Store the results in MongoDB

    4. Dismiss the ad overlay:

            browser.get(url)
            browser.implicitly_wait(10)
            try:
                browser.find_element_by_xpath('//div[@class="body-container showData"]/div/div[2]').click()  # click to dismiss the ad
            except Exception:
                pass
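
    As an alternative to the implicit wait, an explicit wait fails over cleanly when no ad appears. A minimal sketch, assuming the Selenium 3 API and reusing the browser and url variables from the snippet above:

            from selenium.webdriver.common.by import By
            from selenium.webdriver.support import expected_conditions as EC
            from selenium.webdriver.support.ui import WebDriverWait

            browser.get(url)
            try:
                # wait up to 10 s for the ad overlay to become clickable, then dismiss it
                ad = WebDriverWait(browser, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//div[@class="body-container showData"]/div/div[2]'))
                )
                ad.click()
            except Exception:
                pass  # no ad this time; continue scraping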

    ---------------------------------------------------------------------------------------------------------------------------------------------------------

    Analyzing the data:

    # -*- coding: utf-8 -*-
    # Data analysis and visualization
    from os import path
    from wordcloud import WordCloud, ImageColorGenerator
    import jieba.analyse
    import matplotlib.pyplot as plt
    import imageio  # stand-in for the deprecated scipy.misc.imread
    import os
    import time
    from pymongo import MongoClient
    
    
    class Analycis:
        def __init__(self):
            self.client = MongoClient('mongodb://localhost:27017/')
            self.zfdb = self.client.zfdb
            # self.zfdb.authenticate("mongodbUser", "yourPassword")  # uncomment if auth is enabled
    
        def getCity(self):
            return [
                "全国",
                "北京",
                "郑州",
                #"上海",
                #"深圳",
                #"广州",
            ]
    
        def getLanguage(self):
            return [
                "Java",
                "Python",
                # "C",
                # "机器学习",
                # "图像识别",
                # "自然语言",
                # "区块链",
                # "Go",
                # "Php",
                # ".NET",
                # "Android",
                # "iOS",
                # "web前端",
                # "精准推荐",
                # "Node.js",
                # "Hadoop",
    
            ]
    
        # Metrics computed below:
        #   sample size per language
        #   average salary per language
        #   education requirements per language
        #   experience requirements per language
        #
        #   benefits word cloud
        #   company funding-stage ranking (Series A, Series B, ...)
        #   company sector ranking
    
        # Sample count per language
        def getLanguageNum(self):
            analycisList = []
            for index, language in enumerate(self.getLanguage()):
                collection = self.zfdb["z_" + language]
                totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
                totalNum2 = list(totalNum)[0]["total_num"]
                analycisList.append(totalNum2)
            return (self.getLanguage(), analycisList)
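
        # Note (assuming pymongo >= 3.7): the aggregation above merely counts documents,
        # so collection.count_documents({}) would be an equivalent one-liner.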
    
        # Average salary per language
        def getLanguageAvgSalary(self):
            analycisList = []
            for index, language in enumerate(self.getLanguage()):
                collection = self.zfdb["z_" + language]
                totalSalary = collection.aggregate([{'$group': {'_id': '', 'total_salary': {'$sum': '$salaryMid'}}}])
                totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
                totalNum2 = list(totalNum)[0]["total_num"]
                totalSalary2 = list(totalSalary)[0]["total_salary"]
                analycisList.append(round(totalSalary2 / totalNum2, 2))
            return (self.getLanguage(), analycisList)
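
        # Note: MongoDB's $avg accumulator can compute this in one pipeline, e.g. (sketch):
        #   collection.aggregate([{'$group': {'_id': None, 'avg': {'$avg': '$salaryMid'}}}])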
    
        # Education requirements for one language (fed to pyecharts)
        def getEducation(self, language):
            results = self.zfdb["z_" + language].aggregate([{'$group': {'_id': '$education', 'weight': {'$sum': 1}}}])
            educationList = []
            weightList = []
            for result in results:
                educationList.append(result["_id"])
                weightList.append(result["weight"])
            # print(list(result))
            return (educationList, weightList)
    
        # Experience requirements for one language (fed to pyecharts)
        def getExperience(self, language):
            results = self.zfdb["z_" + language].aggregate([{'$group': {'_id': '$experience', 'weight': {'$sum': 1}}}])
            totalAvgPriceDirList = []
            for result in results:
                totalAvgPriceDirList.append(
                    {"value": result["weight"], "name": result["_id"] + "  " + str(result["weight"])})
            return totalAvgPriceDirList
    
        # Collect welfare strings to build the benefits word cloud
        def getWelfare(self):
            content = ''
            queryArgs = {}
            projectionFields = {'_id': False, 'welfare': True}  # projection given as a dict
            for language in self.getLanguage():
    
                collection = self.zfdb["z_" + language]
                searchRes = collection.find(queryArgs, projection=projectionFields).limit(1000)
                for result in searchRes:
                    print(result["welfare"])
                    content += result["welfare"]
            return content
    
        # Company funding-stage ranking (for the bar chart)
        def getAllCompanyLevel(self):
            levelList = []
            weightList = []
            newWeightList = []
            attrList = ["A轮", "B轮", "C轮", "D轮及以上", "不需要融资", "上市公司"]
            for language in self.getLanguage():
                collection = self.zfdb["z_" + language]
                # searchRes = collection.find(queryArgs, projection=projectionFields).limit(1000)
                results = collection.aggregate([{'$group': {'_id': '$companyLevel', 'weight': {'$sum': 1}}}])
                for result in results:
                    levelList.append(result["_id"])
                    weightList.append(result["weight"])
            for index, attr in enumerate(attrList):
                newWeight = 0
                for index2, level in enumerate(levelList):
                    if attr == level:
                        newWeight += weightList[index2]
                newWeightList.append(newWeight)
            return (attrList, newWeightList)
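
        # Note: a collections.Counter keyed by funding stage could replace the nested
        # loops above, e.g. counts[result["_id"]] += result["weight"] inside the first loop.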
    
    
    
        # ========================================================
    
        # Render a pie chart
        def showPie(self, title, attr, value):
            from pyecharts import Pie
            pie = Pie(title)
            # pie.add("aa", attr, value, is_label_show=True, title_pos='center')
            pie.add("",
                    attr,
                    value,
                    radius=[40, 75],
                    label_text_color=None,
                    is_label_show=True,
                    legend_orient="vertical",
                    legend_pos="left", )
            pie.render()
    
        # Render a treemap
        def showTreeMap(self, title, data):
            from pyecharts import TreeMap
            treemap = TreeMap(title, width=1200, height=600)
            treemap.add("深圳", data, is_label_show=True, label_pos='inside', label_text_size=19)
            treemap.render()
    
        # Render a bar chart (despite the name, this draws a pyecharts Bar)
        def showLine(self, title, attr, value):
            from pyecharts import Bar
            bar = Bar(title)
            bar.add("深圳", attr, value, is_convert=False, is_label_show=True, label_text_size=18, is_random=True,
                    xaxis_interval=0,
                    # xaxis_label_textsize=9,
                    legend_text_size=18, label_text_color=["#000"])
            bar.render()
    
        # Render a word cloud with matplotlib + wordcloud
        def showWorkCloud(self, content, image_filename, font_filename, out_filename):
            d = path.dirname(__file__)  # was path.dirname(__name__), which always returns ''
            # content = open(path.join(d, filename), 'rb').read()
            # TF-IDF keyword extraction: topK returns the highest-weighted terms
            # (default 20); withWeight controls whether weights are returned too
            tags = jieba.analyse.extract_tags(content, topK=100, withWeight=False)
            text = " ".join(tags)
            # background image used as the mask
            img = imageio.imread(path.join(d, image_filename))
            # a Chinese font must be given, otherwise CJK text renders as boxes
            wc = WordCloud(font_path=font_filename,
                           background_color='black',
                           # mask gives the word cloud its shape
                           mask=img,
                           # maximum number of words shown
                           max_words=500,
                           # largest font size; defaults to the image height if unset
                           max_font_size=130,
                           # canvas width/height; ignored once mask is set
                           # width=600,
                           # height=400,
                           margin=2,
                           # fraction of words laid horizontally (default 0.9; the rest are vertical)
                           prefer_horizontal=0.9
                           )
            wc.generate(text)
            img_color = ImageColorGenerator(img)
            plt.imshow(wc.recolor(color_func=img_color))
            plt.axis("off")
            plt.show()
            wc.to_file(path.join(d, out_filename))  # save once, to the requested output path
    
        # Render a pyecharts word cloud
        def showPyechartsWordCloud(self, attr, value):
            from pyecharts import WordCloud
            wordcloud = WordCloud(width=1300, height=620)
            wordcloud.add("", attr, value, word_size_range=[20, 100])
            wordcloud.render()
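
        # Note: the chart helpers above assume the pyecharts 0.x API (Pie, Bar, TreeMap,
        # WordCloud imported from the top-level package, each rendering to render.html);
        # pyecharts 1.x moved these into pyecharts.charts with a different options API.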
    
    
    analycis = Analycis()
    
    
    # Sample counts per language
    (attr, value) = analycis.getLanguageNum()
    analycis.showLine("样本数量", attr, value)
    os.rename("render.html","sampleNum.html")
    
    # Average salary per language
    (attr, value) = analycis.getLanguageAvgSalary()
    analycis.showLine("各语言平均工资", attr, value)
    os.rename("render.html","languageAvgSalary.html")
    
    
    # Education requirements per language
    for language in analycis.getLanguage():
        (attr, value) = analycis.getEducation(language)
        print(attr, value)
        analycis.showPie("                       " + language + " education requirements", attr, value)
        os.rename("render.html", "./languageEducation/" + language + "Education.html")
    
    
    # Experience requirements per language
    for language in analycis.getLanguage():
        data = analycis.getExperience(language)
        print(data)
        analycis.showTreeMap("                       " + language + " experience requirements", data)
        os.rename("render.html", "./languageExperience/" + language + "Experience.html")
    
    # Benefits word cloud
    analycis.showWorkCloud(analycis.getWelfare(), "docker.jpeg", "kh.ttf", out_filename="loutput.jpeg")
    
    # Company funding-stage (Series A, Series B, ...) bar chart
    (attr, value) = analycis.getAllCompanyLevel()
    print(attr, value)
    analycis.showLine("公司级别", attr, value)
    os.rename("render.html", "companyLevel.html")
    Analysis