
    Scraping Zhilian Zhaopin (智联招聘) job listings with Python and saving them to Excel

    tags: python, Zhilian Zhaopin, export to Excel


    Introduction: We've just been through what's colloquially called "Golden March, Silver April", and many friends have been switching jobs. I think everyone should draw up a plan for themselves and adjust it according to their own progress; don't get blindly tempted just because someone around you got a raise. Generally speaking, getting familiar with a new environment after a job hop costs quite a bit of time, so if your current job is acceptable in terms of atmosphere and room to grow, other things such as pay can usually be worked out with your company.

    This article builds on yaoyefengchen's blog (link), adding district-level search and switching the storage format from CSV to the more widely used Excel. On to the main content.

    First, a rough outline of the process.
    After picking search criteria on the Zhilian job-search page, the resulting link is:

    http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=php高级工程师&sm=0&re=2006&isfilter=1&p=1&sf=10001&st=15000

    The parameters in this link break down as follows (the filter parameters are optional). We build the request data from them; the headers only need to be good enough to fetch the page.

    paras = {
        'jl': city,         # city to search in
        'kw': keyword,      # search keyword
        'isadv': 0,         # whether to open the advanced search options
        'isfilter': 1,      # whether to filter the results
        'p': page,          # page number
        're': region        # district code, e.g. 2005 stands for Haidian
    }
    # sf=10001&st=15000 is the salary range I filtered on; add these parameters if you need them.
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
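
    As a quick sanity check, here is what urlencode produces for one concrete set of values (the values below are purely illustrative):

    from urllib.parse import urlencode

    paras = {'jl': '北京', 'kw': 'php高级工程师', 'isadv': 0, 'isfilter': 1, 'p': 1, 're': 2005}
    url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
    print(url)
    # the Chinese values come out percent-encoded, e.g.
    # https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=...&p=1&re=2005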
    

    yaoyefengchen used regular expressions to pull out the job title, salary, company, and so on, but didn't supply the region code for a specific district (say, Haidian vs. Chaoyang). I later used XPath to extract Beijing's districts into a dict, so you can pass the district name directly in Chinese:

    # read the search page to map district names to codes, e.g. Haidian -> 2005
    def parseHtmlToGetRegion(regionAddress):
        url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&sm=0&isfilter=1&p=1&re=2006'
        # fetch and parse the search page
        html = getHtml(url)
        regionId = html.xpath('/html/body/div[3]/div[3]/div[1]/div[4]/div[1]/div[2]/a/@href')
        region = html.xpath('/html/body/div[3]/div[3]/div[1]/div[4]/div[1]/div[2]/a/text()')
        # build the name -> code dict, skipping the first (catch-all) entry
        regionList = {}
        for i, regionHref in enumerate(regionId):
            if i == 0:
                continue
            regionList[region[i]] = regionId[i][-4:]  # the last four characters of the href are the code
        return regionList.get(regionAddress)
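
    Assuming the page layout still matches the XPath above, usage looks like this (a sketch only; the site's markup may have changed since this was written):

    # hypothetical check; the result depends on the live page
    print(parseHtmlToGetRegion('海淀'))  # expected: '2005'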
    

    Another point: CSV files frequently show up as mojibake when opened in tools like Excel, and fixing that means converting the file or installing dedicated software. I find that inconvenient, so I save straight to Excel. For writing data out to Excel files, Python is, frankly, far easier than PHP.

    # save to an Excel (.xls) file
    import xlwt

    def write_xls_file(filename, headers, jobs):
        table = xlwt.Workbook(encoding='utf8')
        table_page = table.add_sheet('jobs')

        # first row: the column headers
        for i, header in enumerate(headers):
            table_page.write(0, i, header)
        # one row per job; each job dict maps a column index to a value
        for j, items in enumerate(jobs, start=1):
            for q, item in items.items():
                table_page.write(j, q, item)
        table.save(filename)
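
    To try the writer on its own, a minimal round trip with made-up rows (requires `pip install xlwt`) might look like:

    headers = ['job', 'website', 'company', 'salary']
    jobs = [
        {0: 'php engineer', 1: 'http://example.com', 2: 'example company', 3: '10000-15000'},
    ]
    write_xls_file('demo.xls', headers, jobs)  # demo.xls gets a header row plus one job row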
    
    

    The complete code follows and can be used as-is. Don't forget to save the user_agents.py file at the end of the article.

    #-*- coding: utf-8 -*-
    '''
    Created on 2018-05-7
    @author: Vinter_he
    '''
    import re
    import requests
    import xlwt
    from tqdm import tqdm
    from urllib.parse import urlencode
    from requests.exceptions import RequestException
    from lxml import etree
    import user_agents
    import random
    import datetime

    def get_one_page(city, keyword, region, page):
        '''
        Fetch the search-result page and return its HTML.
        '''
        paras = {
            'jl': city,         # city to search in
            'kw': keyword,      # search keyword
            'isadv': 0,         # whether to open the advanced search options
            'isfilter': 1,      # whether to filter the results
            'p': page,          # page number
            're': region        # district code, e.g. 2005 stands for Haidian
        }

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Host': 'sou.zhaopin.com',
            'Referer': 'https://www.zhaopin.com/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }

        url = 'https://sou.zhaopin.com/jobs/searchresult.ashx?' + urlencode(paras)
        try:
            # fetch the page and return its HTML
            response = requests.get(url, headers=headers)
            # use the status code to decide whether the request succeeded
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def parse_one_page(html):
        '''
        Parse the HTML and yield the useful fields.
        '''
        # regular expression for extraction
        pattern = re.compile('<a style=.*? target="_blank">(.*?)</a>.*?'        # job title
            '<td class="gsmc"><a href="(.*?)" target="_blank">(.*?)</a>.*?'     # company URL and name
            '<td class="zwyx">(.*?)</td>', re.S)                                # monthly salary

        # find every match on the page
        items = re.findall(pattern, html)

        for item in items:
            job_name = item[0]
            job_name = job_name.replace('<b>', '')
            job_name = job_name.replace('</b>', '')
            yield {
                0: job_name,
                1: item[1],
                2: item[2],
                3: item[3]
            }

    # save to an Excel (.xls) file
    def write_xls_file(filename, headers, jobs):
        table = xlwt.Workbook(encoding='utf8')
        table_page = table.add_sheet('jobs')

        for i, header in enumerate(headers):
            table_page.write(0, i, header)
        for j, items in enumerate(jobs, start=1):
            for q, item in items.items():
                table_page.write(j, q, item)
        table.save(filename)

    def main(city, keyword, region, pages):
        '''
        Entry point.
        '''
        filename = '智联_' + datetime.date.today().strftime('%Y-%m-%d') + city + '_' + keyword + '.xls'
        headers = ['job', 'website', 'company', 'salary']
        # resolve the district name to its code once, before the loop;
        # resolving inside the loop would fail from the second page on,
        # because region would by then hold a code, not a name
        region = parseHtmlToGetRegion(region)
        jobs = []
        for i in tqdm(range(pages)):
            # collect every job on this page, then write all pages to one xls file
            html = get_one_page(city, keyword, region, i)
            if html is None:
                continue
            items = parse_one_page(html)
            for item in items:
                jobs.append(item)
        write_xls_file(filename, headers, jobs)

    def getHtml(url):
        # fetch with a random User-Agent and parse into an lxml tree
        response = requests.get(url=url, headers={'User-Agent': random.choice(user_agents.user_agents)}, timeout=10).text
        html = etree.HTML(response)
        return html

    # read the search page to map district names to codes, e.g. Haidian -> 2005
    def parseHtmlToGetRegion(regionAddress):
        url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&sm=0&isfilter=1&p=1&re=2006'
        # fetch and parse the search page
        html = getHtml(url)
        regionId = html.xpath('/html/body/div[3]/div[3]/div[1]/div[4]/div[1]/div[2]/a/@href')
        region = html.xpath('/html/body/div[3]/div[3]/div[1]/div[4]/div[1]/div[2]/a/text()')
        # build the name -> code dict, skipping the first (catch-all) entry
        regionList = {}
        for i, regionHref in enumerate(regionId):
            if i == 0:
                continue
            regionList[region[i]] = regionId[i][-4:]  # the last four characters of the href are the code
        return regionList.get(regionAddress)

    if __name__ == '__main__':
        main('北京', 'php工程师', '朝阳', 10)
    
    

    Below is the same user_agents.py file as before. I won't be attaching it again, so save a copy for future use.

    #!/usr/bin/python
    # -*- coding:utf-8 -*-
    '''
    Created on 2018-04-27
    
    @author: Vinter_he
    '''
    
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
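
    The main script consumes this list by drawing a random entry per request, as getHtml does above:

    import random
    import user_agents

    headers = {'User-Agent': random.choice(user_agents.user_agents)}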
    