zoukankan      html  css  js  c++  java
  • python 爬取boss直聘招聘信息实现

    1、一些公共方法的准备

    获取数据库连接:

    import pymysql
    # 获得数据库链接对象
    def getConnect(database):
        """Return a pymysql connection to *database* on the local MySQL server.

        Credentials are fixed local-development values (root/123456).
        """
        return pymysql.connect(
            host='localhost',
            database=database,
            user='root',
            password='123456'
        )

    获取页面soup对象:

    import requests
    from bs4 import BeautifulSoup
    # Parse an HTML string into a BeautifulSoup tree (lxml backend).
    def to_soup(str):
        soup = BeautifulSoup(str, 'lxml')
        return soup
    # Fetch *url* with the given request headers and return the parsed soup.
    def get_soup(url, header):
        page = requests.get(url, headers=header)
        return to_soup(page.text)

    2、爬取BOSS直聘python相关岗位的实现

    定义工作信息对象:

    class WorkInfo:
        """One job posting scraped from a BOSS Zhipin listing page.

        All fields are plain strings as extracted from the page:
        title/salary/site/experience/education/company are display text,
        job_url is the posting's relative link, release_date and get_date
        are 'YYYY-MM-DD'-style date strings.
        """

        def __init__(self, title, salary, site, experience, education, job_url, company, release_date, get_date):
            self.title = title                  # job title text
            self.salary = salary                # salary range text
            self.site = site                    # work location
            self.experience = experience        # required experience
            self.education = education          # required education
            self.job_url = job_url              # link to the job detail page
            self.company = company              # company name
            self.release_date = release_date    # date the job was posted
            self.get_date = get_date            # timestamp when we scraped it

        def __repr__(self):
            # Added for debuggability; does not affect existing callers.
            return 'WorkInfo(title={!r}, company={!r}, salary={!r})'.format(
                self.title, self.company, self.salary)

    获取工作信息到定义的对象的集合:

    # 获取工作信息集合
    # Scrape one listing page and return a list of WorkInfo objects.
    def getWorkInfos(url, header):
        """Fetch *url* with *header* and extract every job card on the page.

        Returns an empty list when the page has no 'job-primary' cards,
        which the caller treats as "past the last page".
        """
        # Get the parsed page (rep is the shared requests/bs4 helper module).
        htmlSoup = rep.get_soup(url, header)
        workInfos = []
        # Each job card on the listing page is a div.job-primary block.
        job_infos = htmlSoup.find_all('div', class_='job-primary')
        if not job_infos:
            print('已到空白页!!!')
            return workInfos
        print('开始爬取页面数据!')
        for job_info_soup in job_infos:
            # Job title
            title = job_info_soup.find('div', class_='job-title').get_text()
            # Salary
            salary = job_info_soup.find('span', class_='red').get_text()
            # The first <p> holds location / experience / education run
            # together; tool.toContent strips the tags and returns the
            # three text fragments in order.
            infos = str(job_info_soup.find('p'))
            infosList = tool.toContent(infos)
            site = infosList[0]
            experience = infosList[1]
            education = infosList[2]
            # Link to the job detail page
            job_url = job_info_soup.find('a').get('href')
            # Company name
            company = job_info_soup.find('div', class_='company-text').find('a').get_text()
            # Publish label with the leading 3-char "发布于" prefix sliced off.
            release_date = job_info_soup.find('div', class_='info-publis').find('p').get_text()[3:]
            # Normalise the label to a YYYY-MM-DD string for the database.
            # BUG FIX: the original tested `'' in release_date`, which is
            # always True (the empty string is a substring of everything),
            # so every row was stamped with yesterday's date and the other
            # branches were dead.  The label is either "昨天" (yesterday),
            # an "HH:MM" time (posted today), or "MM月DD日".
            # NOTE(review): "昨天" is the presumed yesterday marker — confirm
            # against a live page.
            if '昨天' in release_date:
                release_date = time.strftime("%Y-%m-%d", time.localtime(time.time() - 86400))
            elif ':' in release_date:
                release_date = time.strftime("%Y-%m-%d")
            else:
                release_date = str(time.localtime().tm_year) + '-' + re.sub(r'[月,日]', '-', release_date)[:-1]
            # Timestamp of when we scraped this row.
            get_date = time.strftime("%Y-%m-%d  %H:%M:%S")
            workInfo = WorkInfo(title, salary, site, experience, education, job_url, company, release_date, get_date)
            workInfos.append(workInfo)
        print('爬取页面数据完毕!')
        return workInfos

    把获取到的工作信息集合存入数据库:

    # Persist the scraped records into MySQL.
    def toDatabase(workInfos):
        """Insert each WorkInfo in *workInfos* into the `work_info` table.

        Commits once after all rows are inserted; the cursor and the
        connection are always closed, even when an insert fails.
        """
        print('开始存入数据库')
        db = database.getConnect('reptile')
        cursor = db.cursor()
        # BUG FIX: the original built the statement by %-formatting across
        # adjacent string lines without parentheses — a SyntaxError — and
        # interpolated scraped text straight into the SQL (injection-prone,
        # breaks on quotes in job titles).  Use a parameterised query so the
        # driver escapes the values.
        sql = ("INSERT INTO `work_info` (`title`, `salary`, `site`, `experience`, "
               "`education`, `job_url`, `company`, `release_date`, `get_date`) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        try:
            for w in workInfos:
                cursor.execute(sql, (w.title, w.salary, w.site, w.experience,
                                     w.education, w.job_url, w.company,
                                     w.release_date, w.get_date))
            db.commit()
        finally:
            cursor.close()
            db.close()
        print('存入数据库完毕!')

    爬取工作实现:

    # Base listing URL for city 101270100 (Chengdu) on BOSS Zhipin.
    url = 'https://www.zhipin.com/c101270100/?'
    # Request headers: a desktop Chrome UA plus a captured session cookie.
    # NOTE(review): the cookie value is session-bound and will expire; the
    # trailing implicitly-concatenated 'referer: ...' piece looks like it
    # was meant to be a separate header entry — confirm against a working run.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'referer': '',
        'cookie':'lastCity=101270100; _uab_collina=155876824002955006866925; t=DPiicXvgrhx7xtms; wt=DPiicXvgrhx7xtms; sid=sem_pz_bdpc_dasou_title; __c=1559547631; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00fDIFkY0IWPB0KZEgsZb1OwT00000Kd7ZNC00000JqHYFm.THdBULP1doZA80K85yF9pywdpAqVuNqsusK15yF9m1DdmWfdnj0sm1PhrAf0IHYYnD7aPH9aPRckwjRLrjbsnYfYfWwaPYwDnHuDfHcdwfK95gTqFhdWpyfqn1czPjmsPjnYrausThqbpyfqnHm0uHdCIZwsT1CEQLILIz4lpA-spy38mvqVQ1q1pyfqTvNVgLKlgvFbTAPxuA71ULNxIA-YUAR0mLFW5HRvnH0s%26tpl%3Dtpl_11534_19713_15764%26l%3D1511867677%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E5%252587%252586%2525E5%2525A4%2525B4%2525E9%252583%2525A8-%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253DBoss%2525E7%25259B%2525B4%2525E8%252581%252598%2525E2%252580%252594%2525E2%252580%252594%2525E6%252589%2525BE%2525E5%2525B7%2525A5%2525E4%2525BD%25259C%2525EF%2525BC%25258C%2525E6%252588%252591%2525E8%2525A6%252581%2525E8%2525B7%25259F%2525E8%252580%252581%2525E6%25259D%2525BF%2525E8%2525B0%252588%2525EF%2525BC%252581%2526xp%253Did(%252522m3224604348_canvas%252522)%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D8%26wd%3Dboss%25E7%259B%25B4%25E8%2581%2598%26issp%3D1%26f%3D3%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26oq%3D%2525E5%25258D%25259A%2525E5%2525AE%2525A2%2525E5%25259B%2525AD%26inputT%3D9649%26prefixsug%3Dboss%26rsp%3D0&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1558768262,1558768331,1559458549,1559547631; JSESSIONID=A0FC9E1FD0F10E42EAB681A51AC459C7;'
                 ' __a=86180698.1558768240.1559458549.1559547631.63.3.6.6; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1559551561'
                  'referer: https://www.zhipin.com/c101270100/?query=python&page=2&ka=page-2'
    }
    # Search keyword and starting page number.
    query='python'
    page=1
    # Walk the paginated results: scrape each page, store it, and stop as
    # soon as a page yields no job cards (getWorkInfos returns []).
    while True:
        print("开始第:{} 页".format(page))
        # Build the page URL: ?query=<kw>&page=<n>&ka=page-<n>
        purl=url+'query='+query+'&page='+str(page)+'&ka=page-'+str(page)
        workInfos = getWorkInfos(purl, header)
        if len(workInfos)==0:
            print('结束爬取!')
            break
        toDatabase(workInfos)
        page=page+1

    3、涉及的小知识

    自制去除html标签,把标签内夹杂的内容存入list中:

    # Split an HTML fragment on its tags and return the text pieces between them.
    def toContent(str):
        pieces = re.split('<[^>]*>', str)
        # Drop the empty strings the split leaves between adjacent tags.
        return [p for p in pieces if p]

    时间的相关操作

    用‘-’替换‘月’‘日’:

    re.sub(r'[月,日]', '-', release_date)

    获取前一天:

    release_date=time.strftime("%Y-%m-%d",time.localtime(time.time()-86400))


  • 相关阅读:
    Golang服务器热重启、热升级、热更新(safe and graceful hot-restart/reload http server)详解
    如果清空Isilon cluster上的文件,Shadow Store和data reduction的统计信息也会一并清空么?
    合并从B站下载的分开的音频和视频
    使用Notepad++远程编辑WinSCP中打开的文本文件报错“file xxx does exist anymore”
    Leetcode 1143. 最长公共子序列(LCS)动态规划
    Leetcode 126 127 单词接龙i&ii
    如何在一个Docker中同时运行多个程序进程?
    dockerfile cmd使用
    Leetcode 160.相交链表
    Leetcode 912. 排序数组
  • 原文地址:https://www.cnblogs.com/tutuwowo/p/10975003.html
Copyright © 2011-2022 走看看