Crawler 10: Scraping Job Postings from Lagou

    Some very old code I found while cleaning up files; not sure whether it still works.

    It's mostly about handling the JSON response.

    The part worth a closer look is how the results are saved: writing to Excel, which is probably fairly reusable.
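
    For reference, the JSON handling below assumes the positionAjax endpoint responds with a structure roughly like the following. This is a minimal sketch reconstructed from the field accesses in read_json(); the real response carries many more fields:

    # Hypothetical sketch of the Ajax response consumed by read_json() below;
    # the key names come from the code, the example values are made up.
    sample_response = {
        'content': {
            'positionResult': {
                'result': [
                    {'positionId': 1234567},  # one entry per job posting
                    # ...
                ]
            }
        }
    }
    # Each positionId maps to a detail page:
    #   https://www.lagou.com/jobs/<positionId>.html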

    import requests
    import json
    import time
    from bs4 import BeautifulSoup
    import xlwt
    BaseUrl = 'https://www.lagou.com/jobs/positionAjax.json?'
    All_detail = []  # accumulated per-job dicts, written out by saveall()
    # Fetch one page of search results from the position-list Ajax endpoint
    def read_page(tag):
        page_header = {
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_php%E5%90%8E%E7%AB%AF?px=default&city=%E4%B8%8A%E6%B5%B7',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'cookie': ''  # left empty in the original; supply your own if the site requires login
        }
    
        page_data = {
            'first': 1,
            'pn': tag,    # page number
            'kd': 'PHP'   # search keyword
        }
        page = requests.post(url=BaseUrl,data=page_data,headers=page_header)
        return page.text
    
    # Parse the JSON and return the list of detail-page URLs for one result page
    def read_json(page):
        item_list = []
        page_json = json.loads(page)
        results = page_json['content']['positionResult']['result']
        for item in results:  # iterate over however many postings the page actually returned
            item_list.append('https://www.lagou.com/jobs/' + str(item['positionId']) + '.html')
        return item_list
    
    # Scrape one detail page and collect its fields into a dict
    def get_detail(url):
        detail = {}
        page_header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_php%E5%90%8E%E7%AB%AF?px=default&city=%E4%B8%8A%E6%B5%B7',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'cookie': '',
            'Upgrade-Insecure-Requests': '1'
        }
        page_html=requests.get(url,headers=page_header).content
        soup=BeautifulSoup(page_html,'html.parser',from_encoding='utf-8')
        detail['公司'] = soup.find("div", class_="company").text  # company name
        detail['职位'] = soup.find("span", class_="name").text    # job title
        # Salary / requirements
        sal = []
        for tag in soup.select("dd p span"):
            sal.append(tag.text.replace('/', ''))
        str_sal = ''.join(sal)
        detail["待遇/要求"] = str_sal
        # Job perks
        detail['职位诱惑'] = soup.find("dd", class_="job-advantage").select('p')[0].text
    
        # Job description: join the paragraphs with newlines
        req = []
        for tag in soup.find('dd', class_='job_bt').select('div p'):
            req.append(tag.text + '\n')
        str_req = ''.join(req)
        detail["职位描述"] = str_req
        return detail
    
    def saveall():
        book = xlwt.Workbook()
        sheet = book.add_sheet('ronytest', cell_overwrite_ok=True)
        heads = ['公司', '职位', '待遇/要求', '职位诱惑', '职位描述']
        for col, head in enumerate(heads):
            sheet.write(0, col, head)
    
        row = 1
        for xx in All_detail:
            sheet.write(row, 0, xx['公司'])
            sheet.write(row, 1, xx['职位'])
            sheet.write(row, 2, xx['待遇/要求'])
            sheet.write(row, 3, xx['职位诱惑'])
            sheet.write(row, 4, xx['职位描述'])
            row += 1
        book.save('拉勾网' + '.xls')
    
    if __name__ == '__main__':
        AllUrl = []
        # Crawl the list pages and collect every detail URL into AllUrl
        for tag in range(1, 3):
            AllUrl.extend(read_json(read_page(tag)))
            time.sleep(1)  # pause between requests
        print('URL crawl finished')
        # Parse each detail page and append the results to All_detail
        for i in AllUrl:
            All_detail.append(get_detail(i))
        print('Detail crawl finished')
        # Save the data
        saveall()
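
    If you want to reuse the Excel-saving part elsewhere, here is a slightly more general sketch of the same xlwt pattern. It takes any list of dicts plus a header list instead of hard-coding the five job fields; the helper name and signature are mine, not from the original code:

    import xlwt

    def save_dicts_to_xls(rows, heads, filename, sheet_name='sheet1'):
        """Write a list of dicts to an .xls file, one column per entry in heads."""
        book = xlwt.Workbook()
        sheet = book.add_sheet(sheet_name, cell_overwrite_ok=True)
        # header row
        for col, head in enumerate(heads):
            sheet.write(0, col, head)
        # data rows; dicts missing a key get an empty cell
        for row, item in enumerate(rows, start=1):
            for col, head in enumerate(heads):
                sheet.write(row, col, item.get(head, ''))
        book.save(filename)

    # usage: save_dicts_to_xls(All_detail, ['公司', '职位', '待遇/要求', '职位诱惑', '职位描述'], '拉勾网.xls')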