zoukankan      html  css  js  c++  java
  • python3爬虫——爬取职位招聘信息（来自腾讯社会招聘）

     1 # -*- coding: utf-8 -*-
     2 # author:zxy
     3 #Date:2018-9-23
     4 
     5 from lxml import etree
     6 import requests
     7 
     # Prefix joined onto the relative hrefs found on listing pages.
     BASE_DOMAIN = "http://hr.tencent.com/"

     # Request headers sent with every page download.
     HEADERS = {
         "User-Agent": (
             "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
             "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
         ),
     }

     # Listing page for the "python" keyword search, starting at offset 0.
     BASE_URL = (
         "https://hr.tencent.com/position.php"
         "?keywords=python&lid=0&tid=0&start=0"
     )
    15 
    def parse_detail_page(url):
        """Download one position detail page and extract its fields.

        Args:
            url: Absolute URL of the detail page.

        Returns:
            A dict with the keys 'work_name', 'work_place', 'work_category'
            and 'work_lack_number' (text of the first matching node, or ''
            when the page has no such node), plus 'work_duty' and
            'work_require' (lists of text fragments taken from the first and
            second ``ul.squareli`` elements, or [] when that element is
            missing).
        """
        def first_text(tree, query):
            # xpath() returns a list; an unmatched query yields [] and
            # indexing it directly would raise IndexError.
            matches = tree.xpath(query)
            return matches[0] if matches else ''

        def all_text(nodes, index):
            # Guard against pages that carry fewer <ul> sections than expected.
            return nodes[index].xpath(".//text()") if len(nodes) > index else []

        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=HEADERS, timeout=10)
        html = etree.HTML(response.text)

        summary_row = "//tr[@class='c bottomline']"
        more_infos = html.xpath("//ul[@class='squareli']")

        return {
            'work_name': first_text(html, "//tr[@class='h']/td/text()"),
            'work_place': first_text(html, summary_row + "/td[1]/text()"),
            'work_category': first_text(html, summary_row + "/td[2]/text()"),
            'work_lack_number': first_text(html, summary_row + "/td[3]/text()"),
            'work_duty': all_text(more_infos, 0),
            'work_require': all_text(more_infos, 1),
        }
    37 
    def get_detail_urls(url):
        """Collect the absolute detail-page URLs linked from one listing page.

        Args:
            url: Address of the listing page to download.

        Returns:
            A list of URLs, one per matching link, each built by prefixing
            BASE_DOMAIN to the link's href.
        """
        # Fetch the page the caller asked for. Requesting the fixed BASE_URL
        # here made every call return the links of the first page only.
        response = requests.get(url=url, headers=HEADERS, timeout=10)
        html = etree.HTML(response.text)
        # NOTE(review): only rows with class 'even' are matched; if listing
        # rows alternate with another class those links are skipped - confirm.
        hrefs = html.xpath("//tr[@class='even']//a/@href")
        return [BASE_DOMAIN + href for href in hrefs]
    45 
    def _format_position(position):
        """Render one position dict as the text block written to the output."""
        lines = []
        for key, value in position.items():
            if key in ('work_duty', 'work_require'):
                # These two fields hold lists; keep the 'key :[...]' layout.
                lines.append('{} :{}'.format(key, value))
            else:
                lines.append('{}:{}'.format(key, value))
        # One line per field, then three blank lines separating records.
        return ''.join(line + '\n' for line in lines) + '\n' * 3


    def spider(pages=4, output_path='tecentRecruit.txt'):
        """Crawl the listing pages and append every position to a text file.

        Args:
            pages: Number of listing pages to visit; page i is requested with
                start=i*10. Defaults to 4.
            output_path: File the records are appended to, encoded as UTF-8.

        Returns:
            The list of position dicts, in the order they were scraped.
        """
        # The original listing carried a '#43' note beside the page count -
        # presumably the total number of result pages; TODO confirm.
        base_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a"
        positions = []
        # Open the output once instead of once per record.
        with open(output_path, 'a', encoding='utf-8') as f:
            for page in range(pages):
                for detail_url in get_detail_urls(base_url.format(page * 10)):
                    position = parse_detail_page(detail_url)
                    positions.append(position)
                    f.write(_format_position(position))
        return positions
    73 
    # Start the crawl only when run as a script, not when imported.
    if __name__ == "__main__":
        spider()

    效果如图所示:

  • 相关阅读:
    「AtCoder AGC023F」01 on Tree
    「Wallace 笔记」平面最近点对 解法汇总
    「Codeforces 1181E」A Story of One Country (Easy & Hard)
    「NOI2018」「LOJ #2720」「Luogu P4770」 你的名字
    IdentityServer4设置RefreshTokenExpiration=Sliding不生效的原因
    【知识点】IQueryable.SumAsync方法的NULL异常
    Beyond Compare 4 密钥被吊销
    【知识点】Uri对象的完整地址
    git文件夹大小写问题
    .Net Core学习资料
  • 原文地址:https://www.cnblogs.com/z-712/p/9693729.html
Copyright © 2011-2022 走看看