zoukankan      html  css  js  c++  java
  • 爬取拉勾网Java招聘数据

    下面是源代码:

     1 # -*- coding: utf-8 -*-
     2 from bs4 import BeautifulSoup
     3 import requests
     4 from lxml import etree
     5 
     6 url = 'https://www.lagou.com/zhaopin/Java/1/?filterOption=1'
     7 
     8 headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36',
     9          'Cookie':'user_trace_token=20170701121430-c7efa4b1-5e13-11e7-a0ba-5254005c3644; LGUID=20170701121430-c7efab16-5e13-11e7-a0ba-5254005c3644; X_HTTP_TOKEN=f3ecb9dc34a52bc5b9aa4a0990c156e1; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=47; index_location_city=%E5%B9%BF%E5%B7%9E; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAAIAACBIF0269F17A69695B00240FC72C286FEA8; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=bzclk.baidu.com; PRE_SITE=http%3A%2F%2Fbzclk.baidu.com%2Fadrc.php%3Ft%3D06KL00c00fATEwT0J_Kg0FNkUsa5H-Iu00000rhhYH300000X7NPvj.THL0oUhY0A3qmh7GuZNCUvd-gLKM0ZnqrADvmhDsmHnsnj0Yujn3nsKd5H0kfW0dwbDLPHwKrjRvPHnzPjczwjm1rjK7PbD1Pjwj0ADqI1YhUyPGujYzrHT4nj6snjczFMKzUvwGujYkP6K-5y9YIZK1rBtEILILQhk9uvqdQhPEUitOIgwVgLPEIgFWuHdVgvPhgvPsI7qBmy-bINqsmsKWThnqPjTsPWn%26tpl%3Dtpl_10085_15673_1%26l%3D1053927145%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253D%2525E3%252580%252590%2525E6%25258B%252589%2525E5%25258B%2525BE%2525E7%2525BD%252591%2525E3%252580%252591%2525E5%2525AE%252598%2525E7%2525BD%252591-%2525E4%2525B8%252593%2525E6%2525B3%2525A8%2525E4%2525BA%252592%2525E8%252581%252594%2525E7%2525BD%252591%2525E8%252581%25258C%2525E4%2525B8%25259A%2525E6%25259C%2525BA%2526xp%253Did%28%252522m260704b2%252522%29%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D128%26wd%3D%25E6%258B%2589%25E5%258B%25BE%26issp%3D1%26f%3D8%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26inputT%3D1555; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F%3Futm_source%3Dm_cf_cpt_baidu_pc; _gat=1; TG-TRACK-CODE=index_navigation; SEARCH_ID=2e7988702787470a815ca030dc6ecab0; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1498882467,1498883697,1498893967,1498895127; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1498895299; _ga=GA1.2.1800795855.1498882467; _gid=GA1.2.659409857.1498882467; 
LGSID=20170701152611-8e89a338-5e2e-11e7-b32d-525400f775ce; LGRID=20170701154823-a8b8f589-5e31-11e7-a0d7-5254005c3644',
    10          'Accept-Encoding':'gzip, deflate, br',
    11          'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    12          }
    13 
    14 
    def _parse_jobs(page_source):
        """Extract job rows from the HTML text of one listing page.

        Returns a list of ``(position, salary, company, link, location,
        publish_time)`` tuples. The six columns are collected by independent
        XPath queries and combined with ``zip``, so the result is truncated
        to the shortest column.
        NOTE(review): if any listing item lacks one of the fields, the
        columns shift and rows may be misaligned -- verify against the
        current page markup.
        """
        tree = etree.HTML(page_source)
        items = '//ul[@class="item_con_list"]/li'
        position = tree.xpath(items + '/@data-positionname')  # job title
        salary = tree.xpath(items + '/@data-salary')  # salary range
        company = tree.xpath(items + '/@data-company')  # company name
        link = tree.xpath('//div[@class="p_top"]/a/@href')  # detail-page link
        location = tree.xpath('//span[@class="add"]/em/text()')  # work location
        publish_time = tree.xpath('//span[@class="format-time"]/text()')  # publish time
        return list(zip(position, salary, company, link, location, publish_time))


    def getdata(url):
        """Fetch one listing page, print its job rows and return them.

        Args:
            url: Address of the listing page to download.

        Returns:
            A list of ``(position, salary, company, link, location,
            publish_time)`` tuples; an empty list when the server answers
            with an HTTP error status.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        if not response.ok:
            # Do not parse an error page as if it were a listing.
            print('request failed: {} -> HTTP {}'.format(url, response.status_code))
            return []
        response.encoding = 'utf-8'

        rows = _parse_jobs(response.text)
        print(rows)
        return rows
    31 
    32 
    # Script entry point: walk listing pages 1 through 30 and dump each one.
    if __name__ == "__main__":
        page_url_template = "https://www.lagou.com/zhaopin/Java/{0}/?filterOption={0}"
        first_page, last_page = 1, 30
        for page in range(first_page, last_page + 1):
            # The page number appears both in the path and in filterOption.
            url = page_url_template.format(page)
            getdata(url)
    
    
  • 相关阅读:
    数组
    mysql优化思路
    mysql_存储过程
    mysql_函数
    mysql_结构
    mysql_触发器
    mysql_变量
    mysql_事务
    mysql总结
    mysql备份
  • 原文地址:https://www.cnblogs.com/Huangsh2017Come-on/p/7103011.html
Copyright © 2011-2022 走看看