zoukankan      html  css  js  c++  java
  • python2 urllib2抓取51job网的招聘数据

     1 #coding=utf-8
     2 __author__ = "carry"
     3 
     4 
     5 import sys
     6 reload(sys)
     7 sys.setdefaultencoding('utf-8')
     8 
     9 import urllib
    10 import urllib2
    11 import re
    12 
    13 
    14 #获取源码
    15 def get_content(page):
    16     headers = {#'Host':'search.51job.com',
    17                'User-Agent':'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
    18                #'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    19                #'Connection':'keep-alive'
    20                }
    21     url ='http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,'+ str(page)+'.html'
    22     req = urllib2.Request(url,headers=headers)
    23     r = urllib2.urlopen(req)
    24     response = r.read() #读取源代码并转为unicode
    25     html = response.decode('gbk').encode('utf-8')
    26     return html
    27 
    28 def get(html):
    29     reg = re.compile(r'class="t1 ">.*? <a target="_blank" title="(.*?)".*? <span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*? <span class="t5">(.*?)</span>',re.S)#匹配换行符
    30     items=re.findall(reg,html)
    31     return items
    32 
    33 #多页处理,下载到文件
    34 for  j in range(1,11):
    35     print(u"正在爬取第"+str(j)+"页数据...")
    36     html = get_content(j) #调用获取网页原码
    37     for i in get(html):
    38         #print(i[0],i[1],i[2],i[3],i[4])
    39         with open ('51job.txt','a') as f:
    40             f.write(i[0]+'	'+i[1]+'	'+i[2]+'	'+i[3]+'	'+i[4]+'
    ')
    41             f.write("-----------------------------------------------------")
    42             f.close()
  • 相关阅读:
    设计模式(二)
    关于ICO
    js的中关于类的应用
    接口的实现顺序学习笔记[2]
    接口的继承学习笔记[1]
    设计模式(一)
    四种领域模型
    路径问题!!
    异步调用模式学习记录
    转:四人帮设计模式
  • 原文地址:https://www.cnblogs.com/lxs1314/p/7133844.html
Copyright © 2011-2022 走看看