  • Scraping the Python job requirements on 51job with selenium and urllib and saving them to a txt file

      Use selenium and urllib to scrape the requirements of Python job postings on 51job and collect them into a single txt file:

    import selenium  # test framework
    import selenium.webdriver  # drives a real browser
    import re
    import urllib
    import urllib.request
    
    
    def geturllistsh(searchname):
        url="https://search.51job.com/list/020000,000000,0000,00,9,99,"+searchname+",2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        driver=selenium.webdriver.Chrome(executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver")  # launch Chrome through chromedriver
        driver.get(url)  # open the search page
        pagesource=driver.page_source   # grab the rendered page source
        restr="""title">(.*?)</span"""    # regular expression
        regex=re.compile(restr,re.IGNORECASE)
        mylist=regex.findall(pagesource)
        driver.close()  # close the browser
    #getnumberbyname("python")
    #num=eval(getnumberbyname("python")) #1731
    #if  num%50==0:
    #    pages=num//50+1
    #else:
     #   pages=num//50+1
        mylist = []
        for i in range(1,130):
            newurl="https://search.51job.com/list/020000,000000,0000,00,9,99,"+searchname+",2,{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=".format(i)
            mylist.append(newurl)
        for line in mylist:
            print(line)
        return mylist
    
    def downloadgeturllist(url):
        headers={"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
        request=urllib.request.Request(url,headers=headers)  # build the request
        # a specific header can also be added or changed via Request.add_header()
        request.add_header("Connection", "keep-alive")  # keep the connection alive
        try:
            response=urllib.request.urlopen(request)
            data=response.read().decode("gbk")  # send the request and decode the GBK-encoded page
            print(response.code)  # response status code
    
            restr = "<div class="dw_table" id="resultList">([sS]*?)<!--列表表格 END-->"  # 正则表达式,()只要括号内的数据
            regex = re.compile(restr, re.IGNORECASE)
            mylist = regex.findall(data)
            #print(mylist[0])#抓取整个表格
    
            restr = "el title">([sS]*?)<!--列表表格 END-->"  # 正则表达式,()只要括号内的数据
            regex = re.compile(restr, re.IGNORECASE)
            mylist = regex.findall(data)
            restr = "<span class="t5">发布时间</span>([sS]*?)<!--列表表格 END-->"  # 正则表达式,()只要括号内的数据
            regex = re.compile(restr, re.IGNORECASE)
            mylist = regex.findall(data)
            #print(mylist[0])#抓取整个表格
            #returnurllist=[]  #存储url,最终返回
            for line  in mylist:
                restr = '<a target="_blank" title=".*?" href="(.*?)"  onmousedown=".*?">[.sS]*?</a>'
                regex = re.compile(restr, re.IGNORECASE)
                geturllist = regex.findall(line)
            for getlist in geturllist:
                print(getlist)
            return geturllist
        except:
            return ""
    
    def getworkinfo(url):
        headers={"User-Agent":"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
        request=urllib.request.Request(url,headers=headers)  # build the request
        # a specific header can also be added or changed via Request.add_header()
        request.add_header("Connection", "keep-alive")  # keep the connection alive
        try:
            response=urllib.request.urlopen(request)
            data=response.read().decode("gbk","ignore")  # fetch the detail page, ignoring undecodable bytes
            restr = r'<div class="bmsg job_msg inbox">([\s\S]*?).*?\s<div class="mt10">'  # () captures only the job-description block
            regex = re.compile(restr, re.IGNORECASE)
            mylist = regex.findall(data)
            if len(mylist) > 0:
                datas = mylist[0].strip().replace("</p>", "").replace("<p>", "")
                return datas
            else:
                return ""
        except:
            return ""
    
    savefilepath="workinfo.txt"
    savefile=open(savefilepath,"wb")
    urllist=geturllistsh("python")  #抓取urllist
    for url in urllist:
        templist=downloadgeturllist(url)
        for tempurl in templist:
            workstr=getworkinfo(tempurl)
            print(workstr)
            savefile.write((workstr+"
    ").encode("utf-8"))
    
    savefile.close()
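
      The commented-out lines in geturllistsh refer to a getnumberbyname helper that is not included in the listing. Below is a minimal, hypothetical sketch of such a helper, assuming the total result count is rendered on the first list page inside the same title">…</span markup the function already matches; the helper name, the chromedriver path, and the digit-only regex are assumptions, not part of the original post:

    import re
    import selenium.webdriver

    # Hypothetical reconstruction of the missing helper, for illustration only.
    def getnumberbyname(searchname):
        url = ("https://search.51job.com/list/020000,000000,0000,00,9,99,"
               + searchname + ",2,1.html")
        driver = selenium.webdriver.Chrome(
            executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver")
        driver.get(url)                   # open the first result page
        pagesource = driver.page_source   # rendered HTML
        driver.close()
        # assumed markup: ...title">1731</span>... holding the total result count
        mylist = re.findall(r'title">(\d+)</span', pagesource)
        return mylist[0] if mylist else "0"  # returned as a string, matching the eval() in the commented code

      With a helper like this, the commented-out page-count logic could replace the hard-coded range(1,130) in geturllistsh.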
  • Original post: https://www.cnblogs.com/my-global/p/12447356.html