zoukankan      html  css  js  c++  java
  • Python 2.7 爬取51job 全国java岗位

     

     一页有50条数据,一共2000页,分页通过 GET 请求 URL 中的页码实现

    #!/usr/bin/python
    # encoding: utf-8
    import requests
    import threading
    from lxml import etree
    import sys
    import os
    import datetime
    import re
    import random
    import time
    
    # Python 2 only: sys.setdefaultencoding is removed from the sys module at
    # interpreter startup, so the module is reloaded to get it back.
    reload(sys)
    
    # Make implicit str <-> unicode conversions use UTF-8 instead of ASCII, so
    # the Chinese text scraped below can be concatenated and written as-is.
    sys.setdefaultencoding('utf-8')
    
    # 定义写入日志的方法
    def log(context):
        """Append one message line to the crawl log file ``./log/log.txt``.

        Args:
            context: Message text to record. A newline is appended to it.

        Raises:
            IOError/OSError: If the log directory or file cannot be created
                or written.
        """
        txtName = "./log/log.txt"
        log_dir = os.path.dirname(txtName)
        # Create the log directory on first use so that a fresh working
        # directory does not make the very first log call fail.
        if not os.path.isdir(log_dir):
            os.makedirs(log_dir)
        # open() replaces the Python-2-only file() builtin, and the with-block
        # closes the handle even when the write raises.
        with open(txtName, "a+") as f:
            f.write(context + "\n")
    
    def xin(start_page=1, end_page=2000):
        """Crawl 51job "java" search-result pages and append the rows to ./file/java.txt.

        Each result row is written as one line of the form
        ``name=============company=============city=============salary=============url``.
        After every page a progress line is written through ``log`` and the
        crawler sleeps for a random 1-5 seconds before requesting the next page.

        Args:
            start_page: First result page to fetch (1-based, inclusive).
            end_page: Last result page to fetch (inclusive). The default of
                2000 covers every page; a narrower range lets several machines
                split the crawl between them.

        Raises:
            requests.exceptions.RequestException: If a page cannot be fetched
                (including the 30 second timeout).
            IOError/OSError: If the output directory or file cannot be written.
        """
        # Request headers
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # "br" is deliberately not advertised: requests only decodes brotli
            # when an optional brotli package is installed; otherwise
            # response.content would stay compressed and the GBK decode below
            # would produce garbage.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        url_prefix = "https://search.51job.com/list/000000,000000,0000,00,9,99,java,2,"
        url_suffix = ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        separator = "============="

        txtName = "./file/java.txt"
        out_dir = os.path.dirname(txtName)
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        # end_page is inclusive, so the last page (2000 by default) is fetched too.
        for count in range(start_page, end_page + 1):
            page_url = url_prefix + str(count) + url_suffix
            # A timeout keeps one stalled connection from hanging the crawl forever.
            response = requests.get(page_url, headers=header, timeout=30)
            # The site serves GBK; "replace" keeps a single stray byte from
            # aborting the whole run.
            html = response.content.decode("gbk", "replace")
            selector = etree.HTML(html)
            contents = selector.xpath('//div[@class="dw_table"]/div[@class="el"]')

            log(""+str(count)+"页了--"+str(len(contents))+"条数据")

            records = []
            for eachlink in contents:
                company = _first(eachlink.xpath('span[@class="t2"]/a/text()'))
                job_url = _first(eachlink.xpath('p/span/a/@href'))
                name = _first(eachlink.xpath('p/span/a/text()'))
                city = _first(eachlink.xpath('span[@class="t3"]/text()'))
                # Some postings carry no salary; those are recorded as "0".
                key = _first(eachlink.xpath('span[@class="t4"]/text()'), "0")

                fields = [_squeeze(name), _squeeze(company), _squeeze(city), key, job_url]
                # One record per line so the file can be split back into rows.
                records.append(separator.join(fields) + "\n")

            # Open the output once per page instead of once per row; the
            # with-block closes it even if the write fails.
            if records:
                with open(txtName, "a+") as f:
                    f.write("".join(records))

            # Random pause between pages to avoid hammering the site.
            sui = random.randint(1, 5)
            log("休眠"+str(sui))
            time.sleep(sui)


    def _first(nodes, default=""):
        """Return the first XPath result in *nodes*, or *default* when there is none."""
        return nodes[0] if nodes else default


    def _squeeze(text):
        """Remove every whitespace character (spaces, tabs, line breaks) from *text*."""
        return "".join(text.split())
    
    # Start the crawl only when this file is executed as a script.
    if __name__=="__main__":
        xin()
    
       

     日志文件

    爬取的数据

    但是爬取的速度有点慢,

    于是乎采用了多线程爬取,

    但是51job 立刻就把IP段给封掉了,

    于是用了4台服务器,每台爬取500页数据,最后再合并到一起导入数据库中

    人生苦短,我用Python!!!

  • 相关阅读:
    Java精选笔记_EL表达式
    Java精选笔记_文件上传与下载
    Java精选笔记_Servlet事件监听器
    windows 下安装perl Tk 模块
    html 基础
    用grep 筛选fastq 序列
    php 统计fasta 序列长度和GC含量
    perl 截取 fastq文件
    Java_基础知识回顾
    Git_期末总结
  • 原文地址:https://www.cnblogs.com/wlphp/p/9571092.html
Copyright © 2011-2022 走看看