zoukankan      html  css  js  c++  java
  • 爬取爱笔智能招聘职位

     1 import urllib.request
     2 import urllib.parse
     3 import requests
     4 from urllib.parse import urlencode
     5 from pyquery import PyQuery as pq
     6 from pymongo import MongoClient
     7 import json
     8 
     9 
    10 url = 'http://aibee.com/cn/joinus.aspx?action=jobinfo'
    11 
    12 headers = {
    13     'Host': 'aibee.com',
    14     'Referer': 'http://aibee.com/cn/joinus.aspx',
    15     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    16     'X-Requested-With': 'XMLHttpRequest',
    17 } 
    18 
    19 client = MongoClient()
    20 db = client['aibee']
    21 collection = db['aibee']
    22 max_id = 50
    23 
    24 def get_page(id):  
    25 
    26     formData = {  
    27             'id': id,
    28         }  
    29      
    30     #将str类型转换为bytes类型  
    31     data = urllib.parse.urlencode(formData).encode("utf-8")   
    32     request = urllib.request.Request(url, data=data, headers=headers)  
    33     response = urllib.request.urlopen(request)
    34     #print(response.read().decode('utf-8'))
    35     result = response.read().decode('utf-8')
    36     #print(result)
    37     #print(len(result))
    38     #print(id)
    39 
    40     if len(result)!=12:
    41         
    42        # print(id)
    43         content=result.replace(",",":")
    44     
    45         id=content.split(':')[2].strip()
    46         #print(id)
    47         title=content.split(':')[4].strip()
    48         #print(title)
    49         zhize=content.split(':')[6].strip().replace("	","").replace("<ul>","").replace("<li>","").replace("</li>","").replace("</ul>","").replace("\t","")
    50         #print(zhize)
    51         yaoqiu=content.split(':')[8].strip().replace("	","").replace("<ul>","").replace("<li>","").replace("</li>","").replace("</ul>","").replace("\t","")
    52         #print(yaoqiu)
    53         dtt=content.split(':')[12].strip()[:-3]
    54         #print(dtt)
    55         emailaddr=content.split(':')[16].strip()
    56         #print(emailaddr)
    57 
    58         
    59         aibee={
    60                 'id':id,
    61                 'title':title,
    62                 'zhize':zhize,
    63                 'yaoqiu':yaoqiu,
    64                 'dtt': dtt,
    65                 'emailaddr': emailaddr
    66             }
    67         #print(aibee) 
    68     else:
    69         aibee=0
    70 
    71     return aibee
    72     
    73 
    74 def write_to_file(content):
    75     with open('aibee.json','a',encoding='utf-8') as f:
    76         f.write(json.dumps(content,ensure_ascii=False)+'
    ')
    77         f.close()
    78 
    79 
    80 def save_to_mongo(result):
    81     if collection.insert(result):
    82         print('Saved to Mongo')        
    83 
    84 
    85 if __name__ == "__main__":  
    86     
    87     for id in range(1, max_id + 1):
    88         #get_page(id)
    89         content = get_page(id)
    90         if content!=0:
    91             print(content)
    92             write_to_file(content)
    93             save_to_mongo(content)
    94         

  • 相关阅读:
    Data Load Performance Optimization
    SAPBW数据仓库增量更新(转载)
    BW数据源深入研究
    SAP BW权限
    利用HTTP协议的特性进行拒绝服务攻击的一些构思
    Python自省(反射)指南 转自http://www.cnblogs.com/huxi/archive/2011/01/02/1924317.html
    交换网络中的sniffer讨论>基于交换网络的ARP spoofing sniffer
    Windows中使用精确计时器
    HTTP POST和GET的区别
    HTTP 状态代码 错误列表
  • 原文地址:https://www.cnblogs.com/wanglinjie/p/9222489.html
Copyright © 2011-2022 走看看