zoukankan      html  css  js  c++  java
  • 360的搜索指数排行榜

    • 百度指数的数值经过图片化处理,难以抓取,因此改用 360 指数
    • 以下爬取代码基于 Python 3.x
    #!/usr/bin/env python
    #-*- encoding: utf-8 -*-
    # refer to http://blog.csdn.net/wangtaoking1/article/details/18308635 
    import http.cookiejar
    from urllib import request
    from urllib.parse import quote
    
    HTTP_PROXY = '10.13.61.118:6666'
    
    def getOpener(head, enable_proxy=False):
        """Build a urllib opener with cookie support and fixed default headers.

        Args:
            head: Mapping of header name -> value. Its items become the
                opener's default headers (``opener.addheaders``).
            enable_proxy: When true, add a proxy handler that routes both
                http and https traffic through the module-level ``HTTP_PROXY``.

        Returns:
            The configured ``urllib.request.OpenerDirector``.
        """
        # An in-memory cookie jar so cookies persist across requests made
        # through this opener.
        handlers = [request.HTTPCookieProcessor(http.cookiejar.CookieJar())]
        if enable_proxy:
            # Only built on demand, so HTTP_PROXY is not touched otherwise.
            handlers.append(
                request.ProxyHandler({"http": HTTP_PROXY, "https": HTTP_PROXY})
            )
        # debuglevel=1 turns on http.client's debug output for plain-HTTP
        # connections; set it to 0 to silence.
        handlers.append(request.HTTPHandler(debuglevel=1))

        opener = request.build_opener(*handlers)
        opener.addheaders = list(head.items())
        return opener
        
        
    import os,json  
    def main(school="江苏经贸职业技术学院"):
        """Fetch the 360 search-index ``month_index`` value for one keyword.

        Args:
            school: Keyword to look up (a school name by default).

        Returns:
            The string ``"<school>=<index>"``. The index is -1 when the
            response cannot be parsed or ``month_index`` is not an int.

        Network errors raised while opening or reading the URL propagate to
        the caller.
        """
        # No Accept-Encoding header on purpose: urllib does not transparently
        # decompress gzip/deflate bodies, so advertising them would make the
        # UTF-8 decode below fail whenever the server compresses the reply.
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
        }
        opener = getOpener(header)
        # "全国" ("nationwide") is passed as the area parameter.
        url = "http://index.so.com/index.php?a=overviewJson&q=%s&area=%s" % (
            quote(school), quote("全国"))
        # Close the response deterministically instead of leaking the socket.
        with opener.open(url) as resp:
            data = resp.read().decode('utf-8')
        try:
            index = json.loads(data).get('data')[0]['data']['month_index']
        except (ValueError, LookupError, TypeError, AttributeError):
            # Invalid JSON (ValueError), a missing key/element (LookupError),
            # or an unexpected shape such as 'data' being None or the payload
            # not being a dict (TypeError / AttributeError).
            index = -1
        if not isinstance(index, int):
            index = -1
        return '%s=%d' % (school, index)
    
    import time    
    if __name__ == '__main__':
        # Read one keyword per line from school_list.txt and write one
        # "<keyword>=<index>" line per keyword to index.txt.
        # The input is read as UTF-8 to match the encoding used for the output,
        # rather than depending on the platform default.
        with open("school_list.txt", encoding='utf-8') as src:
            schools = src.read().splitlines()
        with open("index.txt", 'w', encoding='utf-8') as fp:
            for line in schools:
                if not line.strip():
                    # Skip blank lines rather than querying an empty keyword.
                    continue
                time.sleep(1)  # pause one second before each request
                fp.write(main(line) + '\n')
                # Flush per line so partial results survive an interrupted run.
                fp.flush()
    

    查看排行

     cat index.txt | sort -t= -k2 -nr | less
    

    --- 她说, 她是仙,她不是神
  • 相关阅读:
    第三章:Hadoop简介及配置Hadoop-1.2.1,hbase-0.94.13集群
    maven环境的搭建,lemon-OA办公系统的搭建
    如何打开mo文件并修改 PoEdit
    安装Elastix-2.4版本
    RabbitMQ安装
    Yum编译安装Error Downloading Packages报错
    linux:ping不通www.baidu.com
    tar命令解压缩出错
    PV、UV
    使用存储过程创建数据
  • 原文地址:https://www.cnblogs.com/bregman/p/5480789.html
Copyright © 2011-2022 走看看