zoukankan      html  css  js  c++  java
  • 360的搜索指数排行榜

    • 百度指数用图片处理过,太难抓取
    • 爬取代码是 python 3.x
    #!/usr/bin/env python
    #-*- encoding: utf-8 -*-
    # refer to http://blog.csdn.net/wangtaoking1/article/details/18308635 
    import http.cookiejar
    from urllib import request
    from urllib.parse import quote
    
    HTTP_PROXY = '10.13.61.118:6666'
    
    def getOpener(head, enable_proxy=False):
        """Build a urllib opener with cookie support and preset request headers.

        Args:
            head: Mapping of header name to header value; installed as the
                opener's default headers for every request.
            enable_proxy: When true, http and https traffic is routed through
                the module-level HTTP_PROXY address.

        Returns:
            The configured ``urllib.request.OpenerDirector``.
        """
        # A fresh in-memory jar so cookies set by one response are sent back
        # on later requests made through this opener.
        handlers = [request.HTTPCookieProcessor(http.cookiejar.CookieJar())]
        if enable_proxy:
            # Only touch HTTP_PROXY when the proxy is actually requested.
            handlers.append(
                request.ProxyHandler({"http": HTTP_PROXY, "https": HTTP_PROXY})
            )
        # debuglevel=1 prints the HTTP exchange to stdout; use 0 to silence it.
        handlers.append(request.HTTPHandler(debuglevel=1))
        opener = request.build_opener(*handlers)
        opener.addheaders = list(head.items())
        return opener
        
        
    import os,json  
    def main(school="江苏经贸职业技术学院"):
        """Fetch the 360 search index for *school* and format it as ``name=value``.

        Queries the ``overviewJson`` endpoint of index.so.com for the given
        keyword with the area set to "全国", and reads ``month_index`` from the
        first entry of the ``data`` list in the JSON reply.

        Args:
            school: Keyword to look up.

        Returns:
            The string ``"<school>=<index>"``. The index is -1 when the reply
            is not valid JSON, does not have the expected structure, or the
            value is not an integer.

        Raises:
            Network errors from ``opener.open`` and decoding errors of the
            response body are not caught here and propagate to the caller.
        """
        # Local imports: only this function needs them for compressed replies.
        import gzip
        import zlib

        header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language':'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
        'Accept-Encoding':'gzip, deflate, sdcn',
        }
        opener = getOpener(header)
        url = "http://index.so.com/index.php?a=overviewJson&q=%s&area=%s" % (quote(school),quote("全国"))
        with opener.open(url) as resp:
            raw = resp.read()
            content_encoding = (resp.headers.get('Content-Encoding') or '').strip().lower()

        # urllib does not undo Content-Encoding by itself; since the request
        # advertises gzip/deflate, the body must be decompressed before decoding.
        if content_encoding == 'gzip':
            raw = gzip.decompress(raw)
        elif content_encoding == 'deflate':
            try:
                raw = zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib header.
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
        data = raw.decode('utf-8')

        try:
            index = json.loads(data).get('data')[0]['data']['month_index']
        except (ValueError, AttributeError, TypeError, IndexError, KeyError):
            # Invalid JSON or an unexpected reply structure: report "no data".
            index = -1
        return ('%s=%d' % (school, index if isinstance(index,int) else -1 ))
    
    import time    
    if __name__ == '__main__':
        # Read one school name per line from school_list.txt, look up each one
        # with main(), and write the "name=index" results to index.txt.
        # The input file is opened with the platform default encoding.
        with open("school_list.txt") as src:
            schools = src.read().splitlines()

        with open("index.txt", 'w', encoding='utf-8') as fp:
            for line in schools:
                name = line.strip()
                if not name:
                    # Blank lines would only produce a pointless empty query.
                    continue
                # Pause between requests to avoid hammering the server.
                time.sleep(1)
                fp.write(main(name) + '\n')
                # Flush per line so partial results survive an interrupted run.
                fp.flush()
    

    查看排行

     cat index.txt | sort -t= -k2 -nr | less
    

    --- 她说, 她是仙,她不是神
  • 相关阅读:
    linux下mysql忘记密码
    ORACLE 锁表处理,解锁释放session
    二.hadoop环境搭建
    并行与并发的理解
    PgSQl临时表的创建
    UNION types numeric and text cannot be matched
    Excel 自定义关闭按钮
    Excel关闭事件
    VBA 获得绝对地址控制焦点的设置
    Excel TargetRange.Validation为空的
  • 原文地址:https://www.cnblogs.com/bregman/p/5480789.html
Copyright © 2011-2022 走看看