zoukankan      html  css  js  c++  java
  • 爬虫爬取 OJ 用户头像

    import requests
    import Queue
    import urllib
    import urllib2
    import re
    import requests
    # Image URLs that getImg has already handled, so each is fetched only once.
    alreadyImg = set()

    # A single session is shared by every request below; presumably this is
    # what carries the login state over to the crawled pages.
    s = requests.session()
    # NOTE(review): the account name and password are hard-coded in the
    # source -- consider loading them from the environment instead.
    s.post(
        "http://acm.hrbust.edu.cn/index.php?m=User&a=login",
        data={"user_name": "1304020306", "password": "123456"},
    )

    # Fetch one user-info page and dump its HTML (looks like a manual check
    # that the login worked -- TODO confirm it is still wanted).
    r = s.get("http://acm.hrbust.edu.cn/index.php?m=User&a=userInfo&user_name=1404020214")
    print(r.text)

    # Hrefs that putUrl has already queued.
    already = set()
    # The crawl starts from the rating ranklist page.
    url = "http://acm.hrbust.edu.cn/index.php?m=Ranklist&a=showRatingrank"
    # FIFO of pages still to visit; a non-positive maxsize means "unbounded".
    urllist = Queue.Queue(maxsize=-1)
    urllist.put(url)
    # Captures the target of every anchor on a page.
    reg = r'a href="(.+?)"'
    httpre = re.compile(reg)
    # Captures the avatar image of a user page.  Group 1 is the image URL and
    # group 2 its extension.  The dot before the extension is escaped so that
    # only real ".png" / ".jpg" suffixes match (an unescaped "." would accept
    # any character there, e.g. "foo/ajpg").
    reimg = r'img class="large_avatar" src="([^>]+?\.(png|jpg))>?"'
    imgre = re.compile(reimg)
    def putUrl(html):
        """Queue every not-yet-seen link found in ``html``.

        Each ``href`` matched by ``httpre`` is turned into an absolute URL
        (relative links are prefixed with the site root) and, if it has not
        been queued before, recorded in ``already`` and appended to
        ``urllist``.

        :param html: page source to scan for anchors.
        """
        for url in httpre.findall(html):
            # Only a real scheme prefix makes a link absolute; a relative
            # link that merely contains "http" somewhere still needs the root.
            if url.startswith(('http://', 'https://')):
                realurl = url
            else:
                realurl = "http://acm.hrbust.edu.cn/" + url
            # Deduplicate on the resolved URL so the relative and absolute
            # spellings of the same page are queued only once.
            if realurl not in already:
                already.add(realurl)
                urllist.put(realurl)
    # Running index used to name the saved files; it only advances after a
    # successful download.
    x = 0
    def getImg(html):
        """Download every new avatar image referenced in ``html``.

        Image URLs are taken from group 1 of ``imgre``.  URLs already present
        in ``alreadyImg`` are skipped; new ones are recorded there, printed,
        made absolute if necessary and saved as ``C:/<x>.jpg``.

        :param html: page source to scan for avatar ``<img>`` tags.
        """
        global x
        for match in imgre.findall(html):
            # findall yields (url, extension) tuples because the pattern has
            # two groups; only the URL is needed.
            Img = match[0]
            if Img in alreadyImg:
                continue
            alreadyImg.add(Img)
            print(Img)
            # Check the scheme prefix rather than just the first letter, so a
            # relative path such as "head/a.png" still gets the site root.
            if not Img.startswith(('http://', 'https://')):
                Img = "http://acm.hrbust.edu.cn/" + Img
            # NOTE(review): the file is always saved with a ".jpg" suffix,
            # even when the matched image is a PNG.
            try:
                urllib.urlretrieve(Img, 'C:/%s.jpg' % x)
            except IOError as e:
                # urlretrieve reports failures as IOError (urllib2.URLError is
                # a subclass of it); skip this image and keep crawling.
                print("failed to download %s: %s" % (Img, e))
            else:
                x += 1
    # Main crawl loop: visit queued pages until the queue is drained.
    while not urllist.empty():
        url = urllist.get()
        print(url)
        try:
            r = s.get(url)
            html = r.text
            # Only ranklist pages are expanded into further links; every
            # fetched page is scanned for avatar images.
            if "index.php?m=Ranklist&a=showRatingrank" in url:
                putUrl(html)
            getImg(html)
        except (requests.RequestException, IOError) as e:
            # requests signals network failures with RequestException, and
            # urllib download errors surface as IOError; neither is a
            # urllib2.URLError.  Report the page and move on to the next one.
            print("skipping %s: %s" % (url, e))
    View Code
  • 相关阅读:
    datetime模块
    time模块
    shelve模块
    json&pickle 序列化
    re正则
    MQ常用命令
    MQ for linux安装与卸载【转】
    Linux下安装Oracle11g服务器【转】
    PLSQL_数据泵Datapump导入导出数据IMPDP / EXPDP(概念)(Oracle数据导入导出工具)[转]
    [LeetCode]:116:Populating Next Right Pointers in Each Node
  • 原文地址:https://www.cnblogs.com/icodefive/p/5440455.html
Copyright © 2011-2022 走看看