zoukankan      html  css  js  c++  java
  • 从urllib和urllib2基础到一个简单抓取网页图片的小爬虫

    urllib最常用的两大功能(个人理解urllib用于辅助urllib2)

    1.urllib.urlopen()

    2. urllib.urlencode()   #适当的编码,可用于后面的post提交数据

       

    import urllib
    Dict = {'name' : 'Michael Foord',
              'location' : 'Northampton',
              'language' : 'Python'}
    print urllib.urlencode(Dict)  
    

     urllib2常用的函数

    1.最基本的打开读取一个网页

    import urllib2
    response = urllib2.urlopen('http://www.baidu.com/')
    html = response.read()
    

    2.地址创建一个Request对象

    req = urllib2.Request('http://www.baidu.com/')
    response = urllib2.urlopen(req)
    the_page = response.read()
    

     3.Data数据利用post方式提交

    values={'name' : 'Michael Foord',
              'location' : 'Northampton',
              'language' : 'Python'}
    data = urllib.urlencode(values)
    request = urllib2.Request(url,data)
    #request= urllib2.Request(url, data, headers)  Request对象共有三个参数
    response = urllib2.urlopen(request)
    print response.read()
    

     4.在 HTTP Request 中加入特定的 Header

    import urllib2
    request = urllib2.Request('http://www.baidu.com/')
    request.add_header('User-Agent', 'fake-client')
    response = urllib2.urlopen(request)
    print response.read()  
    

     5.Cookie

    import urllib2
    import cookielib
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    response = opener.open('http://www.baidu.com')
    for item in cookie:
        print 'Name = '+item.name
        print 'Value = '+item.value 
    

     6.得到 HTTP 的返回码

    import urllib2
    try:
        response = urllib2.urlopen('http://bbs.csdn.net/why')
    except urllib2.HTTPError, e:
        print e.code 
    

     7.Timeout 设置

    import urllib2
    response = urllib2.urlopen('http://www.baidu.com/', timeout=10)  
    

     8.Redirect动作

    import urllib2
    my_url = 'http://www.google.cn'
    response = urllib2.urlopen(my_url)
    redirected = response.geturl() == my_url
    print redirected
    my_url = 'http://rrurl.cn/b1UZuP'
    response = urllib2.urlopen(my_url)
    redirected = response.geturl() == my_url
    print redirected 
    

    9.使用 HTTP 的 PUT 和 DELETE 方法

    import urllib2
    request = urllib2.Request(uri, data=data)
    request.get_method = lambda: 'PUT' # or 'DELETE'
    response = urllib2.urlopen(request)  
    

    10.Debug Log

    import urllib2
    httpHandler = urllib2.HTTPHandler(debuglevel=1)
    httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
    opener = urllib2.build_opener(httpHandler, httpsHandler)
    urllib2.install_opener(opener)
    response = urllib2.urlopen('http://www.google.com')  
    

     11.表单的处理

    # -*- coding: utf-8 -*-
    import urllib
    import urllib2
    postdata=urllib.urlencode({
        'username':'汪小光',
        'password':'why888',
        'continueURI':'http://www.verycd.com/',
        'fk':'',
        'login_submit':'登录'
    })
    req = urllib2.Request(
        url = 'http://secure.verycd.com/signin',
        data = postdata
    )
    result = urllib2.urlopen(req)
    print result.read()
    

     最后附上一段抓取某网站妹子图片的代码

    import urllib
    import urllib2
    import os
    
    
    def url_open(url):
        # Fetch *url* and return the raw response body as a byte string.
        # A desktop-browser User-Agent is attached because the target site
        # rejects the default urllib2 agent.
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0')
        return urllib2.urlopen(request).read()
    
    
    def get_page(url):
        # Extract the current page number from the index page.  The number
        # appears right after the 'current-comment-page' marker; the +23
        # offset skips the marker plus the characters before the digits,
        # and the closing ']' terminates the number.
        page = url_open(url).decode('utf-8')
        start = page.find('current-comment-page') + 23
        end = page.find(']', start)
        return page[start:end]
    
    
    def find_imgs(url):
        # Scan the page for 'img src=' markers and collect every address
        # that ends in '.jpg' within 255 characters of the marker.
        page = url_open(url).decode('utf-8')
        addrs = []

        pos = page.find('img src=')
        while pos != -1:
            end = page.find('.jpg', pos, pos + 255)
            if end == -1:
                # No nearby .jpg — resume scanning just past this marker.
                end = pos + 9
            else:
                # Drop the 9-char marker prefix, keep the '.jpg' suffix.
                addrs.append(page[pos + 9:end + 4])
            pos = page.find('img src=', end)

        return addrs
    
    
    def save_imgs(folder, img_addrs):
        # Download every address into the current working directory.
        # NOTE: *folder* itself is unused here — the caller (download_mm)
        # has already chdir'ed into it before calling us.
        for addr in img_addrs:
            name = addr.split('/')[-1]
            with open(name, 'wb') as out:
                out.write(url_open(addr))
    
    
    def download_mm(folder='OOXX', pages=10):
        """Download images from the newest *pages* index pages into *folder*.

        Creates *folder* (raises OSError if it already exists) and chdirs
        into it, so every image is saved there.  Starts from the current
        page number scraped off the index and walks backwards one page at
        a time.
        """
        os.mkdir(folder)
        os.chdir(folder)

        url = "http://jandan.net/ooxx/"
        page_num = int(get_page(url))

        for i in range(pages):
            # BUG FIX: the original did `page_num -= i` inside the loop,
            # which visits pages N, N-1, N-3, N-6, ... (triangular gaps)
            # instead of the intended consecutive N, N-1, N-2, ...
            page_url = url + 'page-' + str(page_num - i) + '#comments'
            img_addrs = find_imgs(page_url)
            save_imgs(folder, img_addrs)

    if __name__ == '__main__':
        download_mm()
    
  • 相关阅读:
    5.4 省选模拟赛 修改 线段树优化dp 线段树上二分
    一本通 高手训练 1782 分层图 状压dp
    luogu P3830 [SHOI2012]随机树 期望 dp
    5.2 省选模拟赛 或许 线型基
    luogu P4562 [JXOI2018]游戏 组合数学
    一本通 高手训练 1781 死亡之树 状态压缩dp
    luogu P4726 【模板】多项式指数函数 多项式 exp 牛顿迭代 泰勒展开
    4.28 省选模拟赛 负环 倍增 矩阵乘法 dp
    HDU 1756 Cupid's Arrow 计算几何 判断一个点是否在多边形内
    一本通 高手训练 1763 简单树 可持久化线段树 树链刨分 标记永久化
  • 原文地址:https://www.cnblogs.com/Dleo/p/5503752.html
Copyright © 2011-2022 走看看