zoukankan      html  css  js  c++  java
  • 从urllib和urllib2基础到一个简单抓取网页图片的小爬虫

    urllib最常用的两大功能(个人理解urllib用于辅助urllib2)

    1.urllib.urlopen()

    2. urllib.urlencode()   #适当的编码,可用于后面的post提交数据

       

    import urllib
    Dict = {'name' : 'Michael Foord',
              'location' : 'Northampton',
              'language' : 'Python'}
    print urllib.urlencode(Dict)  
    

     urllib2常用的函数

    1.最基本的打开读取一个网页

    import urllib2
    # Simplest case: open a URL and read back the whole response body.
    resp = urllib2.urlopen('http://www.baidu.com/')
    html = resp.read()
    

    2.通过URL地址创建一个Request对象

    # Wrap the URL in an explicit Request object, then open that instead.
    req = urllib2.Request('http://www.baidu.com/')
    the_page = urllib2.urlopen(req).read()
    

     3.利用post方式提交Data数据

    value={'name' : 'Michael Foord',
              'location' : 'Northampton',
              'language' : 'Python'}
    data = urllib.urlencode(values)
    request = urllib2.Request(url,data)
    #request= urllib2.Request(url, data, headers)  Request对象共有三个参数
    response = urllib2.urlopen(request)
    print response.read()
    

     4.在 HTTP Request 中加入特定的 Header

    import urllib2
    request = urllib2.Request('http://www.baidu.com/')
    request.add_header('User-Agent', 'fake-client')
    response = urllib2.urlopen(request)
    print response.read()  
    

     5.Cookie

    import urllib2
    import cookielib
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    response = opener.open('http://www.baidu.com')
    for item in cookie:
        print 'Name = '+item.name
        print 'Value = '+item.value 
    

     6.得到 HTTP 的返回码

    import urllib2
    try:
        response = urllib2.urlopen('http://bbs.csdn.net/why')
    except urllib2.HTTPError, e:
        print e.code 
    

     7.Timeout 设置

    import urllib2
    # The timeout keyword (in seconds) bounds how long the call may block.
    resp = urllib2.urlopen('http://www.baidu.com/', timeout=10)
    

     8.Redirect动作

    import urllib2
    my_url = 'http://www.google.cn'
    response = urllib2.urlopen(my_url)
    redirected = response.geturl() == my_url
    print redirected
    my_url = 'http://rrurl.cn/b1UZuP'
    response = urllib2.urlopen(my_url)
    redirected = response.geturl() == my_url
    print redirected 
    

    9.使用 HTTP 的 PUT 和 DELETE 方法

    import urllib2
    # Override get_method so the request goes out as PUT instead of the
    # default GET/POST.
    req = urllib2.Request(uri, data=data)
    req.get_method = lambda: 'PUT' # or 'DELETE'
    response = urllib2.urlopen(req)
    

    10.Debug Log

    import urllib2
    # debuglevel=1 makes the handlers print the raw HTTP(S) traffic,
    # which is handy when debugging requests.
    http_handler = urllib2.HTTPHandler(debuglevel=1)
    https_handler = urllib2.HTTPSHandler(debuglevel=1)
    urllib2.install_opener(urllib2.build_opener(http_handler, https_handler))
    response = urllib2.urlopen('http://www.google.com')
    

     11.表单的处理

    # -*- coding: utf-8 -*-
    import urllib
    import urllib2
    postdata=urllib.urlencode({
        'username':'汪小光',
        'password':'why888',
        'continueURI':'http://www.verycd.com/',
        'fk':'',
        'login_submit':'登录'
    })
    req = urllib2.Request(
        url = 'http://secure.verycd.com/signin',
        data = postdata
    )
    result = urllib2.urlopen(req)
    print result.read()
    

     最后附上一段抓取某网站妹子图片的代码

    import urllib
    import urllib2
    import os
    
    
    def url_open(url):
        """Fetch `url` with a browser-like User-Agent and return the raw body bytes."""
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0')
        return urllib2.urlopen(request).read()
    
    
    def get_page(url):
        """Scrape the current comment-page number out of the page's HTML.

        Looks for the 'current-comment-page' marker, skips 23 characters
        past it (over the markup up to the opening bracket), and returns
        everything up to the closing ']' as a string.
        """
        page = url_open(url).decode('utf-8')
        start = page.find('current-comment-page') + 23
        end = page.find(']', start)
        return page[start:end]
    
    
    def find_imgs(url):
        """Return every .jpg address referenced by an `img src=` tag on the page.

        Scans the raw HTML with string searches; each address is taken from
        just past the opening quote (offset +9) up to and including '.jpg'.
        """
        page = url_open(url).decode('utf-8')
        addrs = []
        pos = page.find('img src=')
        while pos != -1:
            # Only look up to 255 chars ahead so an unrelated later '.jpg'
            # can't be matched to this tag.
            end = page.find('.jpg', pos, pos + 255)
            if end == -1:
                # No jpg in range: step past this tag and keep scanning.
                end = pos + 9
            else:
                addrs.append(page[pos + 9:end + 4])
            pos = page.find('img src=', end)
        return addrs
    
    
    def save_imgs(folder, img_addrs):
        """Download each address in `img_addrs` into the current directory.

        `folder` is kept for interface compatibility but unused: the caller
        has already chdir'd into the target directory.
        """
        for addr in img_addrs:
            name = addr.split('/')[-1]
            with open(name, 'wb') as out:
                out.write(url_open(addr))
    
    
    def download_mm(folder='OOXX', pages=10):
        """Download images from the newest `pages` comment pages into `folder`.

        folder -- directory to create (if missing) and chdir into
        pages  -- how many consecutive pages to walk back from the newest
        """
        # Fix: plain os.mkdir crashed with OSError when the folder already
        # existed (e.g. on a re-run).
        if not os.path.isdir(folder):
            os.mkdir(folder)
        os.chdir(folder)

        url = "http://jandan.net/ooxx/"
        page_num = int(get_page(url))  # newest page number scraped from the site

        for i in range(pages):
            # Fix: the original did `page_num -= i` inside the loop, which
            # steps back in triangular increments (0, 1, 3, 6, ...) and so
            # skips pages. Walk back exactly one page per iteration instead.
            page_url = url + 'page-' + str(page_num - i) + '#comments'
            img_addrs = find_imgs(page_url)
            save_imgs(folder, img_addrs)

    if __name__ == '__main__':
        download_mm()
    
  • 相关阅读:
    做统计图的好工具
    QueryBuildRange中的表达式
    四种方式话Equal
    QueryBuildRange的空值
    GetHashCode()初探
    X++中的字符串操作函数
    寻找缺陷的方法
    字程序级别的重构
    代码大全的方向
    多线程啊
  • 原文地址:https://www.cnblogs.com/Dleo/p/5503752.html
Copyright © 2011-2022 走看看