zoukankan      html  css  js  c++  java
  • 爬虫一之基本操作

    使用最普通的urllib来进行爬取

    get请求网页

    from urllib.request import urlopen

    # Simple GET request with a 1-second timeout.
    response = urlopen('http://baidu.com', timeout=1)
    # read() returns bytes, so decode explicitly as utf-8.
    print(response.read().decode('utf-8'))
    

    或者利用Request请求网页

    # urlopen must be imported too — the original snippet only imported Request.
    from urllib.request import Request, urlopen

    # Wrap the URL in a Request object, then open it.
    req = Request('http://baidu.com')
    response = urlopen(req)
    print(response.read().decode('utf-8'))
    

    响应的内容

    # Inspect the response (assumes `response` is an HTTPResponse from urlopen).
    print(response.status)                # HTTP status code, e.g. 200
    print(response.getheaders())          # all headers as a list of (name, value) pairs
    # getheaders() takes no arguments; a single header is read with getheader(name).
    print(response.getheader('Server'))
    

    post请求网页

    from urllib.parse import urlencode
    from urllib.request import urlopen

    # POST body must be bytes; urlencode builds the form-encoded payload.
    data = bytes(urlencode({'word': 'hello'}), encoding='utf8')
    # Original had a typo: 'http://baidu,com' (comma instead of dot).
    response = urlopen('http://baidu.com', data=data)
    

    改变Header

    from urllib import request
    from urllib.parse import urlencode
    from urllib.request import Request

    # POST with custom headers via a Request object.
    url = 'http://httpbin.org/post'
    headers = {
        # Original was missing the opening quote on this value (SyntaxError).
        'User-Agent': 'Mozilla/4.0 (compatible;MSIE 5.5;windows NT)',
        'Host': 'httpbin.org'}

    # Renamed from `dict` — don't shadow the builtin.
    params = {'name': 'Germey'}
    data = bytes(urlencode(params), encoding='utf8')
    req = Request(url=url, data=data, headers=headers, method='POST')
    response = request.urlopen(req)
    print(response.read().decode('utf-8'))
    

    handler

    代理

    from urllib.request import ProxyHandler, build_opener

    # Route requests through a local proxy.
    # This only works when a proxy is actually listening on 127.0.0.1:9743.
    proxies = {
        'http': 'http://127.0.0.1:9743',
        'https': 'https://127.0.0.1:9743',
    }
    opener = build_opener(ProxyHandler(proxies))
    response = opener.open('http://www.baidu.com')
    
    import http.cookiejar
    import urllib.request

    # Saving cookies to a file requires one of the file-backed subclasses;
    # the plain CookieJar (used in the original) has no save() method, and
    # save() without arguments needs the filename bound at construction time.
    filename = 'cookie.txt'
    cookie = http.cookiejar.LWPCookieJar(filename)
    # alternative file format: http.cookiejar.MozillaCookieJar(filename)

    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True, ignore_expires=True)

    # Reload the cookies; the jar class must match the format used to save.
    cookie = http.cookiejar.LWPCookieJar()
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    

    异常处理

    from urllib import request, error

    try:
        # Original had `response.urlopen(...)` (wrong name) and a bare
        # 'qwedqwwd', which raises an uncaught ValueError ("unknown url type")
        # instead of the URLError this example is meant to demonstrate.
        response = request.urlopen('http://qwedqwwd')
    except error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print(e.reason, e.code, e.headers, sep='\n')
    except error.URLError as e:
        print(e.reason)
    else:
        print('Request Successfully')
    
  • 相关阅读:
    不知道微博的计时机制
    Edsger W. Dijkstra
    最靠谱百度网盘下载太慢的解决办法
    这个拒绝成为比尔盖茨的“互联网之父”,今天拿下了计算机届的诺贝尔奖!
    老罗语录
    如何利用互联网免费学习英语
    wps怎么制作一个红色的电子印章?
    安防摄像头视频流媒体服务器EasyDSS如何配置接入考场监控系统?
    互联网流媒体直播点播平台报ioutil.WriteFile错误导致文件只读如何处理?
    互联网直播点播平台go语言搭建重定向和反向代理的区别及使用
  • 原文地址:https://www.cnblogs.com/guiguiguoguo/p/11176661.html
Copyright © 2011-2022 走看看