  • Python web scraping: three ways to send HTTP requests with urllib, urllib3, and requests

    1. A simple example of sending a request and reading the page content with the urllib.request module:

    # Import the module
    import urllib.request
    # Open the page to be scraped
    response = urllib.request.urlopen('http://www.baidu.com')
    # Read the page source
    html = response.read()
    # Print what was read
    print(html)

    Result:

    b'<!DOCTYPE html><!--STATUS OK--><html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><meta content="always" name="referrer"><meta name="theme-color" content="#2932e1"> ...
    ... (the rest of the Baidu homepage HTML is omitted)

    The example above fetches the Baidu home page with a GET request.
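
    The read() call above returns raw bytes (hence the b'...' prefix). Below is a minimal sketch of decoding the body to text and reading the status code and response headers, assuming the page is UTF-8 encoded:

    # Import the module
    import urllib.request
    # urlopen returns an http.client.HTTPResponse object
    response = urllib.request.urlopen('http://www.baidu.com')
    # The status code and headers are available on the response object
    print(response.status)
    print(response.getheader('Content-Type'))
    # read() returns bytes; decode them into a str (assuming UTF-8)
    html = response.read().decode('utf-8')
    print(html[:200])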

    The following example sends a POST request with the urllib.request module:

    # Import the modules
    import urllib.parse
    import urllib.request
    # URL-encode the form data, then convert it to UTF-8 bytes
    data = bytes(urllib.parse.urlencode({'word':'hello'}),encoding='utf-8')
    # Open the page to be scraped
    response = urllib.request.urlopen('http://httpbin.org/post',data=data)
    html = response.read()
    # Print what was read
    print(html)

    Result:

    b'{
      "args": {}, 
      "data": "", 
      "files": {}, 
      "form": {
        "word": "hello"
      }, 
      "headers": {
        "Accept-Encoding": "identity", 
        "Content-Length": "10", 
        "Content-Type": "application/x-www-form-urlencoded", 
        "Host": "httpbin.org", 
        "User-Agent": "Python-urllib/3.7", 
        "X-Amzn-Trace-Id": "Root=1-5ec3f607-00f717e823a5c268fe0e0be8"
      }, 
      "json": null, 
      "origin": "123.139.39.71", 
      "url": "http://httpbin.org/post"
    }
    '
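
    The echoed headers above show the default User-Agent ('Python-urllib/3.7'). If you need custom headers, a hedged sketch is to wrap the URL in a urllib.request.Request object; the User-Agent value below is only an illustrative placeholder:

    import urllib.parse
    import urllib.request
    # Form data, URL-encoded and converted to UTF-8 bytes
    data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf-8')
    # Build a Request object so custom headers can be attached
    req = urllib.request.Request(
        'http://httpbin.org/post',
        data=data,
        headers={'User-Agent': 'Mozilla/5.0'}  # example value, replace as needed
    )
    response = urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))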

    2. The urllib3 module

    Example of sending a network request with the urllib3 module:

    # Import the module
    import urllib3
    # Create a PoolManager object, which handles connection pooling and thread safety
    http = urllib3.PoolManager()
    # Send a request to the page to be scraped
    response = http.request('GET','https://www.baidu.com/')
    # Print what was read
    print(response.data)

    Result:

    b'<!DOCTYPE html><!--STATUS OK-->
    <html>
    <head>
    	<meta http-equiv="content-type" content="text/html;charset=utf-8">
    	<meta http-equiv="X-UA-Compatible" content="IE=Edge">
    	<link rel="dns-prefetch" href="//s1.bdstatic.com"/>
    	<link rel="dns-prefetch" href="//t1.baidu.com"/>
    	<link rel="dns-prefetch" href="//t2.baidu.com"/>
    	<link rel="dns-prefetch" href="//t3.baidu.com"/>
    	<link rel="dns-prefetch" href="//t10.baidu.com"/>
    	<link rel="dns-prefetch" href="//t11.baidu.com"/>
    	<link rel="dns-prefetch" href="//t12.baidu.com"/>
    	<link rel="dns-prefetch" href="//b1.bdstatic.com"/>
    	<title>\xe7\x99\xbe\xe5\xba\xa6\xe4\xb8\x80\xe4\xb8\x8b\xef\xbc\x8c\xe4\xbd\xa0\xe5\xb0\xb1\xe7\x9f\xa5\xe9\x81\x93</title>
    	<link href="https://ss1.bdstatic.com/5eN1bjq8AAUYm2zgoY3K/r/www/cache/static/home/css/index.css" rel="stylesheet" type="text/css" />
    	<!--[if lte IE 8]><style index="index" >#content{height:480px\9}#m{top:260px\9}</style><![endif]-->
    	<!--[if IE 8]><style index="index" >#u1 a.mnav,#u1 a.mnav:visited{font-family:simsun}</style><![endif]-->
    	<script>var hashMatch = document.location.href.match(/#+(.*wd=[^&].+)/);if (hashMatch && hashMatch[0] && hashMatch[1]) {document.location.replace("http://"+location.host+"/s?"+hashMatch[1]);}
    ... (remaining output omitted)
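
    As with urllib, response.data is a bytes object. A small sketch of checking the status code, decoding the body and passing request headers (the User-Agent string below is purely illustrative):

    import urllib3
    http = urllib3.PoolManager()
    # Custom request headers go in the headers argument
    response = http.request(
        'GET',
        'https://www.baidu.com/',
        headers={'User-Agent': 'Mozilla/5.0'}  # illustrative value
    )
    # Status code and decoded body
    print(response.status)
    print(response.data.decode('utf-8')[:200])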

    A POST request with the urllib3 module:

    # Import the module
    import urllib3
    # Create a PoolManager object, which handles connection pooling and thread safety
    http = urllib3.PoolManager()
    # Send a request to the page to be scraped
    response = http.request('POST','http://httpbin.org/post',fields={'word':'hello'})
    # Print what was read
    print(response.data)

    Result:

    b'{
      "args": {}, 
      "data": "", 
      "files": {}, 
      "form": {
        "word": "hello"
      }, 
      "headers": {
        "Accept-Encoding": "identity", 
        "Content-Length": "128", 
        "Content-Type": "multipart/form-data; boundary=06ff68d7a4a22f600244a70bf9382ab2", 
        "Host": "httpbin.org", 
        "X-Amzn-Trace-Id": "Root=1-5ec3f8c3-9f33c46c1c1b37f6774b84f2"
      }, 
      "json": null, 
      "origin": "123.139.39.71", 
      "url": "http://httpbin.org/post"
    }
    '
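
    Note in the echoed headers that urllib3 encodes fields= as multipart/form-data for POST requests. If a different body format is needed, a hedged sketch is to build the body yourself and set Content-Type explicitly, here with a JSON payload as an example:

    import json
    import urllib3
    http = urllib3.PoolManager()
    # Encode the payload ourselves and declare the Content-Type explicitly
    payload = json.dumps({'word': 'hello'}).encode('utf-8')
    response = http.request(
        'POST',
        'http://httpbin.org/post',
        body=payload,
        headers={'Content-Type': 'application/json'}
    )
    print(response.data.decode('utf-8'))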

    3. The requests module

    Taking a GET request as an example, the following code prints several kinds of response information:

    # Import the module
    import requests
    # Send a request to the page to be scraped
    response = requests.get('http://www.baidu.com')
    # Print the status code
    print('status code:',response.status_code)
    # Print the request URL
    print('url:',response.url)
    # Print the response headers
    print('header:',response.headers)
    # Print the cookies
    print('cookie:',response.cookies)
    # Print the page source as text
    print('text:',response.text)
    # Print the page source as bytes
    print('content:',response.content)

    Result:

    status code: 200
    url: http://www.baidu.com/
    header: {'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 'Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html', 'Date': 'Tue, 19 May 2020 15:28:30 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:32 GMT', 'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
    cookie: <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
    text: <!DOCTYPE html>
    <!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8>
    ... (omitted)
    content: b'<!DOCTYPE html>
    <!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8>
    ... (omitted)
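
    requests can also build the query string for you and parse a JSON response directly. A small sketch against httpbin.org/get (the same httpbin service used above; the endpoint choice here is just for illustration):

    import requests
    # Query-string parameters are passed as a dict; requests URL-encodes them
    params = {'word': 'hello'}
    response = requests.get('http://httpbin.org/get', params=params)
    print(response.url)     # http://httpbin.org/get?word=hello
    print(response.json())  # response body parsed into a Python dict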

    An example of sending an HTTP request with POST:

    # Import the module
    import requests
    # Form parameters
    data = {'word':'hello'}
    # Send a request to the page to be scraped
    response = requests.post('http://httpbin.org/post',data=data)
    # Print the page source as bytes
    print(response.content)

    Result:

    b'{
      "args": {}, 
      "data": "", 
      "files": {}, 
      "form": {
        "word": "hello"
      }, 
      "headers": {
        "Accept": "*/*", 
        "Accept-Encoding": "gzip, deflate", 
        "Content-Length": "10", 
        "Content-Type": "application/x-www-form-urlencoded", 
        "Host": "httpbin.org", 
        "User-Agent": "python-requests/2.23.0", 
        "X-Amzn-Trace-Id": "Root=1-5ec3fc97-965139d919e5a08e8135e731"
      }, 
      "json": null, 
      "origin": "123.139.39.71", 
      "url": "http://httpbin.org/post"
    }
    '
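
    For real scraping jobs you will usually also want a timeout and basic error handling. A minimal sketch, with the 5-second timeout chosen arbitrarily:

    import requests
    try:
        # timeout is in seconds; raise_for_status() turns HTTP error codes into exceptions
        response = requests.post('http://httpbin.org/post',
                                 data={'word': 'hello'},
                                 timeout=5)
        response.raise_for_status()
        print(response.json())
    except requests.exceptions.RequestException as e:
        print('Request failed:', e)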

  • Original post: https://www.cnblogs.com/xiao02fang/p/12920437.html