什么是Urllib?
Python内置的HTTP请求库
urllib.request 请求模块
urllib.error 异常处理模块
urllib.parse url解析模块
urllib.robotparser robots.txt解析模块
相比 Python2 的变化
Python2中的urllib2在Python3中被统一移动到了urllib.request中
python2
import urllib2
response = urllib2.urlopen('http://www.cnblogs.com/0bug')
Python3
import urllib.request
response = urllib.request.urlopen('http://www.cnblogs.com/0bug/')
urlopen()
不加data是以GET方式发送,加data是以POST发送
1
2
3
4
5
|
import urllib.request

# Without a `data` argument, urlopen() issues a plain GET request.
resp = urllib.request.urlopen('http://www.cnblogs.com/0bug')
page = resp.read().decode('utf-8')
print(page)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
加data发送POST请求
1
2
3
4
5
6
|
import urllib.parse
import urllib.request

# Passing `data` (as bytes) switches urlopen() to a POST request.
payload = bytes(urllib.parse.urlencode({'hello': '0bug'}), encoding='utf-8')
resp = urllib.request.urlopen('http://httpbin.org/post', data=payload)
print(resp.read())
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
timeout 超时时间
1
2
3
4
|
import urllib.request

# A deliberately tiny timeout (0.01 s) — the demo expects this to fail fast.
resp = urllib.request.urlopen('http://www.cnblogs.com/0bug', timeout=0.01)
print(resp.read())
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1
2
3
4
5
6
7
8
|
import urllib.request
import socket
import urllib.error

# When the deadline expires, urlopen() raises URLError whose .reason
# is a socket.timeout instance.
try:
    resp = urllib.request.urlopen('http://www.cnblogs.com/0bug', timeout=0.01)
except urllib.error.URLError as exc:
    if isinstance(exc.reason, socket.timeout):
        print('请求超时')
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
响应
1.响应类型
1
2
3
4
|
import urllib.request

# Show the concrete type urlopen() returns (an HTTPResponse-like object).
resp = urllib.request.urlopen('http://www.cnblogs.com/0bug')
print(type(resp))
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
2.状态码、响应头
1
2
3
4
5
6
|
import urllib.request

# Status code, the full header list, then one header looked up by name.
resp = urllib.request.urlopen('http://www.cnblogs.com/0bug')
print(resp.status)
print(resp.getheaders())
print(resp.getheader('Content-Type'))
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
3.响应体
响应体是字节流,需要decode('utf-8')
1
2
3
4
5
|
import urllib.request

# The body arrives as a byte stream, so decode it before printing.
resp = urllib.request.urlopen('http://www.cnblogs.com/0bug')
text = resp.read().decode('utf-8')
print(text)
Request
1
2
3
4
5
|
import urllib.request

# urlopen() also accepts a Request object instead of a bare URL string.
req = urllib.request.Request('http://www.cnblogs.com/0bug')
resp = urllib.request.urlopen(req)
print(resp.read().decode('utf-8'))
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
添加请求头信息
1
2
3
4
5
6
7
8
9
10
11
12
|
from urllib import request, parse

# Supply custom request headers through the Request constructor.
url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Host': 'httpbin.org',
}
form = {'name': '0bug'}
body = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=body, headers=headers, method='POST')
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
add_header
1
2
3
4
5
6
7
8
9
10
|
from urllib import request, parse

# Headers can also be attached after construction via add_header().
url = 'http://httpbin.org/post'
form = {'name': '0bug'}
body = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=body, method='POST')
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
)
resp = request.urlopen(req)
print(resp.read().decode('utf-8'))
Handler
代理:
1
2
3
4
5
6
7
8
9
|
import urllib.request

# Route traffic through HTTP/HTTPS proxies with a custom opener.
# (The placeholder strings must be replaced with real proxy addresses.)
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http代理',
    'https': 'https代理',
})
opener = urllib.request.build_opener(proxy_handler)
resp = opener.open('http://www.cnblogs.com/0bug')
print(resp.read())
Cookie
1
2
3
4
5
6
7
8
|
import http.cookiejar
import urllib.request

# Collect the cookies the server sets, then print them one per line.
jar = http.cookiejar.CookieJar()
processor = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(processor)
resp = opener.open('http://www.baidu.com')
for ck in jar:
    print(ck.name + "=" + ck.value)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
Cookie保存为文件
1
2
3
4
5
6
7
8
|
import http.cookiejar
import urllib.request

# Persist cookies to disk in Mozilla (cookies.txt) format.
filename = 'cookie.txt'
jar = http.cookiejar.MozillaCookieJar(filename)
processor = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(processor)
resp = opener.open('http://www.baidu.com')
# Keep session cookies and expired cookies in the saved file too.
jar.save(ignore_discard=True, ignore_expires=True)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
另一种方式存
1
2
3
4
5
6
7
8
|
import http.cookiejar
import urllib.request

# Same idea as above, but saved in libwww-perl (LWP) format instead.
filename = 'cookie.txt'
jar = http.cookiejar.LWPCookieJar(filename)
processor = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(processor)
resp = opener.open('http://www.baidu.com')
jar.save(ignore_discard=True, ignore_expires=True)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
用什么格式的存就应该用什么格式的读
1
2
3
4
5
6
7
8
|
import http.cookiejar
import urllib.request

# Load cookies back with the same jar class that saved them (LWP here).
jar = http.cookiejar.LWPCookieJar()
jar.load('cookie.txt', ignore_discard=True, ignore_expires=True)
processor = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(processor)
resp = opener.open('http://www.baidu.com')
print(resp.read().decode('utf-8'))
异常处理
1
2
3
4
5
6
|
from urllib import request, error

# URLError is the base class for urllib's request failures; its
# .reason attribute explains what went wrong.
try:
    resp = request.urlopen('http://www.cnblogs.com/0bug/xxxx')
except error.URLError as exc:
    print(exc.reason)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1
2
3
4
5
6
7
8
9
10
|
from urllib import request, error

# Handle the more specific HTTPError first, then the generic URLError.
# HTTPError is a subclass of URLError, so the order of the except
# clauses matters.
try:
    response = request.urlopen('http://www.cnblogs.com/0bug/xxxx')
except error.HTTPError as e:
    # sep='\n' prints reason, status code and headers on separate lines.
    # (The original snippet had a literal line break inside the string,
    # which is a SyntaxError; the intended separator is a newline.)
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1
2
3
4
5
6
7
8
9
10
|
import socket
import urllib.request
import urllib.error

# With a 1 ms timeout the request cannot succeed; e.reason is then a
# socket.timeout instance, which we detect explicitly.
try:
    resp = urllib.request.urlopen('http://www.cnblogs.com/0bug/xxxx', timeout=0.001)
except urllib.error.URLError as exc:
    print(type(exc.reason))
    if isinstance(exc.reason, socket.timeout):
        print('请求超时')
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
URL解析
1
2
3
4
5
|
from urllib.parse import urlparse

# Without a scheme, the whole string is treated as the path; params,
# query and fragment are still split out.
parsed = urlparse('www.baidu.com/index.html;user?id=5#comment')
print(type(parsed))
print(parsed)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1
2
3
4
|
from urllib.parse import urlparse

# The scheme= argument is only a default, applied when the URL has none.
parsed = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(parsed)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1
2
3
4
|
from urllib.parse import urlparse

# An explicit scheme inside the URL wins over the scheme= default.
parsed = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(parsed)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1
2
3
4
|
from urllib.parse import urlparse

# allow_fragments=False stops fragment splitting: the '#comment'
# stays attached to the query string.
parsed = urlparse('http://www.badiu.com/index.html;user?id=5#comment', allow_fragments=False)
print(parsed)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
1
2
3
4
|
from urllib.parse import urlparse

# With no query present, the un-split fragment ends up in the path.
parsed = urlparse('http://www.badiu.com/index.html#comment', allow_fragments=False)
print(parsed)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
urlunparse
1
2
3
4
|
from urllib.parse import urlunparse

# Rebuild a URL from its six components:
# (scheme, netloc, path, params, query, fragment).
parts = ['http', 'www.baidu.com', 'index.html', 'user', 'id=6', 'comment']
print(urlunparse(parts))
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
urljoin
1
2
3
4
5
6
7
8
9
10
|
from urllib.parse import urljoin

# For each URL component, the second argument wins when it supplies
# one; missing components are filled in from the base URL.
print(urljoin('http://www.baidu.com', 'ABC.html'))
print(urljoin('http://www.baidu.com', 'https://www.cnblogs.com/0bug'))
print(urljoin('http://www.baidu.com/0bug', 'https://www.cnblogs.com/0bug'))
print(urljoin('http://www.baidu.com/0bug', 'https://www.cnblogs.com/0bug?q=2'))
print(urljoin('http://www.baidu.com/0bug?q=2', 'https://www.cnblogs.com/0bug'))
print(urljoin('http://www.baidu.com', '?q=2#comment'))
print(urljoin('www.baidu.com', '?q=2#comment'))
print(urljoin('www.baidu.com#comment', '?q=2'))
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
urlencode
1
2
3
4
5
6
7
8
9
|
from urllib.parse import urlencode

# urlencode() turns a dict into a query string for a GET request.
params = {
    'name': '0bug',
    'age': 25,
}
base_url = 'http://www.badiu.com?'
url = base_url + urlencode(params)
print(url)