urllib模块使用
urllib.request
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
import urllib.request

# Example 1: fetch a resource and keep the body as raw bytes.
url = 'http://httpbin.org/ip'
# The context manager closes the underlying connection as soon as the body
# has been read, instead of leaving it open until garbage collection.
with urllib.request.urlopen(url) as response:
    html = response.read()  # read() returns bytes
print(html)

# Example 2: fetch a page and decode the bytes into str.
url = 'http://www.baidu.com'
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')  # bytes -> str
print(html)
发送post数据
import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
form = {
    'name': "小明",
    'age': 30,
}
# urlencode() returns str, but urlopen() only accepts POST data as bytes,
# an iterable of bytes, or a file object, so the encoded form is converted
# to bytes before it is sent.
encoded_form = urllib.parse.urlencode(form)
data = encoded_form.encode('utf-8')
# Supplying data= makes urlopen() issue a POST instead of a GET.
response = urllib.request.urlopen(url, data=data)
body = response.read()
html = body.decode('utf-8')
print(html)
设置timeout
import urllib.request

url = 'http://httpbin.org/get'
# timeout is given in seconds and applies to blocking operations such as
# the connection attempt.
response = urllib.request.urlopen(url, timeout=1)
raw_body = response.read()
html = raw_body.decode('utf-8')
print(html)
import socket
import urllib.request
import urllib.error

url = 'http://httpbin.org/get'
try:
    # A deliberately tiny timeout so the request is very likely to time out.
    with urllib.request.urlopen(url, timeout=0.1) as response:
        html = response.read().decode('utf-8')
    print(html)
except urllib.error.URLError as e:
    # Failures while connecting / sending the request are wrapped in
    # URLError; the original exception is available as e.reason.
    print("捕获异常....")
    print(e.reason)
    if isinstance(e.reason, socket.timeout):
        print("请求超时")
except socket.timeout as e:
    # A timeout that happens while waiting for or reading the response is
    # raised directly as socket.timeout and is NOT wrapped in URLError, so
    # it needs its own handler.
    print("捕获异常....")
    print(e)
    print("请求超时")
响应
响应类型、状态码、响应头、实际获取的url
import urllib.request

url = 'http://www.python.org'
response = urllib.request.urlopen(url)

# Type of the response object.
print(type(response))  # <class 'http.client.HTTPResponse'>

# Numeric status code.
print(response.getcode())

# Reason phrase that goes with the status code,
# e.g. OK for 200, Not Found for 404.
print(response.reason)

# All response headers, returned as a list.
print(response.getheaders())

# A single response header looked up by name.
print(response.getheader('Server'))

# The headers are also exposed as a mapping-like message object.
print(type(response.headers))  # <class 'http.client.HTTPMessage'>
print(response.headers['Content-Type'])

# URL that was actually retrieved; compare it with the requested URL to
# detect whether a redirect happened.
print(response.geturl())
class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
import urllib.request

url = 'http://httpbin.org/get'
# Build the Request object and hand it straight to urlopen() to send it.
response = urllib.request.urlopen(urllib.request.Request(url))
print(response.read().decode('utf-8'))
# The default User-Agent is "Python-urllib/x.x", where x.x is the Python
# version number.
发送post数据
import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
headers = {'User-Agent': user_agent}
form = {'name': 'peter', 'age': 20}
# POST data must be bytes, an iterable of bytes, or a file object; a plain
# str is rejected, so the url-encoded form is encoded to bytes first.
data = urllib.parse.urlencode(form).encode('utf-8')
request = urllib.request.Request(url, data=data, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
# When data is posted this way the request is sent with
# "Content-Type": "application/x-www-form-urlencoded".
urllib.request.Request 对象方法
import urllib.request

url = 'http://httpbin.org/get'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
request = urllib.request.Request(url)
# add_header(key, val) attaches a header to an already-built request.
request.add_header('User-Agent', user_agent)
response = urllib.request.urlopen(request)
page = response.read()
print(page.decode('utf-8'))
Handlers
ProxyHandler(代理)
import urllib.request

# Maps each URL scheme to the proxy ("ip:port") that should carry it.
proxy_dict = {
    'http': '127.0.0.1:6688',
    'https': '127.0.0.1:6688',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
proxy_handler = urllib.request.ProxyHandler(proxy_dict)
opener = urllib.request.build_opener(proxy_handler)
# addheaders is a list of (name, value) tuples. Materialise the dict view
# into a real list so later changes to `headers` do not leak into the
# opener and list operations such as append() keep working.
opener.addheaders = list(headers.items())
# Install the fully configured opener; from here on urlopen() uses it.
urllib.request.install_opener(opener)
url = 'http://www.whatismyip.com.tw/'  # shows the caller's IP; the proxy's IP if the proxy is in use
with urllib.request.urlopen(url) as response:
    print(response.read().decode('utf-8'))
# Common error:
# HTTPError: HTTP Error 403: Forbidden -- most likely the proxy server
# restricts access and the current IP is not on its allow list.
代理需要身份认证
# Error seen when the proxy requires credentials that were not supplied:
# HTTPError: HTTP Error 407: Proxy Authentication Required
# Approach 1: embed the credentials in the proxy URL, using the form
# http://username:password@ip:port
import urllib.request

# Maps each URL scheme to the proxy URL (including credentials) to use.
proxy_dict = {
    'http': 'http://name:password@127.0.0.1:6688',
    'https': 'http://name:password@127.0.0.1:6688',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
proxy_handler = urllib.request.ProxyHandler(proxy_dict)
opener = urllib.request.build_opener(proxy_handler)
# addheaders is a list of (name, value) tuples, so pass a real list rather
# than a live dict view.
opener.addheaders = list(headers.items())
# Install the opener only once it is fully configured.
urllib.request.install_opener(opener)
url = 'http://www.whatismyip.com.tw/'  # shows the caller's IP; the proxy's IP if the proxy is in use
with opener.open(url) as response:
    print(response.read().decode('utf-8'))
# Approach 2: let ProxyBasicAuthHandler answer the proxy's authentication
# challenge (the matching username and password must be registered).
import urllib.request

# Maps each URL scheme to the proxy URL to use.
proxy_dict = {
    'http': 'http://127.0.0.1:6688',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
}
proxy_handler = urllib.request.ProxyHandler(proxy_dict)
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Passing None as the realm registers the credentials for any realm.
password_mgr.add_password(None, 'http://127.0.0.1:6688', 'name', 'password')
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
# addheaders is a list of (name, value) tuples, so pass a real list rather
# than a live dict view.
opener.addheaders = list(headers.items())
# Install the opener only once it is fully configured.
urllib.request.install_opener(opener)
url = 'http://www.whatismyip.com.tw/'  # shows the caller's IP; the proxy's IP if the proxy is in use
with opener.open(url) as response:
    print(response.read().decode('utf-8'))
HTTPBasicAuthHandler
用于访问web服务器时的身份验证
import urllib.request

url = 'http://127.0.0.1/test/'
credentials = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Register the username/password to use for this URL; None means any realm.
credentials.add_password(None, url, 'admin', 'password')
auth_handler = urllib.request.HTTPBasicAuthHandler(credentials)
opener = urllib.request.build_opener(auth_handler)
page = opener.open(url).read()
print(page.decode('utf-8'))
FTPHandler
import urllib.parse
import urllib.request

host = 'ftp1.linuxidc.com'
username = 'ftp1.linuxidc.com'
password = 'www.linuxidc.com'
# Percent-encode the credentials so characters such as '@', ':' or '/'
# cannot corrupt the URL structure; the FTP handler unquotes them again
# before logging in.
ftp_url = 'ftp://{}:{}@{}'.format(
    urllib.parse.quote(username, safe=''),
    urllib.parse.quote(password, safe=''),
    host,
)
ftp_handler = urllib.request.FTPHandler()
opener = urllib.request.build_opener(ftp_handler)
# Close the FTP data connection as soon as the listing has been read.
with opener.open(ftp_url) as response:
    # 'ignore' drops bytes that are not valid UTF-8 instead of raising.
    print(response.read().decode('utf-8', 'ignore'))
HTTPHandler、HTTPSHandler
import urllib.request

url = 'http://www.baidu.com'
# debuglevel=1 switches on the debug log, so the data sent and received is
# printed to the screen, which is handy when debugging.
debug_handlers = [
    urllib.request.HTTPHandler(debuglevel=1),
    urllib.request.HTTPSHandler(debuglevel=1),
]
opener = urllib.request.build_opener(*debug_handlers)
response = opener.open(url)
'''
效果:
send: b'GET / HTTP/1.1
Accept-Encoding: identity
Host: www.baidu.com
User-Agent: Python-urllib/3.6
Connection: close
'
reply: 'HTTP/1.1 200 OK
'
header: Date header: Content-Type header: Transfer-Encoding header: Connection header: Vary header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: P3P header: Cache-Control header: Cxy_all header: Expires header: X-Powered-By header: Server header: X-UA-Compatible header: BDPAGETYPE header: BDQID header: BDUSERID
'''
Cookie
CookieJar
import urllib.request
import http.cookiejar

url = 'http://www.baidu.com'
# In-memory jar that collects the cookies set by the responses.
jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
response = opener.open(url)
print(response.getcode())
# Iterating the jar yields http.cookiejar.Cookie objects.
for entry in jar:
    print(entry.name, entry.value, sep=" : ")
MozillaCookieJar
创建与Mozilla cookies.txt文件兼容的FileCookieJar实例
import urllib.request
import http.cookiejar

url = 'https://www.zhihu.com/settings/profile'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
}
cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(cookie_handler)
# addheaders is a list of (name, value) tuples, so pass a real list rather
# than a live dict view.
opener.addheaders = list(headers.items())
try:
    # Load previously saved cookies from the file into memory; without
    # this step the request is sent without the stored session.
    cookie.load()
except http.cookiejar.LoadError:
    # LoadError is a subclass of OSError, so it must be handled first.
    print('cookie文件加载失败')
except IOError:
    print("cookie文件不存在")
with opener.open(url) as response:
    # Compare geturl() with the requested url to tell whether the login
    # succeeded; on failure the site redirects to its login page.
    print(response.geturl())
    html = response.read().decode('utf-8')
print(html)
# After a successful login, call the MozillaCookieJar's save() method to
# write the in-memory cookies back to the file.
LWPCookieJar
创建与libwww-perl Set-Cookie3文件兼容的FileCookieJar实例
import urllib.request
import http.cookiejar

url = 'http://www.baidu.com'
jar = http.cookiejar.LWPCookieJar("cookies.txt")
cookie_processor = urllib.request.HTTPCookieProcessor(jar)
opener = urllib.request.build_opener(cookie_processor)
response = opener.open(url)
# save() must be called to write the in-memory cookies to the local file;
# the next run only needs load() to bring them back into memory.
jar.save(ignore_expires=True, ignore_discard=True)
异常处理
URLError
引起URLError的原因通常有:无网络连接(即本机无法上网)、访问的目标服务器不存在。在这种情况下,异常对象会有reason属性(它是一条错误信息字符串,或者是另一个异常实例,例如socket.gaierror)。捕获异常的方法如下:
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://www.hello_world.org')
# URLError is defined in urllib.error; catch it through its documented
# home rather than through the incidental re-export in urllib.request.
except urllib.error.URLError as e:
    # reason is either a message string or the underlying exception.
    print(type(e.reason))  # e.g. <class 'socket.gaierror'>
    print(e.reason)  # e.g. [Errno 11001] getaddrinfo failed
HTTPError
HTTPError是URLError的子类,每次调用urlopen方法发出一个请求时,服务器上都会产生对应response,它包含一个数字"状态码",
常见的状态码有200(请求成功),302(重定向),304(文档的内容(自上次访问以来或者根据请求的条件)并没有改变)
这些状态码有的表示服务器无法完成请求。如果无法处理请求,urlopen会抛出HTTPError。
典型的错误包括404(页面没有找到)、403(请求被禁止)、401(当前请求需要用户认证)、407(需要代理验证)、500(服务器内部错误)
# Approach 1: separate except clauses, most specific exception first.
import urllib.request
import urllib.error

url = 'http://www.hello_world.org'
# Alternative target to try instead: 'http://example.com/test.html'
try:
    response = urllib.request.urlopen(url)
except urllib.error.HTTPError as http_err:
    # HTTPError is a subclass of URLError, so its clause has to come first;
    # otherwise the URLError clause would catch it.
    print("The server cannot fulfill the request...")
    print("Error code: ", http_err.code)
    print("Reason: ", http_err.reason)
except urllib.error.URLError as url_err:
    print("failed to fetch the server...")
    print("Reason: ", url_err.reason)
# Approach 2: a single except clause that inspects the exception object.
import urllib.request
import urllib.error

url = 'http://www.hello_world.org'
# Alternative target to try instead: 'http://example.com/test.html'
try:
    response = urllib.request.urlopen(url)
except urllib.error.URLError as err:
    # The presence of a `code` attribute is used to tell an HTTP-level
    # error apart from a plain URLError.
    if not hasattr(err, 'code'):
        print("failed to fetch the server...")
    else:
        print("The server cannot fulfill the request...")
        print("Error code: ", err.code)
    # Both kinds of error expose a reason.
    print("Reason: ", err.reason)
urllib.parse
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
负责解析URL
from urllib.parse import urlparse

# urlparse(url, scheme='', allow_fragments=True) splits a URL of the form
#   <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# into the 6-tuple (scheme, netloc, path, params, query, fragment).
result = urlparse('http://www.baidu.com/index.html;user?id=100#comment')
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')

# Each entry is (url, keyword arguments); the comment shows what is printed.
examples = (
    # netloc is only recognised when it is introduced by '//'.
    # -> ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')
    ('//www.baidu.com/index.html;user?id=100#comment', {'scheme': "https"}),
    # Without '//' the host name ends up in the path.
    # -> ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=100', fragment='comment')
    ('www.baidu.com/index.html;user?id=100#comment', {'scheme': "https"}),
    # A scheme already present in the URL wins over the scheme argument.
    # -> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')
    ('http://www.baidu.com/index.html;user?id=100#comment', {'scheme': "https"}),
    # With allow_fragments=False the '#...' part stays in the query ...
    # -> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100#comment', fragment='')
    ("http://www.baidu.com/index.html;user?id=100#comment", {'allow_fragments': False}),
    # ... or in the path when the URL has no query.
    # -> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')
    ("http://www.baidu.com/index.html#comment", {'allow_fragments': False}),
)
for target, options in examples:
    result = urlparse(target, **options)
    print(result)
urllib.parse.urlunparse(parts)
from urllib.parse import urlunparse

# urlunparse() assembles a URL from six components, given in the order
# (scheme, netloc, path, params, query, fragment).
url = urlunparse(("http", 'www.baidu.com', 'index.html', 'user', 'id=100', 'comment'))
print(url)
urllib.parse.urljoin(base, url, allow_fragments=True)
# The second (possibly relative) URL takes precedence; the base URL only
# supplies the parts it is missing, such as the scheme and the netloc
# (host), so that a complete URL can be built.
from urllib.parse import urljoin

join_cases = [
    ("http://www.baidu.com", "FAQ.html"),
    ("http://www.baidu.com/index.html", "FAQ.html"),
    ("http://www.baiud.com/index.html", "http://www.google.com/FAQ.html"),
    ("http://www.baidu.com/index.html", "http://www.google.com/FAQ.html?question=2"),
    ("http://www.baidu.com/index.html?wd=abc", "http://www.google.com/FAQ.html"),
    ("http://www.baidu.com/", "?category=5#comment"),
    ("http://www.baidu.com/#comment", "?category=5"),
]
for base, relative in join_cases:
    print(urljoin(base, relative))
'''
http://www.baidu.com/FAQ.html
http://www.baidu.com/FAQ.html
http://www.google.com/FAQ.html
http://www.google.com/FAQ.html?question=2
http://www.google.com/FAQ.html
http://www.baidu.com/?category=5#comment
http://www.baidu.com/?category=5
'''
urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)
from urllib.parse import urlencode

basic_url = 'http://httpbin.org/get'
# urlencode() percent-encodes the non-ASCII value while building the
# query string.
query = urlencode({"key": '天气'})
full_url = basic_url + '?' + query
print(full_url)  # http://httpbin.org/get?key=%E5%A4%A9%E6%B0%94