A Detailed Guide to the Urllib Library
1. Urllib is Python's built-in HTTP request library
It contains 4 modules: urllib.request (request module); urllib.error (exception handling module); urllib.parse (URL parsing module); urllib.robotparser (robots.txt parsing module)
@urlopen basics
~ Fetching the response body (GET request):

import urllib.request

# GET-style request
response = urllib.request.urlopen('http://www.xxx.com')
print(response.read().decode('utf-8'))
~ Sending form data (POST request):

import urllib.parse
import urllib.request

# passing a data argument makes urlopen issue a POST request
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf-8')
response = urllib.request.urlopen('http://www.xxxx.com', data=data)
print(response.read())
~ Timeout: pass a timeout argument to urlopen

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://www.xxxx.com', timeout=1)
except urllib.error.URLError as e:
    # a socket.timeout reason means the server did not respond within 1 second
    if isinstance(e.reason, socket.timeout):
        print('timeout')
@Response and Request usage
~ Status code and response headers:

import urllib.request

response = urllib.request.urlopen('http://www.xxxx.com')
print(response.status)               # response status code; 200 means OK
print(response.getheaders())         # all response headers, as a list of (name, value) tuples
print(response.getheader('Server'))  # pass a header name, e.g. 'Server', to get the server type
print(response.read().decode('utf-8'))  # the response body

~ Request objects:

import urllib.request

request = urllib.request.Request('http://www.xxxx.com')
response = urllib.request.urlopen(request)  # urlopen also accepts a Request object
print(response.read().decode('utf-8'))

from urllib import request, parse

url = 'http://www.xxxx.com'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',  # your browser's identification string
    'Host': 'httpbin.org'
}
form = {
    'name': 'Germany'
}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')  # a fully specified Request
response = request.urlopen(req)
print(response.read().decode('utf-8'))
@Handler:
~ Proxy: lets you mask your IP address

import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',   # proxy host and port
    'https': 'http://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read())
~ Cookies:

import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()  # declare a CookieJar object
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)  # pass the handler into the opener
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)
! Saving cookies to a text file:

import http.cookiejar, urllib.request

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # a Mozilla-format cookie jar bound to the file
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
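! Loading cookies back from the file: a minimal sketch of the reverse step, assuming the 'cookie.txt' file saved above; load() is the documented counterpart of save()

import http.cookiejar, urllib.request

cookie = http.cookiejar.MozillaCookieJar()
# load() mirrors save(); the two flags also restore session and expired cookies
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))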
@Exception handling

from urllib import request, error  # catch the exception

try:
    response = request.urlopen('http://www.xxx.com')
except error.URLError as e:
    # other handling can be added here
    print(e.reason)

~ Exceptions that can be handled specifically:
URLError: reason attribute
HTTPError: code, reason, and headers attributes (a subclass of URLError)
ContentTooShortError: raised when less data is downloaded than expected
from urllib import request, error  # catch the more specific error first

try:
    response = request.urlopen('http://www.xxx.com')
except error.HTTPError as e:
    # HTTPError is a subclass of URLError, so handle it before URLError
    print(e.reason, e.code, e.headers, sep=' ')
except error.URLError as e:
    print(e.reason)
else:
    print('request succeeded')
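~ ContentTooShortError sketch: this one is raised by urllib.request.urlretrieve when the downloaded data is smaller than the Content-Length header promised; the file URL and local name below are placeholders

from urllib import error, request

try:
    # urlretrieve downloads a URL into a local file and raises
    # ContentTooShortError if fewer bytes arrive than expected
    request.urlretrieve('http://www.xxx.com/file.zip', 'file.zip')
except error.ContentTooShortError:
    print('download interrupted: received less data than expected')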
@URL parsing
~ urlparse: splits a URL into its components
urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
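A quick usage sketch (the URL is a placeholder; the six fields of the ParseResult are scheme, netloc, path, params, query, fragment):

from urllib.parse import urlparse

result = urlparse('http://www.xxxx.com/index.html;user?id=5#comment')
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)
# ParseResult(scheme='http', netloc='www.xxxx.com', path='/index.html',
#             params='user', query='id=5', fragment='comment')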
~ urlunparse: assembles a URL from its components

from urllib.parse import urlunparse

# six parts are required: scheme, netloc, path, params, query, fragment
# ('a=6' is a sample query string added so the call has all six)
data = ['http', 'www.xxxx.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))

~ urljoin: joins a base URL with a relative URL (see the sketch at the end of this section)

~ urlencode: turns a dict into GET request parameters

from urllib.parse import urlencode

params = {
    'name': 'germany',
    'age': 28
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
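~ urljoin sketch (the URLs are placeholders): the second argument is resolved against the first, and a complete second URL wins outright

from urllib.parse import urljoin

# a relative path is resolved against the base URL
print(urljoin('http://www.xxxx.com', 'index.html'))
# http://www.xxxx.com/index.html

# a complete second URL overrides the base entirely
print(urljoin('http://www.xxxx.com', 'https://www.yyyy.com/index.html'))
# https://www.yyyy.com/index.html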