Basic model
Request and response
import urllib.request as urllib2

request = urllib2.Request('http://www.zhihu.com')
response = urllib2.urlopen(request)
html = response.read()
print(html)
Cookie handling
import urllib.request as urllib2
import http.cookiejar as cookielib

cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://www.zhihu.com')
for item in cookie:
    print(item.name + ':' + item.value)

Sending custom cookie content:

opener = urllib2.build_opener()
opener.addheaders.append(('Cookie', 'email=' + "xxxxxxx@163.com"))
req = urllib2.Request("http://www.zhihu.com/")
response = opener.open(req)
print(response.headers)
retdata = response.read()
Timeout handling
Setting a per-request timeout
A timeout raises an exception.
import urllib.request as urllib2

request = urllib2.Request('http://www.zhihu.com')
response = urllib2.urlopen(request, timeout=0.01)  # unrealistically short timeout to trigger the exception
html = response.read()
print(html)
Modifying the global timeout
import urllib.request as urllib2
import socket

socket.setdefaulttimeout(10)          # time out after 10 seconds
urllib2.socket.setdefaulttimeout(10)  # an alternative way of doing the same thing
Getting the response code
A normal page returns 200; a missing page returns 404.
import urllib.request as urllib2

try:
    response = urllib2.urlopen('http://www.samoy.cn/seoganhuo/1')
    print(response)
    print(response.getcode())
except urllib2.HTTPError as e:
    if hasattr(e, 'code'):
        print('Error code:', e.code)
Checking for redirects
If the URL returned by the response is still the URL that was requested, no redirect occurred.
import urllib.request as urllib2

response = urllib2.urlopen('http://www.baidu.cn')
isRedirected = response.geturl() == 'http://www.baidu.cn'
print(isRedirected)
Another redirect check, using a handler class
import urllib.request as urllib2

class RedirectHandler(urllib2.HTTPRedirectHandler):
    def http_error_301(self, req, fp, code, msg, headers):
        pass

    def http_error_302(self, req, fp, code, msg, headers):
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        # record the redirect status code and the final URL on the response
        result.status = code
        result.newurl = result.geturl()
        return result

opener = urllib2.build_opener(RedirectHandler)
result = opener.open('http://www.baidu.cn')
print(result.newurl)
print(result.status)
Proxy settings
Use install_opener to update the proxy globally.
import urllib.request as urllib2

proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.zhihu.com/')
print(response.read())
Updating the proxy locally (for a single opener)
import urllib.request as urllib2

proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
response = opener.open("http://www.zhihu.com/")
print(response.read())
Making HTTP requests with requests
Covers: GET and POST requests, responses, encoding, request header handling, and timeout handling.
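A minimal sketch of these basics, using http://www.baidu.com as a placeholder URL and a sample User-Agent:

import requests

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# GET with custom request headers and a timeout in seconds;
# exceeding the timeout raises requests.exceptions.Timeout
r = requests.get('http://www.baidu.com', headers=headers, timeout=5)
print(r.status_code)   # response code
r.encoding = 'utf-8'   # override the guessed encoding if it is wrong
print(r.text[:200])    # decoded response body

# POST with form data (placeholder key/value)
r = requests.post('http://www.baidu.com', data={'key': 'value'}, headers=headers, timeout=5)
print(r.status_code)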
Cookie handling
Getting cookies
import requests

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
r = requests.get('http://www.baidu.com', headers=headers)
# iterate over every cookie field and print its value
for cookie in r.cookies.keys():
    print(cookie + ':' + r.cookies.get(cookie))
Sending custom cookies
import requests

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
cookies = dict(name='qiye', age='10')
r = requests.get('http://www.baidu.com', headers=headers, cookies=cookies)
print(r.text)
Sending a username and password (with session cookies) to log in
import requests

loginUrl = 'http://www.xxxxxxx.com/login'
s = requests.Session()
# visit the login page first as a guest so the server assigns a cookie
r = s.get(loginUrl, allow_redirects=True)
datas = {'name': 'qiye', 'passwd': 'qiye'}
# POST to the login URL; on successful authentication the guest session becomes a member session
r = s.post(loginUrl, data=datas, allow_redirects=True)
print(r.text)
Checking for redirects
import requests

r = requests.get('http://www.baidu.cn')
print(r.url)
print(r.status_code)
print(r.history)
Proxy settings
Example
import requests

proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
requests.get("https://www.baidu.com", proxies=proxies)
Alternatively, set proxies using the http://user:password@host form to go through a proxy that requires authentication.
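A minimal sketch of this, assuming hypothetical credentials user:password and the same proxy host as the example above:

import requests

# user:password and the host/port are placeholders for a real authenticated proxy
proxies = {
    "http": "http://user:password@10.10.1.10:3128",
    "https": "http://user:password@10.10.1.10:3128",
}
requests.get("https://www.baidu.com", proxies=proxies)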