
    Using the urllib module in Python 3

    urllib.request

    urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

    import urllib.request
    
    url = 'http://httpbin.org/ip'
    response = urllib.request.urlopen(url)
    html = response.read()  # read() returns bytes
    print(html)
    
    url = 'http://www.baidu.com'
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')  # decode() converts the bytes to a str
    print(html)
    

    Sending POST data

    import urllib.request
    import urllib.parse
    
    url = 'http://httpbin.org/post'
    
    data = {
        'name' : "小明",
        'age' : 30
    }
    # data = urllib.parse.urlencode(data)  # Error: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str
    # data = urllib.parse.urlencode(data).encode('utf-8')
    data = bytes(urllib.parse.urlencode(data), encoding="utf-8")
    response = urllib.request.urlopen(url, data=data)
    html = response.read().decode('utf-8')
    print(html)
    

    Setting a timeout

    import urllib.request
    
    url = 'http://httpbin.org/get'
    response = urllib.request.urlopen(url, timeout=1)
    html = response.read().decode('utf-8')
    print(html)
    
    import socket
    import urllib.request
    import urllib.error
    
    url = 'http://httpbin.org/get'
    try:
        response = urllib.request.urlopen(url, timeout=0.1)
        html = response.read().decode('utf-8')
        print(html)
    except urllib.error.URLError as e:
        print("捕获异常....")
        print(e.reason)
        if isinstance(e.reason, socket.timeout):
            print("请求超时")
    
    

    The response

    Response type, status code, response headers, and the URL actually retrieved

    import urllib.request
    
    url = 'http://www.python.org'
    response = urllib.request.urlopen(url)
    # Response type
    response_type = type(response)
    print(response_type)  # <class 'http.client.HTTPResponse'>
    # Status code
    status_code = response.getcode()
    print(status_code)
    # Reason phrase for the status code
    status = response.reason
    print(status)    # e.g. 200 -> OK, 404 -> Not Found
    # Response headers
    response_headers = response.getheaders()  # returns a list of (name, value) tuples
    print(response_headers)
    server_type = response.getheader('Server')  # getheader() returns a single header's value
    print(server_type)
    print(type(response.headers))  # <class 'http.client.HTTPMessage'>
    content_type = response.headers['Content-Type']  # read Content-Type via the headers mapping
    print(content_type)
    # The URL actually fetched; useful for checking whether a redirect occurred
    actual_url = response.geturl()
    print(actual_url)
    
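    The response object also works as a context manager, which closes the underlying connection automatically. A minimal sketch (not from the original post; it assumes python.org is reachable):

    import urllib.request
    
    url = 'http://www.python.org'
    # "with" closes the response automatically when the block exits
    with urllib.request.urlopen(url) as response:
        print(response.status)                      # same value as getcode()
        print(response.getheader('Content-Type'))
    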

    class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)

    import urllib.request
    
    url = 'http://httpbin.org/get'
    request = urllib.request.Request(url)  # create a Request object
    response = urllib.request.urlopen(request)  # send the request
    html = response.read().decode('utf-8')
    print(html)
    # The default User-Agent is "Python-urllib/x.x", where x.x is the Python version
    
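    The Request constructor also takes a method argument (see the signature above), which lets you send verbs other than GET and POST. A small sketch, assuming httpbin.org's /put endpoint is available:

    import urllib.request
    
    url = 'http://httpbin.org/put'
    data = b'hello'  # the request body must be bytes
    # method='PUT' overrides the default (GET without data, POST with data)
    request = urllib.request.Request(url, data=data, method='PUT')
    response = urllib.request.urlopen(request)
    print(response.getcode())
    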

    Sending POST data with custom headers

    import urllib.request
    import urllib.parse
    
    url = 'http://httpbin.org/post'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
    }
    data = {
        'name' : 'peter', 
        'age' : 20
    }
    
    data = bytes(urllib.parse.urlencode(data), encoding="utf-8") # POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str
    request = urllib.request.Request(url, data=data, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    print(html)
    # When POSTing form data, the request's "Content-Type" is "application/x-www-form-urlencoded"
    
    

    urllib.request.Request object methods

    import urllib.request
    
    
    url = 'http://httpbin.org/get'
    request = urllib.request.Request(url)
    # add_header(key, val) adds a request header
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36')
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    print(html)
    
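    Besides add_header(), a Request object offers a few inspection helpers. A minimal sketch (not from the original post); note that urllib stores header names in capitalized form such as 'User-agent':

    import urllib.request
    
    url = 'http://httpbin.org/get'
    request = urllib.request.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0')
    print(request.has_header('User-agent'))  # True -- names are stored capitalized as 'User-agent'
    print(request.get_header('User-agent'))  # Mozilla/5.0
    print(request.get_full_url())            # the original URL
    print(request.get_method())              # 'GET' here; 'POST' when data is supplied
    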

    Handlers

    ProxyHandler (proxy support)

    import urllib.request
    
    # dict: key is the protocol, value is "ip:port"
    proxy_dict = {
        'http': '127.0.0.1:6688',
        'https': '127.0.0.1:6688',
    }
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
    }
    
    
    proxy_handler = urllib.request.ProxyHandler(proxy_dict)
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)
    
    opener.addheaders = headers.items()   # set the request headers
    
    url = 'http://www.whatismyip.com.tw/'  # shows the requesting IP; with a proxy, the proxy's IP is shown
    response = urllib.request.urlopen(url)
    print(response.read().decode('utf-8'))
    
    # Common error:
    # HTTPError: HTTP Error 403: Forbidden -- the proxy server probably restricts access and the current IP is not on its allow list
    
    

    Proxies that require authentication

    # Error without credentials: HTTPError: HTTP Error 407: Proxy Authentication Required
    
    # Method 1: embed the credentials in the proxy URL: http://user:password@ip:port
    import urllib.request
    
    # dict: key is the protocol, value is "ip:port"
    proxy_dict = {
        'http': 'http://name:password@127.0.0.1:6688',
        'https': 'http://name:password@127.0.0.1:6688',
    }
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
    }
    
    
    proxy_handler = urllib.request.ProxyHandler(proxy_dict)
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)
    
    opener.addheaders = headers.items()   # set the request headers
    
    url = 'http://www.whatismyip.com.tw/'  # shows the requesting IP; with a proxy, the proxy's IP is shown
    response = opener.open(url)
    print(response.read().decode('utf-8'))          
    
    
    # Method 2: use ProxyBasicAuthHandler for proxy authentication (supply the username and password)
    import urllib.request
    
    # dict: key is the protocol, value is "ip:port"
    proxy_dict = {
        'http': 'http://127.0.0.1:6688',
    }
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
    }
    
    
    proxy_handler = urllib.request.ProxyHandler(proxy_dict)
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, 'http://127.0.0.1:6688', 'name', 'password')  # the realm can simply be set to None
    proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)
    opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
    urllib.request.install_opener(opener)
    
    opener.addheaders = headers.items()   # set the request headers
    
    url = 'http://www.whatismyip.com.tw/'  # shows the requesting IP; with a proxy, the proxy's IP is shown
    response = opener.open(url)
    print(response.read().decode('utf-8'))   
    

    HTTPBasicAuthHandler

    Used for HTTP basic authentication when accessing a web server

    import urllib.request
    
    url = 'http://127.0.0.1/test/'
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, 'admin', 'password')  # register the username and password for this URL
    http_auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    opener = urllib.request.build_opener(http_auth_handler)
    response = opener.open(url)
    print(response.read().decode('utf-8'))
    

    FTPHandler

    import urllib.request
    
    
    url = 'ftp://ftp1.linuxidc.com'
    username = 'ftp1.linuxidc.com'
    password = 'www.linuxidc.com'
    
    ftp_url = 'ftp://%s:%s@ftp1.linuxidc.com' %(username, password)
    ftp_handler = urllib.request.FTPHandler()  
    opener = urllib.request.build_opener(ftp_handler)
    response = opener.open(ftp_url)
    print(response.read().decode('utf-8', 'ignore'))
    

    HTTPHandler、HTTPSHandler

    import urllib.request
    
    
    url = 'http://www.baidu.com'
    # Setting debuglevel=1 turns on debug logging, so the raw request and response are printed to the screen, which is handy for debugging
    http_handler = urllib.request.HTTPHandler(debuglevel=1)
    https_handler = urllib.request.HTTPSHandler(debuglevel=1)
    opener = urllib.request.build_opener(http_handler, https_handler)
    response = opener.open(url)
    
    '''
    Sample output:
    send: b'GET / HTTP/1.1
    Accept-Encoding: identity
    Host: www.baidu.com
    User-Agent: Python-urllib/3.6
    Connection: close
    
    '
    reply: 'HTTP/1.1 200 OK
    '
    header: Date header: Content-Type header: Transfer-Encoding header: Connection header: Vary header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: P3P header: Cache-Control header: Cxy_all header: Expires header: X-Powered-By header: Server header: X-UA-Compatible header: BDPAGETYPE header: BDQID header: BDUSERID 
    '''
    

    CookieJar

    import urllib.request
    import http.cookiejar
    
    
    url = 'http://www.baidu.com'
    cookie = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(cookie_handler)
    response = opener.open(url)
    print(response.getcode())
    for item in cookie:  # each item is an http.cookiejar.Cookie instance
        print(item.name, item.value, sep=" : ")
    

    MozillaCookieJar

    Creates a FileCookieJar instance compatible with the Mozilla cookies.txt file format

    import urllib.request
    import http.cookiejar
    
    
    url = 'https://www.zhihu.com/settings/profile'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
    }
    
    cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(cookie_handler)
    opener.addheaders = headers.items()
    
    try:
        cookie.load()    # load the cookies from the file into memory -- this step matters
    except http.cookiejar.LoadError as e:
        print('failed to load the cookie file')
    except IOError as e:
        print("cookie file does not exist")
    
    response = opener.open(url)
    print(response.geturl())  # compare geturl() with the original url to see whether the login succeeded; on failure you are redirected to the Zhihu login page
    html = response.read().decode('utf-8')
    print(html)
    
    # After a successful login, call the MozillaCookieJar object's save() method to write the cookies from memory to the file (see the sketch below)
    
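    A minimal sketch of that save step (not from the original post; it simply captures whatever cookies the server sets and writes them to zhihu_cookie.txt in Mozilla format):

    import urllib.request
    import http.cookiejar
    
    cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    opener.open('http://www.baidu.com')  # any request whose cookies you want to keep
    # ignore_discard=True also keeps session cookies; ignore_expires=True keeps expired ones
    cookie.save(ignore_discard=True, ignore_expires=True)
    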

    LWPCookieJar

    Creates a FileCookieJar instance compatible with the libwww-perl Set-Cookie3 file format

    import urllib.request
    import http.cookiejar
    
    
    url = 'http://www.baidu.com'
    cookie = http.cookiejar.LWPCookieJar("cookies.txt")
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    response = opener.open(url)
    # save() must be called to write the in-memory cookies to the local file; the next time you need them, just call load() to read them back into memory (see the sketch below)
    cookie.save(ignore_discard=True, ignore_expires=True)
    
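    A minimal sketch of reusing the saved cookies (not from the original post): load cookies.txt back into an LWPCookieJar before building the opener.

    import urllib.request
    import http.cookiejar
    
    cookie = http.cookiejar.LWPCookieJar()
    cookie.load("cookies.txt", ignore_discard=True, ignore_expires=True)  # read the saved cookies
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    response = opener.open('http://www.baidu.com')
    print(response.getcode())
    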

    Exception handling

    URLError

    URLError is typically raised when there is no network connection (the local machine cannot reach the network) or when the target server does not exist. In those cases the exception object has a reason attribute describing the cause; it may be a message string or another exception instance (a socket.gaierror in the example below). Catch it like this:

    import urllib.request
    import urllib.error
    
    
    try:
        response = urllib.request.urlopen('http://www.hello_world.org')
    except urllib.error.URLError as e:
        print(type(e.reason))  # <class 'socket.gaierror'>
        print(e.reason)  # [Errno 11001] getaddrinfo failed
    

    HTTPError

    HTTPError is a subclass of URLError. Every request sent with urlopen gets a response from the server that carries a numeric status code.
    Common codes include 200 (success), 302 (redirect), and 304 (the document has not changed since the last visit, or under the conditions of the request).
    Some status codes mean the server could not fulfill the request; in those cases urlopen raises HTTPError.
    Typical errors include 404 (page not found), 403 (request forbidden), 401 (authentication required), 407 (proxy authentication required), and 500 (internal server error).
    
    # Approach 1
    import urllib.request
    import urllib.error
    
    
    url = 'http://www.hello_world.org'
    # url = 'http://example.com/test.html'
    try:
        response = urllib.request.urlopen(url)
    # HTTPError is a subclass of URLError, so it must be handled first
    except urllib.error.HTTPError as e:
        print("The server cannot fulfill the request...")
        print("Error code: ", e.code)
        print("Reason: ", e.reason)
    except urllib.error.URLError as e:
        print("failed to fetch the server...")
        print("Reason: ", e.reason)
    
    
    # Approach 2
    import urllib.request
    import urllib.error
    
    
    url = 'http://www.hello_world.org'
    # url = 'http://example.com/test.html'
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print("The server cannot fulfill the request...")
            print("Error code: ", e.code)
            print("Reason: ", e.reason)
        else:
            print("failed to fetch the server...")
            print("Reason: ", e.reason)     
    

    urllib.parse

    urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

    Parses a URL into its components

    from urllib.parse import urlparse
    
    # def urlparse(url, scheme='', allow_fragments=True)
    # Splits the URL into 6 parts: <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    # Returns a 6-tuple (scheme, netloc, path, params, query, fragment)
    
    
    result = urlparse('http://www.baidu.com/index.html;user?id=100#comment')
    print(type(result))  # <class 'urllib.parse.ParseResult'>
    print(result)   # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')
    
    # netloc is only recognized when it is introduced by //
    result = urlparse(
        '//www.baidu.com/index.html;user?id=100#comment',
        scheme="https")
    print(result)  # ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')
    
    
    result = urlparse(
        'www.baidu.com/index.html;user?id=100#comment',
        scheme="https")
    print(result)  # ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=100', fragment='comment')
    
    # If the URL already contains a scheme, that scheme is kept and the scheme argument is ignored
    result = urlparse(
        'http://www.baidu.com/index.html;user?id=100#comment',
        scheme="https")
    print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')
    
    result = urlparse(
        "http://www.baidu.com/index.html;user?id=100#comment",
        allow_fragments=False)
    print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100#comment', fragment='')
    
    result = urlparse(
        "http://www.baidu.com/index.html#comment",
        allow_fragments=False)
    print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')
    

    urllib.parse.urlunparse(parts)

    from urllib.parse import urlunparse
    
    
    data = ("http", 'www.baidu.com','index.html', 'user','id=100','comment')
    url = urlunparse(data)
    print(url)
    
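    As a quick check (not in the original post), urlunparse() reassembles exactly what urlparse() produced, so the two functions round-trip:

    from urllib.parse import urlparse, urlunparse
    
    original = 'http://www.baidu.com/index.html;user?id=100#comment'
    parts = urlparse(original)          # ParseResult is a 6-tuple, so it can be passed straight to urlunparse
    rebuilt = urlunparse(parts)
    print(rebuilt)                      # http://www.baidu.com/index.html;user?id=100#comment
    print(rebuilt == original)          # True
    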

    urllib.parse.urljoin(base, url, allow_fragments=True)

    # The second (relative) URL takes precedence; the base URL only supplies the scheme and netloc that the relative URL is missing, producing a complete URL
    from urllib.parse import urljoin
    
    
    print(urljoin("http://www.baidu.com","FAQ.html"))
    print(urljoin("http://www.baidu.com/index.html","FAQ.html"))
    print(urljoin("http://www.baiud.com/index.html", "http://www.google.com/FAQ.html"))
    print(urljoin("http://www.baidu.com/index.html", "http://www.google.com/FAQ.html?question=2"))
    print(urljoin("http://www.baidu.com/index.html?wd=abc", "http://www.google.com/FAQ.html"))
    print(urljoin("http://www.baidu.com/", "?category=5#comment"))
    print(urljoin("http://www.baidu.com/#comment", "?category=5"))
    
    '''
    http://www.baidu.com/FAQ.html
    http://www.baidu.com/FAQ.html
    http://www.google.com/FAQ.html
    http://www.google.com/FAQ.html?question=2
    http://www.google.com/FAQ.html
    http://www.baidu.com/?category=5#comment
    http://www.baidu.com/?category=5
    '''
    

    urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)

    from urllib.parse import urlencode
    
    basic_url = 'http://httpbin.org/get'
    data = {
        "key": '天气',
    }
    data = urlencode(data)
    full_url = '%s?%s' % (basic_url, data)
    print(full_url) # http://httpbin.org/get?key=%E5%A4%A9%E6%B0%94
    
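    The signature above also shows doseq and quote_via. A small sketch of both (not from the original post): doseq=True expands sequence values into repeated keys, and quote_via=quote encodes spaces as %20 instead of +.

    from urllib.parse import urlencode, quote
    
    data = {'city': ['beijing', 'shanghai'], 'q': 'hello world'}
    print(urlencode(data, doseq=True))                    # city=beijing&city=shanghai&q=hello+world
    print(urlencode(data, doseq=True, quote_via=quote))   # city=beijing&city=shanghai&q=hello%20world
    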