  • Common crawler libraries (1): Basic usage of the Urllib library in Python 3

    What is Urllib?

    Urllib is Python's built-in HTTP request library. It consists of four modules:

    urllib.request          request module

    urllib.error            exception handling module

    urllib.parse            URL parsing module

    urllib.robotparser      robots.txt parsing module
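    urllib.robotparser is not covered further below; a minimal sketch of how it might be used, feeding hypothetical robots.txt rules in directly with parse() instead of fetching them over the network with read():

```python
from urllib.robotparser import RobotFileParser

# Parse hypothetical robots.txt rules directly (no network access needed)
rp = RobotFileParser()
rp.parse([
    'User-agent: *',
    'Disallow: /private/',
])

# can_fetch(useragent, url) answers whether a crawler may fetch the URL
print(rp.can_fetch('*', 'http://example.com/index.html'))  # True
print(rp.can_fetch('*', 'http://example.com/private/x'))   # False
```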

    Changes from Python 2

    In Python 3, Python 2's urllib2 was merged into urllib.request.

    Python 2:

    import urllib2

    response = urllib2.urlopen('http://www.cnblogs.com/0bug')

    Python 3:

    import urllib.request

    response = urllib.request.urlopen('http://www.cnblogs.com/0bug/')

    urlopen()

    Without the data argument the request is sent as a GET; with data it is sent as a POST.

    import urllib.request

    response = urllib.request.urlopen('http://www.cnblogs.com/0bug')
    html = response.read().decode('utf-8')
    print(html)

    Sending a POST request with data

    import urllib.parse
    import urllib.request

    data = bytes(urllib.parse.urlencode({'hello': '0bug'}), encoding='utf-8')
    response = urllib.request.urlopen('http://httpbin.org/post', data=data)
    print(response.read())
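    The data argument must be bytes, which is why the form dict goes through urlencode() and then bytes(). The conversion step, run on its own without the network request:

```python
from urllib.parse import urlencode

# urlencode builds the form body string; bytes(...) encodes it
# into the byte string that urlopen's data argument expects
data = bytes(urlencode({'hello': '0bug'}), encoding='utf-8')
print(data)  # b'hello=0bug'
```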

    The timeout parameter

    import urllib.request

    response = urllib.request.urlopen('http://www.cnblogs.com/0bug', timeout=0.01)
    print(response.read())
    import socket
    import urllib.request
    import urllib.error

    try:
        response = urllib.request.urlopen('http://www.cnblogs.com/0bug', timeout=0.01)
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print('Request timed out')

    Responses

    1. Response type

    import urllib.request

    response = urllib.request.urlopen('http://www.cnblogs.com/0bug')
    print(type(response))

    2. Status code and response headers

    import urllib.request

    response = urllib.request.urlopen('http://www.cnblogs.com/0bug')
    print(response.status)
    print(response.getheaders())
    print(response.getheader('Content-Type'))

    3. Response body

    The response body is a byte stream; decode it with decode('utf-8').

    import urllib.request
     
    response = urllib.request.urlopen('http://www.cnblogs.com/0bug')
    html = response.read().decode('utf-8')
    print(html)

    Request

    import urllib.request

    request = urllib.request.Request('http://www.cnblogs.com/0bug')
    response = urllib.request.urlopen(request)
    print(response.read().decode('utf-8'))

    Adding request headers

    from urllib import request, parse

    url = 'http://httpbin.org/post'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'Host': 'httpbin.org'
    }
    dic = {'name': '0bug'}
    data = bytes(parse.urlencode(dic), encoding='utf-8')
    req = request.Request(url=url, data=data, headers=headers, method='POST')
    response = request.urlopen(req)
    print(response.read().decode('utf-8'))

    add_header

    from urllib import request, parse

    url = 'http://httpbin.org/post'
    dic = {'name': '0bug'}
    data = bytes(parse.urlencode(dic), encoding='utf-8')
    req = request.Request(url=url, data=data, method='POST')
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = request.urlopen(req)
    print(response.read().decode('utf-8'))

    Handler

    Proxies:

    import urllib.request

    proxy_handler = urllib.request.ProxyHandler({
        'http': 'HTTP proxy address',
        'https': 'HTTPS proxy address'
    })
    opener = urllib.request.build_opener(proxy_handler)
    response = opener.open('http://www.cnblogs.com/0bug')
    print(response.read())

    Cookie

    import http.cookiejar, urllib.request

    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    for item in cookie:
        print(item.name + "=" + item.value)

    Saving cookies to a file

    import http.cookiejar, urllib.request

    filename = 'cookie.txt'
    cookie = http.cookiejar.MozillaCookieJar(filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True, ignore_expires=True)

    Saving in another format

    import http.cookiejar, urllib.request

    filename = 'cookie.txt'
    cookie = http.cookiejar.LWPCookieJar(filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True, ignore_expires=True)

    Load cookies in the same format they were saved in

    import http.cookiejar, urllib.request
     
    cookie = http.cookiejar.LWPCookieJar()
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    print(response.read().decode('utf-8'))

    Exception handling

    from urllib import request, error

    try:
        response = request.urlopen('http://www.cnblogs.com/0bug/xxxx')
    except error.URLError as e:
        print(e.reason)
    from urllib import request, error

    try:
        response = request.urlopen('http://www.cnblogs.com/0bug/xxxx')
    except error.HTTPError as e:
        print(e.reason, e.code, e.headers, sep=' ')
    except error.URLError as e:
        print(e.reason)
    else:
        print('Request successful')
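    HTTPError is caught first in the block above because it is a subclass of URLError; a URLError clause placed first would swallow HTTP errors too. A quick check, constructing an HTTPError by hand rather than triggering a real request:

```python
from urllib.error import HTTPError, URLError

# HTTPError inherits from URLError, so except-clause order matters
print(issubclass(HTTPError, URLError))  # True

# Construct one manually (url, code, msg, hdrs, fp) to inspect its attributes
e = HTTPError('http://httpbin.org/status/404', 404, 'NOT FOUND', None, None)
print(e.code)    # 404
print(e.reason)  # NOT FOUND
```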
    import socket
    import urllib.request
    import urllib.error

    try:
        response = urllib.request.urlopen('http://www.cnblogs.com/0bug/xxxx', timeout=0.001)
    except urllib.error.URLError as e:
        print(type(e.reason))
        if isinstance(e.reason, socket.timeout):
            print('Request timed out')

    URL parsing: urlparse

    from urllib.parse import urlparse

    # No scheme in the URL, so scheme is '' and the host lands in path
    result = urlparse('www.baidu.com/index.html;user?id=5#comment')
    print(type(result))  # <class 'urllib.parse.ParseResult'>
    print(result)

    # The scheme argument supplies a default when the URL has none
    result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
    print(result)

    # A scheme already present in the URL takes precedence over the scheme argument
    result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', scheme='https')
    print(result)

    # allow_fragments=False: the fragment is not split off and merges into the query
    result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
    print(result)

    # With no query or params present, the fragment merges into the path instead
    result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
    print(result)
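    urlparse returns a ParseResult, a named tuple whose six components can be read by attribute as well as by index:

```python
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')

# The six components, in order: scheme, netloc, path, params, query, fragment
print(result.scheme)    # http
print(result.netloc)    # www.baidu.com
print(result.path)      # /index.html
print(result.params)    # user
print(result.query)     # id=5
print(result.fragment)  # comment
print(result[0])        # same value as result.scheme
```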

    urlunparse

    from urllib.parse import urlunparse

    # The six components in order: scheme, netloc, path, params, query, fragment
    data = ['http', 'www.baidu.com', 'index.html', 'user', 'id=6', 'comment']
    print(urlunparse(data))  # http://www.baidu.com/index.html;user?id=6#comment

    urljoin

    from urllib.parse import urljoin

    print(urljoin('http://www.baidu.com', 'ABC.html'))
    print(urljoin('http://www.baidu.com', 'https://www.cnblogs.com/0bug'))
    print(urljoin('http://www.baidu.com/0bug', 'https://www.cnblogs.com/0bug'))
    print(urljoin('http://www.baidu.com/0bug', 'https://www.cnblogs.com/0bug?q=2'))
    print(urljoin('http://www.baidu.com/0bug?q=2', 'https://www.cnblogs.com/0bug'))
    print(urljoin('http://www.baidu.com', '?q=2#comment'))
    print(urljoin('www.baidu.com', '?q=2#comment'))
    print(urljoin('www.baidu.com#comment', '?q=2'))
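    The rule behind these joins: urljoin resolves the second argument against the first following standard URL resolution, so components present in the second argument win and missing ones are filled in from the base. Two representative cases:

```python
from urllib.parse import urljoin

# A relative reference is resolved against the base URL
print(urljoin('http://www.baidu.com', 'ABC.html'))
# -> http://www.baidu.com/ABC.html

# An absolute URL in the second argument replaces the base entirely
print(urljoin('http://www.baidu.com', 'https://www.cnblogs.com/0bug'))
# -> https://www.cnblogs.com/0bug
```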

    urlencode

    from urllib.parse import urlencode

    params = {
        'name': '0bug',
        'age': 25
    }
    base_url = 'http://www.baidu.com?'
    url = base_url + urlencode(params)
    print(url)  # http://www.baidu.com?name=0bug&age=25
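    Besides joining key=value pairs, urlencode also percent-encodes characters that are not URL-safe, which makes it more reliable than building query strings by hand (the parameter values here are just illustrative):

```python
from urllib.parse import urlencode

# Spaces become '+' and reserved characters like '/' are percent-encoded
params = {'q': 'hello world', 'lang': 'zh/CN'}
print(urlencode(params))  # q=hello+world&lang=zh%2FCN
```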
  • Original article: https://www.cnblogs.com/yunlongaimeng/p/9802052.html