一个最简单的爬虫
# A minimal crawler: fetch a page and print the response metadata.
from urllib.request import urlopen

url = 'http://www.baidu.com'
# Send the request; the context manager guarantees the underlying
# connection is closed even if reading fails (the original leaked it).
with urlopen(url) as response:
    # Read the response body
    info = response.read()
    # Print the body (left disabled, as in the original notes)
    #print(info.decode())
    # HTTP status code
    print(response.getcode())
    print('*'*50)
    # Final URL after any redirects
    print(response.geturl())
    print('*'*50)
    # Response headers
    print(response.info())
request的使用
# Using Request to attach custom headers (a browser User-Agent here),
# so the server does not reject us as an obvious bot.
from urllib.request import urlopen
from urllib.request import Request
from fake_useragent import UserAgent

url = 'https://www.baidu.com'
ua = UserAgent()
headers = {
    "User-Agent": ua.chrome
}
request = Request(url, headers=headers)
# urllib capitalizes only the first letter of stored header names,
# hence the lookup key 'User-agent' rather than 'User-Agent'.
print(request.get_header('User-agent'))
# Context manager closes the connection (the original never closed it).
with urlopen(request) as response:
    info = response.read()
    print(info.decode())
get请求编码转换
get请求时,往往需要在 url 上添加参数,但是如果参数值是中文的话会出现请求报错的情况,我们就需要将中文转换编码才行
# GET with URL-encoded query parameters: non-ASCII values (e.g. Chinese)
# must be percent-encoded, otherwise the request fails.
from urllib.request import urlopen
from urllib.request import Request
from fake_useragent import UserAgent
from urllib import parse

# Single parameter: parse.quote percent-encodes one value
#url = 'https://www.baidu.com/s?wd={}'.format(parse.quote('火狐'))
# Multiple parameters: urlencode builds the whole query string at once
args = {
    'wd': '火狐',
    # NOTE(review): Baidu's charset parameter is normally 'ie'; 'id' may be
    # a typo in the original notes — verify before relying on it.
    'id': 'utf-8'
}
url = 'https://www.baidu.com/s?' + parse.urlencode(args)
# The original printed the url twice (before and after building the
# headers); printing it once is enough.
print(url)
ua = UserAgent()
headers = {
    "User-Agent": ua.chrome
}
request = Request(url, headers=headers)
#print(request.get_header('User-agent'))
# Close the connection deterministically (the original leaked it).
with urlopen(request) as response:
    info = response.read()
    print(info.decode())
Post请求
# POST request: form data is urlencoded and passed as bytes via `data=`;
# a non-None `data` makes urllib issue a POST instead of a GET.
from urllib.request import Request, urlopen
from urllib.parse import urlencode
from fake_useragent import UserAgent
import ssl

url = 'https://www.maguangyi.top/tzsc/login.php'
# Fixed local-name typo: `from_data` -> `form_data`.
form_data = {
    'pwd': '123456'
}
headers = {
    'User-Agent': UserAgent().chrome
}
# Encode the form fields into "k=v&k=v" and send them as the POST body.
f_data = urlencode(form_data)
request = Request(url, data=f_data.encode(), headers=headers)
# Skip SSL certificate verification. This is insecure (vulnerable to
# man-in-the-middle) — acceptable only for scraping demos, never for
# anything security-sensitive.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# Context manager releases the connection (the original never closed it).
with urlopen(request, context=ctx) as response:
    print(response.read().decode())