Introduction to urllib
import urllib.request

url = 'http://www.baidu.com/'
# Send a request to the given URL and get the server's response back
response = urllib.request.urlopen(url)
# First way to read: the whole body as bytes
data = response.read()
# Second way: read a single line (a response object can only be consumed once,
# so pick one of these per request)
# data = response.readline()
# Third way: read the whole body into a list of lines
# data = response.readlines()
with open(r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file.html', 'wb') as f:
    f.write(data)
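read() hands back raw bytes, so they usually need decoding before use. A minimal sketch, using the standard get_content_charset() lookup on the response headers and assuming 'utf-8' as the fallback:

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
raw = response.read()
# Ask the response headers for the declared charset; fall back to utf-8
charset = response.headers.get_content_charset() or 'utf-8'
text = raw.decode(charset)
print(text[:200])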
Common attributes and methods of response
# Return the response headers
print(response.info())
# Return the status code
print(response.getcode())
# Return the URL that was actually fetched
print(response.geturl())
# Decode percent-escaped characters (e.g. Chinese) in a URL
print(urllib.request.unquote('url'))
# Percent-encode a string for use in a URL
print(urllib.request.quote('url'))
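The placeholder 'url' above hides what these helpers actually do; a concrete string makes it clearer. A small example (these functions canonically live in urllib.parse; urllib.request just re-exports them):

import urllib.parse

encoded = urllib.parse.quote('王月')   # '%E7%8E%8B%E6%9C%88'
print(encoded)
print(urllib.parse.unquote(encoded))  # back to '王月'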
Writing a fetched page straight to a file
# Download the page and save it directly to a local file
urllib.request.urlretrieve('http://www.baidu.com',
                           filename=r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file2.html')
# Clear the cache left behind by urlretrieve
urllib.request.urlcleanup()
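urlretrieve can also report progress through its optional reporthook callback. A sketch, with the callback name progress chosen here for illustration:

import urllib.request

def progress(block_num, block_size, total_size):
    # Called repeatedly during the download with the running block count
    done = block_num * block_size
    if total_size > 0:
        print('%.1f%%' % (min(done, total_size) * 100 / total_size))

urllib.request.urlretrieve('http://www.baidu.com', 'file2.html', reporthook=progress)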
Simulating a browser
url = 'http://www.baidu.com'
# Request headers
headers = {
    'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
# Build a Request object that carries the headers
req = urllib.request.Request(url, headers=headers)
# Send the request
response = urllib.request.urlopen(req)
data = response.read()
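If many requests need the same header, an alternative is to set it once on an opener via the standard build_opener/addheaders API, as sketched below:

import urllib.request

opener = urllib.request.build_opener()
# Every request made through this opener carries the User-Agent header
opener.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36')]
response = opener.open('http://www.baidu.com')
data = response.read()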
# Timeout mechanism
# If the page takes too long to respond, treat it as a timeout and move on
for i in range(1, 100):
    try:
        response = urllib.request.urlopen(
            "http://www.baidu.com", timeout=0.5)
        print(len(response.read().decode('utf-8')))
    except Exception:
        print('Request timed out, moving on to the next fetch')
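The loop above treats every failure as a timeout. To tell timeouts apart from other errors you can inspect the exception; a sketch, hedged because the exact type differs between connect-time and read-time timeouts:

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.5)
    print(len(response.read()))
except socket.timeout:
    # Timeout raised directly while reading the body
    print('timed out while reading')
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('timed out while connecting')
    else:
        raise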
Packing request parameters
# url = "http://www.sunck.wang:8085/form"
# data = {
# "username": "sunck",
# "passwd": "666"
# }
# # 对要发送的数据进行打包, 注意编码
# postData = urllib.parse.urlencode(data).encode('utf-8')
# # 请求体
# # 请求
# headers = {
# 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
# "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
# }
# # 设置一个请求体
# req = urllib.request.Request(url, headers=headers, data=postData)
# # 发起请求
# response = urllib.request.urlopen(req)
# data = response.read()
# print(data.decode('utf-8'))
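The same urlencode output also serves GET requests: instead of going into the body, it is appended to the URL as a query string. A minimal sketch:

import urllib.parse
import urllib.request

params = urllib.parse.urlencode({'wd': 'python'})
# For GET, the encoded parameters go into the URL itself, not the body
response = urllib.request.urlopen('http://www.baidu.com/s?' + params)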
urlencode (encoding)
Example:
import urllib.parse
param = {'name': '王月'}
print(urllib.parse.urlencode(param))
Result: name=%E7%8E%8B%E6%9C%88
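When a value is itself a list, urlencode stringifies the whole list by default; passing doseq=True expands it into repeated key=value pairs instead:

import urllib.parse

print(urllib.parse.urlencode({'tag': ['a', 'b']}))             # tag=%5B%27a%27%2C+%27b%27%5D
print(urllib.parse.urlencode({'tag': ['a', 'b']}, doseq=True)) # tag=a&tag=b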
parse_qs (decoding)
Example:
name = urllib.parse.urlencode(param)
print(urllib.parse.parse_qs(name))
Result: {'name': ['王月']}
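A related helper, parse_qsl, returns (key, value) tuples rather than lists, which round-trips cleanly through urlencode:

import urllib.parse

pairs = urllib.parse.parse_qsl('name=%E7%8E%8B%E6%9C%88&age=20')
print(pairs)                          # [('name', '王月'), ('age', '20')]
print(urllib.parse.urlencode(pairs))  # name=%E7%8E%8B%E6%9C%88&age=20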
Handling HTTPS URLs
import ssl
import json
def ajaxCrawler(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    # Use an unverified SSL context so certificate errors don't abort the request
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    json_str = response.read().decode('utf-8')
    json_data = json.loads(json_str)
    return json_data

url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20"
info = ajaxCrawler(url)
print(info)

# Fetch the following pages automatically by stepping the start parameter
for i in range(10):
    url2 = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=" + str(
        i * 20) + "&limit=20"
    info = ajaxCrawler(url2)
    print(len(info))
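Concatenating strings works for the paginated URL, but building the query with urlencode is less error-prone; a sketch of the same loop written that way:

import urllib.parse

base = 'https://movie.douban.com/j/chart/top_list'
for i in range(10):
    # urlencode handles the escaping (e.g. '100:90' becomes '100%3A90')
    query = urllib.parse.urlencode({
        'type': 11,
        'interval_id': '100:90',
        'action': '',
        'start': i * 20,
        'limit': 20,
    })
    info = ajaxCrawler(base + '?' + query)
    print(len(info))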
def joyCrawler(url):
    headers = {
        'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    html = response.read().decode('utf-8')
    # Save the page; pass an explicit encoding since we're writing text
    with open(r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file3.html', 'w',
              encoding='utf-8') as f:
        f.write(html)

url = 'https://www.qiushibaike.com/8hr/page/2/'
joyCrawler(url)
The urlparse and urlsplit functions:
url = 'http://www.baidu.com/s?wd=python&username=abc#1'
result = urllib.parse.urlparse(url)
print(result)
Result: ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='', query='wd=python&username=abc', fragment='1')
Accessing a field:
print('path:', result.path)
result2 = urllib.parse.urlsplit(url)
print(result2)
Result: SplitResult(scheme='http', netloc='www.baidu.com', path='/s', query='wd=python&username=abc', fragment='1')
Difference: urlparse has a params field; urlsplit does not.
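The params field only appears when a path segment carries a ;parameter, which is rare in practice; a small example makes the difference visible:

import urllib.parse

url = 'http://www.baidu.com/s;type=demo?wd=python'
print(urllib.parse.urlparse(url).params)  # 'type=demo' (split out of the path)
print(urllib.parse.urlsplit(url).path)    # '/s;type=demo' (kept in the path)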