1.get传参
(1)汉字报错:解释器的ascii编码里没有汉字,url中的汉字需要转码
urllib.parse.quote(url, safe=string.printable)
(2)字典传参
urllib.parse.urlencode()
post:
urllib.request.urlopen(url, data="服务器接受的数据")  (data 需要先编码成 bytes)
handler:处理器的自定义:
User-Agent:
(1)模拟真实的浏览器发送请求:(1)百度批量搜索(2)检查元素(百度搜索useragent大全)
(2)request.add_header(动态添加head数据)
(3)响应头 response.headers
(4)创建request:urllib.request.Request(url)
2.IP代理:
(1)免费的IP:时效性差,错误率高
(2)付费的IP:贵花钱,也有失效不能用的
IP分类:
透明:对方知道我们真实的ip
匿名:对方不知道我们真实的ip,知道了你使用了代理
高匿:对方不知道我们真实的IP,也不知道我们使用了代理
handler:
(1)系统的urlopen()不支持代理的添加
创建对应的处理器(handler)
1.代理处理器:ProxyHandler
2.拿着ProxyHandler创建opener:build_opener()
3.opener.open(url)就可以请求数据
auth认证handler
Cookieshandler
URLError
requests(第三方模块):简单易用
数据解析:
数据存储:json csv MongoDB redis mysql
import urllib.request
import urllib.parse
import string


def get_params():
    """Send a GET request to Baidu search built from a dict of parameters.

    The parameters are serialized with ``urllib.parse.urlencode`` and
    printed, appended to the search endpoint, passed once more through
    ``urllib.parse.quote`` (keeping every printable ASCII character), and
    the decoded response body is printed.
    """
    query = urllib.parse.urlencode(
        {
            "wd": "中文",
            "key": "zhang",
            "value": "san",
        }
    )
    print(query)

    # Escape anything outside printable ASCII so the URL can be sent as-is.
    target = urllib.parse.quote(
        "http://www.baidu.com/s?" + query,
        safe=string.printable,
    )

    raw = urllib.request.urlopen(target).read()
    print(raw.decode("utf-8"))


get_params()
import urllib.request


def load_baidu():
    """Fetch the Baidu home page with a browser User-Agent and save it.

    Headers are attached to the ``Request`` object with ``add_header``
    because ``urlopen`` itself takes no headers argument. The response
    object and the full request URL are printed, and the decoded page is
    written to ``02header.html``.
    """
    url = "https://www.baidu.com"
    # Single source of truth for the request headers (browser identity).
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    }

    request = urllib.request.Request(url)
    # Add the headers dynamically after the request object exists.
    for name, value in header.items():
        request.add_header(name, value)

    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        print(response)
        data = response.read().decode("utf-8")

    print(request.get_full_url())

    # To inspect headers: ``response.headers`` holds the response headers and
    # ``request.headers`` the request headers. A single request header is read
    # with ``request.get_header("User-agent")`` -- only the first letter is
    # upper-case because ``Request`` stores names capitalized.

    # The page was decoded as UTF-8, so write it back as UTF-8 instead of the
    # platform default encoding, which may not be able to encode the text.
    with open("02header.html", "w", encoding="utf-8") as f:
        f.write(data)


load_baidu()
import urllib.request


def load_baidu():
    """Fetch the Baidu home page without custom headers and save it.

    Prints the response object and ``request.headers`` (no headers are
    added to the request in this function), then writes the decoded page
    to ``02header.html``.
    """
    url = "http://www.baidu.com"
    request = urllib.request.Request(url)

    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        print(response)
        data = response.read().decode("utf-8")

    # Request headers set on the request object; ``response.headers`` would
    # give the response headers instead.
    request_headers = request.headers
    print(request_headers)

    # The page was decoded as UTF-8, so write it back as UTF-8 instead of the
    # platform default encoding, which may not be able to encode the text.
    with open("02header.html", "w", encoding="utf-8") as f:
        f.write(data)


load_baidu()
import urllib.request
import random


def load_baidu():
    """Request the Baidu home page with a randomly chosen User-Agent.

    One User-Agent string is picked per call so consecutive requests look
    like they come from different browsers; the header actually attached
    to the request is printed.
    """
    browsers = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    ]
    chosen = random.choice(browsers)

    # Passing ``headers=`` registers the header exactly like ``add_header``.
    req = urllib.request.Request(
        "http://www.baidu.com",
        headers={"User-Agent": chosen},
    )
    urllib.request.urlopen(req)

    # Header names are stored capitalized, hence "User-agent".
    print(req.get_header("User-agent"))


load_baidu()
import urllib.request


def handler_openner():
    """Fetch a page through a hand-built opener instead of ``urlopen``.

    The module-level ``urlopen`` offers no way to add a proxy, so this shows
    the underlying mechanism: create a handler, build an opener from it and
    call ``opener.open``. The decoded page is written to ``02header.html``.
    """
    url = "https://blog.csdn.net/m0_37499059/article/details/79003731"

    # Create our own handler. ``build_opener`` still installs the default
    # handlers next to it, which is why an https URL can be opened here even
    # though only an HTTPHandler is passed explicitly.
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)

    # The context manager closes the connection once the body has been read.
    with opener.open(url) as response:
        data = response.read().decode("utf-8")

    # The page was decoded as UTF-8, so write it back as UTF-8 instead of the
    # platform default encoding, which may not be able to encode the text.
    with open("02header.html", "w", encoding="utf-8") as f:
        f.write(data)


handler_openner()
import urllib.request


def create_proxy_handler():
    """Fetch a page through an opener built from a ``ProxyHandler``.

    The proxy mapping goes from URL scheme to proxy address. A free proxy is
    written as ``"host:port"``; a paid one additionally carries credentials
    in the address. The decoded page is written to ``03header.html``.
    """
    url = "https://blog.csdn.net/m0_37499059/article/details/79003731"

    # Placeholder entry -- fill in a real "host:port" proxy address.
    # NOTE(review): the mapping only has an "http" entry while ``url`` is
    # https; confirm which scheme the proxy is meant to cover.
    proxy = {
        "http": "",
    }

    proxy_handler = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_handler)

    # The context manager closes the connection once the body has been read.
    with opener.open(url) as response:
        data = response.read().decode("utf-8")

    # The page was decoded as UTF-8, so write it back as UTF-8 instead of the
    # platform default encoding, which may not be able to encode the text.
    with open("03header.html", "w", encoding="utf-8") as f:
        f.write(data)


create_proxy_handler()
import urllib.request


def proxy_user():
    """Try each proxy in a list against Baidu and print the outcome.

    For every proxy mapping an opener is built and a request with a
    one-second timeout is sent. The response body is printed on success and
    the exception is printed on failure, so one bad proxy does not stop the
    loop.
    """
    # Each entry maps a scheme to a "host:port" proxy address.
    candidates = [
        {"https": ""},
    ]

    for candidate in candidates:
        print(candidate)
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler(candidate)
        )
        try:
            print(opener.open("http://www.baidu.com", timeout=1).read())
        except Exception as err:
            # Best effort: report the failure and move on to the next proxy.
            print(err)


proxy_user()
付费的代理发送
import urllib.request


def money_proxy_use():
    """Send a request through a paid proxy that requires a username/password.

    Two approaches exist:

    1. Embed the credentials in the proxy address
       (``{"http": "username:pwd@host:port"}``) and pass it to
       ``ProxyHandler``.
    2. Register the credentials in a password manager and let
       ``ProxyBasicAuthHandler`` answer the proxy's authentication
       challenge. This function uses this approach.

    The raw response body is printed.
    """
    use_name = "abcname"
    pwd = "123456"
    proxy_money = "123.158.63.130:8888"

    # Password manager holding the proxy credentials. ``None`` as the realm
    # makes them the default for this proxy address.
    password_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_manager.add_password(None, proxy_money, use_name, pwd)

    # The auth handler only answers the proxy's challenge. A ProxyHandler is
    # still required so the request is actually routed through the proxy;
    # without it the opener never contacts ``proxy_money``.
    proxy_handler = urllib.request.ProxyHandler({"http": proxy_money})
    handle_auth_proxy = urllib.request.ProxyBasicAuthHandler(password_manager)

    opener_auth = urllib.request.build_opener(proxy_handler, handle_auth_proxy)

    # The context manager closes the connection once the body has been read.
    with opener_auth.open("http://www.baidu.com") as response:
        print(response.read())


money_proxy_use()
爬取自己的网站
import urllib.request


def auth_nei_wang():
    """Open an intranet page protected by HTTP basic authentication.

    The credentials are registered for the site in a password manager, an
    opener is built around an ``HTTPBasicAuthHandler`` using that manager,
    and the response object is printed.
    """
    intranet_url = "http://192.168.179.66"

    # ``None`` as the realm makes these the default credentials for the URL.
    credentials = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    credentials.add_password(None, intranet_url, "admin", "adimin123")

    opener = urllib.request.build_opener(
        urllib.request.HTTPBasicAuthHandler(credentials)
    )
    print(opener.open(intranet_url))


auth_nei_wang()
cookie
第一种:
"""
Fetch the member-centre page directly.

The cookies are copied by hand from a browser capture and placed in the
request headers of the request object.
"""
import urllib.request

# Target page (only reachable when logged in).
url = 'https://www.yaozh.com/member/'

# Browser identity plus the session cookie copied from the browser.
# NOTE(review): this is a captured session cookie stored in source.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
    'Cookie': '_ga=GA1.2.1820447474.1535025127; MEIQIA_EXTRA_TRACK_ID=199Tty9OyANCXtHaSobJs67FU7J; UtzD_f52b_ulastactivity=1511944816%7C0; WAF_SESSION_ID=7d88ae0fc48bffa022729657cf09807d; PHPSESSID=7jsc60esmb6krgthnj99dfq7r3; _gid=GA1.2.358950482.1540209934; _gat=1; MEIQIA_VISIT_ID=1BviNX3zYEKVS7bQVpTRHOTFV8M; yaozh_logintime=1540209949; yaozh_user=381740%09xiaomaoera12; yaozh_userId=381740; db_w_auth=368675%09xiaomaoera12; UtzD_f52b_saltkey=CfYyYFY2; UtzD_f52b_lastvisit=1540206351; UtzD_f52b_lastact=1540209951%09uc.php%09; UtzD_f52b_auth=2e13RFf%2F3R%2BNjohcx%2BuoLcVRx%2FhF0NvwUbslgSZX%2FOUMkCRRcgh5Ayg6RGnklcG3d2DkUFAXJxjhlIS8fPvr9rrwa%2FY; yaozh_uidhas=1; yaozh_mylogin=1540209953; MEIQIA_EXTRA_TRACK_ID=199Tty9OyANCXtHaSobJs67FU7J; WAF_SESSION_ID=7d88ae0fc48bffa022729657cf09807d; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1535025126%2C1535283389%2C1535283401%2C1539351081%2C1539512967%2C1540209934; MEIQIA_VISIT_ID=1BviNX3zYEKVS7bQVpTRHOTFV8M; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1540209958',
}

# Build the request first, then attach every header to it.
request = urllib.request.Request(url)
for header_name, header_value in headers.items():
    request.add_header(header_name, header_value)

# Send it and keep the raw bytes of the body.
response = urllib.request.urlopen(request)
data = response.read()
print(type(data))

# Save to a file to verify the data; binary mode because ``data`` is bytes.
with open('01cook.html', 'wb') as f:
    f.write(data)
第二种:
"""
Fetch the member-centre page.

1. Log in from code; a successful login yields a valid cookie.
2. Request the member centre with that cookie attached automatically --
   the cookie jar stores it for us.
"""
import urllib.request
from http import cookiejar
from urllib import parse

# The login page lives at https://www.yaozh.com/login/ ; the form parameters
# were taken from it. The backend decides by request method: GET returns the
# login page, POST returns the login result.

# 1. Log in from code.
# 1.1 Login URL.
login_url = 'https://www.yaozh.com/login'

# 1.2 Login form parameters.
# NOTE(review): "backurl" is already percent-encoded and urlencode() below
# will escape its "%" again -- confirm the server expects that.
login_form_data = {
    "username": "3253212",
    "pwd": "56uhjyh",
    "formhash": "CE3ADF28C5",
    "backurl": "https%3A%2F%2Fwww.yaozh.com%2F"
}

# 1.3 Send the login request as a POST through an opener whose handler
# stores received cookies in ``cook_jar`` and replays them on later requests.
cook_jar = cookiejar.CookieJar()
cook_hanlder = urllib.request.HTTPCookieProcessor(cook_jar)
opener = urllib.request.build_opener(cook_hanlder)

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}

# The form must be URL-encoded, and POST data must be bytes.
login_str = parse.urlencode(login_form_data).encode('utf-8')
login_request = urllib.request.Request(login_url, headers=headers, data=login_str)

# If the login succeeds, the cookie jar keeps the session cookie.
opener.open(login_request)

# 2. Visit the member centre with the stored cookie.
center_url = 'https://www.yaozh.com/member/'
center_request = urllib.request.Request(center_url, headers=headers)

# Open the prepared request object, not the bare URL, so the User-Agent
# header is sent along with the cookie.
with opener.open(center_request) as response:
    # bytes --> str
    data = response.read().decode()

# The body was decoded as UTF-8 (the default of ``decode()``), so write it
# back as UTF-8 instead of the platform default encoding.
with open('02cook.html', 'w', encoding='utf-8') as f:
    f.write(data)

# One user logging in over and over from different places (IPs) and different
# browsers looks automated and can get the account banned; spreading the
# traffic needs several accounts.
错误提示
# Errors raised by urllib.request: HTTPError and URLError.
"""
Sample tracebacks:

    raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
"""
import urllib.error
import urllib.request

# The second assignment wins. The first URL is kept so the lines can be
# swapped to exercise the other ``except`` branch.
url = 'https://blog.csdn.net/zjsxxzh/article/details/110'
url = 'https://affdsfsfsdfd.cn'

try:
    # Close the connection right away; only the error handling matters here.
    with urllib.request.urlopen(url) as response:
        pass
# HTTPError is a subclass of URLError, so it has to be caught first.
except urllib.error.HTTPError as error:
    print(error.code)
except urllib.error.URLError as error:
    print(error)