requests模块

import requests """ # 1. 方法 requests.get requests.post requests.put requests.delete ... requests.request(method='POST') """ # 2. 参数 """ 2.1 url 2.2 headers 2.3 cookies 2.4 params 2.5 data,传请求体 requests.post( ..., data={'user':'alex','pwd':'123'} ) GET /index http1.1 host:c1.com user=alex&pwd=123 2.6 json,传请求体 requests.post( ..., json={'user':'alex','pwd':'123'} ) GET /index http1.1 host:c1.com Content-Type:application/json {"user":"alex","pwd":123} 2.7 代理 proxies # 无验证 proxie_dict = { "http": "61.172.249.96:80", "https": "http://61.185.219.126:3128", } ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxie_dict) # 验证代理 from requests.auth import HTTPProxyAuth proxyDict = { 'http': '77.75.105.165', 'https': '77.75.106.165' } auth = HTTPProxyAuth('用户名', '密码') r = requests.get("http://www.google.com",data={'xxx':'ffff'} proxies=proxyDict, auth=auth) print(r.text) #----------------------------------------以下是不太常用的(了解)------------------------------------------------- 2.8 文件上传 files # 发送文件 file_dict = { 'f1': open('xxxx.log', 'rb') #本地问文件 } requests.request( method='POST', url='http://127.0.0.1:8000/test/', files=file_dict ) 2.9 认证 auth 内部: 用户名和密码,用户和密码加密,放在请求头中传给后台。 - "用户:密码" - base64("用户:密码") #加密 - "Basic base64("用户:密码")" #构造字符串 - 请求头: Authorization: "Basic base64("用户|密码")" #把构造的字符串放入请求头中 from requests.auth import HTTPBasicAuth, HTTPDigestAuth ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf')) #执行HTTPBasicAuth时会调用__call__方法,然后走上面的流程 print(ret.text) 2.10 超时 timeout # ret = requests.get('http://google.com/', timeout=1) # print(ret) # ret = requests.get('http://google.com/', timeout=(5, 1)) # print(ret) 2.11 允许重定向 allow_redirects ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False) print(ret.text) 2.12 大文件下载 stream from contextlib import closing with closing(requests.get('http://httpbin.org/get', stream=True)) as r1: # 在此处理响应。 for i in r1.iter_content(): print(i) 
#---------------------------下面的暂时用不到,知道就行--------------------------------------------- 2.13 证书 cert - 百度、腾讯 => 不用携带证书(系统帮你做了) - 自定义证书 requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem") requests.get('http://127.0.0.1:8000/test/', cert=("xxxx/xxx/xxx.pem","xxx.xxx.xx.key")) 2.14 确认 verify =False """ requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
requests中的session:
- session(推荐自己携带cookie) 以前的cookie是我们自己携带的,每次请求都需要携带。在requests模块中有一个叫session的东西, 在session的内部会把我们访问时拿到的cookie全都保存下来,也会把header保存下来,也就是会把响应头和请求头全都保存下来, 我们在发请求的时候就不需要自己携带cookie了,它的内部会自动帮我们带上。 用法如下: session = requests.Session() session.get() session.post()
示例:
# ######################爬取汽车之家##################################### ''' import requests #import requests:伪造浏览器发起Http请求 from bs4 import BeautifulSoup # pip3 install BeautifulSoup4 # BeautifulSoup 将html格式的字符串解析成对象。 对象.find/find_all response = requests.get("https://www.autohome.com.cn/news/") response.encoding = 'gbk' #传输之间默认用的是字节,把字节转化为字符串,指定字符编码 soup = BeautifulSoup(response.text,'html.parser') #把html格式的字符串解析成soup对象 div = soup.find(name='div',attrs={'id':'auto-channel-lazyload-article'}) #find找到是匹配成功的第一个数据 li_list = div.find_all(name='li') #找到div标签下的所有li for li in li_list: title = li.find(name='h3') if not title: continue p = li.find(name='p') a = li.find(name='a') print(title.text) #找h3标签对象里面的文本内容 标题 print(a.attrs.get('href')) #找a标签对象里面的属性 链接 print(p.text) #找p标签对象里面的文本内容 简介 img = li.find(name='img') src = img.get('src') src = "https:" + src print(src) # 再次发起请求,下载图片 file_name = src.rsplit('/',maxsplit=1)[1] #拿到文件名 ret = requests.get(src) with open(file_name,'wb') as f: f.write(ret.content) # .content是返回的二进制 ''' # ################################### 示例一:爬取数据(携带请起头) ###################################
""" import requests from bs4 import BeautifulSoup r1 = requests.get( url='https://dig.chouti.com/', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ) soup = BeautifulSoup(r1.text,'html.parser') # 标签对象 content_list = soup.find(name='div',id='content-list') # print(content_list) # [标签对象,标签对象] item_list = content_list.find_all(name='div',attrs={'class':'item'}) for item in item_list: a = item.find(name='a',attrs={'class':'show-content color-chag'}) print(a.text.strip()) # print(a.text) """
# ################################### 示例二:点赞 ################################### #访问网站页面时会返回一个cookie,当登陆的时候会携带这个cookie(登陆时返回给我们的cookie是假的) import requests # 1. 查看首页 r1 = requests.get( url='https://dig.chouti.com/', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ) # 2. 提交用户名和密码 r2 = requests.post( url='https://dig.chouti.com/login', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' }, data={ 'phone':'8613121758648', 'password':'woshiniba', 'oneMonth':1 }, cookies=r1.cookies.get_dict() ) # 3. 点赞 r3 = requests.post( url='https://dig.chouti.com/link/vote?linksId=20435396', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' }, cookies=r1.cookies.get_dict() ) print(r3.text)

"""""" # ################################### 示例三:自动登录GitHub ################################### # 1. GET,访问登录页面 """ - 去HTML中找隐藏的Input标签获取csrf token - 获取cookie """ # 2. POST,用户名和密码 """ - 发送数据: - csrf - 用户名 - 密码 - 携带cookie """ # 3. GET,访问https://github.com/settings/emails """ - 携带 cookie """ import requests from bs4 import BeautifulSoup # ############## 方式一 ############## # # # 1. 访问登陆页面,获取 authenticity_token # i1 = requests.get('https://github.com/login') # soup1 = BeautifulSoup(i1.text, features='lxml') # tag = soup1.find(name='input', attrs={'name': 'authenticity_token'}) # authenticity_token = tag.get('value') # c1 = i1.cookies.get_dict() # i1.close() # # # 1. 携带authenticity_token和用户名密码等信息,发送用户验证 # form_data = { # "authenticity_token": authenticity_token, # "utf8": "", # "commit": "Sign in", # "login": "wupeiqi@live.com", # 'password': 'xxoo' # } # # i2 = requests.post('https://github.com/session', data=form_data, cookies=c1) # c2 = i2.cookies.get_dict() # c1.update(c2) # i3 = requests.get('https://github.com/settings/repositories', cookies=c1) # # soup3 = BeautifulSoup(i3.text, features='lxml') # list_group = soup3.find(name='div', class_='listgroup') # # from bs4.element import Tag # # for child in list_group.children: # if isinstance(child, Tag): # project_tag = child.find(name='a', class_='mr-1') # size_tag = child.find(name='small') # temp = "项目:%s(%s); 项目路径:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, ) # print(temp) # ############## 方式二 ############## # session = requests.Session() # # 1. 访问登陆页面,获取 authenticity_token # i1 = session.get('https://github.com/login') # soup1 = BeautifulSoup(i1.text, features='lxml') # tag = soup1.find(name='input', attrs={'name': 'authenticity_token'}) # authenticity_token = tag.get('value') # c1 = i1.cookies.get_dict() # i1.close() # # # 1. 
携带authenticity_token和用户名密码等信息,发送用户验证 # form_data = { # "authenticity_token": authenticity_token, # "utf8": "", # "commit": "Sign in", # "login": "wupeiqi@live.com", # 'password': 'xxoo' # } # # i2 = session.post('https://github.com/session', data=form_data) # c2 = i2.cookies.get_dict() # c1.update(c2) # i3 = session.get('https://github.com/settings/repositories') # # soup3 = BeautifulSoup(i3.text, features='lxml') # list_group = soup3.find(name='div', class_='listgroup') # # from bs4.element import Tag # # for child in list_group.children: # if isinstance(child, Tag): # project_tag = child.find(name='a', class_='mr-1') # size_tag = child.find(name='small') # temp = "项目:%s(%s); 项目路径:%s" % (project_tag.get('href'), size_tag.string, project_tag.string, ) # print(temp)
拉勾网示例:
########################拉勾网############################### ''' - 密码加密 - 找js,通过python实现加密方式 - 找密文,密码<=>密文 - Referer头, 上一次请求地址,可以用于做防盗链。 ''' import re import requests r1 = requests.get( url='https://passport.lagou.com/login/login.html', headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', } ) X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0] X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0] # print(X_Anti_Forge_Token, X_Anti_Forge_Code) # print(r1.text) # r2 = requests.post( url='https://passport.lagou.com/login/login.json', headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 'X-Anit-Forge-Code':X_Anti_Forge_Code, 'X-Anit-Forge-Token':X_Anti_Forge_Token, 'Referer': 'https://passport.lagou.com/login/login.html', # 上一次请求地址是什么? }, data={ "isValidate": True, 'username': '15269853268', 'password': 'ab18d328d7126ea65915c50359c22c0d', 'request_form_verifyCode': '', 'submit': '' }, cookies=r1.cookies.get_dict() ) print(r2.text)
小结:
请求头: user-agent referer host cookie 特殊请求头,查看上一次请求获取内容。 'X-Anit-Forge-Code':... 'X-Anit-Forge-Token':... 请求体: - 原始数据 - 原始数据 + token - 密文 - 找算法 - 使用密文 套路: - post登录获取cookie,以后携带cookie - get获取未授权cookie,post登录携带cookie去授权,以后携带cookie