I. The Requests module:
Installation: pip3 install requests
1. Requests: a powerful, easy-to-use HTTP library. A typical crawl has four steps (a minimal end-to-end sketch follows this outline):
(1) Specify the URL
(2) Send the request
(3) Parse the data
(4) Persist the results
2. GET requests
3. POST requests
4. AJAX GET requests
5. AJAX POST requests
6. Comprehensive exercise: Baidu Tieba
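A minimal sketch tying the four steps together. The naive <title> extraction in step (3) is only an illustrative placeholder for real parsing (regex, BeautifulSoup, xpath, ...):

# Four-step workflow: specify URL -> send request -> parse -> persist
import requests

url = 'https://www.taobao.com'        # (1) specify the URL
response = requests.get(url=url)      # (2) send the request
html = response.text
# (3) parse: naive title extraction; guards against a missing <title> tag
title = html.split('<title>')[1].split('</title>')[0] if '<title>' in html else ''
with open('./title.txt', 'w', encoding='utf-8') as f:  # (4) persist
    f.write(title)
print('saved title:', title)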
# Basic usage and common attributes of the response object
import requests

url = 'https://www.taobao.com'
response = requests.get(url=url)
# print(response.text)         # response body as a string
# print(response.content)      # response body as raw bytes
# print(response.json())       # response body parsed as JSON (when the body is JSON)
# print(response.encoding)     # encoding reported by the server
# print(response.status_code)  # HTTP status code of the response
# print(response.headers)      # response headers from the server
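Before touching the body it is often worth failing fast on HTTP errors; requests ships raise_for_status() for exactly that, and response.encoding can be overridden when the text comes out garbled:

import requests

response = requests.get('https://www.taobao.com')
response.raise_for_status()   # raises requests.exceptions.HTTPError on 4xx/5xx
response.encoding = 'utf-8'   # override the reported encoding if needed
print(response.text[:200])    # first 200 characters of the page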
# Write the response body to a file
import requests

url = 'https://www.taobao.com'
response = requests.get(url)
data = response.text
with open('./taobao.html', 'w', encoding='utf-8') as f:
    f.write(data)
print('over')
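The same pattern works for binary data through response.content; a short sketch for downloading an image (the image URL is a placeholder, not from the original notes):

import requests

img_url = 'https://www.example.com/logo.png'  # placeholder URL
response = requests.get(img_url)
with open('./logo.png', 'wb') as f:  # binary mode: no encoding argument
    f.write(response.content)        # .content is bytes, .text is str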
# GET with query parameters
import requests

url = 'http://www.baidu.com/s'  # note: the https version of this URL misbehaves (page flashes), so http is used
wd = input('enter a word:')
param = {
    'ie': 'utf-8',
    'wd': wd
}
response = requests.get(url=url, params=param)
data = response.text
filename = wd + '.html'
with open(filename, 'w', encoding='utf-8') as f:  # mind the encoding, or the saved file will be garbled
    f.write(data)
print('over')
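requests URL-encodes the params dict automatically; to see the final URL without sending anything, a prepared request (a standard requests feature) can be inspected:

import requests

param = {'ie': 'utf-8', 'wd': '爬虫'}
prepared = requests.Request('GET', 'http://www.baidu.com/s', params=param).prepare()
print(prepared.url)  # e.g. http://www.baidu.com/s?ie=utf-8&wd=%E7%88%AC%E8%99%AB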
# Baidu Translate: POST with form data
import requests

# The next two lines ignore certificate verification (a workaround for SSLError)
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

url = 'https://fanyi.baidu.com/sug'
data = {
    'kw': 'dog'
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
response = requests.post(url=url, data=data, headers=header)  # pass headers so the User-Agent is actually sent
print(response.text)  # returns a JSON result; paste it into an online JSON parser to inspect
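requests also has its own switch for skipping certificate verification, verify=False, which is the usual way to deal with SSLError in requests itself (for scraping practice only, never production):

import requests
import urllib3

urllib3.disable_warnings()  # silence the InsecureRequestWarning that verify=False triggers
response = requests.post('https://fanyi.baidu.com/sug', data={'kw': 'dog'}, verify=False)
print(response.json())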
# Douban movies: AJAX GET with query parameters
import requests

url = 'https://movie.douban.com/j/chart/top_list'
param = {
    'type': '24',
    'interval_id': '100:90',
    'action': '',
    'start': '20',
    'limit': '20'
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
response = requests.get(url=url, params=param, headers=header)
print(response.url)
print(response.text)
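Since this endpoint answers with JSON, response.json() yields Python objects directly. A sketch assuming the response is still a list of dicts carrying 'title' and 'score' fields (those field names are assumptions and may have changed):

import requests

url = 'https://movie.douban.com/j/chart/top_list'
param = {'type': '24', 'interval_id': '100:90', 'action': '', 'start': '0', 'limit': '20'}
header = {'User-Agent': 'Mozilla/5.0'}
movies = requests.get(url=url, params=param, headers=header).json()
for movie in movies:                                # assumed: a list of dicts
    print(movie.get('title'), movie.get('score'))   # assumed field names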
# KFC store list: AJAX POST with pagination
import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
address = input('enter an address:')
start_page = int(input('enter a start page num: '))
end_page = int(input('enter an end page num:'))
for page in range(start_page, end_page + 1):
    data = {
        'cname': '',
        'pid': '',
        'keyword': address,
        'pageIndex': str(page),
        'pageSize': '10'
    }
    response = requests.post(url=url, headers=header, data=data)
    print(response.text)
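Rather than printing, the per-page results could be collected and persisted as one JSON file with the standard library's json module; a sketch (the keyword and file name are illustrative):

import json
import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
results = []
for page in range(1, 3):
    data = {'cname': '', 'pid': '', 'keyword': '北京',
            'pageIndex': str(page), 'pageSize': '10'}
    results.append(requests.post(url=url, data=data).json())  # assumes a JSON response
with open('./kfc_stores.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)  # keep Chinese text readable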
# Comprehensive exercise: crawl pages of a Baidu Tieba forum
import requests

name = input('enter name:')
start_page = int(input('enter start page:'))
end_page = int(input('enter end page:'))
url = 'http://tieba.baidu.com/f'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
for page in range(start_page, end_page + 1):
    print('start crawling page %s' % page)
    param = {
        'kw': name,
        'ie': 'utf-8',
        'pn': (page - 1) * 50  # Tieba paginates 50 posts per page
    }
    # Do not pass the headers here: with them the page fails to load properly, reason unknown
    # response = requests.get(url=url, params=param, headers=headers)
    response = requests.get(url=url, params=param)
    # Persist the page
    filename = name + '_' + str(page) + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)
    print('finished crawling page %s' % page)
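When looping over many pages it is safer (and politer) to pause between requests and to catch network errors instead of letting one bad page kill the run; a sketch using time.sleep, a timeout, and requests' base exception class:

import time
import requests

for page in range(1, 4):
    try:
        response = requests.get('http://tieba.baidu.com/f',
                                params={'kw': 'python', 'pn': (page - 1) * 50},
                                timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print('page %s failed: %s' % (page, e))
        continue
    time.sleep(1)  # one-second pause between pages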
# Log in to Renren
import requests
# input('enter a code:')  # captcha prompt, unused here

# Get a session object; requests sent through the session carry its cookies automatically
session = requests.session()

# The login URL, captured with a packet-capture tool
url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201883913543'
data = {
    'email': '17701256561',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '7b456e6c3eb6615b2e122a2942ef3845da1f91e3de075179079a3b84952508e4',
    'rkey': '44fd96c219c593f3c9612360c80310a3',
    'f': 'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3Dm7m_NSUp5Ri_ZrK5eNIpn_dMs48UAcvT-N_kmysWgYW%26wd%3D%26eqid%3Dba95daf5000065ce000000035b120219',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
# First request through the session: the login response sets the cookie,
# and the session stores it for all later requests
response = session.post(url=url, headers=headers, data=data)
# Second request, to a second-level page: the stored cookie is sent automatically
url_ = 'http://www.renren.com/289676607/profile'
response_ = session.get(url=url_, headers=headers)
with open('./second.html', 'w', encoding='utf-8') as fp:
    fp.write(response_.text)
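To confirm that a response actually set cookies, the session's cookie jar (a RequestsCookieJar) can be inspected; a minimal sketch against an unrelated site:

import requests

session = requests.session()
session.get('http://www.baidu.com')   # any response that sets cookies
for cookie in session.cookies:        # the jar is iterable
    print(cookie.name, '=', cookie.value)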
# Setting a proxy in code
import requests
import random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
url = 'http://www.baidu.com/s'
param = {
    'ie': 'utf-8',
    'wd': 'ip'
}
# Proxy IPs obtained from ProxyThorn
proxy1 = {'http': '112.115.57.20:3128'}
proxy2 = {'http': '121.41.171.223:3128'}
proxy3 = {'http': '121.41.171.223:3128'}
proxys = [proxy1, proxy2, proxy3]
proxy = random.choice(proxys)
response = requests.get(url=url, headers=headers, params=param, proxies=proxy)
print(response.text)

# Revert to your own local IP by passing an empty proxy
requests.get(url, proxies={'http': ''})
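Free proxies die constantly, so in practice it helps to walk the pool until one answers, using a timeout and requests' exception handling (the proxy addresses below are the same placeholders as above):

import requests

proxy_pool = [
    {'http': '112.115.57.20:3128'},
    {'http': '121.41.171.223:3128'},
]
for proxy in proxy_pool:
    try:
        response = requests.get('http://www.baidu.com', proxies=proxy, timeout=5)
        response.raise_for_status()
        print('working proxy:', proxy)
        break
    except requests.exceptions.RequestException:
        print('proxy failed:', proxy)
else:
    print('no proxy in the pool worked')  # for/else: runs only if no break occurred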