一:爬虫介绍
1. 什么是爬虫
爬虫就是一个网络蜘蛛,伪装成用户,去网站拿到自己想要的数据。注意:是可见即可爬,如果爬取的是人家加密的数据,就是黑客(犯法)
2. 爬虫的本质
模拟浏览器发送请求(requests,selenium)->下载网页代码->只提取有用的数据(bs4,xpath,re)->存放于数据库或文件中(文件,
excel,mysql,redis,mongodb)
3. 发送请求:请求地址(浏览器调试,抓包工具),请求头(难),请求体(难),请求方法
4. 拿到响应体: 拿到响应体(json格式,xml格式,html格式(bs4,xpath),加密的未知格式(需要解密))
5. 入库:MongoDB(json格式数据)
6. 性能高一些(多线程,多进程,协程),只针对于python语言的cpython解释器(GIL:同一时刻只能有一个线程在执行)
io密集型:用线程
计算密集型:用进程
7. scrapy框架处理了性能
二、requests模块使用
1 安装:pip3 install requests
2 图片防盗链:referer(一般是访问该网站的上一次地址)
3 代码
import requests

# 1. Send a GET request.
# requests.get() returns a Python object wrapping the response
# (headers, body, status code, ...).

# Example: download a single image from a site that uses hot-link protection.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    # Hot-link protection checks the referer (normally the previously
    # visited page on the site).
    'referer': 'https://www.mzitu.com/225078/2',
}

res = requests.get('https://www.mzitu.com/', headers=header)
print(res.text)  # the page body decoded to str

res1 = requests.get('https://i3.mmzztt.com/2020/03/14a02.jpg', headers=header)
# An image is binary data, so read it through .content (bytes); decoding it
# through .text only yields garbage.
print(res1.content)

# Save the downloaded data to a file. iter_content() defaults to 1-byte
# chunks, so ask for a larger chunk size explicitly.
with open('a.jpg', 'wb') as f:
    for chunk in res1.iter_content(chunk_size=8192):
        f.write(chunk)
请求中携带cookie的两种方式(推荐第二种)
方式一,在header中放
# Option 1: ship the cookies as a raw "cookie" request header.
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
raw_cookie = 'key=asdfasdfasdfsdfsaasdf;key2=asdfasdf;key3=asdfasdf'
header = {'user-agent': user_agent, 'cookie': raw_cookie}
res = requests.get('http://127.0.0.1:8000/index/', headers=header)
方式二:
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
}
# The cookies argument accepts either a dict or a CookieJar object.
res = requests.get(
    'http://127.0.0.1:8000/index/',
    headers=header,
    cookies={'key': 'asdfasdf'},
)
print(res.text)
4 发送post请求,携带数据(urlencoded和json)
# Form-encoded (urlencoded) body: pass a dict through data=.
res = requests.post('http://127.0.0.1:8000/index/', data={'name': 'lqz'})
print(res.text)

# JSON body: pass the object through json=.
res = requests.post('http://127.0.0.1:8000/index/', json={'age': 1})
print(res.text)
5 自动携带cookie(得先登录)
session = requests.session()

# Assume this request performs the login.
res = session.post('http://127.0.0.1:8000/index/')

# No need to attach cookies by hand from here on: the session keeps the
# cookies it received and sends them with later requests.
res1 = session.get('http://127.0.0.1:8000/order/')
6 response对象
response = requests.post('http://127.0.0.1:8000/index/', data={'name': 'lqz'})

print(response.text)                # response body as text
print(response.content)             # response body as raw bytes
print(response.status_code)         # HTTP status code
print(response.headers)             # response headers
print(response.cookies)             # cookies sent back by the server
print(response.cookies.get_dict())  # the cookies converted to a dict
print(response.cookies.items())     # (key, value) pairs
print(response.url)                 # URL of the request
print(response.history)             # responses of any redirects that led here
print(response.encoding)            # encoding used to decode .text

# iter_content() hands out the body piece by piece, which suits images,
# video and other large files. The original loop wrote to an undefined
# file object; open one explicitly.
with open('download.bin', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)
7 编码问题
res = requests.get('http://www.autohome.com/news')

# If the printed text comes out garbled, override the encoding that is used
# to decode the body. Either option is enough on its own; with both lines
# present the later assignment is the one that takes effect.

# Option 1: set the encoding explicitly.
res.encoding = 'gb2312'

# Option 2: use the encoding detected from the body content.
res.encoding = res.apparent_encoding

print(res.text)
8 解析json
import json

response = requests.post('http://127.0.0.1:8000/index/', data={'name': 'lqz'})

print(type(response.text))        # the body is a plain str
print(json.loads(response.text))  # parse it by hand ...
print(response.json())            # ... or let the response object do the same
print(type(response.json()))      # type of the parsed Python object
9 高级用法之ssl(了解)
import requests

# Skip certificate verification: a warning is emitted, but the request still
# goes through. verify=False is what turns the check off; without it an
# invalid certificate raises an SSL error instead.
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)

# Use a client certificate: it has to be supplied by hand as a
# (certificate file, key file) pair.
response = requests.get(
    'https://www.12306.cn',
    cert=('/path/server.crt', '/path/key'),
)
print(response.status_code)
10 高级用法:使用代理
# proxies maps the URL scheme of the request to the proxy to use for it.
response = requests.get(
    'http://127.0.0.1:8000/index/',
    proxies={'http': '代理的地址和端口号'},  # placeholder: proxy address and port
)

# Proxies come as free ones and paid ones.
# Proxy pool: keep a list of proxy IPs and pick a random one for each
# request, so that your own IP does not get banned.
# Anonymous vs. transparent proxies (per these notes): behind a
# high-anonymity proxy the backend cannot learn your IP; behind a
# transparent one it can, via the X-Forwarded-For header.

# The target URL is https, so the proxy must be registered for the 'https'
# scheme as well -- an 'http'-only mapping is ignored for this request.
# The proxy URL also needs an explicit scheme.
proxy = 'http://27.46.20.226:8888'
response = requests.get(
    'https://www.baidu.com/',
    proxies={'http': proxy, 'https': proxy},
)
print(response.text)
11 超时设置
import requests

# timeout is given in seconds. A value this small is only meant to
# demonstrate a timeout being triggered.
response = requests.get('https://www.baidu.com', timeout=0.0001)
12 认证设置(你见不到了)
import requests

# auth takes a (username, password) tuple; 'xxx' is a placeholder for the
# real URL.
r = requests.get('xxx', auth=('user', 'password'))
print(r.status_code)
13 异常处理
import requests
# Import the needed exception classes by name instead of using a star
# import; requests.exceptions lists all available exception types.
from requests.exceptions import RequestException, Timeout

try:
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except Timeout:
    # Timeout is the common base of ConnectTimeout and ReadTimeout. With a
    # timeout this small the connection attempt itself usually times out,
    # which a ReadTimeout-only handler would miss.
    print('===:')
except RequestException as e:
    # Any other error raised by requests while handling the request.
    print(e)
三、模拟登陆某网站
# Simulated login against http://www.aa7a.cn/
import requests

USERNAME = '2393732792@qq.com'

# Cookie string copied from the browser's request headers.
BROWSER_COOKIE = '_jzqa=1.2792172112721243000.1609120233.1609120233.1609120233.1; _jzqc=1; _jzqckmp=1; UM_distinctid=176a70a34f41b-0110db8ca74d6d-376b4502-1fa400-176a70a34f5994; CNZZDATA4603183=cnzz_eid%3D699572895-1609115372-%26ntime%3D1609115372; Hm_lvt_c29657ca36c6c88e02fed9a397826038=1609120233; CNZZDATA1260462072=29789027-1609115372-%7C1609115372; Qs_lvt_201322=1609120232; __xsptplusUT_422=1; __xsptplus422=422.1.1609120235.1609120264.2%234%7C%7C%7C%7C%7C%23%23zIGwupE6-j3-7AL-M8sUNlINzXmrjslX%23; mediav=%7B%22eid%22%3A%22179539%22%2C%22ep%22%3A%22%22%2C%22vid%22%3A%22%2BNaKHo%3FBqD9.)brpw%25uh%22%2C%22ctn%22%3A%22%22%2C%22vvid%22%3A%22%2BNaKHo%3FBqD9.)brpw%25uh%22%2C%22_mvnf%22%3A1%2C%22_mvctn%22%3A0%2C%22_mvck%22%3A0%2C%22_refnf%22%3A0%7D; _qzjc=1; ECS_ID=cae4ff892ee137ed78cf440d9dc9b4e5b44682e2; ECS[visit_times]=2; _qzja=1.1413123720.1609120232650.1609120232650.1609120232650.1609120312187.1609120315593.2393732792%2540qq_com.1.0.5.1; _qzjb=1.1609120232650.5.0.0.0; _qzjto=5.1.0; _jzqb=1.13.10.1609120233.1; Qs_pv_201322=103942035330658960%2C385729642579573200%2C4568030181997294000%2C3227532881226825700; Hm_lpvt_c29657ca36c6c88e02fed9a397826038=1609120316'

session = requests.session()

# Cookie and User-Agent are request headers, not form fields. The original
# put both into the POST body, where the server never reads them as headers.
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
# Load the browser cookies into the session's cookie jar so that every
# request of this session carries them and the server can update them.
session.cookies.update(
    dict(pair.split('=', 1) for pair in BROWSER_COOKIE.split('; '))
)

# Form fields of the login request.
data = {
    'username': USERNAME,
    'password': '不告诉你',
    'captcha': 'zdu4',  # presumably only valid for the browser session above
    'remember': 1,
    'ref': 'http://www.aa7a.cn/user.php?act=logout',
    'act': 'act_login',
}

# NOTE(review): the original author noted that a GET to user.php has to be
# sent before this POST or the login does not succeed -- confirm.
rest = session.post('http://www.aa7a.cn/user.php', data=data)

# Cookies set by the login response.
cookie = rest.cookies
print(cookie)

# The session now carries the login cookies, so the index page should
# contain the user name of the logged-in account.
rest1 = session.get('http://www.aa7a.cn/index.php')
print(USERNAME in rest1.text)
四、爬取妹子图
import re
import time

import requests

# Request headers.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    # The previously visited page. This one is mandatory: the site has
    # hot-link protection and does not serve the real image without it.
    'referer': 'https://www.mzitu.com/',
}

# Captures the numeric id at the end of each gallery link. The original
# pattern had lost the backslash ("d+?") and therefore only matched literal
# letters "d", never an id.
GALLERY_ID_RE = re.compile(r'href="https://www.mzitu.com/(\d+?)"')
# Captures every image URL on a gallery page.
IMG_SRC_RE = re.compile(r"img src='(.*?)'")

# Fetch the home page HTML.
res = requests.get('https://www.mzitu.com/', headers=header)

# Ids of all galleries linked from the page; dict.fromkeys() drops repeated
# ids while keeping their order, so no gallery is downloaded twice.
meizi_urls = list(dict.fromkeys(GALLERY_ID_RE.findall(res.text)))

for i in meizi_urls:
    gallery_url = 'https://www.mzitu.com/' + i
    print(gallery_url)
    response1 = requests.get(gallery_url, headers=header)

    meizi_tpurls = IMG_SRC_RE.findall(response1.text)
    print(meizi_tpurls)
    if not meizi_tpurls:
        continue

    # Only the first image of each gallery is saved (the original loop
    # stopped after its first iteration).
    response2 = requests.get(meizi_tpurls[0], headers=header)
    time.sleep(0.5)  # pause between downloads

    # The body is binary image data: write it in chunks instead of printing
    # it as text or copying it one byte at a time.
    with open(i + '.jpg', 'wb') as f:
        for chunk in response2.iter_content(chunk_size=8192):
            f.write(chunk)
五、爬取梨视频
改规则了,正在研究