# 爬虫的相关操作
# 1、爬文本内容
# coding=gbk
import requests ##声明相关库
import re
# Download the front page of duanziwang.com and print the inner text of
# every <p>...</p> element found in it.
#
# A timeout is passed so the script cannot hang forever on an unresponsive
# server, and raise_for_status() aborts on an HTTP error instead of parsing
# an error page as if it were content.
response = requests.get('http://duanziwang.com/', timeout=10)
response.raise_for_status()
data = response.text  # decoded HTML of the whole page

# Non-greedy capture of each paragraph body. No re.S flag is used, so '.'
# does not cross newlines and only single-line paragraphs are matched.
res = re.findall(r'<p>(.*?)</p>', data)
print(res)
import requests
import re
# Download the front page of ishuo.cn, pull the title, content and
# description out of every <li class="list_li"> item, and print them as
# "title | (content, description)" with the title left-aligned to 20 columns.
response = requests.get('http://ishuo.cn/', timeout=10)
response.raise_for_status()  # do not parse an HTTP error page
data = response.text
res = re.findall(r'<li class="list_li">(.*?)</li>', data)

# Maps title -> (content, description). Named `entries` rather than `dict`
# so the builtin `dict` type is not shadowed.
entries = {}
for item in res:
    content_match = re.search(r'<div class="content">(.*?)</div>', item)
    title_match = re.search(r'<a href="/subject/.*?">(.*?)</a>', item)
    # NOTE(review): the description pattern is anchored on the literal
    # "04月", so only descriptions starting with that month are matched.
    # Confirm against the live page before generalizing it.
    desc_match = re.search(r'</a>(04月.*?)</div>', item)

    # Skip items that lack any of the three fields instead of crashing
    # with IndexError on an empty match list.
    if not (content_match and title_match and desc_match):
        continue

    entries[title_match.group(1)] = (
        content_match.group(1),
        desc_match.group(1),
    )

for title, details in entries.items():
    print(f'{title:<20} | {details}')
# 2、爬图片内容
import requests
import re
# Download the nipic.com listing page, collect every data-src="..." URL
# from it, and save each referenced image into the current directory under
# the last path segment of its URL.
response = requests.get('http://www.nipic.com/design/acg/renwu/index.html?page=1&tdsourcetag=s_pcqq_aiomsg', timeout=10)
response.raise_for_status()  # do not parse an HTTP error page
data = response.text
img_res = re.findall(r'data-src="(.*?)"', data)

for img_url in img_res:
    # The file name is whatever follows the final '/' in the URL.
    img_name = img_url.split('/')[-1]
    if not img_name:
        # URL ends with '/': there is no usable file name to write to.
        continue

    img_response = requests.get(img_url, timeout=10)
    if not img_response.ok:
        # Do not save an HTTP error body as if it were an image.
        continue

    # Image bytes are written verbatim; `with` guarantees the file handle
    # is closed even if the write fails.
    with open(img_name, 'wb') as f:
        f.write(img_response.content)
# 3、爬视频内容
import requests
import re
# Download the video index page of www.mod.gov.cn, follow every linked
# .htm page, look for a "//Video <url>.mp4" marker in that page, and save
# the referenced video to 'test.mp4'.
response = requests.get('http://www.mod.gov.cn/v/index.htm', timeout=10)
response.raise_for_status()  # do not parse an HTTP error page
data = response.text
mp4_res2 = re.findall(r'<a href="(.*?)">', data)

for href in mp4_res2:
    # Keep the href up to and including its first "htm"; links without
    # "htm" are skipped instead of crashing on an empty match list.
    page_match = re.search(r'(.*?htm)', href)
    if page_match is None:
        continue

    page_url = 'http://www.mod.gov.cn/v/' + page_match.group(1)
    page_response = requests.get(page_url, timeout=10)
    if not page_response.ok:
        continue

    # The marker looks like:
    #   //Video http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    # The dot before "mp4" is escaped so it only matches a literal '.'.
    video_match = re.search(r'//Video (.*?\.mp4)', page_response.text)
    if video_match is None:
        # Page has no video marker; nothing to download.
        continue

    mp4_response = requests.get(video_match.group(1), timeout=10)
    if not mp4_response.ok:
        # Do not save an HTTP error body as if it were a video.
        continue

    # `with` guarantees the file handle is closed after writing.
    with open('test.mp4', 'wb') as f:
        f.write(mp4_response.content)
    # NOTE: every video is written to the same 'test.mp4', so each download
    # overwrites the previous one. Add a `break` here to stop after the
    # first video.