# Scrape images from the xiaohuar gallery site (爬取校花网图片)
# 页面路由规律
# href = "http://www.xiaohuar.com/list-1-0.html" 第一页
# href = "http://www.xiaohuar.com/list-1-1.html" 第二页
# href = "http://www.xiaohuar.com/list-1-2.html" 第三页
# href = "http://www.xiaohuar.com/list-1-3.html" 第四页
# 生成所有的页码
def get_page_url(page_count=2):
    """Yield index-page URLs for the image listing.

    The site paginates as ``list-1-<n>.html`` with ``n`` starting at 0
    (see the route notes above this function).

    Args:
        page_count: number of index pages to generate. Defaults to 2,
            matching the original hard-coded limit, so existing callers
            are unaffected.

    Yields:
        str: one index-page URL per page.
    """
    for page in range(page_count):
        yield 'http://www.xiaohuar.com/list-1-{}.html'.format(page)
# for url in get_page_url():
# print(url)
from requests_html import HTMLSession
import os
session = HTMLSession()
# 第一页解析测试
# url = 'http://www.xiaohuar.com/list-1-0.html'
# r = session.request(method='get', url=url, headers=headers)
# # print(r.text)
# img_element_list = r.html.find('[class="img"] img')
# # print(img_element_list)
# for img_element in img_element_list:
# print(img_element.attrs.get('alt'))
# print(r.html.base_url[:-1] + img_element.attrs.get('src'))
# 解析页面,获取图片名和url
def parse_page(url):
    """Fetch one index page and download every thumbnail it lists.

    Finds ``<img>`` tags under elements with class ``img``, derives a
    file name from the alt text, resolves relative image URLs against
    the site base URL, and hands each (name, url) pair to save_file().
    """
    r = session.request(method='get', url=url)
    img_element_list = r.html.find('[class="img"] img')
    for img_element in img_element_list:
        # Strip path separators from the alt text so it is a safe file
        # name. Fix: the original `replace('\', '')` was an unterminated
        # string literal (a SyntaxError); '\\' is the intended
        # single-backslash removal.
        file_name = img_element.attrs.get('alt').replace('/', '').replace('\\', '') + '.png'
        print(file_name)
        file_url = img_element.attrs.get('src')
        # Handle both relative and absolute src attributes.
        file_url = r.html.base_url[:-1] + file_url if not file_url.startswith('http') else file_url
        save_file(file_name, file_url)
def save_file(name, url):
    """Download *url* and write it as *name* inside the image directory.

    Fix: the target directory is created on first use — previously the
    ``open()`` below raised FileNotFoundError when the directory did
    not already exist.
    """
    base_path = '校花图片'
    os.makedirs(base_path, exist_ok=True)  # ensure target dir exists
    file_path = os.path.join(base_path, name)
    r = session.get(url=url)
    with open(file_path, 'wb') as f:
        f.write(r.content)
    print('%s下载成功' % name)
if __name__ == '__main__':
    # Walk every index page in turn and download the images it lists.
    for current_page in get_page_url():
        parse_page(current_page)
# Scrape videos from the xiaohuar site (爬取校花网视频)
# 页面路由规律
# http://www.xiaohuar.com/list-3-0.html 第一页
# http://www.xiaohuar.com/list-3-1.html 第二页
# http://www.xiaohuar.com/list-3-2.html 第三页
# http://www.xiaohuar.com/list-3-3.html 第四页
# http://www.xiaohuar.com/list-3-4.html 第五页
# http://www.xiaohuar.com/list-3-5.html 第六页
from requests_html import HTMLSession
import os
session = HTMLSession()
# 获取索引页url
def get_index_page(page_count=6):
    """Yield index-page URLs for the video listing.

    The site paginates as ``list-3-<n>.html`` with ``n`` starting at 0
    (see the route notes above).

    Args:
        page_count: number of index pages to generate. Defaults to 6,
            matching the original hard-coded limit, so existing callers
            are unaffected.

    Yields:
        str: one index-page URL per page.
    """
    for page in range(page_count):
        yield 'http://www.xiaohuar.com/list-3-%s.html' % page
# 解析索引页测试
# url = 'http://www.xiaohuar.com/list-3-5.html'
# r = session.get(url=url)
# # print(r.html.find('#images a[class="imglink"]'))
# for element in r.html.find('#images a[class="imglink"]'):
# print(element.attrs.get('href'))
# 解析索引页获取详情页url
def get_detail_page(url):
    """Fetch an index page and yield the href of each detail-page link.

    Matches anchors with class ``imglink`` inside the ``#images``
    container; each href is printed for progress tracing, then yielded.
    """
    response = session.get(url=url)
    for anchor in response.html.find('#images a[class="imglink"]'):
        print(anchor.attrs.get('href'))
        yield anchor.attrs.get('href')
# 测试解析详情页获取视频url,名字
# url = 'http://www.xiaohuar.com/p-3-13.html'
# # url = 'http://www.xiaohuar.com/p-3-5.html'
# r = session.get(url=url)
# r.html.encoding = 'gbk'
# file_name = r.html .find('title', first=True).text.replace('\', '')
#
# print(file_name)
#
# element = r.html.find('#media source', first=True)
# if element:
# video_url = element.attrs.get('src')
# print(video_url)
# else:
# video_url = r.html.search('var vHLSurl = "{}";')[0]
# print(video_url)
# 解析详情页获取视频url,名字
def get_url_name(url):
    """Fetch a video detail page and extract its name, URL and type.

    The page ``<title>`` (with backslashes removed) becomes the file
    name. A page either embeds an mp4 in a ``<source>`` tag under
    ``#media``, or carries an HLS playlist URL in a
    ``var vHLSurl = "...";`` script variable.

    Returns:
        tuple: (file_name, video_url, video_type) where video_type is
        ``'mp4'`` or ``'m3u8'``.
    """
    r = session.get(url=url)
    r.html.encoding = 'gbk'  # the site serves GBK-encoded pages
    # Fix: the original `replace('\', '')` was an unterminated string
    # literal (a SyntaxError); '\\' is the intended backslash removal.
    file_name = r.html.find('title', first=True).text.replace('\\', '')
    print(file_name)
    element = r.html.find('#media source', first=True)
    if element:
        video_url = element.attrs.get('src')
        video_type = 'mp4'
    else:
        video_url = r.html.search('var vHLSurl = "{}";')[0]
        video_type = 'm3u8'
    return file_name, video_url, video_type
# 保存文件
def save(file_name, video_url, video_type):
    """Persist one video: direct download for mp4, playlist walk for m3u8."""
    if video_type == 'mp4':
        target = file_name + '.mp4'
        response = session.get(url=video_url)
        with open(target, 'wb') as out:
            out.write(response.content)
    elif video_type == 'm3u8':
        save_m3u8(file_name, video_url)
# 处理m3u8
def save_m3u8(file_name, video_url):
    """Download an HLS stream: the playlist plus every .ts segment.

    Creates a directory named after the video, stores the playlist as
    ``playlist.m3u8``, then fetches each segment listed in it into the
    same directory.
    """
    if not os.path.exists(file_name):
        os.mkdir(file_name)
    r = session.get(url=video_url)
    m3u8_path = os.path.join(file_name, 'playlist.m3u8')
    with open(m3u8_path, 'wb') as f:
        f.write(r.content)
    # Fix: iterate the playlist *lines*, not characters. The original
    # `for line in r.text` walked the string one character at a time, so
    # no single character could ever end with 'ts' and no segment was
    # ever downloaded.
    for line in r.text.splitlines():
        if line.endswith('ts'):
            ts_url = video_url.replace('playlist.m3u8', line)
            ts_path = os.path.join(file_name, line)
            segment = session.get(url=ts_url)
            with open(ts_path, 'wb') as f:
                f.write(segment.content)
if __name__ == '__main__':
    # Crawl every index page, then every detail page it links to,
    # downloading each video found.
    for listing_url in get_index_page():
        for detail_url in get_detail_page(listing_url):
            name, url, kind = get_url_name(detail_url)
            save(name, url, kind)