The project is split into 3 files:
-config   shared settings and constants
-proxy_pool   scrapes and validates free proxies
-get_mzitu   crawls the gallery pages and downloads the images
A Redis server must be installed and running before use: https://redis.io/download
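A quick way to confirm the Redis server is reachable before running anything (a minimal sketch; host, port and password simply mirror the defaults used in the config file below):

from redis import StrictRedis

# assumes a local Redis on the default port, same settings as CONN in config
conn = StrictRedis(host='localhost', port=6379, db=0, password='')
print(conn.ping())  # True if the server is up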
config file

from redis import StrictRedis

# User-Agent entries
USER_AGENTS = [
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0"
]
# Referer entries
REFERER = [
'https://www.mzitu.com/',
'https://www.mzitu.com/215027',
'https://www.mzitu.com/201236',
]
# Redis server connection
CONN = StrictRedis(host='localhost', port=6379, db=0, password='')
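These constants are imported wholesale by the other two files. A typical access pattern looks like this (an illustrative sketch, not part of the original files; get_mzitu actually hardcodes its Referer rather than drawing from REFERER):

import random
from config import USER_AGENTS, REFERER, CONN

header = {
    'User-Agent': random.choice(USER_AGENTS),   # rotate the UA on every request
    'Referer': random.choice(REFERER),
}
print(CONN.keys('*'))   # proxies currently cached in Redis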
proxy_pool file
import re
import traceback
import random
import requests
from config import *
from redis import StrictRedis
from requests import ConnectionError
def headers():
    header = {
        'User-Agent': random.choice(USER_AGENTS),
    }
    return header
def get_page(url):
    print('pool: opening page')
    try:
        header = headers()
        res = requests.get(url, headers=header)
        if res.status_code == 200:
            return res.text
        else:
            # non-200 answer: retry with a different random User-Agent
            return get_page(url)
    except ConnectionError:
        return get_page(url)
def get_proxy_list():
    print('fetching the proxy list page')
    base_url = 'https://www.xicidaili.com/wt/'
    page_n = random.randint(100, 2700)
    url = base_url + str(page_n)
    print(url)
    html = get_page(url)
    try:
        # match every table row flagged as a Chinese ("Cn") proxy
        pattern = r'alt="Cn" /></td>([\d\D]*?)</tr>'
        root = re.findall(pattern, html)
        list_ip = []
        # second regex pass: pull the IP and port cells out of each row
        for i in range(len(root)):
            key = re.findall(r'<td>([\d\D]*?)</td>', root[i])
            # list_ip.append(key[3].lower() + '://' + key[0] + ':' + key[1])
            list_ip.append(key[0] + ':' + key[1])
        print(list_ip)
        return list_ip
    except Exception:
        print('failed to parse the IP addresses')
        traceback.print_exc()
def check_proxy():
    print('checking proxies')
    list_ip = get_proxy_list()
    check_url = 'https://www.mzitu.com'
    for i in list_ip:
        print(i)
        proxy_dic = {
            'http': i,
        }
        try:
            # a short timeout keeps dead proxies from hanging the check
            res = requests.get(check_url, proxies=proxy_dic, timeout=10)
            if res.status_code == 200:
                save_2_redis(i)
        except requests.RequestException:
            pass
def save_2_redis(proxy):
    print('saving %s' % proxy)
    conn = StrictRedis(host='localhost', port=6379, db=0, password='')
    # key by IP, value is the full ip:port string
    conn.set(proxy.split(':')[0], proxy)
def get_proxy():
    print('pool: handing out a proxy')
    if len(CONN.keys('*')) <= 3:
        # the pool is nearly empty: refill it, then try again
        check_proxy()
        return get_proxy()
    else:
        key = CONN.randomkey()
        r = CONN.get(key)
        CONN.delete(key)
        print(str(r, encoding='utf-8'))
        return str(r, encoding='utf-8')

def main():
    get_proxy()

if __name__ == '__main__':
    main()
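For reference, this is roughly how get_mzitu consumes the pool; the target URL here is just the site's front page, and any page would do (a sketch under that assumption):

import requests
from proxy_pool import get_proxy

proxy = get_proxy()                       # e.g. '1.2.3.4:8080', popped from Redis
proxy_dic = {'http': 'http://' + proxy}   # requests expects a scheme-keyed dict
res = requests.get('https://www.mzitu.com', proxies=proxy_dic)
print(res.status_code)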
get_mzitu file
import os
import requests
import random
from config import *
from proxy_pool import get_proxy
from bs4 import BeautifulSoup
from requests import ConnectionError
def headers():
    header = {
        'User-Agent': random.choice(USER_AGENTS),
    }
    return header

def referer_headers():
    referer_header = {
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': 'https://www.mzitu.com/',
    }
    return referer_header
def get_proxy_page(url, proxy_dic=None):
    if proxy_dic:
        try:
            header = headers()
            res = requests.get(url, headers=header, proxies=proxy_dic)
            return res.text, proxy_dic
        except ConnectionError:
            # the cached proxy has died: fall back to fetching a fresh one
            return get_proxy_page(url)
    else:
        try:
            header = headers()
            proxy_dic = crate_proxy_dic()
            res = requests.get(url, headers=header, proxies=proxy_dic)
            return res.text, proxy_dic
        except ConnectionError:
            return get_proxy_page(url)
def crate_proxy_dic():
    # wrap the raw ip:port string from the pool into a requests proxies dict
    proxy = 'http://' + str(get_proxy())
    proxy_dic = {
        'http': proxy,
    }
    return proxy_dic
def get_all_girls(url):
    print('fetching the URL of every album')
    html, proxy_dic = get_proxy_page(url, None)
    # build the soup for the archive page
    soup = BeautifulSoup(html, 'html.parser')
    # every 'a' tag under class_='archives' points to one album
    total_info = soup.find(class_='archives').find_all('a')
    # collect the 'href' of each 'a' tag
    all_list = []
    for girls_info in total_info:
        link_url = girls_info['href']
        all_list.append(link_url)
    print(all_list, proxy_dic)
    return all_list, proxy_dic
def get_girl_all_page(all_list, proxy_dic):
    for url in all_list:
        html, proxy_dic = get_proxy_page(url, proxy_dic)
        soup = BeautifulSoup(html, 'lxml')
        # the 'span' inside the second-to-last 'a' of class_='pagenavi' holds the picture count
        max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
        title = soup.find(class_='main-title').string
        # walk every detail page and collect the 'src' of its 'img' tag
        header = referer_headers()
        pic_url_list = []
        for i in range(int(max_page)):
            page_url = url + "/%s" % (i + 1)
            pic_url, proxy_dic = append_img_url(page_url, header, proxy_dic)
            pic_url_list.append(pic_url)
        download_Pic(title, pic_url_list, proxy_dic)
def append_img_url(page_url, header, proxy_dic=None):
    try:
        res = requests.get(page_url, headers=header, proxies=proxy_dic)
        if res.status_code == 200:
            pic_url = get_img_url(res)
            print(pic_url, proxy_dic)
            return pic_url, proxy_dic
        else:
            # the current proxy was rejected: swap in a fresh one and retry
            proxy_dic = crate_proxy_dic()
            res = requests.get(page_url, headers=header, proxies=proxy_dic)
            if res.status_code == 200:
                pic_url = get_img_url(res)
                return pic_url, proxy_dic
            else:
                return append_img_url(page_url, header, proxy_dic=None)
    except ConnectionError:
        return append_img_url(page_url, header, proxy_dic=None)
def get_img_url(res):
    html = res.text
    soup = BeautifulSoup(html, 'lxml')
    pic_url = soup.find('img').get('src')
    return pic_url
def download_Pic(title, pic_url_list, proxy_dic=None):
    print('download_pic')
    # create a folder named after the album title
    os.makedirs(title, exist_ok=True)
    headers = referer_headers()
    # sequence number used as the file name
    j = 1
    # download every picture
    for item in pic_url_list:
        # file path and name
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        with open(filename, 'wb') as f:
            try:
                img_res = requests.get(item, headers=headers, proxies=proxy_dic)
                if img_res.status_code == 200:
                    img = img_res.content
                    f.write(img)
                else:
                    # the proxy was rejected: fetch a new one and retry
                    proxy_dic = crate_proxy_dic()
                    img_res = requests.get(item, headers=headers, proxies=proxy_dic)
                    img = img_res.content
                    f.write(img)
            except ConnectionError:
                proxy_dic = crate_proxy_dic()
                img_res = requests.get(item, headers=headers, proxies=proxy_dic)
                img = img_res.content
                f.write(img)
        j += 1
if __name__ == '__main__':
    url = 'https://www.mzitu.com/all'
    all_list, proxy_dic = get_all_girls(url)
    get_girl_all_page(all_list, proxy_dic)
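To try the crawler without walking the whole archive, the album list can be sliced before it is handed to get_girl_all_page (a sketch; the slice size of 3 is arbitrary):

if __name__ == '__main__':
    url = 'https://www.mzitu.com/all'
    all_list, proxy_dic = get_all_girls(url)
    # crawl only the first 3 albums while testing
    get_girl_all_page(all_list[:3], proxy_dic)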