The project is split into 3 files:
-config   shared settings and constants
-proxy_pool   scrapes and validates free proxies
-get_mzitu   crawls the gallery pages and downloads the images
A Redis server must be installed and running before use: https://redis.io/download
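A quick way to confirm the Redis server is reachable before running anything (a minimal sketch; host, port and password simply mirror the defaults used in the config file below):

from redis import StrictRedis

# assumes a local Redis on the default port, same settings as CONN in config
conn = StrictRedis(host='localhost', port=6379, db=0, password='')
print(conn.ping())  # True if the server is up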
config file

from redis import StrictRedis

# User-Agent entries
USER_AGENTS = [
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0"
]
# Referer entries
REFERER = [
'https://www.mzitu.com/',
'https://www.mzitu.com/215027',
'https://www.mzitu.com/201236',
]
# Redis server connection
CONN = StrictRedis(host='localhost', port=6379, db=0, password='')
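These constants are imported wholesale by the other two files. A typical access pattern looks like this (an illustrative sketch, not part of the original files; get_mzitu actually hardcodes its Referer rather than drawing from REFERER):

import random
from config import USER_AGENTS, REFERER, CONN

header = {
    'User-Agent': random.choice(USER_AGENTS),   # rotate the UA on every request
    'Referer': random.choice(REFERER),
}
print(CONN.keys('*'))   # proxies currently cached in Redis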
proxy_pool file
import re
import traceback
import random
import requests
from config import *
from redis import StrictRedis
from requests import ConnectionError
def headers():
    header = {
        'User-Agent': random.choice(USER_AGENTS),
    }
    return header
def get_page(url):
    print('pool: opening page')
    try:
        header = headers()
        res = requests.get(url, headers=header)
        if res.status_code == 200:
            return res.text
        else:
            # non-200 answer: retry with a different random User-Agent
            return get_page(url)
    except ConnectionError:
        return get_page(url)
def get_proxy_list():
    print('fetching the proxy list page')
    base_url = 'https://www.xicidaili.com/wt/'
    page_n = random.randint(100, 2700)
    url = base_url + str(page_n)
    print(url)
    html = get_page(url)
    try:
        # match every table row flagged as a Chinese ("Cn") proxy
        pattern = r'alt="Cn" /></td>([\d\D]*?)</tr>'
        root = re.findall(pattern, html)
        list_ip = []
        # second regex pass: pull the IP and port cells out of each row
        for i in range(len(root)):
            key = re.findall(r'<td>([\d\D]*?)</td>', root[i])
            # list_ip.append(key[3].lower() + '://' + key[0] + ':' + key[1])
            list_ip.append(key[0] + ':' + key[1])
        print(list_ip)
        return list_ip
    except Exception:
        print('failed to parse the IP addresses')
        traceback.print_exc()
def check_proxy():
    print('checking proxies')
    list_ip = get_proxy_list()
    check_url = 'https://www.mzitu.com'
    for i in list_ip:
        print(i)
        proxy_dic = {
            'http': i,
        }
        try:
            # a short timeout keeps dead proxies from hanging the check
            res = requests.get(check_url, proxies=proxy_dic, timeout=10)
            if res.status_code == 200:
                save_2_redis(i)
        except requests.RequestException:
            pass
def save_2_redis(proxy):
    print('saving %s' % proxy)
    conn = StrictRedis(host='localhost', port=6379, db=0, password='')
    # key by IP, value is the full ip:port string
    conn.set(proxy.split(':')[0], proxy)
def get_proxy():
    print('pool: handing out a proxy')
    if len(CONN.keys('*')) <= 3:
        # the pool is nearly empty: refill it, then try again
        check_proxy()
        return get_proxy()
    else:
        key = CONN.randomkey()
        r = CONN.get(key)
        CONN.delete(key)
        print(str(r, encoding='utf-8'))
        return str(r, encoding='utf-8')

def main():
    get_proxy()

if __name__ == '__main__':
    main()
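For reference, this is roughly how get_mzitu consumes the pool; the target URL here is just the site's front page, and any page would do (a sketch under that assumption):

import requests
from proxy_pool import get_proxy

proxy = get_proxy()                       # e.g. '1.2.3.4:8080', popped from Redis
proxy_dic = {'http': 'http://' + proxy}   # requests expects a scheme-keyed dict
res = requests.get('https://www.mzitu.com', proxies=proxy_dic)
print(res.status_code)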
get_mzitu file
import os
import requests
import random
from config import *
from proxy_pool import get_proxy
from bs4 import BeautifulSoup
from requests import ConnectionError
def headers():
    header = {
        'User-Agent': random.choice(USER_AGENTS),
    }
    return header

def referer_headers():
    referer_header = {
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': 'https://www.mzitu.com/',
    }
    return referer_header
def get_proxy_page(url, proxy_dic=None):
    if proxy_dic:
        try:
            header = headers()
            res = requests.get(url, headers=header, proxies=proxy_dic)
            return res.text, proxy_dic
        except ConnectionError:
            # the cached proxy has died: fall back to fetching a fresh one
            return get_proxy_page(url)
    else:
        try:
            header = headers()
            proxy_dic = crate_proxy_dic()
            res = requests.get(url, headers=header, proxies=proxy_dic)
            return res.text, proxy_dic
        except ConnectionError:
            return get_proxy_page(url)
def crate_proxy_dic():
    # wrap the raw ip:port string from the pool into a requests proxies dict
    proxy = 'http://' + str(get_proxy())
    proxy_dic = {
        'http': proxy,
    }
    return proxy_dic
def get_all_girls(url):
    print('fetching the URL of every album')
    html, proxy_dic = get_proxy_page(url, None)
    # build the soup for the archive page
    soup = BeautifulSoup(html, 'html.parser')
    # every 'a' tag under class_='archives' points to one album
    total_info = soup.find(class_='archives').find_all('a')
    # collect the 'href' of each 'a' tag
    all_list = []
    for girls_info in total_info:
        link_url = girls_info['href']
        all_list.append(link_url)
    print(all_list, proxy_dic)
    return all_list, proxy_dic
def get_girl_all_page(all_list, proxy_dic):
    for url in all_list:
        html, proxy_dic = get_proxy_page(url, proxy_dic)
        soup = BeautifulSoup(html, 'lxml')
        # the 'span' inside the second-to-last 'a' of class_='pagenavi' holds the picture count
        max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
        title = soup.find(class_='main-title').string
        # walk every detail page and collect the 'src' of its 'img' tag
        header = referer_headers()
        pic_url_list = []
        for i in range(int(max_page)):
            page_url = url + "/%s" % (i + 1)
            pic_url, proxy_dic = append_img_url(page_url, header, proxy_dic)
            pic_url_list.append(pic_url)
        download_Pic(title, pic_url_list, proxy_dic)
def append_img_url(page_url, header, proxy_dic=None):
    try:
        res = requests.get(page_url, headers=header, proxies=proxy_dic)
        if res.status_code == 200:
            pic_url = get_img_url(res)
            print(pic_url, proxy_dic)
            return pic_url, proxy_dic
        else:
            # the current proxy was rejected: swap in a fresh one and retry
            proxy_dic = crate_proxy_dic()
            res = requests.get(page_url, headers=header, proxies=proxy_dic)
            if res.status_code == 200:
                pic_url = get_img_url(res)
                return pic_url, proxy_dic
            else:
                return append_img_url(page_url, header, proxy_dic=None)
    except ConnectionError:
        return append_img_url(page_url, header, proxy_dic=None)
def get_img_url(res):
    html = res.text
    soup = BeautifulSoup(html, 'lxml')
    pic_url = soup.find('img').get('src')
    return pic_url
def download_Pic(title, pic_url_list, proxy_dic=None):
    print('download_pic')
    # create a folder named after the album title
    os.makedirs(title, exist_ok=True)
    headers = referer_headers()
    # sequence number used as the file name
    j = 1
    # download every picture
    for item in pic_url_list:
        # file path and name
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        with open(filename, 'wb') as f:
            try:
                img_res = requests.get(item, headers=headers, proxies=proxy_dic)
                if img_res.status_code == 200:
                    img = img_res.content
                    f.write(img)
                else:
                    # the proxy was rejected: fetch a new one and retry
                    proxy_dic = crate_proxy_dic()
                    img_res = requests.get(item, headers=headers, proxies=proxy_dic)
                    img = img_res.content
                    f.write(img)
            except ConnectionError:
                proxy_dic = crate_proxy_dic()
                img_res = requests.get(item, headers=headers, proxies=proxy_dic)
                img = img_res.content
                f.write(img)
        j += 1
if __name__ == '__main__':
    url = 'https://www.mzitu.com/all'
    all_list, proxy_dic = get_all_girls(url)
    get_girl_all_page(all_list, proxy_dic)
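To try the crawler without walking the whole archive, the album list can be sliced before it is handed to get_girl_all_page (a sketch; the slice size of 3 is arbitrary):

if __name__ == '__main__':
    url = 'https://www.mzitu.com/all'
    all_list, proxy_dic = get_all_girls(url)
    # crawl only the first 3 albums while testing
    get_girl_all_page(all_list[:3], proxy_dic)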