zoukankan      html  css  js  c++  java
  • 通过代理爬mzitu

    #导入库

    import os
    import requests
    from bs4 import BeautifulSoup
    import time
    from config import *
    import random
    import re
    from requests import ConnectionError
    

    #生成mzitu请求headers

    def res_headers():
        """Build request headers for mzitu with a random User-Agent and Referer.

        The User-Agent is drawn from ``USER_AGENT_LIST`` and the Referer from
        ``REFERER_LIST``, each with ``random.choice``.

        Returns:
            dict: ``{'User-Agent': <str>, 'Referer': <str>}``.
        """
        user_agent = random.choice(USER_AGENT_LIST)
        referer = random.choice(REFERER_LIST)
        return {'User-Agent': user_agent, 'Referer': referer}
    

      #生成单个user-agent

    def get_header():
        """Return a headers dict holding only a randomly chosen User-Agent.

        Returns:
            dict: ``{'User-Agent': <str>}`` with the value drawn from
            ``USER_AGENT_LIST`` via ``random.choice``.
        """
        return {'User-Agent': random.choice(USER_AGENT_LIST)}
    

      #获取list后checkip返回可用ip

    def get_proxy_list(max_attempts=5):
        """Scrape a random xicidaili listing page and return a usable proxy.

        A random page (1-300) of the proxy listing is downloaded, every
        ``ip``/``port`` pair found in it is collected as ``"ip:port"``, and the
        candidates are handed to ``check_ip``. If the page cannot be fetched,
        contains no candidates, or none of them passes the check, another
        random page is tried.

        Args:
            max_attempts: Maximum number of listing pages to try before giving
                up. Bounds the retrying so a blocked or unreachable listing
                site cannot make this function retry forever.

        Returns:
            The proxies mapping returned by ``check_ip`` (suitable for the
            ``proxies=`` argument of ``requests``), or ``None`` when no working
            proxy was found within ``max_attempts`` pages.
        """
        base_url = 'https://www.xicidaili.com/wt/'
        # Raw string so the regex escapes survive: an IPv4 address in one table
        # cell followed by the port number in the next cell.
        pattern = r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)'

        for _ in range(max_attempts):
            actual_url = base_url + str(random.randint(1, 300))
            try:
                res = requests.get(url=actual_url, headers=get_header(), timeout=8)
            except requests.RequestException:
                # Connection errors and timeouts alike: try another page.
                continue
            if res.status_code != 200:
                continue

            ip_list = [ip + ':' + port for ip, port in re.findall(pattern, res.text)]
            if not ip_list:
                continue

            proxy = check_ip(ip_list)
            if proxy:
                return proxy
        return None
    

      #check 有效ip

    def check_ip(ip_list, max_tries=5):
        """Probe candidate proxies and return the first one that reaches mzitu.

        Up to ``max_tries`` distinct candidates are picked at random from
        ``ip_list``; each is used to request the mzitu front page with an
        8-second timeout.

        Args:
            ip_list: Candidate proxies as ``"ip:port"`` strings.
            max_tries: Maximum number of candidates to probe.

        Returns:
            dict | None: ``{'http': 'http://ip:port', 'https': 'http://ip:port'}``
            for the first candidate that answers with HTTP 200, or ``None`` if
            ``ip_list`` is empty or no probed candidate works.
        """
        if not ip_list:
            # random.choice/sample would raise on an empty candidate list.
            return None

        target_url = 'https://www.mzitu.com/'
        header = get_header()
        candidates = random.sample(list(ip_list), min(max_tries, len(ip_list)))

        for ip_port in candidates:
            proxy_ip = 'http://' + ip_port
            # requests picks the proxy by URL scheme; the probe URL is https, so
            # an 'http'-only mapping would bypass the proxy entirely.
            proxy_ip_dic = {
                'http': proxy_ip,
                'https': proxy_ip,
            }
            try:
                res = requests.get(target_url, headers=header,
                                   proxies=proxy_ip_dic, timeout=8)
            except requests.RequestException:
                # Dead proxy, refused connection or timeout: try the next one.
                continue
            if res.status_code == 200:
                return proxy_ip_dic
        return None
    

      #网站请求

    def get_page(url, timeout=10):
        """Fetch ``url`` with randomized mzitu headers and return the body text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before ``requests`` raises a
                timeout error, so a stalled connection cannot hang the crawl.

        Returns:
            str: The decoded response body (``Response.text``).
        """
        headers = res_headers()
        # The session serves exactly one request. Leaving the ``with`` block
        # closes it and releases its pooled connection, which is what the
        # former ``s.keep_alive = False`` assignment was trying to achieve
        # (requests does not act on that attribute).
        with requests.session() as s:
            res = s.get(url, headers=headers, timeout=timeout)
            return res.text
    

      #获取页面all girls的详情页url

    def get_all_girls(url):
        """Collect the ``href`` of every link inside the page's 'archives' element.

        Args:
            url: Address of the index page to download via ``get_page``.

        Returns:
            list: The ``href`` value of each ``<a>`` tag found under the first
            element with class ``archives``, in document order.
        """
        soup = BeautifulSoup(get_page(url), 'html.parser')
        archive = soup.find(class_='archives')
        return [anchor['href'] for anchor in archive.find_all('a')]
    

      #获取girl的所有图片url

    def get_girl_all_page(url):
        """Gather the image URL of every sub-page of one gallery and download them.

        The gallery's first page supplies the page count and the title. Each
        sub-page ``<url>/<n>`` is then requested and the ``src`` of its first
        ``<img>`` tag is recorded. The collected URLs are passed to
        ``download_Pic`` together with the title.

        Args:
            url: Address of the gallery's detail page.
        """
        print('获取girl的所有图片url')
        first_soup = BeautifulSoup(get_page(url), 'html.parser')

        # Page count: text of the <span> inside the second-to-last <a> of the
        # 'pagenavi' element.
        nav_links = first_soup.find(class_='pagenavi',).find_all('a')
        max_page = nav_links[-2].find('span').string
        title = first_soup.find(class_='main-title').string

        # One set of headers and one proxy are reused for all sub-pages.
        headers = res_headers()
        proxy = get_proxy_list()

        pic_url_list = []
        for page_no in range(1, int(max_page) + 1):
            page_url = '{}/{}'.format(url, page_no)
            page_html = requests.get(page_url, headers=headers, proxies=proxy).text
            page_soup = BeautifulSoup(page_html, 'html.parser')
            pic_url_list.append(page_soup.find('img').get('src'))
            # Short pause between sub-page requests.
            time.sleep(0.1)

        download_Pic(title, pic_url_list)
    

      #下载图片,以标题为文件夹名

    def download_Pic(title, pic_url_list):
        """Download every image in ``pic_url_list`` into a folder named ``title``.

        Images are saved as ``<title>/1.jpg``, ``<title>/2.jpg``, ... in list
        order. After the whole gallery is saved the function sleeps for 10
        seconds before returning.

        Args:
            title: Gallery title; used as the destination directory name.
            pic_url_list: Image URLs to download.
        """
        # exist_ok lets an interrupted crawl be re-run without failing on a
        # directory that was already created by the previous run.
        os.makedirs(title, exist_ok=True)
        headers = res_headers()
        proxy = get_proxy_list()

        for index, pic_url in enumerate(pic_url_list, start=1):
            filename = '%s/%s.jpg' % (title, index)
            print('downloading....%s : NO.%s' % (title, index))
            # Fetch first and open the file afterwards, so a failed request does
            # not leave an empty .jpg behind.
            img = requests.get(pic_url, headers=headers, proxies=proxy).content
            with open(filename, 'wb') as f:
                f.write(img)

        # Pause between galleries.
        time.sleep(10)
    

      #主程序

    if __name__ == '__main__':
        # Entry point: read the gallery index page, then crawl and download
        # each gallery it links to, one after another.
        for detail_url in get_all_girls("https://www.mzitu.com/all"):
            get_girl_all_page(detail_url)
    

      #config.py

    # Pool of browser User-Agent strings; res_headers() and get_header() pick
    # one at random for each call.
    USER_AGENT_LIST = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]
    
    # Referer values for mzitu requests, built from gallery post ids;
    # res_headers() picks one at random for each call.
    REFERER_LIST = [
        'https://www.mzitu.com/%d' % post_id
        for post_id in (
            215756, 201236, 214521, 200253, 214751,
            199934, 214404, 199190, 214261, 199970,
        )
    ]
    

          爬了1个小时后,自己的ip被代理网站屏蔽了,哈哈哈哈

    Traceback (most recent call last):
      File "D:\Python38\lib\site-packages\urllib3\connection.py", line 156, in _new_conn
        conn = connection.create_connection(
      File "D:\Python38\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
        raise err
      File "D:\Python38\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
        sock.connect(sa)
    TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "D:\Python38\lib\site-packages\urllib3\connectionpool.py", line 665, in urlopen
        httplib_response = self._make_request(
      File "D:\Python38\lib\site-packages\urllib3\connectionpool.py", line 376, in _make_request
        self._validate_conn(conn)
      File "D:\Python38\lib\site-packages\urllib3\connectionpool.py", line 994, in _validate_conn
        conn.connect()
      File "D:\Python38\lib\site-packages\urllib3\connection.py", line 334, in connect
        conn = self._new_conn()
      File "D:\Python38\lib\site-packages\urllib3\connection.py", line 168, in _new_conn
        raise NewConnectionError(
    urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x000001E898850C40>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "D:\Python38\lib\site-packages\requests\adapters.py", line 439, in send
        resp = conn.urlopen(
      File "D:\Python38\lib\site-packages\urllib3\connectionpool.py", line 719, in urlopen
        retries = retries.increment(
      File "D:\Python38\lib\site-packages\urllib3\util\retry.py", line 436, in increment
        raise MaxRetryError(_pool, url, error or ResponseError(cause))
    urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.mzitu.com', port=443): Max retries exceeded with url: /194229/30 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x000001E898850C40>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。'))
    
    During handling of the above exception, another exception occurred:
    
    Traceback (most recent call last):
      File "D:/project/mzitu/spider.py", line 132, in <module>
        get_girl_all_page(i,proxy)
      File "D:/project/mzitu/spider.py", line 98, in get_girl_all_page
        html = requests.get(page_url,headers=res_headers(),proxies=proxy).text
      File "D:\Python38\lib\site-packages\requests\api.py", line 75, in get
        return request('get', url, params=params, **kwargs)
      File "D:\Python38\lib\site-packages\requests\api.py", line 60, in request
        return session.request(method=method, url=url, **kwargs)
      File "D:\Python38\lib\site-packages\requests\sessions.py", line 533, in request
        resp = self.send(prep, **send_kwargs)
      File "D:\Python38\lib\site-packages\requests\sessions.py", line 646, in send
        r = adapter.send(request, **kwargs)
      File "D:\Python38\lib\site-packages\requests\adapters.py", line 516, in send
        raise ConnectionError(e, request=request)
    requests.exceptions.ConnectionError: HTTPSConnectionPool(host='www.mzitu.com', port=443): Max retries exceeded with url: /194229/30 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x000001E898850C40>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。'))
    

      

  • 相关阅读:
    必须了解的经典排序算法整理
    浅谈Code Review
    NOIP2018提高组省一冲奖班模测训练(六)
    NOIP2018提高组省一冲奖班模测训练(五)
    NOIP2018提高组金牌训练营——动态规划专题
    poj 3074
    搜索中的剪枝
    bitset骚操作
    NOIP 2017 宝藏
    prim求最小生成树
  • 原文地址:https://www.cnblogs.com/lijifei/p/12048437.html
Copyright © 2011-2022 走看看