zoukankan      html  css  js  c++  java
  • python3 爬煎蛋ooxx妹子图

    import re
    import urllib.request
    import random
    import os
    import http.server
    import http.client
    from urllib.error import URLError, HTTPError
    import urllib.parse
    proxy = []  # module-level pool of proxy addresses ("ip:port" strings), filled by get_proxy()
    
    
    def change_proxy():
        """Pick a random entry from the module-level `proxy` pool and install it
        as the process-wide HTTP proxy (with a browser User-Agent header)."""
        chosen = random.choice(proxy)
        handler = urllib.request.ProxyHandler({"http": chosen})
        opener = urllib.request.build_opener(handler)
        opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')]
        # install_opener makes this opener the default for urllib.request.urlopen
        urllib.request.install_opener(opener)
        print("代理IP: %s" % chosen)
    
    def url_open(url):     #访问jandan.net网站,如果报错进行重新获取代理IP,最多5次
        """Fetch *url* and return the raw response bytes.

        On any connection/protocol error, switch to a fresh random proxy and
        retry, giving up (returning None) after 5 failed attempts.

        Bug fixes vs. original: the attempt counter was compared against the
        string "5" (never true for an int) and, even when hit, did not stop the
        loop; the URLError clause was unreachable because URLError subclasses
        OSError; string literals contained raw newlines (syntax error).
        """
        count = 0
        while True:
            if count == 5:
                print("已经失败了5次,程序退出,重新执行")
                return None
            count += 1
            try:
                response = urllib.request.urlopen(url)
                return response.read()
            # http.client exceptions are not OSError subclasses, so both
            # families must be listed; URLError is already an OSError.
            except (OSError, http.client.BadStatusLine, http.client.IncompleteRead) as e:
                print("链接出问题了,智能切换新的代理IP\n出错的问题是:" + str(e))
                change_proxy()
    
    def get_pagenum(url):    #获取jandan网站的页面号(2305)
        """Return the current comment-page number (e.g. "2305") as a string,
        scraped from the page at *url*.

        The page marker looks like:
            <span class="current-comment-page">[2305]</span>
        The original regexes had their backslashes stripped ("<spansclass",
        "[d{4}]", "d{4}") and could never match; restored here with proper
        escapes, and \d+ instead of \d{4} so 3- or 5-digit pages also work.
        """
        html = url_open(url).decode("utf-8")
        num_re = re.compile(r'<span\s+class="current-comment-page">\[\d+\]</span>')
        span = num_re.search(html)
        digits = re.search(r'\d+', span.group())
        return digits.group()
    
    def get_imgurl(url):    #获取图片的地址
        """Return a list of protocol-relative .jpg URLs ("//ww...jpg") found
        in the page at *url*.

        Fixes vs. original: the '.' before 'jpg' was unescaped (matched any
        character) and the greedy '.*' could swallow several tags on one line;
        a capture group replaces the redundant second regex pass.
        """
        html = url_open(url).decode("utf-8")
        # img tags look like: <img src="//wwN.sinaimg.cn/.../name.jpg" ...>
        jpg_re = re.compile(r'<img src="(//ww[^"]+?\.jpg)')
        return jpg_re.findall(html)
    
    def save_img(img):   #保存图片
        """Download every protocol-relative image URL in *img* into the
        current working directory, named after the last path component.

        Improvements vs. original: manual counter replaced by enumerate(); the
        network fetch happens before the file is opened, so a failed fetch no
        longer leaves behind an empty file.
        """
        for i, each in enumerate(img, start=1):
            filename = each.split('/')[-1]
            imgpage = url_open("http:%s" % each)
            with open(filename, 'wb') as f:
                f.write(imgpage)
            print("下载本页的第%s张图片,名称为%s" % (i, filename))
    
    
    def get_proxy():     #从IP代理网站上抓取代理IP,存入Proxy列表中
        """Scrape HTTP proxy addresses from xicidaili.com and append them as
        "ip:port" strings to the module-level `proxy` list; returns that list.

        Fixes vs. original: the verbose regex had every backslash stripped
        ("<trsclass", bare "s+") and could never match a row; the compiled
        pattern was named IP and then shadowed by a list inside the loop; the
        row was split() three separate times for ip, port and protocol.
        """
        head = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
        req = urllib.request.Request(url="http://www.xicidaili.com",headers=head)
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        # One table row per proxy; the cells are (in order): country img, ip,
        # port, location, anonymity, protocol, speed, connect time, alive time.
        row_re = re.compile(r'''<tr\sclass=.+>\s+
                                <td\s.+</td>\s+
                                <td>.+</td>\s+
                                <td>.+</td>\s+
                                <td>.+</td>\s+
                                <td\s.+?</td>\s+
                                <td>.+</td>\s+
                                <td>.+</td>\s+
                                <td>.+</td>\s+
                                </tr>
                                ''', re.VERBOSE)
        for row in row_re.findall(html):
            cells = row.split()
            # Each value sits between ">" and "<" of its <td>...</td> token.
            # NOTE(review): indices 7/8/-4 mirror the original tokenization of
            # a matched row — confirm against the live page layout.
            protocol = cells[-4].split(">")[1].split("<")[0]
            ip = cells[7].split(">")[1].split("<")[0]
            port = cells[8].split(">")[1].split("<")[0]
            if protocol == "HTTP":
                proxy.append(ip + ":" + port)
        return proxy
    
    def download(dir,url):
        """Change into directory *dir* (creating it if needed) and download the
        images from the 10 pages preceding the current page of *url*.

        Fixes vs. original: the if/else both called os.chdir (collapsed via
        os.makedirs(exist_ok=True)); the no-op `url = url` and the unused
        `saveimg` binding are removed.
        """
        os.makedirs(dir, exist_ok=True)
        os.chdir(dir)
        page_num = int(get_pagenum(url))
        for _ in range(10):
            page_num -= 1
            pageurl = url + "page-" + str(page_num) + "#comments"
            imgurl = get_imgurl(pageurl)
            print("下载第%s页图片" % page_num)
            save_img(imgurl)
    
    if __name__ == "__main__":
        # Build the proxy pool, install one at random, then fetch the images.
        get_proxy()
        change_proxy()
        dir = "ooxx"  # output directory (created next to the script)
        url = "http://jandan.net/ooxx/"  # jandan.net ooxx gallery root
        download(dir,url)
  • 相关阅读:
    【转】sql server编写通用脚本自动检查两个不同服务器的新旧数据库的表结构差异
    Pytest 2
    【转】python通过SMTP协议发送邮件失败,报错505或535
    【转】环境搭建之allure的安装配置,及简单使用
    Pytest 1
    替换姓名为隐式
    docker 用户组权限
    安装go环境
    Win10配置WSL2安装Ubuntu,并支持Nvidia CUDA 环境
    miniconda源配置
  • 原文地址:https://www.cnblogs.com/jonnter/p/7725219.html
Copyright © 2011-2022 走看看