  • Python3 Web Crawler

    Recently I studied the Meizitu crawler on Shiyanlou and found that it did not run very smoothly; some images failed to download. The first file is the main script:

    # coding: utf-8

    import re
    import threading
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup
    import meizi_series_nextpage

    def loadurl(url):
        # Fetch a page; return '' on any error so the caller can retry.
        try:
            conn = urlopen(url, timeout=5)
            html = conn.read()
            return html
        except HTTPError as e:
            print(e)
            return ''
        except Exception as e:
            print("unknown exception in conn.read(): %s" % e)
            return ''

    def meizi(url, path):
        # Fetch the front page and collect the tag links.
        print('start open meiziwang')
        html = ''
        while True:
            html = loadurl(url)
            if html == '':
                print('load', url, 'error')
                continue
            else:
                break
        mnvtp = BeautifulSoup(html, "html.parser")
        taglists = mnvtp.findAll("div", {"class": "tags"})
        taglists = list(set(re.findall('<a.*?href="(.*?)".*?>', '%s' % taglists)))
        print(taglists)
        print(len(taglists))
        print('open meiziwang over')
        # Crawl the front page itself first, then one worker thread per tag URL.
        meizi_series_nextpage.nextpage(url, path)
        threads = []
        for tagurl in taglists:
            t = threading.Thread(target=meizi_series_nextpage.nextpage, args=(tagurl, path))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    if __name__ == '__main__':
        meizi('http://www.meizitu.com', 'D:\\MeiZi\\')
        print('Spider Stop')
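    The start/join pattern at the end of meizi() is the standard thread fan-out idiom: build all the workers first, start them, then join them so the main script waits for every crawl to finish. A minimal standalone sketch (the worker and URLs here are illustrative, not part of the original):

    import threading

    def worker(url, path):
        # stand-in for meizi_series_nextpage.nextpage(url, path)
        print("crawling", url, "->", path)

    urls = ["http://www.meizitu.com/a/1.html", "http://www.meizitu.com/a/2.html"]
    threads = [threading.Thread(target=worker, args=(u, "D:\\MeiZi\\")) for u in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # block until every worker is done
    print("all workers finished")

    The second file, meizi_series_nextpage.py (the name comes from the import in the main script), resolves every pagination URL of a tag page: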
    # coding: utf-8
    import re
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup
    import meizi_series_getpage


    # As in the main script, load a URL with a timeout; return '' on failure
    # so the caller can retry.
    def loadurl(url):
        try:
            conn = urlopen(url, timeout=5)
            html = conn.read()
            return html
        except HTTPError as e:
            print(e)
            return ''
        except Exception as e:
            print(e)
            return ''

    def nextpage(url, path):
        # Split the URL into segments; the last one names this tag page.
        nextweibu = re.split("/", url)
        # Everything up to and including "/a/" is the URL prefix.
        nexthead = re.split("/a/", url)
        nexthead = nexthead[0] + "/a/"
        # Build the local directory for this tag page.
        path = path + "\\" + nextweibu[-1].split(".", 1)[0]
        # Fetch the page, retrying until it loads.
        while True:
            html = loadurl(url)
            if html == '':
                print('load', url, 'error')
                continue
            else:
                break
        # Collect the pagination links.
        mnvtp = BeautifulSoup(html, "html.parser")
        taglists = mnvtp.findAll("div", {"id": "wp_page_numbers"})
        taglists = re.findall('<a.*?href="(.*?)".*?>', '%s' % taglists)
        taglists = sorted(list(set(taglists)))
        if not taglists:
            taglists = [nextweibu[-1]]

        # Build the complete URL for every pagination link of this tag page.
        print("Collecting all sub-page URLs of: %s" % url)
        completeurl = []
        for i in taglists:
            completeurl.append(nexthead + i)
        completeurl = sorted(completeurl)
        for i in completeurl:
            print("Fetching all photo-set URLs under this sub-page")
            meizi_series_getpage.tag_series(i, path)
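    The two re.split calls at the top of nextpage() do the URL bookkeeping. A quick sketch of what they produce for a URL of the shape the site uses (the example URL itself is made up):

    import re

    url = "http://www.meizitu.com/a/list_1_1.html"  # hypothetical example
    nextweibu = re.split("/", url)
    nexthead = re.split("/a/", url)[0] + "/a/"
    print(nextweibu[-1])                   # 'list_1_1.html' -> names the tag page
    print(nextweibu[-1].split(".", 1)[0])  # 'list_1_1'      -> local folder name
    print(nexthead)                        # 'http://www.meizitu.com/a/' -> URL prefix

    The last block holds the image-download helpers (picurl, save_pic, download):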
    # coding: utf-8
    import os
    import re
    from urllib.request import urlopen
    from urllib.request import Request
    from urllib.error import HTTPError

    # Main download logic: extract the image links from a photo-set page,
    # then hand them to pic_list().
    def picurl(url, path):
        if os.path.exists(path):
            print(path, 'directory already exists')
        else:
            print("Creating directory: %s" % path)
            os.makedirs(path)
        # Fetch the photo-set page, retrying until it loads.
        html = ''
        while True:
            html = loadurl(url)
            if html == '':
                continue
            else:
                break
        rePicContent1 = '<div.*?id="picture.*?>.*?<p>(.*?)</p>'
        rePicContent2 = '<div.*?class="postContent.*?>.*?<p>(.*?)</p>'
        rePicList = '<img.*?src="(.*?)".*?>'
        # A note on re.S: it is optional, but with it '.' also matches newline,
        # so one pattern can span two lines. Line breaks are very common in
        # HTML source, so re.S is used throughout (a demonstration follows
        # the code).
        picContent = re.findall(rePicContent1, "%s" % html, re.S)
        if len(picContent) <= 0:
            picContent = re.findall(rePicContent2, "%s" % html, re.S)
        if len(picContent) <= 0:
            print('Could not match any image URLs')
            return False
        else:
            picList = re.findall(rePicList, "%s" % picContent[0], re.S)
            pic_list(picList, path)

    # A thin wrapper: this is just the for loop pulled out of picurl().
    def pic_list(picList, path):
        for picurl in picList:
            print("Got image URL: %s" % picurl)
            save_pic(picurl, path)

    # Saving logic for a single image.
    def save_pic(url, path):
        searchname = r'.*/(.*?\.jpg)'
        name = re.findall(searchname, url)
        filename = path + '\\' + name[0]

        print(filename + ': start')  # console progress message

        # Number of retries allowed when a download fails.
        tryTimes = 3

        # While retries remain, keep trying; download() creates the file itself.
        while tryTimes != 0:
            tryTimes -= 1
            if os.path.exists(filename):
                print(filename, 'already exists, skipping')
                return True
            if download(url, filename):
                break

        if tryTimes != 0:
            print(filename + ": over")
        else:
            print(url + " :Failed to download")
        # console progress message

    # The downloader proper. timeout=5 caps the wait: five seconds is plenty
    # for an image under 500 KB, so on timeout we report failure and let
    # save_pic() retry.
    def download(url, filename):
        try:
            # The Host and User-Agent headers work around the site's
            # hotlink protection.
            headers = {
                'Host': 'mm.howkuai.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
            }
            req = Request(url, headers=headers)
            data = urlopen(req, timeout=5).read()
            f = open(filename, 'wb')
            f.write(data)
            f.close()
            return True
        except HTTPError as e:
            print(e)
            return False
        except Exception as e:
            print(e)
            return False

    def loadurl(url):
        try:
            conn = urlopen(url, timeout=5)
            html = conn.read()
            return html
        except HTTPError:
            return ''
        except Exception:
            print("unknown exception in conn.read()")
            return ''
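    The long comment in picurl() concerns re.S (DOTALL). Without it, '.' stops at a newline, so a pattern cannot bridge two lines of HTML; with it, the same pattern matches. A minimal demonstration (the HTML fragment is made up but mirrors the patterns above):

    import re

    html = '<div id="picture">\n<p><img src="01.jpg"></p>\n</div>'
    pattern = '<div.*?id="picture.*?>.*?<p>(.*?)</p>'
    print(re.findall(pattern, html))        # [] -- '.' cannot cross the newline
    print(re.findall(pattern, html, re.S))  # ['<img src="01.jpg">']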

    When I find time I will come back and explain the code in more detail. The first block is the main script; create the other files according to the module names it imports.
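    For reference, a plausible layout (the main script's name is my choice; the other names come from the import statements). Note that the second block calls meizi_series_getpage.tag_series(), which the pasted third block does not define, so mapping the third block to meizi_series_getpage.py is an assumption:

    meizi_main.py               # first block: meizi(), the entry point
    meizi_series_nextpage.py    # second block: nextpage(), pagination walker
    meizi_series_getpage.py     # third block: picurl() / save_pic() / download()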

  • Original post: https://www.cnblogs.com/heweiblog/p/6536818.html