  • 2019-01-31 Python Learning: Crawling Email Addresses with BFS and DFS

    Today I studied the basics of web crawling in Python.

    First, fetching the Baidu homepage in two ways: line by line, and all at once.

    Line-by-line fetch

    import urllib.request

    for line in urllib.request.urlopen("http://www.baidu.com"):
        print(line.decode("utf-8"))
    

    Whole-page fetch

    mystr = urllib.request.urlopen("http://www.baidu.com").read()
    print(mystr.decode("utf-8"))
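
    Note that some servers refuse requests carrying urllib's default User-Agent. A minimal sketch of sending a browser-like header instead (the header value is only illustrative):

    import urllib.request

    # Hypothetical example: supply a browser-like User-Agent so the
    # request is less likely to be refused by the server.
    req = urllib.request.Request(
        "http://www.baidu.com",
        headers={"User-Agent": "Mozilla/5.0"},
    )
    print(urllib.request.urlopen(req).read().decode("utf-8"))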
    

    Then I implemented email crawling with both BFS (using a queue) and DFS (using a stack).

    BFS with a deque queue

    import re
    import urllib
    import urllib.request
    from collections import deque
    
    def getallemail(data):  # extract every email address with a regex
        try:
            mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
            mylist = mailregex.findall(data)
            return mylist
        except:
            return []
    
    
    def getdata(url):   # read the url and return the page source decoded as utf-8
        try:
            data = urllib.request.urlopen(url).read().decode("utf-8")
            return data
        except:
            return ""
    
    
    def geteveryurl(data):  # collect every url found on the page
        alllist = []
        mylist1 = getallhttp(data)
        mylist2 = []
        if len(mylist1)>0:
            mylist2 = getabsurl(mylist1[0],data)    # pass mylist1[0] so getabsurl can derive the host name
        alllist.extend(mylist1)
        alllist.extend(mylist2)
        return alllist
    
    
    def gethostname(httpstr):
        try:
            mailregex = re.compile(r"(http://\S*?)/",re.IGNORECASE) # precompiled regex that extracts the host name
            mylist = mailregex.findall(httpstr)
            if len(mylist)==0:
                return None
            else:
                return mylist[0]
        except:
            return None
    
    
    def getabsurl(url,data):
        try:
            regex = re.compile(r'href="(.*?)"',re.IGNORECASE) # precompiled regex that extracts href values
            httplist = regex.findall(data)
            newhttplist = httplist.copy()  # copy the list so items can be removed safely while iterating
            for link in newhttplist:
                if link.find("http://")!=-1:  # already an absolute http link, handled by getallhttp
                    httplist.remove(link) # drop it from the original list
                elif link.find("javascript")!=-1:  # javascript: pseudo-link, not crawlable
                    httplist.remove(link)
            hostname = gethostname(url)
            if hostname!=None:
                for i in range(len(httplist)):
                    httplist[i] = hostname + httplist[i]  # prefix the host to make relative links absolute
            return httplist
        except:
            return []
    
    
    def getallhttp(data): # find every http link in the text
        try:
            mailregex = re.compile(r'(http://\S*?)["|>)]',re.IGNORECASE)  # a link ends at a quote, > or )
            mylist = mailregex.findall(data)
            return mylist
        except:
            return []
    
    
    def BFS(urlstr):
        urlqueue = deque([]) # create an empty queue
        urlqueue.append(urlstr) # enqueue the starting url
        while len(urlqueue)!=0: # loop until the queue is empty
            url = urlqueue.popleft()  # dequeue the next url
            print(url)  # print the url
            pagedata = getdata(url)  # fetch the page source
            emaillist = getallemail(pagedata)  # extract the emails into a list
            if len(emaillist)!=0:       # if any emails were found
                for email in emaillist:
                    print(email)        # print every email
            newurllist = geteveryurl(pagedata) # collect all urls on this page
            if len(newurllist)!=0:      # if the list is not empty
                for urlstr in newurllist:
                    if urlstr not in urlqueue:
                        urlqueue.append(urlstr)     # enqueue any url not already queued
    
    BFS(input("Enter the start page to crawl: "))
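
    A quick sanity check of the helpers on a hand-written snippet (the HTML string and the address in it are made up for illustration):

    sample = '<a href="http://example.com/about">contact: admin@example.com</a>'
    print(getallemail(sample))   # ['admin@example.com']
    print(getallhttp(sample))    # ['http://example.com/about']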
    
    
    

    DFS with a stack

    import re
    import urllib
    import urllib.request
    
    def getallemail(data):  # extract every email address with a regex
        try:
            mailregex = re.compile(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})", re.IGNORECASE)
            mylist = mailregex.findall(data)
            return mylist
        except:
            return []
    
    
    def getdata(url):   # read the url and return the page source decoded as utf-8
        try:
            data = urllib.request.urlopen(url).read().decode("utf-8")
            return data
        except:
            return ""
    
    
    def geteveryurl(data):  # collect every url found on the page
        alllist = []
        mylist1 = getallhttp(data)
        mylist2 = []
        if len(mylist1)>0:
            mylist2 = getabsurl(mylist1[0],data)    # pass mylist1[0] so getabsurl can derive the host name
        alllist.extend(mylist1)
        alllist.extend(mylist2)
        return alllist
    
    
    def gethostname(httpstr):
        try:
            mailregex = re.compile(r"(http://\S*?)/",re.IGNORECASE) # precompiled regex that extracts the host name
            mylist = mailregex.findall(httpstr)
            if len(mylist)==0:
                return None
            else:
                return mylist[0]
        except:
            return None
    
    
    def getabsurl(url,data):
        try:
            regex = re.compile(r'href="(.*?)"',re.IGNORECASE) # precompiled regex that extracts href values
            httplist = regex.findall(data)
            newhttplist = httplist.copy()  # copy the list so items can be removed safely while iterating
            for link in newhttplist:
                if link.find("http://")!=-1:  # already an absolute http link, handled by getallhttp
                    httplist.remove(link) # drop it from the original list
                elif link.find("javascript")!=-1:  # javascript: pseudo-link, not crawlable
                    httplist.remove(link)
            hostname = gethostname(url)
            if hostname!=None:
                for i in range(len(httplist)):
                    httplist[i] = hostname + httplist[i]  # prefix the host to make relative links absolute
            return httplist
        except:
            return []
    
    
    def getallhttp(data): # find every http link in the text
        try:
            mailregex = re.compile(r'(http://\S*?)["|>)]',re.IGNORECASE)  # a link ends at a quote, > or )
            mylist = mailregex.findall(data)
            return mylist
        except:
            return []
    
    
    def DFS(urlstr):
        visitlist = [] # urls already visited, to keep the depth-first walk from looping forever
        urlstack=[]         # the stack
        urlstack.append(urlstr)
        while len(urlstack)!=0:
            url = urlstack.pop()
            print(url)  # print the url
            if url not in visitlist:
                pagedata = getdata(url)
                emaillist = getallemail(pagedata)
                if len(emaillist)!=0:
                    for email in emaillist:
                        print(email)
                newurllist = geteveryurl(pagedata)
                if len(newurllist)!=0:
                    for urlstr in newurllist:
                        if urlstr not in urlstack:
                            urlstack.append(urlstr)
                visitlist.append(url)
    
    
    DFS(input("Enter the start page to crawl: "))

    # breadth-first traversal is the better fit for data extraction
    # depth-first traversal easily falls into an infinite loop
    
    • Note that DFS is prone to infinite loops, which is why visitlist is used to skip pages that were already visited. Data extraction is better done with breadth-first traversal; depth-first traversal drills straight to the bottom, which makes it better suited to probing how many levels deep a site goes. The BFS version above keeps no record of visited pages, so the same guard could be added there too, as sketched below.
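    A minimal sketch of that guard for the BFS version (the visited set and the name BFS_visited are my additions, reusing the imports and helpers from the BFS script above):

    def BFS_visited(urlstr):  # hypothetical BFS variant that remembers visited pages
        urlqueue = deque([urlstr])
        visited = set()  # urls that have already been fetched
        while len(urlqueue) != 0:
            url = urlqueue.popleft()
            if url in visited:  # skip pages crawled earlier
                continue
            visited.add(url)
            print(url)
            pagedata = getdata(url)
            for email in getallemail(pagedata):
                print(email)
            for newurl in geteveryurl(pagedata):
                if newurl not in visited:
                    urlqueue.append(newurl)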

    The code is from 尹成's Python course.
