zoukankan      html  css  js  c++  java
  • python 爬虫(四)

    爬遍整个网络

    1 当我们访问整个网络的时候,我们不可避免的会访问不同的网站,但是不同的网站会有完全不同的结构和内容...

    现在一步一步的构建访问整个网络的脚本

    I 从一个网站开始,每一次都爬向不同的网站。如果在一个页面找不到指向其他网站的链接,获取本网站其他界面信息,直到找到其他网站的链接。

    # -*- coding:utf-8 -*-  
    
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup
    from random import choice
    import re
    
    basename = "http://en.wikipedia.org"  # NOTE(review): defined but never used in this script
    visitedpages = set()  # external links already printed/visited by followExternalLink
    
    def getInternalLinks(bsObj,includeUrl):
        # Collect hrefs of anchors that are internal to the site: either
        # site-relative ("/...") or containing the site's own domain.
        internal_re = re.compile("^(/|.*" + includeUrl + ")")
        links = []
        for anchor in bsObj.find_all("a", href=internal_re):
            if 'href' in anchor.attrs:
                links.append(anchor.attrs['href'])
        return links
    
    def getExternalLinks(bsObj,excludeUrl):
        # External links start with "http" or "www" and must not mention the
        # current site's domain anywhere (per-character negative lookahead).
        external_re = re.compile("^(http|www)((?!" + excludeUrl + ").)*$")
        found = []
        for anchor in bsObj.find_all("a", href=external_re):
            if 'href' in anchor.attrs:
                found.append(anchor.attrs['href'])
        return found
    
    def splitAddress(address):
        """Strip the URL scheme and split the remainder on "/".

        Element 0 of the returned list is the host name, e.g.
        splitAddress("http://www.oreilly.com/")[0] == "www.oreilly.com".
        Fix: the original only removed "http://", so for an https URL the
        "host" came back as "https:"; both schemes are now handled.
        """
        stripped = address.replace("https://", "").replace("http://", "")
        return stripped.split("/")
    
    def getRandomExternalLink(startingPage):
        """Fetch startingPage and return one random external link from it.

        Falls back to a random internal link when the page has no external
        links.  Raises ValueError when the page contains no usable links at
        all (the original crashed with a bare IndexError inside
        random.choice).
        """
        # Close the HTTP response as soon as the soup is built.
        with urlopen(startingPage) as html:
            bsObj = BeautifulSoup(html, "html.parser")
        domain = splitAddress(startingPage)[0]
        externalLinks = getExternalLinks(bsObj, domain)
        if externalLinks:
            return choice(externalLinks)
        internalLinks = getInternalLinks(bsObj, domain)
        if internalLinks:
            # NOTE(review): internal links may be site-relative ("/x"); the
            # caller then feeds them back into urlopen, which fails on
            # relative URLs -- kept as-is to preserve the original flow.
            return choice(internalLinks)
        raise ValueError("no links found on " + startingPage)
    
    def followExternalLink(startingPage):
        """Randomly walk from startingPage to external sites, printing each
        new one and stopping when a link repeats.

        Bug fix: the original ignored its startingPage argument and always
        fetched "http://www.oreilly.com/", so every recursive step restarted
        from the same page instead of advancing the walk.
        """
        externalLink = getRandomExternalLink(startingPage)
        if externalLink in visitedpages:
            print("visited")
        else:
            print("the random external link is   " + externalLink)
            visitedpages.add(externalLink)
            # Recursion depth grows by one per new site; Python's default
            # recursion limit (~1000) bounds the walk length.
            followExternalLink(externalLink)
    
    
    if __name__ == "__main__":
        # Start the random external-link walk at O'Reilly's home page.
        followExternalLink("http://www.oreilly.com/")
                 
    View Code

    II 从一个网站开始,查找这个网站所有界面信息,获取整个网站指向其他网站的链接

    # -*- coding:utf-8 -*-  
    
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup
    from random import choice
    import re
    
    def getInternalLinks(bsObj,includeUrl):
        # An internal link is relative ("/...") or contains this domain.
        internal_re = re.compile("^(/|.*" + includeUrl + ")")
        anchors = bsObj.find_all("a", href=internal_re)
        return [tag.attrs['href'] for tag in anchors if 'href' in tag.attrs]
    
    def getExternalLinks(bsObj,excludeUrl):
        # Absolute links (http/www prefix) that never mention excludeUrl.
        external_re = re.compile("^(http|www)((?!" + excludeUrl + ").)*$")
        return [tag.attrs['href']
                for tag in bsObj.find_all("a", href=external_re)
                if 'href' in tag.attrs]
    
    def splitAddress(address):
        """Remove the URL scheme and split what is left on "/".

        The returned list's first element is the host, e.g.
        splitAddress("http://www.oreilly.com/")[0] == "www.oreilly.com".
        Fix: the original stripped only "http://", so https URLs produced
        "https:" as the host; both schemes are handled now.
        """
        bare = address.replace("https://", "").replace("http://", "")
        return bare.split("/")
    
    allINlinks = set()  # every internal link discovered so far (doubles as the visited set)
    allEXlinks = set()  # every external link discovered so far
    def getAllexternalLinks(startPage):
        """Crawl the whole site rooted at startPage, printing every new
        external link and recursing through every new internal page.

        Bug fix: internal links are usually site-relative ("/about"), and
        feeding them straight back to urlopen raised
        "ValueError: unknown url type".  Relative links are now resolved
        against the start page's host before the recursive fetch.
        """
        try:
            with urlopen(startPage) as html:
                bsObj = BeautifulSoup(html, "html.parser")
        except HTTPError as e:
            # Best-effort crawl: report the failed page and keep going.
            print(e)
            return
        # Hoisted: the original recomputed splitAddress(startPage) twice.
        domain = splitAddress(startPage)[0]
        print("************external*******************************")
        for link in getExternalLinks(bsObj, domain):
            if link not in allEXlinks:
                allEXlinks.add(link)
                print(link)
        print("************internal*******************************")
        for link in getInternalLinks(bsObj, domain):
            if link not in allINlinks:
                allINlinks.add(link)
                print(link)
                # Resolve site-relative links so urlopen can fetch them.
                if link.startswith("/"):
                    link = "http://" + domain + link
                getAllexternalLinks(link)
    
    if __name__ == "__main__":
        # Kick off the site-wide crawl at O'Reilly's home page.
        getAllexternalLinks("http://www.oreilly.com/")
    View Code

       ***************还存在问题的代码***************************

  • 相关阅读:
    需求层次性、需求分类
    CSMA/CA协议详解
    Git笔记:GitFlow工作流模拟、分支管理、使用规范
    Vue.js笔记(四) 路由router与重定向
    DolphinScheduler 源码分析之 DAG类
    linux 一分钟安装maven linux
    linux 一分钟搭建zookeeper linux 单机版(亲测可用)
    canal-adapter1.1.14最新版本安装的过程中出现的NullPointerException异常
    yum.repos.d中的变量($releasever与$basearch)
    索引知识
  • 原文地址:https://www.cnblogs.com/someoneHan/p/6234508.html
Copyright © 2011-2022 走看看