  • Notes on OReilly.Web.Scraping.with.Python.2015.6 --- Crawl

    1. The function calls itself, forming a loop, one link leading into the next:

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re

    pages = set()
    def getLinks(pageUrl):
        global pages
        html = urlopen("http://en.wikipedia.org"+pageUrl)
        bsObj = BeautifulSoup(html, "lxml")
        try:
            print(bsObj.h1.get_text())
            # Find the element with id="mw-content-text", then take the first <p> tag inside it ([0] selects the 0th match)
            print(bsObj.find(id="mw-content-text").findAll("p")[0])
            # Find id="ca-edit", then the <span> inside it, then the <a> inside that, and read its href value
            print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
        except AttributeError:
            print("This page is missing something! No worries though!")

        for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
            if 'href' in link.attrs:
                if link.attrs['href'] not in pages:
                    # We have encountered a new page
                    newPage = link.attrs['href']
                    print(newPage)
                    pages.add(newPage)
                    getLinks(newPage)

    getLinks("")
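
    One caveat: recursing one level deeper per page will hit Python's default recursion limit (roughly 1000 frames) on a site as large as Wikipedia. A minimal sketch of an iterative variant, replacing the recursion with an explicit stack (my own adjustment, not the book's code):

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re

    pages = set()
    stack = [""]                       # start at the Wikipedia front page
    while stack:
        pageUrl = stack.pop()
        html = urlopen("http://en.wikipedia.org" + pageUrl)
        bsObj = BeautifulSoup(html, "lxml")
        # Same link filter as above; unseen pages go on the stack instead
        # of triggering a recursive call
        for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
            newPage = link.attrs['href']
            if newPage not in pages:
                print(newPage)
                pages.add(newPage)
                stack.append(newPage)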
    

    2. Processing the address: splitting the URL string on "/"

    def splitAddress(address):
        # Strip the "http://" scheme (note: "https://" is not handled) and split on "/"
        addressParts = address.replace("http://", "").split("/")
        return addressParts

    addr = splitAddress("https://hao.360.cn/?a1004")
    print(addr)
    

    The output is:

    runfile('C:/Users/user/Desktop/chensimin.py', wdir='C:/Users/user/Desktop')
    ['https:', '', 'hao.360.cn', '?a1004']

    Since replace() only strips "http://", the "https://" scheme survives, and the empty string '' marks the gap between the two slashes.
    

      

    def splitAddress(address):
        addressParts = address.replace("http://", "").split("/")
        return addressParts
    
    addr = splitAddress("http://www.autohome.com.cn/wuhan/#pvareaid=100519")
    print(addr)
    

    The output is:

    runfile('C:/Users/user/Desktop/chensimin.py', wdir='C:/Users/user/Desktop')
    ['www.autohome.com.cn', 'wuhan', '#pvareaid=100519']
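
    Both runs show how brittle manual string splitting is. The standard library's urllib.parse handles this parsing robustly; a small sketch for comparison (not part of the book's example):

    from urllib.parse import urlsplit

    # urlsplit understands both http:// and https://, plus queries and fragments
    parts = urlsplit("https://hao.360.cn/?a1004")
    print(parts.netloc)      # hao.360.cn
    print(parts.query)       # a1004

    parts = urlsplit("http://www.autohome.com.cn/wuhan/#pvareaid=100519")
    print(parts.netloc)      # www.autohome.com.cn
    print(parts.path)        # /wuhan/
    print(parts.fragment)    # pvareaid=100519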
    

    3. Scraping a site's internal links

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re

    # Retrieves a list of all internal links found on a page
    def getInternalLinks(bsObj, includeUrl):
        internalLinks = []
        # Finds all links that begin with "/" or that contain the site's own domain
        for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
            if link.attrs['href'] is not None:
                if link.attrs['href'] not in internalLinks:
                    internalLinks.append(link.attrs['href'])
        return internalLinks

    startingPage = "http://oreilly.com"
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "lxml")

    def splitAddress(address):
        addressParts = address.replace("http://", "").split("/")
        return addressParts

    internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
    print(internalLinks)
    

    The output (all internal links on this page):

    runfile('C:/Users/user/Desktop/untitled112.py', wdir='C:/Users/user/Desktop')
    ['https://www.oreilly.com', 'http://www.oreilly.com/ideas', 
    'https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170601+nav',
    'http://www.oreilly.com/conferences/', 'http://shop.oreilly.com/', 'http://members.oreilly.com', '/topics/ai', '/topics/business',
    '/topics/data', '/topics/design', '/topics/economy', '/topics/operations', '/topics/security', '/topics/software-architecture', '/topics/software-engineering',
    '/topics/web-programming', 'https://www.oreilly.com/topics',
    'https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+get+started+now',
    'https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170203+homepage+sign+in',
    'https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+get+started+now',
    'https://www.safaribooksonline.com/public/free-trial/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+start+free+trial',
    'https://www.safaribooksonline.com/accounts/login/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170710+homepage+sign+in',
    'https://www.safaribooksonline.com/live-training/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+take+a+live+online+course',
    'https://www.safaribooksonline.com/learning-paths/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+follow+a+path',
    'https://www.safaribooksonline.com/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170505+homepage+unlimited+access', 'http://www.oreilly.com/live-training/?view=grid',
    'https://www.safaribooksonline.com/your-experience/?utm_medium=content&utm_source=oreilly.com&utm_campaign=lgen&utm_content=20170201+homepage+safari+platform',
    'https://www.oreilly.com/ideas/8-data-trends-on-our-radar-for-2017?utm_medium=referral&utm_source=oreilly.com&utm_campaign=lgen&utm_content=link+2017+trends',
    'https://www.oreilly.com/ideas?utm_medium=referral&utm_source=oreilly.com&utm_campaign=lgen&utm_content=link+read+latest+articles', 'http://www.oreilly.com/about/',
    'http://www.oreilly.com/work-with-us.html', 'http://www.oreilly.com/careers/', 'http://shop.oreilly.com/category/customer-service.do', 'http://www.oreilly.com/about/contact.html',
    'http://www.oreilly.com/emails/newsletters/', 'http://www.oreilly.com/terms/', 'http://www.oreilly.com/privacy.html', 'http://www.oreilly.com/about/editorial_independence.html']
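
    The list mixes absolute URLs with site-relative paths such as '/topics/ai'. If the crawl needs full URLs, urllib.parse.urljoin can normalize them; a minimal sketch (my addition, not the book's code):

    from urllib.parse import urljoin

    startingPage = "http://oreilly.com"
    links = ['/topics/ai', 'https://www.oreilly.com/topics']

    # urljoin resolves relative paths against the base page and
    # leaves absolute URLs unchanged
    print([urljoin(startingPage, link) for link in links])
    # ['http://oreilly.com/topics/ai', 'https://www.oreilly.com/topics']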

    4. Scraping a site's external links

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import re
    
    #Retrieves a list of all external links found on a page
    def getExternalLinks(bsObj, excludeUrl):
        externalLinks = []
        #Finds all links that start with "http" or "www" that do
        #not contain the current URL
        for link in bsObj.findAll("a",
                                  href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
            if link.attrs['href'] is not None:
                if link.attrs['href'] not in externalLinks:
                    externalLinks.append(link.attrs['href'])
        return externalLinks
    
    startingPage = "http://oreilly.com"
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html,"lxml")
    
    def splitAddress(address):
        addressParts = address.replace("http://", "").split("/")
        return addressParts
    
    print(splitAddress(startingPage))
    print(splitAddress(startingPage)[0])
    
    externalLinks = getExternalLinks(bsObj,splitAddress(startingPage)[0])
    print(externalLinks)

    The output is:

    runfile('C:/Users/user/Desktop/untitled112.py', wdir='C:/Users/user/Desktop')
    ['oreilly.com']
    oreilly.com
    ['https://cdn.oreillystatic.com/pdf/oreilly_high_performance_organizations_whitepaper.pdf', 'http://twitter.com/oreillymedia', 'http://fb.co/OReilly', 'https://www.linkedin.com/company/oreilly-media', 'https://www.youtube.com/user/OreillyMedia']
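
    One subtlety with this pattern: excludeUrl is pasted into the regular expression as-is, so the dot in "oreilly.com" matches any character. Escaping it with re.escape makes the exclusion exact; a small sketch of the safer construction (my adjustment, not the book's code):

    import re

    excludeUrl = "oreilly.com"
    # re.escape turns "." into "\." so only the literal domain is excluded
    pattern = re.compile("^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")

    print(bool(pattern.match("http://twitter.com/oreillymedia")))   # True  (external)
    print(bool(pattern.match("http://www.oreilly.com/ideas")))      # False (internal)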
  • Original post: https://www.cnblogs.com/chensimin1990/p/7213933.html