zoukankan      html  css  js  c++  java
  • 7-7

    #

    from bs4 import BeautifulSoup
    from urllib.request import urlopen
    # Fetch the page; the context manager closes the HTTP response once the
    # body has been read and parsed.
    with urlopen('http://pythonscraping.com/pages/page1.html') as res:
        bs = BeautifulSoup(res.read(), 'html.parser')
    print(bs.h1)  # print the first <h1> tag of the document
    BeautifulSoup:res.status 为 200 表示请求成功;可选的解析器有 html.parser、lxml、html5lib
    from urllib.request import urlopen
    from urllib.error import HTTPError
    from bs4 import BeautifulSoup
    import sys
    
    def getTitle(url):
        """Return the first <h1> tag inside <body> of the page at ``url``.

        Returns None when opening the URL raises HTTPError (the error is
        printed) or when reading the parsed document raises AttributeError,
        e.g. because it has no <body>.  Errors other than HTTPError raised
        while opening the URL are not handled here and propagate to the caller.
        """
        try:
            html = urlopen(url)
        except HTTPError as e:
            print(e)
            return None
        # Close the HTTP response as soon as parsing is finished, including on
        # the early-return path below.
        with html:
            try:
                bsObj = BeautifulSoup(html, "html.parser")
                # bsObj.body is None when the page has no <body>, so the
                # attribute access below raises AttributeError.
                title = bsObj.body.h1
            except AttributeError:
                return None
        return title
    
    title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
    # getTitle returns None on failure; compare to the None singleton by identity.
    if title is None:
        print("Title could not be found")
    else:
        print(title)
    使用 try/except 进行异常处理
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    import datetime
    import random
    import re
    
    # Seed the random generator from the current time.  random.seed() only
    # accepts None, int, float, str, bytes or bytearray (a datetime object
    # raises TypeError on Python 3.11+), so pass the float timestamp instead.
    random.seed(datetime.datetime.now().timestamp())
    def getLinks(articleUrl):
        """Return the article links found in the body of a Wikipedia page.

        ``articleUrl`` is a site-relative path such as "/wiki/Kevin_Bacon"; it
        is appended to "http://en.wikipedia.org".  The result holds the <a>
        tags inside the <div id="bodyContent"> element whose href starts with
        "/wiki/" and contains no colon.  An empty list is returned when the
        page has no such <div>.
        """
        # The context manager closes the HTTP response once parsing is done.
        with urlopen("http://en.wikipedia.org"+articleUrl) as html:
            bsObj = BeautifulSoup(html, "html.parser")
        content = bsObj.find("div", {"id":"bodyContent"})
        if content is None:
            # find() returns None when nothing matches; calling a method on it
            # would raise AttributeError, so report "no links" instead.
            return []
        return content.find_all("a", href=re.compile(r"^(/wiki/)((?!:).)*$"))
    links = getLinks("/wiki/Kevin_Bacon")
    # Random walk: follow a randomly chosen article link from each page until
    # a page yields no links.
    while links:
        newArticle = random.choice(links).attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
    使用 random.seed 设置随机数种子
    NLTK(Natural Language Toolkit)是自然语言处理工具包,是 NLP 领域中最常使用的 Python 库之一
  • 相关阅读:
    前后端分离项目采用Prerender的SEO优化流程
    spring多数据源分布式事务的分析与解决方案
    Windows上MyEclipse2017 CI7 安装、破解以及配置
    WINDOWS上JDK安装与环境变量设置
    Abp Vnext Vue3 的版本实现
    你好,年轻人
    数据结构·堆
    数据结构·优先队列
    算法笔记·并查集
    JAVA问题解决——Jar包中资源调用
  • 原文地址:https://www.cnblogs.com/zhangchen-sx/p/11148153.html
Copyright © 2011-2022 走看看