zoukankan html css js c++ java

7-7

from bs4 import BeautifulSoup
from urllib.request import urlopen
# Fetch the demo page and parse it. The context manager closes the HTTP
# response as soon as the body has been read, so the socket is not leaked.
with urlopen('http://pythonscraping.com/pages/page1.html') as res:
    bs = BeautifulSoup(res.read(), 'html.parser')
print(bs.h1)  # print the document's first <h1> tag

BeautifulSoup 要点：res.status 为 200 表示请求成功；可选的解析器有 html.parser、lxml、html5lib

from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import sys

def getTitle(url):
    """Fetch *url* and return the first <h1> tag inside its <body>.

    Args:
        url: Address of the page to download.

    Returns:
        The <h1> tag object produced by BeautifulSoup, or None when the page
        cannot be fetched, the document has no <body>, or the body has no
        <h1>.
    """
    # Imported locally so the function does not depend on a file-level import
    # that the surrounding file does not have.
    from urllib.error import URLError

    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        # HTTPError covers error status codes; URLError covers an unreachable
        # server or a missing resource. Both mean "no title available".
        print(e)
        return None

    # Close the response once parsing is done, even if parsing fails.
    with html:
        try:
            # .body is None when the document has no <body>, which makes the
            # .h1 lookup raise AttributeError.
            return BeautifulSoup(html, "html.parser").body.h1
        except AttributeError:
            return None

# Demo: look up the heading of the exercise page and report the outcome.
title = getTitle("http://www.pythonscraping.com/exercises/exercise1.html")
if title is None:  # identity test: None is the "not found" sentinel
    print("Title could not be found")
else:
    print(title)

try/except 异常处理

from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

# Seed the RNG from the current time. random.seed() accepts only None, int,
# float, str, bytes or bytearray (a datetime object raises TypeError on
# Python 3.11+), so pass the POSIX timestamp instead of the datetime itself.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Return the internal article links found on a Wikipedia page.

    Args:
        articleUrl: Site-relative path such as "/wiki/Kevin_Bacon"; it is
            appended to "http://en.wikipedia.org".

    Returns:
        The <a> tags inside the div with id "bodyContent" whose href starts
        with "/wiki/" and contains no colon, or an empty list when the page
        has no such div.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen("http://en.wikipedia.org" + articleUrl) as html:
        bsObj = BeautifulSoup(html, "html.parser")

    content = bsObj.find("div", {"id": "bodyContent"})
    if content is None:
        # No content container: report "no links" instead of failing with
        # AttributeError on None.
        return []
    return content.find_all("a", href=re.compile(r"^(/wiki/)((?!:).)*$"))
# Random walk: start at one article and keep following a randomly chosen
# internal link until a page yields no links.
links = getLinks("/wiki/Kevin_Bacon")
while links:
    newArticle = random.choice(links).attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)

random.seed 设置随机数种子

NLTK（Natural Language Toolkit，自然语言处理工具包）是 NLP 领域中最常使用的 Python 库之一。

查看全文

相关阅读:
前后端分离项目采用Prerender的SEO优化流程
 spring多数据源分布式事务的分析与解决方案
 Windows上MyEclipse2017 CI7 安装、破解以及配置
 WINDOWS上JDK安装与环境变量设置
 Abp Vnext Vue3 的版本实现
 你好，年轻人
 数据结构·堆
 数据结构·优先队列
 算法笔记·并查集
 JAVA问题解决——Jar包中资源调用

原文地址：https://www.cnblogs.com/zhangchen-sx/p/11148153.html