# BeautifulSoup basics: fetch a page, parse it, and handle the failures
# that can occur along the way.
from urllib.error import HTTPError, URLError
from urllib.request import urlopen

from bs4 import BeautifulSoup

PAGE_URL = "http://www.pythonscraping.com/pages/page1.html"


# --- Example 1: naive fetch and parse (no error handling) -----------------
# The parser consumes the response inside the constructor, so the
# connection can be closed as soon as the soup object has been built.
with urlopen(PAGE_URL) as html:
    bsObj = BeautifulSoup(html, 'html.parser')
print(bsObj.h1)


# --- Example 2: handling exceptions ---------------------------------------
# Two kinds of failure can occur when calling urlopen():
#   1. The page does not exist on the server   -> HTTPError
#   2. The server itself cannot be reached     -> URLError
# HTTPError is a subclass of URLError, so it has to be caught first.
try:
    html = urlopen(PAGE_URL)
except HTTPError as e:
    print(e)
    # Return a null value, abort, or fall back to another plan here.
except URLError as e:
    print("The server could not be found:", e.reason)
else:
    # The program continues here. Note: if the except branches above
    # already return or abort, this else clause is not needed.
    if html is None:
        print("URL is not found")
    else:
        # Nothing more to do with this response in the example.
        html.close()


# --- Example 3: a reusable helper that never raises on a bad page ---------
def getTitle(url):
    """Return the first ``<h1>`` inside ``<body>`` of the page at *url*.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<h1>`` tag object produced by BeautifulSoup, or ``None`` when
        the page cannot be fetched (HTTP error or unreachable server) or
        when the document has no ``<body>``/``<h1>``.
    """
    try:
        # The context manager closes the connection even if read() fails.
        with urlopen(url) as response:
            markup = response.read()
    except (HTTPError, URLError):
        return None

    try:
        soup = BeautifulSoup(markup, 'html.parser')
        # soup.body is None when the page has no <body>; accessing .h1 on
        # it raises AttributeError, which is reported as "no title".
        heading = soup.body.h1
    except AttributeError:
        return None
    return heading


title = getTitle(PAGE_URL)
if title is None:
    print("Title could not be found")
else:
    print(title)