import re

from BeautifulSoup import BeautifulSoup

# Sample HTML fragments; they are concatenated with no separator before parsing.
doc = [
    '<html><head><title>Page title</title></head>',
    '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
    '<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
    '</html>',
]

# Build the parse tree from the joined markup and print what prettify() returns.
# A single-argument print(...) behaves the same as the Python 2 print statement.
markup = ''.join(doc)
soup = BeautifulSoup(markup)
print(soup.prettify())
运行结果为:
# Print the name of the first top-level node of the parsed document.
root = soup.contents[0]
print(root.name)

# Disabled alternative kept from the original notes (first child only):
# print(root.contents[0].name)

# Print the name of every direct child of that node. Iterating .contents
# directly replaces the index loop over range(len(...)) and avoids looking up
# soup.contents[0] again on each pass.
for child in root.contents:
    print(child.name)
# Lookup examples against the parsed `soup`. Each bare expression is followed
# by the value recorded for it in the original notes.

# --- Attribute-style navigation ---
titleTag = soup.html.head.title
titleTag
# <title>Page title</title>

titleTag.string
# u'Page title'

# --- Searching for <p> tags ---
len(soup('p'))
# 2

soup.findAll('p', align="center")
# [<p id="firstpara" align="center">This is paragraph <b>one</b>. </p>]

soup.find('p', align="center")
# <p id="firstpara" align="center">This is paragraph <b>one</b>. </p>

soup('p', align="center")[0]['id']
# u'firstpara'

# --- Matching an attribute value with a compiled regular expression ---
align_pattern = re.compile('^b.*')
soup.find('p', align=align_pattern)['id']
# u'secondpara'

# --- Reaching the <b> tag inside a paragraph ---
soup.find('p').b.string
# u'one'

soup('p')[1].b.string
# u'two'