# baidu_hotword.py
# Fetch the hot words listed on news.baidu.com and print them, one per line.
import os
import re
from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved urlopen() into urllib.request; bind it to the old name
    # so the rest of the module reads the same on both interpreters.
    import urllib.request as urllib2

# Encoding used to decode the scraped titles before they are printed.
PAGE_ENCODING = "gb2312"

# Captures the title="..." attribute of every <li> that mentions
# "hotwords_li_a". No DOTALL flag is set, so '.' does not cross newlines and
# a whole <li>...</li> element must sit on one line to match.
_HOTWORD_PATTERN = r'<li.*?hotwords_li_a.*?title="(.*?)".*?</li>'
# Compiled once at import time; the bytes twin is needed because Python 3
# refuses to apply a text pattern to bytes input.
_HOTWORD_RE_TEXT = re.compile(_HOTWORD_PATTERN)
_HOTWORD_RE_BYTES = re.compile(_HOTWORD_PATTERN.encode("ascii"))


def getHtml(url):
    """Download *url* and return the raw (undecoded) response body.

    The connection is closed even when reading the body fails. Any
    exception raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    with closing(urllib2.urlopen(url)) as page:
        return page.read()


def getHotWord(html):
    """Return the hot-word titles found in *html*, in document order.

    *html* may be a byte string (as returned by ``getHtml``) or a text
    string; the returned titles have the same type as the input. An empty
    list is returned when nothing matches.
    """
    pattern = _HOTWORD_RE_BYTES if isinstance(html, bytes) else _HOTWORD_RE_TEXT
    return pattern.findall(html)


def main():
    """Fetch the Baidu news front page and print each hot word."""
    html = getHtml('http://news.baidu.com/')
    for word in getHotWord(html):
        # getHtml() returns bytes, so the titles are still encoded bytes;
        # decode them to text first or the Chinese characters print as
        # garbage.
        print(word.decode(PAGE_ENCODING))


if __name__ == '__main__':
    main()
输出前必须先把抓取到的字节串按 gb2312 解码为 Unicode(例如 unicode(i, "gb2312")),否则无法正确输出中文。这与字符编码有关:推测是因为页面内容是 GB2312 编码的字节串,不解码直接打印就会出现乱码;具体原因暂时还没有深入研究。