# Reposted from: http://www.linuxany.com/archives/596.html
import re
import urllib
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2


def test(html, rex):
    """Return the distinct matches of ``rex`` in ``html``, in first-seen order.

    Args:
        html: Text to search.
        rex: Regular expression, as a pattern string or a compiled pattern.

    Returns:
        A list of the items produced by ``findall`` for the pattern (strings,
        or tuples when the pattern has several groups) with duplicates
        removed. The list is empty when nothing matches.
    """
    seen = set()
    unique = []
    # findall always returns a list (possibly empty), so no None check is
    # needed; the set keeps the duplicate test O(1) per match.
    for found in re.compile(rex).findall(html):
        if found not in seen:
            seen.add(found)
            unique.append(found)
    return unique


# The name matches pytest's default "test*" collection prefix; opt out so
# importing this helper into a test module does not register it as a test.
test.__test__ = False

# Captures the target of <a href="...">. The pattern previously read
# "<as*href" (the backslash of "\s" had been lost), which does not match an
# ordinary anchor tag.
rex = r'<a\s*href="(.*?)"'

if __name__ == "__main__":
    # closing() releases the connection even if read() raises.
    with closing(urlopen('http://hi.baidu.com')) as page:
        html = page.read()
    # Python 3 hands back bytes, but the str pattern needs text.
    # NOTE(review): assumes the page is UTF-8 encoded -- confirm.
    if not isinstance(html, str):
        html = html.decode("utf-8", "replace")
    print(test(html, rex))