# coding: utf-8
"""Crawl the Sina tech "IT" index page and save the text of linked articles.

This is a Python 2 script (it relies on ``urllib.urlopen`` and ``sgmllib``).
It downloads the index page, collects every ``<a href=...>`` target that
matches ``ARTICLE_URL_PATTERN``, extracts the plain ``<p>`` paragraphs of each
linked page and writes them to numbered text files under ``OUTPUT_DIR``.
"""
__author__ = 'minmin'

import contextlib
import os
import re
import sgmllib
import urllib

# Page whose links are crawled.
INDEX_URL = "http://tech.sina.com.cn/it/"
# Only links matching this pattern (anchored at the start) are downloaded.
ARTICLE_URL_PATTERN = re.compile('http://tech.sina.com.cn/it/.*')
# Directory (relative to the current working directory) for the output files.
OUTPUT_DIR = 'news163_it'
# A paragraph whose body contains no nested markup at all.
PARAGRAPH_PATTERN = re.compile(r"<p>([^<>]*)</p>", re.M)


def getHtml(url):
    """Download *url* and return the raw response body.

    The connection is closed even if reading fails.
    """
    with contextlib.closing(urllib.urlopen(url)) as page:
        return page.read()


def func(str):
    """Return the article text found on the page at URL *str*.

    The page is downloaded with ``getHtml`` and every ``<p>...</p>`` element
    whose body contains no ``<`` or ``>`` is considered.  Empty paragraphs and
    paragraphs containing "Copyright" are dropped; spaces are removed from the
    remaining ones, and each is followed by a single space in the result.

    The parameter keeps its historical name (which shadows the builtin
    ``str``) so that existing callers are unaffected.
    """
    # Use the URL that was passed in; the previous code read the module-level
    # loop variable ``url`` instead and only worked when called from that loop.
    paragraphs = PARAGRAPH_PATTERN.findall(getHtml(str))

    kept = []
    for text in paragraphs:
        if not text:
            continue
        # PARAGRAPH_PATTERN never captures '<' or '>', so there are no
        # <STRONG> tags left to strip here; only the spaces need removing.
        text = text.replace(" ", "")
        if "Copyright" in text:
            continue
        kept.append(text + ' ')
    return ''.join(kept)


class URLPaser(sgmllib.SGMLParser):
    """SGML parser that collects the ``href`` value of every ``<a>`` tag.

    After feeding a document, the targets are available in ``self.urls`` in
    document order.
    """

    def reset(self):
        """Reset the parser state and start with an empty URL list."""
        sgmllib.SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        """Record the ``href`` attribute(s) of an opening ``<a>`` tag.

        *attrs* is the list of ``(name, value)`` pairs supplied by
        ``SGMLParser``.
        """
        self.urls.extend(value for name, value in attrs if name == 'href')


def main():
    """Crawl ``INDEX_URL`` and write each non-empty article to ``OUTPUT_DIR``.

    Every distinct matching link is fetched once.  Each article with
    non-empty text is printed (URL, then text) and appended to
    ``<OUTPUT_DIR>/<n>.txt``, where ``n`` counts the saved articles from 1.
    """
    parser = URLPaser()
    with contextlib.closing(urllib.urlopen(INDEX_URL)) as index_page:
        parser.feed(index_page.read())
    # Flush whatever the parser is still buffering so trailing links are seen.
    parser.close()

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    seen = set()
    saved = 0
    for url in parser.urls:
        if not ARTICLE_URL_PATTERN.match(url) or url in seen:
            continue
        seen.add(url)

        artical = func(url)
        if not artical:
            continue

        print(url)
        print(artical)
        saved += 1
        # 'a+' is kept from the original: re-running appends to existing files.
        with open(os.path.join(OUTPUT_DIR, '%d.txt' % saved), 'a+') as out:
            out.write(artical)


if __name__ == '__main__':
    main()