今天用Python写了一个下载韩寒新浪博客文章的下载器,嗯,基本功能如下:
1、从新浪博客上批量下载文章,并按文章标题创建文件
2、对下载的文章进行格式化。
已知Bug:长篇文章格式会错乱
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Batch-download Sina blog articles and save each one as a plain-text file.

The script fetches an article-list page, extracts every article link from
it, downloads each article and writes its title and cleaned-up body text to
a file named after the title.  The code runs on both Python 2 and Python 3.
"""

import contextlib
import io
import os
import re
import urllib

try:  # Python 3
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2
    _urlopen = urllib.urlopen

# Article-list page to crawl and the directory the articles are written to.
ARTICLE_LIST_URL = "http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html"
DEFAULT_BASEDIR = '/home/tmyyss/article/'

# Text between "<title>" and the next "<".
_TITLE_RE = re.compile(u'<title>(.+?)<')
# HTML markers that delimit the article body on the page.
_BODY_START_RE = re.compile(u'<.+?正文开始.+?>')
_BODY_END_RE = re.compile(u'<.+?正文结束.+?')
# Any spelling of a line-break tag: <br>, <br/>, <br />.
_BR_RE = re.compile(u'<br\\s*/?>')
# An <a title=... href="http...html"> anchor; the group is the article URL.
_ARTICLE_LINK_RE = re.compile(u'<a\\s+title.+?href="(http.+?html)')

# Body lines containing any of these substrings are layout markup and dropped.
_SKIP_MARKERS = (u'div', u'span', u'<p>')

# NOTE(review): in the file as received every punctuation substitution mapped
# a character to itself, and two of the patterns ('(' and '?') were invalid
# regular expressions.  They are presumably full-width -> ASCII conversions
# whose full-width characters were lost; confirm against the original.
_PUNCTUATION_TABLE = {
    0xFF0C: u',',    # full-width comma
    0xFF1A: u':',    # full-width colon
    0xFF01: u'!',    # full-width exclamation mark
    0xFF08: u'(',    # full-width left parenthesis
    0xFF09: u')',    # full-width right parenthesis
    0x22EF: u'...',  # midline horizontal ellipsis
    0xFF1F: u'?',    # full-width question mark
    0xFF1B: u';',    # full-width semicolon
}


def _to_text(data):
    """Return *data* as text, decoding byte strings as UTF-8."""
    if isinstance(data, bytes):
        return data.decode('utf-8', 'replace')
    return data


def _safe_filename(title):
    """Return *title* with path separators replaced so it is one file name."""
    name = title
    for forbidden in (u'/', u'\\', u'\0'):
        name = name.replace(forbidden, u'_')
    if name in (u'.', u'..'):
        name = u'_'
    return name


def _clean_line(line):
    """Normalise punctuation and strip inline HTML residue from a body line."""
    line = line.translate(_PUNCTUATION_TABLE)
    line = line.replace(u'<wbr>', u'')
    # NOTE(review): the pattern in the file as received is a single space; it
    # was presumably '&nbsp;' before extraction, so both are removed - confirm.
    line = line.replace(u' ', u'').replace(u'&nbsp;', u'')
    return _BR_RE.sub(u'', line)


def article_format(usock, basedir):
    """Extract one article from an HTML page and write it to a text file.

    The page is scanned line by line in three phases: find the ``<title>``
    (which names the output file), skip ahead to the body-start marker, then
    copy cleaned body lines until the body-end marker, after which an
    ``END`` footer is written.

    Args:
        usock: iterable of the page's lines, as text or UTF-8 byte strings
            (for example the object returned by ``urlopen``).
        basedir: directory in which the article file is created.

    If the output file cannot be opened the error is printed and the scan
    keeps looking for another title line.  The file is always closed, even
    when the page has no body-end marker.
    """
    fobj = None
    phase = 'title'
    try:
        for raw_line in usock:
            line = _to_text(raw_line)
            if phase == 'title':
                match = _TITLE_RE.search(line)
                if match is None:
                    continue
                title = match.group(1)
                filename = os.path.join(basedir, _safe_filename(title))
                print(filename)
                try:
                    fobj = io.open(filename, 'w', encoding='utf-8')
                except IOError as e:
                    print("Open %s error:%s" % (filename, e))
                    continue
                fobj.write(title + u'\n')
                phase = 'seek_body'
            elif phase == 'seek_body':
                if _BODY_START_RE.search(line):
                    phase = 'body'
            else:
                if _BODY_END_RE.search(line):
                    fobj.write(u'\nEND')
                    break
                if any(marker in line for marker in _SKIP_MARKERS):
                    continue
                fobj.write(_clean_line(line))
    finally:
        if fobj is not None:
            fobj.close()


def main():
    """Download every article linked from the article-list page."""
    basedir = DEFAULT_BASEDIR
    if not os.path.exists(basedir):
        os.makedirs(basedir)

    with contextlib.closing(_urlopen(ARTICLE_LIST_URL)) as usock:
        context = _to_text(usock.read())

    for url in _ARTICLE_LINK_RE.findall(context):
        try:
            with contextlib.closing(_urlopen(url)) as article_usock:
                article_format(article_usock, basedir)
        except IOError as e:
            # One unreachable article should not abort the whole batch.
            print("Download %s error:%s" % (url, e))


if __name__ == '__main__':
    main()