从手机页面读取,有时候也会卡死,解决办法还是重新运行……
# -*- coding: utf-8 -*-
# Author: fwindpeak
"""Save the blog posts of a QQ account from the mobile QZone pages as one HTML file.

The script walks the paginated post list, collects every post link, fetches
each post and appends the extracted body to a single ``.htm`` file.

The code runs unchanged on Python 2.6+ and Python 3.  All non-ASCII text in
the patterns below is written with ``\\u`` escapes so the source stays ASCII.
"""
import io
import re
import socket
import sys
import urllib  # unused here; kept because the original module imported it

try:  # Python 2 module names
    import httplib
    import urllib2
    from HTMLParser import HTMLParser  # unused here; kept for compatibility
except ImportError:  # Python 3 equivalents, bound to the Python 2 names
    import http.client as httplib
    import urllib.request as urllib2
    from html.parser import HTMLParser  # unused here; kept for compatibility

htmstart = u'''<html>
<head>
<meta charset="utf-8"/>
<title>qzone_blog</title>
</head>
<body>
'''
htmend = u'''</body>
</html>
'''

# Header actually written to the output file; ``%s`` receives the QQ number.
_HEADER_TEMPLATE = u'''<html>
<head>
<meta charset="utf-8"/>
<title>qzone_blog_%s</title>
</head>
<body>
'''

# Value of the ``sid`` query parameter that the original script hard-coded.
# NOTE(review): presumably a session id that expires -- confirm and override
# through the ``sid`` argument of DownloadBlog when it stops working.
_DEFAULT_SID = 'AefvkfGVCCDx2PfXiaquF7pf'

_REQUEST_HEADERS = [
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Referer', 'http://z.qq.com/'),
    ('User-Agent', 'Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'),
]

# A link to a post.  ``[^"]`` keeps each match inside a single href attribute,
# so several links on one line are returned as separate (url, title) pairs.
_DETAIL_LINK_RE = re.compile(r'href="([^"]*blog_detail\.jsp\?[^"]+)">(.+?)</a>')
# The link whose text is "next page" (U+4E0B U+9875).
_NEXT_PAGE_RE = re.compile(u'href="([^"]+)">\u4e0b\u9875</a>')
# The post body: everything between the first "tabs-1" paragraph and the
# "related searches" (U+76F8 U+5173 U+641C U+7D22) paragraph.
_CONTENT_RE = re.compile(
    u'<p class="tabs-1">(.+<br/>.+)<p class="tabs-1"><br/>\u76f8\u5173\u641c\u7d22',
    re.S)

# Failures that skip a single post instead of aborting the whole download:
# network/HTTP errors, timeouts, and ValueError for URLs urllib cannot open.
_FETCH_ERRORS = (IOError, socket.error, httplib.HTTPException, ValueError)


def cn(s, errors="strict"):
    """Re-encode the UTF-8 byte string *s* as GBK bytes.

    Args:
        s: byte string holding UTF-8 encoded text.
        errors: codec error handler used for both the decode and the encode
            step.  The default ``"strict"`` keeps the original behaviour of
            raising on text that cannot be converted; ``"replace"`` never
            raises.

    Returns:
        The same text as a GBK encoded byte string.
    """
    return s.decode("utf-8", errors).encode("gbk", errors)


def _console(text):
    """Return *text* in a form that ``print`` can emit without raising."""
    if sys.version_info[0] < 3:
        # Python 2 prints byte strings verbatim; keep the original GBK output.
        raw = text if isinstance(text, bytes) else text.encode("utf-8")
        return cn(raw, "replace")
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    return text.encode(encoding, "replace").decode(encoding)


def _unescape_amp(url):
    """Turn the HTML entity ``&amp;`` inside an href value back into ``&``."""
    return url.replace('&amp;', '&')


def _build_opener():
    """Create a cookie-aware opener that sends the mobile-browser headers."""
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    opener.addheaders = list(_REQUEST_HEADERS)
    return opener


def _fetch(opener, url, timeout):
    """Fetch *url* with *opener* and return the body decoded as UTF-8 text."""
    if not isinstance(url, str):
        # Python 2 only: hand the HTTP layer a byte string, not ``unicode``.
        url = url.encode("utf-8")
    response = opener.open(url, timeout=timeout)
    try:
        return response.read().decode("utf-8", "replace")
    finally:
        response.close()


def _list_posts(opener, first_url, timeout):
    """Follow the "next page" links from *first_url*; return (url, title) pairs."""
    posts = []
    visited = set()
    url = first_url
    page = 0
    # The visited set stops the walk if the site ever links back to a page
    # that was already read, which would otherwise loop forever.
    while url not in visited:
        visited.add(url)
        text = _fetch(opener, url, timeout)
        posts.extend(_DETAIL_LINK_RE.findall(text))
        found = _NEXT_PAGE_RE.search(text)
        if not found:
            break
        page += 1
        url = _unescape_amp(found.group(1))
        print(_console(url))
        print("page %d" % page)
    return posts


def DownloadBlog(qq, filename=None, timeout=30, sid=_DEFAULT_SID, opener=None):
    """Download the blog posts of *qq* into a single HTML file.

    Args:
        qq: QQ number whose posts are listed (sent as ``B_UID``).
        filename: output path; defaults to ``"<qq>.htm"``.
        timeout: seconds to wait on each request, so that a stalled
            connection raises instead of blocking forever.
        sid: value of the ``sid`` query parameter of the listing URL.
        opener: object with an ``open(url, timeout=...)`` method returning a
            response with ``read()`` and ``close()``; a cookie-aware urllib
            opener is created when omitted.

    Raises:
        Errors while reading the post list propagate to the caller (the
        output file has not been created at that point).  A post that fails
        to download is reported on stdout and skipped.
    """
    print('Start')
    if filename is None:
        filename = "%s.htm" % qq
    if opener is None:
        opener = _build_opener()

    # Collect the article links from every listing page.
    list_url = 'http://z.qq.com/blog/blog_list.jsp?sid=%s&B_UID=%s' % (sid, qq)
    posts = _list_posts(opener, list_url, timeout)

    # Download the articles.
    with io.open(filename, 'w', encoding='utf-8') as out:
        out.write(_HEADER_TEMPLATE % qq)
        try:
            for link, title in posts:
                post_url = _unescape_amp(link) + "&isReadAllPage=true"
                print("Downloading %s" % _console(title))
                try:
                    text = _fetch(opener, post_url, timeout)
                except _FETCH_ERRORS as err:
                    # Best effort: report the failure and go on to the next post.
                    print(err)
                    continue
                found = _CONTENT_RE.search(text)
                if found:
                    out.write(found.group(1))
                    out.write(u"<br/><hr/><br/>")
        finally:
            # Always close the document so a partial download is still valid HTML.
            out.write(htmend)


if __name__ == '__main__':
    print("QZone blog download")
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    DownloadBlog(read_line("QQ:").strip())