zoukankan      html  css  js  c++  java
  • 批量下载QQ空间日志

    从手机页面读取,有时候也会卡死;解决办法是重新运行脚本再试一次。

    # -*-coding:utf-8-*-
    # 作者:fwindpeak
    #
    import urllib
    import urllib2
    import re
    from HTMLParser import HTMLParser
    
    # Static HTML wrapper fragments for the output file.
    # NOTE(review): this module-level htmstart is never written out --
    # DownloadBlog assigns its own local htmstart (with the qq number in the
    # title) before writing; only htmend below is actually used by it.
    htmstart='''<html>
    <head>
    <meta charset="utf-8"/>
    <title>qzone_blog</title>
    </head>
    <body>
    '''            
    # Closing tags appended in DownloadBlog's finally block.
    htmend='''</body>
    </html>
    '''
    
    def cn(s):
        """Re-encode a UTF-8 byte string as GBK (for console display)."""
        text = s.decode("utf-8")
        return text.encode("gbk")
        
    def DownloadBlog(qq, filename = None):
        blogList = []
        print 'Start'
        if filename==None:
            filename="%s.htm"%qq
        blogurl = 'http://z.qq.com/blog/blog_list.jsp?sid=AefvkfGVCCDx2PfXiaquF7pf&B_UID=%s'%qq
    
        url = blogurl
        cookieFile = urllib2.HTTPCookieProcessor()
        opener = urllib2.build_opener(cookieFile)
        opener.addheaders = [('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                            ('Referer','http://z.qq.com/'),
                            ('User-Agent','Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91)
                                            AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1')]
        
        # 获取所有页的文章路径
        pagenum=0
        while True:
            req = urllib2.Request(url)
            result = opener.open(req)        
            text = result.read()      
            detailPattern = re.compile(r'href="(.+blog_detail.jsp?.+)">(.+)</a>')
            detail = re.findall(detailPattern,text)
            #print detail
            blogList.append(detail)
            nextpagePattern = re.compile(r'href="(.*?)">下页</a>')
            nextpage = nextpagePattern.search(text)
            if nextpage:
                pagenum=pagenum+1
                url = nextpage.group(1)
                url=url.replace('amp;','')
                print url
                print "page %d"%pagenum  
            else:
                break
            
        file = open(filename, 'w')
        htmstart='''<html>
                    <head>
                    <meta charset="utf-8"/>
                    <title>qzone_blog_%s</title>
                    </head>
                    <body>
                    '''%qq 
        file.write(htmstart);
        # 下载文章
        blogContentPattern = re.compile(r'<p class="tabs-1">(.+<br/>.+)<p class="tabs-1"><br/>相关搜索', re.S)
        try:
            for dat in blogList:
                for url,title in dat:
                    #print url,title
                    url=url.replace('amp;','')
                    url=url+"&isReadAllPage=true"
                    print 'Downloading', cn(title)
                    text = opener.open(url,).read()
                    #print text
                    ret = blogContentPattern.search(text)
                    if ret:
                        txt = ret.group(1)
                        #print txt
                        file.write(txt)
                        file.write("<br/><hr/><br/>")
        except Exception,e:
            print e
            pass
        finally:
            opener.close()
            file.write(htmend)
            file.close()
            
    # Script entry point: prompt for a QQ number and download its blog posts
    # to "<qq>.htm" in the current directory.
    if __name__ == '__main__':
        print "QZone blog download"
        qq=raw_input("QQ:")
        DownloadBlog(qq)
    
  • 相关阅读:
    【转】Netty系列之Netty是什么
    【转】程序员技术练级攻略
    【转】Spring MVC 教程,快速入门,深入分析
    【转】Jqgrid学习之数据
    【转】Jqgrid学习之ColModel API
    【转】jqGrid学习之参数
    【转】jqGrid学习之安装
    go语言项目汇总
    33 Introducing the Go Race Detector
    32 Profiling Go Programs 分析go语言项目
  • 原文地址:https://www.cnblogs.com/fwindpeak/p/3369390.html
Copyright © 2011-2022 走看看