zoukankan      html  css  js  c++  java
  • Python下载QQ空间的博客文章

    # -*-coding:utf-8-*-
    # 作者:华亮
    #
    
    import urllib
    import urllib2
    import re
    from HTMLParser import HTMLParser
    
    
    # 获取QQ空间博客列表
    class QQBlogList(HTMLParser):
        """Collect (title, url) pairs from a QQ Zone blog-list page.

        The parser looks for ``<div class="bloglist">`` and, inside it, for
        ``<a href=...>`` elements nested in ``<ul>``/``<li>``. Every such link
        that contains text contributes exactly one ``(title, href)`` tuple to
        ``blogList``; the title is all text inside the link joined together,
        even when the parser reports it in several pieces (nested inline tags,
        entity or character references).
        """

        # Flags recording which of the relevant elements are currently open.
        in_key_div = False
        in_ul = False
        in_li = False
        in_a = False
        # NOTE: class-level list, shared by every instance that does not assign
        # its own ``blogList``; results of several parsers accumulate in it.
        blogList = []
        # href of the most recently opened link inside the list.
        lasturl = ''
        # Text fragments of the link being parsed; None while outside a link.
        _link_text = None

        def handle_starttag(self, tag, attrs):
            """Track entry into the list elements and remember a link's href."""
            attrs = dict(attrs)
            if tag == 'div' and attrs.get('class') == 'bloglist':
                self.in_key_div = True
            elif self.in_key_div:
                if tag == 'ul':
                    self.in_ul = True
                elif self.in_ul and tag == 'li':
                    self.in_li = True
                elif self.in_li and tag == 'a' and 'href' in attrs:
                    self.in_a = True
                    self.lasturl = attrs['href']
                    # Start a fresh buffer for this link's title.
                    self._link_text = []

        def _add_link_text(self, text):
            """Buffer a piece of title text while inside a list link."""
            if self.in_a and self._link_text is not None:
                self._link_text.append(text)

        def handle_data(self, data):
            """Buffer link text; text outside a list link is ignored."""
            self._add_link_text(data)

        def handle_entityref(self, name):
            """Keep a named reference inside a title verbatim (not decoded)."""
            self._add_link_text('&%s;' % name)

        def handle_charref(self, name):
            """Keep a numeric reference inside a title verbatim (not decoded)."""
            self._add_link_text('&#%s;' % name)

        def handle_endtag(self, tag):
            """Clear the flag of the closed element; emit the link on </a>."""
            if self.in_key_div and tag == 'div':
                self.in_key_div = False
            elif self.in_ul and tag == 'ul':
                self.in_ul = False
            elif self.in_li and tag == 'li':
                self.in_li = False
            elif self.in_a and tag == 'a':
                self.in_a = False
                # One entry per link; links without any text are skipped.
                if self._link_text:
                    self.blogList.append((''.join(self._link_text), self.lasturl))
                self._link_text = None
                
             
                
    class QQ:  
        '''
        QQ
            作者:华亮
            说明:自动下载QQ空间博客文章
        '''
            
        @staticmethod      
        def DownloadBlog(qq, filename = None):
            print 'Start'
            blogurl = 'http://qz.qq.com/%s/bloglist?page=0' % qq
            QQ.__Download(blogurl, filename)           
            print 'End'
        
        @staticmethod
        def __Download(starturl, filename):
            url = starturl
            
            cookieFile = urllib2.HTTPCookieProcessor()
            opener = urllib2.build_opener(cookieFile)    
            
            # 获取所有页的文章路径
            while True:
                req = urllib2.Request(url)
                result = opener.open(req)        
                text = result.read()     
                
                qq = QQBlogList()        
                qq.feed(text)
                qq.close()          
                       
                nextpagePattern = re.compile(r'<a href="(.*?)" title="下一页" class="bt_next"><span>下一页</span></a>')              
                nextpage = nextpagePattern.search(text)
                if nextpage:
                    url = nextpage.group(1)            
                else:
                    break  
              
            if not filename:
                filename = "blog.txt"
            file = open(filename, 'w')    
            
            # 下载文章
            blogContentPattern = re.compile(r'<div class="entry_content">(.*?)</div>', re.S) 
            for title, url in qq.blogList:
                print 'Downloading', title
                req = urllib2.Request(url)
                result = opener.open(req)
                file.write('\n' + title + '\n')
                ret = blogContentPattern.search( result.read() )
                if ret:
                    file.write(ret.group(1).replace('<p>', '\n'))
            file.close()
    # -*-coding:utf-8-*-
    # 作者:华亮
    #
    
    from QQ import QQ
    
    if __name__ == '__main__':
        # Run as a script: fetch the blog of the given QQ number and save
        # all of its articles into the named output file.
        QQ.DownloadBlog(qq='1241224798', filename='blog.txt')
    
    
  • 相关阅读:
    mvc使用model进行数据的增加修改的方法
    c#导出word在iis部署上报异常
    做个转圈圈的咚咚
    VS2008中AJAX的部署问题(工具箱中无AJAX Extensions选项卡)
    关于 AjaxControlToolkit requires ASP.NET Ajax 4.0 scripts. 错误
    ASP.NET关于继承DropDownList的自定义DDL控件
    线性表顺序表示的C#实现(参考数据结构(C语言版))
    WORD2003出现的乱码
    线性表链式表示的C#实现(参考数据结构(C语言版))
    有错误先找自己的原因(若你百度不出为什么vista开网页慢,可以来试试这方法)
  • 原文地址:https://www.cnblogs.com/hzhida/p/2635479.html
Copyright © 2011-2022 走看看