zoukankan      html  css  js  c++  java
  • 下载cnblogs所有博客内容

    写了一个python脚本,简单粗暴
    import urllib,os,sys
    
    def getUrlContent(url):
        """Fetch *url* and return the whole response body as read by ``read()``.

        url: address passed unchanged to ``urllib.urlopen``.

        Any error raised by ``urlopen`` or ``read`` propagates to the caller.
        """
        fp =urllib.urlopen(url)
        try:
            return fp.read()
        finally:
            # Release the connection even when read() raises.
            fp.close()
        
    # to read blog urls in one page    
    class findBlog:
        """Step through the post links contained in one list page.

        cont: the page markup as a string.

        Every successful call to get() remembers where the match ended, so the
        next call continues with the following post.
        """
        def __init__(self,cont):
            # Offset in self.cont at which the next search starts.
            self.p=0
            self.cont =cont
        def get(self):
            """Return ``[url, title]`` for the next post, or None when no further
            complete post entry can be found.

            The search position is only advanced after a complete match.
            """
            cont =self.cont
            hrefMark ='href="'
            tagEnd ='">'

            # str.find returns -1 on a miss; offset 0 is a valid hit, so every
            # check below tests for "< 0" rather than "> 0".
            p1 =cont.find('<div class="post">',self.p)
            if p1<0:
                return None
            p2 =cont.find('<h2><a id="homepage1_HomePageDays',p1)
            if p2<0:
                return None
            p3 =cont.find(hrefMark,p2)
            if p3<0:
                return None
            p4 =cont.find(tagEnd,p3)
            if p4<0:
                return None
            p5 =cont.find('</a>',p4)
            if p5<0:
                return None

            # The URL sits between href=" and the closing "> of the tag; the
            # title is the anchor text up to </a>.
            url =cont[p3+len(hrefMark):p4]
            title =cont[p4+len(tagEnd):p5]
            self.p =p5
            return [url, title]
    
    def logFile(fname, cont):
        """Write *cont* to *fname*, replacing whatever the file held before.

        fname: path of the file to (re)create.
        cont:  string written as the complete file content.

        When the file already exists a notice is printed first; the file is
        still overwritten (the original early return was left disabled).
        """
        if os.path.isfile(fname):
            # Single-argument print produces the same "<fname> esist!" line as
            # the two-item print statement did.
            print('%s esist!' % (fname,))
        # The with-block closes the handle even when write() raises.
        with open(fname,'w') as fp:
            fp.write(cont)
        
    def appendFile(fname, cont):
        """Append *cont* to the end of *fname*, creating the file if needed.

        fname: path of the file to extend.
        cont:  string added after the existing content.
        """
        # The with-block closes the handle even when write() raises.
        with open(fname,'a') as fp:
            fp.write(cont)
        
    def MyCmd(x):
        """Echo the command line *x* to stdout, then run it with ``os.system``.

        x: the command string handed to the system shell unchanged.
        """
        print(x)
        os.system(x)
        
    #to read all blog contents in all pages
    class blogReader:
        def __init__(self):
            self.is_latest_written =0
            self.latest_url =''
            
            #update self.latest_url 
            fname ='cfg.txt'
            if os.path.isfile(fname):
                fp =open(fname,'r')
                self.latest_url =fp.readline().strip()
                fp.close()
                
            print 'latest_url', self.latest_url
        def readPage(self,pid):
            is_latest =0
            cont =getUrlContent('http://www.cnblogs.com/cutepig/default.html?page=%d&OnlyTitle=1'%pid)
            fpLog =open('log.txt','a')
            fb =findBlog(cont)
            print >>fpLog, '--------page', pid
            print '--------page', pid
            while 1:
                ret =fb.get()
                if ret is None: break
                [url, title] =ret
                print >>fpLog, ret
                #print ret    #why cannot print chinses?
                if not self.is_latest_written:
                    logFile('cfg.txt', url)
                    self.is_latest_written =1
                    
                print title.decode('utf-8')
                
                if url==self.latest_url:
                    is_latest =1
                    break
                    
                blogFname =url.replace(';','').replace('&','').replace('?','').replace(':','').replace('/','')+'.htm'
                logFile( blogFname, getUrlContent(url))
                appendFile( 'index2.htm', '<a href=%s>%s</a><br>\n'%(blogFname, title))
                
            fpLog.close()
            MyCmd('copy /y index2.htm+index.htm index.htm')
            return is_latest
            
        def read_all(self):
            pid =1
            while 1:
                is_latest =self.readPage(pid)
                if is_latest:
                    break
                pid =pid+1
            
    # Script entry point: build the reader (which loads the stored URL from
    # cfg.txt when that file exists) and start walking the list pages.
    br =blogReader()
    br.read_all()
    View Code
  • 相关阅读:
    Android 下压缩图片—微弱失真
    android中ViewHolder通用简洁写法
    Android裁剪固定大小头像的功能
    Android进度加载的Loading效果
    GitHub开源项目总结
    Android 实现emoji表情的demo
    vim 高级使用技巧第二篇
    android apk 防止反编译技术第一篇-加壳技术
    Android Metro风格的Launcher开发系列第二篇
    FFMPEG高级编程第一篇:环境搭建及编译
  • 原文地址:https://www.cnblogs.com/cutepig/p/3129534.html
Copyright © 2011-2022 走看看