zoukankan      html  css  js  c++  java
  • python 爬虫-sohu抓小说

    #coding:utf-8
    import urllib2
    import sys
    import re
    
    
    def getPage(url, offset='0'):
        """Download one chapter page and append its title and text to juyudao.txt.

        The page address is built as ``url + offset + '.shtml'``. The first
        ``<h1>...</h1>`` element supplies the chapter title; the article body is
        cut out of the HTML by ``_extract_paragraphs``. The title and every
        paragraph are appended to ``juyudao.txt``, one per line.

        This is Python 2 code (it relies on ``urllib2`` and byte strings); the
        ``print(...)`` calls use the single-argument form so they produce the
        same output as the original print statements.

        Args:
            url: Page address without the page suffix and ``.shtml`` extension.
            offset: Page suffix as a string; ``''`` addresses the first page.

        Returns:
            None. If the page has no ``<h1>`` element, a message is printed and
            nothing is written. If the article body cannot be located, only the
            title is written.
        """
        realurl = "%s%s%s" % (url, offset, '.shtml')
        print(realurl)

        resp = urllib2.urlopen(realurl)
        try:
            content = resp.read()
        finally:
            # Release the connection even if reading fails.
            resp.close()

        # The title is searched in the raw HTML; without re.S the heading must
        # sit on a single line, exactly as in the original pattern.
        headings = re.findall(r'(<h1.*</h1>)', content)
        if not headings:
            # Previously an IndexError escaped here because h1[0] was indexed
            # before the try block.
            print('no <h1> title found in %s' % realurl)
            return
        print('%s ok' % headings[0])
        h1content = re.sub('<[^>]+>', '', headings[0])
        print(h1content)

        paragraphs = _extract_paragraphs(content)
        if paragraphs is None:
            print('article body not found in %s' % realurl)
            paragraphs = []

        # The with-block guarantees the file is flushed and closed on every path.
        with open(r'juyudao.txt', 'a') as fp:
            fp.write(h1content + '\n')
            for paragraph in paragraphs:
                fp.write(paragraph + '\n')


    def _extract_paragraphs(content):
        """Return the article paragraphs found in raw page HTML, or None if absent."""
        # NOTE(review): the published listing lost its escape sequences, so the
        # four original replace() arguments are reconstructed here as CR, LF,
        # space and tab -- confirm against the author's copy if available.
        # Removing spaces is what turns '<div class="pages">' into the
        # '<divclass="pages">' marker matched below.
        for whitespace in ('\r', '\n', ' ', '\t'):
            content = content.replace(whitespace, '')

        # First narrow the HTML down to the region between the "articleBody"
        # marker and the pager element.
        region = re.search('articleBody(.*)class="pages">', content, re.S)
        if region is None:
            return None

        bodies = re.findall('</script>(.*)<divclass="pages">', region.group())
        if not bodies:
            return None

        body = bodies[0].replace('</p>', '')
        # Drop every tag whose text contains no letter 'p'; the '<p>' markers
        # survive and are used as the paragraph separator.
        body = re.sub('<[^>p]+>', '', body)
        return body.split('<p>')
    
    
    def getBook(url, startoffset, endOffset):
        """Fetch a run of consecutive pages by calling getPage once per offset.

        Offsets start at ``startoffset`` and increase by one while they remain
        below ``endOffset`` (the end value itself is not fetched). Each offset
        is handed to ``getPage`` as a string.
        """
        current = startoffset
        while current < endOffset:
            getPage(url, str(current))
            current += 1
    
    if __name__ == '__main__':
        # The first page carries no numeric suffix, so the offset is empty and
        # the requested address ends in "n389762800.shtml".
        getPage('http://book.sohu.com/20131107/n389762800', offset='')
        # Continuation pages use the "_<n>" suffix; offsets 1 through 19 are
        # fetched (the end offset 20 is exclusive).
        getBook('http://book.sohu.com/20131107/n389762800_', 1, 20)
  • 相关阅读:
    报错18:28:21: Debugging starts QLayout: Attempting to add QLayout ““ to MainWindow “MainWindow“, which
    春 景
    sysTime(系统毫秒)
    balabala视频格式转换器
    tasklist /FI命令结论
    QFileSystemWatcher
    Qfileinfo
    Qdir
    欢迎光临我的普吉岛芭东海滩公寓
    使用ArcGIS GP服务之一框架介绍
  • 原文地址:https://www.cnblogs.com/paisen/p/3539635.html
Copyright © 2011-2022 走看看