zoukankan      html  css  js  c++  java
  • python 爬虫-sohu抓小说

    #coding:utf-8
    import urllib2
    import sys
    import re
    
    
    # Matches any single HTML tag.
    _TAG_RE = re.compile(r'<[^>]+>')
    # Matches an <h1> element; '.' does not cross newlines, so the element
    # must sit on one line of the raw page.
    _TITLE_RE = re.compile(r'(<h1.*</h1>)')


    def _extract_title(content):
        """Return the text of the first <h1> element in content, or None."""
        found = _TITLE_RE.findall(content)
        if not found:
            return None
        return _TAG_RE.sub('', found[0])


    def _extract_paragraphs(content):
        """Return the article paragraphs in content as tag-free strings.

        Returns None when the article body markers cannot be located.
        """
        # The patterns below are written against markup with every blank
        # removed (note '<divclass="pages">'), so strip whitespace first.
        # NOTE(review): the original escape sequences were lost when this
        # code was copied; '\r', '\n' and '\t' are assumed - confirm.
        for blank in ('\r', '\n', '\t', ' '):
            content = content.replace(blank, '')

        # First narrow the page down to the article region ...
        section = re.search('articleBody(.*)class="pages">', content, re.S)
        if section is None:
            return None
        # ... then keep what lies between the script block and the pager.
        body = re.search('</script>(.*)<divclass="pages">', section.group())
        if body is None:
            return None

        text = body.group(1).replace('</p>', '')
        # Split on <p> before stripping tags: the paragraph marker is consumed
        # by the split, and every other tag (including ones that contain the
        # letter "p", such as <span> or <script>) is removed from each piece.
        return [_TAG_RE.sub('', piece) for piece in text.split('<p>')]


    def getPage(url, offset='0'):
        """Download one page and append its title and text to juyudao.txt.

        The address fetched is url + offset + '.shtml'. The page's first <h1>
        is written as one line, followed by one line per '<p>'-separated piece
        of the article body. The response is handled as the raw string returned
        by the server; no character decoding is performed.

        Parameters:
            url: address prefix of the page, without the '.shtml' suffix.
            offset: string inserted between url and '.shtml'.

        Returns:
            None. When the title or the article body cannot be found, a
            message is printed and nothing is written for this page.

        Raises:
            Whatever urllib2.urlopen or the file write raises; these errors
            are not caught here.
        """
        realurl = "%s%s%s" % (url, offset, '.shtml')
        print(realurl)

        resp = urllib2.urlopen(realurl)
        try:
            content = resp.read()
        finally:
            resp.close()

        title = _extract_title(content)
        if title is None:
            print('no <h1> title found, error')
            return
        print(title)

        paragraphs = _extract_paragraphs(content)
        if paragraphs is None:
            print('article body not found, error')
            return

        # Parse everything before touching the file so a page that cannot be
        # parsed never leaves a title without its text; 'with' guarantees the
        # handle is closed even if a write fails.
        with open(r'juyudao.txt', 'a') as fp:
            fp.write(title + '\n')
            for piece in paragraphs:
                fp.write(piece + '\n')
    
    
    def getBook(url, startoffset, endOffset):
        """Fetch consecutive pages by calling getPage once per offset.

        Offsets start at startoffset and increase by one while they remain
        below endOffset; each one is passed to getPage converted to a string.
        Nothing is fetched when startoffset is not below endOffset.
        """
        page = startoffset
        while True:
            if not page < endOffset:
                break
            getPage(url, offset=str(page))
            page = page + 1
    
    if __name__ == '__main__':
        # The first page has no numeric suffix, so it is fetched on its own
        # with an empty offset; the remaining pages use the '_<n>' suffix
        # for n = 1 .. 19.
        _BOOK_URL = 'http://book.sohu.com/20131107/n389762800'
        getPage(_BOOK_URL, '')
        getBook(_BOOK_URL + '_', 1, 20)
  • 相关阅读:
    如何实现CSS限制字数,超出部份显示省略号
    html禁止手机页面放大缩小
    jQuery 获取屏幕高度、宽度
    谷歌api 二维码生成 实例
    java.security.NoSuchAlgorithmException: Cannot find any provider supporting RSA
    eclipse安装阿里开发规范
    tomcat下配置http和https
    linux安装redis
    修改java配置文件
    Linux下安装Mysql
  • 原文地址:https://www.cnblogs.com/paisen/p/3539635.html
Copyright © 2011-2022 走看看