zoukankan      html  css  js  c++  java
  • python 爬小说

    #coding=utf-8
    import datetime
    import time
    import sys
    import os 
    
    import urllib2
    import urllib
    
    # Base URL of the novel's chapter index. The literal is a placeholder
    # (it reads "novel site URL"); it is passed to dealIndex() and also
    # prefixed to every relative chapter link.
    sx = '小说站网址'

    # Filesystem encoding of the host. Not referenced by the code visible
    # in this file. NOTE(review): this name shadows the builtin ``type``.
    type = sys.getfilesystemencoding()

    # Browser-like User-Agent sent with every HTTP request made by getHtml().
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}

    # Output file shared by all chapters; opened in binary mode because the
    # chapter text is written as already-encoded UTF-8 bytes.
    fo = open("note.txt", "wb")
    
    def getHtml(url):
        """Fetch *url* and return the page body as a UTF-8 encoded byte string.

        The request is sent with the module-level ``headers`` (custom
        User-Agent).  The response bytes are decoded as GBK and re-encoded
        as UTF-8, and the length of the result is printed.

        Returns:
            The UTF-8 encoded page, or None when the request fails with
            ``urllib2.URLError`` (including ``HTTPError``).  In that case the
            HTTP status code and/or the failure reason is printed.

        Raises:
            UnicodeDecodeError: if the response is not valid GBK.
        """
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                data = response.read()
            finally:
                # Release the connection even if reading fails half-way.
                response.close()
        except urllib2.URLError as e:
            # HTTPError carries ``code``; plain URLError carries ``reason``.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            return None

        data = data.decode('gbk').encode('utf-8')
        print(len(data))
        return data
    
    def dealIndex(url):
        """Download the chapter index at *url* and process every chapter link.

        The chapter list is taken to be the text between the markers
        ``ChapterList_HengFu_1`` and ``ChapterList_HengFu_2``.  Every
        ``href=`` inside that region is treated as one chapter:
        ``dealContext`` is called with ``sx`` + the link and the link text.

        The link is extracted with fixed offsets: one quote character after
        ``href=``, a 13-character relative URL, then ``">`` before the title.
        NOTE(review): these offsets are specific to the target site's markup;
        confirm against the real pages.

        Returns early, doing nothing, when the index page cannot be fetched.

        Raises:
            ValueError: if either list marker is missing from the page.
        """
        data = getHtml(url)
        if data is None:
            # getHtml() has already printed the failure details.
            return

        bgnpos = data.index('ChapterList_HengFu_1') + 10
        endpos = data.index('ChapterList_HengFu_2') - 10
        print(bgnpos)
        print(endpos)

        achfx = data[bgnpos:endpos]

        # Offsets below are relative to ``achfx`` (not to ``data``), so the
        # scan has to start at 0 to cover the whole chapter list.
        pos = 0
        while True:
            newpos = achfx.find('href=', pos)
            if newpos == -1:
                break

            titlepos = achfx.find('</a>', newpos + 20)
            if titlepos == -1:
                # Truncated anchor at the end of the region: stop instead of
                # rewinding ``pos`` and looping forever.
                break

            indexurl = achfx[newpos + 6:newpos + 19]
            # Stop right before '</a>' so the title carries no trailing '<'.
            titlename = achfx[newpos + 21:titlepos]
            pos = titlepos + len('</a>')

            dealContext(sx + indexurl, titlename)
    
    
    def dealContext(url, title):
        """Download one chapter page and append its text to the output file.

        The chapter body is taken from just after the ``name="content"``
        marker up to the ``</div>`` found when searching from 50 characters
        before ``yuedu_bottom``.  ``&nbsp;`` and ``<br />`` are replaced by
        spaces, and *title* plus two spaces is prepended before the text is
        written to the module-level file ``fo``.

        Args:
            url: absolute URL of the chapter page.
            title: chapter title to put in front of the chapter text.

        The chapter is skipped (with a printed notice) when the page cannot
        be fetched or when any of the expected markers is missing, instead of
        writing a meaningless slice of the page.
        """
        print(url)
        print(title)

        data = getHtml(url)
        if data is None:
            # getHtml() has already printed the failure details.
            print('skip chapter (download failed): ' + url)
            return

        markerpos = data.find('name="content"', 10)
        if markerpos == -1:
            print('skip chapter (content marker not found): ' + url)
            return
        # Skip the 14-character marker plus one more character.
        bgnpos = markerpos + 15

        bottompos = data.find('yuedu_bottom', bgnpos)
        if bottompos == -1:
            print('skip chapter (bottom marker not found): ' + url)
            return

        # Never pass a negative start: find() would count it from the end.
        endpos = data.find('</div>', max(bottompos - 50, 0))
        if endpos == -1:
            print('skip chapter (closing div not found): ' + url)
            return

        sContent = data[bgnpos:endpos]
        sContent = sContent.replace('&nbsp;', ' ')
        sContent = sContent.replace('<br />', ' ')

        fo.write(title + "  " + sContent)
    
    # Script entry: scrape the whole novel starting from the index page.
    try:
        dealIndex(sx)
    finally:
        # Always flush and release the output file, even when scraping
        # aborts with an exception.
        fo.close()
  • 相关阅读:
    前端优化方法(全)
    前端工程化
    HTTP状态码
    TCP三次握手和四次挥手
    在浏览器输入url后并回车发生了哪些过程
    javascript异步编程
    为什么浏览器采用多进程模型
    LeetCode——最长回文子串?
    LeetCode——字符串的排列/找到字符串中所有字母异位词
    LeetCode——24 点游戏
  • 原文地址:https://www.cnblogs.com/yylingyao/p/8438130.html
Copyright © 2011-2022 走看看