zoukankan      html  css  js  c++  java
  • python 爬小说

    #coding=utf-8
    import datetime
    import time
    import sys
    import os 
    
    import urllib2
    import urllib
    
    # Base URL of the novel site. It is passed to dealIndex as the chapter-index
    # URL and is also prepended to every chapter link found there. The literal
    # below reads "novel site URL", i.e. it is a placeholder to be filled in.
    sx = '小说站网址'
    
    # NOTE(review): this name shadows the builtin ``type`` and is not read by any
    # of the visible code -- confirm before renaming or removing it.
    type = sys.getfilesystemencoding()  
    # User-Agent string sent with every request built in getHtml.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'  
    headers = { 'User-Agent' : user_agent }  
    
    
    # Output file, opened in binary mode when the module is executed.
    # dealContext writes each chapter to it; it is closed at the end of the script.
    fo = open("note.txt", "wb")
    
    def getHtml(url):
        """Fetch *url* and return its body as a UTF-8 encoded byte string.

        The request is sent with the module-level ``headers``. The response
        body is decoded as GBK and re-encoded as UTF-8, and the length of the
        encoded result is printed as a progress indicator.

        Returns:
            The UTF-8 encoded page, or None when urllib2 raises URLError. In
            that case the error's ``code`` and/or ``reason`` attributes are
            printed, whichever are present.

        Raises:
            UnicodeDecodeError: if the body is not valid GBK (not caught here).
        """
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            try:
                data = response.read()
            finally:
                # Release the connection even if the read fails part-way.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                # Was ``e.reson``, which raised AttributeError from inside the
                # handler and hid the real network error.
                print(e.reason)
            # Callers must be prepared for a failed fetch.
            return None

        data = data.decode('gbk')
        data = data.encode('utf-8')
        print(len(data))
        return data
    
    def dealIndex(url):
        """Download the chapter index at *url* and process every chapter link.

        The chapter list is the part of the page between the markers
        'ChapterList_HengFu_1' and 'ChapterList_HengFu_2'. Each link found in
        it is handed to dealContext as (sx + link, title).

        The slicing uses fixed offsets, so it assumes every entry looks like
        ``href="<13 characters>">title</a>`` -- TODO confirm against the site.

        Returns nothing. Does nothing when the index page cannot be fetched.

        Raises:
            ValueError: if either list marker is missing from the page.
        """
        data = getHtml(url)
        if data is None:
            # getHtml has already printed the reason for the failure.
            return

        bgnpos = data.index('ChapterList_HengFu_1') + 10
        endpos = data.index('ChapterList_HengFu_2') - 10
        print(bgnpos)
        print(endpos)

        achfx = data[bgnpos:endpos]

        # All offsets below are relative to achfx, so the scan starts at 0 and
        # ends when find() runs off the slice. Starting at bgnpos (an offset
        # into the full page) skipped the first chapters of the list.
        pos = 0
        while True:
            newpos = achfx.find('href=', pos)
            if newpos == -1:
                break

            titlepos = achfx.find('</a>', newpos + 20)
            if titlepos == -1:
                # Unterminated last anchor: stop instead of restarting the
                # scan near the beginning of the list and looping forever.
                break

            indexurl = achfx[newpos + 6:newpos + 19]
            # End the title at the '<' of '</a>' rather than one past it.
            titlename = achfx[newpos + 21:titlepos]
            pos = titlepos + len('</a>')

            dealContext(sx + indexurl, titlename)
    
    
    def dealContext(url, title):
        """Download one chapter page and append its text to the output file.

        Prints *url* and *title*, fetches the page with getHtml, and takes the
        text that starts 15 bytes after the first 'name="content"' marker and
        ends at the '</div>' located near the following 'yuedu_bottom' marker.
        '&nbsp;' and '<br />' are each replaced by a single space, and the
        result is written to the module-level file ``fo`` as
        ``title + "  " + text``.

        The chapter is skipped, with a printed note, when the page cannot be
        fetched or when any of the markers is missing.
        """
        print(url)
        print(title)

        data = getHtml(url)
        if data is None:
            # A failed download must not abort the whole crawl.
            print('skipped: page could not be fetched')
            return

        anchor = data.find('name="content"', 10)
        if anchor == -1:
            print('skipped: content marker not found')
            return
        bgnpos = anchor + 15

        footer = data.find('yuedu_bottom', bgnpos)
        if footer == -1:
            print('skipped: end marker not found')
            return

        # The closing tag is looked for slightly before the footer marker, but
        # never before the start of the content itself.
        endpos = data.find('</div>', max(footer - 50, bgnpos))
        if endpos == -1:
            print('skipped: closing tag not found')
            return

        sContent = data[bgnpos:endpos]
        sContent = sContent.replace('&nbsp;', ' ')
        sContent = sContent.replace('<br />', ' ')

        fo.write(title + "  " + sContent)
    
    # Crawl the whole index, then close the output file. The try/finally makes
    # sure note.txt is flushed and closed even when the crawl stops on an error,
    # so the chapters written so far are not lost.
    try:
        dealIndex(sx)
    finally:
        fo.close()
  • 相关阅读:
    164 Maximum Gap 最大间距
    162 Find Peak Element 寻找峰值
    160 Intersection of Two Linked Lists 相交链表
    155 Min Stack 最小栈
    154 Find Minimum in Rotated Sorted Array II
    153 Find Minimum in Rotated Sorted Array 旋转数组的最小值
    152 Maximum Product Subarray 乘积最大子序列
    151 Reverse Words in a String 翻转字符串里的单词
    bzoj3994: [SDOI2015]约数个数和
    bzoj 4590: [Shoi2015]自动刷题机
  • 原文地址:https://www.cnblogs.com/yylingyao/p/8438130.html
Copyright © 2011-2022 走看看