zoukankan html css js c++ java

python 爬小说

#coding=utf-8
import datetime
import time
import sys
import os 

import urllib2
import urllib

# Base URL of the novel's chapter index; chapter links are appended to it in
# dealIndex. The literal is placeholder text (Chinese for "novel site URL")
# and presumably has to be replaced with a real URL before running.
sx = '小说站网址'

# File-system encoding reported by the interpreter. Not referenced by any
# function visible in this script.
# NOTE(review): this name shadows the builtin ``type``.
type = sys.getfilesystemencoding()  
# Browser-like User-Agent string sent with every request (see getHtml).
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'  
# Request headers passed to urllib2.Request in getHtml.
headers = { 'User-Agent' : user_agent }  


# Output file that dealContext appends every chapter to. Opened in binary
# write mode, so an existing note.txt is truncated on each run.
fo = open("note.txt", "wb")

def getHtml(url):
    """Download *url* and return its body re-encoded as UTF-8 bytes.

    The request is sent with the module-level ``headers``. The response body
    is decoded as GBK and encoded back as UTF-8; the length of the UTF-8
    result is printed as a progress trace.

    Returns:
        The page as a UTF-8 byte string, or None when the request fails with
        ``urllib2.URLError`` (the error code and/or reason are printed).

    Raises:
        UnicodeDecodeError: if the body is not valid GBK.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            # Release the connection even if reading fails half-way.
            response.close()
    except urllib2.URLError as e:
        # HTTPError carries a status code; every URLError carries a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None

    data = data.decode('gbk').encode('utf-8')
    print(len(data))
    return data

def _iterChapterLinks(section):
    """Yield (relative_url, title) for each 'href=' anchor in *section*.

    The parser relies on the fixed-width layout the original script assumed:
    the link target is the 13 characters that start 6 characters after
    'href=', and the title starts 21 characters after 'href=' and runs up to
    the next '</a>'.
    """
    pos = 0
    while True:
        hrefpos = section.find('href=', pos)
        if hrefpos == -1:
            break

        titlepos = section.find('</a>', hrefpos + 20)
        if titlepos == -1:
            # Unterminated anchor: stop instead of rescanning from the start.
            break

        indexurl = section[hrefpos + 6:hrefpos + 19]
        # Stop right before '</a>' so the title does not include its '<'.
        titlename = section[hrefpos + 21:titlepos]
        yield indexurl, titlename

        pos = titlepos + len('</a>')


def dealIndex(url):
    """Fetch the chapter index at *url* and process every chapter it links to.

    The chapter list is the part of the page between the
    'ChapterList_HengFu_1' and 'ChapterList_HengFu_2' markers (offset by +10
    and -10 characters respectively). For each link found there,
    ``dealContext`` is called with ``sx`` + the link target and the link text.

    Returns without doing anything when ``getHtml`` yields None.

    Raises:
        ValueError: if either marker is missing from the page.
    """
    data = getHtml(url)
    if data is None:
        return

    bgnpos = data.index('ChapterList_HengFu_1') + 10
    endpos = data.index('ChapterList_HengFu_2') - 10
    print(bgnpos)
    print(endpos)

    # Offsets inside the slice start at 0, so the helper scans it from the
    # beginning rather than from the page-relative bgnpos.
    for indexurl, titlename in _iterChapterLinks(data[bgnpos:endpos]):
        dealContext(sx + indexurl, titlename)


def dealContext(url, title):
    """Download one chapter page and write its title and text to ``fo``.

    The chapter text starts 15 characters after the first 'name="content"'
    marker (searched from offset 10) and ends at the first '</div>' found
    from 50 characters before the following 'yuedu_bottom' marker. Every
    '&nbsp;' and '<br />' in the text is replaced by a single space, and the
    result is written as ``title + "  " + text``.

    When the page cannot be fetched or any of the markers is missing, a
    notice is printed and nothing is written for this chapter.
    """
    print(url)
    print(title)

    data = getHtml(url)
    if data is None:
        # No page to parse; skip this chapter.
        return

    markerpos = data.find('name="content"', 10)
    if markerpos == -1:
        print('skipped (content markers not found): ' + url)
        return
    bgnpos = markerpos + 15

    bottompos = data.find('yuedu_bottom', bgnpos)
    if bottompos == -1:
        print('skipped (content markers not found): ' + url)
        return

    # Never start the search before bgnpos: a negative start would make
    # find() count from the end of the page.
    endpos = data.find('</div>', max(bgnpos, bottompos - 50))
    if endpos == -1:
        print('skipped (content markers not found): ' + url)
        return

    sContent = data[bgnpos:endpos]
    sContent = sContent.replace('&nbsp;', ' ')
    sContent = sContent.replace('<br />', ' ')

    fo.write(title + "  " + sContent)

# Crawl the whole index, and close the output file even if the crawl aborts
# with an exception, so that what was written so far is flushed to disk.
try:
    dealIndex(sx)
finally:
    fo.close()

查看全文

相关阅读:
164 Maximum Gap 最大间距
 162 Find Peak Element 寻找峰值
 160 Intersection of Two Linked Lists 相交链表
 155 Min Stack 最小栈
 154 Find Minimum in Rotated Sorted Array II
153 Find Minimum in Rotated Sorted Array 旋转数组的最小值
 152 Maximum Product Subarray 乘积最大子序列
 151 Reverse Words in a String 翻转字符串里的单词
 bzoj3994: [SDOI2015]约数个数和
 bzoj 4590: [Shoi2015]自动刷题机

原文地址：https://www.cnblogs.com/yylingyao/p/8438130.html