zoukankan      html  css  js  c++  java
  • Python爬虫——小说

    #encoding:utf8

    import re

    import urllib2

    # Index page of the novel. Chapter hrefs scraped from it are appended to
    # this URL to build each chapter's address.
    url = 'http://www.23us.com/html/55/55304/'

    # All patterns are loop-invariant, so compile them once up front.
    # Captures the href of every chapter link in the index table.
    the_url = re.compile('<td class="L"><a href="(.*?)">.*?</a></td>', re.S)
    # Captures the charset a page declares in its content-type meta tag.
    code = re.compile('content="text/html; charset=(.*?)"', re.S)
    # Captures (page title, chapter body) from a chapter page.
    chapter_pattern = re.compile(
        '<title>(.*?)</title>.*?<dd id="contents">(.*?)</dd>', re.S)


    def fetch(page_url):
        """Return the raw, still-encoded body of ``page_url``.

        The response is always closed, even if reading fails.
        """
        response = urllib2.urlopen(urllib2.Request(page_url))
        try:
            return response.read()
        finally:
            response.close()


    def decode_page(raw):
        """Decode a downloaded page to unicode.

        The charset declared in the page's meta tag is tried first (when the
        page declares one), then 'gb2312' as a fallback.

        Returns the decoded text, or None when no candidate encoding works.
        """
        # findall() may return an empty list; slicing avoids the IndexError
        # that indexing [0] would raise on pages without a charset meta tag.
        candidates = code.findall(raw)[:1] + ['gb2312']
        for encoding in candidates:
            try:
                return raw.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                # UnicodeDecodeError: bytes are invalid for this encoding.
                # LookupError: the declared charset name is unknown to Python.
                continue
        return None


    content = fetch(url).decode('gbk')
    last_url = the_url.findall(content)

    for i in last_url:
        # Single-argument print(...) behaves the same as the print statement.
        print(i)

        content = decode_page(fetch(url + i))
        if content is None:
            # Undecodable chapter: skip it rather than abort the whole run.
            continue

        last_content = chapter_pattern.findall(content)
        if not last_content:
            # Dump the page so the reason for the failed match can be inspected.
            print('采集失败')
            print(content)

        for title, body in last_content:
            body = body.replace('&nbsp;','').replace('<br/> <br/>',' ')
            t = ' ' + title + ' ' + ' ' + body
            # Append each chapter as it arrives; `with` guarantees the handle
            # is closed even if the write raises.
            with open('小说.txt','a+') as out:
                out.write(t.encode('utf-8'))

  • 相关阅读:
    abs() 与fabs() 的区别辨析
    TCP服务端如何判断客户端断开连接学习
    数据结构与算法李春葆 第二章思维导图
    数据结构与算法思维导图第一章
    关于临时表的操作
    关于 ANSI_NULLS和QUOTED_IDENTIFIER
    临时表和视图的区别
    关于CancellationToken的解释
    在项目中添加signalr.js
    数据库存储过程的写法
  • 原文地址:https://www.cnblogs.com/yueminghai/p/6544775.html
Copyright © 2011-2022 走看看