zoukankan      html  css  js  c++  java
  • python 爬虫 小说

    #encoding:utf8

    import re

    import urllib2

    # Download every chapter linked from one novel's index page on 23us.com
    # and append each chapter's title and text to a local file.
    #
    # Flow:
    #   1. Fetch the index page and collect the chapter hrefs from it.
    #   2. For each chapter, fetch the raw bytes, decode them using the
    #      charset the page declares (falling back to gb2312), and pull out
    #      the <title> and the <dd id="contents"> body.
    #   3. Append the cleaned-up text, UTF-8 encoded, to the output file.
    base_url = 'http://www.23us.com/html/55/55304/'

    # Patterns are compiled once here instead of on every loop iteration.
    chapter_link_pattern = re.compile('<td class="L"><a href="(.*?)">.*?</a></td>',re.S)
    charset_pattern = re.compile('.*?content="text.html; charset=(.*?)".*?',re.S)
    chapter_pattern = re.compile('<title>(.*?)</title>.*?<dd id="contents">(.*?)</dd>',re.S)

    request = urllib2.Request(base_url)
    response = urllib2.urlopen(request)
    index_page = response.read().decode('gbk')

    chapter_links = chapter_link_pattern.findall(index_page)

    for chapter_href in chapter_links:
        print(chapter_href)

        # Chapter hrefs are appended directly to the index URL.
        url = base_url + chapter_href
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        raw_page = response.read()

        # Try the charset declared in the page first, then gb2312.  A page
        # with no charset declaration no longer aborts the whole run with an
        # IndexError; it simply goes straight to the fallback.
        declared_charsets = charset_pattern.findall(raw_page)
        candidate_encodings = declared_charsets[:1] + ['gb2312']

        content = None
        for encoding in candidate_encodings:
            try:
                content = raw_page.decode(encoding)
                break
            except (UnicodeDecodeError, LookupError):
                # UnicodeDecodeError: the bytes are not valid in this encoding.
                # LookupError: the declared charset name is unknown to Python.
                continue

        if content is None:
            # No candidate encoding worked; skip this chapter.
            continue

        chapters = chapter_pattern.findall(content)

        if not chapters:
            print('采集失败')
            print(content)
            continue

        # The file is opened only when there is something to write, and the
        # with-block guarantees it is closed even if a write fails.
        with open('小说.txt','a+') as output_file:
            for title, body in chapters:
                body = body.replace('&nbsp;','').replace('<br/> <br/>',' ')
                text = ' ' + title + ' ' + ' ' + body
                output_file.write(text.encode('utf-8'))

  • 相关阅读:
    CentOS系统下的数据盘挂载
    在iOS微信浏览器中自动播放HTML5 audio(音乐)的2种正确方式
    C盘占用过满问题
    大量ECAgent报错
    微信电脑版不断崩溃
    java web 在tomcat没有正常输出
    文件解压缩失败
    在myeclipse安装beyond插件
    限时免费 GoodSync 10 同步工具【转】
    soapUI的bug切换版本解决
  • 原文地址:https://www.cnblogs.com/zhanglong68/p/6546754.html
Copyright © 2011-2022 走看看