  • Recursively downloading a website with scrapy

    The spider below starts from a seed URL, extracts the title and body of each page, then follows every <a href> link back into the same parse callback, so the crawl recurses over the whole site (kept in bounds by allowed_domains).

    # encoding: utf-8
    import os
    import subprocess

    import scrapy
    from scrapy.http import Request
    # urljoin_rfc comes from the old (pre-1.0) Scrapy this post was written
    # against; current Scrapy removed it in favour of response.urljoin().
    from scrapy.utils.url import urljoin_rfc

    # Local helper module; supplies mytogb18030 used below.
    from mychardet import *

    def get_default_to_codec():
        # Default output codec helper (mytogb18030 comes from mychardet).
        return mytogb18030

    def getfirst(a):
        # Return the first element of an extract() result, or u'' when the
        # selector matched nothing.
        if not a:
            return u''
        return a[0]

    class Greasemonkey1Spider(scrapy.Spider):
        name = "test"
        allowed_domains = ["localhost"]
        start_urls = (
            'http://localhost/test',
        )

        def parseContext(self, response):
            # response.xpath() always returns a (possibly empty) selector
            # list, never None, so getfirst() is enough to guard against
            # missing nodes.
            title = getfirst(response.xpath('//title/text()').extract())
            body = getfirst(response.xpath('/html/body').extract())
            return title, body

        def parse(self, response):
            baseurl = response.url
            print 'baseurl = ', baseurl
            self.parseContext(response)

            # Follow every <a href> on the page. Duplicate requests are
            # filtered by Scrapy's dupefilter, and off-site links are dropped
            # by the OffsiteMiddleware via allowed_domains, so the recursion
            # terminates once the whole site has been visited.
            for link in response.xpath('//a'):
                href = getfirst(link.xpath('@href').extract())
                if not href:
                    continue
                item_url = urljoin_rfc(baseurl, href)
                yield Request(item_url, callback=self.parse)

    if __name__ == '__main__':
        # Raw string so the Windows path keeps its backslashes.
        cmd = r'E:\Python27\Scripts\scrapy.exe crawl --nolog test'
        cwd = os.path.split(__file__)[0]
        p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, shell=False, cwd=cwd)
        # communicate() waits for the child to exit and returns its full
        # output, so no polling loop is needed (calling communicate() again
        # inside a poll() loop would raise on the second call).
        out, err = p.communicate()
        if err:
            print err
        elif out:
            print out

        print p.returncode
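
    The listing above targets Python 2 and a pre-1.0 Scrapy API (urljoin_rfc, scrapy.spider.BaseSpider). As a rough equivalent, here is a minimal sketch of the same recursive crawl on a current stack, assuming Python 3 and Scrapy >= 2.x; the localhost seed URL is carried over from the original:

        # encoding: utf-8
        import scrapy
        from scrapy.crawler import CrawlerProcess

        class TestSpider(scrapy.Spider):
            name = "test"
            allowed_domains = ["localhost"]
            start_urls = ["http://localhost/test"]

            def parse(self, response):
                # .get(default="") replaces the extract()/getfirst() dance.
                title = response.xpath("//title/text()").get(default="")
                self.logger.info("crawled %s (title: %s)", response.url, title)

                # response.follow() resolves relative hrefs against
                # response.url, replacing the removed urljoin_rfc() helper;
                # duplicates and off-site links are filtered by Scrapy itself.
                for href in response.xpath("//a/@href").getall():
                    if href:
                        yield response.follow(href, callback=self.parse)

        if __name__ == "__main__":
            # CrawlerProcess runs the spider in-process, so no subprocess
            # call to scrapy.exe is needed.
            process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
            process.crawl(TestSpider)
            process.start()

    Saved as e.g. modern_test.py (a hypothetical file name), this runs directly with python modern_test.py and needs no scrapy.cfg project scaffolding.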

  • Original post: https://www.cnblogs.com/zhang-pengcheng/p/4287293.html