  • Recursively downloading a website with scrapy

    The spider below starts from a seed URL, extracts the title and body of each page, then follows every <a href> link back into the same parse callback, so the crawl recurses over the whole site (kept in bounds by allowed_domains).

    # encoding: utf-8
    import os
    import subprocess

    import scrapy
    from scrapy.http import Request
    # urljoin_rfc comes from the old (pre-1.0) Scrapy this post was written
    # against; current Scrapy removed it in favour of response.urljoin().
    from scrapy.utils.url import urljoin_rfc

    # Local helper module; supplies mytogb18030 used below.
    from mychardet import *

    def get_default_to_codec():
        # Default output codec helper (mytogb18030 comes from mychardet).
        return mytogb18030

    def getfirst(a):
        # Return the first element of an extract() result, or u'' when the
        # selector matched nothing.
        if not a:
            return u''
        return a[0]

    class Greasemonkey1Spider(scrapy.Spider):
        name = "test"
        allowed_domains = ["localhost"]
        start_urls = (
            'http://localhost/test',
        )

        def parseContext(self, response):
            # response.xpath() always returns a (possibly empty) selector
            # list, never None, so getfirst() is enough to guard against
            # missing nodes.
            title = getfirst(response.xpath('//title/text()').extract())
            body = getfirst(response.xpath('/html/body').extract())
            return title, body

        def parse(self, response):
            baseurl = response.url
            print 'baseurl = ', baseurl
            self.parseContext(response)

            # Follow every <a href> on the page. Duplicate requests are
            # filtered by Scrapy's dupefilter, and off-site links are dropped
            # by the OffsiteMiddleware via allowed_domains, so the recursion
            # terminates once the whole site has been visited.
            for link in response.xpath('//a'):
                href = getfirst(link.xpath('@href').extract())
                if not href:
                    continue
                item_url = urljoin_rfc(baseurl, href)
                yield Request(item_url, callback=self.parse)

    if __name__ == '__main__':
        # Raw string so the Windows path keeps its backslashes.
        cmd = r'E:\Python27\Scripts\scrapy.exe crawl --nolog test'
        cwd = os.path.split(__file__)[0]
        p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE, shell=False, cwd=cwd)
        # communicate() waits for the child to exit and returns its full
        # output, so no polling loop is needed (calling communicate() again
        # inside a poll() loop would raise on the second call).
        out, err = p.communicate()
        if err:
            print err
        elif out:
            print out

        print p.returncode
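
    The listing above targets Python 2 and a pre-1.0 Scrapy API (urljoin_rfc, scrapy.spider.BaseSpider). As a rough equivalent, here is a minimal sketch of the same recursive crawl on a current stack, assuming Python 3 and Scrapy >= 2.x; the localhost seed URL is carried over from the original:

        # encoding: utf-8
        import scrapy
        from scrapy.crawler import CrawlerProcess

        class TestSpider(scrapy.Spider):
            name = "test"
            allowed_domains = ["localhost"]
            start_urls = ["http://localhost/test"]

            def parse(self, response):
                # .get(default="") replaces the extract()/getfirst() dance.
                title = response.xpath("//title/text()").get(default="")
                self.logger.info("crawled %s (title: %s)", response.url, title)

                # response.follow() resolves relative hrefs against
                # response.url, replacing the removed urljoin_rfc() helper;
                # duplicates and off-site links are filtered by Scrapy itself.
                for href in response.xpath("//a/@href").getall():
                    if href:
                        yield response.follow(href, callback=self.parse)

        if __name__ == "__main__":
            # CrawlerProcess runs the spider in-process, so no subprocess
            # call to scrapy.exe is needed.
            process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
            process.crawl(TestSpider)
            process.start()

    Saved as e.g. modern_test.py (a hypothetical file name), this runs directly with python modern_test.py and needs no scrapy.cfg project scaffolding.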

  • Original post: https://www.cnblogs.com/zhang-pengcheng/p/4287293.html