zoukankan      html  css  js  c++  java
  • scrapy

    # -*- coding: utf-8 -*-
    import scrapy
    import chardet
    from scrapy.spider import BaseSpider
    from scrapy.selector import HtmlXPathSelector
    from scrapy.utils.url import urljoin_rfc
    from scrapy.http import Request

    class Greasemonkey1Spider(scrapy.Spider):
        """Crawl wiki.greasespot.net, recursively following every usable anchor link.

        For each page, prints the anchors it finds (debug output) and yields a
        new Request per link so the whole wiki is traversed breadth-first by
        Scrapy's scheduler. `allowed_domains` keeps the crawl on-site.
        """
        name = "greasemonkey1"
        allowed_domains = ["wiki.greasespot.net"]
        start_urls = (
            'http://wiki.greasespot.net/',
        )

        def parse(self, response):
            """Extract anchor text/href pairs from *response* and follow them.

            Yields:
                scrapy.Request for every anchor that has both link text and a
                non-fragment href.
            """
            baseurl = response.url
            # Python 3 print() — the original Python 2 `print x, y` statements
            # are syntax errors on every interpreter modern Scrapy supports.
            print('baseurl  = ', baseurl)

            for anchor in response.xpath(r'//a'):
                titles = anchor.xpath(r'text()').extract()
                urls = anchor.xpath(r'@href').extract()
                # Skip anchors missing either a text node or an href attribute.
                if not titles or not urls:
                    continue
                title = titles[0]
                url = urls[0]
                if not title or not url:
                    continue
                # Ignore in-page fragment links (e.g. "#section") — they would
                # just re-crawl the current page.
                if url.startswith('#'):
                    continue
                print('2222', title, url)
    #
                # response.urljoin() resolves relative hrefs against the page
                # URL; it replaces scrapy.utils.url.urljoin_rfc, which was
                # deprecated and removed from Scrapy.
                url2 = response.urljoin(url)
                print('=== ', url2)
                yield scrapy.Request(url2, callback=self.parse)

  • 相关阅读:
    直接插入排序
    希尔排序
    堆排序
    红黑树
    hashMap原理
    JAVA随笔4
    JAVA随笔3(集合框架,流)
    Linux环境下如何生成core文件
    Centos6 升级glibc-2.17,解决Requires: libc.so.6(GLIBC_2.14)(64bit)错误解决方法
    MediaWiki搭建步骤
  • 原文地址:https://www.cnblogs.com/zhang-pengcheng/p/4223074.html
Copyright © 2011-2022 走看看