  • An end-to-end Scrapy project

    1. Create a Scrapy project

    scrapy startproject SpiderAnything
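
    startproject generates the usual Scrapy layout; the files edited in the following steps all live in the inner package:

    SpiderAnything/
        scrapy.cfg            # deploy configuration
        SpiderAnything/
            __init__.py
            items.py          # step 4
            middlewares.py    # step 3
            pipelines.py      # step 5
            settings.py       # steps 3 and 5
            spiders/          # step 6
                __init__.py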

    2. Generate a spider (the first argument is the spider's name, the second the domain it is allowed to crawl)

    scrapy genspider tb 'taobao.com'
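
    genspider writes a skeleton spider to spiders/tb.py; depending on the Scrapy version it looks roughly like this:

    import scrapy


    class TbSpider(scrapy.Spider):
        name = 'tb'
        allowed_domains = ['taobao.com']
        start_urls = ['http://taobao.com/']

        def parse(self, response):
            pass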

      # Start the spider, or create a .py launcher file (note: on Windows, `start /b python file.py` keeps it running in the background)

    from scrapy import cmdline
    
    # cmdline.execute("scrapy crawl qxs_esjzone -L DEBUG".split())
    # cmdline.execute("scrapy crawl qxs_esjzone -L INFO".split())
    print("start crawl Light fiction by qxs_esjzone...")
    cmdline.execute("scrapy crawl qxs_esjzone".split())

    3. Set a random User-Agent

    In middlewares.py:

    import random


    class RandomUAMiddleware:
        def process_request(self, request, spider):
            # pick a User-Agent at random from the list in settings.py
            ua = random.choice(spider.settings.get('USER_AGENT_LIST'))
            request.headers["User-Agent"] = ua

    Add this to settings.py:

    USER_AGENT_LIST = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" 
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

    Register the middleware, also in settings.py:

    DOWNLOADER_MIDDLEWARES = {
       'SpiderJD.middlewares.SpiderjdDownloaderMiddleware': 543,
       'SpiderJD.middlewares.RandomUAMiddleware': 400,  # register the random UA middleware
    }
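
    To confirm the rotation is working, a minimal check is to print the header that actually went out on each request from any spider callback (the spider in step 6 does the same thing in shop_next):

    def parse(self, response):
        # the value set by RandomUAMiddleware ends up on the outgoing request
        print(response.request.headers.get('User-Agent'))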

    4. Define the item in items.py

    import scrapy
    
    
    class SpiderjdItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        menu = scrapy.Field()
        shop_name = scrapy.Field()  # used by the spider script in step 6
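
    An Item behaves like a dict that only accepts the declared fields, which is how both the spider in step 6 and the pipeline in step 5 use it. A quick sketch (the values and the SpiderJD package name are illustrative):

    from SpiderJD.items import SpiderjdItem

    item = SpiderjdItem()
    item['menu'] = 'books'           # only declared fields may be used as keys
    item['shop_name'] = 'some shop'
    print(dict(item))                # convert to a plain dict, e.g. before a DB insert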

    5. pipelines.py

    import pymysql
    
    
    class SpiderjdPipeline:
    
        def process_item(self, item, spider):
            # collection.insert(dict(item))
            print(item)  # receives the item yielded by the spider (step 6); run database operations here
            # sql = """
            #     insert into shop_info(menu,shop_name) values('%s','%s')""" % (
            #         item['menu'],
            #         item['shop_name']
            #     )
            # print(sql)
            # self.cursor.execute(sql)
            return item
    
        def open_spider(self, spider):
            # connect to the database
            self.connect = pymysql.connect(
                host='127.0.0.1',
                port=3306,
                db='renren',
                user='root',
                passwd='123456',
                charset='utf8',
                use_unicode=True)
    
            # use a cursor for inserts, deletes, queries, and updates
            self.cursor = self.connect.cursor()
            self.connect.autocommit(True)
    
        def close_spider(self, spider):
            self.cursor.close()
            self.connect.close()
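
    The commented-out SQL in process_item builds the statement by string formatting; pymysql can bind the values itself, which avoids quoting problems. A sketch, assuming a shop_info table with these columns exists:

        def process_item(self, item, spider):
            sql = "insert into shop_info(menu, shop_name) values (%s, %s)"
            self.cursor.execute(sql, (item['menu'], item['shop_name']))  # pymysql escapes the values
            return item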

    6. Write the spider

    class JdSpider(scrapy.Spider):
        name = 'tb'
        # allowed_domains = ['jd.com', 'p.3.cn']
        # start_urls = ['https://book.jd.com/booksort.html']
        allowed_domains = ['taobao.com']
        start_urls = ['https://www.taobao.com/']
    
        def parse(self, response):
            # handle the response for start_urls; extract() pulls the text out of selectors
            # res = response.xpath("//div[@class='li_txt']//h3/text()").extract()

            # group the results
            item = SpiderjdItem()  # the item defined in step 4
            li_list = response.xpath("//ul[@class='service-bd']/li")
            for li in li_list:
                # sent to the pipeline (enable ITEM_PIPELINES in settings.py)
                item['menu'] = li.xpath(".//a/text()").extract_first()  # first match, or None if nothing matched
                # yield into the pipeline to keep memory usage low
                yield item  # a callback may yield a Request, an Item, a dict, or None

                # crawl one level deeper
                next_url = li.xpath(".//a/@href").extract_first()
                if next_url != "javascript:;":
                    next_url = 'https:' + next_url
                    print("next_url: ", next_url)
                    yield scrapy.Request(
                        next_url,
                        meta={'item': item},      # pass the item built here on to the next callback
                        callback=self.shop_next   # note: the callback may never fire if robots.txt forbids the URL; set ROBOTSTXT_OBEY = False in settings.py
                    )

        def shop_next(self, response):
            item = response.meta['item']
            print(response.request.headers["User-Agent"])
            li_list = response.xpath("//div[@class='mui-zebra-module']/div[1]/div")  # XPath positions start at 1, so div[1] is the first child
            print(li_list, '?2?')
            i = 1
            for li in li_list:
                item['shop_name'] = li.xpath(".//span[@class='text']/text()").extract_first()  # first match, or None
                print(item['shop_name'], i, "+", i)
                i += 1
                # yield into the pipeline to keep memory usage low
                yield item  # a callback may yield a Request, an Item, a dict, or None
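
    For the pipeline to receive these items, and for the follow-up Request above not to be filtered by robots.txt, settings.py needs something like the following (the pipeline path assumes the same SpiderJD package used for the middleware):

    ITEM_PIPELINES = {
        'SpiderJD.pipelines.SpiderjdPipeline': 300,
    }
    ROBOTSTXT_OBEY = False  # otherwise Scrapy may drop requests that robots.txt forbids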

    7. Run the spider

    
    scrapy list    # show which spiders are available
    
    scrapy crawl tb
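
    If you only want the scraped items in a file and don't need the MySQL pipeline, Scrapy's feed export can write them directly (the filename is arbitrary):

    scrapy crawl tb -o shops.json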

    8. Debug the spider

    Create a mainDEBUG.py file in the project directory:

    from scrapy import cmdline
    
    # cmdline.execute("scrapy crawl tb -L WARNING".split())
    cmdline.execute("scrapy crawl tb".split())

    Then in PyCharm: Run - Edit Configurations - "+" - choose Python

    Name: any name you like

    Script path: the mainDEBUG.py file created above

    Apply, and the spider can now be run in debug mode.
