zoukankan      html  css  js  c++  java
  • 使用scrapy 爬取酷狗音乐歌手及歌曲名并存入mongodb中

    备注还没来得及写,共爬取八千多的歌手,每名歌手平均三十首歌曲算,大概二十多万首歌曲

    run.py

     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 __author__ = 'Zqf'
     4 from dingdian_simple.spiders.dingdian_spider import DingdianSimpleSpider
     5 from scrapy.crawler import CrawlerProcess
     6 from scrapy.utils.project import get_project_settings
     7 
     8 # 获取settings.py模块的设置
     9 settings = get_project_settings()
    10 process = CrawlerProcess(settings=settings)
    11 
    12 # 可以添加多个spider
    13 process.crawl(DingdianSimpleSpider)
    14 
    15 # 启动爬虫,会阻塞,直到爬取完成
    16 process.start()

    kugou.py

     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 import re
     4 
     5 __author__ = 'Zqf'
     6 
     7 import scrapy
     8 from kugoumusic.items import KugoumusicItem
     9 from scrapy.linkextractors import LinkExtractor
    10 from scrapy.spiders import Rule
    11 
    12 
    13 class KugouSpiders(scrapy.spiders.CrawlSpider):
    14     name = 'kugou'
    15     
    16     start_urls = ['http://www.kugou.com/']
    17 
    18     rules = (
    19         Rule(LinkExtractor(allow=['http://www.kugou.com/yy/html/singer.html',
    20                                   'http://www.kugou.com/yy/singer/index/d-([a-z]|null)-1.html'])),
    21         Rule(LinkExtractor(allow=['http://www.kugou.com/yy/singer/home/d+.html']), callback='parse_item')
    22     )
    23     
    24     def parse_item(self, response):
    25         singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
    26         print(singer)
    27         songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
    28         print(songs)
    29     
    30         item = KugoumusicItem()
    31         item['singer'] = singer
    32         item['songs'] = songs
    33         
    34         yield item

    items.py

     1 # -*- coding: utf-8 -*-
     2 
     3 # Define here the models for your scraped items
     4 #
     5 # See documentation in:
     6 # https://doc.scrapy.org/en/latest/topics/items.html
     7 
     8 import scrapy
     9 
    10 
class KugoumusicItem(scrapy.Item):
    """One scraped singer: the singer's name and their song titles."""
    # Singer name text taken from the singer's home page.
    singer = scrapy.Field()
    # List of song-title strings belonging to that singer.
    songs = scrapy.Field()

    pipelines.py

     1 # -*- coding: utf-8 -*-
     2 
     3 # Define your item pipelines here
     4 #
     5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
     6 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
     7 from pymongo import MongoClient
     8 
     9 
    10 class KugoumusicPipeline(object):
    11     
    12     def open_spider(self, spider):
    13         # mongo_config = spider.settings['MONGO_CONFIG']
    14         # host = '127.0.0.1', port = 27017
    15         self.client = MongoClient(host='127.0.0.1', port=27017)
    16         self.coll = self.client['student_db']['kugou']
    17         self.li = []
    18         
    19     def close_spider(self, spider):
    20         self.insert()
    21         self.client.close()
    22         
    23     def insert(self):
    24         self.coll.insert_many(self.li)
    25     
    26     def process_item(self, item, spider):
    27         if len(self.li) >= 100:
    28             self.insert()
    29             self.li = []
    30             print("成功插入100条数据-------------------------------------")
    31         else:
    32             self.li.append(dict(item))
    33         
    34         return item

    settings.py

      1 # -*- coding: utf-8 -*-
      2 
      3 # Scrapy settings for kugoumusic project
      4 #
      5 # For simplicity, this file contains only settings considered important or
      6 # commonly used. You can find more settings consulting the documentation:
      7 #
      8 #     https://doc.scrapy.org/en/latest/topics/settings.html
      9 #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
     10 #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
     11 
     12 BOT_NAME = 'kugoumusic'
     13 
     14 SPIDER_MODULES = ['kugoumusic.spiders']
     15 NEWSPIDER_MODULE = 'kugoumusic.spiders'
     16 
     17 # MONGO_CONFIG = ['192.168.62.35:1806, '
     18 #               '192.168.62.240:1806, '
     19 #               '192.168.62.23:1806, '
     20 #               '192.168.62.32:1806, '
     21 #               '192.168.62.25:1806, '
     22 #               '192.168.62.28:1806, '
     23 #               '192.168.62.241:1806']
     24 
     25 # MONGO_CONFIG = {
     26 #     'host': '127.0.0.1',
     27 #     'port': 27017
     28     # 'user': 'root',
     29     # 'password': '123456',
     30     # 'db': 's1806',
     31     # 'charset': 'utf8'
     32 # }
     33 # Crawl responsibly by identifying yourself (and your website) on the user-agent
     34 #USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'
     35 
     36 # Obey robots.txt rules
     37 ROBOTSTXT_OBEY = False
     38 
     39 # Configure maximum concurrent requests performed by Scrapy (default: 16)
     40 #CONCURRENT_REQUESTS = 32
     41 
     42 # Configure a delay for requests for the same website (default: 0)
     43 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
     44 # See also autothrottle settings and docs
     45 #DOWNLOAD_DELAY = 3
     46 # The download delay setting will honor only one of:
     47 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
     48 #CONCURRENT_REQUESTS_PER_IP = 16
     49 
     50 # Disable cookies (enabled by default)
     51 #COOKIES_ENABLED = False
     52 
     53 # Disable Telnet Console (enabled by default)
     54 #TELNETCONSOLE_ENABLED = False
     55 
     56 # Override the default request headers:
     57 DEFAULT_REQUEST_HEADERS = {
     58     'Connection': 'keep-alive',
     59     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
     60     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
     61     'Accept-Encoding': 'gzip, deflate, br',
     62     'Accept-Language': 'zh-CN,zh;q=0.9',
     63 }
     64 
     65 # Enable or disable spider middlewares
     66 # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
     67 #SPIDER_MIDDLEWARES = {
     68 #    'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
     69 #}
     70 
     71 # Enable or disable downloader middlewares
     72 # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
     73 #DOWNLOADER_MIDDLEWARES = {
     74 #    'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
     75 #}
     76 
     77 # Enable or disable extensions
     78 # See https://doc.scrapy.org/en/latest/topics/extensions.html
     79 #EXTENSIONS = {
     80 #    'scrapy.extensions.telnet.TelnetConsole': None,
     81 #}
     82 
     83 # Configure item pipelines
     84 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
     85 ITEM_PIPELINES = {
     86    'kugoumusic.pipelines.KugoumusicPipeline': 300,
     87 }
     88 
     89 # Enable and configure the AutoThrottle extension (disabled by default)
     90 # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
     91 #AUTOTHROTTLE_ENABLED = True
     92 # The initial download delay
     93 #AUTOTHROTTLE_START_DELAY = 5
     94 # The maximum download delay to be set in case of high latencies
     95 #AUTOTHROTTLE_MAX_DELAY = 60
     96 # The average number of requests Scrapy should be sending in parallel to
     97 # each remote server
     98 #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
     99 # Enable showing throttling stats for every response received:
    100 #AUTOTHROTTLE_DEBUG = False
    101 
    102 # Enable and configure HTTP caching (disabled by default)
    103 # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    104 #HTTPCACHE_ENABLED = True
    105 #HTTPCACHE_EXPIRATION_SECS = 0
    106 #HTTPCACHE_DIR = 'httpcache'
    107 #HTTPCACHE_IGNORE_HTTP_CODES = []
    108 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
  • 相关阅读:
    Understanding about Baire Category Theorem
    Isometric embedding of metric space
    Convergence theorems for measurable functions
    Mindmap for "Principles of boundary element methods"
    Various formulations of Maxwell equations
    Existence and uniqueness theorems for variational problems
    Kernels and image sets for an operator and its dual
    [loj6498]农民
    [luogu3781]切树游戏
    [atAGC051B]Three Coins
  • 原文地址:https://www.cnblogs.com/tttzqf/p/9638545.html
Copyright © 2011-2022 走看看