zoukankan      html  css  js  c++  java
  • 使用scrapy 爬取酷狗音乐歌手及歌曲名并存入mongodb中

    备注还没来得及写。共爬取了八千多名歌手,按每名歌手平均三十首歌曲计算,大约有二十多万首歌曲。

    run.py

     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 __author__ = 'Zqf'
     4 from dingdian_simple.spiders.dingdian_spider import DingdianSimpleSpider
     5 from scrapy.crawler import CrawlerProcess
     6 from scrapy.utils.project import get_project_settings
     7 
     8 # 获取settings.py模块的设置
     9 settings = get_project_settings()
    10 process = CrawlerProcess(settings=settings)
    11 
    12 # 可以添加多个spider
    13 process.crawl(DingdianSimpleSpider)
    14 
    15 # 启动爬虫,会阻塞,直到爬取完成
    16 process.start()

    kugou.py

     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 import re
     4 
     5 __author__ = 'Zqf'
     6 
     7 import scrapy
     8 from kugoumusic.items import KugoumusicItem
     9 from scrapy.linkextractors import LinkExtractor
    10 from scrapy.spiders import Rule
    11 
    12 
    class KugouSpiders(scrapy.spiders.CrawlSpider):
        """Crawl kugou.com and yield one item per singer home page.

        Crawling starts at the site home page.  The first rule only follows
        links (singer overview and per-letter index pages); the second rule
        passes every singer home page to :meth:`parse_item`.
        """

        name = 'kugou'

        start_urls = ['http://www.kugou.com/']

        # The patterns are raw strings so that ``\d`` reaches the regex engine
        # as "digit"; a bare ``d`` only matches the letter "d" and never the
        # numeric ids used in the URLs.  Dots are escaped and both http and
        # https links are accepted.
        rules = (
            # Singer overview and index pages: no callback, links are followed.
            Rule(LinkExtractor(allow=[
                r'https?://www\.kugou\.com/yy/html/singer\.html',
                r'https?://www\.kugou\.com/yy/singer/index/\d-([a-z]|null)-1\.html',
            ])),
            # Singer home pages: scraped by parse_item.
            Rule(LinkExtractor(allow=[
                r'https?://www\.kugou\.com/yy/singer/home/\d+\.html',
            ]), callback='parse_item'),
        )

        def parse_item(self, response):
            """Extract the singer name and the listed song titles from a page.

            :param response: response for a singer home page.
            :returns: generator yielding a single ``KugoumusicItem`` whose
                ``singer`` is the first matching text node (``None`` when the
                XPath matches nothing) and whose ``songs`` is a list of titles.
            """
            singer = response.xpath(
                '//div/div[@class="clear_fix"]/strong/text()').extract_first()
            songs = response.xpath(
                '//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
            # Use the spider logger rather than print so the output honours
            # the configured log level.
            self.logger.debug('singer=%s songs=%s', singer, songs)

            item = KugoumusicItem()
            item['singer'] = singer
            item['songs'] = songs

            yield item

    items.py

     1 # -*- coding: utf-8 -*-
     2 
     3 # Define here the models for your scraped items
     4 #
     5 # See documentation in:
     6 # https://doc.scrapy.org/en/latest/topics/items.html
     7 
     8 import scrapy
     9 
    10 
    class KugoumusicItem(scrapy.Item):
        """Container for the data scraped from one singer page."""

        # Singer name as extracted by the spider.
        singer = scrapy.Field()
        # Song titles extracted from the same page.
        songs = scrapy.Field()

    pipelines.py

     1 # -*- coding: utf-8 -*-
     2 
     3 # Define your item pipelines here
     4 #
     5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
     6 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
     7 from pymongo import MongoClient
     8 
     9 
    class KugoumusicPipeline(object):
        """Buffer scraped items and write them to MongoDB in batches.

        Items are collected in ``self.li`` and flushed to the
        ``student_db.kugou`` collection with ``insert_many`` once
        ``BATCH_SIZE`` items have accumulated; the remainder is flushed when
        the spider closes.
        """

        # Number of buffered items that triggers a bulk insert.
        BATCH_SIZE = 100

        def open_spider(self, spider):
            """Connect to the local MongoDB server and start an empty buffer."""
            self.client = MongoClient(host='127.0.0.1', port=27017)
            self.coll = self.client['student_db']['kugou']
            self.li = []

        def close_spider(self, spider):
            """Flush whatever is still buffered, then close the connection."""
            try:
                self.insert()
            finally:
                # Close the client even if the final flush raises.
                self.client.close()

        def insert(self):
            """Bulk-insert the buffered documents and reset the buffer.

            Does nothing when the buffer is empty, because ``insert_many``
            rejects an empty document list.
            """
            if not self.li:
                return
            self.coll.insert_many(self.li)
            self.li = []

        def process_item(self, item, spider):
            """Buffer ``item`` and flush once the batch is full.

            The item is appended before the size check so that the item which
            fills the batch is stored too instead of being dropped.

            :returns: the unchanged ``item`` for later pipeline stages.
            """
            self.li.append(dict(item))
            if len(self.li) >= self.BATCH_SIZE:
                self.insert()
                print("成功插入100条数据-------------------------------------")

            return item

    settings.py

      1 # -*- coding: utf-8 -*-
      2 
      3 # Scrapy settings for kugoumusic project
      4 #
      5 # For simplicity, this file contains only settings considered important or
      6 # commonly used. You can find more settings consulting the documentation:
      7 #
      8 #     https://doc.scrapy.org/en/latest/topics/settings.html
      9 #     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
     10 #     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
     11 
     12 BOT_NAME = 'kugoumusic'
     13 
     14 SPIDER_MODULES = ['kugoumusic.spiders']
     15 NEWSPIDER_MODULE = 'kugoumusic.spiders'
     16 
     17 # MONGO_CONFIG = ['192.168.62.35:1806, '
     18 #               '192.168.62.240:1806, '
     19 #               '192.168.62.23:1806, '
     20 #               '192.168.62.32:1806, '
     21 #               '192.168.62.25:1806, '
     22 #               '192.168.62.28:1806, '
     23 #               '192.168.62.241:1806']
     24 
     25 # MONGO_CONFIG = {
     26 #     'host': '127.0.0.1',
     27 #     'port': 27017
     28     # 'user': 'root',
     29     # 'password': '123456',
     30     # 'db': 's1806',
     31     # 'charset': 'utf8'
     32 # }
     33 # Crawl responsibly by identifying yourself (and your website) on the user-agent
     34 #USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'
     35 
     36 # Obey robots.txt rules
     37 ROBOTSTXT_OBEY = False
     38 
     39 # Configure maximum concurrent requests performed by Scrapy (default: 16)
     40 #CONCURRENT_REQUESTS = 32
     41 
     42 # Configure a delay for requests for the same website (default: 0)
     43 # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
     44 # See also autothrottle settings and docs
     45 #DOWNLOAD_DELAY = 3
     46 # The download delay setting will honor only one of:
     47 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
     48 #CONCURRENT_REQUESTS_PER_IP = 16
     49 
     50 # Disable cookies (enabled by default)
     51 #COOKIES_ENABLED = False
     52 
     53 # Disable Telnet Console (enabled by default)
     54 #TELNETCONSOLE_ENABLED = False
     55 
     56 # Override the default request headers:
     57 DEFAULT_REQUEST_HEADERS = {
     58     'Connection': 'keep-alive',
     59     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
     60     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
     61     'Accept-Encoding': 'gzip, deflate, br',
     62     'Accept-Language': 'zh-CN,zh;q=0.9',
     63 }
     64 
     65 # Enable or disable spider middlewares
     66 # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
     67 #SPIDER_MIDDLEWARES = {
     68 #    'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
     69 #}
     70 
     71 # Enable or disable downloader middlewares
     72 # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
     73 #DOWNLOADER_MIDDLEWARES = {
     74 #    'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
     75 #}
     76 
     77 # Enable or disable extensions
     78 # See https://doc.scrapy.org/en/latest/topics/extensions.html
     79 #EXTENSIONS = {
     80 #    'scrapy.extensions.telnet.TelnetConsole': None,
     81 #}
     82 
     83 # Configure item pipelines
     84 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
     85 ITEM_PIPELINES = {
     86    'kugoumusic.pipelines.KugoumusicPipeline': 300,
     87 }
     88 
     89 # Enable and configure the AutoThrottle extension (disabled by default)
     90 # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
     91 #AUTOTHROTTLE_ENABLED = True
     92 # The initial download delay
     93 #AUTOTHROTTLE_START_DELAY = 5
     94 # The maximum download delay to be set in case of high latencies
     95 #AUTOTHROTTLE_MAX_DELAY = 60
     96 # The average number of requests Scrapy should be sending in parallel to
     97 # each remote server
     98 #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
     99 # Enable showing throttling stats for every response received:
    100 #AUTOTHROTTLE_DEBUG = False
    101 
    102 # Enable and configure HTTP caching (disabled by default)
    103 # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    104 #HTTPCACHE_ENABLED = True
    105 #HTTPCACHE_EXPIRATION_SECS = 0
    106 #HTTPCACHE_DIR = 'httpcache'
    107 #HTTPCACHE_IGNORE_HTTP_CODES = []
    108 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
  • 相关阅读:
    js中new的本质
    js中真伪数组转换
    2 DC电参数测试 (1)
    1 开短路测试
    2月书单 《编码隐匿在计算机软硬件背后的语言》 21-25章
    2月书单 《编码隐匿在计算机软硬件背后的语言》 17-20章
    时间的掌控
    数码管的秘密
    会眨眼的小灯
    点亮一盏灯
  • 原文地址:https://www.cnblogs.com/tttzqf/p/9638545.html
Copyright © 2011-2022 走看看