I haven't had time to write up proper notes yet. The crawl covered more than 8,000 singers; at an average of about thirty songs per singer, that works out to roughly 200,000+ songs.
run.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'
from kugoumusic.spiders.kugou import KugouSpiders
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings from settings.py
settings = get_project_settings()
process = CrawlerProcess(settings=settings)

# Multiple spiders can be added with repeated crawl() calls
process.crawl(KugouSpiders)

# Start crawling; this blocks until the crawl finishes
process.start()
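For a test run you may not want to hit all 8,000+ singer pages at once. A minimal sketch of per-run overrides layered on top of the project settings; the LOG_LEVEL and CLOSESPIDER_ITEMCOUNT values here are illustrative, not part of the original project:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from kugoumusic.spiders.kugou import KugouSpiders

settings = get_project_settings()
# Per-run overrides (illustrative values): quieter logs, and stop
# after 50 items so a trial run finishes quickly
settings.set('LOG_LEVEL', 'INFO')
settings.set('CLOSESPIDER_ITEMCOUNT', 50)

process = CrawlerProcess(settings=settings)
process.crawl(KugouSpiders)
process.start()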
kugou.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Zqf'

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from kugoumusic.items import KugoumusicItem


class KugouSpiders(CrawlSpider):
    name = 'kugou'

    start_urls = ['http://www.kugou.com/']

    rules = (
        # Follow the singer list page and the per-letter index pages
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/html/singer\.html',
                                  r'http://www.kugou.com/yy/singer/index/\d-([a-z]|null)-1\.html'])),
        # Singer detail pages have numeric ids, so the pattern needs \d+, not a literal d+
        Rule(LinkExtractor(allow=[r'http://www.kugou.com/yy/singer/home/\d+\.html']),
             callback='parse_item')
    )

    def parse_item(self, response):
        # Singer name from the page header
        singer = response.xpath('//div/div[@class="clear_fix"]/strong/text()').extract_first()
        print(singer)  # debug output
        # Song titles from the song list on the singer page
        songs = response.xpath('//ul[@id="song_container"]/li//span[@class="text"]/i/text()').extract()
        print(songs)  # debug output

        item = KugoumusicItem()
        item['singer'] = singer
        item['songs'] = songs

        yield item
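Since LinkExtractor's allow patterns are regular expressions, they can be checked against sample URLs before running a full crawl. A small sanity-check sketch; the sample URLs are illustrative, only the patterns come from the spider above:

# -*- coding: utf-8 -*-
import re

# The two singer-page patterns from the rules above
index_pat = r'http://www.kugou.com/yy/singer/index/\d-([a-z]|null)-1\.html'
home_pat = r'http://www.kugou.com/yy/singer/home/\d+\.html'

# Sample URLs (illustrative) that the rules are expected to match
print(bool(re.search(index_pat, 'http://www.kugou.com/yy/singer/index/1-a-1.html')))  # True
print(bool(re.search(home_pat, 'http://www.kugou.com/yy/singer/home/3059.html')))     # True
# Letters where the numeric id should be do not match
print(bool(re.search(home_pat, 'http://www.kugou.com/yy/singer/home/dd.html')))       # False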
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class KugoumusicItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    singer = scrapy.Field()
    songs = scrapy.Field()
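A scrapy.Item behaves like a dict, which is exactly what the MongoDB pipeline below relies on. A quick sketch with illustrative field values:

from kugoumusic.items import KugoumusicItem

item = KugoumusicItem()
item['singer'] = 'Example Singer'        # illustrative value
item['songs'] = ['Song A', 'Song B']     # illustrative values

# dict(item) is the form the pipeline hands to MongoDB
print(dict(item))  # {'singer': 'Example Singer', 'songs': ['Song A', 'Song B']}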
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient


class KugoumusicPipeline(object):

    def open_spider(self, spider):
        # mongo_config = spider.settings['MONGO_CONFIG']
        self.client = MongoClient(host='127.0.0.1', port=27017)
        self.coll = self.client['student_db']['kugou']
        self.li = []

    def close_spider(self, spider):
        # Flush whatever is left in the buffer; insert_many rejects an empty list
        if self.li:
            self.insert()
        self.client.close()

    def insert(self):
        self.coll.insert_many(self.li)

    def process_item(self, item, spider):
        # Buffer items and write them to MongoDB in batches of 100;
        # appending before the flush check ensures no item is dropped
        self.li.append(dict(item))
        if len(self.li) >= 100:
            self.insert()
            self.li = []
            print("Successfully inserted a batch of 100 documents")

        return item
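The commented-out line in open_spider hints at reading the connection details from settings instead of hard-coding them. A sketch of that pattern; the pipeline name is hypothetical, and it assumes a MONGO_CONFIG dict like the one commented out in settings.py:

# -*- coding: utf-8 -*-
from pymongo import MongoClient


class MongoConfigPipeline(object):
    # Hypothetical variant: connection details come from settings.py,
    # assuming MONGO_CONFIG = {'host': '127.0.0.1', 'port': 27017}

    def open_spider(self, spider):
        cfg = spider.settings.get('MONGO_CONFIG', {})
        self.client = MongoClient(host=cfg.get('host', '127.0.0.1'),
                                  port=cfg.get('port', 27017))
        self.coll = self.client['student_db']['kugou']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.coll.insert_one(dict(item))  # one write per item, no batching
        return item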
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for kugoumusic project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'kugoumusic'

SPIDER_MODULES = ['kugoumusic.spiders']
NEWSPIDER_MODULE = 'kugoumusic.spiders'

# MONGO_CONFIG = ['192.168.62.35:1806, '
#                 '192.168.62.240:1806, '
#                 '192.168.62.23:1806, '
#                 '192.168.62.32:1806, '
#                 '192.168.62.25:1806, '
#                 '192.168.62.28:1806, '
#                 '192.168.62.241:1806']

# MONGO_CONFIG = {
#     'host': '127.0.0.1',
#     'port': 27017,
#     'user': 'root',
#     'password': '123456',
#     'db': 's1806',
#     'charset': 'utf8'
# }

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'kugoumusic (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'kugoumusic.middlewares.KugoumusicSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'kugoumusic.middlewares.KugoumusicDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'kugoumusic.pipelines.KugoumusicPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
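Once the crawl finishes, the result can be spot-checked directly in MongoDB. A small verification sketch using the same database and collection names as the pipeline; count_documents assumes pymongo 3.7 or newer:

# -*- coding: utf-8 -*-
from pymongo import MongoClient

client = MongoClient(host='127.0.0.1', port=27017)
coll = client['student_db']['kugou']

print(coll.count_documents({}))       # total singer documents stored
print(coll.find_one({}, {'_id': 0}))  # one sample {'singer': ..., 'songs': [...]} document
client.close()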