# spider file
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
import json
import string
import random
from xpc.items import PostItem, CommentItem, CopyItem  # several item types


def strip(s):
    # strip whitespace if s exists, otherwise return an empty string
    if s:
        return s.strip()
    return ""


# When requests are sent with scrapy.Request or scrapy.FormRequest, cookies are
# kept and re-sent automatically by default.
# The login itself is not simulated with the Scrapy framework: log in once with
# the requests module (or the browser) and copy the Authorization cookie that
# the site returns.
cookies = dict(
    Authorization='4F635191B0602B5D3B06024483B0602AAF8B06023C2F6259656D'
)


# generate a 26-character session id made of lowercase letters and digits
def gen_sessionid():
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=26))


class XinpianchangSpider(scrapy.Spider):
    name = 'XinPianChang'
    allowed_domains = ['xinpianchang.com', 'openapi-vtom.vmovier.com']
    start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=tabArticle']
    # If crawling starts from a later page (e.g. page 21), the first request must
    # already carry cookies; the single cookie above is no longer enough (the site
    # sets four cookies), so start_requests has to be overridden as sketched below.
    # start_urls = ['https://www.xinpianchang.com/channel/index/type-/sort-like/duration_type-0/resolution_type-/page-21']
    page_count = 0

    # Override the parent class's start_requests method, which by default sends a
    # GET request to every url in start_urls.
    # def start_requests(self):
    #     for url in self.start_urls:
    #         # data = {
    #         #     "kw": "cat"
    #         # }
    #         # to send a POST request instead, use FormRequest:
    #         # yield scrapy.FormRequest(url=url, callback=self.parse, formdata=data)
    #         c = cookies.copy()
    #         c.update(PHPSESSID=gen_sessionid(),
    #                  SERVER_ID='b52601c8-285bdd26',
    #                  channel_page='apU%3D')
    #         yield Request(url, cookies=c, dont_filter=True)

    def parse(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        self.page_count += 1
        if self.page_count >= 100:
            # rotate the session id every 100 list pages
            cookies.update(PHPSESSID=gen_sessionid())
            self.page_count = 0

        url_list = response.xpath('//ul[@class="video-list"]/li/@data-articleid').extract()
        for pid in url_list:
            detail_url = 'https://www.xinpianchang.com/a{}?from=ArticleList'.format(pid)
            request = response.follow(detail_url, callback=self.parse_post)
            request.meta['pid'] = pid
            yield request  # request the detail page of this post

        # pagination links of the list page (relative, incomplete paths)
        pages = response.xpath('//div[@class="page"]/a/@href').extract()
        for page_url in pages:
            yield response.follow(page_url, self.parse, cookies=cookies)

    def parse_post(self, response):
        pid = response.meta['pid']
        post = PostItem()
        post['pid'] = pid
        post['title'] = response.xpath('//div[@class="title-wrap"]/h3/text()').get()
        # the video API url looks like:
        # https://openapi-vtom.vmovier.com/v3/video/5E34203E92450?expand=resource&usage=xpc_web
        # response.text is the raw HTML source returned by the page
        vid = re.findall('vid: "(.*?)",', response.text)[0]
        video_url = 'https://openapi-vtom.vmovier.com/v3/video/{}?expand=resource&usage=xpc_web'.format(vid)
        cates = response.xpath('//span[@class="cate v-center"]/a/text()').extract()
        post['category'] = ''.join([cate.strip() for cate in cates])
        post['create_time'] = response.xpath('//span[contains(@class,"update-time")]/i/text()').get()
        post['play_count'] = response.xpath('//i[contains(@class,"play-counts")]/text()').get()
        desc_lst = response.xpath('//p[contains(@class,"desc")]//text()').extract()
        post['desc'] = ' '.join([i.strip() for i in desc_lst])

        # extra step: the real video address has to be fetched from the video API
        request = Request(video_url, callback=self.parse_video)
        # pass the partially filled post item to the next callback via request.meta
        request.meta['post'] = post
        yield request

        # comment API, e.g.
        # https://app.xinpianchang.com/comments?resource_id=10664352&type=article&page=1&per_page=24
        comment_url = "https://app.xinpianchang.com/comments?resource_id={}&type=article&page=1&per_page=24".format(pid)
        request = Request(comment_url, callback=self.parse_comment)
        # pass the pid to the next callback via meta
        request.meta['pid'] = pid
        yield request

        # creator (author) page links
        creator_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li')
        # cid = response.xpath('//div[@class="filmplay-creator right-section"]/ul[@class="creator-list"]/li/a/@data-userid')
        for creator in creator_list:
            cid = creator.xpath('./a/@data-userid').get()
            composer_url = 'https://www.xinpianchang.com/u{}?from=articleList'.format(cid)
            request = response.follow(composer_url, self.parse_composer)
            request.meta['cid'] = cid
            # do not merge session cookies into this request, so the headers do not
            # accumulate a long cookie string after the cookies have been rotated
            request.meta['dont_merge_cookies'] = True
            yield request

            # relation between creator and post
            cr = CopyItem()
            cr['pid'] = pid
            cr['cid'] = cid
            cr['pcid'] = pid + cid
            cr['role'] = creator.xpath('./div[@class="creator-info"]/span/text()').get()
            yield cr

    def parse_video(self, response):
        # this response body is JSON
        post = response.meta['post']
        result = json.loads(response.text)
        post['video_url'] = result['data']['resource']['default']['url']
        # hand the completed item straight to the pipelines
        yield post

    def parse_comment(self, response):
        result = json.loads(response.text)
        for c in result['data']['list']:
            comment = CommentItem()
            comment['uname'] = c['userInfo']['username']
            comment['user_id'] = c['userInfo']['id']
            # comment['user_page'] = c['userInfo']['web_url']
            comment['content'] = c['content']
            comment['content_id'] = c['id']
            yield comment

        # follow the next page of comments, if any
        if result['data']['next_page_url']:
            next_page = 'https://app.xinpianchang.com' + result['data']['next_page_url']
            yield response.follow(next_page, self.parse_comment)

    def parse_composer(self, response):
        # TODO: parse the creator page (not implemented yet)
        pass
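# sidenote: obtaining the Authorization cookie
# A minimal sketch (not part of the original project) of how that cookie could
# be captured with the requests module after a scripted login. The login URL
# and form field names below are assumptions -- check the real login request in
# the browser's network panel before relying on them.
import requests

session = requests.Session()
login_url = 'https://www.xinpianchang.com/login'   # hypothetical endpoint
session.post(login_url, data={
    'account': 'your_account',    # hypothetical field name
    'password': 'your_password',  # hypothetical field name
})
# whatever cookies the site set during login now live on the session;
# copy the Authorization value into the spider's `cookies` dict above
print(session.cookies.get_dict())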
# settings file
# -*- coding: utf-8 -*-

# Scrapy settings for xpc project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xpc'

SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'xpc (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# Set COOKIES_ENABLED = True when cookies are passed in per request from the
# spider; set it to False when a fixed Cookie header from the settings is used.
COOKIES_ENABLED = True
COOKIES_DEBUG = True  # log detailed cookie information for every request/response

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'xpc.middlewares.XpcSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'xpc.middlewares.XpcDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'xpc.pipelines.XpcPipeline': 300,  # lower number = higher priority, runs first
    # 'xpc.pipelines.MysqlPipeline': 301,
    # 'xpc.pipelines.RedisPipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = False  # True caches visited pages so no real request is sent again
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Log levels: INFO, DEBUG, ERROR
LOG_LEVEL = 'DEBUG'
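# sidenote: per-spider settings
# A small sketch (assumption, not from the original project): the same options
# can be scoped to a single spider with Scrapy's `custom_settings` class
# attribute instead of editing the project-wide settings.py. Shown here on a
# trimmed copy of the spider class.
import scrapy

class XinpianchangSpider(scrapy.Spider):
    name = 'XinPianChang'
    custom_settings = {
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'LOG_LEVEL': 'INFO',   # quieter log than the project-wide DEBUG
    }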
# items file
# -*- coding: utf-8 -*-
import scrapy


class PostItem(scrapy.Item):
    # stores the video (post) information
    # custom class attribute: with several tables, each item class carries the
    # name of the table it should be written to
    table_name = 'posts'
    # data fields
    pid = scrapy.Field()
    title = scrapy.Field()
    category = scrapy.Field()
    create_time = scrapy.Field()
    play_count = scrapy.Field()
    desc = scrapy.Field()
    video_url = scrapy.Field()


class CommentItem(scrapy.Item):
    # stores the comment information
    table_name = 'comments'
    content_id = scrapy.Field()
    pid = scrapy.Field()
    cid = scrapy.Field()
    uname = scrapy.Field()
    user_id = scrapy.Field()
    content = scrapy.Field()
    user_page = scrapy.Field()
class CopyItem(scrapy.Item):
    # stores the creator-to-post (copyright) relation
    table_name = 'copyrights'
    pcid = scrapy.Field()  # primary key of the table
    pid = scrapy.Field()
    cid = scrapy.Field()
    role = scrapy.Field()
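# sidenote: using table_name generically
# A minimal sketch (assumption, not part of the project): because every item
# class carries a `table_name`, a pipeline can build its SQL from the item's
# own fields instead of keeping one isinstance() branch per item class.
def build_insert(item):
    table = item.table_name                       # 'posts', 'comments' or 'copyrights'
    keys = list(item.keys())                      # only the fields actually populated
    columns = ', '.join(keys)
    placeholders = ', '.join(['%s'] * len(keys))
    sql = 'INSERT INTO {} ({}) VALUES ({})'.format(table, columns, placeholders)
    return sql, [item[k] for k in keys]

# usage: sql, params = build_insert(item); cursor.execute(sql, params)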
# pipelines file
# -*- coding: utf-8 -*-
import csv
import json
import os

import pymysql
from redis import Redis

from xpc.items import PostItem, CommentItem, CopyItem


class XpcPipeline(object):
    def __init__(self):
        # store the csv file next to this module
        store_file = os.path.dirname(__file__) + '/xpc.csv'
        # open the output file and create a csv writer for it
        self.file = open(store_file, 'w', newline="")
        self.writer = csv.writer(self.file)

    def open_spider(self, spider):
        print("pipeline: spider started......")
    # called for every item; branch on the item type
    def process_item(self, item, spider):
        if isinstance(item, PostItem):
            print("post item:", item)
        elif isinstance(item, CommentItem):
            print("comment item:", item)
        elif isinstance(item, CopyItem):
            print("copyright item:", item)
        # write one csv row: table name followed by the populated field values
        self.writer.writerow([item.table_name] + list(item.values()))
        return item  # hand the item on to the next pipeline class

    def close_spider(self, spider):
        print("pipeline: spider finished......")
        self.file.close()


# MySQL pipeline
class MysqlPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='',
            db='test_db',
            charset='utf8'
        )
        print("database connection established")

    def process_item(self, item, spider):
        # NOTE: 'author' and 'content' are placeholder fields from a generic
        # example; adapt the SQL to the fields of this project's items.
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(
                'insert into test_db values(%s, %s)',
                (item['author'], item['content'])
            )
            self.conn.commit()
        except Exception as e:
            print("database insert failed:", e)
            print("rolling back")
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print("closing database connection")
        self.cursor.close()
        self.conn.close()


# Redis pipeline
class RedisPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(
            host='127.0.0.1',
            port=6379
        )
        print("redis connection established")

    def process_item(self, item, spider):
        dic = {
            "author": item["author"],      # placeholder fields, see note above
            "content": item["content"]
        }
        # redis-py cannot push a dict directly, so serialize it to JSON first
        self.conn.lpush("queue_name", json.dumps(dic))
        return item

    def close_spider(self, spider):
        print("closing redis connection")
        self.conn.close()
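# sidenote: consuming the Redis queue
# A minimal sketch (assumption, not part of the project) of a separate consumer
# process reading back the items that RedisPipeline pushed, e.g. for later bulk
# insertion into MySQL. The queue name must match the one used in lpush above.
import json
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
while True:
    raw = conn.brpop("queue_name", timeout=5)   # blocks up to 5 s, returns None on timeout
    if raw is None:
        break                                   # queue drained
    _, payload = raw
    item = json.loads(payload)
    print(item)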