zoukankan      html  css  js  c++  java
  • 腾讯视频信息数据爬虫开发【核心爬虫代码】

       腾讯视频信息数据爬取程序代码【笔记】

      

    # -*- coding: utf-8 -*-
    import scrapy
    from ..items import TencentItem,CommentItem
    import re,requests,json
     
     
    class TencentSpiderSpider(scrapy.Spider):
        """Crawl Tencent Video movie listings, their detail pages and comments.

        Flow:
          1. ``parse`` follows every category filter link on the movie list page.
          2. ``detail_parse`` reads one listing page, yields a ``CommentItem``
             per comment of every listed movie, schedules the movie's detail
             page and follows the "next page" link.
          3. ``movie_parse`` combines the listing data (carried in
             ``response.meta``) with the detail page into a ``TencentItem``.
        """

        name = 'tencent_spider'
        allowed_domains = ['v.qq.com']
        start_urls = ['https://v.qq.com/x/list/movie']

        # Listing hrefs are appended verbatim to this prefix.
        _LIST_URL = 'https://v.qq.com/x/list/movie{}'
        # Endpoint that maps a movie cid to the comment_id of its comment thread.
        _COMMENT_ID_URL = ('https://ncgi.video.qq.com/fcgi-bin/video_comment_id'
                           '?otype=json&callback=jQuery&op=3&cid={}')
        _COMMENT_URL = 'http://coral.qq.com/article/{}/comment/'
        _USER_URL = 'http://video.coral.qq.com/review/user/{}'
        _COMMENT_ID_PATTERN = re.compile(r'comment_id":"(.*?)"')
        _COMMENT_HEADERS = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) '
                          'Gecko/20100101 Firefox/53.0',
        }
        # Seconds before a blocking comment lookup is abandoned.
        _REQUEST_TIMEOUT = 10

        def parse(self, response):
            """Schedule one listing request per category filter link."""
            category_hrefs = response.xpath(
                '//div[@class="mod_row_filter"]/ul/li/a/@href').extract()
            for href in category_hrefs:
                yield scrapy.Request(url=self._LIST_URL.format(href),
                                     callback=self.detail_parse)

        def detail_parse(self, response):
            """Process one listing page.

            Yields ``CommentItem`` objects for each movie's comments, a
            ``scrapy.Request`` for each movie detail page (listing fields are
            passed through ``meta``), and finally a request for the next
            listing page when one exists.
            """
            # Per-movie data of the filtered listing, extracted as parallel lists.
            movie_links = response.xpath(
                '//div[@class="mod_figures mod_figure_v"]/ul/li/a/@href').extract()
            movie_titles = response.xpath(
                '//div[@class="figure_title_score"]/strong/a/text()').extract()

            # Drop the whitespace-only text nodes, then glue every two
            # consecutive fragments into one score string.
            # NOTE(review): presumably integer part + fractional part; a movie
            # without a score would shift later scores - confirm against markup.
            score_parts = [
                text for text in
                response.xpath('//div[@class="figure_score"]//text()').extract()
                if text.strip()
            ]
            total_score = [
                score_parts[pos] + score_parts[pos + 1]
                for pos in range(0, len(score_parts) - 1, 2)
            ]

            # Play counts
            movie_playCounts = response.xpath(
                '//div[@class="figure_count"]/span/text()').extract()
            # Count shown in the option bar (same value for the whole page)
            movie_account = response.xpath(
                '//span[@class="option_txt"]/em/text()').extract_first('')

            for index, movie_link in enumerate(movie_links):
                # The parallel lists may be shorter than movie_links; fall back
                # to '' instead of raising IndexError and losing the whole page.
                movie_title = self._item_at(movie_titles, index)

                # The cid is the last path segment of the movie link without
                # its extension, e.g. ".../b5i4g9z3u5h31jy.html" -> "b5i4g9z3u5h31jy".
                cid = movie_link.split('/')[-1].split('.')[0]
                for comment in self._fetch_comments(cid, movie_title):
                    yield comment

                # Enter the movie detail page.
                yield scrapy.Request(
                    url=movie_link,
                    callback=self.movie_parse,
                    meta={'movie_link': movie_link,
                          'movie_title': movie_title,
                          'score': self._item_at(total_score, index),
                          'movie_playCount': self._item_at(movie_playCounts, index),
                          'movie_account': movie_account},
                )

            # Next page
            next_pg = response.xpath('//a[@class="page_next"]/@href').extract_first('')
            self.logger.debug('next page href: %r', next_pg)
            # NOTE(review): assumes a disabled "next" button carries a
            # javascript: href; such a value must not be turned into a URL.
            if next_pg and not next_pg.startswith('javascript'):
                yield scrapy.Request(url=self._LIST_URL.format(next_pg),
                                     callback=self.detail_parse)

        def movie_parse(self, response):
            """Build a ``TencentItem`` from a movie detail page.

            Listing-level fields (title, score, play count, count, link) are
            read from ``response.meta`` as set by ``detail_parse``.
            """
            info_root = '//div[@class="mod_row_box"]/div[2]/ul/li'

            # Introduction area
            abstract = response.xpath(info_root + '/div[2]/p/text()').extract_first('')
            people = response.xpath(info_root + '/div[1]/a//text()').extract()
            people_links = response.xpath(info_root + '/div[1]/a//@href').extract()

            # '#' marks missing information. The first name/link is stored as
            # the director, the remaining ones are comma-joined as the cast.
            director = act = director_link = act_link = '#'
            if people:
                director = people[0]
                act = ','.join(people[1:])
                if people_links:  # guard: names may come without any href
                    director_link = people_links[0]
                    act_link = ','.join(people_links[1:])

            movie = TencentItem()
            # Introduction
            movie['abstract'] = abstract
            movie['director'] = director
            movie['act'] = act
            movie['director_link'] = director_link
            movie['act_link'] = act_link
            # Overview (from the listing page)
            movie['movie_title'] = response.meta['movie_title']
            movie['score'] = response.meta['score']
            movie['movie_playCount'] = response.meta['movie_playCount']
            movie['movie_link'] = response.meta['movie_link']
            movie['movie_account'] = response.meta['movie_account']
            yield movie

        def _fetch_comments(self, cid, movie_title):
            """Yield a ``CommentItem`` for every comment of the movie ``cid``.

            Looks up the comment_id for ``cid``, then downloads the comment
            JSON. Any network, pattern or JSON failure is logged and ends the
            generator, so one bad movie cannot abort the listing page.

            NOTE: these are blocking ``requests`` calls made from inside a
            Scrapy callback; the timeout bounds how long the crawl can stall.
            """
            try:
                id_page = requests.get(self._COMMENT_ID_URL.format(cid),
                                       timeout=self._REQUEST_TIMEOUT).text
            except requests.RequestException as exc:
                self.logger.warning('comment_id lookup failed for cid %s: %s', cid, exc)
                return

            matched = self._COMMENT_ID_PATTERN.search(id_page)
            if matched is None:
                self.logger.warning('no comment_id found for cid %s', cid)
                return

            comment_url = self._COMMENT_URL.format(matched.group(1))
            try:
                comment_page = requests.get(comment_url,
                                            headers=self._COMMENT_HEADERS,
                                            timeout=self._REQUEST_TIMEOUT).text
                payload = json.loads(comment_page)
            except (requests.RequestException, ValueError) as exc:
                # ValueError covers a body that is not valid JSON.
                self.logger.warning('comment fetch failed for %s: %s', comment_url, exc)
                return

            data = payload.get('data') if isinstance(payload, dict) else None
            details = data.get('commentid') if isinstance(data, dict) else None
            for detail in details or []:  # empty/missing -> movie has no comments
                userinfo = detail.get('userinfo') or {}
                userid = userinfo.get('userid', '')

                comment = CommentItem()
                comment['movie_title'] = movie_title                 # movie name
                comment['timeDifference'] = detail.get('timeDifference')  # publish time
                comment['content'] = detail.get('content')           # comment text
                comment['up'] = detail.get('up')                     # up-votes
                comment['rep'] = detail.get('rep')                   # 'rep' field of the payload
                comment['userid'] = userid                           # user id
                comment['userLink'] = self._USER_URL.format(userid)  # user page link
                yield comment

        @staticmethod
        def _item_at(values, index, default=''):
            """Return ``values[index]``, or ``default`` when the list is too short."""
            return values[index] if index < len(values) else default
    

      

  • 相关阅读:
    重温Thinking in java
    线程池
    apache DBUtils学习
    Mysql 建表 数据类型选择
    毫秒必争,前端网页性能最佳实践
    tomcat6 开启GZIP
    处理百万级以上的数据提高查询速度的方法
    Tomcat内存设置
    Tomcat全局Filter
    Tomcat多工程共享Session、ServletContext
  • 原文地址:https://www.cnblogs.com/68xi/p/9351436.html
Copyright © 2011-2022 走看看