zoukankan      html  css  js  c++  java
  • python每日一题:爬虫电影的动态票房信息

    题目:从http://movie.mtime.com中读取一个电影的票房信息和相关的同类电影。

    方案一:分析页面发出的Ajax请求,找到返回动态数据的json接口,再直接对该接口进行爬取

    知识点:

     1.由于该数据是动态加载的,需要先找到返回这些数据的json接口,拼接出对应的请求网址,再对该网址进行爬取。

     2.利用正则表达式提取网址信息、利用json将字符串字典化。

    from bs4 import BeautifulSoup
    import re, csv, urllib.request, urllib.parse, time, json, pickle
    
    
    class url_manager(object):
        """Keep track of URLs waiting to be crawled and URLs already handed out.

        Lists are used instead of sets so that URLs are crawled in the order
        they were discovered; duplicates are rejected in ``add_new_url``.
        """

        def __init__(self):
            self.new_urls = []  # pending URLs, oldest first
            self.old_urls = []  # URLs already returned by get_new_url

        def add_new_url(self, url):
            """Queue *url* unless it is empty, already pending, or already crawled."""
            if not url:
                # None / '' cannot be downloaded, so never let them into the queue.
                return
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.append(url)

        def add_new_urls(self, urls):
            """Queue every URL in *urls*; ``None`` is accepted and ignored."""
            if urls is None:
                return
            for url in urls:
                self.add_new_url(url)

        def have_new_url(self):
            """Return True while at least one URL is still pending."""
            return bool(self.new_urls)

        def get_new_url(self):
            """Remove and return the oldest pending URL, recording it as crawled.

            Raises:
                IndexError: if no URL is pending.
            """
            data = self.new_urls.pop(0)  # take from the front: first in, first out
            self.old_urls.append(data)
            return data
    
    
    class url_download(object):
        """Fetch the body of a URL as text."""

        def download(self, url):
            """Return the decoded body of *url*, or False when the body is empty.

            The body is decoded with the default codec of ``bytes.decode()``
            (UTF-8). Exceptions raised by ``urllib.request.urlopen`` or by the
            decoding step are not caught here.
            """
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(url) as response:
                data = response.read().decode()
            if not data:
                # decode() never returns None, so test for an empty body instead.
                print("no web")
                return False
            return data
    
    
    class url_scrapy(object):
        """Parse responses of the form ``<name> = <json>;`` into Python data."""

        def _load_payload(self, source_data):
            """Return the JSON value that follows the first ``=`` in *source_data*.

            The value is decoded in place with ``raw_decode`` rather than cut out
            up to the next ``;``, so semicolons or newlines inside the JSON do
            not truncate it. Returns None when there is no ``=`` or when no
            valid JSON follows it.
            """
            text = str(source_data)
            start = text.find('=')
            if start == -1:
                return None
            start += 1
            # raw_decode does not skip leading whitespace by itself.
            while start < len(text) and text[start].isspace():
                start += 1
            try:
                payload, _ = json.JSONDecoder().raw_decode(text, start)
            except ValueError:
                return None
            return payload

        def get_data(self, source_url, source_data):
            """Extract the movie id, final rating and title from a rating response.

            Only the three keys below are read; responses whose JSON uses a
            different layout are treated as "no data".

            Returns:
                ``(source_url, result)`` where *result* is a dict with the keys
                ``'MovienIDame'``, ``'RatingFinal'`` and ``'movieTitle'``, or
                None when the response could not be parsed or lacks those
                fields.
            """
            payload = self._load_payload(source_data)
            try:
                value = payload['value']
                rating = value['movieRating']
                movieresult = {
                    'MovienIDame': rating['MovieId'],
                    'RatingFinal': rating['RatingFinal'],
                    'movieTitle': value['movieTitle'],
                }
            except (KeyError, TypeError):
                # Missing field, or a payload that is not the expected nesting of dicts.
                return (source_url, None)
            print(movieresult)
            return (source_url, movieresult)

        def get_sameurldata(self, source_url, source_data):
            """Return the ``url`` of every entry in the response's ``movieList``.

            Returns:
                A list of URLs (possibly empty), or None when the response could
                not be parsed or has no usable ``movieList``.
            """
            payload = self._load_payload(source_data)
            try:
                return [movie['url'] for movie in payload['value']['movieList']]
            except (KeyError, TypeError):
                return None
    
    
    
    class output_url(object):
        """Build the Movie.api Ajax URLs that belong to a movie page URL."""

        # First all-digit path segment of the page URL, e.g. ".../225824/".
        _MOVIE_ID = re.compile(r'/(\d+)/')

        # Adjacent literals inside parentheses are joined into one string.
        _API_TEMPLATE = (
            'http://service.library.mtime.com/Movie.api?'
            'Ajax_CallBack=true'
            '&Ajax_CallBackType=Mtime.Library.Services'
            '&Ajax_CallBackMethod=%s'
            '&Ajax_CrossDomain=1'
            '&Ajax_RequestUrl=%s'
            '&t=%s'
            '&Ajax_CallBackArgument0=%s'
        )

        def _build_url(self, root_url, method):
            """Return the service URL calling *method* for the movie in *root_url*.

            Raises:
                ValueError: if *root_url* contains no ``/<digits>/`` segment.
            """
            found = self._MOVIE_ID.search(str(root_url))
            if found is None:
                raise ValueError('no movie id found in url: %r' % (root_url,))
            movie_id = found.group(1)
            # Current local time followed by the literal digits "11111".
            time0 = time.strftime("%Y%m%d%H%M%S11111", time.localtime())
            # Percent-encoded form of "http://movie.mtime.com/<id>/".
            url2 = "http%3A%2F%2Fmovie.mtime.com%2F" + movie_id + '%2F'
            return self._API_TEMPLATE % (method, url2, time0, movie_id)

        def output_scroe(self, root_url):
            """Return the ``GetMovieOverviewRating`` URL for the movie page *root_url*."""
            return self._build_url(root_url, 'GetMovieOverviewRating')

        def output_sameurl(self, root_url):
            """Return the ``GetSimilarRecommenMovieInfoByMovieId`` URL for *root_url*."""
            return self._build_url(root_url, 'GetSimilarRecommenMovieInfoByMovieId')
    
    class output_data(object):
        """Append crawled rows to a CSV file."""

        def data_save(self, data, path='pachong.csv'):
            """Append *data* to the UTF-8 CSV file at *path*.

            Args:
                data: iterable of rows, each row an iterable of cell values.
                path: file to append to; created when it does not exist.
            """
            # newline='' leaves line endings to the csv writer, which is told
            # to end every row with a single '\n'.
            with open(path, "a", encoding='utf-8', newline='') as f:
                csv.writer(f, lineterminator='\n').writerows(data)
    
    
    class controllers(object):
        """Drive the crawl: queue URLs, download, parse, save, follow similar movies."""

        def __init__(self):
            self.manager = url_manager()
            self.download = url_download()
            self.scrapy = url_scrapy()
            self.output = output_data()
            self.scoreurl = output_url()

        def control(self, url, max_count=200):
            """Crawl from *url* until *max_count* movies are saved or no URL is left.

            For every queued page the rating response and the similar-movies
            response are downloaded. A parsed rating is appended to the CSV
            output as one row (page URL followed by the parsed values), and the
            similar movies' URLs are queued.

            Args:
                url: movie page URL to start from.
                max_count: maximum number of movies to save.
            """
            self.manager.add_new_url(url)
            num = 1
            while num <= max_count:
                if not self.manager.have_new_url():
                    print('has no url')
                    break
                url_down = self.manager.get_new_url()
                score_url = self.scoreurl.output_scroe(url_down)
                same_url = self.scoreurl.output_sameurl(url_down)
                try:
                    info = self.download.download(score_url)
                    url_info = self.download.download(same_url)
                except OSError as err:
                    # urllib's URLError/HTTPError are OSError subclasses: skip
                    # this page instead of aborting the whole crawl.
                    print('download failed:', url_down, err)
                    continue
                movie = None
                if info:
                    # get_data returns (source_url, result); only the result is needed.
                    _, movie = self.scrapy.get_data(url_down, info)
                similar = None
                if url_info:
                    similar = self.scrapy.get_sameurldata(url_down, url_info)
                if movie is not None:
                    # writerows expects an iterable of rows, so wrap the single row.
                    self.output.data_save([[url_down, *movie.values()]])
                    print(num, "is finished:", url_down)
                    num += 1
                self.manager.add_new_urls(similar)
    
    
    if __name__ == "__main__":
        # Start the crawl from a single movie page when run as a script.
        start_url = r'http://movie.mtime.com/225824/'
        controllers().control(start_url)
    方案二:采用selenium技术进行加载动态网站信息,直接获取数据

  • 相关阅读:
    031.NET5_ActionFilter的自定义和执行特点
    030.NET5_Autofac单抽象多实现属性注入
    029.NET5_Autofac单抽象多实现构造函数注入
    028.NET5_Autofac通过类支持AOP
    vue 设置回车input提交
    vscode设置全局自动换行
    vscode 插件大全
    SQL Server 2019基础配置
    SQL Server 2019 安装教程
    phpstudy(php环境)设置内网访问
  • 原文地址:https://www.cnblogs.com/xuehaiwuya0000/p/10622924.html
Copyright © 2011-2022 走看看