题目:从http://movie.mtime.com中读取一个电影的票房信息和相关的同类电影。
方案一:采用Ajax技术,逐步提取动态网站的json,再进行爬虫
知识点:
1.由于该数据是动态信息,需要找到相关的json,并拼接出所需要的网址,进行爬虫。
2.利用正则表达式提取网址信息、利用json将字符串字典化。
"""Crawl box-office/rating info and similar-movie links from movie.mtime.com.

Approach 1 (Ajax): the page data is loaded dynamically, so we rebuild the
JSON endpoint URLs (Movie.api) ourselves, fetch the JSONP payloads, strip
the JS wrapper with a regex, and parse the embedded JSON.
"""
# NOTE(review): bs4, pickle and urllib.parse were imported but never used; removed.
import csv
import json
import re
import time
import urllib.request


class url_manager(object):
    """Track URLs waiting to be crawled and URLs already crawled.

    Lists (not sets) are used deliberately so FIFO crawl order is kept;
    add_new_url already performs de-duplication.
    """

    def __init__(self):
        self.new_urls = []  # URLs queued for crawling, FIFO order
        self.old_urls = []  # URLs already handed out

    def add_new_url(self, url):
        """Queue *url* unless it is already queued or already crawled."""
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.append(url)

    def add_new_urls(self, urls):
        """Queue every URL in *urls*; a None collection is ignored."""
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def have_new_url(self):
        """Return True while at least one URL is still queued."""
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Pop the oldest queued URL (FIFO) and record it as crawled."""
        url = self.new_urls.pop(0)
        self.old_urls.append(url)
        return url


class url_download(object):
    """Thin urllib wrapper that fetches a URL and returns its decoded body."""

    def download(self, url):
        """Fetch *url*; return the body as text, or False if it is empty.

        BUG FIX: ``read().decode()`` returns a str and can never be None,
        so the old ``data == None`` check was dead code — test emptiness.
        """
        response = urllib.request.urlopen(url)
        data = response.read().decode()
        if not data:
            print("no web")
            return False
        return data


class url_scrapy(object):
    """Extract the JSON payload out of the JSONP responses."""

    def get_data(self, source_url, source_data):
        """Parse the rating JSONP and return ``(source_url, movie_dict)``.

        The response looks like ``var x = {...json...};`` — the regex grabs
        everything between the first ``=`` and the first ``;``.
        The book splits movies by which keys their JSON has; for simplicity
        this reads only a fixed set of keys.
        """
        pattern = re.compile(r'=(.*?);')
        payload = pattern.findall(str(source_data))[0]
        parsed = json.loads(payload)
        rating = parsed['value']['movieRating']
        movieresult = {
            # BUG FIX: key was the typo 'MovienIDame' although the value is the id.
            'MovieId': rating['MovieId'],
            'RatingFinal': rating['RatingFinal'],
            'movieTitle': parsed['value']['movieTitle'],
        }
        print(movieresult)
        return (source_url, movieresult)

    def get_sameurldata(self, source_url, source_data):
        """Parse the similar-movies JSONP and return a list of movie URLs."""
        pattern = re.compile(r'=(.*?);')
        payload = pattern.findall(str(source_data))[0]
        parsed = json.loads(payload)
        return [movie['url'] for movie in parsed['value']['movieList']]


class output_url(object):
    """Build the dynamic Movie.api endpoint URLs for a movie page URL."""

    # BUG FIX: the original pattern r'/(d+)/' matched literal 'd' characters,
    # so findall() never matched a numeric movie id and [0] raised IndexError.
    _ID_PATTERN = re.compile(r'/(\d+)/')

    def _build(self, root_url, method):
        """Assemble the Ajax endpoint for *method* from a movie page URL."""
        timestamp = time.strftime("%Y%m%d%H%M%S11111", time.localtime())
        movie_id = self._ID_PATTERN.findall(root_url)[0]
        # Percent-encoded form of http://movie.mtime.com/<id>/
        encoded_page = "http%3A%2F%2Fmovie.mtime.com%2F" + movie_id + '%2F'
        # BUG FIX: the template must be one literal so '%' formats all of it;
        # the original relied on implicit concatenation across statements.
        return (
            'http://service.library.mtime.com/Movie.api?'
            'Ajax_CallBack=true'
            '&Ajax_CallBackType=Mtime.Library.Services'
            '&Ajax_CallBackMethod=%s'
            '&Ajax_CrossDomain=1'
            '&Ajax_RequestUrl=%s'
            '&t=%s'
            '&Ajax_CallBackArgument0=%s'
        ) % (method, encoded_page, timestamp, movie_id)

    def output_scroe(self, root_url):
        """Return the endpoint URL that serves the rating/box-office JSON."""
        return self._build(root_url, 'GetMovieOverviewRating')

    def output_sameurl(self, root_url):
        """Return the endpoint URL that serves the similar-movies JSON."""
        return self._build(root_url, 'GetSimilarRecommenMovieInfoByMovieId')


class output_data(object):
    """Append scraped results to pachong.csv."""

    def data_save(self, data):
        """Append *data* to the CSV file.

        A dict is written as key,value rows; any other iterable of rows is
        passed straight to ``writerows``.
        BUG FIX: lineterminator was ' ' (space), which ran all rows together;
        newline='' is required by the csv docs to avoid blank lines on Windows.
        """
        with open('pachong.csv', "a+", encoding='utf-8', newline='') as f:
            writer = csv.writer(f, lineterminator='\n')
            if isinstance(data, dict):
                writer.writerows(data.items())
            else:
                writer.writerows(data)


class controllers(object):
    """Wire the manager/downloader/parser/output together and run the crawl."""

    def __init__(self):
        self.manager = url_manager()
        self.download = url_download()
        self.scrapy = url_scrapy()
        self.output = output_data()
        self.scoreurl = output_url()

    def control(self, url):
        """Crawl starting from *url*, following similar-movie links.

        Stops after 200 successfully processed movies or when the queue
        is exhausted.
        """
        self.manager.add_new_url(url)
        num = 1
        while True:
            if num > 200:
                break
            if not self.manager.have_new_url():
                print('has no url')
                break
            url_down = self.manager.get_new_url()
            score_url = self.scoreurl.output_scroe(url_down)
            same_url = self.scoreurl.output_sameurl(url_down)
            info = self.download.download(score_url)
            url_info = self.download.download(same_url)
            # BUG FIX: get_data returns (url, movie_dict); the original
            # unpacked them swapped and wrote the URL string to the CSV.
            crawled_url, movie_data = self.scrapy.get_data(url_down, info)
            similar_urls = self.scrapy.get_sameurldata(url_down, url_info)
            if movie_data is not None:
                self.output.data_save(movie_data)
                print(num, "is finished:", crawled_url)
                num += 1
            self.manager.add_new_urls(similar_urls)


if __name__ == "__main__":
    url = r'http://movie.mtime.com/225824/'
    a = controllers()
    a.control(url)
方案二:采用selenium技术进行加载动态网站信息,直接获取数据