import json
import requests
from requests.exceptions import RequestException
import re
import time
# Trace marker: printed when the module starts executing, before any function is defined.
print("0")
def get_hot_movie_rank(url, timeout=10, headers=None):
    """Download the page at *url* and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.
        headers: Optional dict of extra HTTP headers (for example a
            ``User-Agent``) passed through to ``requests.get``.

    Returns:
        The response text when the server answers with status 200,
        otherwise ``None`` (non-200 status or any ``RequestException``,
        including a timeout).
    """
    # Keep the try body limited to the call that actually performs network I/O.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Trace marker: printed once get_hot_movie_rank has been defined.
print("1")
def parse_hot_movie_rank(html):
    """Yield one dict per ``<dd>`` movie entry found in *html*.

    Args:
        html: Page source as a string. ``None`` or an empty string (what the
            fetch step hands over on failure) yields nothing instead of
            raising ``TypeError`` inside the regex engine.

    Yields:
        Dicts with the keys ``'index'``, ``'image'``, ``'title'``,
        ``'actor'``, ``'release time'``, ``'Real-time box office'`` and
        ``'Total box office'``. All values are strings taken from the markup.
    """
    if not html:
        return
    # Raw strings keep the backslash in ``\d`` intact; a plain ``d+`` would
    # only match literal "d" characters and the rank number would never match.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?realtime">.*?stonefont">(.*?)</span></span>(.*?)</p>'
        r'.*?total-boxoffice">.*?stonefont">(.*?)</span></span>(.*?)</p>.*?</dd>',
        re.S,  # let "." span newlines, since one entry covers several lines
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Drops the first 3 characters after stripping — presumably a
            # label prefix such as "主演:"; verify against the live markup.
            'actor': item[3].strip()[3:],
            # Drops the first 5 characters — presumably "上映时间:"; verify.
            'release time': item[4].strip()[5:],
            # Figure inside the stonefont span plus the text following it.
            'Real-time box office': (item[5] + item[6]).strip(),
            'Total box office': (item[7] + item[8]).strip(),
        }
# Trace marker: printed once parse_hot_movie_rank has been defined.
print("2")
def write_to_file(content):
    """Append *content* to ``maoyan1.txt`` as one JSON document per line.

    Args:
        content: Any JSON-serialisable object (``main`` passes one movie dict).

    The file is opened in append mode with UTF-8 encoding, and
    ``ensure_ascii=False`` keeps non-ASCII text readable instead of
    ``\\uXXXX`` escapes.
    """
    with open('maoyan1.txt', 'a', encoding='utf-8') as f:
        # Terminate each record with a real newline; the literal letter "n"
        # would glue every record onto a single line.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
# Trace marker: printed once write_to_file has been defined.
print("3")
def main():
    """Fetch the Maoyan board page, then print and store every parsed movie.

    Each record produced by ``parse_hot_movie_rank`` is printed to stdout and
    handed to ``write_to_file``. If the page could not be downloaded, a short
    message is printed and nothing is written.
    """
    url = 'https://maoyan.com/board/1'
    html = get_hot_movie_rank(url)
    # get_hot_movie_rank returns None on a non-200 status or a request error;
    # passing that to the regex parser would raise TypeError.
    if html is None:
        print('Failed to fetch ' + url)
        return
    for item in parse_hot_movie_rank(html):
        print(item)
        write_to_file(item)
# Trace marker: printed once main has been defined.
print("4")
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
    # NOTE(review): this one-second pause runs after main() has already
    # finished; presumably a leftover from a paged-crawl loop — confirm
    # before removing.
    time.sleep(1)
# Trace marker: printed after the entry-point guard.
print("5")
0
1
2
3
4
{'index': '1', 'image': 'https://p0.meituan.net/moviemachine/f7d2ad70eb79d6d9b8a197713db9b8c41711752.jpg@160w_220h_1e_1c', 'title': '复仇者联盟4:终局之战', 'actor': '小罗伯特·唐尼,克里斯·埃文斯,马克·鲁法洛', 'release time': '2019-04-24', 'Real-time box office': '.亿', 'Total box office': '.亿'}
{'index': '2', 'image': 'https://p1.meituan.net/movie/d28b729ffe72353a72d1e7ef8a9b90591544978.jpg@160w_220h_1e_1c', 'title': '何以为家', 'actor': '赞恩·阿尔·拉菲亚,约丹诺斯·希费罗,博鲁瓦蒂夫·特雷杰·班科尔', 'release time': '2019-04-29', 'Real-time box office': '.万', 'Total box office': '.万'}
{'index': '3', 'image': 'https://p0.meituan.net/movie/29cebff7d3ed1cf98fbeb6b01c908e1b9947789.jpg@160w_220h_1e_1c', 'title': '雪暴', 'actor': '张震,廖凡,倪妮', 'release time': '2019-04-30', 'Real-time box office': '.万', 'Total box office': '.万'}
{'index': '4', 'image': 'https://p0.meituan.net/moviemachine/409aca94fa1695a6bdb5206735189c11495127.jpg@160w_220h_1e_1c', 'title': '下一任:前任', 'actor': '郭采洁,郑恺,李东学', 'release time': '2019-05-01', 'Real-time box office': '.万', 'Total box office': '.万'}
{'index': '5', 'image': 'https://p1.meituan.net/movie/c63849c7a9de360a7b192bc322792a111705236.jpg@160w_220h_1e_1c', 'title': '反贪风暴4', 'actor': '古天乐,郑嘉颖,林峯', 'release time': '2019-04-04', 'Real-time box office': '.万', 'Total box office': '.亿'}
{'index': '6', 'image': 'https://p0.meituan.net/moviemachine/90258899534b9cca44f2e9b9a6246504248749.jpg@160w_220h_1e_1c', 'title': '动物出击', 'actor': '景熙童', 'release time': '2019-04-30', 'Real-time box office': '.万', 'Total box office': '.万'}
{'index': '7', 'image': 'https://p0.meituan.net/movie/eda6595dc2c3a5d7cdda5eb4f8d8b1982460902.jpg@160w_220h_1e_1c', 'title': '撞死了一只羊', 'actor': '金巴,更登彭措,索朗旺姆', 'release time': '2019-04-26', 'Real-time box office': '.万', 'Total box office': '.万'}
{'index': '8', 'image': 'https://p0.meituan.net/movie/29caaa1b66c95807a3f4d29b5b03644b1876102.jpg@160w_220h_1e_1c', 'title': '调音师', 'actor': '阿尤斯曼·库拉纳,塔布,拉迪卡·艾普特', 'release time': '2019-04-03', 'Real-time box office': '.万', 'Total box office': '.亿'}
{'index': '9', 'image': 'https://p0.meituan.net/movie/0253cac859838e4fd6ae94cf986b07971008254.jpg@160w_220h_1e_1c', 'title': '神奇乐园历险记', 'actor': '索菲亚·玛丽,詹妮弗·加纳,肯·哈德森·坎贝尔', 'release time': '2019-04-19', 'Real-time box office': '.万', 'Total box office': '.万'}
{'index': '10', 'image': 'https://p0.meituan.net/movie/86aba43e286ed044a544a75748d08aca3798593.jpg@160w_220h_1e_1c', 'title': '天上再见', 'actor': '纳威尔·佩雷兹·毕斯卡亚特,阿尔贝·杜邦泰尔,艾米莉·德奎恩', 'release time': '2019-04-30', 'Real-time box office': '.万', 'Total box office': '.万'}
5
不明白为什么票房数字都是以 &#x 开头的编码,所以还没有完成我的目标,仍在研究中(ing~)。
记于 20190818:前几天在看别人的爬虫时,看到有人提到了这种反爬技术,所以我接下来要更新这篇文章啦!虽然《复仇者联盟4》(妇联)已经上映很久了,最近比较火的是《哪吒》。