1 import re
2 from urllib.request import urlopen
3 import ssl
4
# This statement assigns the attribute to itself, so it leaves the value
# unchanged (it is effectively a no-op).
# NOTE(review): presumably a different context factory was meant to be
# installed here -- confirm the intent before changing the right-hand side.
ssl._create_default_https_context = ssl._create_default_https_context
6
7
def getPage(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address handed unchanged to ``urllib.request.urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response (and its underlying
    # connection) even when read() or decode() raises.
    with urlopen(url) as response:
        return response.read().decode("utf-8")
11
12
def parsePage(s):
    """Yield one dict per movie entry found in the HTML text *s*.

    Each entry is located with a single regular expression that starts at
    ``<div class="item">`` and captures four named groups.

    Args:
        s: HTML text to scan.

    Yields:
        A dict with the string values ``"id"``, ``"title"``,
        ``"rating_num"`` and ``"comment_num"`` for every match, in
        document order. ``"comment_num"`` is whatever text precedes
        ``评价`` inside the attribute-less ``<span>``.
    """
    # Raw strings keep the backslash in ``\d`` intact; the previous pattern
    # read ``d+`` and therefore only matched the literal letter "d".
    # re.S lets ``.`` cross newlines, since one entry spans many lines.
    com = re.compile(
        r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?'
        r'<span class="title">(?P<title>.*?)</span>'
        r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>'
        r'(?P<comment_num>.*?)评价</span>', re.S)
    for match in com.finditer(s):
        yield {
            "id": match.group("id"),
            "title": match.group("title"),
            "rating_num": match.group("rating_num"),
            "comment_num": match.group("comment_num"),
        }
27
28
def main(num):
    """Fetch one listing page and append its parsed entries to the file "move".

    The page URL is built by substituting *num* into the ``start`` query
    parameter. Every entry produced by ``parsePage`` is written as the
    ``str()`` of its dict, one entry per line.

    Args:
        num: Value placed in the ``start`` query parameter of the URL.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    # Fetch before opening the file so a failed request does not touch it.
    # "with" guarantees the file is flushed and closed, even if parsing or
    # writing raises part-way through.
    with open("move", "a", encoding="utf-8") as f:
        for obj in parsePage(response_html):
            f.write(str(obj) + "\n")
39
40
# Script driver: calls main() five times.
# NOTE(review): within the lines visible here ``count`` is never changed, so
# every call passes 0 -- confirm whether an increment of ``count`` between
# calls is missing.
count = 0
for i in range(5):
    main(count)