结果输出到文本文件中。
# Scrape the Douban Top 250 movie list page by page and write each movie's
# rank, title and description text to ``top250.txt``.
import codecs
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
index_url = 'https://movie.douban.com/top250'


def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (an error page must not be parsed as if it
            were a list page).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.text


def create_list(html):
    """Extract movie titles, descriptions and the next page URL from a page.

    Args:
        html: Markup of one list page.

    Returns:
        A ``(movie_names, movie_info, next_url)`` tuple.  ``movie_names``
        holds the text of the first ``span.title`` in every ``div.hd``;
        ``movie_info`` holds the text of the first ``<p>`` in every
        ``div.info`` with spaces removed; ``next_url`` is ``index_url`` plus
        the ``href`` of the link inside ``span.next``, or ``None`` when the
        page has no such link.
    """
    soup = BeautifulSoup(html, 'lxml')

    movie_names = [
        t.find('span', 'title').get_text()
        for t in soup.find_all('div', 'hd')
    ]
    movie_info = [
        t.find('p').get_text().replace(' ', '')
        for t in soup.find_all('div', 'info')
    ]

    # Any link of the chain may be absent (no pager at all, a pager without
    # a link, a link without an href); treat each case as "no next page"
    # instead of raising.
    next_span = soup.find('span', 'next')
    next_link = next_span.find('a') if next_span is not None else None
    href = next_link.get('href') if next_link is not None else None
    next_url = index_url + href if href else None

    return movie_names, movie_info, next_url


def main():
    """Walk every list page starting at ``index_url`` and write the entries.

    Each entry is written to ``top250.txt`` (UTF-8) as ``Top <rank> <title>``
    followed by its description text; the rank keeps counting across pages.
    """
    order = 1
    url = index_url
    with codecs.open('top250.txt', 'wb', encoding='utf-8') as f:
        while url:
            html = get_html(url)
            names, info, url = create_list(html)
            # Pair titles with descriptions instead of assuming exactly 25
            # entries per page, so a short page cannot raise IndexError.
            for name, detail in zip(names, info):
                # Plain concatenation keeps the text type of ``name``.
                f.write('Top ' + str(order) + ' ' + name + ' ')
                f.write(detail + ' ')
                order += 1


if __name__ == '__main__':
    main()