工具:
Python 3.5
requests、BeautifulSoup(bs4)
步骤:
1、根据url抓取豆瓣电影html,并解析
2、BeautifulSoup截取节点,写入字典
3、保存字典信息
# -*- coding: utf-8 -*-
# Scrape the movies currently playing in Shenzhen from Douban's "now playing"
# page and save the extracted records to a text file.
import json
import os

import requests
from bs4 import BeautifulSoup

# Output dictionary key -> HTML data attribute on each <li class="list-item">.
_MOVIE_FIELDS = (
    ('电影名', 'data-title'),
    ('评分', 'data-score'),
    ('时长', 'data-duration'),
    ('主演', 'data-actors'),
)


def getHTMLText(url):
    """Send a GET request to *url* and return the response body as text.

    The text is decoded with the encoding detected from the content
    (``apparent_encoding``) rather than the one declared in the headers.

    Returns:
        The decoded page text, or an empty string if the request fails
        (connection error, timeout, invalid URL or non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: callers treat "" as "nothing fetched".
        return ""
    r.encoding = r.apparent_encoding
    return r.text


def getMovieInfo(mlist, html):
    """Parse *html* and append one dict per now-playing movie to *mlist*.

    Args:
        mlist: List that is extended in place with the extracted records.
        html: Page source as returned by ``getHTMLText``; may be empty.

    Each record maps the keys '电影名', '评分', '时长' and '主演' to the
    values of the corresponding ``data-*`` attributes. An attribute that is
    absent on an item yields an empty string instead of raising.
    """
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('li', attrs={'class': 'list-item'})
    for item in items:
        # Tag.get returns None for a missing attribute, so list items
        # without a category are skipped rather than raising KeyError.
        if item.get('data-category') != 'nowplaying':
            continue
        mlist.append({key: item.get(attr, '') for key, attr in _MOVIE_FIELDS})


def saveMovieInfo(mlist, path):
    """Write the string representation of *mlist* to *path* as UTF-8 text.

    The parent directory of *path* is created if it does not exist yet.
    An existing file is overwritten.
    """
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(mlist))


def main():
    """Fetch the Shenzhen now-playing page, extract the movies and save them."""
    mlist = []
    url = 'https://movie.douban.com/cinema/nowplaying/shenzhen/'
    path = 'D://pachong//movie.txt'
    html = getHTMLText(url)
    # Quick sanity check: a length of 0 means the download failed.
    print(len(html))
    getMovieInfo(mlist, html)
    print()
    saveMovieInfo(mlist, path)


if __name__ == '__main__':
    main()