主要使用 lxml 的 etree 解析网页源码,并通过 XPath 提取所需的 HTML 标签内容。
代码如下:
#!/usr/bin/env python
# author:Simple-Sir
# time:2019/7/17 22:08
# Fetch the hottest comment of every movie now playing on Douban.
import requests
from lxml import etree

# Browser-like headers so the requests look like they come from a browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Referer': 'https://movie.douban.com/',
}
# "Now playing" listing page that the crawl starts from.
url = 'https://movie.douban.com/cinema/nowplaying/chengdu/'


def getUrlText(url):
    """Download *url* with the module-level headers and parse it with lxml.

    Args:
        url: Address of the page to fetch.

    Returns:
        The element tree produced by ``etree.HTML`` for the response text.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    respons = requests.get(url, headers=headers, timeout=10)
    return etree.HTML(respons.text)


def getWallUrl(url):
    """Collect the title and detail-page link of every listed movie.

    Args:
        url: Address of the listing page.

    Returns:
        A list of single-entry dicts mapping a movie title (the ``<li>``'s
        ``data-title`` attribute) to the first ``href`` found inside that
        ``<li>``. The list is empty when the page has no ``<ul class="lists">``.
    """
    page = getUrlText(url)
    movieLists = page.xpath('//ul[@class="lists"]')
    if not movieLists:
        # Unexpected page layout: report "no movies" instead of crashing
        # with an IndexError.
        return []

    liHrefs = []
    # Only the first matching <ul> is used.
    for li in movieLists[0].xpath('./li'):
        hrefs = li.xpath('.//@href')
        titles = li.xpath('@data-title')
        if not hrefs or not titles:
            # Skip list items that carry no link or no title.
            continue
        liHrefs.append({titles[0]: hrefs[0]})
    return liHrefs


def downPL(url):
    """Append the hottest comment of each listed movie to ``豆瓣评论.txt``.

    For every movie returned by ``getWallUrl`` the detail page is fetched and
    the first text node of the hot-comments section is written: the expanded
    ("hide-item full") text when present, otherwise the short text, otherwise
    a "no comments yet" record.

    Args:
        url: Address of the listing page.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    n = 0
    for movie in getWallUrl(url):
        n += 1
        for title, link in movie.items():
            plHtml = getUrlText(link)
            plTextFull = plHtml.xpath('//div[@id="hot-comments"]//span[@class="hide-item full"]//text()')
            plTextShort = plHtml.xpath('//div[@id="hot-comments"]//span[@class="short"]//text()')

            if plTextFull:
                # Use the expanded text itself; indexing the short list here
                # would save the wrong text or fail when that list is empty.
                comment = plTextFull[0]
            elif plTextShort:
                comment = plTextShort[0]
            else:
                comment = None

            # NOTE(review): records are separated only by the trailing space
            # in these format strings - confirm whether a newline was intended.
            if comment is None:
                record = '{}、《{}》暂无评论! '.format(n, title)
            else:
                record = '{}、《{}》的最热评论是: {} '.format(n, title, comment)

            print('正在写入《{}》的评论。'.format(title))
            with open('豆瓣评论.txt', 'a', encoding='utf-8') as fp:
                fp.write(record)
    print('{}部电影的所有评论已全部写入“豆瓣评论.txt”,请查看。'.format(n))


if __name__ == '__main__':
    downPL(url)
执行效果:
文件详情: