"""Douban movie scraper.

Downloads the Douban "now playing" page addressed by ``url``, saves the raw
HTML to ``douban.html``, extracts one dict per ``<li>`` entry of the first
``<ul class="lists">`` element and prints each dict.
"""


import requests
from lxml import etree

# Step 1: download the target page.
headers = {
    'User-Agent': 'Mozilla/5.0',
}

url = 'https://movie.douban.com/cinema/nowplaying/shangrao/'

# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 10


def _first(node, path):
    """Return the first result of evaluating XPath *path* on *node*, or None."""
    results = node.xpath(path)
    return results[0] if results else None


def _parse_movie(li):
    """Build the movie dict for one ``<li>`` entry.

    Any attribute or child element that is absent from the markup yields
    ``None`` for that field instead of aborting the whole run.
    """
    release_date = _first(li, ".//li[@class='release-date']/text()")
    return {
        'title': _first(li, "@data-title"),
        'region': _first(li, "@data-region"),
        'director': _first(li, "@data-director"),
        'actors': _first(li, "@data-actors"),
        'duration': _first(li, "@data-duration"),
        'img': _first(li, ".//img/@src"),
        # strip() removes the whitespace surrounding the date text.
        'date': release_date.strip() if release_date is not None else None,
    }


response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail early on 4xx/5xx responses rather than saving and parsing an error page.
response.raise_for_status()

# response.text is a str decoded with the encoding requests guessed;
# response.content is the undecoded bytes. Decode explicitly as UTF-8 once so
# that the saved file and the parsed document are the same text.
text = response.content.decode('utf-8')
with open('douban.html', 'w', encoding='utf-8') as fp:
    fp.write(text)

# Step 2: extract the data according to the page structure.
html = etree.HTML(text)
lists = html.xpath("//ul[@class='lists']")
if not lists:
    # Same exception type the bare ``[0]`` lookup raised, with a usable message.
    raise IndexError("no <ul class='lists'> element found in the page from " + url)

movies = [_parse_movie(li) for li in lists[0].xpath("./li")]

for movie in movies:
    print(movie)