"""Fetch the Douban movie chart page and print one record per listed movie."""

from lxml import etree
import requests

url = "https://movie.douban.com/chart"
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

# Alternative whole-page queries, kept for reference (run them on the parsed
# document returned by etree.HTML):
#
# 1. href of the <a> inside each div with class "pl2" under the tables:
#    "//div[@class = 'indent']/div/table//div[@class='pl2']/a/@href"
# 2. src of every poster image, written two ways:
#    "//div[@class = 'indent']/div/table/tr/td/a/img/@src"
#    "//div[@class = 'indent']/div/table//a[@class='nbg']/img/@src"
#    Both target the same images; when the element of interest carries a
#    class attribute, "//" plus a class predicate is the shorter way to
#    reach it.


def _fetch_html(page_url, request_headers, timeout=10):
    """Download ``page_url`` and return the response body as text.

    Args:
        page_url: Address of the page to download.
        request_headers: HTTP headers sent with the request.
        timeout: Seconds to wait before giving up on the server.

    Returns:
        The response body decoded with ``bytes.decode()``'s default (UTF-8).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx status code.
    """
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(page_url, headers=request_headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the chart.
    response.raise_for_status()
    return response.content.decode()


def _parse_movie(table):
    """Build the record dict for one movie ``<table>`` element.

    Args:
        table: An lxml element for a single table of the chart listing.

    Returns:
        A dict with keys ``title``, ``url``, ``img``, ``grade`` and
        ``comment``. ``title`` and ``img`` are strings, or ``None`` when the
        table has no matching node; the other three are the raw XPath result
        lists.
    """
    title_nodes = table.xpath(".//div[@class='pl2']/a/text()")
    img_nodes = table.xpath(".//a[@class='nbg']/img/@src")
    return {
        # First text node of the title link, with "/" removed and the
        # surrounding whitespace stripped.
        "title": title_nodes[0].replace("/", "").strip() if title_nodes else None,
        # Link(s) to the movie page.
        "url": table.xpath(".//a[@class='nbg']/@href"),
        # Poster image address.
        "img": img_nodes[0] if img_nodes else None,
        # Rating text.
        "grade": table.xpath(".//div[@class='star clearfix']/span[@class = 'rating_nums']/text()"),
        # Text of the span with class "pl" inside the rating block.
        "comment": table.xpath(".//div[@class = 'star clearfix']/span[@class = 'pl']/text()"),
    }


def main():
    """Download the chart page, parse it, and print each movie record."""
    html_str = _fetch_html(url, headers)
    html = etree.HTML(html_str)  # parse the HTML text into an element tree
    print(html)

    # 3. One <table> per movie; each is queried with relative XPaths.
    tables = html.xpath("//div[@class = 'indent']/div/table")
    print(tables)
    for table in tables:
        print(_parse_movie(table))


if __name__ == "__main__":
    main()
# 这样就能解析出电影的信息了。