"""Print the title and poster URL of each movie on Douban's now-playing page.

The script fetches the Langfang "nowplaying" listing, parses it with lxml and
prints, for every ``<li>`` in the first ``<ul class="lists">`` element, the
``data-title`` attribute values followed by the ``src`` values of its images.
"""
import requests
from lxml import html

etree = html.etree

# Target page and the browser-like headers sent with the request.
url = 'https://movie.douban.com/cinema/nowplaying/langfang/'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    ),
    'Referer': 'https://movie.douban.com/',
}

# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
resp.raise_for_status()

# resp.text is the decoded str body; resp.content holds the raw bytes.
# The parsed tree is named ``page`` so it does not shadow the imported
# ``lxml.html`` module.
page = etree.HTML(resp.text)
print(type(page))

# xpath() returns a list of matches; guard against the container being absent
# so the failure is explicit rather than a bare IndexError.
movie_lists = page.xpath("//ul[@class='lists']")
if not movie_lists:
    raise SystemExit("no <ul class='lists'> element found in the page")

# Only the first matching list is scanned.
# Tip: etree.tostring(node, encoding='utf-8').decode('utf-8') dumps a node's
# markup when debugging the selectors.
ul = movie_lists[0]
lis = ul.xpath("./li")

for li in lis:
    # Both lookups yield lists (possibly empty), printed as-is.
    name = li.xpath("@data-title")
    print(name)
    img = li.xpath(".//img/@src")
    print(img)
爬取豆瓣电影中正在上映的电影名称及其海报 URL。
了解了正则表达式、lxml 与 bs4 三者之间的区别及各自的优缺点。