1. Douban
Scraping data from a single page
import requests
from lxml import etree

url = "https://movie.douban.com/cinema/nowplaying/yongzhou/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
req = requests.get(url=url, headers=headers)
text = req.text

dics = []
# Extract the scraped data according to fixed rules
html = etree.HTML(text)
ul = html.xpath("//ul[@class='lists']")[0]
# print(etree.tostring(ul, encoding='utf-8').decode('utf-8'))
lis = ul.xpath("./li")
for li in lis:
    title = li.xpath("@data-title")[0]
    score = li.xpath("@data-score")[0]      # the rating lives in the li's data-score attribute
    address = li.xpath("@data-region")[0]
    img_hai = li.xpath(".//img/@src")[0]    # poster image
    dic = {
        'title': title,
        'score': score,
        'address': address,
        'img': img_hai
    }
    dics.append(dic)
print(dics)
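This example only prints the result list. As a minimal sketch of persisting it, mirroring what the later examples do with write_to_file (the save_results helper and the douban.json filename are illustrative, not from the original code):

import json

# Hypothetical helper: dump the scraped list to a JSON file.
def save_results(records, path='douban.json'):  # path is an illustrative choice
    with open(path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps the Chinese titles readable in the output
        json.dump(records, f, ensure_ascii=False, indent=2)

save_results(dics)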
2. Dianying Tiantang (Movie Heaven)
Scraping data from multiple pages
import requests
import json
from lxml import etree

url = "http://www.dytt8.net"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Referer': 'http://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
}

def get_url(urls):
    response = requests.get(urls, headers=HEADERS)
    text = response.text                                           # fetch the list page
    html = etree.HTML(text)                                        # parse into an Element object that supports XPath
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")  # hrefs of every movie on the page
    detail_urls = map(lambda u: url + u, detail_urls)              # prepend the site root to each relative href
    return detail_urls

def parse_detail_url(de_ur):
    movie = {}
    response = requests.get(de_ur, headers=HEADERS)
    text = response.content.decode('gbk')  # the site serves GBK-encoded pages
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]  # movie title
    movie['title'] = title
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    img_hb = zoomE.xpath(".//img/@src")
    cover = img_hb[0]       # poster
    # sst = img_hb[1]       # movie screenshot
    movie['cover'] = cover
    # movie['sst'] = sst

    def parse_info(info, rule):
        return info.replace(rule, "").strip()  # drop the field label and surrounding whitespace

    infos = zoomE.xpath(".//text()")
    for index, info in enumerate(infos):  # enumerate yields (index, text) pairs: (0, str), (1, str), ...
        if info.startswith("◎片 名"):                      # each field line starts with its "◎" label
            movie['pian'] = parse_info(info, "◎片 名")     # strip the label, keep the value
        elif info.startswith("◎年 代"):
            movie['year'] = parse_info(info, "◎年 代")
        elif info.startswith("◎产 地"):
            movie['address'] = parse_info(info, "◎产 地")
        elif info.startswith("◎导 演"):
            movie['director'] = parse_info(info, "◎导 演")
        elif info.startswith("◎类 别"):
            movie['lb'] = parse_info(info, "◎类 别")        # 类别 = category
        elif info.startswith("◎豆瓣评分"):
            movie['db'] = parse_info(info, "◎豆瓣评分")     # Douban rating
        elif info.startswith("◎主 演"):
            info = parse_info(info, "◎主 演")
            actors = [info]  # the first actor shares the line with the label
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):  # the next "◎" label (the synopsis) ends the actor list
                    break
                actors.append(actor)
            movie['actors'] = actors
        elif info.startswith("◎简 介"):
            profiles = []
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                if profile.startswith("【"):  # the download section starts with "【"
                    break
                profiles.append(profile)
            movie['profile'] = "".join(profiles)  # join the synopsis lines into one string
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]  # download link
    movie['download_url'] = download_url
    return movie

def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')  # ensure_ascii=False keeps Chinese readable

def dianying():
    urld = "http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"  # the page number is filled in with .format()
    movies = []
    for x in range(1, 8):  # outer loop: walk the first 7 list pages
        print(x)
        urls = urld.format(x)
        if x == 5:  # page 5 raised an error (possibly an encoding issue); skipped here, see the sketch below
            continue
        detail_ur = get_url(urls)  # collect the detail links on this page
        write_to_file("第%s页" % x)
        for detail_url in detail_ur:  # inner loop: parse every detail page
            movie = parse_detail_url(detail_url)
            movies.append(movie)
            write_to_file(movie)

if __name__ == '__main__':
    dianying()
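The hard-coded skip of page 5 can be replaced by catching the error per detail page. A minimal sketch, assuming the failure is a UnicodeDecodeError raised by the GBK decode (that is a guess; the original note only suspects an encoding problem):

def parse_detail_url_tolerant(de_ur):
    # Sketch: wrap parse_detail_url so one malformed page does not abort the run.
    # UnicodeDecodeError is an assumption about what page 5 actually raised;
    # IndexError covers detail pages whose layout breaks the XPath lookups.
    try:
        return parse_detail_url(de_ur)
    except (UnicodeDecodeError, IndexError) as e:
        print("skipping %s: %s" % (de_ur, e))
        return None

With this in place, dianying() could drop the if x == 5 guard and simply skip None results.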
3. Tencent Recruitment
Much the same approach as the Dianying Tiantang code above
import requests
import json
from lxml import etree

url = "https://hr.tencent.com/"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

def get_url(urld):
    response = requests.get(urld, headers=HEADERS)
    text = response.text
    html = etree.HTML(text)
    detail_url = html.xpath("//tr[@class='even' or @class='odd']//a/@href")  # listing rows alternate between the two classes
    detail_url = map(lambda x: url + x, detail_url)  # prepend the site root to each relative href
    return detail_url

def parse_url(detail_url):
    dic = {}
    # print(detail_url)
    response = requests.get(detail_url, headers=HEADERS)
    text = response.text
    html = etree.HTML(text)
    title = html.xpath("//tr[@class='h']//td[@class='l2 bold size16']//text()")[0]
    dic['title'] = title
    # Approach 1 (rigid): index straight into the flattened text nodes
    address = html.xpath("//tr[@class='c bottomline']//td//text()")[1]
    dic['address'] = address
    # Approach 2 (cleaner): select the td elements first, then query each one
    tds = html.xpath("//tr[@class='c bottomline']//td")
    leibie = tds[1].xpath(".//text()")[1]  # job category
    dic['leibie'] = leibie
    nums = tds[2].xpath(".//text()")[1]    # number of openings
    dic['nums'] = nums
    gz = html.xpath("//ul[@class='squareli']")
    gzzz = gz[0].xpath(".//text()")  # 工作职责 = job responsibilities
    gzyq = gz[1].xpath(".//text()")  # 工作要求 = job requirements
    dic['工作职责'] = gzzz
    dic['工作要求'] = gzyq
    # print(dic)
    return dic

def write_to_file(content):
    with open('tengxun.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')  # ensure_ascii=False keeps Chinese readable

def tengxun():
    movies = []
    urls = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=87&start={}#a"
    for x in range(0, 501, 10):  # the start offset steps by 10: one listing page per iteration
        print(x)
        urld = urls.format(x)
        detail_urls = get_url(urld)
        for detail_url in detail_urls:
            movie = parse_url(detail_url)
            movies.append(movie)
        write_to_file(x)  # page-offset marker between batches
    write_to_file(movies)

if __name__ == '__main__':
    tengxun()
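Both crawlers fire requests in a tight loop. A minimal politeness sketch, adding a randomized pause before every request (the polite_get helper and the 0.5-1.5 second range are illustrative choices, not from the original code):

import time
import random

import requests

# Sketch: a drop-in replacement for the bare requests.get calls above.
# The delay range is an arbitrary illustration; tune it to the target site.
def polite_get(u, headers=None):
    time.sleep(random.uniform(0.5, 1.5))  # pause so requests are not back-to-back
    return requests.get(u, headers=headers)

get_url and parse_url could then call polite_get(urld, headers=HEADERS) instead of requests.get.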