Dianping (大众点评) crawler analysis: Dianping obfuscates prices by drawing some characters from SVG sprite images, positioned through CSS background offsets.
To decode them, intercept the CSS the page loads, parse each glyph class's offset, look the character up in the matching SVG, and splice it back into the text.
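For context before the full script: each hidden character in the page is an empty <svgmtsi class="..."> tag. The intercepted CSS maps that class to a background offset such as background:-36.0px -20.0px;, and the SVG file it points at holds rows of glyphs inside <text> elements. The y offset selects the row, the x offset the column. A minimal sketch of that lookup, with made-up rows and offsets (the 12px cell width matches the constant used in the script below; decode and svg_rows are names I'm introducing for illustration):

# Minimal sketch of the offset-to-character lookup, using made-up data.
# Real rows and offsets come from the intercepted CSS/SVG, as in the full script.
svg_rows = [
    (41.0, '9518372046'),  # (y attribute of a <text> element, its glyph row)
    (83.0, '0749261835'),
]

def decode(x_px, y_px):
    """Map a CSS background offset (negative px values) to one character."""
    y = abs(y_px)
    for row_y, chars in svg_rows:
        if y <= row_y:                            # first row whose baseline covers y
            return chars[int((x_px + 7) / -12)]   # 12px cells; +7 lands mid-cell
    raise ValueError('offset outside the sprite')

# e.g. background:-36.0px -20.0px;  ->  row y=41, column 2  ->  '1'
print(decode(-36.0, -20.0))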
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 27 09:28:50 2019
@author: Administrator
"""
import re
import requests
from bs4 import BeautifulSoup

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'cy=13; cye=wuxi; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=16a5c669f37c8-00e87b02ef26eb-e323069-1fa400-16a5c669f38c8; _lxsdk=16a5c669f37c8-00e87b02ef26eb-e323069-1fa400-16a5c669f38c8; _hc.v=0ff28a55-7cc0-b228-f700-c5a9a02cf919.1556328391; s_ViewType=10; _lxsdk_s=16a5c669f39-2cc-cf2-eeb%7C%7C64',
    'Host': 'www.dianping.com',
    'Referer': 'https://www.dianping.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

#url = 'https://www.dianping.com/search/keyword/13/0_%E4%B8%87%E8%B1%A1%E5%9F%8E'  # 万象城
url = 'https://www.dianping.com/search/keyword/13/0_%E4%B8%80%E7%82%B9%E7%82%B9'  # search keyword: 一点点
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.text, 'lxml')

# Find the stylesheet that maps glyph classes to SVG offsets
links = soup.select('link[type="text/css"]')
for link in links:
    href = link.attrs['href']
    if 'svg' in href:
        svg_href = 'https:' + href
css_html = requests.get(svg_href).text  # contents of the svg-mapping CSS

# Shop titles
titles = []
prices = []
title = soup.select('div[class="shop-list J_shop-list shop-all-list"] ul li div[class="tit"] a h4')
for t in title:
    titles.append(t.get_text())

def get_svg_url(svg_):
    """Given the first two letters of a glyph class, find its SVG file in the CSS."""
    svg_url = 'http:' + re.findall(
        r'class\^="{}".+?(//.+?svgtextcss.+?\.svg)'.format(svg_),
        ' '.join(css_html.split('}')))[0]
    return svg_url

# Prices
comment = soup.select('div[class="shop-list J_shop-list shop-all-list"] ul li div[class="comment"]')
for com in comment:
    tem_price = com.select('a')
    price = tem_price[1].select('b')
    svges = tem_price[1].select('b svgmtsi')
    if price == []:
        prices.append(' ')
        continue
    tem_pri1 = price[0]                                      # the visible part of the price
    str_pri1 = re.findall('<b>(.*?)</b>', str(tem_pri1))[0]  # strip the <b> wrapper
    # The greedy .* deliberately collapses the whole run of <svgmtsi> tags into
    # a single space, which is replaced below with the decoded digits.
    pri1 = re.sub(r'<svgmtsi class=".*"></svgmtsi>', ' ', str_pri1)
    pri2 = ''
    for svg in svges:
        tem_svg = svg.attrs['class'][0]  # the glyph's class name
        # Pull this class's background offset out of the CSS (note the doubled {{}} for .format)
        xy_loc = re.findall('{}{{background:(.*?);}}'.format(tem_svg), css_html)[0].split(' ')
        svg_url = get_svg_url(tem_svg[:2])
        thr = requests.get(svg_url).text
        y = abs(float(xy_loc[1].replace('px', '')))  # y offset
        sop = BeautifulSoup(thr, 'lxml')
        text = sop.select('text')
        for te in text:  # find which <text> row the y offset falls into
            index = float(te.attrs['y'])
            if y <= index:
                miwen = te.get_text()  # the row of cipher characters
                break
        # Decode x: which column (character) of the row, with 12px glyph cells
        x = float(xy_loc[0].replace('px', ''))  # x offset
        ind = int((x + 7) / (-12))
        pri2 = pri2 + list(miwen)[ind]
    prices.append(pri1.replace(' ', pri2))

for i, j in zip(titles, prices):
    print(i, j)
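One easy improvement: the script above re-downloads the same SVG file once per hidden glyph. Caching the fetch keeps the logic identical while cutting requests; a sketch (fetch_svg is a name I'm introducing, not from the original):

import functools
import requests

@functools.lru_cache(maxsize=None)
def fetch_svg(svg_url):
    """Download each SVG sprite once and reuse it for later glyphs."""
    return requests.get(svg_url).text

Inside the glyph loop, thr = requests.get(svg_url).text then becomes thr = fetch_svg(svg_url). Note also that the Cookie header and the class-to-SVG mapping were captured in April 2019; Dianping rotates these, so expect to refresh the cookie and re-check the selectors before running.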