  • Crawler / anti-crawling / CSS anti-crawling: a Dianping (大众点评) price scraper

    Analysis of a Dianping scraper: Dianping obfuscates prices by rendering some digits as glyphs pulled from an SVG sprite via CSS background offsets, so the digits never appear as plain text in the HTML.

    To recover the prices, fetch the CSS that defines those offsets, parse out each glyph's (x, y) background position, and map it back to a character in the corresponding SVG sprite.
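    The decoding itself is just a little arithmetic. Here is a minimal sketch of that single step before the full script; the class name, pixel offsets, and row contents are made up for illustration and are not taken from a real Dianping page:

    # Minimal sketch of the decoding step (all values below are made up).
    # Suppose the CSS for one hidden glyph says:  .abcxyz { background: -84.0px -21.0px; }
    # and the matching SVG sprite contains rows of digits at y="23", y="46", ...
    x, y = -84.0, 21.0                               # offsets from the glyph's CSS rule (y taken as abs())
    rows = {23.0: '1234567890', 46.0: '0987654321'}  # y attribute -> row text (illustrative)

    # the ciphertext row is the first <text> row whose y attribute is >= the CSS y offset
    row = next(text for row_y, text in sorted(rows.items()) if y <= row_y)

    # each glyph is roughly 12px wide and shifted by about 7px, hence the index formula
    ind = int((x + 7) / (-12))                       # -> 6
    print(row[ind])                                  # -> '7'

    The script below applies this mapping to every hidden glyph on a listing page.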

    # -*- coding: utf-8 -*-
    """
    Created on Sat Apr 27 09:28:50 2019

    @author: Administrator
    """
    import re
    import requests
    from bs4 import BeautifulSoup

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'cy=13; cye=wuxi; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=16a5c669f37c8-00e87b02ef26eb-e323069-1fa400-16a5c669f38c8; _lxsdk=16a5c669f37c8-00e87b02ef26eb-e323069-1fa400-16a5c669f38c8; _hc.v=0ff28a55-7cc0-b228-f700-c5a9a02cf919.1556328391; s_ViewType=10; _lxsdk_s=16a5c669f39-2cc-cf2-eeb%7C%7C64',
        'Host': 'www.dianping.com',
        'Referer': 'https://www.dianping.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }

    # url = 'https://www.dianping.com/search/keyword/13/0_%E4%B8%87%E8%B1%A1%E5%9F%8E'  # 万象城
    url = 'https://www.dianping.com/search/keyword/13/0_%E4%B8%80%E7%82%B9%E7%82%B9'
    tem = requests.get(url, headers=headers)
    ls = tem.text
    soup = BeautifulSoup(ls, 'lxml')
    # //div[@class="shop-list J_shop-list shop-all-list"]
    linkes = soup.select('link[type="text/css"]')

    # Find the stylesheet that maps the hidden glyph classes to SVG sprites.
    for link in linkes:
        tem = link.attrs['href']
        if 'svg' in tem:
            svg_href = 'https:' + tem
            css_html = requests.get(svg_href).text  # content of the svgtextcss stylesheet

    # Collect the shop titles.
    titles = []
    pes = []
    title = soup.select('div[class="shop-list J_shop-list shop-all-list"] ul li div[class="tit"] a h4')
    for t in title:
        titles.append(t.get_text())


    def get_svg_url(svg_):
        """Return the SVG sprite URL for a glyph class; svg_ is the first two letters of the class name."""
        svg_url = "http:" + re.findall(r'class\^="{}".+?(//.+?svgtextcss.+?\.svg)'.format(svg_),
                                       '\n'.join(css_html.split('}')))[0]
        return svg_url


    # Extract the prices.
    comment = soup.select('div[class="shop-list J_shop-list shop-all-list"] ul li div[class="comment"]')
    for com in comment:
        tem_price = com.select('a')
        price = tem_price[1].select('b')
        svges = tem_price[1].select('b svgmtsi')
        if price == []:
            pes.append(' ')
            continue
        tem_pri1 = price[0]  # the part of the price shown in plain text
        str_pri1 = re.findall('<b>(.*?)</b>', str(tem_pri1))[0]  # strip the <b> wrapper
        pri1 = re.sub(r'<svgmtsi class=".*"></svgmtsi>', ' ', str_pri1)  # replace the hidden glyphs with a placeholder space
        pri2 = ''
        for svg in svges:
            tem_svg = svg.attrs['class'][0]  # class name of the hidden glyph
            # Pull the background offset out of the CSS; note the doubled braces inside format().
            xy_loc = re.findall('{}{{background:(.*?);}}'.format(tem_svg), css_html)[0].split(' ')
            svg_url = get_svg_url(tem_svg[:2])
            thr = requests.get(svg_url).text
            y = abs(float(xy_loc[1].replace('px', '')))  # y offset of the glyph
            sop = BeautifulSoup(thr, 'lxml')
            text = sop.select('text')
            for te in text:  # find the <text> row that holds the ciphertext
                index = float(te.attrs['y'])
                if y <= index:
                    miwen = te.get_text()
                    break
            # Decode x: each glyph is about 12px wide, so the offset gives the character index.
            x = float(xy_loc[0].replace('px', ''))  # x offset of the glyph
            ind = int((x + 7) / (-12))
            tem_pri = list(miwen)[ind]
            pri2 = pri2 + tem_pri
        pes.append(pri1.replace(' ', pri2))

    for i, j in zip(titles, pes):
        print(i, j)
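    One optional tweak, not part of the original post: the inner loop downloads the same SVG sprite once per hidden glyph. A small cache, sketched below with the same requests call the script already uses, fetches each sprite URL only once; `thr = requests.get(svg_url).text` in the loop would then become `thr = get_svg_text(svg_url)`.

    # Hypothetical helper, not in the original script: cache SVG sprites by URL
    # so repeated glyphs from the same sprite do not trigger new HTTP requests.
    import requests

    _svg_cache = {}

    def get_svg_text(svg_url):
        """Fetch an SVG sprite at most once and reuse the cached text afterwards."""
        if svg_url not in _svg_cache:
            _svg_cache[svg_url] = requests.get(svg_url).text
        return _svg_cache[svg_url]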
  • Original article: https://www.cnblogs.com/baili-luoyun/p/10780485.html