zoukankan      html  css  js  c++  java
  • Python 爬取大众点评店铺评论

      1 import parsel
      2 import pymysql
      3 from lxml import etree
      4 import re
      5 import requests
      def download_data(url, cookie, timeout=10):
          """Download the obfuscated review page and the files needed to decode it.

          The following files are written to the current working directory:

          * ``01 原始网页_加密.html`` - the review page exactly as returned.
          * ``02 css样式.css`` - the first ``s3plus.meituan`` stylesheet linked
            from that page.
          * ``03 svg对照表<prefix>.svg`` - one file per CSS rule of the form
            ``[class^="<prefix>"]{... background-image: url(...); ...}``.

          :param url: address of the review page to fetch.
          :param cookie: value sent as the ``Cookie`` request header.
          :param timeout: seconds to wait for each HTTP request before giving up.
          :raises ValueError: if the page contains no matching stylesheet link
              (the caller treats any failure here as a verification page).
          :raises requests.RequestException: on network errors or timeouts.
          :return: None
          """
          headers = {
              "Cookie": cookie,
              "Referer": "http://www.dianping.com/",
              "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
          }

          # Raw page: glyphs are still <svgmtsi> placeholders at this point.
          page_html = requests.get(url=url, headers=headers, timeout=timeout).text
          with open('01 原始网页_加密.html', 'w', encoding='utf-8') as f:
              f.write(page_html)

          # Stylesheet that maps each placeholder class to a background offset.
          css_links = re.findall(
              r'<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">',
              page_html)
          if not css_links:
              # Fail with a clear message instead of an IndexError on css_links[0].
              raise ValueError('no s3plus.meituan stylesheet link found in the page')
          css_text = requests.get('https:' + css_links[0], timeout=timeout).text
          with open('02 css样式.css', 'w', encoding='utf-8') as f:
              f.write(css_text)

          # SVG lookup tables. The brackets, caret, brace and parentheses are
          # regex metacharacters and must be escaped so that exactly two groups
          # (class prefix, protocol-relative SVG URL) are captured per rule.
          # [^}]*? keeps the search inside a single CSS rule.
          svg_rules = re.findall(
              r'\[class\^="(.*?)"\]\s*\{[^}]*?background-image:\s*url\((.*?)\);',
              css_text)
          for prefix, svg_path in svg_rules:
              svg_text = requests.get('https:' + svg_path, timeout=timeout).text
              with open(f'03 svg对照表{prefix}.svg', 'w', encoding='utf-8') as f:
                  f.write(svg_text)
     def crack_data(svg_path='03 svg对照表zpd.svg', css_path='02 css样式.css',
                    class_prefix='zpd', font_size=14):
         """Build the mapping from obfuscated CSS class names to real characters.

         The SVG file holds rows of text: each ``<path id="N" d="M0 Y H600"/>``
         supplies the y coordinate ``Y`` of row ``N`` (second space-separated
         token of ``d``), and the N-th ``<textPath>`` in document order supplies
         that row's characters.  Each CSS rule
         ``.<class> { background: -X.0px -Y.0px; }`` selects the first row whose
         y is not smaller than ``Y`` and, inside it, the character at index
         ``X // font_size``.

         :param svg_path: SVG lookup file; the name depends on the class prefix
             the site currently uses.
         :param css_path: stylesheet saved by the download step.
         :param class_prefix: prefix shared by the glyph class names.
         :param font_size: horizontal width in px of one glyph in the SVG rows.
         :raises Exception: file, lookup and index errors propagate unchanged;
             the caller reports them as a change of the CSS class attributes.
         :return: dict mapping class name -> single character.
         """
         with open(svg_path, 'r', encoding='utf-8') as f:
             svg_html = f.read()
         sel = parsel.Selector(svg_html)

         # path id -> y coordinate of the row (still a string here).
         path_y = {
             path.css('path::attr(id)').get(): path.css('path::attr(d)').get().split(' ')[1]
             for path in sel.css('path')
         }
         # textPath elements are paired with path ids 1, 2, 3, ... by position.
         rows = [
             (int(path_y[str(number)]), text.css('textPath::text').get())
             for number, text in enumerate(sel.css('textPath'), start=1)
         ]
         # The lookup below needs ascending y; do not rely on document order.
         rows.sort(key=lambda row: row[0])

         with open(css_path, 'r', encoding='utf-8') as f:
             css_text = f.read()

         # \d and the literal dot/braces must be escaped; \s* makes the pattern
         # work for both minified and pretty-printed stylesheets.
         rule = re.compile(
             r'\.(' + re.escape(class_prefix) + r'\w*)\s*\{\s*'
             r'background:\s*-(\d+)\.0px\s+-(\d+)\.0px;\s*\}')

         last_map = {}
         for css_name, x, y in rule.findall(css_text):
             column = int(x) // font_size
             offset_y = int(y)
             for row_y, row_text in rows:
                 if offset_y <= row_y:
                     last_map[css_name] = row_text[column]
                     break
         return last_map
     85 
     def decryption(last_map, html_path='01 原始网页_加密.html'):
         """Return the saved page with ``<svgmtsi>`` placeholders replaced by text.

         Every ``<svgmtsi class="X"></svgmtsi>`` whose class ``X`` is a key of
         ``last_map`` is replaced by ``last_map[X]``.  Placeholders with an
         unknown class are left untouched instead of raising ``KeyError``, so an
         empty or partial map still yields a usable document.

         :param last_map: dict mapping placeholder class name -> character.
         :param html_path: file holding the obfuscated page.
         :return: the page source as a string after substitution.
         """
         with open(html_path, 'r', encoding='utf-8') as f:
             ret = f.read()

         def _glyph(match):
             # Fall back to the original tag text when the class is not mapped.
             return last_map.get(match.group(1), match.group(0))

         # One pass over the document instead of one str.replace per occurrence.
         return re.sub(r'<svgmtsi class="(.*?)"></svgmtsi>', _glyph, ret)
    def write_data(ret, db_config=None):
        """Extract the reviews from the decoded page and insert them into MySQL.

        For each ``<li>`` under the review list the reviewer name, star score,
        review time, shop name and comment text are collected, printed, and
        inserted into table ``dianping``.  A field that is absent from a review
        is stored as an empty string rather than aborting the whole run.  All
        rows are committed together; the connection is always closed.

        :param ret: decoded page source.
        :param db_config: optional dict of ``pymysql.connect`` keyword arguments
            that override the built-in defaults.
        :return: None
        """
        # This div is present only on some reviews and would shift the
        # positional XPath expressions below, so drop it up front.
        ret = ret.replace(' <div class="richtitle">消费后评价</div>', '')
        tree = etree.HTML(ret)
        li_list = tree.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/ul/li')

        rows = []
        for li in li_list:
            name = _first_text(li.xpath('./div[@class="main-review"]/div[1]/a/text()'))
            star_class = _first_text(li.xpath('./div[1]/div[2]/span[1]/@class'))
            star = re.search(r'sml-rank-stars sml-str(.*?) star', star_class)
            score = star.group(1).strip() if star else ''
            # Relative paths (".//") keep each value tied to its own <li>; an
            # absolute "//" query indexed by a counter drifts as soon as one
            # review lacks the element.
            review_time = _first_text(
                li.xpath('.//div[@class="misc-info clearfix"]/span[1]/text()'))
            shop_name = _first_text(
                li.xpath('.//div[@class="misc-info clearfix"]/span[2]/text()'))
            comment = ','.join(
                piece.replace('\n', '').strip()
                for piece in li.xpath('./div/div[4]/text()'))
            print(name, score, review_time, shop_name, comment)
            rows.append((name, score, review_time, shop_name, comment))

        # NOTE(review): credentials are hard-coded defaults; pass db_config to
        # supply real ones instead of editing this file.
        config = {
            'host': 'localhost', 'user': 'root', 'password': '123456',
            'port': 3306, 'database': 'review', 'charset': 'utf8mb4',
        }
        if db_config:
            config.update(db_config)

        sql = 'insert into dianping(name,score,time,shop_name,comment)values(%s,%s,%s,%s,%s)'
        db = pymysql.connect(**config)
        try:
            with db.cursor() as cursor:
                if rows:
                    cursor.executemany(sql, rows)
            db.commit()
        finally:
            # Close the connection even when an insert fails.
            db.close()


    def _first_text(nodes, default=''):
        """Return the first XPath string result stripped, or ``default`` if none."""
        return nodes[0].strip() if nodes else default
    133 
    134 
    if __name__ == '__main__':
        # The cookie expires from time to time and has to be refreshed by hand.
        cookie = "s_ViewType=10; _lxsdk_cuid=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _lxsdk=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _hc.v=c4dfac1c-01af-6a87-d803-2cd6b8db107a.1605834485; fspop=test; ctu=ef0b64e4cabf67f148563284ea8c8d0555a008f7ca0dee097831c90b52822812; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1605834487,1605835298,1606093773; cy=2; cye=beijing; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1606098153; expand=yes; _lxsdk_s=175f2cc7d23-6-9d5-75e%7C%7C532"
        # Review page of one shop; replace with any other shop's review URL.
        url = 'http://www.dianping.com/shop/130096343/review_all'

        # Each stage depends on the files or values produced by the previous
        # one, so stop at the first failure instead of continuing with stale
        # files or an empty mapping.  The caught exception is printed next to
        # the hint so the real cause is not lost.
        try:
            download_data(url, cookie)
        except Exception as exc:
            # Too many requests trigger a verification page, which is not handled.
            print('出现验证码验证', exc)
            raise SystemExit(1)

        try:
            map_dict = crack_data()
        except Exception as exc:
            print('css类属性发生变化', exc)
            raise SystemExit(1)

        ret = decryption(map_dict)
        write_data(ret)
  • 相关阅读:
    CSS盒子模型
    getContextPath、getServletPath、getRequestURI、request.getRealPath的区别
    MYSQL中的CASE WHEN END AS
    单点登录的精华总结
    git&github
    June 21st 2017 Week 25th Wednesday
    June 20th 2017 Week 25th Tuesday
    June 19th 2017 Week 25th Monday
    June 18th 2017 Week 25th Sunday
    June 17th 2017 Week 24th Saturday
  • 原文地址:https://www.cnblogs.com/shiguanggege/p/14035903.html
Copyright © 2011-2022 走看看