zoukankan      html  css  js  c++  java
  • Python 爬取大众点评店铺评论

      1 import parsel
      2 import pymysql
      3 from lxml import etree
      4 import re
      5 import requests
      def download_data(url, cookie):
          '''
          Download the obfuscated review page and the CSS/SVG files needed to decode it.

          Three kinds of files are written to the current working directory:
          the raw page (``01 原始网页_加密.html``), the stylesheet
          (``02 css样式.css``) and one ``03 svg对照表<name>.svg`` file per SVG
          glyph table referenced by the stylesheet, where ``<name>`` is the
          class prefix captured from the ``[class^="<name>"]`` selector.

          :param url: address of the review page to fetch
          :param cookie: value sent as the ``Cookie`` request header
          :return: None
          :raises IndexError: if the page has no s3plus.meituan stylesheet link
          :raises requests.RequestException: on network failure or timeout
          '''
          # Without a timeout a stalled connection would block the script forever.
          timeout = 10
          headers = {
              "Cookie": cookie,
              "Referer": "http://www.dianping.com/",
              "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
          }

          # Fetch the original (still obfuscated) page.
          page_html = requests.get(url=url, headers=headers, timeout=timeout).text
          with open('01 原始网页_加密.html', 'w', encoding='utf-8') as f:
              f.write(page_html)

          # Fetch the stylesheet that maps glyph classes to background offsets.
          css_links = re.findall('<link rel="stylesheet" type="text/css" href="(//s3plus.meituan.*?)">', page_html)
          if not css_links:
              # Same exception type the bare ``[0]`` lookup produced, with a usable message.
              raise IndexError('no s3plus.meituan stylesheet link found in the downloaded page')
          css_response = requests.get('https:' + css_links[0], timeout=timeout).text
          with open('02 css样式.css', 'w', encoding='utf-8') as f:
              f.write(css_response)

          # Fetch every SVG glyph table the stylesheet points at.
          # The brackets, caret, brace and parentheses are literal CSS characters
          # and therefore have to be escaped in the pattern.
          svg_refs = re.findall(r'\[class\^="(.*?)"\]\s*\{.*?background-image: url\((.*?)\);', css_response)
          for name, svg_path in svg_refs:
              svg_response = requests.get('https:' + svg_path, timeout=timeout).text
              with open(f'03 svg对照表{name}.svg', 'w', encoding='utf-8') as f:
                  f.write(svg_response)
     def crack_data(prefix='zpd', glyph_width=14):
         '''
         Build the mapping from obfuscated CSS class names to real characters.

         Reads ``03 svg对照表<prefix>.svg`` and ``02 css样式.css`` from the current
         working directory. Each SVG ``path`` supplies a row y coordinate (the
         second space-separated token of its ``d`` attribute) and the n-th
         ``textPath`` supplies the characters of the row whose path id is n.
         Each matching CSS rule supplies an (x, y) background offset; the
         character is taken from the first row whose y is not smaller than the
         offset's y, at position ``x // glyph_width``.

         :param prefix: class-name prefix of the glyph table to decode; it
             selects both the SVG file name and the CSS rules that are read
         :param glyph_width: horizontal pixel width of one glyph in the SVG
         :return: dict mapping CSS class name to its decoded character
         '''
         with open(f'03 svg对照表{prefix}.svg', 'r', encoding='utf-8') as f:
             svg_html = f.read()
         sel = parsel.Selector(svg_html)

         # path id -> y coordinate of that row
         path_dict = {}
         for path in sel.css('path'):
             path_dict[path.css('path::attr(id)').get()] = path.css('path::attr(d)').get().split(' ')[1]

         # y coordinate -> string of characters drawn on that row
         zpd_svg_dict = {}
         for count, text in enumerate(sel.css('textPath'), start=1):
             zpd_svg_dict[path_dict[str(count)]] = text.css('textPath::text').get()
         print(zpd_svg_dict)

         with open('02 css样式.css', 'r', encoding='utf-8') as f:
             css_html = f.read()

         # Whitespace-tolerant so it matches the stylesheet both as downloaded
         # and after it has been reformatted by an editor.
         css_pattern = (
             r'\.(' + re.escape(prefix) + r'[^\s{]*)\s*\{\s*'
             r'background:\s*-(\d+)\.0px\s+-(\d+)\.0px;\s*\}'
         )
         css_paths = re.findall(css_pattern, css_html)
         print(css_paths)

         # Sort rows by y so the lookup does not depend on the order in which
         # the SVG happened to list them.
         rows = sorted(
             ((int(row_y), row_text) for row_y, row_text in zpd_svg_dict.items()),
             key=lambda row: row[0],
         )
         last_map = {}
         for css_name, x, y in css_paths:
             index = int(x) // glyph_width
             for row_y, row_text in rows:
                 if int(y) <= row_y:
                     last_map[css_name] = row_text[index]
                     break
         return last_map
     85 
     def decryption(last_map):
         '''
         Return the saved page with every known ``<svgmtsi>`` glyph tag decoded.

         Reads ``01 原始网页_加密.html`` from the current working directory and
         replaces each ``<svgmtsi class="X"></svgmtsi>`` tag with ``last_map[X]``.
         A tag whose class is missing from ``last_map`` is left in place
         unchanged instead of raising ``KeyError``.

         :param last_map: dict mapping CSS class name to its decoded character
         :return: the page HTML as a string with the known glyph tags replaced
         '''
         with open('01 原始网页_加密.html', 'r', encoding='utf-8') as f:
             ret = f.read()

         def _decode(match):
             # group(0) is the whole tag; keeping it makes unknown classes a no-op.
             return last_map.get(match.group(1), match.group(0))

         # One pass over the document instead of one full replace per tag.
         return re.sub('<svgmtsi class="(.*?)"></svgmtsi>', _decode, ret)
    def write_data(ret):
        '''
        Extract the reviews from the decoded page and insert them into MySQL.

        For every review ``li`` under ``#review-list`` the reviewer name, star
        score, time, shop name and comment text are printed and inserted into
        the ``dianping`` table of the ``review`` database, committing after
        each row. The connection is closed even if a row fails.

        :param ret: decoded page HTML as a string
        :return: None
        :raises IndexError: if a review has no name link or no star-rating class
        '''
        # This div is not present on every review and would shift the
        # positional XPath indexes below, so it is removed up front.
        ret = ret.replace(' <div class="richtitle">消费后评价</div>', '')
        etre = etree.HTML(ret)
        li_list = etre.xpath('//*[@id="review-list"]/div[2]/div[3]/div[3]/div[3]/ul/li')

        def first_text(node, path):
            # First text result of an XPath query, stripped; '' when nothing matches.
            found = node.xpath(path)
            return found[0].strip() if found else ''

        sql = 'insert into dianping(name,score,time,shop_name,comment)values(%s,%s,%s,%s,%s)'

        # NOTE(review): connection settings and credentials are hard-coded here.
        db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, database='review',
                             charset='utf8mb4')
        try:
            with db.cursor() as cursor:
                for li in li_list:
                    name = li.xpath('./div[@class="main-review"]/div[1]/a/text()')[0].strip()
                    score = re.findall('sml-rank-stars sml-str(.*?) star', li.xpath('./div[1]/div[2]/span[1]/@class')[0])[0].strip()
                    # Queried relative to this review, so a review without the
                    # element cannot shift the values of the reviews after it.
                    review_time = first_text(li, './/div[@class="misc-info clearfix"]/span[1]/text()')
                    shop_name = first_text(li, './/div[@class="misc-info clearfix"]/span[2]/text()')
                    comment = ','.join([i.replace('\n', '').strip() for i in li.xpath('./div/div[4]/text()')])
                    print(name, score, review_time, shop_name, comment)
                    cursor.execute(sql, (name, score, review_time, shop_name, comment))
                    db.commit()
        finally:
            db.close()
    133 
    134 
    def main():
        '''
        Run the whole pipeline: download, build the glyph map, decode, store.

        Stops with exit status 1 as soon as the download or the glyph-map step
        fails, printing the original hint together with the actual exception,
        so later steps never run on stale or missing files.

        :return: None
        '''
        # The cookie expires from time to time and has to be refreshed by hand.
        cookie = "s_ViewType=10; _lxsdk_cuid=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _lxsdk=175e331ad79c8-0996df2d570671-46460e2a-1fa400-175e331ad79c8; _hc.v=c4dfac1c-01af-6a87-d803-2cd6b8db107a.1605834485; fspop=test; ctu=ef0b64e4cabf67f148563284ea8c8d0555a008f7ca0dee097831c90b52822812; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1605834487,1605835298,1606093773; cy=2; cye=beijing; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1606098153; expand=yes; _lxsdk_s=175f2cc7d23-6-9d5-75e%7C%7C532"
        # Review page of one shop; replace with any other shop's review URL.
        url = 'http://www.dianping.com/shop/130096343/review_all'

        try:
            download_data(url, cookie)
        except Exception as exc:
            # Too many visits trigger a captcha page, which is not handled.
            print('出现验证码验证', exc)
            raise SystemExit(1)

        try:
            map_dict = crack_data()
        except Exception as exc:
            print('css类属性发生变化', exc)
            raise SystemExit(1)

        write_data(decryption(map_dict))


    if __name__ == '__main__':
        main()
  • 相关阅读:
    矩阵乘法(二):利用矩阵快速幂运算完成递推
    更改codeblock编译后程序的图标
    如何在VS2008下使用FLTK
    Python type() 函数
    Python range() 函数用法
    Python len()方法
    Python filter() 函数
    Python bool() 函数
    数据类型
    JAVA标识符
  • 原文地址:https://www.cnblogs.com/shiguanggege/p/14035903.html
Copyright © 2011-2022 走看看