import re

import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

headers = {
    'Cookie': 'did_close_tag=; __mta=53728023.1611284104076.1611284104076.1611284104076.1; _lxsdk_cuid=17692643552c8-0f6aedeca7c4d3-3e604809-1fa400-17692643552c8; _hc.v=b32e7d08-accb-d713-b51b-e9f3f91b5edf.1609120457; s_ViewType=10; ua=dpuser_9207645725; fspop=test; cy=160; cye=zhengzhou; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; aburl=1; cityInfo=%7B%22cityId%22%3A160%2C%22cityName%22%3A%22%E9%83%91%E5%B7%9E%22%2C%22provinceId%22%3A0%2C%22parentCityId%22%3A0%2C%22cityOrderId%22%3A0%2C%22isActiveCity%22%3Afalse%2C%22cityEnName%22%3A%22zhengzhou%22%2C%22cityPyName%22%3Anull%2C%22cityAreaCode%22%3Anull%2C%22cityAbbrCode%22%3Anull%2C%22isOverseasCity%22%3Afalse%2C%22isScenery%22%3Afalse%2C%22TuanGouFlag%22%3A0%2C%22cityLevel%22%3A0%2C%22appHotLevel%22%3A0%2C%22gLat%22%3A0%2C%22gLng%22%3A0%2C%22directURL%22%3Anull%2C%22standardEnName%22%3Anull%7D; Hm_lvt_dbeeb675516927da776beeb1d9802bd4=1611198575; ll=7fd06e815b796be3df069dec7836c3df; ctu=86aab2ab6c4756757274f60b38f21950d48955d7ee9953f6a5bf87aae9d763a6; uamo=15824770183; uuid=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; iuuid=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; _lxsdk=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; _ga=GA1.2.1895823352.1611212247; _gid=GA1.2.769218074.1611212247; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1609120457,1610938415,1611198481,1611284068; Hm_lpvt_dbeeb675516927da776beeb1d9802bd4=1611284104; Hm_lvt_4c4fc10949f0d691f3a2cc4ca5065397=1611284126; Hm_lpvt_4c4fc10949f0d691f3a2cc4ca5065397=1611284126; lgtoken=0151ee32a-ad64-45a6-a153-c86aff4f5e61; dper=b2c6d5e2034c2abae310adab2004594fbefa99da8183294a9df2da90639ea0cf955b9cd7b4b84ff2dbce1b799ec44c8d671a35eaae9ffec36642947e930bf49a5424e4880315715ea0c2aee51c0ba850f547ad046acf78b7142d64e6bab72319; dplet=4731b96715688d19463cb7d344f97171; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1611284182; _lxsdk_s=1772803b3fb-22f-c42-633%7C%7C411',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}


def get_font_dict():
    """
    Fetch the position info for each svgmtsi class, i.e. the pixel offsets
    each obfuscated glyph maps to, and collect them into a dict.
    :return: dict mapping class name -> [x_offset, y_offset]
    """
    url = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/9188e234976e64ada3d82aaf266d6f52.css'
    r = requests.get(url, headers=headers)
    font_list = re.findall(r'.*?\{.*?\}', r.text)
    font_dict = {}
    for font in font_list:
        class_font = font.split('{')[0].replace('.', '')
        # pianyi = 偏移 (offset): pull the numbers out of e.g. 'background:-14.0px -46.0px;'
        pianyi_list = re.findall(r'\d+', font.split('{')[1])
        num_list = [int(x) for x in pianyi_list if int(x) != 0]
        if len(num_list) == 1:
            # the x offset was 0 and got filtered out above; put it back
            num_list.insert(0, 0)
        font_dict[class_font] = num_list
    return font_dict


def get_font_place():
    """
    Fetch the glyph rows from the SVG so the character a CSS offset points at
    can be looked up; a dict keyed by each row's y coordinate works best.
    :return: dict mapping y coordinate (str) -> row text
    """
    url = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/fa364e43fd811d8b108e34479d0cc4a0.svg'
    place_dict = dict()
    # headers must be a keyword argument, otherwise it is sent as query params
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    for text_node in soup.find_all('text'):
        place_dict[text_node['y']] = text_node.text
    return place_dict


def main():
    """
    Main routine.
    1.html is a saved Dianping detail page. Each svgmtsi tag's class attribute
    maps, via the CSS, to a coordinate pair (a, b):
      - b locates the row: b + 23 is the y value of the SVG row the glyph
        lives in (I have no idea why the offset is 23);
      - each glyph is 14px wide, so a / 14 is the glyph's index in that row.
    Grab every <svgmtsi class=".*?"></svgmtsi> with a regex, decode it to the
    real character, and substitute it back into the HTML to recover the text.
    """
    font_dict = get_font_dict()
    place_dict = get_font_place()
    with open('1.html', 'r', encoding='utf8') as f:
        html = f.read()
    result = re.findall('(<svgmtsi class=".*?"></svgmtsi>)', html)
    for svgmtsi in result:
        doc = pq(svgmtsi).attr('class')
        doc_values = font_dict[doc]
        index_num = doc_values[1] + 23  # y offset + 23 -> the SVG row's y attribute
        try:
            text = place_dict[str(index_num)]
        except KeyError:
            continue
        text_index = doc_values[0] // 14  # glyph width is 14px -> column index in the row
        string = text[text_index]
        html = html.replace(svgmtsi, string)
    h_doc = pq(html)
    comments = h_doc('.reviews-items li').items()
    for comment in comments:
        comment_text1 = comment('.review-truncated-words').text()
        comment_text2 = comment('.review-words.Hide').text()
        comment_text = comment_text1 + comment_text2
        # strip the '展开评价' (expand review) / '收起评价' (collapse review) widget labels and spaces
        comment_text = comment_text.replace('展开评价', '').replace('收起评价', '').replace(' ', '')
        if comment_text:
            print(comment_text)
            print('==========================================')


if __name__ == '__main__':
    main()
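To make the (a, b) arithmetic in main() concrete, here is a minimal standalone sketch that decodes a single glyph. The class name, CSS rule, and SVG row text are all made up for illustration; only the parsing and arithmetic mirror the code above.

import re

# Hypothetical CSS rule of the shape get_font_dict() parses:
sample_rule = '.kxivu{background:-14.0px -46.0px;}'

class_name = sample_rule.split('{')[0].replace('.', '')  # 'kxivu'
nums = re.findall(r'\d+', sample_rule.split('{')[1])     # ['14', '0', '46', '0']
offsets = [int(x) for x in nums if int(x) != 0]          # [14, 46] -> (a, b)

row_y = offsets[1] + 23      # 46 + 23 = 69 -> look for <text y="69"> in the SVG
col = offsets[0] // 14       # 14 // 14 = 1 -> second character in that row

svg_row = '这家店的菜品味道不错环境也很好'  # hypothetical <text y="69"> content
print(class_name, '->', svg_row[col])      # kxivu -> 家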
That, in a nutshell, is Dianping's review font encryption.
It's a simple demo with no full crawling code, only the part that decodes the font encryption; I couldn't get past the IP + cookie anti-scraping anyway. So frustrating!
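One more caveat for anyone who does get past it: the CSS and SVG URLs hardcoded above rotate over time. As far as I can tell, the stylesheet is linked from the detail page itself and the SVG is referenced from inside that stylesheet, so a full scraper would look them up at runtime. The sketch below rests on that assumption about the page structure, and both helper names and regexes are my own, not something taken from or verified against a live page.

import re

def find_svgtextcss_url(html):
    """Best-effort lookup of the rotating svgtextcss stylesheet in a saved page (assumed structure)."""
    m = re.search(r'//s3plus\.meituan\.net/v1/[^"\']+svgtextcss[^"\']+\.css', html)
    return 'http:' + m.group(0) if m else None

def find_svg_url(css_text):
    """The SVG the offsets index into is (assumed to be) referenced from the CSS via url(...)."""
    m = re.search(r'url\((//s3plus\.meituan\.net/[^)]+\.svg)\)', css_text)
    return 'http:' + m.group(1) if m else None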