import re
from pyquery import PyQuery as pq
import requests
from bs4 import BeautifulSoup
# Request headers sent to the Dianping/Meituan endpoints.
# NOTE(review): the Cookie below is a hard-coded personal login session and
# will expire — replace it with a fresh cookie before running.
headers = {
 'Cookie': 'did_close_tag=; __mta=53728023.1611284104076.1611284104076.1611284104076.1; _lxsdk_cuid=17692643552c8-0f6aedeca7c4d3-3e604809-1fa400-17692643552c8; _hc.v=b32e7d08-accb-d713-b51b-e9f3f91b5edf.1609120457; s_ViewType=10; ua=dpuser_9207645725; fspop=test; cy=160; cye=zhengzhou; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; aburl=1; cityInfo=%7B%22cityId%22%3A160%2C%22cityName%22%3A%22%E9%83%91%E5%B7%9E%22%2C%22provinceId%22%3A0%2C%22parentCityId%22%3A0%2C%22cityOrderId%22%3A0%2C%22isActiveCity%22%3Afalse%2C%22cityEnName%22%3A%22zhengzhou%22%2C%22cityPyName%22%3Anull%2C%22cityAreaCode%22%3Anull%2C%22cityAbbrCode%22%3Anull%2C%22isOverseasCity%22%3Afalse%2C%22isScenery%22%3Afalse%2C%22TuanGouFlag%22%3A0%2C%22cityLevel%22%3A0%2C%22appHotLevel%22%3A0%2C%22gLat%22%3A0%2C%22gLng%22%3A0%2C%22directURL%22%3Anull%2C%22standardEnName%22%3Anull%7D; Hm_lvt_dbeeb675516927da776beeb1d9802bd4=1611198575; ll=7fd06e815b796be3df069dec7836c3df; ctu=86aab2ab6c4756757274f60b38f21950d48955d7ee9953f6a5bf87aae9d763a6; uamo=15824770183; uuid=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; iuuid=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; _lxsdk=DA34EA3FD6269A8A12E4F0DC658618076D55A2D5DB5F0F9BA006E145EBA0165B; _ga=GA1.2.1895823352.1611212247; _gid=GA1.2.769218074.1611212247; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1609120457,1610938415,1611198481,1611284068; Hm_lpvt_dbeeb675516927da776beeb1d9802bd4=1611284104; Hm_lvt_4c4fc10949f0d691f3a2cc4ca5065397=1611284126; Hm_lpvt_4c4fc10949f0d691f3a2cc4ca5065397=1611284126; lgtoken=0151ee32a-ad64-45a6-a153-c86aff4f5e61; dper=b2c6d5e2034c2abae310adab2004594fbefa99da8183294a9df2da90639ea0cf955b9cd7b4b84ff2dbce1b799ec44c8d671a35eaae9ffec36642947e930bf49a5424e4880315715ea0c2aee51c0ba850f547ad046acf78b7142d64e6bab72319; dplet=4731b96715688d19463cb7d344f97171; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1611284182; _lxsdk_s=1772803b3fb-22f-c42-633%7C%7C411',
 # BUG FIX: the UA previously began with "MMozilla" (typo), which makes it an
 # unrecognized browser signature; the canonical prefix is "Mozilla/5.0".
 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
def get_font_dict():
    """
    Download the Dianping CSS sheet and map each ``svgmtsi`` class name
    to its background-offset pair.

    Each CSS rule has the shape ``.clsname{...<num>px ...<num>px...}``;
    the two numbers are the pixel offsets of the glyph inside the SVG
    sheet. Zero values are dropped, and when only one non-zero offset
    remains a leading 0 is re-inserted so every entry is a 2-item list.

    :return: dict mapping class name -> [x_offset, y_offset] (magnitudes only;
             the sign of negative CSS offsets is not captured)
    """
    url = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/9188e234976e64ada3d82aaf266d6f52.css'
    r = requests.get(url, headers=headers)
    font_dict = {}
    for rule in re.findall(r'.*?{.*?}', r.text):
        class_font = rule.split('{')[0].replace('.', '')
        # BUG FIX: the original pattern 'd+' matched literal runs of the
        # letter "d" — never digits — so no offsets were ever extracted.
        pianyi_list = re.findall(r'\d+', rule.split('{')[1])
        # Drop zeros (the ".0" fractional parts of offsets like "-14.0px").
        num_list = [int(x) for x in pianyi_list if int(x) != 0]
        if len(num_list) == 1:
            # Only one non-zero offset found: the missing axis is 0.
            num_list.insert(0, 0)
        font_dict[class_font] = num_list
    return font_dict
def get_font_place():
    """
    Download the SVG glyph sheet and map each ``<text>`` row's ``y``
    coordinate to the string of characters on that row, so a CSS offset
    can be resolved to a concrete character.

    :return: dict mapping y coordinate (str, as found in the SVG) -> row text
    """
    url = 'http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/fa364e43fd811d8b108e34479d0cc4a0.svg'
    place_dict = dict()
    # BUG FIX: headers was previously passed positionally, which requests.get
    # interprets as the ``params`` (query-string) argument — the HTTP headers
    # were never sent. It must be passed as the ``headers=`` keyword.
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    for text_tag in soup.find_all('text'):
        place_dict[text_tag['y']] = text_tag.text
    return place_dict
def main():
    """
    Main routine: ``1.html`` is a saved Dianping detail page.

    For each ``<svgmtsi class="...">`` placeholder, look up its (a, b)
    offsets from the CSS. ``b + 23`` selects the glyph row (the SVG
    rows appear to be keyed 23px apart — origin of the constant is
    unexplained in the original). Each glyph is 14px wide, so
    ``a // 14`` is the column index within that row. Every placeholder
    is replaced with its decoded character, then the review texts are
    extracted and printed.
    """
    font_dict = get_font_dict()
    place_dict = get_font_place()
    with open('1.html', 'r', encoding='utf8') as f:
        html = f.read()
    for svgmtsi in re.findall('(<svgmtsi class=".*?"></svgmtsi>)', html):
        doc = pq(svgmtsi).attr('class')
        doc_values = font_dict[doc]
        index_num = doc_values[1] + 23
        try:
            text = place_dict[str(index_num)]
        except KeyError:
            # Row not present in the SVG sheet: leave the placeholder as-is.
            continue
        text_index = int(int(doc_values[0]) / 14)
        html = html.replace(svgmtsi, text[text_index])
    h_doc = pq(html)
    for comment in h_doc('.reviews-items li').items():
        comment_text1 = comment('.review-truncated-words').text()
        comment_text2 = comment('.review-words.Hide').text()
        comment_text = comment_text1 + comment_text2
        # BUG FIX: the original chained .replace() contained a raw newline
        # inside a single-quoted string literal (a SyntaxError); the intent
        # was to strip newline characters with '\n'.
        comment_text = (comment_text.replace('展开评价', '')
                        .replace('收起评价', '')
                        .replace('\n', ''))
        if comment_text:
            print(comment_text)
            print('==========================================')
# Run the decoder only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# This is Dianping's (大众点评) comment font-encryption scheme.
# Just a simple demo — there is no complete crawling code here, only the
# font-decryption part; I couldn't get past the IP + cookie anti-scraping
# anyway. So frustrating!