#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Fetch a Meituan listing page and pull out the text between two JSON markers."""
import requests

from lib.re_util import ReUtil

base_url = 'http://ns.meituan.com/meishi/b25710/'

# Raw ``Cookie`` header value in "k1=v1; k2=v2" form. Left empty here; paste a
# real value if the site requires a logged-in session.
cookies_str = ''

cookies_dict = {}
for cookie in cookies_str.split(";"):
    # An empty ``cookies_str`` (or a trailing ";") yields fragments without
    # "=", which used to crash the tuple unpacking below -- skip them.
    k, sep, v = cookie.partition("=")
    if not sep:
        continue
    cookies_dict[k.strip()] = v.strip()

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36'
}

page = requests.get(
    url=base_url,
    cookies=cookies_dict,
    headers=headers
)


def get_element_from_html(raw_html):
    """Find the text between ``"poiInfos":`` and ``},"comHeader"`` in *raw_html*.

    The pattern is built by ``ReUtil.get_regex`` from those two literal
    markers. When at least one match exists, the second capture group of the
    first match (the text right after the begin marker) is printed.

    Args:
        raw_html: Page source to search.

    Returns:
        The list produced by ``regex.findall``; empty when the markers are
        not present (previously this case raised ``IndexError``).
    """
    regex = ReUtil.get_regex(begin_with=['"poiInfos":'], end_with=['},"comHeader"'])
    result = regex.findall(raw_html)
    if result:
        print(result[0][1])
    return result


get_element_from_html(page.text)
ReUtil:这个工具其实已经够用了,但还是建议使用 XPath 这类更规范的方法来处理 HTML。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re


class ReUtil:
    """Build and cache simple "begin ... middle ... end" regular expressions.

    All marker strings are treated as literals (regex metacharacters are
    escaped) and are matched case-insensitively.
    """

    # Regex metacharacters that need a leading backslash to match literally.
    # Kept as a dict (rather than a set) so existing lookups keep working.
    need_escape = {
        '\\': True,
        '^': True,
        '$': True,
        '.': True,
        '*': True,
        '+': True,
        '?': True,
        '{': True,
        '}': True,
        '(': True,
        ')': True,
        '[': True,
        ']': True,
        '|': True,
    }

    # Cache of compiled patterns keyed by their pattern string.
    exits = {}

    @classmethod
    def get_regex(cls, begin_with=None, must_contain=None, end_with=None) -> 're.Pattern':
        """Return a compiled pattern matching begin, middle and end markers.

        Each argument may be ``None``, a single string, or a list of
        alternative strings. The resulting pattern has five capture groups:
        begin marker, text before the middle marker, middle marker, text
        after the middle marker, end marker. An omitted marker becomes an
        empty group. The pattern is compiled with ``re.DOTALL`` so the free
        text groups may span lines.

        Compiled patterns are memoised in ``cls.exits``; asking for the same
        markers twice returns the same object.
        """
        pattern = ''.join((
            cls.list_to_restring(cls.conver_to_list(begin_with)),
            '(.*)?',
            cls.list_to_restring(cls.conver_to_list(must_contain)),
            '(.*)?',
            cls.list_to_restring(cls.conver_to_list(end_with)),
        ))

        regex_obj = cls.exits.get(pattern)
        if regex_obj is None:
            regex_obj = re.compile(pattern, re.DOTALL)
            cls.exits[pattern] = regex_obj
        return regex_obj

    @classmethod
    def list_to_restring(cls, args: list) -> 'str':
        """Turn a list of literal strings into one capturing alternation group.

        Characters listed in ``need_escape`` are backslash-escaped. The
        alternation is wrapped in a scoped ``(?i:...)`` group: a bare
        ``(?i)`` in the middle of a pattern is rejected by Python 3.11+
        ("global flags not at the start of the expression").

        An empty list yields a group that matches the empty string.
        """
        alternatives = []
        for arg in args:
            alternatives.append(''.join(
                '\\' + ch if ch in cls.need_escape else ch
                for ch in arg
            ))
        return '((?i:' + '|'.join(alternatives) + '))'

    @classmethod
    def conver_to_list(cls, value) -> 'list':
        """Normalise *value* to a list.

        Falsy values (``None``, ``''``, ``[]``) become ``[]``; a list is
        returned as-is; anything else is wrapped in a one-element list.
        """
        if not value:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @classmethod
    def get_all_number_to_list(cls, string):
        """Return every integer or decimal number in *string*, as strings."""
        return re.findall(r'\d+\.?\d*', string)