zoukankan      html  css  js  c++  java
  • 爬取美团网数据

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    import requests
    
    from lib.re_util import ReUtil
    
    # Listing page that will be fetched.
    base_url = 'http://ns.meituan.com/meishi/b25710/'

    # Raw cookie string in "name=value; name2=value2" form; may be left empty.
    cookies_str = ''

    # Parse the cookie string into a dict suitable for requests' ``cookies=``.
    cookies_dict = {}
    for cookie in cookies_str.split(";"):
        # An empty cookie string (or a trailing ';') produces segments without
        # '='; unpacking those would raise ValueError, so skip them.
        if "=" not in cookie:
            continue
        # Split on the first '=' only: cookie values may contain '=' themselves.
        k, v = cookie.split("=", 1)
        cookies_dict[k.strip()] = v.strip()
    
    # Send a desktop-browser User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.98 Chrome/71.0.3578.98 Safari/537.36'
    }

    # Fetch the listing page. Without an explicit timeout requests waits
    # indefinitely on a stalled connection, so bound the wait (in seconds).
    page = requests.get(
        url=base_url,
        cookies=cookies_dict,
        headers=headers,
        timeout=10,
    )
    
    def get_element_from_html(raw_html):
        """Extract the text between '"poiInfos":' and '},"comHeader"' in raw_html.

        The pattern is built by ``ReUtil.get_regex``; ``findall`` returns one
        tuple of captured groups per match. The element at index 1 of the first
        tuple (the text following the begin marker) is printed.

        :param raw_html: page source to search.
        :return: the list returned by ``findall``; empty when the markers are
            not present, in which case nothing is printed.
        """
        regex = ReUtil.get_regex(begin_with=['"poiInfos":'], end_with=['},"comHeader"'])
        result = regex.findall(raw_html)
        # No match (e.g. the site returned a verification page): return the
        # empty list instead of raising IndexError on result[0].
        if not result:
            return result
        print(result[0][1])
        return result
    
    # Run the extraction on the decoded body of the response fetched above.
    get_element_from_html(page.text)

    ReUtil 这个工具其实也够用了,但还是建议使用 XPath 这类更规范的方法来解析 HTML。

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    import re
    
    
    class ReUtil:
        """Helpers for building, caching and applying simple regular expressions.

        ``get_regex`` assembles a pattern of the form
        ``(begin)(.*)?(contain)(.*)?(end)`` from literal fragments, where each
        fragment group is matched case-insensitively.
        """

        # Regex metacharacters that must be prefixed with a backslash when they
        # occur inside a literal fragment.
        need_escape = {
            '\\': True,
            '^': True,
            '$': True,
            '.': True,
            '*': True,
            '+': True,
            '?': True,
            '{': True,
            '}': True,
            '(': True,
            ')': True,
            '[': True,
            ']': True,
            '|': True,
        }
        # Cache of compiled patterns, keyed by the pattern source string.
        exits = {}

        @classmethod
        def get_regex(cls, begin_with=None, must_contain=None, end_with=None) -> 're.Pattern':
            """Return a compiled (and cached) pattern built from literal fragments.

            Each argument may be ``None``, a single string, or a list of
            alternative strings. The resulting pattern has five capturing
            groups: begin, text, must-contain, text, end. It is compiled with
            ``re.DOTALL`` so the text groups can span newlines.
            """
            begin_with = cls.conver_to_list(begin_with)
            must_contain = cls.conver_to_list(must_contain)
            end_with = cls.conver_to_list(end_with)

            pattern = ''.join([
                cls.list_to_restring(begin_with),
                '(.*)?',
                cls.list_to_restring(must_contain),
                '(.*)?',
                cls.list_to_restring(end_with),
            ])

            cached = cls.exits.get(pattern)
            if cached is not None:
                return cached
            regex_obj = re.compile(pattern, re.DOTALL)
            cls.exits[pattern] = regex_obj
            return regex_obj

        @classmethod
        def list_to_restring(cls, args: list) -> 'str':
            """Turn a list of literal strings into one capturing alternation group.

            Metacharacters listed in ``need_escape`` are backslash-escaped. The
            alternation is wrapped in a scoped ``(?i:...)`` so it ignores case;
            a bare ``(?i)`` in the middle of a pattern is rejected by Python
            3.11+, and the scoped form is non-capturing, so group numbering is
            unaffected. An empty list yields a group that matches the empty
            string.
            """
            alternatives = [
                ''.join('\\' + ch if ch in cls.need_escape else ch for ch in arg)
                for arg in args
            ]
            return '((?i:' + '|'.join(alternatives) + '))'

        @classmethod
        def conver_to_list(cls, value) -> 'list':
            """Normalise ``value`` to a list: falsy -> [], scalar -> [value]."""
            if not value:
                return []
            if isinstance(value, list):
                return value
            return [value]

        @classmethod
        def get_all_number_to_list(cls, string):
            """Return every integer or decimal number in ``string`` as strings."""
            # Raw string so the backslashes reach the regex engine intact.
            return re.findall(r'\d+\.?\d*', string)
  • 相关阅读:
    关于JsonObject的笔记
    addHeader() 与 setHeader() 区别
    BeanUtils.copyProperties(A,B)字段复制用法
    servletcontext的小结
    枚举笔记
    关于spring mvc接受前台参数的笔记
    关于session和cookie
    servlet学习
    tomcat到底是干嘛的
    .json文件报错 ,点进去是Expected value at 1:0
  • 原文地址:https://www.cnblogs.com/liuweimingcprogram/p/10472391.html
Copyright © 2011-2022 走看看