zoukankan      html  css  js  c++  java
  • 爬虫3 html解析器 html_parser.py

    #coding:utf8
    import urlparse
    from bs4 import BeautifulSoup
    import re
    
    # Module-level metadata naming the author of this parser module.
    __author__ = 'wang'
    
    
    class HtmlParser(object):
        """Extract follow-up links and entry data from a downloaded HTML page.

        ``parse`` is the public entry point; the underscore-prefixed methods
        are helpers that operate on an already-built BeautifulSoup tree.
        """

        # Links of the form ``/view/<digits>.htm``. The backslashes matter:
        # without them ``d+`` matches literal "d" characters and ``.`` matches
        # any character, so real numeric entry links would never be found.
        _LEMMA_HREF_RE = re.compile(r"/view/\d+\.htm")

        def parse(self, page_url, html_cont):
            """Parse one page and return what was found on it.

            :param page_url: URL the page was fetched from; used as the base
                for resolving relative links and stored in the result data.
            :param html_cont: page markup, either a byte string or text.
            :returns: ``None`` when either argument is ``None``; otherwise a
                ``(new_urls, new_data)`` tuple where ``new_urls`` is a set of
                absolute URLs and ``new_data`` is a dict with the keys
                ``'url'``, ``'title'`` and ``'summary'``.
            """
            if page_url is None or html_cont is None:
                return None

            # ``from_encoding`` only applies to undecoded bytes; passing it
            # together with already-decoded text makes Beautiful Soup ignore
            # it and emit a warning.
            if isinstance(html_cont, bytes):
                soup = BeautifulSoup(html_cont, 'html.parser',
                                     from_encoding='utf-8')
            else:
                soup = BeautifulSoup(html_cont, 'html.parser')

            new_urls = self._get_new_urls(page_url, soup)
            new_data = self._get_new_data(page_url, soup)
            return new_urls, new_data

        def _get_new_urls(self, page_url, soup):
            """Return the set of absolute URLs of matching ``<a>`` links.

            Only anchors whose ``href`` matches ``_LEMMA_HREF_RE`` are kept;
            each one is resolved against ``page_url``.
            """
            links = soup.find_all('a', href=self._LEMMA_HREF_RE)
            return {urlparse.urljoin(page_url, link['href']) for link in links}

        def _get_new_data(self, page_url, soup):
            """Return a dict with the page's ``url``, ``title`` and ``summary``.

            The title is read from the ``<h1>`` inside
            ``<dd class="lemmaWgt-lemmaTitle-title">`` and the summary from
            ``<div class="lemma-summary">``. When a node is absent the
            corresponding value is an empty string instead of the lookup
            failing with ``AttributeError`` on ``None``.
            """
            title_box = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
            title_node = title_box.find('h1') if title_box is not None else None
            summary_node = soup.find('div', class_='lemma-summary')

            return {
                'url': page_url,
                'title': self._text_of(title_node),
                'summary': self._text_of(summary_node),
            }

        @staticmethod
        def _text_of(node):
            """Return ``node.get_text()``, or ``''`` when ``node`` is ``None``."""
            if node is None:
                return ''
            return node.get_text()
  • 相关阅读:
    普通平衡树(treap与splay模板)
    NOIP2009T4 靶形数独
    单调队列模板
    NOIP2010引水入城
    数差
    NOIP2016DAY2T1 组合数问题
    NOIP2016 D2T3 愤怒的小鸟
    NOIP双栈排序
    膜拜
    斐波那契数列
  • 原文地址:https://www.cnblogs.com/brady-wang/p/6115804.html
Copyright © 2011-2022 走看看