  • A simple Baike crawler

    Baike crawler

    Key idea: use a feature string associated with the keyword to find the Baike entry (entity) that best matches the keyword. The crawler first fetches the item page for the keyword directly and, if the feature string does not appear on that page, falls back to a Baike search for "keyword + feature".

    ==|  Fighting ~~
    
    import re
    from urllib.parse import quote, urljoin
    import requests
    from bs4 import BeautifulSoup
    from lxml import etree
    
    s1, s2 = '\u4e00', '\u9fa5'  # CJK range used to detect Chinese characters
    d1, d2 = '0', '9'
    po = ',。、;():.-():-'  # punctuation whitelist (note: get_str_baike uses its own local copy)
    
    
    def get_str_baike(s):
        # Normalize a Baike text value: drop reference markers such as "[1]", then keep
        # only Chinese characters, digits, ASCII letters and whitelisted punctuation.
        ans = ''
        s = re.sub(r'\[[^\[]*\]', '', s)
        pos = ',、;():.-():-'
        for ch in s:
            if (ch in pos or s1 <= ch <= s2 or d1 <= ch <= d2 or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'):
                ans += ch
        return ans
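    # Illustration (made-up input, not from the original post):
    #   get_str_baike('长春市市辖区[1],位于吉林省中部')
    # returns '长春市市辖区,位于吉林省中部' -- the "[1]" marker is removed and any
    # character outside the whitelist is dropped.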
    
    
    def craw_bk(key, feature=''):
        def rt_response(url):
            # Fetch the search page with a desktop User-Agent and return an lxml tree for XPath queries.
            sessions = requests.session()
            sessions.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
            html = sessions.get(url)
            html.encoding = 'utf-8'
            return etree.HTML(html.text)
    
        def get_raw_html(url, code='UTF-8'):
            # Fetch the raw HTML of an item page with a mobile User-Agent; return '' on failure.
            head = {
                'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36"
            }
            try:
                r = requests.get(url, headers=head)
                r.encoding = code
                html = r.text
            except BaseException:
                print("open error", url)
                return ""
            return html
    
        def get_key_val(html):
            # Parse an item page: collect the entry title, the infobox (dt/dd pairs)
            # and the summary paragraphs into a flat dict.
            ans = dict()
            soup = BeautifulSoup(html, 'lxml')
            dd = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
            if dd:
                ans['name'] = dd.find('h1').get_text()
            dt = soup.find_all('dt', class_='basicInfo-item name')
            dd = soup.find_all('dd', class_='basicInfo-item value')
            for i in range(len(dt)):
                s1 = dt[i].get_text().strip('\n')
                s2 = dd[i].get_text().strip('\n')
                s = ''.join([ch for ch in s1 if '\u4e00' <= ch <= '\u9fa5'])
                ans[s] = s2
                # print(f'{s}: {s2}')
            div = soup.find('div', class_='lemma-summary')
            if div:
                pa = div.find_all('div', class_='para')
                txt = '\n'.join([it.get_text() for it in pa])
                li = txt.strip('\n').split('\n')
                txt = '\n'.join([it for it in li if it != ''])  # drop blank lines
                ans['introduct'] = txt
            return ans
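        # The dict returned by get_key_val typically looks like
        #   {'name': ..., <infobox field>: ..., 'introduct': ...}
        # where the infobox field names depend on the concrete Baike page.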
    
        def search_find(key, feature):
            # Fall back to Baike's site search for "key + feature" and parse the first result.
            key = quote(key + feature)
            url = 'http://baike.baidu.com/search/none?word={}'.format(key)
            response = rt_response(url)
            hrefs = response.xpath('//a[@class="result-title"]/@href')
            if hrefs:
                href = urljoin(url, hrefs[0])
                url = href + '?noadapt=1'
                html = get_raw_html(url, code='UTF-8')
                ans = get_key_val(html)
                return ans
            else:
                return None
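        # For the example in __main__, craw_bk('朝阳区', feature='长春市') would search for
        # "朝阳区长春市" whenever the directly fetched item page does not mention the feature.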
        s = quote(key)
        url = 'http://baike.baidu.com/item/' + s + '?noadapt=1'
        html = get_raw_html(url)
        soup = BeautifulSoup(html, 'lxml')
        s = soup.find('div', class_="main-content")
        if s and feature in s.get_text():  # the feature string appears on the directly-resolved item page
            ans = get_key_val(html)
        else:
            ans = search_find(key, feature)  # otherwise fall back to a Baike search
        if ans is None:  # neither the item page nor the search produced a match
            return None
        for k, val in ans.items():  # normalize every field value
            ans[k] = get_str_baike(val)
        return ans
    
    
    if __name__ == '__main__':
        # Example from the original post: the Chaoyang District of Changchun.
        ans = craw_bk('朝阳区', feature='长春市')
        if ans:
            for key, val in ans.items():
                print(f'{key}:{val}')
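    Running the script prints one "key:value" line per extracted field: the entry name, each infobox field, and the 'introduct' summary. "朝阳区" is an ambiguous name (several cities have a Chaoyang District), which is exactly the ambiguity the feature parameter is meant to resolve.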
    
  • Original post: https://www.cnblogs.com/xidian-mao/p/11959829.html