    Learning Web Scraping from Scratch (8): Using requests, pymysql, and beautifulsoup4 to scrape Wikipedia entry links and store them in a database

    Reference documentation:

    https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/

    # Install beautifulsoup4

    (pytools) D:\python\pytools>pip install beautifulsoup4

    Install the MySQL module

    PyMySQL repository: https://github.com/PyMySQL/PyMySQL
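
    Install it with pip (the same command appears again in the environment-prep comment inside the second script below):

    pip install pymysql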

    Scrape Wikipedia entries

    # coding=utf-8
    
    from bs4 import BeautifulSoup
    import requests
    import re
    
    
    def spider_wike():
        url = "https://en.wikipedia.org/wiki/Main_Page"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
        resp = requests.get(url, headers=headers)
        # decode the response as UTF-8
        resp.encoding = 'utf-8'
    
        html_doc = resp.text
    
        soup = BeautifulSoup(html_doc, "html.parser")
        # find every <a> tag whose href starts with /wiki/
        list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
        # print(list_urls)
    
        # print each entry's name and its full URL
        for url in list_urls:
            # skip image links ending in .jpg or .JPG
            if not re.search(r"\.(jpg|JPG)$", url["href"]):
                # entry name plus URL
                # .string only returns a single string; get_text() returns all the text inside the tag
                print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])
    
    
    if __name__ == '__main__':
        spider_wike()
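
    If you want to keep the scraped pairs around instead of only printing them (for example, to feed the database step in the next section), here is a minimal sketch of the same logic that returns a list of (name, url) tuples; the function name collect_wiki_links and the shortened User-Agent string are introduced here for illustration and are not part of the original script:

    # coding=utf-8

    from bs4 import BeautifulSoup
    import requests
    import re


    def collect_wiki_links():
        # same request and parsing steps as spider_wike() above
        url = "https://en.wikipedia.org/wiki/Main_Page"
        headers = {"User-Agent": "Mozilla/5.0"}
        resp = requests.get(url, headers=headers)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        links = []
        for a in soup.find_all("a", href=re.compile("^/wiki/")):
            # same .jpg/.JPG filter as above
            if not re.search(r"\.(jpg|JPG)$", a["href"]):
                links.append((a.get_text(), "https://en.wikipedia.org" + a["href"]))
        return links


    if __name__ == '__main__':
        for name, href in collect_wiki_links():
            print(name, " <------>", href)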

    # Store the Wikipedia entry links in the database

    # coding=utf-8
    
    from bs4 import BeautifulSoup
    import requests
    import re
    import pymysql.cursors
    
    
    ''' 
        # environment preparation
        pip install pymysql
        create database wikiurl charset=utf8mb4;
        use wikiurl;
        create table urls (id int primary key auto_increment,urlname varchar(255),urlhref varchar(1000));
    '''
    url = "https://en.wikipedia.org/wiki/Main_Page"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    # decode the response as UTF-8
    resp.encoding = 'utf-8'
    
    html_doc = resp.text
    
    soup = BeautifulSoup(html_doc, "html.parser")
    # find every <a> tag whose href starts with /wiki/
    list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(list_urls)
    
    # for each entry, print its name and full URL, then insert it into the database
    for url in list_urls:
        # skip image links ending in .jpg or .JPG
        if not re.search(r"\.(jpg|JPG)$", url["href"]):
            # entry name plus URL
            # .string only returns a single string; get_text() returns all the text inside the tag
            print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])
    
            connection = pymysql.connect(host='localhost',
                                         user='root',
                                         password='root',
                                         db='wikiurl',
                                         charset='utf8mb4')
            try:
                # get a session cursor
                with connection.cursor() as cursor:
                    # build the SQL insert statement
                    sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
    
                    # execute the SQL statement
                    cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
                    # commit the transaction
                    connection.commit()
            finally:
                connection.close()
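
    Note that the script above opens and closes a new MySQL connection for every single link, which works but is wasteful. A minimal alternative sketch (assuming the same wikiurl database and urls table from the environment-prep comment; the function name save_links is introduced here for illustration) opens one connection and inserts all rows in a single batch with executemany:

    import pymysql


    def save_links(links):
        # links: a list of (name, url) tuples, e.g. from collect_wiki_links() above
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='root',
                                     db='wikiurl',
                                     charset='utf8mb4')
        try:
            with connection.cursor() as cursor:
                sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
                # one batched round of inserts instead of one connection per row
                cursor.executemany(sql, links)
            connection.commit()
        finally:
            connection.close()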

    # Read the entry records back from the database

    # coding=utf-8
    
    import pymysql
    
    
    def get_conn():
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='root',
                                     db='wikiurl',
                                     charset='utf8mb4')
        return connection
    
    
    def get_wiki_data():
        conn = get_conn()
    
        sql = "select `urlname`,`urlhref` from urls"
        cur = conn.cursor()
        # get the total number of matching records
        count = cur.execute(sql)
        print(count)
    
    
        # fetch all rows
        # urllists = cur.fetchall()
        # fetch a given number of rows
        # urllists = cur.fetchmany(3)
        #
        # for url in urllists:
        #     print(url[0], '<--->', url[1])
    
        # fetch a single row
        link = cur.fetchone()
        print(link)
    
        # close the database connection
        conn.close()
    
    
    def get_data():
        conn = get_conn()
    
        try:
            with conn.cursor() as cur:
                sql = "select `urlname`,`urlhref` from urls where `id` is not NULL"
                count = cur.execute(sql)
                print(count)
    
                # fetch all rows
                # data = cur.fetchall()
                # print(data)
    
                # fetch a given number of rows
                result = cur.fetchmany(size=5)
                print(result)
        finally:
            conn.close()
    
    
    if __name__ == '__main__':
        # get_wiki_data()
        get_data()
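
    As a side note, pymysql can also return each row as a dict instead of a tuple, which makes the columns easier to address by name. A small sketch of a get_conn() variant (the name get_dict_conn is introduced here for illustration) using DictCursor:

    import pymysql
    import pymysql.cursors


    def get_dict_conn():
        # same connection parameters as get_conn(), but rows come back as dicts
        return pymysql.connect(host='localhost',
                               user='root',
                               password='root',
                               db='wikiurl',
                               charset='utf8mb4',
                               cursorclass=pymysql.cursors.DictCursor)


    # usage sketch:
    # conn = get_dict_conn()
    # try:
    #     with conn.cursor() as cur:
    #         cur.execute("select `urlname`,`urlhref` from urls")
    #         for row in cur.fetchmany(size=5):
    #             print(row['urlname'], '<--->', row['urlhref'])
    # finally:
    #     conn.close()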
    Original article: https://www.cnblogs.com/reblue520/p/11200086.html