Learning Web Scraping from Scratch (8): Crawling Wikipedia Entry Links with requests and beautifulsoup4 and Storing Them in a Database
Using Python with requests and beautifulsoup4 to crawl Wikipedia entry links and store them in a database
Reference documentation:
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
# Install beautifulsoup4
(pytools) D:\python\pytools>pip install beautifulsoup4
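A quick way to confirm the install worked is to parse a throwaway HTML fragment (a minimal sketch; the HTML string below is just a placeholder, not part of the crawler):

# coding=utf-8
from bs4 import BeautifulSoup

# A tiny inline document, only used to verify that bs4 imports and parses
html_doc = "<html><body><a href='/wiki/Python'>Python</a></body></html>"
soup = BeautifulSoup(html_doc, "html.parser")
print(soup.a["href"])      # /wiki/Python
print(soup.a.get_text())   # Python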
Install the MySQL module (pymysql)
pymysql repository: https://github.com/PyMySQL/PyMySQL
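Before wiring pymysql into the crawler, it helps to run a quick connection test against the local MySQL server (a sketch; the host, user, password and the wikiurl database are assumptions that match the credentials used later in this post):

# coding=utf-8
import pymysql

# Assumed local credentials; adjust to your own MySQL setup
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='root',
                             db='wikiurl',
                             charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        # Ask the server for its version to prove the connection works
        cursor.execute("select version()")
        print(cursor.fetchone())
finally:
    connection.close()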
Crawl Wikipedia entries
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import re


def spider_wiki():
    url = "https://en.wikipedia.org/wiki/Main_Page"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    # Decode the response as UTF-8
    resp.encoding = 'utf-8'
    html_doc = resp.text

    soup = BeautifulSoup(html_doc, "html.parser")
    # Find all <a> tags whose href starts with /wiki/
    list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
    # print(list_urls)

    # Print every entry name together with its full URL
    for url in list_urls:
        # Skip URLs that end with .jpg or .JPG (note the escaped dot and the $ anchor)
        if not re.search(r"\.(jpg|JPG)$", url["href"]):
            # .string only works when the tag has a single child; get_text() returns all text inside the tag
            print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])


if __name__ == '__main__':
    spider_wiki()
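The Main Page links to the same entry more than once, so the output above contains duplicates. One way to print each link only once is to remember the hrefs already seen in a set (a sketch building on the script above; the dedup step is an addition, not part of the original code):

# coding=utf-8
from bs4 import BeautifulSoup
import requests
import re

url = "https://en.wikipedia.org/wiki/Main_Page"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser")

seen = set()
for a in soup.find_all("a", href=re.compile("^/wiki/")):
    href = a["href"]
    # Skip images and anything already printed
    if re.search(r"\.(jpg|JPG)$", href) or href in seen:
        continue
    seen.add(href)
    print(a.get_text(), " <------>", "https://en.wikipedia.org" + href)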
# Store the Wikipedia entry links in the database
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import re
import pymysql.cursors

'''
# Environment setup
pip install pymysql
create database wikiurl charset=utf8mb4;
use wikiurl;
create table urls (id int primary key auto_increment, urlname varchar(255), urlhref varchar(1000));
'''

url = "https://en.wikipedia.org/wiki/Main_Page"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
resp = requests.get(url, headers=headers)
# Decode the response as UTF-8
resp.encoding = 'utf-8'
html_doc = resp.text

soup = BeautifulSoup(html_doc, "html.parser")
# Find all <a> tags whose href starts with /wiki/
list_urls = soup.find_all("a", href=re.compile("^/wiki/"))
# print(list_urls)

connection = pymysql.connect(host='localhost',
                             user='root',
                             password='root',
                             db='wikiurl',
                             charset='utf8mb4')
try:
    # Get a cursor for this connection
    with connection.cursor() as cursor:
        sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
        # Store every entry name and its full URL
        for url in list_urls:
            # Skip URLs that end with .jpg or .JPG
            if not re.search(r"\.(jpg|JPG)$", url["href"]):
                # .string only works when the tag has a single child; get_text() returns all text inside the tag
                print(url.get_text(), " <------>", "https://en.wikipedia.org" + url["href"])
                # Insert one row per entry: name plus full URL
                cursor.execute(sql, (url.get_text(), "https://en.wikipedia.org" + url["href"]))
    # Commit all inserted rows
    connection.commit()
finally:
    connection.close()
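Calling execute() once per link means one round-trip per row. pymysql's executemany() can insert a whole batch in a single call instead (a sketch, assuming the (name, href) tuples have already been collected by a loop like the one above; the sample row is only a placeholder):

# coding=utf-8
import pymysql

# Rows collected by the crawling loop, e.g. (placeholder data)
rows = [("Python (programming language)", "https://en.wikipedia.org/wiki/Python_(programming_language)")]

connection = pymysql.connect(host='localhost', user='root', password='root',
                             db='wikiurl', charset='utf8mb4')
try:
    with connection.cursor() as cursor:
        sql = "insert into `urls`(`urlname`,`urlhref`) values(%s,%s)"
        # One executemany call instead of one execute per row
        cursor.executemany(sql, rows)
    connection.commit()
finally:
    connection.close()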
# Read the entry data back from the database
# coding=utf-8
import pymysql


def get_conn():
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='root',
                                 db='wikiurl',
                                 charset='utf8mb4')
    return connection


def get_wiki_data():
    conn = get_conn()
    sql = "select `urlname`,`urlhref` from urls"
    cur = conn.cursor()
    # execute() returns the total number of matched rows
    count = cur.execute(sql)
    print(count)

    # Fetch all rows
    # urllists = cur.fetchall()
    # Fetch a given number of rows
    # urllists = cur.fetchmany(3)
    #
    # for url in urllists:
    #     print(url[0], '<--->', url[1])

    # Fetch a single row
    link = cur.fetchone()
    print(link)

    # Close the database connection
    conn.close()


def get_data():
    conn = get_conn()
    try:
        with conn.cursor() as cur:
            sql = "select `urlname`,`urlhref` from urls where `id` is not NULL"
            count = cur.execute(sql)
            print(count)

            # Fetch all rows
            # data = cur.fetchall()
            # print(data)

            # Fetch the first five rows
            result = cur.fetchmany(size=5)
            print(result)
    finally:
        conn.close()


if __name__ == '__main__':
    # get_wiki_data()
    get_data()
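By default pymysql returns each row as a tuple, which is why the code above indexes columns by position. Passing cursorclass=pymysql.cursors.DictCursor makes every row a dict keyed by column name (a small sketch reusing the same wikiurl table and credentials):

# coding=utf-8
import pymysql
import pymysql.cursors

conn = pymysql.connect(host='localhost', user='root', password='root',
                       db='wikiurl', charset='utf8mb4',
                       cursorclass=pymysql.cursors.DictCursor)
try:
    with conn.cursor() as cur:
        cur.execute("select `urlname`,`urlhref` from urls")
        for row in cur.fetchmany(size=5):
            # Each row is a dict, e.g. {'urlname': ..., 'urlhref': ...}
            print(row["urlname"], '<--->', row["urlhref"])
finally:
    conn.close()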