Page overview:
Code:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup as bs
import re
import time
import pymysql


def get_one_page(url):
    """Fetch the HTML of one listing page."""
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    """Parse one page with BeautifulSoup."""
    soup = bs(html, 'lxml')
    for data in soup.find_all('div', class_="photo"):
        name = data.a['title']
        href = "https://baike.baidu.com" + data.a['href']
        img = data.img['src']
        # Some entries have no photo; their placeholder image links are
        # relative paths and must be joined with the site root
        if re.search("^/static", img):
            img = "https://baike.baidu.com" + img
        yield {
            'name': name,
            'href': href,
            'img': img
        }


def write_mysql(item):
    """Write one record into the MySQL database."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='',
        database='baidu',
        charset='utf8'  # 'utf8', not 'utf-8', for PyMySQL
    )
    cursor = conn.cursor()
    sql = "insert into baidu_baike(name,href,img) values(%s,%s,%s)"
    # Pass the values as a tuple so execute() does the parameter binding
    cursor.execute(sql, (item['name'], item['href'], item['img']))
    conn.commit()  # writes are not persisted without commit
    cursor.close()
    conn.close()


def main(url):
    """Fetch and parse one page, then store every item."""
    html = get_one_page(url)
    if html is None:  # skip the page if the request failed
        return
    for item in parse_one_page(html):
        write_mysql(item)


if __name__ == '__main__':
    # The listing URLs follow a fixed pattern: page i shows 30 entries
    # starting at offset 30 * (i - 1)
    for i in range(1, 7):
        url = ("http://baike.baidu.com/fenlei/虚拟人物?limit=30&index=" + str(i)
               + "&offset=" + str(30 * (i - 1)) + "#gotoList")
        print('Crawling page %s' % i)
        main(url)
        time.sleep(1)  # pause between pages to be polite to the server
    print("All pages written successfully!")
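The script assumes that a database named baidu and a table named baidu_baike already exist. A minimal setup sketch using pymysql (the auto-increment id column and the column sizes are assumptions, not taken from the original post):

import pymysql

# Sketch: create the database and table the scraper writes into.
# Column sizes and the id primary key are assumed, adjust as needed.
conn = pymysql.connect(host='localhost', user='root', password='', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS baidu DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS baidu.baidu_baike (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(100),
        href VARCHAR(255),
        img VARCHAR(255)
    ) DEFAULT CHARACTER SET utf8
""")
conn.commit()
cursor.close()
conn.close()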
Run output:
MySQL database result: