1. Database connection pool
#######db.py##########
import pymysql
from DBUtils.PooledDB import PooledDB  # note: in DBUtils 2.x the import path is dbutils.pooled_db

POOL = PooledDB(
    creator=pymysql,      # module used to create the connections
    maxconnections=6,     # maximum connections allowed in the pool; 0 or None means unlimited
    mincached=2,          # idle connections created at startup; 0 means none are pre-created
    maxcached=5,          # maximum idle connections kept in the pool; 0 or None means unlimited
    maxshared=3,          # maximum shared connections; 0 or None means all are shared.
                          # Note: has no effect here, because pymysql/MySQLdb report threadsafety=1,
                          # so _maxshared is forced to 0 and connections are never actually shared.
    blocking=True,        # when no connection is available: True = block and wait, False = raise an error
    maxusage=None,        # how many times a single connection may be reused; None means unlimited
    setsession=[],        # SQL commands run at the start of each session,
                          # e.g. ["set datestyle to ...", "set time zone ..."]
    ping=0,               # when to ping the MySQL server to check it is alive:
                          # 0 = never, 1 = whenever it is requested, 2 = when a cursor is created,
                          # 4 = when a query is executed, 7 = always
    host='127.0.0.1',
    port=3306,
    user='root',
    password='1234',
    database='bgm',
    charset='utf8'
)
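For reference, a minimal usage sketch (not part of the original script): POOL.connection() hands out a pooled connection, and calling close() on it returns it to the pool instead of closing the underlying MySQL connection. The bgm_info table and the COUNT query are only illustrative.

from db import POOL

def count_rows():
    conn = POOL.connection()   # borrow a connection from the pool
    cursor = conn.cursor()
    try:
        cursor.execute('SELECT COUNT(*) FROM bgm_info')  # bgm_info is the table used in section 2
        return cursor.fetchone()[0]
    finally:
        cursor.close()
        conn.close()           # hands the connection back to the pool, does not really close it

if __name__ == '__main__':
    print(count_rows())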
2. Core code
import re
import requests
from bs4 import BeautifulSoup
from threading import Thread
from db import POOL

header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}

sql = '''insert into bgm_info(CNAME,JNAME,CON,FEN,NUM,IMG_URL,TEXT_URL)
         VALUES (%s,%s,%s,%s,%s,%s,%s)'''

def sql_data(*args):
    # Borrow a connection from the pool for each insert; close() returns it
    # to the pool. (Sharing one module-level connection across threads and
    # closing it after the first insert, as in the first draft, breaks later inserts.)
    conn = POOL.connection()
    cursor = conn.cursor()
    cursor.execute(sql, args)
    conn.commit()
    cursor.close()
    conn.close()

def get_text(url):
    res = requests.get(url, headers=header)
    res.encoding = 'utf-8'
    soup_html = BeautifulSoup(res.text, 'html.parser')
    try:
        all_li = soup_html.find('ul', class_='browserFull').find_all('li', class_='item odd clearit')
        for div in all_li:
            c_name = div.find('div', class_='inner').find('a').get_text()   # Chinese title
            j_name = div.find('small').string                               # Japanese title
            rank = div.find('span', class_='rank').get_text()               # rank (fetched but not stored)
            con = div.find('p', class_='info tip').string                   # episode / air-date info
            fade = div.find('small', class_='fade').string                  # rating score
            num = div.find('span', class_='tip_j').get_text()               # rater-count text
            num = re.search(r'\d+', num).group()                            # keep only the digits
            img_url = 'http:' + div.find('img')['src']
            url2 = 'http://bangumi.tv/' + div.find('div', class_='inner').find('a')['href']
            sql_data(c_name, j_name, con, fade, num, img_url, url2)
    except Exception:
        # A page that fails to download or parse is simply reported and skipped.
        print(url)

if __name__ == '__main__':
    for i in range(500):
        url = 'http://bangumi.tv/anime/browser?sort=rank&page=%s' % i
        t = Thread(target=get_text, args=(url,))
        t.start()
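The script assumes a bgm_info table already exists in the bgm database; the post does not show its definition. The sketch below is one possible schema, inferred only from the column names in the INSERT statement. The types, lengths, and the extra ID column are guesses, not from the original.

from db import POOL

# Hypothetical schema for bgm_info; column names come from the INSERT statement above,
# types and lengths are assumptions.
DDL = '''
CREATE TABLE IF NOT EXISTS bgm_info (
    ID       INT AUTO_INCREMENT PRIMARY KEY,   -- surrogate key, not referenced by the INSERT
    CNAME    VARCHAR(255),   -- Chinese title
    JNAME    VARCHAR(255),   -- Japanese title
    CON      VARCHAR(512),   -- episode / air-date info line
    FEN      VARCHAR(32),    -- rating score
    NUM      INT,            -- number of raters
    IMG_URL  VARCHAR(512),   -- cover image URL
    TEXT_URL VARCHAR(512)    -- detail page URL
) DEFAULT CHARSET=utf8
'''

conn = POOL.connection()
cursor = conn.cursor()
cursor.execute(DDL)
conn.commit()
cursor.close()
conn.close()   # return the connection to the pool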