总管写的书一直都很喜欢,从《雪中悍刀行》到《剑来》。
其实我还是最喜欢那个雪中的鼠标垫,哈哈哈
针对笔趣阁小说进行数据爬取
上源码
#filename=get_data.py
# -*-coding:utf-8 -*-
# BY WANGCC
from bs4 import BeautifulSoup
import urllib.request
import os
from send_mail import sms
from ip_to_mysql import mysql_proxies
import logger
# Module-wide logger built from the project-local ``logger`` module.
log = logger.Logger("debug")
# Local text file that readfile()/write2file() use to keep the last seen
# chapter title.
test_file="剑来" + ".txt"
def gain_html_content(url):
    """Download a page through a proxy taken from the database.

    The proxy string comes from ``mysql_proxies()`` and is expected to look
    like ``scheme://host:port`` (for example ``http://111.26.9.26:80``).

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        ValueError: If the proxy string has no scheme or no host part.
        urllib.error.URLError: If the request cannot be completed.
    """
    # Function-scope import: the file itself only imports urllib.request.
    from urllib.parse import urlsplit

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    }

    # Parse the proxy as a URL instead of splitting on ":" so that entries
    # without an explicit port, or with a trailing "/", are handled too.
    proxy_url = mysql_proxies()
    parts = urlsplit(proxy_url)
    if not parts.scheme or not parts.netloc:
        raise ValueError("malformed proxy address: %r" % (proxy_url,))

    # ProxyHandler maps a scheme to "host:port", e.g. {"http": "111.26.9.26:80"}.
    proxy_handler = urllib.request.ProxyHandler({parts.scheme: parts.netloc})
    opener = urllib.request.build_opener(proxy_handler)
    request = urllib.request.Request(url=url, headers=headers)

    # The timeout keeps a dead proxy from blocking the script forever, and
    # the context manager closes the connection even if decoding fails.
    with opener.open(request, timeout=30) as response:
        log.info('获取代理成功,请求页面成功!')
        content = response.read().decode('utf-8')
    return content
def get_chapter(content):
    """Extract the latest chapter title from the book's index page.

    Args:
        content: HTML text of the page.

    Returns:
        The ``content`` attribute of the
        ``<meta property="og:novel:latest_chapter_name">`` tag.

    Raises:
        ValueError: If the page does not carry that meta tag, which is what
            happens when the proxy returns an error or placeholder page.
    """
    soup = BeautifulSoup(content, "lxml")
    meta = soup.find("meta", property="og:novel:latest_chapter_name")
    # find() returns None when nothing matches; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    latest = meta.get("content") if meta is not None else None
    if latest is None:
        raise ValueError(
            "page has no og:novel:latest_chapter_name meta tag")
    return latest
def readfile(content):
    """Return the text stored in the local file.

    When the file does not exist yet it is first created from *content*,
    so the very first call returns *content* itself.
    """
    file_missing = not os.path.exists(test_file)
    if file_missing:
        write2file(content)
        log.info('将当前内容写入文档,生成剑来.txt文档')

    with open(test_file, mode='r', encoding='utf-8') as stored:
        previous = stored.read()
    log.info('读取剑来.txt文档')
    return previous
def write2file(content):
    """Overwrite the local file with *content* (UTF-8)."""
    with open(test_file, mode='w', encoding='utf-8') as target:
        target.write(content)
    log.info('将小说写入本地文件,生成剑来.txt文档')
def main():
    """Fetch the book page once and notify by mail if the chapter changed."""
    tar_url = 'https://www.qu.la/book/31177/'
    page_html = gain_html_content(tar_url)
    log.info('页面下载完成')

    latest_chapter = get_chapter(page_html)
    stored_chapter = readfile(latest_chapter)

    # Guard clause: nothing to do when the stored title is still current.
    if latest_chapter == stored_chapter:
        log.info("没更新呢!")
        return

    write2file(latest_chapter)
    sms(latest_chapter)
    log.info('发送邮件提醒')


if __name__ == "__main__":
    main()
发送邮件部分
# -*-coding:utf-8 -*-
# BY WANGCC
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import logger
# Module-wide logger built from the project-local ``logger`` module.
log = logger.Logger("debug")
# SMTP account settings. The values look like redacted placeholders -- fill
# in real ones before use.
smtpserver = 'smtp.163.com'
username = 'xxxxx@163.com'
password = 'xxxxxx'
sender = 'xxxx@163.com'
# Single-recipient alternative:
# receiver='XXX@126.com'
# Several recipients are given as a list.
receiver = ['xxxxxxx@139.com','xxxxx@wo.cn']
# Mailboxes hosted by a mobile carrier can be set up to send an SMS alert for
# new mail, which makes the notification behave like a text message.
def sms(contect):
    """Send the "novel updated" notification mail to every ``receiver``.

    Args:
        contect: Text used as the subject line of the mail.

    Raises:
        smtplib.SMTPException: If login or delivery is rejected.
        OSError: If the connection to the SMTP server cannot be opened.
    """
    print("input sms...")

    # Subject, From and To are what the mail client displays; the envelope
    # actually used for delivery is passed to sendmail() below.
    msg = MIMEMultipart('mixed')
    msg['Subject'] = contect
    # NOTE(review): this display address is hard-coded and differs from
    # ``sender``; confirm whether it should follow the configured account.
    msg['From'] = 'wangcc <wangcc7777@163.com>'
    # Address lists in mail headers are comma separated, not ";" separated.
    msg['To'] = ", ".join(receiver)

    # Plain-text body.
    msg.attach(MIMEText("小说更新了!", 'plain', 'utf-8'))

    # SMTP_SSL connects in its constructor when a host is given, so calling
    # connect() again would open a second socket. The context manager sends
    # QUIT and closes the connection even when login or sending fails.
    # For a full SMTP dialogue trace, call smtp.set_debuglevel(1) first.
    with smtplib.SMTP_SSL(host=smtpserver, port=465) as smtp:
        smtp.login(username, password)
        print("进入发送")
        smtp.sendmail(sender, receiver, msg.as_string())
        print('success....')
        log.info('发送提醒邮件给:' + str(receiver))


if __name__ == "__main__":
    sms('c测试~~')
数据库连接
# -*-coding:utf-8 -*-
# BY WANGCC
import pymysql,datetime
import logger,random
# Module-wide logger built from the project-local ``logger`` module.
log = logger.Logger("debug")
# Connection settings passed to pymysql.connect() by the functions in this
# module.
# NOTE(review): the values look like redacted placeholders; ``xxxxx`` for
# "port" is a bare name, so it has to be replaced by an integer before the
# module can even be imported.
DB_CONFIG = {
"host": "xxxxxxxx",
"port": xxxxx,
"user": "xxxx",
"passwd": "111111111",
"db": "xxxxx",
"charset": "utf8"
}
def get_random():
    """Return a random integer between 1 and 9, both ends included."""
    return random.randrange(1, 10)
def mysql(ip_list):
    """Store every proxy address from *ip_list* that is not in the table yet.

    Each address is looked up in ``ip_original`` first; only unknown ones
    are inserted, stamped with the current local time. A failed insert is
    logged and rolled back, and the remaining addresses are still processed.

    Args:
        ip_list: Iterable of proxy address strings.
    """
    db = pymysql.connect(**DB_CONFIG)
    try:
        cursor = db.cursor()
        date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Placeholders let the driver do the quoting, so an address that
        # contains a quote can neither break nor alter the statement.
        check_sql = "select count(*) from ip_original where ip=%s"
        insert_sql = "insert into ip_original(ip,date) values (%s,%s)"

        for ip in ip_list:
            cursor.execute(check_sql, (ip,))
            if cursor.fetchone()[0] != 0:
                log.info(ip+': is existence !!')
                continue
            try:
                cursor.execute(insert_sql, (ip, date))
                db.commit()
                # Report success only after the commit went through.
                log.info(ip+'insert to ip_original success!')
            except Exception as e:
                log.info('执行sql-->'+cursor.mogrify(insert_sql, (ip, date))+'fail')
                # Keep the reason; the bare message above does not say why.
                log.info(repr(e))
                db.rollback()
    finally:
        # Close the connection even when a query raises.
        db.close()
def mysql_proxies():
    """Pick one proxy for scraping from the rows that have a ``check_date``.

    Returns:
        The second column of a randomly chosen ``ip_original`` row
        (presumably the proxy address -- the table schema is not visible
        here).

    Raises:
        IndexError: If no row with a non-NULL ``check_date`` exists.
    """
    db = pymysql.connect(**DB_CONFIG)
    try:
        cursor = db.cursor()
        check_sql="SELECT * FROM ip_original where check_date is not NULL ORDER BY RAND() LIMIT 10 "
        cursor.execute(check_sql)
        rows = cursor.fetchmany(10)
    finally:
        # Close the connection even when the query raises.
        db.close()

    if not rows:
        raise IndexError("no checked proxy available in ip_original")
    # Choose among the rows that actually came back. Indexing with a fixed
    # 1..9 value fails when fewer than ten rows exist and never uses row 0.
    proxies = random.choice(rows)[1]
    print(proxies)
    return proxies
def mysql_old():
    """Pick one stored proxy for validation, whether checked before or not.

    Returns:
        The second column of a randomly chosen ``ip_original`` row
        (presumably the proxy address -- the table schema is not visible
        here).

    Raises:
        IndexError: If the table is empty.
    """
    db = pymysql.connect(**DB_CONFIG)
    try:
        cursor = db.cursor()
        check_sql="SELECT * FROM ip_original ORDER BY RAND() LIMIT 10 "
        cursor.execute(check_sql)
        rows = cursor.fetchmany(10)
    finally:
        # Close the connection even when the query raises.
        db.close()

    if not rows:
        raise IndexError("no proxy available in ip_original")
    # Choose among the rows that actually came back. Indexing with a fixed
    # 1..9 value fails when fewer than ten rows exist and never uses row 0.
    proxies = random.choice(rows)[1]
    print(proxies)
    return proxies
def mysql_delete(proxies):
    """Delete the row(s) of ``ip_original`` whose ``ip`` equals *proxies*.

    Args:
        proxies: Proxy address to remove.

    Returns:
        *proxies*, unchanged.
    """
    db = pymysql.connect(**DB_CONFIG)
    try:
        cursor = db.cursor()
        # Parameterized so that the driver quotes the address.
        delete_sql = "delete from ip_original where ip = %s"
        params = (proxies,)
        # mogrify() renders the statement exactly as it will be sent.
        log.info('delete ip-->'+cursor.mogrify(delete_sql, params))
        cursor.execute(delete_sql, params)
        db.commit()
    finally:
        # Close the connection even when the statement raises.
        db.close()
    return proxies
def mysql_update(str_from,proxies_yuan):
    """Record the origin and the check time of one proxy.

    Sets ``from_area`` to *str_from* and ``check_date`` to the current local
    time on the row whose ``ip`` equals *proxies_yuan*. A failing statement
    is logged, printed and rolled back instead of being raised.

    Args:
        str_from: Value stored in ``from_area``.
        proxies_yuan: Proxy address identifying the row.
    """
    db = pymysql.connect(**DB_CONFIG)
    try:
        cursor = db.cursor()
        date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Parameterized so that the driver quotes all three values.
        update_sql = "update ip_original set from_area=%s,check_date=%s where ip=%s"
        params = (str_from, date, proxies_yuan)
        try:
            # mogrify() renders the statement exactly as it will be sent.
            print(cursor.mogrify(update_sql, params))
            cursor.execute(update_sql, params)
            db.commit()
            # Report success only after the commit went through.
            log.info(proxies_yuan+'---->'+str_from+'--> updata success!')
        except Exception as e:
            log.info(str_from+'failed')
            print(e)
            db.rollback()
    finally:
        # Close the connection even when something above raises.
        db.close()
if __name__ == "__main__":
    # Sample proxy addresses; pass them to mysql() to seed the table.
    ip_list = [
        'http://117.191.11.108:80',
        'http://134.209.15.143:8080',
        'http://157.230.232.130:80',
        'http://111.206.6.100:80',
        'http://159.138.5.222:80',
        'http://178.128.12.118:8080',
        'http://83.142.126.147:80',
        'http://150.109.55.190:83',
        'http://165.227.62.167:8080',
        'http://167.114.153.18:80',
        'http://39.137.69.10:8080',
        'http://111.206.6.101:80',
        'http://165.227.29.189:8080',
        'http://175.139.252.192:80',
        'http://103.42.213.176:8080',
        'http://211.23.149.29:80',
        'http://211.23.149.28:80',
        'http://47.94.57.119:80',
        'http://175.139.252.194:80',
        'http://47.94.217.37:80',
    ]
    # mysql(ip_list)
    number = mysql_proxies()
思路
每次爬取,从数据库随机抽一个代理ip来用,如果没用就销毁。
数据爬取后,存在本地txt。留着和下次作比对,如果不一致则更新本地文件,并发送邮件。