I. Obtain the cnblogs URL list
1. SQL table
Create a database named cbs.
Create the table with article_list.sql:
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for article_list
-- ----------------------------
DROP TABLE IF EXISTS `article_list`;
CREATE TABLE `article_list` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(255) DEFAULT NULL,
  `countView` varchar(10) DEFAULT NULL,
  `countComment` varchar(10) DEFAULT NULL,
  `url` varchar(100) DEFAULT NULL,
  `datePublished` datetime DEFAULT NULL,
  `dateUpdated` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
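The script can be loaded with any MySQL client. As a minimal sketch (assuming MySQL on localhost with the root/root credentials used by the crawler below), the same statements can also be run from Python with pymysql:

import pymysql

# Create the cbs database, then execute the statements in article_list.sql.
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='root', charset='utf8')
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS cbs DEFAULT CHARACTER SET utf8")
    cur.execute("USE cbs")
    with open("article_list.sql", encoding="utf-8") as f:
        # A naive split on ';' is enough for this simple DDL script
        for stmt in f.read().split(';'):
            if stmt.strip():
                cur.execute(stmt)
conn.commit()
conn.close()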
2. Cookie
Log in to cnblogs, then save the cookie value of the logged-in session into mycookie.txt.
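mycookie.txt should hold the raw Cookie request-header value on a single line; it can be copied from the request headers in the browser's developer tools (Network panel) while logged in. A hypothetical example of its contents (names and values are illustrative only):

.CNBlogsCookie=AAAA; .Cnblogs.AspNetCore.Cookies=BBBB; _ga=GA1.2.1234567890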
3. Fetch the URL list
Python dependencies:
requirements.txt
certifi==2020.4.5.1
chardet==3.0.4
et-xmlfile==1.0.1
idna==2.9
jdcal==1.4.1
openpyxl==3.0.3
PyMySQL==0.9.3
requests==2.23.0
urllib3==1.25.9
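Install the dependencies with pip install -r requirements.txt (ideally inside a virtual environment).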
Python program:
import time
import json
import math
import requests
import sys
import os
from openpyxl import Workbook
import pymysql
# Banner printed before start-up
version_str = "V1.2"
logo_str = "cnBlogs_List"
logo_pic_str = """
              ____   _                              _      _       _
  ___  _ __  | __ ) | |  ___    __ _   ___         | |    (_) ___ | |_
 / __|| '_ \ |  _ \ | | / _ \  / _` | / __|        | |    | |/ __|| __|
| (__ | | | || |_) || || (_) || (_| | \__ \        | |___ | |\__ \| |_
 \___||_| |_||____/ |_| \___/  \__, | |___/ _____  |_____||_||___/ \__|
                               |___/        |_____|
"""
print("%s %s" % (logo_str, version_str), end='')
print(logo_pic_str)
print("%s %s 启动中..." % (logo_str, version_str))
time.sleep(2.5)
# Start-up
# Configuration
# Option 1: assign the cookie value directly to the constant COOKIE_STR
# cookie value
COOKIE_STR = ""
# Option 2: store the cookie value in a dedicated file
COOKIE_PATH = "./mycookie.txt"
if os.path.exists(COOKIE_PATH) and len(COOKIE_STR) == 0:
    # If the cookie file exists and COOKIE_STR is empty, read the cookie from the file
with open(COOKIE_PATH, 'r', encoding="utf-8") as f:
data_str = f.read()
if len(data_str) > 0:
            # If the file has content, use it as the cookie value
COOKIE_STR = data_str
else:
print("There is no cookie value in the file %s" % COOKIE_PATH)
sys.exit()
# sys.exit()
CACHE_FILE_NAME = "cache.txt"  # cache file name
EXCEL_FILE_NAME = "result.xlsx"  # spreadsheet file name (openpyxl writes the xlsx format)
WORKSHEET_NAME = "cnblogs_admin"  # worksheet name
# TABLE_HEAD_TITLE_LIST = ["title", "countView", "countComment", "url", "datePublished", "dateUpdated"]  # header titles
TABLE_HEAD_TITLE_LIST = ["Title", "Views", "Comments", "URL", "First published", "Last updated"]  # header row for the spreadsheet and cache.txt
SINGLE_PAGE_COUNT = 10  # number of posts per page
# Request parameters
# REQUEST_URL_PART = r'https://i-beta.cnblogs.com/api/posts/list?p=%s&cid=&t=1&cfg=0'
REQUEST_URL_PART = r'https://i.cnblogs.com/api/posts/list?p=%s&cid=&t=1&cfg=0'
REQUEST_HEADERS = {
'authority': 'i-beta.cnblogs.com',
'method': 'GET',
'path': '/api/posts/list?p=1&cid=&t=1&cfg=0',
'scheme': 'https',
'accept': 'application/json, text/plain, */*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'cookie': COOKIE_STR,
'referer': 'https://i-beta.cnblogs.com/posts',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}
# Global variables
info_result_all = []  # accumulated rows from every page
# MySQL connection parameters
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWD = 'root'
MYSQL_DB = 'cbs'
MYSQL_CHARSET = 'utf8'
# Connect to the database
connect = pymysql.Connect(
host=MYSQL_HOST,
port=MYSQL_PORT,
user=MYSQL_USER,
passwd=MYSQL_PASSWD,
db=MYSQL_DB,
charset=MYSQL_CHARSET
)
# Get a cursor
cursor = connect.cursor()
# Function definitions
# Get the total number of pages
def get_page_count(page_n=1):
    page_num = page_n
    # Build the request
    url = REQUEST_URL_PART % page_num
    headers = REQUEST_HEADERS
    # Send the GET request
    response = requests.get(url, headers=headers, verify=True)
    html_data = response.content.decode()
    data_dict = json.loads(html_data)
    all_post_count = data_dict["postsCount"]  # total number of posts
    page_count = math.ceil(all_post_count / SINGLE_PAGE_COUNT)  # total number of pages
    # Return the total page count
    return page_count
# Fetch the data of one list page
# 1. Print it to the console
# 2. Save it into the article_list table of the cbs MySQL database
def get_per_page_data(page_n=1, save_mysql=True):
    page_num = page_n
    url = REQUEST_URL_PART % page_num
    headers = REQUEST_HEADERS
    # Send the GET request
    response = requests.get(url, headers=headers, verify=True)
    html_data = response.content.decode()
    data_dict = json.loads(html_data)
    post_list = data_dict["postList"]  # posts on this page
    all_post_count = data_dict["postsCount"]  # total number of posts
    page_count = math.ceil(all_post_count / SINGLE_PAGE_COUNT)  # total number of pages
    print("[status %s] page %s" % (response.status_code, page_n))
    info_result = []  # rows to write out
    # Write the header row
    if page_num == 1:
        info = TABLE_HEAD_TITLE_LIST  # header row content
        info_result.append(info)
    # Collect this page's posts
    for index, item in enumerate(post_list):
        title = item['title']  # title
        viewCount = item['viewCount']  # view count
        comment_count = item['feedBackCount']  # comment count
        url = "https:%s" % item['url']  # post URL
        datePublished = item['datePublished']  # first published date
        dateUpdated = item['dateUpdated']  # last updated date
        # Print to the console
        print((index + ((page_n - 1) * SINGLE_PAGE_COUNT) + 1), end=' ')
        print(title)
        info = [title, viewCount, comment_count, url, datePublished, dateUpdated]  # one row
        # Keep the row in memory
        info_result.append(info)
        if save_mysql is True:
            # Save the row to MySQL
            # Parameterized INSERT: pymysql escapes the values, so titles
            # containing quotes cannot break the statement
            sql_1 = "INSERT INTO article_list(title,countView,countComment,url,datePublished,dateUpdated) VALUES (%s,%s,%s,%s,%s,%s);"
            data = tuple(info)
            cursor.execute(sql_1, data)  # run the INSERT
            connect.commit()  # commit the insert
    return page_count, page_num, info_result
# Walk the list pages; save the data to MySQL and a text cache
def get_cbs_list_data(page_count, save_cache=True):
    for n in range(1, page_count + 1):
        time.sleep(1)  # pause one second to throttle the request rate
        one_page_data_list = get_per_page_data(n, save_mysql=True)  # fetch page n and save it to MySQL
        page_num = one_page_data_list[1]
        info_result = one_page_data_list[2]
        info_result_all.extend(info_result)
        if save_cache is True:
            # Append this page's data to the cache file cache.txt
            save_file_cache = CACHE_FILE_NAME
            with open(save_file_cache, 'a', encoding="utf-8") as f:
                f.write("## Page %s/%s\n" % (page_num, page_count))  # record the page number
                for line in info_result:
                    f.write("%s %s %s %s %s %s\n" % tuple(line))  # write this page's rows
# Write the collected data to an Excel file
def save_excel():
    save_file = EXCEL_FILE_NAME  # output file name
    sheet_name = WORKSHEET_NAME  # worksheet name
    wb = Workbook()  # create a new workbook
    ws1 = wb.active  # the active worksheet, the first one by default
    ws1.title = sheet_name  # rename the worksheet
    for row in info_result_all:
        ws1.append(row)
    wb.save(save_file)  # save the Excel file
# Main routine
# 1. Walk every list page
# 2. Save to the cache file cache.txt and to the MySQL database
# 3. Keep the data in memory
# 4. Write the data to a spreadsheet
def main():
    # Request the first list page to learn the total page count
    try:
        page_count = int(get_page_count())  # total number of pages
    except Exception:
        print("Request failed; replace the cookie value (COOKIE_STR) with a fresh one and retry")
        sys.exit()
    # Walk every page, saving to MySQL (and optionally cache.txt)
    get_cbs_list_data(page_count, save_cache=False)
    # get_cbs_list_data(2, save_cache=True)  # only the first 2 pages
    # Finally, write the data to Excel
    # save_excel()
# Entry point
if __name__ == '__main__':
    # Run the main program
    main()
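Save the script and run it with the Python 3 interpreter (for example python cnblogs_list.py, the file name being arbitrary). It prints each post title while walking the list pages and fills the article_list table as it goes.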
II. Download the HTML pages from the URL list
1. Create the following directory structure
└─html
├─2
├─3
├─4
└─others
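The tree can be created by hand, or with a few lines of Python such as this sketch:

import os

# Create the html/ tree with one subdirectory per HTTP status-code class.
for sub in ("2", "3", "4", "others"):
    os.makedirs(os.path.join("html", sub), exist_ok=True)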
2. Create the URL list file and fill it with the URLs exported from the SQL database
1.txt
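The export step is not spelled out above; as a minimal sketch (connection parameters as in part I), the url column can be dumped into 1.txt with pymysql:

import pymysql

# Dump the url column of article_list into 1.txt, one URL per line.
connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='root', db='cbs', charset='utf8')
cursor = connect.cursor()
cursor.execute("SELECT url FROM article_list")
with open("1.txt", 'w', encoding="utf-8") as f:
    for (url,) in cursor.fetchall():
        f.write(url + "\n")
cursor.close()
connect.close()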
3. Download the HTML files
import os
import time
import requests
import shutil
from contextlib import closing
from urllib.request import urlopen
# For Python 2 use instead:
# from urllib2 import urlopen
def myurlopen(url):
# url = 'https://www.baidu.com'
headers = {
'authority': 'i-beta.cnblogs.com',
'method': 'GET',
# 'path': '/api/posts/list?p=1&cid=&t=1&cfg=0',
'scheme': 'https',
'accept': 'application/json, text/plain, */*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
# 'cookie': COOKIE_STR,
'referer': 'https://i-beta.cnblogs.com/posts',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}
    # Send the GET request
    response = requests.get(url, headers=headers, verify=True)
    # To get the decoded html text:
    # html_data = response.content.decode()
    return response
def save_html(url):
# url = "https://www.cnblogs.com/andy9468/p/10005406.html"
# url = "https://www.cnblogs.com/andy9468/p/8025420.html"
server_path, html_name = os.path.split(url)
print(server_path)
print(html_name)
# with closing(urlopen('ftp://www.xxxx.com/haha.txt')) as page:
resp = myurlopen(url)
html_code = resp.status_code
# print(html_code)
html_data = resp.content
if str(html_code).startswith('2'):
with open("html/2/%s" % html_name, 'wb') as f:
f.write(html_data)
elif str(html_code).startswith('3'):
with open("html/3/%s" % html_name, 'wb') as f:
f.write(html_data)
elif str(html_code).startswith('4'):
with open("html/4/%s" % html_name, 'wb') as f:
f.write(html_data)
else:
with open("html/others/%s" % html_name, 'wb') as f:
f.write(html_data)
def main():
# url = "https://www.cnblogs.com/andy9468/p/10005406.html"
# save_html(url)
urls_file = "1.txt"
    with open(urls_file, 'r') as f:
        urls = f.readlines()
    for url in urls:
        url = url.strip("\n")  # drop the trailing newline
time.sleep(1)
save_html(url)
if __name__ == '__main__':
main()
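When run, the script reads 1.txt line by line, pauses one second between requests, and stores each downloaded page under html/2, html/3, html/4 or html/others according to the first digit of the HTTP status code.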