用大才哥的代理池 地址:https://github.com/Germey/ProxyPool
1.get_proxy() :从redis的端口随机拿一个代理
2.使用selenium模块,配置上代理信息,用webdriver刷浏览器地址
在此程序中,缺少了一个检验代理是否能有效快速连接博客的步骤,有的代理很慢,却还是用这些很慢的代理刷了一圈,浪费了很多时间。其实可以添加一段代码:如果连接第一篇博客很慢,就跳过这个代理地址。然而我很懒 嘿嘿
"""Count cnblogs post views, then repeatedly reload every post through
random proxies from a local ProxyPool (https://github.com/Germey/ProxyPool)
to bump the view counters.

Requires a logged-in cnblogs cookie in ``headers``.
"""
import json
import re
import time

import redis  # kept from the original; the pool itself is reached over HTTP
import requests
from selenium import webdriver

# ProxyPool HTTP API: GET returns one random proxy as "host:port" text.
proxy_url = 'http://localhost:5000/get'
url = 'https://i.cnblogs.com/categories'
base_url = 'https://i.cnblogs.com/posts?'

headers = {
    # Put your own cnblogs login cookie here.
    'cookie': '你的cookie',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'upgrade-insecure-requests': '1',
}

# NOTE(review): the blog renderer stripped the backslashes from "\d" in the
# original source ("(d{1,})"); restored here.  View counts may be multi-digit,
# hence \d+ rather than a single \d.
pattern1 = re.compile(r'<td>发布</td>.*?\d.*?(\d+)', re.S)                 # view count
pattern2 = re.compile(r'<td class="post-title"><a href="(.*?)"', re.S)    # post link

views = 0          # running total of all post views
url_list = []      # absolute URLs of every post

# The category endpoint answers with a JSON list of category records.
response = requests.get(url=url, headers=headers)
data = json.loads(response.text)
categories = (i['CategoryId'] for i in data)

for category in categories:
    cate_url = base_url + 'categoryid=' + str(category)  # per-category post list
    headers['referer'] = cate_url
    html = requests.get(cate_url, headers=headers).text
    results1 = re.findall(pattern1, html)  # view counts on this page
    results2 = re.findall(pattern2, html)  # post hrefs on this page
    if results1:
        for result1 in results1:
            views += int(result1)
        for result2 in results2:
            # Hrefs come back scheme-less; prefix matches the original code.
            url_list.append('https://' + result2)

print('总浏览量为:', views)
print('一共{}篇文章'.format(len(url_list)))
if url_list:  # guard: original divided by zero when no posts were found
    print('文章平均浏览量', views / len(url_list))
print(url_list)


def get_proxy():
    """Fetch one random proxy ("host:port") from the local proxy pool.

    Returns:
        The proxy string, or None when the pool is unreachable or replies
        with a non-200 status.
    """
    try:
        response = requests.get(proxy_url)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


while True:
    proxy = get_proxy()
    if not proxy:
        # Pool empty or down: the original would have built
        # "--proxy-server=None"; wait and retry instead.
        time.sleep(5)
        continue
    print('the proxy is:', proxy)

    # Drive a real Chrome through selenium so the visit counts as a view.
    options = webdriver.ChromeOptions()
    options.add_argument('lang=zh_CN.UTF-8')  # force a Chinese locale
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"')
    options.add_argument('--proxy-server={0}'.format(proxy))
    driver = webdriver.Chrome(options=options)  # chrome_options= is deprecated

    try:
        for url in url_list:
            driver.delete_all_cookies()  # each visit looks like a fresh client
            print('connecting to', url)
            past = time.time()
            driver.get(url)
            print('成功打开网页,使用了{}秒'.format(time.time() - past))
            # print('睡眠1秒钟.....')
            # time.sleep(1)
    except Exception:
        # Best-effort by design: a dead/slow proxy just moves on to the next.
        pass
    finally:
        # Original never reliably closed the browser (quit() sat inside the
        # try body), leaking a Chrome process per failed proxy.
        driver.quit()