说明:下面的脚本涵盖通过配置文件传递参数、使用 logging 日志模块记录日志,以及使用多进程(multiprocessing)并发请求。
"""Request a list of URLs through a list of proxies using a process pool.

Settings are read from ``multiSiteAccess.cnf`` in the working directory:

* ``[Auth]``      ``username`` / ``password`` - proxy credentials
* ``[FilePath]``  ``urlFile`` / ``proxyFile`` - input files, one entry per line
* ``[Proccess]``  ``ProccessNum``             - size of the worker pool
* ``[logRecord]`` ``logFile`` / ``logPrint`` / ``logRotateType``

One line per (URL, proxy) request is written to the log, containing the proxy,
the final URL and either the HTTP status code or the error text.
"""
import configparser
import datetime
import getpass
import logging
import os
import re
import sys
import time
from logging.handlers import RotatingFileHandler
from multiprocessing import Pool

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Requests are sent with verify=False, so silence the per-request TLS warning.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Upper bound, in seconds, on how long the main process waits for one result.
# Keeps the run bounded if a worker dies or a request stalls far beyond its
# socket timeout.
RESULT_WAIT_SECONDS = 600


def get_parameter():
    """Load the run settings from ``multiSiteAccess.cnf``.

    Returns:
        An 8-tuple ``(username, password, urlFile, proxyFile, ProccessNum,
        logFile, logPrint, logRotateType)``. ``ProccessNum`` is an ``int``;
        the other items are returned as read from the file.

    Exits:
        Status 1 when the configuration file does not exist; status 2 when it
        cannot be parsed, an option is missing, or ``ProccessNum`` is not an
        integer. In both cases a message is printed and the user is asked to
        press Enter first.
    """
    confFile = 'multiSiteAccess.cnf'
    if not os.path.exists(confFile):
        print('配置文件:{}不存在!'.format(confFile))
        getpass.getpass('输入回车键退出程序!')
        sys.exit(1)

    cf = configparser.ConfigParser(allow_no_value=True)
    try:
        # Reading is inside the try so that a malformed file is reported the
        # same way as a missing option instead of ending in a traceback.
        cf.read(confFile, encoding='UTF-8')
        username = cf.get('Auth', 'username')
        password = cf.get('Auth', 'password')
        urlFile = cf.get('FilePath', 'urlFile')
        proxyFile = cf.get('FilePath', 'proxyFile')
        ProccessNum = int(cf.get('Proccess', 'ProccessNum'))
        logFile = cf.get('logRecord', 'logFile')
        logPrint = cf.get('logRecord', 'logPrint')
        logRotateType = cf.get('logRecord', 'logRotateType')
    except (configparser.Error, OSError, ValueError, TypeError) as e:
        # TypeError covers int(None): allow_no_value=True lets an option
        # without a value come back as None.
        print(e)
        getpass.getpass('输入回车键退出程序!')
        sys.exit(2)
    return (username, password, urlFile, proxyFile, ProccessNum,
            logFile, logPrint, logRotateType)


def log_record_conf(logFile, logPrint, logRotateType):
    """Configure the root logger and publish it as the module-global ``logger``.

    Args:
        logFile: Path of the log file.
        logPrint: ``'on'`` (case-insensitive) to also log to the console.
        logRotateType: ``'size'`` rotates at 100 MiB keeping 50 backups,
            ``'time'`` rotates every 8 hours, anything else appends to a
            single file without rotation.

    Returns:
        The configured root logger, set to DEBUG level.
    """
    global logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    max_bytes = 100 * 1024 * 1024
    interval = 8
    # The option may be None when it is present in the file without a value.
    rotate_type = (logRotateType or '').lower()
    if rotate_type == 'size':
        file_handler = RotatingFileHandler(
            filename=logFile, encoding='UTF-8',
            maxBytes=max_bytes, backupCount=50, delay=False)
    elif rotate_type == 'time':
        # atTime is intentionally not passed: it only applies to midnight and
        # weekday rollovers and must be a datetime.time instance, whereas the
        # previous code handed over the datetime.time class itself.
        file_handler = logging.handlers.TimedRotatingFileHandler(
            filename=logFile, encoding='UTF-8', when='H',
            interval=interval, delay=False, utc=False)
    else:
        file_handler = logging.FileHandler(
            filename=logFile, encoding='UTF-8', mode='a', delay=False)

    formatter = logging.Formatter('%(asctime)s - %(levelname)-7s - %(message)s')
    file_handler.setFormatter(formatter)
    # Handlers are attached to the logger object; the logging module itself
    # has no addHandler(), so the previous call raised AttributeError.
    logger.addHandler(file_handler)

    if (logPrint or '').lower() == 'on':
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    return logger


def get_urls(urlFile):
    """Return every line of the UTF-8 text file *urlFile* as a list."""
    with open(urlFile, 'r', encoding='UTF-8') as rf:
        return rf.readlines()


def get_proxy_host(proxyFile):
    """Return every line of the UTF-8 text file *proxyFile* as a list."""
    with open(proxyFile, 'r', encoding='UTF-8') as rf:
        return rf.readlines()


def url_sample_disposal(urllink):
    """Turn one URL sample into the list of URLs to request.

    Surrounding double and single quotes are removed. A sample that already
    starts with ``http://`` or ``https://`` is returned as a one-element list;
    any other sample is returned twice, once with each scheme prepended.
    """
    urllink = urllink.strip('"').strip("'")
    match = re.findall(r'^https?://.*', urllink)
    if not match:
        # No scheme in the sample: try both plain and TLS variants.
        match = ['http://{}'.format(urllink), 'https://{}'.format(urllink)]
    return match


def set_proxy(urlProxy, username, password):
    """Build the ``proxies`` mapping passed to ``requests``.

    Args:
        urlProxy: Proxy address without scheme or credentials; an empty value
            means no proxy.
        username: Proxy user name, inserted into the URL verbatim.
        password: Proxy password, inserted into the URL verbatim.

    Returns:
        ``{}`` when *urlProxy* is empty, otherwise a dict that maps both
        ``'http'`` and ``'https'`` to ``http://username:password@urlProxy``.
    """
    if not urlProxy:
        return {}
    proxy_url = 'http://{}:{}@{}'.format(username, password, urlProxy)
    return {'http': proxy_url, 'https': proxy_url}


def http_request(urllink, proxy='', proxyHost=''):
    """GET *urllink* and describe the outcome in a single line of text.

    Args:
        urllink: URL to request.
        proxy: ``proxies`` mapping for ``requests.get``.
        proxyHost: Proxy label placed at the start of the result line.

    Returns:
        ``'<proxyHost> <final url> <status code>'`` on success, or
        ``'<proxyHost> <urllink> <error text>'`` when the request raised.
    """
    headers = {
        'User-Agent': 'curl/3.03',
        'Connection': 'close'  # keep-alive
    }
    # NOTE(review): presumably meant to give each request 5 retries - confirm
    # that it has this effect; mounting HTTPAdapter(max_retries=5) on a
    # Session is the documented way to configure retries.
    requests.adapters.DEFAULT_RETRIES = 5
    try:
        # verify=False disables TLS certificate verification.
        r = requests.get(urllink, headers=headers, proxies=proxy,
                         timeout=30, verify=False)
        r.close()
        urlresult = '{} {} {}'.format(proxyHost, r.url, str(r.status_code))
    except Exception as e:
        urlresult = '{} {} {}'.format(proxyHost, urllink, str(e))
    # Returned after the try statement rather than from a ``finally`` clause,
    # which would swallow interrupts and then fail on the unset variable.
    return urlresult


def writer_log(urlresult):
    """Pool callback: write one result line to the global logger at INFO."""
    logger.info(urlresult)


def main():
    """Read the settings, run every URL/proxy request in a pool and log it."""
    start_time = datetime.datetime.now()
    (username, password, urlFile, proxyFile, ProccessNum,
     logFile, logPrint, logRotateType) = get_parameter()
    log = log_record_conf(logFile, logPrint, logRotateType)
    log.info('{}URL测试开始{}'.format('=' * 15, '=' * 15))
    allUrl = get_urls(urlFile)
    allProxy = get_proxy_host(proxyFile)
    log.debug('测试总URL数:{}'.format(len(allUrl)))
    log.debug('测试进程数:{}'.format(ProccessNum))

    pool = Pool(ProccessNum)
    pending = []
    try:
        for line in allUrl:
            fields = line.split()
            if not fields:
                # Blank line in the URL file; there is no sample to test.
                continue
            # The URL is the last whitespace-separated field of the line.
            for urllink in url_sample_disposal(fields[-1]):
                urllink = urllink.strip()
                for proxyHost in allProxy:
                    proxyHost = proxyHost.strip()
                    proxies = set_proxy(proxyHost, username, password)
                    pending.append(pool.apply_async(
                        http_request,
                        args=(urllink, proxies, proxyHost),
                        callback=writer_log))
        pool.close()

        # Wait for every submitted task, not only the last one, so that no
        # request is cut off and an empty URL list is handled as well.
        for result in pending:
            try:
                result.get(timeout=RESULT_WAIT_SECONDS)
            except Exception as e:
                # A wait timeout has an empty message; fall back to its name.
                log.warning('进程异常:{}'.format(str(e) or type(e).__name__))
        log.debug('进程池调度结束')
    finally:
        # Stops any worker that is still stuck after its wait timed out.
        pool.terminate()
        pool.join()

    end_time = datetime.datetime.now()
    log.info('开始时间:{}'.format(start_time))
    log.info('结束时间:{}'.format(end_time))
    log.info('总耗时:{}'.format(end_time - start_time))
    log.debug('日志文件:{}'.format(logFile))
    log.info('{}URL测试结束{}'.format('=' * 15, '=' * 15))


if __name__ == '__main__':
    main()