zoukankan html css js c++ java

改进网站资源探测工具（添加代理）

对网站资源探测工具进行调整，并添加代理，防止因多次访问而导致 IP 被封。

# Fetch the proxy list and store it in the global agent_lists
def agent_list(url):
    """Scrape a proxy-listing page and fill the global ``agent_lists``.

    The page at *url* is fetched with a browser-like User-Agent, the element
    with id ``ip_list`` is located, and every ``<tr>`` after the first one is
    turned into a one-entry ``{scheme: proxy_url}`` dict that can be passed
    as the ``proxies=`` argument of ``requests.get``.

    Args:
        url: Address of the proxy-listing page.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
        AttributeError: If the page has no element with id ``ip_list``.
    """
    global agent_lists
    agent_lists = []
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    # A timeout keeps an unresponsive listing site from hanging start-up.
    r = requests.get(url, headers=header, timeout=10)
    rows = BeautifulSoup(r.content, 'html.parser').find(id="ip_list").find_all('tr')[1:]
    for row in rows:
        info = row.find_all('td')
        # Skip rows that lack the expected cells or whose cells hold no
        # plain text (``.string`` is None in that case).
        if len(info) < 6 or not info[1].string or not info[5].string:
            continue
        # requests selects a proxy by the lower-case URL scheme, so a key
        # such as "HTTP" would silently never be used.
        scheme = info[5].string.strip().lower()
        # NOTE(review): only cell 1 goes into the proxy URL; if the listing
        # keeps the port in a separate cell it is not included -- confirm.
        agent_lists.append({scheme: 'http://' + info[1].string.strip()})

修改后的网站资源扫描工具：

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# by 默不知然 2018-03-15

import threading
from threading import Thread
from bs4 import BeautifulSoup
import time
import sys
import requests
import getopt
import random

# Worker thread that requests one candidate URL through a random proxy
class scan_thread (threading.Thread):
    """Thread that probes a single URL and records it when it answers 200.

    Relies on module-level names set up elsewhere in the file:
    ``agent_lists`` (list of proxy dicts) and ``l`` (one-element list whose
    first item counts the scan slots currently in use).
    """

    # The ``global`` statement makes the assignment below create a
    # module-level list; main() reads it after the scan.
    global real_url_list
    real_url_list = []

    def __init__(self, url):
        """Remember the URL this thread will probe."""
        threading.Thread.__init__(self)
        self.url = url

    def run(self):
        """Request ``self.url`` via a random proxy and report the status.

        URLs answering with status 200 are appended to ``real_url_list``.
        Any error is printed rather than raised so one bad proxy or URL
        does not take the scan down.
        """
        try:
            proxy = random.choice(agent_lists)
            # Without a timeout a dead proxy would block this thread (and
            # its scan slot) indefinitely.
            r = requests.get(self.url, proxies=proxy, timeout=10)
            print(self.url, '------->>', str(r.status_code))
            if r.status_code == 200:
                real_url_list.append(self.url)
        except Exception as e:
            print(e)
        finally:
            # Release the slot on failure too; otherwise every failed
            # request permanently shrinks the pool and the dispatcher
            # eventually stalls.
            l[0] = l[0] - 1

# Load the wordlist, build the candidate URLs and dispatch the scan threads
def url_makeup(dicts, url, threshold):
    """Build one URL per wordlist line and scan them with bounded threads.

    Each line of the file *dicts* (line endings removed) is appended to
    *url*; the resulting URLs are handed to ``scan_thread`` workers, never
    running more than *threshold* of them at once. The function returns
    once every worker has finished.

    The globals ``url_list`` and ``l`` are (re)initialised here; ``l[0]`` is
    still incremented per started worker because ``scan_thread.run``
    decrements it.

    Args:
        dicts: Path of the wordlist file, one path fragment per line.
        url: Base URL that every fragment is appended to.
        threshold: Maximum number of concurrently running scan threads.

    Raises:
        ValueError: If *threshold* is smaller than 1 (no URL could ever be
            dispatched).
        SystemExit: When the user interrupts the scan with Ctrl-C.
    """
    global url_list
    global l
    if threshold < 1:
        raise ValueError('threshold must be at least 1')
    url_list = []
    l = [0]
    with open(str(dicts), 'r') as f:
        for line in f:
            url_list.append(url + line.replace('\n', '').replace('\r', ''))
    running = []
    try:
        while url_list:
            # Throttle on threads that are really alive, so a worker that
            # died without releasing its slot cannot stall the dispatcher.
            running = [worker for worker in running if worker.is_alive()]
            if len(running) < threshold:
                target = url_list.pop(0)
                l[0] = l[0] + 1
                worker = scan_thread(target)
                worker.start()
                running.append(worker)
            else:
                # Yield the CPU instead of spinning while all slots are busy.
                time.sleep(0.01)
        # Let the remaining workers finish so callers see complete results.
        for worker in running:
            worker.join()
    except KeyboardInterrupt:
        print('用户停止了程序，完成目录扫描。')
        sys.exit()

# Parse the command-line arguments
def get_args():
    """Read ``-u``, ``-f`` and ``-n`` from ``sys.argv`` into globals.

    On success sets ``get_url`` (value of ``-u``), ``get_dicts`` (value of
    ``-f``) and ``get_threshold`` (value of ``-n`` converted to int). When
    an option is given more than once the last occurrence wins.

    Raises:
        SystemExit: With status 1, after printing an error, when an unknown
            option is given, one of the three options is missing, or the
            ``-n`` value is not an integer.
    """
    global get_url
    global get_dicts
    global get_threshold
    try:
        options, _ = getopt.getopt(sys.argv[1:], "u:f:n:")
        parsed = dict(options)
        url = parsed['-u']
        dicts = parsed['-f']
        threshold = int(parsed['-n'])
    except (getopt.GetoptError, KeyError, ValueError):
        # All three options are required; without this check a missing one
        # would only surface later as a NameError on the unset global.
        print("错误参数")
        sys.exit(1)
    # Publish the values only after every one of them validated.
    get_url, get_dicts, get_threshold = url, dicts, threshold
# Fetch the proxy list and store it in the global agent_lists
def agent_list(url):
    """Scrape a proxy-listing page and fill the global ``agent_lists``.

    The page at *url* is fetched with a browser-like User-Agent, the element
    with id ``ip_list`` is located, and every ``<tr>`` after the first one is
    turned into a one-entry ``{scheme: proxy_url}`` dict that can be passed
    as the ``proxies=`` argument of ``requests.get``.

    Args:
        url: Address of the proxy-listing page.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
        AttributeError: If the page has no element with id ``ip_list``.
    """
    global agent_lists
    agent_lists = []
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    # A timeout keeps an unresponsive listing site from hanging start-up.
    r = requests.get(url, headers=header, timeout=10)
    rows = BeautifulSoup(r.content, 'html.parser').find(id="ip_list").find_all('tr')[1:]
    for row in rows:
        info = row.find_all('td')
        # Skip rows that lack the expected cells or whose cells hold no
        # plain text (``.string`` is None in that case).
        if len(info) < 6 or not info[1].string or not info[5].string:
            continue
        # requests selects a proxy by the lower-case URL scheme, so a key
        # such as "HTTP" would silently never be used.
        scheme = info[5].string.strip().lower()
        # NOTE(review): only cell 1 goes into the proxy URL; if the listing
        # keeps the port in a separate cell it is not included -- confirm.
        agent_lists.append({scheme: 'http://' + info[1].string.strip()})

# Entry point: fetch proxies, parse arguments, run the scan, save the results
def main():
    """Run a complete scan and write the URLs that answered 200 to a file.

    Steps: load proxies via ``agent_list``, parse the command line via
    ``get_args``, scan via ``url_makeup``, wait for all scan threads, then
    print ``real_url_list`` and write it, one URL per line, to
    ``网站目录扫描结果.txt`` in the current directory.
    """
    agent_url = 'http://www.xicidaili.com/nn/1'
    agent_list(agent_url)
    get_args()
    url_makeup(get_dicts, get_url, get_threshold)
    # Wait for every scan thread that is still running; a fixed short sleep
    # would report and save the results before slow requests have finished.
    for worker in threading.enumerate():
        if isinstance(worker, scan_thread):
            worker.join()
    print('目标网站存在目录： ', '\n', real_url_list)
    with open(r'网站目录扫描结果.txt', 'w') as f:
        for found in real_url_list:
            f.write(found)
            f.write('\n')

# Start the scan only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

对某网站扫描结果：

查看全文

相关阅读:
CF446DDZY Loves Games【高斯消元,矩阵乘法】
PHP操作MongoDB数据库
 PHP linux spl_autoload_register区分大小写
 win7 64位安装redis 及Redis Desktop Manager使用
 svn的搭建
 php 扩展 redis
CI reids 缓存
 拿起键盘写下我的第一封博客
 自我介绍
 课程目标

原文地址：https://www.cnblogs.com/kaiho/p/8578848.html

最新文章
FCM算法
 蚁群算法
 数组
 函数数组循环
 遗传算法优化
 图
 Kmeans算法
 html.css
如何居中div
html.css总结

改进 网站资源探测工具(添加代理）

改进网站资源探测工具(添加代理）