zoukankan      html  css  js  c++  java
  • 简易网页采集器的实现

    --------------------------------------------------------------------------
    -----------------------------Cryking 原创-----------------------------
    -----------------------转载请注明出处,谢谢!------------------------ 

    自己写的一个扫描网址标题的小工具.

    功能:遍历指定范围的IP,根据IP扫描网页的标题,并记录(支持二级重定向网页的扫描)

             自动记录采集日志到D盘的net_collect.log文件中.

    类型:控制台程序

    实现语言:C#

    需要的环境: .NET 3.5

    可选的环境:Oracle数据库

    相关的缺省值说明:

    缺省(直接按回车即是缺省值)Oracle数据库用户:scott

    缺省Oracle数据库密码:tigger

    缺省Oracle数据库连接标识符:orcl  (即TNSNAME名称)

    缺省的HTTP连接超时时间:6秒

    缺省启用数据库来记录采集到的信息

    缺省不启用扫描完成后自动关机

     (篇幅原因,数据库连接类这里就不贴了)

    主函数代码如下:

    /// <summary>
    /// Console entry point of the scanner. Prompts for the HTTP timeout, the auto-shutdown
    /// option and the optional Oracle connection, creates the NET_COLLECT table when it is
    /// missing, splits the requested IP range with allocaIncreament, runs eight ipScan worker
    /// threads and records the start/end time in d:\net_collect.log.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        try
        {
            string user = "scott";
            string pwd = "tigger";
            string tns = "orcl";

            Console.WriteLine("***************简易网址扫描器V1.0*****************");
            Console.WriteLine("**************Created  By Cryking*****************");
            Console.WriteLine("******************QQ:278676125********************");
            Console.WriteLine("**************************************************");
            Console.WriteLine("请设置超时时间(若网络环境较差,建议设大一点,如100秒)(单位/秒):");
            // Pressing Enter (or typing garbage) must fall back to the default instead of
            // aborting the whole program with a FormatException.
            int parsedTimeout;
            if (Int32.TryParse(Console.ReadLine(), out parsedTimeout) && parsedTimeout > 0)
            {
                timeOut = parsedTimeout;
            }
            else if (timeOut <= 0)
            {
                // Documented default is 6 seconds; only applied when the field holds no usable value.
                timeOut = 6;
            }

            Console.WriteLine("扫描完成后是否自动关机(Y/N)?");
            if (Regex.IsMatch(Console.ReadLine() ?? string.Empty, "(?i)[y]")) shutDownFlag = 1;
            Console.WriteLine("是否启用数据库支持(不启用则只写日志文件),Y/N?:");
            if (Regex.IsMatch(Console.ReadLine() ?? string.Empty, "(?i)[n]")) DBFlag = 0;

            if (DBFlag == 1)
            {
                Console.WriteLine("请输入Oracle数据库连接用户名:");
                user = Console.ReadLine();
                if (string.IsNullOrEmpty(user)) user = "scott";

                Console.WriteLine("请输入Oracle数据库连接密码:");
                pwd = string.Empty;
                ConsoleKeyInfo info;
                do
                {
                    // Read without echo and print '*' for each accepted character.
                    info = Console.ReadKey(true);
                    if (info.Key == ConsoleKey.Backspace)
                    {
                        // Let the user correct typos: drop the last character and erase its '*'.
                        if (pwd.Length > 0)
                        {
                            pwd = pwd.Substring(0, pwd.Length - 1);
                            Console.Write("\b \b");
                        }
                    }
                    else if (info.Key != ConsoleKey.Enter && info.Key != ConsoleKey.Escape
                             && info.Key != ConsoleKey.Tab && info.KeyChar != '\0')
                    {
                        pwd += info.KeyChar;
                        Console.Write('*');
                    }
                } while (info.Key != ConsoleKey.Enter);
                if (pwd.Length == 0) pwd = "tigger";
                Console.WriteLine();

                Console.WriteLine("请输入Oracle数据库连接标识符(TNSNAME):");
                tns = Console.ReadLine();
                if (string.IsNullOrEmpty(tns)) tns = "orcl";

                if (!DBAccess.DBConnect(user, pwd, tns))
                {
                    MessageBox.Show("数据库连接失败!", "错误001", MessageBoxButtons.OK, MessageBoxIcon.Error);
                    System.Diagnostics.Process.GetCurrentProcess().Kill();
                    return;
                }
                Console.WriteLine("数据库连接成功!");

                if (DBAccess.selectStr("select count(*) from user_objects where object_name='NET_COLLECT' ") == "0")
                {
                    Console.WriteLine("开始创建表(NET_COLLECT),请等待...");
                    if (0 == DBAccess.DBExecSql(@"create table NET_COLLECT(
        IP          VARCHAR2(30) not null,
        PORT        NUMBER default 80,
        TITLE       VARCHAR2(4000),
        URL         VARCHAR2(2000),
        COLLECTDATE DATE default sysdate
    )"))
                    {
                        Console.WriteLine("表(NET_COLLECT)创建成功!");
                    }
                    else
                    {
                        Console.WriteLine("表(NET_COLLECT)创建失败,请参照说明,先手工创建表(NET_COLLECT)!");
                        System.Diagnostics.Process.GetCurrentProcess().Kill();
                        return;
                    }
                }
            }

            Console.WriteLine("请输入扫描范围(如:0.0.0.0-10.10.10.10)");
            string scanInput = Console.ReadLine() ?? string.Empty;
            string[] tmpIp = scanInput.Trim().Split('-');
            // Reject input that is not "start-end" up front instead of failing with an
            // IndexOutOfRangeException on tmpIp[1].
            if (tmpIp.Length != 2 || tmpIp[0].Trim().Length == 0 || tmpIp[1].Trim().Length == 0)
            {
                Console.WriteLine("扫描范围格式错误,应为: 起始IP-结束IP");
                if (DBFlag == 1) DBAccess.DBClose();
                return;
            }

            // Split the IP range across the 8 worker threads.
            string[] ipScanScop = allocaIncreament(tmpIp[0].Trim(), tmpIp[1].Trim());

            logFile = new StreamWriter("d:\\net_collect.log", true);
            DateTime startTime = DateTime.Now;
            TimeSpan ts = TimeSpan.Zero;
            try
            {
                logFile.WriteLine("开始时间:" + DateTime.Now.ToString());

                // Run 8 worker threads, one per allocated sub-range.
                const int workerCount = 8;
                Thread[] workers = new Thread[workerCount];
                for (int n = 0; n < workerCount; n++)
                {
                    workers[n] = new Thread(new ParameterizedThreadStart(ipScan));
                    workers[n].Start(ipScanScop[n]);
                }

                // Join blocks without spinning a CPU core and also returns when a worker ends
                // through an exception, which the former "flag == 8" polling loop never noticed.
                // NOTE(review): assumes ipScan does its work on the thread it is started on.
                foreach (Thread worker in workers)
                {
                    worker.Join();
                }

                // Only close the connection that was actually opened above.
                if (DBFlag == 1) DBAccess.DBClose();
                ts = DateTime.Now - startTime;
                logFile.WriteLine("结束时间:" + DateTime.Now.ToString());
            }
            finally
            {
                // Always release the log file, even when a step above throws.
                logFile.Close();
            }

            Console.WriteLine("总共花费时间:" + ts.ToString());
            if (1 == shutDownFlag)
            {
                // Power the machine off once the scan has finished.
                Process.Start("Shutdown.exe", " -s -t 0");
            }
            Console.ReadKey();
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
    }


     扫描功能函数:

    /// <summary>
    /// Worker routine for one scan thread. Visits every IPv4 address of a "start-end" range
    /// (both ends inclusive), fetches the page served at that address, derives a title from it,
    /// optionally stores it in NET_COLLECT and appends it to the shared log file.
    /// Addresses in 10.x.x.x, 127.x.x.x and 192.168.x.x are skipped.
    /// </summary>
    /// <param name="obj">Range text of the form "a.b.c.d-e.f.g.h"; its ToString() value is used.</param>
    /// <remarks>
    /// A range whose start equals its end is treated as empty, as in the previous version:
    /// allocaIncreament hands such "x-x" ranges to threads that have nothing to do.
    /// Errors are written to the console; the completion counter <c>flag</c> is incremented
    /// in every case so that code waiting for all workers cannot hang.
    /// </remarks>
    static void ipScan1(object obj)
    {
        try
        {
            string[] scope = obj.ToString().Split('-');

            // Compare addresses numerically; a text comparison orders "9.0.0.0" after "10.0.0.0".
            long first = scanParseIp(scope[0]);
            long last = scanParseIp(scope[1]);
            if (first > last)
            {
                long swap = first;
                first = last;
                last = swap;
            }

            if (first == last) return; // idle placeholder range

            string logBuffer = "";
            // long loop variable: a uint would wrap around after 255.255.255.255.
            for (long current = first; current <= last; current++)
            {
                int a = (int)((current >> 24) & 0xFF);
                int b = (int)((current >> 16) & 0xFF);

                if (10 == a || 127 == a)
                {
                    // Skip the whole /8 block: jump to its last address, the loop increment
                    // then moves on to the next block.
                    current |= 0x00FFFFFFL;
                    continue;
                }
                if (192 == a && 168 == b)
                {
                    // Skip the whole 192.168.0.0/16 block.
                    current |= 0x0000FFFFL;
                    continue;
                }

                int c = (int)((current >> 8) & 0xFF);
                int d = (int)(current & 0xFF);
                string ip = a.ToString() + "." + b.ToString() + "." + c.ToString() + "." + d.ToString();

                // The platform default encoding is used, so some pages may come back garbled.
                string html = GetHtmlInfo(ip, timeOut * 1000, Encoding.Default);
                string title = GetTitle(html);
                // No title found: fall back to the first 1000 characters of the response text.
                title = title == string.Empty ? (html.Length > 1000 ? html.Substring(0, 1000) : html) : title;

                if (html != string.Empty && html != "无法连接到远程服务器" && DBFlag == 1)
                {
                    // The title comes from an arbitrary remote server: double every single
                    // quote so it cannot terminate the SQL string literal.
                    string safeTitle = (title ?? string.Empty).Replace("'", "''");
                    DBAccess.DBExecSql("insert into net_collect values('" + ip + "',default,'" + safeTitle + "','',default)");
                }
                Console.WriteLine(ip + " --" + title);

                // Both synchronisation objects of the original are kept; the writer is now
                // flushed and the shared counters are updated inside the critical section.
                lock (logFile)
                {
                    myMutex.WaitOne();
                    try
                    {
                        // Consecutive identical responses are logged only once.
                        if (logBuffer != html)
                        {
                            logFile.WriteLine("ip:" + ip + " [MSG:]" + title);
                            logBuffer = html;
                        }
                        logFile.Flush();
                        countPort++;
                        count++;
                    }
                    finally
                    {
                        myMutex.ReleaseMutex();
                    }
                }
            }
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
        }
        finally
        {
            // Atomic, and executed on failure too, so the count of finished workers stays exact.
            // NOTE(review): assumes flag is declared as int (or long) - confirm.
            Interlocked.Increment(ref flag);
        }
    }

    /// <summary>
    /// Converts dotted-decimal IPv4 text into its 32-bit numeric value (held in a long).
    /// </summary>
    /// <param name="text">Address such as "192.0.2.1".</param>
    /// <returns>The address as a number between 0 and 4294967295.</returns>
    /// <exception cref="FormatException">The text is not four octets in the range 0-255.</exception>
    static long scanParseIp(string text)
    {
        string[] octets = (text ?? string.Empty).Trim().Split('.');
        if (octets.Length != 4) throw new FormatException("无效的IPv4地址: " + text);

        long value = 0;
        foreach (string octet in octets)
        {
            int part = Int32.Parse(octet.Trim());
            if (part < 0 || part > 255) throw new FormatException("无效的IPv4地址: " + text);
            value = (value << 8) | (long)part;
        }
        return value;
    }


     

     网页信息获取函数

    /// <summary>
    /// Downloads the beginning of the page at <paramref name="url"/>: lines are read until the
    /// one containing the closing title tag (or to the end of the response if there is none).
    /// One redirect answer (301, 302, 303 or 307) is followed explicitly.
    /// </summary>
    /// <param name="url">Host or URL; "http://" is prepended when no http/https scheme is present.</param>
    /// <param name="timeout">Timeout in milliseconds, applied to connecting and to stream reads.</param>
    /// <param name="EnCodeType">Encoding used to decode the response body.</param>
    /// <returns>
    /// The text read; an empty string when the final status is not 200 OK; or the exception
    /// message when the request fails (callers compare against such messages, so this
    /// error-as-return-value contract is kept).
    /// </returns>
    static string GetHtmlInfo(string url, int timeout, Encoding EnCodeType)
    {
        if (!url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            && !url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            url = "http://" + url;
        }

        HttpWebRequest request = null;
        HttpWebResponse response = null;
        StreamReader reader = null;
        try
        {
            request = createScanRequest(url, timeout);
            response = (HttpWebResponse)request.GetResponse();

            // Auto-redirect is off for the first request, so redirect answers arrive here.
            System.Net.HttpStatusCode status = response.StatusCode;
            bool redirected = status == System.Net.HttpStatusCode.MovedPermanently
                              || status == System.Net.HttpStatusCode.Found
                              || status == System.Net.HttpStatusCode.SeeOther
                              || status == System.Net.HttpStatusCode.TemporaryRedirect;
            string location = redirected ? response.Headers["Location"] : null;
            if (!string.IsNullOrEmpty(location))
            {
                // Location may be relative; resolve it against the address just requested.
                Uri target = new Uri(request.RequestUri, location);

                // Release the first exchange before replacing it, otherwise its connection leaks.
                response.Close();
                response = null;
                request.Abort();

                request = createScanRequest(target.AbsoluteUri, timeout);
                // As before, any further hops of the second request are followed automatically.
                request.AllowAutoRedirect = true;
                response = (HttpWebResponse)request.GetResponse();
            }

            if (response.StatusCode != System.Net.HttpStatusCode.OK)
            {
                return string.Empty;
            }

            reader = new StreamReader(response.GetResponseStream(), EnCodeType);
            StringBuilder builder = new StringBuilder();
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                builder.Append(line);
                // ReadLine returns whole lines, so text after the closing tag on the same line
                // is included. Only the new line is inspected (the tag cannot span lines),
                // which avoids re-materialising the whole buffer on every iteration.
                if (line.IndexOf("</title>", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    break;
                }
                builder.Append("\r\n");
            }
            return builder.ToString();
        }
        catch (Exception ex)
        {
            return ex.Message;
        }
        finally
        {
            // Single clean-up point for every exit path.
            if (reader != null) { reader.Close(); }
            if (response != null) { response.Close(); }
            if (request != null) { request.Abort(); }
        }
    }

    /// <summary>
    /// Creates a request carrying the scanner's standard headers and timeouts.
    /// Automatic redirects are disabled; the caller decides whether to follow them.
    /// </summary>
    /// <param name="url">Absolute http/https URL.</param>
    /// <param name="timeout">Timeout in milliseconds for the response and for stream reads.</param>
    /// <returns>The configured, not yet sent request.</returns>
    static HttpWebRequest createScanRequest(string url, int timeout)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = timeout;
        // Also bound reads of the body; the framework default of five minutes would stall a
        // worker on servers that accept the connection and then stop sending.
        request.ReadWriteTimeout = timeout;
        // Header value only - the former literal wrongly started with "User-Agent:".
        request.UserAgent = "Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 2.0.40607; .NET CLR 1.1.4322; .NET CLR 3.5.30729;.NET CLR 1.0.3705)";
        request.Accept = "*/*";
        request.AllowAutoRedirect = false;
        request.KeepAlive = true;
        request.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        return request;
    }


     

     IP范围分配函数(分配给各个线程)

    /// <summary>
    /// Splits the inclusive IPv4 range between <paramref name="tmpIp0"/> and
    /// <paramref name="tmpIp1"/> into eight "start-end" strings, one per scan thread.
    /// </summary>
    /// <param name="tmpIp0">One end of the range in dotted-decimal form.</param>
    /// <param name="tmpIp1">The other end; the two are swapped when given in descending order.</param>
    /// <returns>
    /// Always eight entries. Working entries are contiguous, never overlap, together cover the
    /// range exactly and each span at least two addresses. Entries that receive no work are
    /// "end-end", where end is the last address of the range; a single-address request
    /// therefore yields eight such entries, as before.
    /// </returns>
    /// <exception cref="FormatException">An argument is not a valid IPv4 address.</exception>
    static string[] allocaIncreament(string tmpIp0, string tmpIp1)
    {
        const int slotCount = 8;

        // Work on numeric values: text comparison orders "9.0.0.0" after "10.0.0.0", and
        // 32-bit int arithmetic overflows for spans of 128 or more in the first octet.
        long first = allocParseIp(tmpIp0);
        long last = allocParseIp(tmpIp1);
        if (first > last)
        {
            long swap = first;
            first = last;
            last = swap;
        }

        string[] ipResult = new string[slotCount];
        string idleRange = allocFormatIp(last) + "-" + allocFormatIp(last);

        long total = last - first + 1;
        // Every working slot gets at least two addresses, because a "x-x" range is the
        // marker for an idle slot.
        int workers = (int)Math.Min((long)slotCount, total / 2);
        long baseSize = workers > 0 ? total / workers : 0;
        long extra = workers > 0 ? total % workers : 0; // the first 'extra' slots take one more

        long cursor = first;
        for (int slot = 0; slot < slotCount; slot++)
        {
            if (slot >= workers)
            {
                ipResult[slot] = idleRange;
                continue;
            }

            long size = baseSize + (slot < extra ? 1 : 0);
            long chunkEnd = cursor + size - 1;
            ipResult[slot] = allocFormatIp(cursor) + "-" + allocFormatIp(chunkEnd);
            cursor = chunkEnd + 1;
        }
        return ipResult;
    }

    /// <summary>
    /// Converts dotted-decimal IPv4 text into its 32-bit numeric value (held in a long).
    /// </summary>
    /// <param name="text">Address such as "192.0.2.1".</param>
    /// <returns>The address as a number between 0 and 4294967295.</returns>
    /// <exception cref="FormatException">The text is not four octets in the range 0-255.</exception>
    static long allocParseIp(string text)
    {
        string[] octets = (text ?? string.Empty).Trim().Split('.');
        if (octets.Length != 4) throw new FormatException("无效的IPv4地址: " + text);

        long value = 0;
        foreach (string octet in octets)
        {
            int part = Int32.Parse(octet.Trim());
            if (part < 0 || part > 255) throw new FormatException("无效的IPv4地址: " + text);
            value = (value << 8) | (long)part;
        }
        return value;
    }

    /// <summary>
    /// Formats a 32-bit numeric IPv4 value as dotted-decimal text.
    /// </summary>
    /// <param name="value">Address value between 0 and 4294967295.</param>
    /// <returns>Text such as "192.0.2.1".</returns>
    static string allocFormatIp(long value)
    {
        return ((value >> 24) & 0xFF).ToString() + "." + ((value >> 16) & 0xFF).ToString() + "."
               + ((value >> 8) & 0xFF).ToString() + "." + (value & 0xFF).ToString();
    }


     运行的界面如下:

    ---

    工具下载地址:http://pan.baidu.com/share/link?shareid=657915&uk=2449788611

    有任何问题及建议,请联系我QQ:278676125

  • 相关阅读:
    【TIDB】2、TIDB进阶
    【TIDB】1、TiDb简介
    【Tair】淘宝分布式NOSQL框架:Tair
    【ElasticSearch】查询优化
    【高并发解决方案】9、大流量解决方案
    【高并发解决方案】8、Nginx/LVS/HAProxy负载均衡软件的优缺点详解
    【JVM】jdk1.8-jetty-swap被占满问题排查
    【JVM】记录一次线上SWAP偏高告警的故障分析过程
    【JVM】内存和SWAP问题
    【MySQL】mysql索引结构及其原理
  • 原文地址:https://www.cnblogs.com/javawebsoa/p/3074758.html
Copyright © 2011-2022 走看看