zoukankan      html  css  js  c++  java
  • 提取网页中的超级链接

    using System;
    using System.Xml;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Collections;
    using System.Text.RegularExpressions;

    public class App
    {
    /// <summary>
    /// Program entry point: prompts for a page address, downloads that page,
    /// extracts the hyperlinks found in it and writes them to an XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");
        string strURL = Console.ReadLine();

        // ReadLine returns null when standard input is closed; a blank line is
        // equally unusable, so stop here instead of failing later with an exception.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("网页地址不能为空。");
            return;
        }
        strURL = strURL.Trim();

        // StartsWith is safe for inputs shorter than the prefix (Substring(0,7) is not).
        // An address that already carries an https:// scheme must not get http:// prepended.
        bool hasScheme =
            strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strURL = @"http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the page at the given address and returns its HTML source as text.
    /// </summary>
    /// <param name="URL">Absolute URL of the page to fetch.</param>
    /// <returns>The response body, decoded as GB2312.</returns>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);

        // Configure the request BEFORE it is sent: GetResponse() performs the
        // exchange, so settings applied afterwards cannot influence it.
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose response, stream and reader deterministically so the
        // underlying connection is released even if reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (Stream body = hwRes.GetResponseStream())
        // The decoder is fixed to GB2312; pages served in another charset will not decode correctly.
        using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the http:// addresses that occur in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan.</param>
    /// <returns>
    /// An <see cref="ArrayList"/> of strings holding each distinct matched
    /// address once, sorted with the list's default ordering.
    /// </returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        const string urlPattern = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

        ArrayList uniqueLinks = new ArrayList();

        foreach (Match hit in Regex.Matches(htmlCode, urlPattern, RegexOptions.IgnoreCase))
        {
            string candidate = hit.Value;

            // Skip addresses that were already collected.
            if (uniqueLinks.Contains(candidate))
            {
                continue;
            }

            uniqueLinks.Add(candidate);
        }

        uniqueLinks.Sort();
        return uniqueLinks;
    }

    /// <summary>
    /// Writes the extracted addresses to the file "HyperLinks.xml".
    /// </summary>
    /// <param name="strURL">Address of the page the links were taken from; recorded in an XML comment.</param>
    /// <param name="alHyperLinks">The addresses (strings) to write, one element per address.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8);

        try
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);

            // An XML comment must not contain "--"; WriteComment rejects such text.
            // Put a space after every '-' that is directly followed by another '-'.
            string commentUrl = (strURL == null) ? "" : Regex.Replace(strURL, "-(?=-)", "- ");
            writer.WriteComment("提取自" + commentUrl + "的超链接");

            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                // The element name is the address's domain suffix, the element text is the address itself.
                string title = GetDomain(str);
                string body = str;
                writer.WriteElementString(title, null, body);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();

            writer.Flush();
        }
        finally
        {
            // Always release the file handle, including when a write above throws.
            writer.Close();
        }
    }

    /// <summary>
    /// Returns the domain suffix (com, net, cn, org or gov) of an address.
    /// </summary>
    /// <param name="strURL">The address to inspect; the scheme prefix is optional.</param>
    /// <returns>
    /// The suffix without the leading dot, in the same letter case as it appears
    /// in <paramref name="strURL"/>, or "other" when the host does not end in one
    /// of the recognised suffixes.
    /// </returns>
    static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        // Only the host part is examined: an optional scheme, then characters up to
        // the first '/', '?' or '#'. The suffix must be the end of the host, i.e. be
        // followed by a port, a path, a query, a fragment or the end of the string.
        // This recognises addresses without a trailing slash and ignores ".com/"
        // sequences that merely occur inside the path.
        string strRegex = @"^(?:[a-z][a-z0-9+.\-]*://)?[^/?#]*?\.(com|net|cn|org|gov)(?=[:/?#]|$)";

        Match m = Regex.Match(strURL, strRegex, RegexOptions.IgnoreCase);

        if (!m.Success)
        {
            return "other";
        }

        return m.Groups[1].Value;
    }

  • 相关阅读:
    HTB靶场记录之Cronos
    大家好,我是菜菜子,Can’t RCE安全团队队长
    Linux进阶教程丨第11章:归档和传输文件
    Java自动化审计(上篇)
    HTB靶场记录之Arctic
    Linux进阶教程丨第14章:管理基本存储和管理逻辑卷
    博客园文章自定义的图片放大功能失效修复
    caffeine配置及注意事项
    CR和LF
    Capsule-based Object Tracking with Natural Language Specification AHU
  • 原文地址:https://www.cnblogs.com/zhangpengshou/p/1699886.html
Copyright © 2011-2022 走看看