zoukankan      html  css  js  c++  java
  • 提取网页中的超链接

    using System;
    using System.Xml;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Collections;
    using System.Text.RegularExpressions;

    // Console tool that downloads a web page, extracts the distinct http:// links
    // found in its HTML and writes them to HyperLinks.xml in the working directory.
    public class App
    {
        // Entry point: prompts for a page address, downloads the page, extracts its
        // hyperlinks and writes them to the XML file.
        public static void Main()
        {
            string strCode;
            ArrayList alLinks;

            Console.Write("请输入一个网页地址:");
            string strURL = Console.ReadLine();

            // ReadLine returns null at end of input, and the user may just press
            // Enter; neither case can be turned into a usable address.
            if (strURL == null || strURL.Trim().Length == 0)
            {
                Console.WriteLine("未输入网页地址。");
                return;
            }

            strURL = strURL.Trim();

            // Prepend the default scheme only when the address has none. StartsWith
            // is safe for inputs shorter than the prefix, and the comparison ignores
            // case so "HTTP://..." and "https://..." are left untouched.
            if (!strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                && !strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                strURL = @"http://" + strURL;
            }

            Console.WriteLine("正在获取页面代码,请稍侯...");
            strCode = GetPageSource(strURL);

            Console.WriteLine("正在提取超链接,请稍侯...");
            alLinks = GetHyperLinks(strCode);

            Console.WriteLine("正在写入文件,请稍侯...");
            WriteToXml(strURL, alLinks);
        }

        // Downloads the page at the given address and returns its HTML source.
        // URL must be an absolute address accepted by the Uri constructor.
        // The response body is always decoded as GB2312.
        static string GetPageSource(string URL)
        {
            Uri uri = new Uri(URL);

            HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

            // The request must be configured before GetResponse() sends it;
            // settings applied afterwards have no effect on this request.
            hwReq.Method = "GET";
            hwReq.KeepAlive = false;

            // Dispose the response, its stream and the reader so the connection
            // is released even when reading fails.
            using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
            using (Stream body = hwRes.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }

        // Extracts the http:// addresses contained in the HTML source.
        // Returns a sorted list of the distinct matches (as strings); the list is
        // empty when htmlCode is null or contains no match.
        static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();

            if (htmlCode == null)
            {
                return al;
            }

            string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

            Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

            // Tracks the addresses already collected so duplicates are filtered
            // with a hash lookup instead of rescanning the whole list.
            Hashtable seen = new Hashtable();

            foreach (Match m in r.Matches(htmlCode))
            {
                string strNew = m.Value;

                if (!seen.ContainsKey(strNew))
                {
                    seen.Add(strNew, null);
                    al.Add(strNew);
                }
            }

            al.Sort();

            return al;
        }

        // Writes the addresses to HyperLinks.xml.
        // strURL is the page the links were taken from (recorded in an XML comment);
        // alHyperLinks holds the addresses as strings. Each address becomes one
        // element named after its domain suffix (see GetDomain).
        static void WriteToXml(string strURL, ArrayList alHyperLinks)
        {
            XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8);

            try
            {
                writer.Formatting = Formatting.Indented;
                writer.WriteStartDocument(false);
                writer.WriteDocType("HyperLinks", null, "urls.dtd", null);

                // An XML comment must not contain "--" (WriteComment throws), yet
                // addresses such as punycode hosts ("xn--...") do. Put a space after
                // every hyphen that is directly followed by another hyphen.
                string commentUrl = (strURL == null) ? "" : Regex.Replace(strURL, "-(?=-)", "- ");
                writer.WriteComment("提取自" + commentUrl + "的超链接");

                writer.WriteStartElement("HyperLinks");
                writer.WriteStartElement("HyperLinks", null);
                writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

                if (alHyperLinks != null)
                {
                    foreach (string str in alHyperLinks)
                    {
                        string title = GetDomain(str);
                        string body = str;
                        writer.WriteElementString(title, null, body);
                    }
                }

                writer.WriteEndElement();
                writer.WriteEndElement();

                writer.Flush();
            }
            finally
            {
                // Always release the file handle, even when writing failed.
                writer.Close();
            }
        }

        // Returns the domain suffix of an address: the matched text of one of
        // com, net, cn, org, gov (in the casing used by the address), or "other"
        // when the host does not end in one of them.
        static string GetDomain(string strURL)
        {
            if (strURL == null)
            {
                return "other";
            }

            // Only the host part is inspected: an optional "scheme://" prefix, then
            // everything up to the first '/', '?' or '#'. The suffix must be the end
            // of the host, i.e. followed by ':', '/', '?', '#' or the end of the
            // string, so addresses without a trailing slash are recognised and
            // suffix-like text inside the path is ignored.
            string strRegex = @"^(?:[a-z][a-z0-9+.\-]*://)?[^/?#]*?\.(com|net|cn|org|gov)(?=[:/?#]|$)";

            Match m = Regex.Match(strURL, strRegex, RegexOptions.IgnoreCase);

            if (!m.Success)
            {
                return "other";
            }

            return m.Groups[1].Value;
        }
    }

    本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/21aspnet/archive/2007/03/24/1540012.aspx

  • 相关阅读:
    C++ Boost Thread 编程指南
    boost的Any库学习
    人生规划,关注未来,才能持续发展
    察言观色—看穿他人心理的6种方法
    MS SQL Server 2008发布与订阅
    WebService代理类中对枚举类型的序列化
    Winform注册和注销全局快捷键
    sql server中如何为数据表添加表的描述MS_Description
    如何修改SQL Server 2008数据库服务器名称
    IIS 上发布网站后编译器错误信息: CS0016: 解决办法
  • 原文地址:https://www.cnblogs.com/liufei88866/p/1534195.html
Copyright © 2011-2022 走看看