zoukankan      html  css  js  c++  java
  • 提取网页中的超级链接

    using System;
    using System.Xml;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.Collections;
    using System.Text.RegularExpressions;

    public class App
    {
    /// <summary>
    /// Console entry point: asks for a page address, downloads the page,
    /// extracts the hyperlinks found in it and writes them to an XML file.
    /// </summary>
    public static void Main()
    {
        string strCode;
        ArrayList alLinks;

        Console.Write("请输入一个网页地址:");
        string strURL = Console.ReadLine();

        // ReadLine returns null at end of input; treat that like an empty answer.
        strURL = (strURL == null) ? "" : strURL.Trim();
        if (strURL.Length == 0)
        {
            // Nothing to download; stop instead of failing on an empty address.
            Console.WriteLine("未输入网页地址。");
            return;
        }

        // StartsWith is safe for inputs shorter than the prefix (Substring(0, 7)
        // throws on them). The check ignores case and leaves https:// addresses
        // untouched so they are not turned into "http://https://...".
        bool hasScheme =
            strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strURL = @"http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    // 获取指定网页的HTML代码
    /// <summary>
    /// Downloads the resource at <paramref name="URL"/> with an HTTP GET
    /// request and returns the response body as text.
    /// </summary>
    /// <param name="URL">Absolute URL of the page to download.</param>
    /// <returns>The whole response body, decoded as GB2312.</returns>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);

        // Configure the request before it is sent: values assigned after
        // GetResponse() cannot influence a request that has already gone out.
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);
        hwReq.Method = "GET"; // canonical upper-case spelling of the HTTP method
        hwReq.KeepAlive = false;

        // Dispose the response and the reader deterministically so the
        // connection is released even when reading the body fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (StreamReader reader = new StreamReader(hwRes.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    // 提取HTML代码中的网址
    /// <summary>
    /// Extracts the distinct http:// URLs that occur in a piece of HTML text.
    /// </summary>
    /// <param name="htmlCode">Text to scan; null is treated as empty text.</param>
    /// <returns>
    /// An ArrayList of strings holding every matched URL once, sorted with the
    /// default ArrayList ordering. The list is empty when nothing matches.
    /// </returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();

        // Nothing to scan: return the empty list rather than letting the
        // regex engine throw on a null input.
        if (htmlCode == null)
        {
            return al;
        }

        string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
        Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

        // Remember the URLs already collected in a hash table so each
        // duplicate check is a single lookup instead of a scan of the list.
        Hashtable seen = new Hashtable();

        foreach (Match match in r.Matches(htmlCode))
        {
            string strNew = match.Value;

            // Filter out repeated URLs (exact, case-sensitive comparison).
            if (!seen.ContainsKey(strNew))
            {
                seen.Add(strNew, null);
                al.Add(strNew);
            }
        }

        al.Sort();

        return al;
    }

    // 把网址写入xml文件
    /// <summary>
    /// Writes the extracted hyperlinks to the file "HyperLinks.xml" in the
    /// current directory.
    /// </summary>
    /// <param name="strURL">Address of the scanned page; it is only used in the XML comment.</param>
    /// <param name="alHyperLinks">
    /// URLs (strings) to write. Each one becomes an element named after the
    /// value GetDomain returns for it. Null is treated as an empty list.
    /// </param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // XmlTextWriter.WriteComment rejects text containing "--", which a URL
        // may legitimately contain, so break such runs up for the comment only.
        // Ordinal search keeps the loop condition consistent with Replace.
        string commentUrl = (strURL == null) ? "" : strURL;
        while (commentUrl.IndexOf("--", StringComparison.Ordinal) >= 0)
        {
            commentUrl = commentUrl.Replace("--", "- -");
        }

        XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8);
        try
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + commentUrl + "的超链接");
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            if (alHyperLinks != null)
            {
                foreach (string str in alHyperLinks)
                {
                    string title = GetDomain(str);
                    string body = str;
                    writer.WriteElementString(title, null, body);
                }
            }

            // Close the inner and the outer HyperLinks elements.
            writer.WriteEndElement();
            writer.WriteEndElement();

            writer.Flush();
        }
        finally
        {
            // Always release the file handle, even when a write above throws.
            writer.Close();
        }
    }

    // 获取网址的域名后缀
    /// <summary>
    /// Returns the domain suffix (com, net, cn, org or gov) found in a URL.
    /// </summary>
    /// <param name="strURL">URL to inspect; null is treated as having no suffix.</param>
    /// <returns>
    /// The first recognised suffix that is followed by "/" or ends the URL,
    /// spelled as it appears in the URL and without the dot or slash, or
    /// "other" when no recognised suffix is found.
    /// </returns>
    static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        // Accept the suffix before "/" or at the end of the URL, so an address
        // such as "http://www.example.com" is recognised without a trailing
        // slash. The capture group holds the bare suffix, so no second pass is
        // needed to strip the dot and slash.
        Regex r = new Regex(@"\.(com|net|cn|org|gov)(/|$)", RegexOptions.IgnoreCase);
        Match m = r.Match(strURL);

        return m.Success ? m.Groups[1].Value : "other";
    }

  • 相关阅读:
    UI第十七讲.图片异步加载(包括第三方), KVO, KVC
    第二十一讲.UICollectionView(集合视图)以及瀑布流效果, 通知中心(NSNotificationCenter).
    UI第十九讲:数据库
    UI第十八讲.初级数据持久化 (沙盒, 简单写入对象, 归解档, NSUserDefaults写入读取文件 )
    第十四讲(下) 可视化方式的实现通讯录.(及storyboard方式)
    第十六讲.网络编程 (HTTP协议)
    wslgit
    MyBatis时间比较
    layer开启与关闭加载层
    MyBatis中choose when正确写法
  • 原文地址:https://www.cnblogs.com/zhangpengshou/p/1699886.html
Copyright © 2011-2022 走看看