zoukankan
html css js c++ java
提取网页中的链接并生成xml
using System;
using System.Collections;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

/// <summary>
/// Console tool that downloads a web page, extracts the distinct http:// links
/// found in its HTML, and writes them to HyperLinks.xml in the working directory.
/// </summary>
public class App
{
    /// <summary>
    /// Prompts for a page address, then runs the download / extract / write pipeline.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");
        string strURL = Console.ReadLine();

        // Console.ReadLine returns null at end of input; blank input cannot form a URL.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("网页地址不能为空。");
            return;
        }

        strURL = strURL.Trim();

        // Prepend a scheme only when none was typed. StartsWith is safe for inputs
        // shorter than the prefix (Substring(0, 7) was not), and https:// addresses
        // are left as they are instead of becoming "http://https://...".
        if (!strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            && !strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            strURL = "http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute address of the page to fetch.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    private static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request options must be set before GetResponse() sends the request.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // Dispose the response and the reader so the connection is released
        // even when reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (StreamReader reader = new StreamReader(hwRes.GetResponseStream(), Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct http:// URLs that appear in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
    /// <returns>An ArrayList of unique URL strings, sorted with ArrayList.Sort().</returns>
    private static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (htmlCode == null || htmlCode.Length == 0)
        {
            return al;
        }

        string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
        Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

        // Hash-based membership test replaces the per-match linear scan of the
        // result list; string keys compare by ordinal equality, same as ==.
        Hashtable seen = new Hashtable();
        foreach (Match match in r.Matches(htmlCode))
        {
            string strNew = match.Value;
            if (!seen.ContainsKey(strNew))
            {
                seen.Add(strNew, null);
                al.Add(strNew);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the extracted URLs to HyperLinks.xml, one element per URL, where the
    /// element name is the value returned by GetDomain for that URL.
    /// </summary>
    /// <param name="strURL">Address of the page the links were taken from (written into a comment).</param>
    /// <param name="alHyperLinks">URL strings to write.</param>
    private static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // using guarantees the file handle is closed even if a write throws.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + strURL + "的超链接");
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string str in alHyperLinks)
            {
                string title = GetDomain(str);
                string body = str;
                writer.WriteElementString(title, null, body);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL without punctuation.
    /// </summary>
    /// <param name="strURL">URL to inspect.</param>
    /// <returns>
    /// The first of com, net, cn, org or gov that appears as ".suffix/" in the URL
    /// (in the casing found in the URL), or "other" when none matches.
    /// </returns>
    private static string GetDomain(string strURL)
    {
        string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
        Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
        Match m = r.Match(strURL);

        // A failed match yields an empty string, which falls through to "other".
        string retVal = m.Value;

        // Strip the leading dot and the trailing slash, e.g. ".com/" -> "com".
        strRegex = @"\.|/$";
        retVal = Regex.Replace(retVal, strRegex, "");

        if (retVal == "")
        {
            retVal = "other";
        }

        return retVal;
    }
}
查看全文
相关阅读:
mysql修改数据表名
HDU 5742 It's All In The Mind (贪心)
HDU 5752 Sqrt Bo (数论)
HDU 5753 Permutation Bo (推导 or 打表找规律)
HDU 5762 Teacher Bo (暴力)
HDU 5754 Life Winner Bo (博弈)
CodeForces 455C Civilization (并查集+树的直径)
CodeForces 455B A Lot of Games (博弈论)
CodeForces 455A Boredom (DP)
HDU 4861 Couple doubi (数论 or 打表找规律)
原文地址:https://www.cnblogs.com/zhangqs008/p/2341121.html
最新文章
pcl曲面重建模块-poisson重建算法示例
js 三大家族之offset
js 原生JS实现轮播图
js 封装一个均速动画函数
js 定时器的用法和清除
js location
js BOM
js 事件的阶段
js 冒泡事件与解决冒泡事件
js 任意元素解绑任意事件的兼容代码
热门文章
JS 为任意元素添加任意事件的兼容代码
检测浏览器类型
jq图片点击居中放大原始图片兼容ie
php标签云制作——数据表的结构和查询方法
jQuery 参考手册
php 操作数组 (合并,拆分,追加,查找,删除等)
jq图片切换特效
php二维数组的取值与转换
thinkphp关联模型的用法
修改wamp默认网站目录
Copyright © 2011-2022 走看看