zoukankan
html css js c++ java
提取网页中的链接并生成xml
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

/// <summary>
/// Console tool that downloads a web page, extracts the absolute http:// links
/// found in its HTML and writes them to the file HyperLinks.xml.
/// </summary>
public class App
{
    // Matches absolute http:// URLs (host plus an optional path/query part).
    private static readonly Regex LinkRegex =
        new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    // Matches one of the recognised domain suffixes immediately followed by a slash.
    private static readonly Regex DomainSuffixRegex =
        new Regex(@"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

    // Strips the dot and the trailing slash from a matched suffix (".com/" -> "com").
    private static readonly Regex SuffixPunctuationRegex = new Regex(@"\.|/$");

    /// <summary>
    /// Prompts for a page address, downloads the page, extracts its links and
    /// writes them to HyperLinks.xml.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");

        // ReadLine returns null at end of input; treat that like an empty answer.
        string input = Console.ReadLine();
        string strURL = input == null ? "" : input.Trim();
        if (strURL.Length == 0)
        {
            Console.WriteLine("网页地址不能为空。");
            return;
        }

        // Only prepend a scheme when none is present. StartsWith is safe for
        // short input (Substring(0, 7) was not) and keeps https:// intact.
        bool hasScheme =
            strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            strURL = @"http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute address of the page to fetch.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    /// <remarks>
    /// The body is always decoded as GB2312, whatever charset the server reports.
    /// </remarks>
    static string GetPageSource(string URL)
    {
        Uri uri = new Uri(URL);
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

        // Request options must be configured before GetResponse() sends the request.
        hwReq.Method = WebRequestMethods.Http.Get;
        hwReq.KeepAlive = false;

        // Dispose the response, stream and reader so the connection is released
        // even when reading fails.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (Stream stream = hwRes.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http:// URLs contained in a piece of HTML.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null or empty yields an empty list.</param>
    /// <returns>The distinct URLs as strings, sorted with the default comparer.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();
        if (string.IsNullOrEmpty(htmlCode))
        {
            return al;
        }

        // The set gives O(1) duplicate detection instead of rescanning the list
        // for every match; ordinal comparison mirrors the original string ==.
        HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
        foreach (Match match in LinkRegex.Matches(htmlCode))
        {
            if (seen.Add(match.Value))
            {
                al.Add(match.Value);
            }
        }

        al.Sort();
        return al;
    }

    /// <summary>
    /// Writes the extracted links to HyperLinks.xml, one element per link named
    /// after the link's domain suffix (see <see cref="GetDomain"/>).
    /// </summary>
    /// <param name="strURL">Address of the page the links came from; written into an XML comment.</param>
    /// <param name="alHyperLinks">List of URL strings to write.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // The using block closes the file even if one of the writes throws.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment("提取自" + strURL + "的超链接");

            // Two nested HyperLinks elements, as in the original output layout;
            // the inner one carries the timestamp attribute.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string link in alHyperLinks)
            {
                writer.WriteElementString(GetDomain(link), null, link);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();
            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL without punctuation.
    /// </summary>
    /// <param name="strURL">URL to classify.</param>
    /// <returns>
    /// The matched suffix text (e.g. "com", "net", "cn", "org" or "gov", in the
    /// casing used by the URL) when the URL contains that suffix followed by a
    /// slash; otherwise "other".
    /// </returns>
    static string GetDomain(string strURL)
    {
        string suffix = DomainSuffixRegex.Match(strURL).Value;

        // ".com/" -> "com": drop the dot and the trailing slash.
        string retVal = SuffixPunctuationRegex.Replace(suffix, "");

        return retVal == "" ? "other" : retVal;
    }
}
查看全文
相关阅读:
codeforces #330 div2
codeforces #332 div2
Codeforces Round #331 (Div. 2)C. Wilbur and Points
poj 01背包
zoj 1200 Mining
nginx反向代理与负载均衡
springcloud----config分布式配置中心
springcloud--zuul 网关
springcloud ----Hystrix熔断器
docker私有镜像仓库harbor搭建和配置
原文地址:https://www.cnblogs.com/zhangqs008/p/2341121.html
最新文章
第九天函数
第八天:函数
字符编码,文件处理
python入门基础
计算机基础一
在Unity3D中开发的Outline Shader
为Unity3D开发AssetBundle资源管理插件 AssetBundle Framework
开发Unity3D动画性能优化插件 GPU Animation Baker Pro
开发Unity3D动画性能优化插件 GPU Animation Baker Basic
浅谈在Unity3D中实现Finite State Machine System有限状态机框架系统
热门文章
浅谈在Unity3D中实现遗传算法插件Genetic Algorithm
浅谈在Unity3D中实现人工神经网络插件Artificial Neural Networks
开发Unity3D空战类插件 战机游戏模板Pro版本
开发Unity3D空战类插件 战机游戏模板
开发Unity3D空战类插件 战机飞行模拟模板Pro版本
数组类型和数组指针
python 基础篇(二)数据类型概述
python 基础篇(一)--linux命令篇
hiho 分冶专题
求质因数,因数,刷质数
Copyright © 2011-2022 走看看