zoukankan
html css js c++ java
CollectionHelper网页采集辅助类
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;

namespace Framework
{
    /// <summary>
    /// Helper methods for scraping web pages: fetching page source and pulling
    /// elements, links and image addresses out of HTML text with regular expressions.
    /// </summary>
    public static class CollectionHelper
    {
        // Constant patterns are compiled once instead of on every call.

        /// <summary>Matches an anchor element with a quoted href; groups: url, text.</summary>
        private static readonly Regex AnchorRegex = new Regex(
            "<a[^>]*? href=[\"'](?<url>[^\"']*?)[\"'][^>]*?>(?<text>[\\w\\W]*?)</a>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>Matches page addresses ending in a known page extension, with an optional http/https scheme.</summary>
        private static readonly Regex UrlRegex = new Regex(
            @"(https?://)?[\w-\.]*([\/]?[\w-])+[\w-]*\.(htm|html|shtm|shtml|aspx|asp|php|jsp)+[\w-\=\?]*",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>Matches an img tag; the src group holds the (double-quoted, single-quoted or bare) address.</summary>
        private static readonly Regex ImgRegex = new Regex(
            @"<img[\s\S]*?src=(""(?<src>[^""]*?)""|'(?<src>[^']*?)'|(?<src>[^>\s]*))[^>]*?>(.*?)",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>Matches any opening or closing HTML tag.</summary>
        private static readonly Regex HtmlTagRegex = new Regex(
            "</?[a-z][^<>]*>",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Returns every element with the given tag name found in the text,
        /// selected by tag name only (attributes are not considered).
        /// </summary>
        /// <param name="orgStr">HTML text to search.</param>
        /// <param name="domElem">
        /// Tag name to look for. It is inserted into the regular expression as-is,
        /// so regex metacharacters in it are interpreted as pattern syntax.
        /// </param>
        /// <returns>
        /// The full text of each matched element. If matching throws, the list
        /// contains the exception message instead.
        /// </returns>
        public static List<string> GetDomElem(string orgStr, string domElem)
        {
            List<string> matchList = new List<string>();
            // The negative lookahead ends the tag name, so "p" no longer matches "pre"
            // and "a" no longer matches "abbr".
            string regStr = string.Format("<{0}(?![\\w-])[^>]*?>[\\s\\S]+?<\\/{0}>", domElem);
            try
            {
                Regex regex = new Regex(regStr, RegexOptions.Compiled | RegexOptions.IgnoreCase);
                foreach (Match match in regex.Matches(orgStr))
                {
                    matchList.Add(match.Value);
                }
            }
            catch (Exception ex)
            {
                // Existing contract: failures are reported as a list entry, not thrown.
                matchList.Add(ex.Message);
            }
            return matchList;
        }

        /// <summary>
        /// Returns every element whose opening tag carries the given attribute
        /// with the given value, for example class="aa".
        /// </summary>
        /// <param name="orgStr">HTML text to search.</param>
        /// <param name="tagName">Attribute name (lower-cased before use; matching ignores case).</param>
        /// <param name="tagValue">
        /// Attribute value. Like <paramref name="tagName"/> it is inserted into the
        /// regular expression as-is, so regex metacharacters are interpreted.
        /// </param>
        /// <returns>
        /// The full text of each matched element. If matching throws, the list
        /// contains the exception message instead.
        /// </returns>
        public static List<string> GetDomElemByAttr(string orgStr, string tagName, string tagValue)
        {
            List<string> matchList = new List<string>();
            // The Nested balancing group is intended to track nested elements of the
            // same tag name so that the closing tag pairs with the right opening tag.
            string regStr = string.Format(
                @"<(?<HtmlTag>[\w]+)[^>]*\s{0}=(?<Quote>[""']?){1}(?(Quote)\k<Quote>)[""']?[^>]*>((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|[\s\S]*?)*</\k<HtmlTag>>",
                tagName.ToLowerInvariant(),
                tagValue);
            try
            {
                Regex regex = new Regex(regStr, RegexOptions.Compiled | RegexOptions.IgnoreCase);
                foreach (Match match in regex.Matches(orgStr))
                {
                    matchList.Add(match.Value);
                }
            }
            catch (Exception ex)
            {
                // Existing contract: failures are reported as a list entry, not thrown.
                matchList.Add(ex.Message);
            }
            return matchList;
        }

        /// <summary>
        /// Extracts the anchor elements from the text as [link text, url] pairs.
        /// </summary>
        /// <param name="orgStr">HTML text to search.</param>
        /// <returns>
        /// A dictionary keyed by the anchor's text with all tags stripped; the first
        /// anchor wins when several share the same text. The value is the first
        /// address recognised by <see cref="GetUrlArray"/> inside the anchor, or the
        /// raw href value when none is recognised. If matching throws, the exception
        /// message is stored as a key with an empty value.
        /// </returns>
        public static Dictionary<string, string> GetDomElem_A(string orgStr)
        {
            Dictionary<string, string> matchList = new Dictionary<string, string>();
            try
            {
                foreach (Match match in AnchorRegex.Matches(orgStr))
                {
                    string key = match.Value.RemoveHtml();
                    if (matchList.ContainsKey(key))
                    {
                        continue;
                    }

                    // An href such as "#top" or "/news/" has no page extension and is not
                    // recognised by GetUrlArray; fall back to the captured href instead of
                    // indexing an empty list, which used to abort the whole scan.
                    List<string> urls = GetUrlArray(match.Value);
                    string url = urls.Count > 0 ? urls[0] : match.Groups["url"].Value;
                    matchList.Add(key, url);
                }
            }
            catch (Exception ex)
            {
                // Indexer rather than Add, so a key clash cannot throw from inside the handler.
                matchList[ex.Message] = "";
            }
            return matchList;
        }

        /// <summary>
        /// Downloads the source of a web page.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="coding">Name of the text encoding used to decode the response.</param>
        /// <returns>The page source, or the exception message when the request fails.</returns>
        public static string GetPageSourceByUrl(string url, string coding = "gb2312")
        {
            return GetPageSourceByUrl(new Uri(url), coding);
        }

        /// <summary>
        /// Downloads the source of a web page.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="coding">Name of the text encoding used to decode the response.</param>
        /// <returns>
        /// The page source. When a <see cref="NotSupportedException"/>,
        /// <see cref="InvalidOperationException"/> or <see cref="IOException"/> is
        /// thrown, its message is returned instead.
        /// </returns>
        public static string GetPageSourceByUrl(Uri url, string coding = "gb2312")
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                // using blocks release the response, stream and reader even when
                // reading or encoding lookup throws.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding(coding)))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (NotSupportedException exception)
            {
                return exception.Message;
            }
            catch (InvalidOperationException exception)
            {
                return exception.Message;
            }
            catch (IOException exception)
            {
                return exception.Message;
            }
        }

        /// <summary>
        /// Extracts page addresses from the given text using a URL regular expression.
        /// Only addresses ending in htm, html, shtm, shtml, aspx, asp, php or jsp
        /// (optionally followed by a query string) are recognised.
        /// </summary>
        /// <param name="code">Text (typically list markup) to scan.</param>
        /// <returns>The matched addresses in order of appearance.</returns>
        public static List<string> GetUrlArray(string code)
        {
            List<string> urlList = new List<string>();
            foreach (Match match in UrlRegex.Matches(code))
            {
                urlList.Add(match.Value);
            }
            return urlList;
        }

        /// <summary>
        /// Collects the src address of every img tag in the content.
        /// </summary>
        /// <param name="content">HTML text to scan.</param>
        /// <returns>
        /// A dictionary in which each distinct image address is both key and value.
        /// Addresses keep their original letter case. Empty for null or empty input.
        /// </returns>
        public static Dictionary<string, string> GetImgUrlArray(string content)
        {
            Dictionary<string, string> imgList = new Dictionary<string, string>();
            if (string.IsNullOrEmpty(content))
            {
                return imgList;
            }

            // Matching is case-insensitive via RegexOptions rather than lower-casing the
            // input, because URL paths can be case-sensitive.
            foreach (Match match in ImgRegex.Matches(content))
            {
                string src = match.Groups["src"].Value;
                if (!imgList.ContainsKey(src))
                {
                    imgList.Add(src, src);
                }
            }
            return imgList;
        }

        /// <summary>
        /// Converts a relative address into an absolute one.
        /// </summary>
        /// <param name="relativeAddress">The relative address to convert.</param>
        /// <param name="absoluteAddress">Address of the current page, used as the base.</param>
        /// <returns>
        /// The absolute address; <paramref name="relativeAddress"/> unchanged when it
        /// already contains "://"; or an empty string when either argument is null or
        /// empty or the base address does not contain "://".
        /// </returns>
        public static string ConvertToAbsluteUrl(string relativeAddress, string absoluteAddress)
        {
            if (string.IsNullOrEmpty(relativeAddress))
            {
                return string.Empty;
            }
            if (relativeAddress.Contains("://"))
            {
                return relativeAddress;
            }
            if (string.IsNullOrEmpty(absoluteAddress) || !absoluteAddress.Contains("://"))
            {
                return string.Empty;
            }

            Uri baseUrl = new Uri(absoluteAddress);
            return new Uri(baseUrl, relativeAddress).ToString();
        }

        /// <summary>
        /// Removes all HTML tags from the string.
        /// </summary>
        /// <param name="input">The string whose tags should be removed.</param>
        /// <returns>The input with every opening and closing tag replaced by nothing.</returns>
        public static string RemoveHtml(this string input)
        {
            return HtmlTagRegex.Replace(input, string.Empty);
        }
    }
}
查看全文
相关阅读:
C++的高效从何而来2
初体验ajax跨域
ACM在线测评系统评测程序设计与实现
高效GTD云工具 Manage Your Time
HTTP 长连接
使用avalon MVVM框架打造整一套jquery ui
GhostDoc(注释生成工具)使用方法
NUnit快速入门 笔记
ETags
nodejs + edge + ejs + c#
原文地址:https://www.cnblogs.com/zhangqs008/p/2341094.html
最新文章
多线程学习之限制同时运行的线程数量
.NET通用权限系统快速开发框架
在ASP.NET MVC Web API中使用Apworks开发第一个HTTP服务
memcached客户端操作类
2013年阿里巴巴实习生招聘笔试题目及解答
ContextAttribute类
计算机是怎么样工作的?
一致性hash和solr千万级数据分布式搜索引擎中的应用
小网站架构优化:从100并发抗到4000并发
K2使用总结K2简介
热门文章
avalonjs v6.1发布
基于任务的异步模式
Framebuffer的简单应用
当文件操作遇上进程占用时
十分钟学会什么是async和await
iptables
C#中的特性Attribute
可扩展的单据编号生成器 + 简单的解释器
对.NET系统架构改造的一点经验和教训
基于.net开发chrome核心浏览器3
Copyright © 2011-2022 走看看