zoukankan      html  css  js  c++  java
  • 数据采集[即与 WEB 相关的功能函数]

    --

    using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.Text.RegularExpressions;

    namespace ToolLibrary
    {
        /// <summary>
        /// Web crawler (data collection) helpers, i.e. utility functions related to the web.
        /// [wzrong 2008-11-06]
        /// QQ:120152169
        /// Email:w_zrong@163.com
        /// </summary>
        public class WebCrawler
        {

            #region 根据网站地址(URL)获取整站的 HTML

            /// <summary>
            /// 根据网站地址(URL)获取整站的 HTML
            /// </summary>
            /// <param name="urlPath">网站地址(URL)</param>
            /// <returns>整站的 HTML</returns>
            public static string GetHtmlContentsByUrl(string urlPath)
            {
                string returnStr = string.Empty;
                try
                {
                    WebClient client = new WebClient(); //向URL标识的资源发送数据和从URL标识的资源接收数据

                    returnStr = client.DownloadString(urlPath);//以字符串的形式下载资源

                    client.Dispose();
                }
                catch 
                {
                    returnStr = "";
                }

                return returnStr;
            }

            #endregion

            #region 根据正则表达式(手动配置表达式)获取指定信息 返回 ArrayList

            /// <summary>
            /// 根据正则表达式(手动配置表达式)获取指定信息 返回 ArrayList
            /// </summary>
            /// <param name="htmlSource">HTML源码</param>
            /// <param name="strRegex">正则表达式(手动配置表达式)</param>
            /// <param name="isRightToLeft">是否从右向左匹配?true:false</param>
            /// <returns>指定信息集合</returns>
            public static ArrayList GetHtmlArrayByRegex(string htmlSource, string strRegex, bool isRightToLeft)
            {
                ArrayList array = new ArrayList();
                Regex rex;

                string html = htmlSource.Replace("\r\n", "").Replace("\r", "").Replace("\t", "");

                if (isRightToLeft)
                {
                    rex = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Compiled | RegexOptions.RightToLeft);
                }
                else
                {
                    rex = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Compiled);
                }

                MatchCollection mc = rex.Matches(html); //迭代匹配 (在指定的字符串中搜索正则表达式的所有匹配)

                foreach (Match m in mc)
                {
                    string matchStr = m.Groups[1].ToString().Trim(); //获取由正则表达式匹配的组的集合
                    array.Add(matchStr);
                }
                return array;
            }

            #endregion

            #region 根据正则表达式(起始标志,结束标志)获取指定的信息 返回 ArrayList

            /// <summary>
            /// 根据正则表达式(起始标志,结束标志)获取指定的信息 返回 ArrayList
            /// </summary>
            /// <param name="htmlSource">HTML源码</param>
            /// <param name="startRex">起始标志</param>
            /// <param name="endRex">结束标志</param>
            /// <param name="isRightToLeft">是否从右向左匹配?true:false</param>
            /// <returns>指定信息集合</returns>
            public static ArrayList GetHtmlArrayByRegex(string htmlSource, string startRex, string endRex, bool isRightToLeft)
            {
                string returnRex = startRex + "(.*?)" + endRex;
                return GetHtmlArrayByRegex(htmlSource, returnRex, false);
            }

            #endregion

            #region 根据正则表达式(起始标志,结束标志)获取指定的信息 返回字符串 String

            /// <summary>
            /// 根据正则表达式(起始标志,结束标志)获取指定的信息 返回字符串 String
            /// </summary>
            /// <param name="htmlSource">HTML源码</param>
            /// <param name="startRex">起始标志</param>
            /// <param name="endRex">结束标志</param>
            /// <param name="isRightToLeft">是否从右向左匹配?true:false</param>
            /// <returns>指定信息的字符串</returns>
            public static String GetHtmlStrByRegex(string htmlSource, string startRex, string endRex, bool isRightToLeft)
            {
                //string returnStr = string.Empty;
                StringBuilder sb = new StringBuilder();

                string regexStr = startRex + "(.*?)" + endRex;
               
                ArrayList array = new ArrayList();
                array = GetHtmlArrayByRegex(htmlSource, regexStr, false);

                for (int i = 0; i < array.Count; i++)
                {
                    //returnStr = array[i].ToString();
                    sb.Append(array[i].ToString());

                }
                //return returnStr;
                return sb.ToString();
            }

            #endregion

            #region 得到分页的连接地址

            /// <summary>
            /// 得到分页连接地址
            /// </summary>
            /// <param name="oldPageUrl">原连接地址</param>
            /// <param name="PageTags">分页标签</param>
            /// <returns>分页连接地址</returns>
            public static String GetPageUrl(string oldPageUrl, string PageTags)
            {
                string newPageUrl = string.Empty;

               
                return newPageUrl;
            }

            #endregion

            #region 得到网页图片的地址[多个则用分割符隔开累加]

            /// <summary>
            /// 得到网页图片地址 [多个则用分割符隔开累加]
            /// </summary>
            /// <param name="html">包含图片的 HTML 代码</param>
            /// <returns>图片地址</returns>
            public static String GetHtmlImgUrl(string html)
            {
                string returnStr = "";

                ArrayList array = new ArrayList();

                array = GetHtmlArrayByRegex(html, "src=\"", "\"", false);

                for (int i = 0; i < array.Count; i++)
                {
                    if (i == 0)
                    {
                        returnStr = array[i].ToString();
                    }
                    else
                    {
                        returnStr = array[i].ToString() + Common.CommonConst.GAP_CHAR1 + returnStr;
                    }
                }

                return returnStr;
            }

            #endregion

            #region 得到有效的连接地址 (对不包含域名的地址加上域名)

            /// <summary>
            /// 得到有效的连接地址 (对不包含域名的地址加上域名)
            /// </summary>
            /// <param name="oldUrl">原始地址</param>
            /// <param name="domainUrl">域名地址 如http://www.baidu.com/ </param>
            /// <returns>有效的连接地址</returns>
            public static String GetValidUrl(string oldUrl, string domainUrl)
            {
                string newUrl = oldUrl;

                string http = "http://";

                if (!oldUrl.Contains(http))
                {
                    if (oldUrl.StartsWith("/") && domainUrl.EndsWith("/"))
                    {
                        newUrl = domainUrl.Remove(domainUrl.Length - 1, 1) + oldUrl;
                    }
                    else if (!oldUrl.StartsWith("/") && domainUrl.EndsWith("/"))
                    {
                        newUrl = domainUrl + "/" + oldUrl;
                    }
                    else
                    {
                        newUrl = domainUrl + oldUrl;
                    }
                }
                return newUrl;
            }

            #endregion

            #region 获取文件后缀和文件名称

            /// <summary>
            /// 获取文件后缀和文件名称
            /// 如果文件字符串连同路径传值,则返回文件名也包含路径
            /// </summary>
            /// <param name="fileStr">文件字符串[名称和后缀(可以包含路径) 如:txtName.txt]</param>
            /// <param name="splitChr">分割符 如.</param>
            /// <param name="fileName">文件名</param>
            /// <param name="suffix">后缀</param>
            public static void GetFileNameAndSuffix(string fileStr, char splitChr, out string fileName, out string suffix)
            {
                if (fileStr.Trim() == string.Empty)
                {
                    fileName = suffix = "";
                    return;
                }
                if (!fileStr.Contains(splitChr))
                {
                    fileName = suffix = "";
                    return;
                }

                int index = fileStr.LastIndexOf(splitChr);

                fileName = fileStr.Substring(0, index);

                suffix = fileStr.Substring(index + 1);
            }

            #endregion

            #region 获取网络图片命名名称和后缀 [如:命名名称.jpg]

            /// <summary>
            /// 获取网络图片命名名称和后缀 [如:命名名称.jpg]
            /// </summary>
            /// <param name="imgUrl">网络图片连接地址</param>
            /// <param name="isOverWriteName">是否从写图片名称?true:false</param>
            /// <returns>图片命名名称和后缀</returns>
            public static string GetImgNameAndSuffix(string imgUrl, bool isOverWriteName)
            {
                //例如:/images/bg7.jpg
                string imgName = "";

                if (imgUrl.Contains("/"))
                {
                    imgName = imgUrl.Substring(imgUrl.LastIndexOf("/") + 1);
                }
                else
                {
                    imgName = imgUrl;
                }

                //重写图片名称
                if (isOverWriteName)
                {
                    string fileName, sufFix;
                    GetFileNameAndSuffix(imgUrl, '.', out fileName, out sufFix);
                    imgName = DateTime.Now.ToString("yyMMddhhmmss") + DateTime.Now.Millisecond.ToString() + "." + sufFix;
                }

                return imgName;
            }

            #endregion

            #region 从网络上下载图片到本地服务器

            /// <summary>
            /// 从网络上下载图片到本地服务器
            /// </summary>
            /// <param name="imgUrl">网络图片的连接地址 </param>
            /// <param name="imgSavePath">要接收数据的本地文件名称</param>
            /// <param name="domainUrl">域名地址 如: http://www.baidu.com </param>
            public static void DownLoadImgToLocal(string imgUrl, string imgSavePath)
            {
                try
                {
                    WebClient client = new WebClient();
                    client.DownloadFile(imgUrl, imgSavePath);
                    client.Dispose();
                }
                catch {

                }
            }

            #endregion

            #region 重写显示图片的 HTML 代码 <img />

            /// <summary>
            /// 重写图片显示的HTML代码 返回格式:[img SRC="imgPath" ALT="imgTitle" /]
            /// </summary>
            /// <param name="imgOldHtml">原始IMG显示的HTML代码</param>
            /// <param name="imgNewSavePath">图片存放新地址/路径</param>
            /// <param name="imgTitle">图片标题</param>
            /// <returns>返回格式:src="imgPath" alt="imgTitle"</returns>
            public static string OverWriteImgUrlInHtml(string imgOldHtml, string imgNewSavePath, string imgTitle)
            {
                string returnStr = "";

                string imgPath = ""; //img本地存放路径

                string imgUrl = GetHtmlImgUrl(imgOldHtml); //img网络连接地址

                string imgName = GetImgNameAndSuffix(imgUrl, false); // img名称

                if (imgNewSavePath.EndsWith("/"))
                {
                    imgPath = imgNewSavePath;
                }
                else
                {
                    imgPath = imgNewSavePath + "/";
                }

                returnStr = "<img src=\"" + imgPath + imgName + "\" alt=\"" + imgTitle + "\">" + "<br/> ";
                return returnStr;
            }

            #endregion

            #region 将 HTML 中的图片地址替换成本地地址 并将其下载到本地服务器中 返回改写图片地址后的 HTML 文本

            /// <summary>
            /// 将 HTML 中的图片地址替换成本地地址
            /// 并将其下载到本地服务器中
            /// 返回改写图片地址后的 HTML 文本
            /// </summary>
            /// <param name="htmlSource">原始 HTML 文本串</param>
            /// <param name="domainUrl">图片所在服务器域名地址</param>
            /// <returns></returns>
            public static string ReplaceImgDirInHtml(string htmlSource, string domainUrl)
            {
                string returnStr = htmlSource;
                ArrayList array = new ArrayList();

                array = GetHtmlArrayByRegex(htmlSource, "<img ", ">", false);

                returnStr = returnStr.Replace("<img", "");

                for (int i = 0; i < array.Count; i++)
                {
                    //带HTML的图片地址
                    string imgOldHtml = array[i].ToString(); //src = "http://www.11kp.com/images/20070423/1234fg32.jpg"

                    //存放图片的文件夹路径 如: /images/news/20081107/03/
                    string imgSavePath = IOFunction.CreateFolder(Common.CommonConst.IMG_SAVE_DIR);

                    //原始图片地址 如: http://www.11kp.com/images/20070423/1234fg32.jpg
                    string imgOldUrl = GetHtmlImgUrl(imgOldHtml);

                    //得到有效的连接地址
                    string imgValidUrl = GetValidUrl(imgOldUrl, domainUrl);

                    //得到图片名称
                    string imgName = GetImgNameAndSuffix(imgValidUrl, true);

                    //下载图片
                    DownLoadImgToLocal(imgValidUrl, imgSavePath + imgName);

                    string imgNewHtml = OverWriteImgUrlInHtml(imgOldHtml, imgSavePath, "十一宽频");

                    //替换图片原连接地址为新连接地址
                    returnStr = returnStr.Replace(imgOldHtml, imgNewHtml);

                }

                returnStr = returnStr.Replace(">>", ">");

                return returnStr;
            }

            #endregion

            /// <summary>
            /// 获取网页内容
            /// </summary>
            /// <param name="url">网页路径</param>
            /// <returns></returns>
            public static string getWebHtmlCotent(string url)
            {
                try
                {
                    byte[] b_text = new System.Net.WebClient().DownloadData(url);

                    return System.Text.Encoding.Default.GetString(b_text);
                }
                catch
                {
                    return "";
                }
            }

            /// <summary>
            /// 获取网页内容
            /// </summary>
            /// <param name="url">网页路径</param>
            /// <param name="encode">编码方式</param>
            /// <returns></returns>
            public static string getWebHtmlCotent(string url, System.Text.Encoding encode)
            {
                try
                {
                    byte[] b_text = new System.Net.WebClient().DownloadData(url);

                    return encode.GetString(b_text);
                }
                catch
                {
                    return "";
                }
            }

            /// <summary>
            /// 清除所有HTML标记
            /// </summary>
            /// <param name="HtmlContents"></param>
            /// <returns></returns>
            public static string getClearHtmlCode(string HtmlContents)
            {

                HtmlContents = HtmlContents.Replace(" ", "").Replace("\t", "").Replace("\r\n", "");

                //先清除js

                HtmlContents = Regex.Replace(HtmlContents, "<script*.?/script>", "", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "<.*?>", "", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "&nbsp;", "", RegexOptions.IgnoreCase);

                return HtmlContents;

            }

            /// <summary>
            /// 清除 HTML 标记中的图片
            /// </summary>
            /// <param name="HtmlContents"></param>
            /// <returns></returns>
            public static string doClearHtmlTagSaveImg(string HtmlContents)
            {
                string Contents = HtmlContents;

                Match m;

                Match m1;

                Regex r = new Regex("<img.*?>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

                for (m = r.Match(Contents); m.Success; m = m.NextMatch())
                {
                    string tempstr = m.Groups[0].ToString();

                    string oldImgTag = tempstr;

                    string newImgTag = "";

                    Regex r1 = new Regex("src=\".*?\"", RegexOptions.IgnoreCase | RegexOptions.Compiled);

                    for (m1 = r1.Match(tempstr); m1.Success; )
                    {
                        newImgTag = m1.Groups[0].ToString();

                        break;
                    }
                    if (newImgTag != "")
                    {
                        newImgTag = "&ltt;img " + newImgTag + "&rtt;";

                        HtmlContents = HtmlContents.Replace(oldImgTag, newImgTag) + "<br>";
                    }
                }
                HtmlContents = Regex.Replace(HtmlContents, "\r\n", "", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "<br>", "&ltt;br /&rtt;", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "<br >", "&ltt;br /&rtt;", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "<br />", "&ltt;br /&rtt;", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "</p>", "&ltt;br /&rtt;", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "&nbsp;\r\n", "", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "<script*.?/script>", "", RegexOptions.IgnoreCase);

                HtmlContents = Regex.Replace(HtmlContents, "<.*?>", "", RegexOptions.IgnoreCase);

                HtmlContents = HtmlContents.Replace("&ltt;", "<").Replace("&rtt;", ">");

                HtmlContents = HtmlContents.Replace("    ", " ").Replace("   ", " ").Replace("  ", " ").Replace("  ", " ").Replace("\t", "");

                HtmlContents = HtmlContents.Replace("<br /><br /><br /><br />", "<br />").Replace("<br /><br /><br />", "<br />").Replace("<br /><br />", "<br />");

                HtmlContents = HtmlContents.Replace("<br /> <br />", "<br />").Replace("<br /> <br />", "<br />").Replace("<br /> <br />", "<br />");

                return HtmlContents;
            }

        }
    }

  • 相关阅读:
    our毕业短片
    Android Memory Management, OutOfMemoryError
    android autoswitched ImageSwitcher
    Storage size of Bitmap
    Frequentlyused URI of Intent
    小知识: 软件版本号讲解: 什么是Alpha, Beta, RC
    JSF框架中使用的设计模式介绍
    Unicode编码表/00000FFF
    Spring事务的传播行为和隔离级别
    领略Spring 3.x 时代的Spring MVC
  • 原文地址:https://www.cnblogs.com/wzrong/p/1332527.html
Copyright © 2011-2022 走看看