zoukankan      html  css  js  c++  java
  • 抓取网页

    C# 读取文本文件内容生成相应的文件,获取目录下所有文件名并保存为文本文

    最近因为经常用到2个功能:
    1):以一个文件内容为名批量生成相应的文件
    2):查找一个目录(包括子目录)下某扩展名的所有文件
    所以写了一个小程序,方便多了。
    先看效果图:
    
    
    虽然很简单但须注意:
    1. 
    扩展名 区分大小写
    if (Path.GetExtension(file).ToLower() == mask.ToLower())
    一开始没注意这,害得找出的文件总是比正常的文件少
    2.
    去掉文件名中的非法字符
    line = line.Replace("\\", string.Empty);
    line = line.Replace("/", string.Empty);
    line = line.Replace(":", string.Empty);
    line = line.Replace("*", string.Empty);
    line = line.Replace("?", string.Empty);
    line = line.Replace("\"", string.Empty);
    line = line.Replace("<", string.Empty);
    line = line.Replace(">", string.Empty);
    line = line.Replace("|", string.Empty);
    //line = line.Replace(" ", string.Empty);
    fs = new FileStream(fileSaveDir + "\\" + line + ext, FileMode.Create);
    3.
    注意各种细节,一些小问题不容忽视,现在这个程序的 exception 处理还有一些模糊!暂时就不改了。
    4.主要代码
    c#
    //生成文件
    //
    // Creates one empty file per line of the selected list file.
    // Reads fileName line by line, strips characters illegal in Windows file
    // names, and creates "<fileSaveDir>\<line><ext>" for each entry.
    private void btnCreate_Click(object sender, EventArgs e)
    {
        String line = "";
        ext = comboBox1.Text;
        fileSaveDir = this.tbxSaveDir.Text;
        fileName = this.tbxFilename.Text;
        if (fileName == "")
        {
            MessageBox.Show("请选择文件名的存放文件。");
            return;
        }
        if (fileSaveDir == "")
        {
            // No target directory chosen: default to the list file's own directory.
            FileInfo fi = new FileInfo(fileName);
            fileSaveDir = Convert.ToString(fi.Directory);
        }
        try
        {
            using (StreamReader sr = new StreamReader(fileName))
            {
                while ((line = sr.ReadLine()) != null)
                {
                    // Strip characters that are illegal in Windows file names
                    // BEFORE building the path (the original checked existence
                    // with the unsanitized name, then created the sanitized one).
                    line = line.Replace("\\", string.Empty)
                               .Replace("/", string.Empty)
                               .Replace(":", string.Empty)
                               .Replace("*", string.Empty)
                               .Replace("?", string.Empty)
                               .Replace("\"", string.Empty)
                               .Replace("<", string.Empty)
                               .Replace(">", string.Empty)
                               .Replace("|", string.Empty);
                    //line = line.Replace(" ", string.Empty);

                    String file = fileSaveDir + "\\" + line + ext;
                    if (File.Exists(file))
                    {
                        // Yes = skip this entry; No = stop the whole run.
                        if (DialogResult.Yes == MessageBox.Show("文件 " + "\"" + line + ext + "\"" + " 已经存在了!", "是否忽略已经存在的文件", MessageBoxButtons.YesNo, MessageBoxIcon.Warning))
                        {
                            continue;
                        }
                        else
                        {
                            MessageBox.Show("一共生成了" + count + " 个文件。");
                            return;
                        }
                    }

                    // using releases the handle immediately (the original
                    // FileStream was never closed, leaking one handle per file).
                    using (FileStream fs = new FileStream(file, FileMode.Create))
                    {
                    }
                    count++;
                }
            }
        }
        catch (ArgumentException arge)
        {
            MessageBox.Show(arge.Message);
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.Message);
        }
        MessageBox.Show("一共生成了" + count + " 个文件。");
        count = 0;
    }
    
    //获取文件名
    // Collects the names of all files matching the chosen extension under the
    // chosen directory (recursively, via findFiles) and writes them, one per
    // line, to the chosen save file.
    private void btnGetFileName_Click(object sender, EventArgs e)
    {
        int fileCount = 0;
        bool fullname = checkBox1.Checked;
        if (this.tbxPath.Text == "" || this.tbxExten_tabPage2.Text == "" || this.tbxSavePath.Text == "")
        {
            MessageBox.Show("请选择目录及扩展名。");
            return;
        }
        String directory = this.tbxPath.Text;
        String mask = this.tbxExten_tabPage2.Text;
        String savepath = this.tbxSavePath.Text;
        findFiles(directory, mask, false, fullname, ref fileCount);
        // FileMode.Create already overwrites an existing file, so the original
        // File.Delete + CreateNew pair is unnecessary; using-blocks guarantee
        // the stream and writer are closed even if writing throws.
        using (FileStream fs = new FileStream(savepath, FileMode.Create))
        using (StreamWriter sw = new StreamWriter(fs))
        {
            foreach (string str in al)
                sw.WriteLine(str);
        }
        MessageBox.Show("一共获取了" + fileCount + "个文件名。");
        al.Clear();
    }
    // Recursively collects into `al` the names of all files under `directory`
    // whose extension equals `mask` (case-insensitive), incrementing fileCount
    // for each match. `fullname` selects full path vs. bare file name.
    public void findFiles(string directory, string mask, bool ignoreHidden, bool fullname, ref int fileCount)
    {
        try
        {
            // Files in the current directory first.
            foreach (string file in Directory.GetFiles(directory, "*.*"))
            {
                if (ignoreHidden && (File.GetAttributes(file) & FileAttributes.Hidden) == FileAttributes.Hidden)
                    continue;
                if (mask == "")
                    continue;
                // Extension comparison must be case-insensitive (".TXT" vs ".txt");
                // the author notes missing this made the scan miss files.
                if (Path.GetExtension(file).ToLower() == mask.ToLower())
                {
                    FileInfo fi = new FileInfo(file);
                    al.Add(fullname ? fi.FullName : fi.Name);
                    fileCount++;
                }
            }
            // Then recurse into subdirectories.
            foreach (string dir in Directory.GetDirectories(directory))
            {
                if (!(ignoreHidden && (File.GetAttributes(dir) & FileAttributes.Hidden) == FileAttributes.Hidden))
                    // BUG FIX: propagate ignoreHidden instead of hard-coding false,
                    // which silently disabled hidden-file filtering below level one.
                    findFiles(dir, mask, ignoreHidden, fullname, ref fileCount);
            }
        }
        catch (UnauthorizedAccessException)
        {
            // e.g. "System Volume Information" (mentioned in the original
            // comment) — skip directories we are not allowed to read.
        }
    }
    //java code(查找一个目录(包括子目录)下的所有文件):
    import java.io.*;

    /**
     * Console tool: recursively lists every file under a user-supplied
     * directory, echoes each path, and writes the full list to a save file.
     */
    public class ListFiles {
        private static String listFileStr = "";
        private static String dir;
        private static String savefile;
        private static int count = 0;

        private static FileWriter fw;

        public static void main(String[] args) {
            try {
                System.out.println("请输入查找文件的目录:(eg:d:\\music)");
                try {
                    // Wrap stdin in a buffered reader and read one line.
                    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
                    dir = in.readLine();
                } catch (IOException e) {
                    System.out.println("请输入合法的路径名!");
                }
                System.out.println("请输入保存文件的位置:(eg:d:\\savename.txt)");
                try {
                    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
                    savefile = in.readLine();
                    fw = new FileWriter(savefile);
                } catch (IOException ex) {
                    System.out.println("请输入合法的路径名!");
                }

                ListFiles lf = new ListFiles();
                lf.listFile(dir);
                fw.write(listFileStr);
                fw.close();
                System.out.println("\n一共找到" + count + "个文件!");
            } catch (ArrayIndexOutOfBoundsException ea) {
                System.out.println("Usage: ListFiles <source dir> <target file>");
            } catch (IOException e) {
                System.out.println("IO error!\n" + e.toString());
            }
        }

        /** Recursively walks rp, appending each file's absolute path to listFileStr. */
        public void listFile(String rp) {
            File file = new File(rp);
            File[] list = file.listFiles();
            if (list == null) {
                // listFiles() returns null for unreadable directories (and
                // non-directories); the original's try/catch could not prevent
                // the NullPointerException thrown by iterating null.
                listFileStr += "Access deny:" + file.getAbsolutePath() + "\n";
                System.out.println("Access deny:" + file.getAbsolutePath());
                return;
            }
            for (int i = 0; i < list.length; i++) {
                if (list[i].isDirectory()) {
                    listFile(list[i].toString());
                } else {
                    listFileStr += list[i].getAbsolutePath() + "\n";
                    System.out.println(list[i].getAbsolutePath());
                    count++;
                }
            }
        }
    }
     
    View Code

    C# 抓取网页Html

    C# 抓取网页的Html 及分析:
    源码如下:
    // Downloads the page at `url`, isolates the <ul class="post_list"> section,
    // and appends every href value found there to lab[url].
    private void Search(string url)
    {
        string rl;
        WebRequest Request = WebRequest.Create(url.Trim());
        StringBuilder sb = new StringBuilder();
        // using-blocks release the connection/stream (the original never did).
        using (WebResponse Response = Request.GetResponse())
        using (Stream resStream = Response.GetResponseStream())
        using (StreamReader sr = new StreamReader(resStream, Encoding.Default))
        {
            while ((rl = sr.ReadLine()) != null)
            {
                sb.Append(rl);
            }
        }

        string str = sb.ToString().ToLower();

        // Restored escaped quotes (lost when the code was published).
        string str_get = mid(str, "<ul class=\"post_list\">", "</ul>");

        int start = 0;
        while (true)
        {
            if (str_get == null)
                break;
            string strResult = mid(str_get, "href=\"", "\"", out start);
            if (strResult == null)
                break;
            else
            {
                lab[url] += strResult;
                // Resume scanning just past the extracted attribute.
                str_get = str_get.Substring(start);
            }
        }
    }
     
     
     
     
    // Returns the text strictly between the first occurrence of startString
    // and the next occurrence of endString, or null if either marker is missing.
    private string mid(string istr, string startString, string endString)
    {
        int iBodyStart = istr.IndexOf(startString, 0);      // position of startString
        if (iBodyStart == -1)
            return null;
        iBodyStart += startString.Length;                   // first char after startString
        int iBodyEnd = istr.IndexOf(endString, iBodyStart); // position of endString
        if (iBodyEnd == -1)
            return null;
        // BUG FIX: the original added endString.Length and then subtracted 1,
        // which appended endString minus its last character to the result
        // (only correct by accident when endString was a single character).
        return istr.Substring(iBodyStart, iBodyEnd - iBodyStart);
    }
     
     
    // Overload that also reports, via iBodyEnd, the index just past endString
    // so the caller can resume scanning from there. Returns the text strictly
    // between the markers, or null if either marker is missing.
    private string mid(string istr, string startString, string endString, out int iBodyEnd)
    {
        // out parameter must be assigned on every path before returning.
        iBodyEnd = 0;

        int iBodyStart = istr.IndexOf(startString, 0);  // position of startString
        if (iBodyStart == -1)
            return null;
        iBodyStart += startString.Length;               // first char after startString
        iBodyEnd = istr.IndexOf(endString, iBodyStart); // position of endString
        if (iBodyEnd == -1)
            return null;
        iBodyEnd += endString.Length;                   // caller resumes just past endString
        // BUG FIX: return only the text between the markers (the original
        // leaked endString minus its last character into the result).
        return istr.Substring(iBodyStart, iBodyEnd - endString.Length - iBodyStart);
    }
     
    View Code

    C# 抓取网页里面的所有链接

    这几天偶尔看见了,C#抓取网页的链接。的代码。感觉当时做的很简单。呵呵。也没多考虑什么过程。先把简单的给大家拿出来看看。如果大家有什么意见或者有好的方法可以共同交流。谢谢!一下仅供参考:
     
    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    
    using System.Xml;
    using System.Net;
    using System.IO;
    using System.Collections;
    using System.Text.RegularExpressions;
    
    namespace text
    {
        // WinForms front-end: fetches a page, extracts its hyperlinks, and
        // writes them to HyperLinks.xml grouped by domain suffix.
        public partial class Form1 : Form
        {
            string strCode;    // HTML source of the fetched page
            ArrayList alLinks; // de-duplicated, sorted hyperlinks

            public Form1()
            {
                InitializeComponent();
            }

            // Fetch the page, extract its hyperlinks, and dump them to XML.
            private void button1_Click(object sender, EventArgs e)
            {
                if (textBox1.Text == "")
                {
                    MessageBox.Show("请输入网址");
                    return;
                }
                string strURL = textBox1.Text.ToString().Trim();
                // StartsWith avoids the ArgumentOutOfRangeException the original
                // Substring(0, 7) threw for inputs shorter than 7 characters.
                if (!strURL.StartsWith(@"http://"))
                {
                    strURL = @"http://" + strURL;
                }
                MessageBox.Show("正在获取页面代码,请稍后...");
                strCode = GetPageSource(strURL);
                MessageBox.Show("正在提取超链接,请稍侯...");
                alLinks = GetHyperLinks(strCode);
                MessageBox.Show("正在写入文件,请稍侯...");
                WriteToXml(strURL, alLinks);
            }

            // Returns the HTML source of the page at URL, decoded as GB2312.
            public static string GetPageSource(string URL)
            {
                Uri uri = new Uri(URL);
                HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);
                // BUG FIX: request properties must be set BEFORE GetResponse();
                // the original assigned them after the request had been sent,
                // so they had no effect.
                hwReq.Method = "GET";
                hwReq.KeepAlive = false;
                using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
                using (StreamReader reader = new StreamReader(hwRes.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
                {
                    return reader.ReadToEnd();
                }
            }

            // Extracts all http:// URLs from the HTML, de-duplicated and sorted.
            public static ArrayList GetHyperLinks(string htmlCode)
            {
                ArrayList al = new ArrayList();
                // Restored escapes: the published pattern had lost its backslashes
                // ("[w-]+." matched literal w's instead of word characters).
                string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
                Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
                MatchCollection m = r.Matches(htmlCode);
                // HashSet gives O(1) duplicate checks; the original rescanned
                // the whole list for every match (O(n^2)).
                HashSet<string> seen = new HashSet<string>();
                for (int i = 0; i < m.Count; i++)
                {
                    string strNew = m[i].ToString();
                    if (seen.Add(strNew)) al.Add(strNew);
                }
                al.Sort();
                return al;
            }

            // Writes the collected URLs into HyperLinks.xml; each entry's
            // element name is its domain suffix (see GetDomain).
            static void WriteToXml(string strURL, ArrayList alHyperLinks)
            {
                using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
                {
                    writer.Formatting = Formatting.Indented;
                    writer.WriteStartDocument(false);
                    writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
                    writer.WriteComment("提取自" + strURL + "的超链接");
                    // Two nested HyperLinks elements, as in the original output format.
                    writer.WriteStartElement("HyperLinks");
                    writer.WriteStartElement("HyperLinks", null);
                    writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

                    foreach (string str in alHyperLinks)
                    {
                        string title = GetDomain(str);
                        writer.WriteElementString(title, null, str);
                    }
                    writer.WriteEndElement();
                    writer.WriteEndElement();
                    writer.Flush();
                }
            }

            // Returns the domain suffix of a URL ("com", "net", ...) or "other".
            static string GetDomain(string strURL)
            {
                string retVal;
                // Restored escapes so the dots are matched literally.
                string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
                Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
                Match m = r.Match(strURL);
                retVal = m.ToString();
                // Strip the leading dot and trailing slash: ".com/" -> "com".
                strRegex = @"\.|/$";
                retVal = Regex.Replace(retVal, strRegex, "").ToString();
                if (retVal == "")
                    retVal = "other";
                return retVal;
            }
        }
    }
     
    View Code

    C# 抓取网页内容(转)

    摘要: 1、抓取一般内容需要三个类:WebRequest、WebResponse、StreamReader所需命名空间:System.Net、System.IO核心代码:view plaincopy to clipboardprint?WebRequestrequest=WebRequest.Create("http://www.cftea. ...
    1、抓取一般内容
    需要三个类:WebRequest、WebResponse、StreamReader
    所需命名空间:System.Net、System.IO
    核心代码:
    view plaincopy to clipboardprint?
    WebRequest request = WebRequest.Create("http://www.cftea.com/");  
    WebResponse response = request.GetResponse();  
    StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));  
          WebRequest 类的 Create 为静态方法,参数为要抓取的网页的网址;
          Encoding 指定编码,Encoding 中有属性 ASCII、UTF32、UTF8 等全球通用的编码,但没有 gb2312 这个编码属性,所以我们使用 GetEncoding 获得 gb2312 编码。
    示例:
    view plaincopy to clipboardprint?
    <%@ Page Language="C#" %>  
    <%@ Import Namespace="System.Net" %>  
    <%@ Import Namespace="System.IO" %>  
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">  
    <mce:script runat="server"><!--  
        void Page_Load(object sender, EventArgs e)  
        {  
            try  
            {  
                WebRequest request = WebRequest.Create("http://www.cftea.com/");  
                WebResponse response = request.GetResponse();  
                StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312"));  
                  
                tb.Text = reader.ReadToEnd();  
                  
                reader.Close();  
                reader.Dispose();  
                response.Close();  
            }  
            catch (Exception ex)  
            {  
                tb.Text = ex.Message;  
            }  
        }  
    // --></mce:script>   
    <html xmlns="http://www.w3.org/1999/xhtml" >  
    <head runat="server">  
        <title>抓取网页内容 - 千一网络</title>  
    </head>  
    <body>  
        <form id="form1" runat="server">  
        <div>  
        <asp:TextBox ID="tb" runat="server" Width="500" Height="300" TextMode="multiLine"></asp:TextBox>  
        </div>  
        </form>  
    </body>  
    </html>  
     
     2 抓取网页内容-图片
        需要四个类:WebRequest、WebResponse、Stream、FileStream。
       示例:
    view plaincopy to clipboardprint?
    <%@ Page Language="C#" %>  
    <%@ Import Namespace="System.Net" %>  
    <%@ Import Namespace="System.IO" %>  
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">  
    <mce:script runat="server"><!--  
        void Page_Load(object sender, EventArgs e)  
        {  
            try  
            {  
                WebRequest request = WebRequest.Create("http://www.cftea.com/images/logo.gif");  
                WebResponse response = request.GetResponse();  
                Stream reader = response.GetResponseStream();  
                  
                FileStream writer = new FileStream("D://logo.gif", FileMode.OpenOrCreate, FileAccess.Write);  
                byte[] buff = new byte[512];  
                int c = 0; //实际读取的字节数   
                while ((c=reader.Read(buff, 0, buff.Length)) > 0)  
                {  
                    writer.Write(buff, 0, c);  
                }  
                writer.Close();  
                writer.Dispose();  
                  
                reader.Close();  
                reader.Dispose();  
                response.Close();  
                  
                tb.Text = "保存成功!";  
            }  
            catch (Exception ex)  
            {  
                tb.Text = ex.Message;  
            }  
        }  
    // --></mce:script>   
    <html xmlns="http://www.w3.org/1999/xhtml" >  
    <head runat="server">  
        <title>抓取网页图片并保存 - 千一网络</title>  
    </head>  
    <body>  
        <form id="form1" runat="server">  
        <div>  
        <asp:TextBox ID="tb" runat="server" Width="500" Height="300" TextMode="multiLine"></asp:TextBox>  
        </div>  
        </form>  
    </body>  
    </html>  
     
    3 抓取网页内容-Post 数据
       在抓取网页时,有时候,需要将某些数据通过 Post 的方式发送到服务器,将以下代码添加在网页抓取的程序中,以实现将用户名和密码 Post 到服务器
    view plaincopy to clipboardprint?
    string data = "userName=admin&passwd=admin888";  
    byte[] requestBuffer = System.Text.Encoding.GetEncoding("gb2312").GetBytes(data);  
      
    request.Method = "POST";  
    request.ContentType = "application/x-www-form-urlencoded";  
    request.ContentLength = requestBuffer.Length;  
    using (Stream requestStream = request.GetRequestStream())  
    {  
        requestStream.Write(requestBuffer, 0, requestBuffer.Length);  
        requestStream.Close();  
    }  
      
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))  
    {  
        string str = reader.ReadToEnd();  
        reader.Close();  
    }  
     
    4  抓取网页内容-防止重定向
    在抓取网页时,成功登录服务器应用系统后,应用系统可能会通过 Response.Redirect 将网页进行重定向,如果不需要响应这个重定向,那么,我们就不要把 reader.ReadToEnd() 给 Response.Write 出来,就可以了。
    5 抓取网页内容-保持登录状态
      
    利用 Post 数据成功登录服务器应用系统后,就可以抓取需要登录的页面了,那么我们就可能需要在多个 Request 间保持登录状态。
    首先,我们要使用 HttpWebRequest,而不是 WebRequest。
    与 WebRequest 相比,变化的代码是:
    view plaincopy to clipboardprint?
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);  
     
    注意:HttpWebRequest.Create 返回的类型仍是 WebRequest,所以要转化一下。
    其次,使用 CookieContainer。
    view plaincopy to clipboardprint?
    System.Net.CookieContainer cc = new System.Net.CookieContainer();  
    request.CookieContainer = cc;  
    request2.CookieContainer = cc;   
     
    这样 request 和 request2 之间就使用了相同的 Session,如果 request 登录了,那么 request2 也是登录状态。
    最后,如何在不同的页面间使用同一个 CookieContainer。
    要在不同的页面间使用同一个 CookieContainer,只有把 CookieContainer 加入 Session。
     
    view plaincopy to clipboardprint?
    Session.Add("ccc", cc); //
      
    CookieContainer cc = (CookieContainer)Session["ccc"]; //
     
    5 抓取网页内容-把当前会话带到 WebRequest 中
     
    比如说浏览器 B1 去访问服务器端 S1,这会产生一个会话,而服务器端 S2 再用 WebRequest 去访问服务器端 S1,这又会产生一个会话。现在的需求是让 WebRequest 使用浏览器 B1 与 S1 之间的会话,也就是说要让 S1 认为是 B1 在访问 S1,而不是 S2 在访问 S1。
    这就要利用 Cookie 了,先在 S1 中取得与 B1 的 SessionID 的 Cookie,再将这个 Cookie 告诉 S2,S2 再将 Cookie 写在 WebRequest 中。
    view plaincopy to clipboardprint?
    WebRequest request = WebRequest.Create("url");  
    request.Headers.Add(HttpRequestHeader.Cookie, "ASPSESSIONIDSCATBTAD=KNNDKCNBONBOOBIHHHHAOKDM;");  
    WebResponse response = request.GetResponse();  
    StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312"));  
    Response.Write(reader.ReadToEnd());  
    reader.Close();  
    reader.Dispose();  
    response.Close();  
     
    要说明的是:
    本文并不是 Cookie 欺骗,因为 SessionID 是 S1 告诉 S2 的,并不是 S2 窃取的,虽然有些古怪,但这可能在一些特定的应用系统中会有用。
    S1 必须要向 B1 写 Session,这样 SessionID 才会保存到 Cookie 中,并且 SessionID 才会保持不变。
    在 ASP.NET 中取 Cookie 用 Request.Cookies,本文假设 Cookie 已经取出来。
    不同的服务器端语言,SessionID 在 Cookie 中上名称并不一样,本文是 ASP 的 SessionID。
    S1 可能不仅仅依靠 SessionID 来判断当前登录,它可能还会辅助于 Referer、User-Agent 等,这取决于 S1 端程序的设计。
    其实本文算是本连载中“保持登录状态”的另一种方法。
    6 抓取网页内容-如何更改来源 Referer 和 UserAgent
    view plaincopy to clipboardprint?
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create("http://127.0.0.1/index.htm");  
    //request.Headers.Add(HttpRequestHeader.Referer, "http://www.cftea.com/"); // 错误   
    //request.Headers[HttpRequestHeader.Referer] = "http://www.cftea.com/"; // 错误   
    request.Referer = "http://www.cftea.com/"; // 正确  
     
    注释掉的两句是不对的,会发生错误:
    view plaincopy to clipboardprint?
    此标头必须使用适当的属性进行修改。  
    参数名: name   
     
    UserAgent 类似。
    View Code

    C#抓取和分析网页的类

    抓取和分析网页的类。
    
    主要功能有:
    
    1、提取网页的纯文本,去所有html标签和javascript代码
    
    2、提取网页的链接,包括href和frame及iframe
    
    3、提取网页的title等(其它的标签可依此类推,正则是一样的)
    
    4、可以实现简单的表单提交及cookie保存
    
     /*
    
    *  Author:Sunjoy at CCNU
    
    *  如果您改进了这个类请发一份代码给我(ccnusjy 在gmail.com)
    
    */
    
    
    
    using System;
    
    using System.Data;
    
    using System.Configuration;
    
    using System.Net;
    
    using System.IO;
    
    using System.Text;
    
    using System.Collections.Generic;
    
    using System.Text.RegularExpressions;
    
    using System.Threading;
    
    using System.Web;
    
    /// <summary>
    
    /// 网页类
    
    /// </summary>
    
    public class WebPage
    
    {
    
    
    
        #region 私有成员
    
        private Uri m_uri;   //网址
    
        private List<Link> m_links;    //此网页上的链接
    
        private string m_title;        //此网页的标题
    
        private string m_html;         //此网页的HTML代码
    
        private string m_outstr;       //此网页可输出的纯文本
    
        private bool m_good;           //此网页是否可用
    
        private int m_pagesize;       //此网页的大小
    
        private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();//存放所有网页的Cookie
    
        private string m_post;  //此网页的登陆页需要的POST数据
    
        private string m_loginurl;  //此网页的登陆页
    
        #endregion
    
    
    
    
    
        #region 私有方法
    
        /// <summary>
    
        /// 这私有方法从网页的HTML代码中分析出链接信息
    
        /// </summary>
    
        /// <returns>List<Link></returns>
    
        /// <summary>
        /// Parses m_html and caches every &lt;a href&gt; and (i)frame src link
        /// into m_links, with URLs resolved against m_uri. Subsequent calls
        /// return the cached list.
        /// </summary>
        /// <returns>List&lt;Link&gt;</returns>
        private List<Link> getLinks()
        {
            if (m_links.Count == 0)
            {
                Regex[] regex = new Regex[2];
                // Restored escapes: the published patterns had lost their
                // backslashes and embedded quotes (\" became ", \s became s).
                regex[0] = new Regex(@"(?m)<a[^><]+href=(""|')?(?<url>([^>""'\s)])+)(""|')?[^>]*>(?<text>(\w|\W)*?)</", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                regex[1] = new Regex(@"<[i]*frame[^><]+src=(""|')?(?<url>([^>""'\s)])+)(""|')?[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);

                for (int i = 0; i < 2; i++)
                {
                    Match match = regex[i].Match(m_html);
                    while (match.Success)
                    {
                        try
                        {
                            // Resolve relative URLs against the page's own URI.
                            string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
                            string text = "";
                            // Anchor links only: strip tags, whitespace, entities
                            // and quotes from the link text.
                            if (i == 0) text = new Regex(@"(<[^>]+>)|(\s)|(&nbsp;)|&|""", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, "");

                            Link link = new Link(url, text);
                            m_links.Add(link);
                        }
                        catch (Exception ex) { Console.WriteLine(ex.Message); };
                        match = match.NextMatch();
                    }
                }
            }
            return m_links;
        }
    
       
    
        /// <summary>
    
        /// 此私有方法从一段HTML文本中提取出一定字数的纯文本
    
        /// </summary>
    
        /// <param name="instr">HTML代码</param>
    
        /// <param name="firstN">提取从头数多少个字</param>
    
        /// <param name="withLink">是否要链接里面的字</param>
    
        /// <returns>纯文本</returns>
    
        /// <summary>
        /// Extracts up to firstN characters of plain text from an HTML string,
        /// removing script/style/select blocks, all remaining tags and &amp;nbsp;
        /// entities, and collapsing whitespace. The stripped text is cached in
        /// m_outstr on first call and reused afterwards.
        /// </summary>
        /// <param name="instr">HTML source</param>
        /// <param name="firstN">maximum number of characters to return</param>
        /// <param name="withLink">keep the text inside &lt;a&gt; elements?</param>
        /// <returns>plain text</returns>
        private string getFirstNchar(string instr, int firstN, bool withLink)
        {
            if (m_outstr == "")
            {
                m_outstr = instr.Clone() as string;
                // (\w|\W) matches any character including newlines; the published
                // code had lost the backslashes, making these patterns match
                // literal 'w'/'W' characters and never strip anything.
                m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
                m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
                m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
                if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
                // Remove remaining tags and &nbsp; entities.
                Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                m_outstr = objReg.Replace(m_outstr, "");
                // Collapse runs of whitespace into a single space (verbatim
                // string: "\s" is an invalid escape in an ordinary literal).
                Regex objReg2 = new System.Text.RegularExpressions.Regex(@"(\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                m_outstr = objReg2.Replace(m_outstr, " ");
            }
            return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
        }
    
    
    
        /// <summary>
    
        /// 此私有方法返回一个IP地址对应的无符号整数
    
        /// </summary>
    
        /// <param name="x">IP地址</param>
    
        /// <returns></returns>
    
        /// <summary>
        /// Converts an IPv4 address to its unsigned 32-bit value,
        /// most significant byte first (1.2.3.4 -> 0x01020304).
        /// </summary>
        /// <param name="x">IP address</param>
        /// <returns>unsigned integer representation</returns>
        private uint getuintFromIP(IPAddress x)
        {
            Byte[] bt = x.GetAddressBytes();
            uint value = 0;
            // Fold the first four bytes, shifting in one byte at a time
            // (equivalent to the 256^k multiply-and-add form).
            for (int idx = 0; idx < 4; idx++)
            {
                value = (value << 8) | bt[idx];
            }
            return value;
        }
    
    
    
        #endregion
    
    
    
    
    
        #region 公有文法
    
        /// <summary>
    
        /// 此公有方法提取网页中一定字数的纯文本,包括链接文字
    
        /// </summary>
    
        /// <param name="firstN">字数</param>
    
        /// <returns></returns>
    
        /// <summary>
        /// Returns up to firstN characters of this page's plain text,
        /// including the text inside links.
        /// </summary>
        /// <param name="firstN">maximum number of characters</param>
        /// <returns>plain text</returns>
        public string getContext(int firstN)
        {
            // Delegate to the shared extractor, keeping link text.
            return this.getFirstNchar(this.m_html, firstN, true);
        }
    
    
    
        /// <summary>
    
        /// 此公有方法提取网页中一定字数的纯文本,不包括链接文字
    
        /// </summary>
    
        /// <param name="firstN"></param>
    
        /// <returns></returns>
    
        /// <summary>
        /// Returns up to firstN characters of this page's plain text,
        /// excluding the text inside links.
        /// </summary>
        /// <param name="firstN">maximum number of characters</param>
        /// <returns>plain text</returns>
        public string getContextWithOutLink(int firstN)
        {
            // Delegate to the shared extractor, dropping link text.
            return this.getFirstNchar(this.m_html, firstN, false);
        }
    
    
    
        /// <summary>
    
        /// 此公有方法从本网页的链接中提取一定数量的链接,该链接的URL满足某正则式
    
        /// </summary>
    
        /// <param name="pattern">正则式</param>
    
        /// <param name="count">返回的链接的个数</param>
    
        /// <returns>List<Link></returns>
    
        /// <summary>
        /// Returns up to `count` of this page's links whose URL matches the
        /// given regular expression (case-insensitive, multiline).
        /// </summary>
        /// <param name="pattern">regular expression applied to each URL</param>
        /// <param name="count">maximum number of links to return</param>
        /// <returns>List&lt;Link&gt;</returns>
        public List<Link> getSpecialLinksByUrl(string pattern, int count)
        {
            if (m_links.Count == 0) getLinks();
            List<Link> SpecialLinks = new List<Link>();
            // Compile the pattern once; the original constructed a brand-new
            // Regex object on every loop iteration.
            Regex re = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
            int cnt = 0;
            foreach (Link link in m_links)
            {
                if (cnt >= count) break;
                if (re.Match(link.url).Success)
                {
                    SpecialLinks.Add(link);
                    cnt++;
                }
            }
            return SpecialLinks;
        }
    
    
    
    
    
    
    
        /// <summary>
    
        /// 此公有方法从本网页的链接中提取一定数量的链接,该链接的文字满足某正则式
    
        /// </summary>
    
        /// <param name="pattern">正则式</param>
    
        /// <param name="count">返回的链接的个数</param>
    
        /// <returns>List<Link></returns>
    
        /// <summary>
        /// Returns up to `count` of this page's links whose anchor text matches
        /// the given regular expression (case-insensitive, multiline).
        /// </summary>
        /// <param name="pattern">regular expression applied to each link's text</param>
        /// <param name="count">maximum number of links to return</param>
        /// <returns>List&lt;Link&gt;</returns>
        public List<Link> getSpecialLinksByText(string pattern, int count)
        {
            if (m_links.Count == 0) getLinks();
            List<Link> SpecialLinks = new List<Link>();
            // Compile the pattern once; the original constructed a brand-new
            // Regex object on every loop iteration.
            Regex re = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
            int cnt = 0;
            foreach (Link link in m_links)
            {
                if (cnt >= count) break;
                if (re.Match(link.text).Success)
                {
                    SpecialLinks.Add(link);
                    cnt++;
                }
            }
            return SpecialLinks;
        }
    
        /// <summary>
        /// Returns every link on this page whose host resolves to an IP address
        /// inside the given inclusive range.
        /// </summary>
        /// <param name="_ip_start">First IP of the range (dotted string).</param>
        /// <param name="_ip_end">Last IP of the range (dotted string).</param>
        /// <returns>The links whose resolved address falls in the range.</returns>
        public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)
        {
            IPAddress rangeStart = IPAddress.Parse(_ip_start);
            IPAddress rangeEnd = IPAddress.Parse(_ip_end);
            if (m_links.Count == 0) getLinks();
            List<Link> matches = new List<Link>();
            foreach (Link link in m_links)
            {
                IPAddress hostIp;
                try
                {
                    // DNS lookup can fail for dead or malformed hosts; skip those links.
                    hostIp = Dns.GetHostEntry(new Uri(link.url).Host).AddressList[0];
                }
                catch { continue; }
                // NOTE(review): the range bounds are re-converted on every pass;
                // they could be hoisted once getuintFromIP's return type is confirmed.
                if (getuintFromIP(hostIp) >= getuintFromIP(rangeStart) && getuintFromIP(hostIp) <= getuintFromIP(rangeEnd))
                {
                    matches.Add(link);
                }
            }
            return matches;
        }
    
    
    
        /// <summary>
        /// Searches this page's plain text for the given regular expression and
        /// returns the first capture group of the first match.
        /// </summary>
        /// <param name="pattern">Regular expression; group 1 is returned.</param>
        /// <returns>The captured text, or the empty string when nothing matches.</returns>
        public string getSpecialWords(string pattern)
        {
            // Lazily materialize the plain-text view of the page.
            if (m_outstr == "") getContext(Int16.MaxValue);
            Match m = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(m_outstr);
            return m.Success ? m.Groups[1].Value : string.Empty;
        }
    
        #endregion
    
    
    
    
    
        #region 构造函数
    
        
    
        /// <summary>
        /// Downloads the page at <paramref name="_url"/> and initializes member
        /// state (HTML, size, final URI, per-host cookies). On any failure the
        /// page is marked not-good instead of throwing.
        /// </summary>
        /// <param name="_url">Absolute URL of the page to fetch.</param>
        private void Init(string _url)
        {
            try
            {
                m_uri = new Uri(_url);
                m_links = new List<Link>();
                m_html = "";
                m_outstr = "";
                m_title = "";
                m_good = true;
                // Binary downloads are not crawlable pages.
                if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
                {
                    m_good = false;
                    return;
                }
                HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
                rqst.AllowAutoRedirect = true;
                rqst.MaximumAutomaticRedirections = 3;
                rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
                rqst.KeepAlive = true;
                rqst.Timeout = 30000;
                // Reuse one cookie container per host so sessions survive across pages.
                lock (WebPage.webcookies)
                {
                    if (WebPage.webcookies.ContainsKey(m_uri.Host))
                        rqst.CookieContainer = WebPage.webcookies[m_uri.Host];
                    else
                    {
                        CookieContainer cc = new CookieContainer();
                        WebPage.webcookies[m_uri.Host] = cc;
                        rqst.CookieContainer = cc;
                    }
                }
                HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
                Stream sm = rsps.GetResponseStream();
                // Only accept textual content up to 4 MB.
                if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
                {
                    rsps.Close();
                    m_good = false;
                    return;
                }
                Encoding cding = System.Text.Encoding.Default;
                string contenttype = rsps.ContentType.ToLower();
                int ix = contenttype.IndexOf("charset=");
                if (ix != -1)
                {
                    // Charset announced in the HTTP Content-Type header.
                    try
                    {
                        cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
                    }
                    catch
                    {
                        cding = Encoding.Default;
                    }
                    m_html = new StreamReader(sm, cding).ReadToEnd();
                }
                else
                {
                    // No charset in the header: read with the default encoding,
                    // then sniff a charset declaration inside the HTML itself.
                    m_html = new StreamReader(sm, cding).ReadToEnd();
                    // BUGFIX: the original literal was garbled by extraction
                    // ("charset=(?<cding>[^=]+)?"" does not compile); capture the
                    // encoding name as a run of word characters and hyphens.
                    Regex regex = new Regex(@"charset=(?<cding>[\w-]+)", RegexOptions.IgnoreCase);
                    string strcding = regex.Match(m_html).Groups["cding"].Value;
                    try
                    {
                        cding = Encoding.GetEncoding(strcding);
                    }
                    catch
                    {
                        cding = Encoding.Default;
                    }
                    // Re-decode the text through the default encoding's bytes.
                    byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray());
                    m_html = cding.GetString(bytes);
                    // Heuristic: a flood of '?' characters means the re-decode
                    // mangled the text, so fall back to the default decoding.
                    if (m_html.Split('?').Length > 100)
                    {
                        m_html = Encoding.Default.GetString(bytes);
                    }
                }
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
                rsps.Close();
            }
            catch (Exception ex)
            {
                // BUGFIX: m_uri is null when the Uri constructor itself threw;
                // fall back to the raw URL so logging cannot rethrow.
                Console.WriteLine(ex.Message + (m_uri == null ? _url : m_uri.ToString()));
                m_good = false;
            }
        }
    
    
    
        /// <summary>
        /// Creates a WebPage by fetching the given URL. Runs of non-single-byte
        /// (e.g. Chinese) characters in the URL are GB2312 percent-encoded first.
        /// </summary>
        /// <param name="_url">URL of the page to fetch.</param>
        public WebPage(string _url)
        {
            try
            {
                // Normalize any pre-escaped URL before re-encoding it below.
                _url = Uri.UnescapeDataString(_url);
            }
            catch { }
            // BUGFIX: the extracted source had lost the backslashes in the
            // character class ("[^x00-xff]"); restore the intended
            // "one or more non-single-byte characters" pattern.
            Regex re = new Regex(@"(?<h>[^\x00-\xff]+)");
            Match mc = re.Match(_url);
            if (mc.Success)
            {
                string han = mc.Groups["h"].Value;
                _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));
            }
            Init(_url);
        }
    
    
    
        /// <summary>
        /// Creates a WebPage that may require a login: POSTs <paramref name="_post"/>
        /// to <paramref name="_loginurl"/> (once per host), stores the session
        /// cookies, then fetches <paramref name="_url"/> with them.
        /// </summary>
        /// <param name="_url">URL of the page to fetch.</param>
        /// <param name="_loginurl">Login form target; empty to skip login.</param>
        /// <param name="_post">URL-encoded login form body; empty to skip login.</param>
        public WebPage(string _url, string _loginurl, string _post)
        {
            try
            {
                _url = Uri.UnescapeDataString(_url);
            }
            catch { }
            // BUGFIX: restore the escaped "[^\x00-\xff]" class (run of non-ASCII
            // characters); the extracted source had lost the backslashes.
            Regex re = new Regex(@"(?<h>[^\x00-\xff]+)");
            Match mc = re.Match(_url);
            if (mc.Success)
            {
                string han = mc.Groups["h"].Value;
                _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding("GB2312")));
            }
            // No login requested, or we already hold cookies for this host:
            // fetch the page directly.
            if (_loginurl.Trim() == "" || _post.Trim() == "" || WebPage.webcookies.ContainsKey(new Uri(_url).Host))
            {
                Init(_url);
            }
            else
            {
                #region Log in
                m_post = _post;
                m_loginurl = _loginurl;
                byte[] postBytes = Encoding.Default.GetBytes(_post);
                // Collects the session cookies issued by the login response.
                CookieContainer myCookieContainer = new CookieContainer();
                try
                {
                    HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(_loginurl);
                    myHttpWebRequest.ContentType = "application/x-www-form-urlencoded";
                    myHttpWebRequest.AllowAutoRedirect = false;
                    myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
                    myHttpWebRequest.Timeout = 60000;
                    myHttpWebRequest.KeepAlive = true;
                    myHttpWebRequest.ContentLength = postBytes.Length;
                    myHttpWebRequest.Method = "POST";
                    myHttpWebRequest.CookieContainer = myCookieContainer;
                    Stream myRequestStream = myHttpWebRequest.GetRequestStream();
                    myRequestStream.Write(postBytes, 0, postBytes.Length);
                    myRequestStream.Close();
                    HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse();
                    foreach (Cookie ck in myHttpWebResponse.Cookies)
                    {
                        myCookieContainer.Add(ck);
                    }
                    myHttpWebResponse.Close();
                }
                catch
                {
                    // Login failed: fall back to an anonymous fetch.
                    Init(_url);
                    return;
                }
                #endregion

                #region Fetch the page with the login cookies
                try
                {
                    m_uri = new Uri(_url);
                    m_links = new List<Link>();
                    m_html = "";
                    m_outstr = "";
                    m_title = "";
                    m_good = true;
                    // Binary downloads are not crawlable pages.
                    if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))
                    {
                        m_good = false;
                        return;
                    }
                    HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
                    rqst.AllowAutoRedirect = true;
                    rqst.MaximumAutomaticRedirections = 3;
                    rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
                    rqst.KeepAlive = true;
                    rqst.Timeout = 30000;
                    rqst.CookieContainer = myCookieContainer;
                    // Remember the session for subsequent pages on this host.
                    lock (WebPage.webcookies)
                    {
                        WebPage.webcookies[m_uri.Host] = myCookieContainer;
                    }
                    HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();
                    Stream sm = rsps.GetResponseStream();
                    // Only accept textual content up to 4 MB.
                    if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)
                    {
                        rsps.Close();
                        m_good = false;
                        return;
                    }
                    Encoding cding = System.Text.Encoding.Default;
                    int ix = rsps.ContentType.ToLower().IndexOf("charset=");
                    if (ix != -1)
                    {
                        // Charset announced in the HTTP Content-Type header.
                        try
                        {
                            cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));
                        }
                        catch
                        {
                            cding = Encoding.Default;
                        }
                    }
                    m_html = new StreamReader(sm, cding).ReadToEnd();
                    m_pagesize = m_html.Length;
                    m_uri = rsps.ResponseUri;
                    rsps.Close();
                }
                catch (Exception ex)
                {
                    // BUGFIX: m_uri is null when the Uri constructor threw; fall
                    // back to the raw URL so logging cannot rethrow.
                    Console.WriteLine(ex.Message + (m_uri == null ? _url : m_uri.ToString()));
                    m_good = false;
                }
                #endregion
            }
        }
    
    
    
        #endregion
    
    
    
    
    
        #region 属性
    
    
    
        /// <summary>
        /// Absolute URL of this page (after any redirects). Read-only.
        /// </summary>
        public string URL
        {
            get { return m_uri.AbsoluteUri; }
        }
    
    
    
        /// <summary>
        /// The page title, lazily extracted from the &lt;title&gt; element on
        /// first access. Read-only.
        /// </summary>
        public string Title
        {
            get
            {
                if (m_title == "")
                {
                    // BUGFIX: "(?:w|W)" had lost its backslashes in extraction;
                    // "(?:\w|\W)*?" lazily matches any run of characters
                    // (including newlines) between the title tags.
                    Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
                    Match mc = reg.Match(m_html);
                    if (mc.Success)
                        m_title = mc.Groups["title"].Value.Trim();
                }
                return m_title;
            }
        }
    
      
    
    
    
        /// <summary>
        /// All links found on this page, parsed lazily on first access. Read-only.
        /// </summary>
        public List<Link> Links
        {
            get
            {
                // getLinks() populates m_links exactly once.
                if (m_links.Count == 0) getLinks();
                return m_links;
            }
        }
    
    
    
    
    
        /// <summary>
        /// Full plain text of this page, extracted lazily on first access. Read-only.
        /// </summary>
        public string Context
        {
            get
            {
                // getContext caches its result in m_outstr.
                if (m_outstr == "") getContext(Int16.MaxValue);
                return m_outstr;
            }
        }
    
    
    
        /// <summary>
        /// Length of the downloaded HTML, in characters. Read-only.
        /// </summary>
        public int PageSize
        {
            get { return m_pagesize; }
        }
    
        /// <summary>
        /// All links on this page that stay within the page's own host. Read-only.
        /// </summary>
        public List<Link> InsiteLinks
        {
            get { return getSpecialLinksByUrl("^http://" + m_uri.Host, Int16.MaxValue); }
        }
    
    
    
        /// <summary>
        /// Whether this page was fetched successfully and is usable. Read-only.
        /// </summary>
        public bool IsGood
        {
            get { return m_good; }
        }
    
        /// <summary>
        /// Host name of the site this page belongs to. Read-only.
        /// </summary>
        public string Host
        {
            get { return m_uri.Host; }
        }
    
        
    
    
    
        /// <summary>
        /// POST body used for this page's login request, if any. Read-only.
        /// </summary>
        public string PostStr
        {
            get { return m_post; }
        }
    
        /// <summary>
        /// Login page URL used for this page, if any. Read-only.
        /// </summary>
        public string LoginURL
        {
            get { return m_loginurl; }
        }
    
        #endregion
    
    }
    
    
    
    /// <summary>
    /// A hyperlink found on a page: its target URL and its anchor text.
    /// </summary>
    public class Link
    {
        public string url;   // target address of the link
        public string text;  // anchor (display) text of the link

        public Link(string _url, string _text)
        {
            this.url = _url;
            this.text = _text;
        }
    }
    View Code

    C#抓取网页信息

    背景
      随着Internet的普及,网络信息正以极高的速度增长,在这么多数据中找到自己需要的信息是一件很繁琐的事情,找到需要的信息后如何获取也是件麻烦的事。这 就需要Internet信息抓取程序来代替人工的操作。
      所谓Internet信息抓取程序,就是程序会按照用户的关键词或关键网站来收集相应的信息,并提供给用户想要的信息格式 。
      信息量的增加会带来信息网站发布人员工作量的剧增,为实现信息发布系统实现信息自
      动发布、减少工作人员工作量、即时跟踪最新信息,就需要自动信息提供 程序,因此Internet信息抓取程序应运而生。
      目标
      实现自定义网站信息分类抓取,存入本地数据库、生成静态页面或其它用户定义的信息结构,并下载与信息相关 的多媒体文件。
      开发
      目标站点结构分析
      本步骤是准确抓取信息的关键。
      首先要选择更新频率高的页面作为抓取地址,然后分析要抓取内容页面url特点。
      然后分析要抓取信息页面的元素特性,比如标题位置,内容位置 等,得到定位标记点。
      将以上信息 写成自己的配置文件或存到数据库中。
      每个网站都需要分析,写出单独的配置文件,供抓取程序使用。
      信息提取
      根据配置文件取得要抓取页面url,使用HttpWebRequest类获取内容:
    双击代码全选
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    // Fetches the page at a_strUrl over HTTP and returns its body as text,
    // one line per row. On failure returns a string beginning with "错误:"
    // followed by the exception message (the Chinese prefix is part of the
    // runtime contract — callers may match on it).
        public string Get_Http(string a_strUrl, int timeout)
        {
            string strResult;
            try
            {
                HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(a_strUrl);
                myReq.Timeout = timeout;
                // BUGFIX: dispose the response, stream, and reader even when
                // reading throws — the original leaked all three.
                using (HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse())
                using (Stream myStream = HttpWResp.GetResponseStream())
                using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
                {
                    StringBuilder strBuilder = new StringBuilder();
                    while (-1 != sr.Peek())
                    {
                        // BUGFIX: the extracted source split this literal across
                        // two lines; restore the intended newline separator.
                        strBuilder.Append(sr.ReadLine() + "\r\n");
                    }
                    strResult = strBuilder.ToString();
                }
            }
            catch (Exception exp)
            {
                strResult = "错误:" + exp.Message;
            }
            return strResult;
        }
      获取页面内容后,分析页面中连接地址取到要抓取的url:
    双击代码全选
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    // Fetches urlStr, cuts out the region between markers blockB and blockE,
    // and returns every hyperlink in that region as "<anchor text><absolute url>"
    // pairs concatenated into a single string. Returns "" on any failure.
    //
    // NOTE(review): this body was reconstructed from an HTML-mangled source
    // (stray "&nbs p;" entities and unescaped quote characters inside string
    // literals); the control flow and string arithmetic follow the original.
        public string SniffWebUrl(string urlStr, string blockB, string blockE)
        {
            string urlch1 = "";   // urlStr up to the first '/' after the host
            string urlch2 = "";   // urlStr up to its last '/'
            int end_n1 = 0;
            int end_nums = 0;
            int end_nums1 = 0;
            int end_nums2 = 0;
            int end_nums3 = 0;
            string reUTStr = "";
            string reTitle = "";
            string ret = "";
            try
            {
                int pos01 = urlStr.IndexOf(".");
                int pos02 = urlStr.LastIndexOf("/");
                if (pos01 < 0)
                {
                    return "";
                }
                if (pos02 < 0)
                {
                    return "";
                }
                int pos03 = urlStr.IndexOf("/", pos01);
                if (pos03 < 0)
                {
                    urlch1 = urlStr;
                    urlch2 = urlStr;
                }
                else
                {
                    urlch1 = urlStr.Substring(0, pos03);
                    urlch2 = urlStr.Substring(0, pos02);
                }
                string tmpAllStr = new PublicFun().Get_Http(urlStr, time1);
                int pos1 = tmpAllStr.IndexOf(blockB);
                int pos2 = tmpAllStr.IndexOf(blockE, pos1 + blockB.Length);
                if (pos1 > 0 && pos2 > 0 && pos2 > pos1)
                {
                    // Keep only the region between the two markers, then normalize
                    // tag and attribute casing so the scan below only needs "href=".
                    ret = tmpAllStr.Substring(pos1 + blockB.Length, pos2 - pos1 - blockB.Length);
                    ret = ret.Substring(ret.IndexOf("<"));
                    while (ret.IndexOf("<A") >= 0)
                    {
                        ret = ret.Substring(0, ret.IndexOf("<A")) + "<a" + ret.Substring(ret.IndexOf("<A") + 2);
                    }
                    while (ret.IndexOf("</A") >= 0)
                    {
                        ret = ret.Substring(0, ret.IndexOf("</A")) + "</a" + ret.Substring(ret.IndexOf("</A") + 3);
                    }
                    while (ret.IndexOf("Href=") >= 0)
                    {
                        ret = ret.Substring(0, ret.IndexOf("Href=")) + "href=" + ret.Substring(ret.IndexOf("Href=") + 5);
                    }
                    while (ret.IndexOf("HREF=") >= 0)
                    {
                        ret = ret.Substring(0, ret.IndexOf("HREF=")) + "href=" + ret.Substring(ret.IndexOf("HREF=") + 5);
                    }
                    while (ret.IndexOf("href='") >= 0)
                    {
                        // BUGFIX: restore the escaped double quote lost in extraction.
                        ret = ret.Substring(0, ret.IndexOf("href='")) + "href=\"" + ret.Substring(ret.IndexOf("href='") + 6);
                    }
                }
                tmpAllStr = ret;
                int begin_nums = tmpAllStr.IndexOf("href=");
                while (begin_nums >= 0)
                {
                    string tmpStrA = "";
                    string tmpStrB = tmpAllStr.Substring(begin_nums + 5, 1);
                    if (tmpStrB == "\"")
                    {
                        end_n1 = begin_nums + 6;
                        if ((end_n1 + 1) > tmpAllStr.Length)
                        {
                            return "";
                        }
                        tmpStrA = tmpAllStr.Substring(begin_nums + 6, 1);
                    }
                    else
                    {
                        end_n1 = begin_nums + 5;
                        tmpStrA = tmpStrB;
                    }
                    if (tmpStrA == "#")
                    {
                        // In-page anchor: skip it and look for the next href.
                        tmpAllStr = tmpAllStr.Substring(end_n1);
                        begin_nums = tmpAllStr.IndexOf("href=");
                    }
                    else
                    {
                        end_nums1 = tmpAllStr.IndexOf(" ", end_n1);
                        end_nums2 = tmpAllStr.IndexOf(">", end_n1);
                        end_nums3 = tmpAllStr.IndexOf("</a", end_nums2);
                        if ((end_nums3 >= 0) && (end_nums2 >= 0))
                        {
                            // Anchor text sits between the '>' and the '</a'.
                            reTitle = tmpAllStr.Substring(end_nums2 + 1, end_nums3 - end_nums2 - 1);
                            // The URL ends at the first space or '>' after it starts.
                            if (end_nums1 > end_nums2)
                            {
                                end_nums = end_nums2;
                            }
                            else
                            {
                                if (end_nums1 < 0)
                                {
                                    end_nums = end_nums2;
                                }
                                else
                                {
                                    end_nums = end_nums1;
                                }
                            }
                            // NOTE(review): "end_nums - end_nums + 1" is always 1 in
                            // the original — kept as a 1-char read of the character
                            // just before end_nums (trailing quote detection).
                            string str4 = tmpAllStr.Substring(end_nums - 1, end_nums - end_nums + 1);
                            if (str4 == "\"" || str4 == "'")
                            {
                                end_nums = end_nums - 1;
                            }
                            string sTotalOne = tmpAllStr.Substring(end_n1, end_nums - end_n1);
                            if (sTotalOne.IndexOf("http://") < 0)
                            {
                                // Relative URL: resolve it against urlStr.
                                if (sTotalOne.IndexOf("/") == 0)
                                {
                                    sTotalOne = urlch1 + sTotalOne;
                                }
                                else
                                {
                                    int linshiIntNum = 0;
                                    int flags = 0;
                                    string urlChange = urlStr;
                                    // Strip "../" prefixes and climb the base URL
                                    // one segment for each one removed.
                                    while (sTotalOne.IndexOf("../") >= 0)
                                    {
                                        sTotalOne = sTotalOne.Substring(sTotalOne.IndexOf("../") + 3);
                                        linshiIntNum = linshiIntNum + 1;
                                        flags = flags + 1;
                                    }
                                    while ((urlChange.LastIndexOf("/") >= 0) && (linshiIntNum >= 0))
                                    {
                                        urlChange = urlChange.Substring(0, urlChange.LastIndexOf("/"));
                                        linshiIntNum = linshiIntNum - 1;
                                    }
                                    if (flags == 0)
                                    {
                                        sTotalOne = urlch2 + "/" + sTotalOne;
                                    }
                                    else
                                    {
                                        sTotalOne = urlChange + "/" + sTotalOne;
                                    }
                                }
                            }
                            reUTStr = reUTStr + new PublicFun().RemoveHtmlCode(reTitle) + sTotalOne;
                            tmpAllStr = tmpAllStr.Substring(end_nums3 + 4);
                            begin_nums = tmpAllStr.IndexOf("href=");
                        }
                        else
                        {
                            begin_nums = -1;
                        }
                    }
                }
                return reUTStr;
            }
            catch (Exception)
            {
                return "";
            }
        }
    
    
    得到要抓取内容的url后,处理该页面:
    双击代码全选
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    243
    244
    245
    246
    247
    248
    249
    250
    251
    252
    253
    254
    255
    256
    257
    258
    259
    260
    261
    262
    263
    264
    265
    266
    267
    268
    269
    270
    271
    272
    273
    274
    275
    276
    277
    278
    279
    280
    281
    282
    283
    284
    285
    286
    287
    288
    289
    290
    291
    292
    293
    294
    295
    296
    297
    298
    299
    300
    301
    302
    303
    // Fetches the page behind a collected link (subUrl) and post-processes it:
    //   1. derives site-root / directory URLs from gatherUrl for resolving relative paths,
    //   2. extracts the region between b_Content and e_Content,
    //   3. removes the region between b_Filter and e_Filter,
    //   4. normalizes <img src=...> tags, resolves relative image URLs, queues them for
    //      download and rewrites each tag to a local "images/" path,
    //   5. replaces <p>/<br> with placeholder markers, strips all other HTML tags,
    //      then restores the markers as <br> / <img> tags,
    //   6. drops a repeated subTitle from the body and downloads the collected images.
    // Returns the cleaned content, or "" when the URL or the content markers cannot be matched.
    // NOTE(review): this listing is HTML-entity-corrupted (stray "&nbs p;" tokens and
    // quote literals rendered as """) and will not compile verbatim — restore it from
    // the original source before reuse.
            public string GetWebContent( string gatherUrl,string subUrl,string subTitle,string b_Content,string e_Content,string b_Filter,string e_Filter,string root )
            {
                string tmpAllStr = "";           
                string dfStrB = "";
                string dfStrE = "";               
                string rePicStr = "";// "||"-separated list of absolute image URLs found in the content
                string reContentStr = "";
                string picHtml = "images"; // local directory that rewritten <img> tags point at
               
                string urlch1 ="";
                string urlch2 ="";
                int pos1 = gatherUrl.IndexOf( "." );
                int pos2 = gatherUrl.LastIndexOf( "/" );
                if( pos1 < 0 )
                {
                    return "";
                }
                if( pos2 < 0 )
                {               
                    return "";
                }
                // urlch1 = site root (up to the first "/" after the host), urlch2 = directory part of gatherUrl
                int pos3 = gatherUrl.IndexOf( "/",pos1 );
                if ( pos3 < 0 )
                {
                    urlch1 = gatherUrl;
                    urlch2 = gatherUrl;
                }
                else
                {
                    urlch1 = gatherUrl.Substring( 0,pos3 );
                    urlch2 = gatherUrl.Substring( 0,pos2 );
                }   
               
                tmpAllStr = new PublicFun().Get_Http( subUrl,time1 ); // time1: presumably a timeout field declared elsewhere in this class — TODO confirm
                // Extract the article's source attribution between dfStrB and dfStrE.
                // NOTE(review): dfStrB/dfStrE are always "" above, so this block is dead code as listed.
                string docFromStr = "";
                if ( dfStrB != "" && dfStrE != "" )
                {
                    if ( tmpAllStr != "" )
                    {
                        int b_docF = tmpAllStr.IndexOf( dfStrB );
                        if ( b_docF > 0 )
                        {
                          &nbs p; int e_docF = tmpAllStr.IndexOf( dfStrE,b_docF + dfStrB.Length );
                          &nb sp; if ( e_docF > 0 && e_docF > b_docF && e_docF - b_docF < 20 )
                            {
                          &nbs p;     docFromStr = tmpAllStr.Substring( b_docF + dfStrB.Length, e_docF - b_docF - dfStrB.Length );
                          &nb sp; }
                        }
                    }
                }
                // Extract the main content between b_Content and e_Content.
                if ( tmpAllStr != "" )
                {               
                    int begin_strnum = tmpAllStr.IndexOf( b_Content );
                    if ( begin_strnum < 0 )
                    {                  
                        return "";
                    }
                    int end_strnum = tmpAllStr.IndexOf( e_Content,begin_strnum + b_Content.Length );
                    if ( end_strnum < 0 )
                    {                  
                        return "";
                    }
                    string sTotalSubM = "";
                    if ( end_strnum > begin_strnum )
                    {
                        sTotalSubM = tmpAllStr.Substring ( begin_strnum,end_strnum - begin_strnum );
                    }
                   
                    if ( sTotalSubM == "" )
                    {                  
                        return "";
                    }               
                    // Cut out the configured useless region between b_Filter and e_Filter.
                    int bfnum = sTotalSubM.IndexOf( b_Filter );
                    if ( bfnum > -1 )
                    {
                        int efnum = sTotalSubM.IndexOf( e_Filter,bfnum );
                        if ( efnum > -1 )
                        {
                          &nbs p; if ( efnum > bfnum )
                            {
                          &nbs p;     sTotalSubM = sTotalSubM.Substring( 0,bfnum ) + sTotalSubM.Substring( efnum + e_Filter.Length );
                          &nb sp; }
                        }
                    }
                    // Normalize image attributes: Src=/SRC= -> src=, and src=' -> src=" .
                   
                    while( sTotalSubM.IndexOf( "Src=" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "Src=" ) ) + "src=" + sTotalSubM.Substring( sTotalSubM.IndexOf( "Src=" ) + 4 );
                    }
                    while( sTotalSubM.IndexOf( "SRC=" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "SRC=" ) ) + "src=" + sTotalSubM.Substring( sTotalSubM.IndexOf( "SRC=" ) + 4 );
                    }
                    while( sTotalSubM.IndexOf( "src='" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "src='" ) ) + "src="" + sTotalSubM.Substring( sTotalSubM.IndexOf( "src='" ) + 5 );
                    }
                    // Collect image URLs: scan each src= attribute, find where the URL ends
                    // (space, ">" or closing quote), make it absolute, then mark the tag so the
                    // later tag-stripping pass can restore it as a local <img>.
                    int end_n12 = 0;
                    int end_nums2 = 0;
                    int begin_nums2 = sTotalSubM.IndexOf( "src=" );
                    while( begin_nums2 >= 0 )
                    {
                        String tmpStr = sTotalSubM.Substring( begin_nums2 + 4,1 );
                        if ( tmpStr == """ )
                        {
                          &nbs p; end_n12 = begin_nums2 + 5;
                        }
                        else
                        {
                          &nbs p; end_n12 = begin_nums2 + 4;
                        }
                        int end_nums2a = sTotalSubM.IndexOf( " ",end_n12 );
                        int end_nums2b = sTotalSubM.IndexOf( ">",end_n12 );
                        if ( end_nums2b < 0 )
                        {
                          &nbs p; break;
                        }
                        if ( end_nums2a > end_nums2b )
                        {
                          &nbs p; end_nums2 = end_nums2b;
                        }
                        else
                        {
                          &nbs p; if (end_nums2a<0)
                            {
                          &nbs p;     end_nums2 = end_nums2b;
                             }
                            else
                          & nbsp; {
                          &nbs p;     end_nums2 = end_nums2a;
                             }
                        }
                        tmpStr = sTotalSubM.Substring( end_nums2-1,1 );
                        if ( tmpStr == """ || tmpStr == "'" )
                        {
                          &nbs p; end_nums2 = end_nums2 - 1;
                        }
                        string tmpPicStr = sTotalSubM.Substring( end_n12,end_nums2 - end_n12 );
                        // Resolve relative URLs: "/x" against the site root, "../" chains
                        // against the sub-page's directory, otherwise against gatherUrl's directory.
                        if ( tmpPicStr.IndexOf( "http://" ) < 0 )
                        {
                          &nbs p; if ( tmpPicStr.IndexOf( "/" ) == 0 )
                            {
                          &nbs p;     tmpPicStr = urlch1 + tmpPicStr;
                         & nbsp;  }
                            else
                          & nbsp; {                            
                                int linshiIntNum = 0;
                          &nb sp;     int flags = 0;
                          &nb sp;     string urlChange = subUrl;
                         &nbs p;      while( tmpPicStr.IndexOf( "../" ) >= 0 )
                                {
                          &nbs p;         tmpPicStr = tmpPicStr.Substring( tmpPicStr.IndexOf("../") + 3 );
                          &nb sp;         linshiIntNum = linshiIntNum + 1;
                          &nb sp;         flags = flags + 1;
                          &nb sp;     }
                                while( ( urlChange.LastIndexOf( "/" ) >= 0 ) && ( linshiIntNum >= 0 ) )
                                {
                          &nbs p;         urlChange = urlChange.Substring( 0,urlChange.LastIndexOf( "/" ) );
                          &nb sp;         linshiIntNum = linshiIntNum - 1;
                          &nb sp;     }
                                if ( flags == 0 )
                                {
                          &nbs p;         tmpPicStr = urlch2 + "/" + tmpPicStr;
                         & nbsp;      }
                                else
                          & nbsp;     {
                          &nbs p;         tmpPicStr = urlChange + "/" + tmpPicStr;
                         & nbsp;      }
                            }
                        }
                        //tmpPicStr = tmpPicStr.ToLower();
                        string tmpPicStrTmp = tmpPicStr.ToLower ();
                        // Lower-cased copy so the extension test is case-insensitive without
                        // mangling the URL itself (the original case is kept for downloading).
                        if ( tmpPicStrTmp.IndexOf( ".jpg" ) > 0 || tmpPicStrTmp.IndexOf( ".gif" ) > 0 || tmpPicStrTmp.IndexOf( ".bmp" ) > 0 )
                        {
                          &nbs p; rePicStr = rePicStr + "||" + tmpPicStr ;
                          &nbs p; int flagN2 = tmpPicStr.LastIndexOf( "/" );
                          &nb sp; string fileN2 = picHtml + tmpPicStr.Substring( flagN2 );
                          &nb sp; sTotalSubM = sTotalSubM.Substring( 0,end_nums2 ) + ">******" + fileN2 + "******<" + sTotalSubM.Substring( end_nums2 );
                          &nb sp; begin_nums2 = sTotalSubM.IndexOf( "src=", end_nums2 + fileN2.Length + 22 );
                        }
                        else
                        {
                          &nbs p; begin_nums2 = sTotalSubM.IndexOf( "src=", end_nums2 + 4 );                       
                        }                   
                    }
                    if ( rePicStr.Length > 2 ) 
                        rePicStr =  rePicStr.Substring(2);              
                    // Content pass: replace <p>/</p>/<br> variants with placeholder markers so
                    // they survive the HTML-stripping pass below.
                    while( sTotalSubM.IndexOf( "<P" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<P" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<P" ) + 2 );
                    }
                    while( sTotalSubM.IndexOf( "<p" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<p" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<p" ) + 2 );
                    }
                    while( sTotalSubM.IndexOf( "</P" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "</P" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "</P" ) + 3 );
                    }
                    while( sTotalSubM.IndexOf( "</p" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "</p" ) ) + "|****|<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "</p" ) + 3 );
                    }
                    while( sTotalSubM.IndexOf( "<br" ) >=0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<br" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<br" ) + 3 );
                    }
                    while( sTotalSubM.IndexOf( "<BR" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<BR" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<BR" ) + 3 );
                    }
                    while( sTotalSubM.IndexOf( "<Br" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "<Br" ) ) + "+****+<" + sTotalSubM.Substring( sTotalSubM.IndexOf( "<Br" ) + 3 );
                    }
                    // Strip all remaining HTML tags: first trim a dangling ">" prefix / "<" suffix,
                    // then delete every <...> pair.
                    int linshiInt1 = sTotalSubM.IndexOf( "<" );
                    int linshiInt2 = sTotalSubM.IndexOf( ">" );           
                    if ( linshiInt2 < linshiInt1 )
                    {
                        sTotalSubM = sTotalSubM.Substring( linshiInt2 + 1 );
                    }
                    int linshiInt11 = sTotalSubM.LastIndexOf( "<" );
                    int linshiInt12 = sTotalSubM.LastIndexOf( ">" );
                    if ( linshiInt12 < linshiInt11 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,linshiInt12 + 1 );
                    }
                    linshiInt1 = sTotalSubM.IndexOf( "<" );
                    while ( linshiInt1 >= 0 )
                    {
                        linshiInt2 = sTotalSubM.IndexOf( ">",linshiInt1 );
                        if ( linshiInt2 >= 0 )
                        {              
                            sTotalSubM = sTotalSubM.Substring( 0,linshiInt1 ) + sTotalSubM.Substring( linshiInt2 + 1 );
                        }
                        else
                        {
                          &nbs p; sTotalSubM = sTotalSubM.Substring( 0,linshiInt1 );
                        }
                        linshiInt1 = sTotalSubM.IndexOf("<");
                    }
                    // Restore the placeholder markers back into <br> / <img> tags.
                    // NOTE(review): the markers are 6 chars here but the code skips 9 (and 22 for
                    // images) — the marker strings appear truncated by the entity corruption;
                    // verify the lengths against the original source.
                    int linshiInt3 = 0;
                    int linshiInt4 = 0;
                    while( sTotalSubM.IndexOf( "+****+" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "+****+" ) ) + "<br>
    " + sTotalSubM.Substring( sTotalSubM.IndexOf( "+****+" ) + 9 );
                    }
                    while( sTotalSubM.IndexOf( "|****|" ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( "|****|" ) ) + "<br>
    " + sTotalSubM.Substring( sTotalSubM.IndexOf( "|****|" ) + 9 );
                    }
                    while( sTotalSubM.IndexOf( "******" ) >= 0 )
                    {
                        linshiInt3 = sTotalSubM.IndexOf( "******" ) + 9;
                        linshiInt4 = sTotalSubM.IndexOf( "******",linshiInt3 );
                        if ( linshiInt4 >= 0 )
                        {
                          &nbs p; int tmpPos = sTotalSubM.IndexOf( "******" );
                          &nb sp; string tmpStr1 = sTotalSubM.Substring( 0,tmpPos );
                            string tmpStr2 = sTotalSubM.Substring( linshiInt3,linshiInt4 - linshiInt3 );
                          &nb sp; string tmpStr3 = sTotalSubM.Substring( linshiInt4 + 9 );
                          &nb sp; sTotalSubM = tmpStr1 + "<img src=" + tmpStr2 + ">" + tmpStr3;
                        }
                        else
                        {
                          &nbs p; break;
                        }
                    }
                    // Remove the title text if it is repeated inside the content body.
                    if ( sTotalSubM.IndexOf( subTitle ) >= 0 )
                    {
                        sTotalSubM = sTotalSubM.Substring( 0,sTotalSubM.IndexOf( subTitle ) ) + sTotalSubM.Substring( sTotalSubM.IndexOf( subTitle ) + subTitle.Length );
                    }
                    reContentStr = sTotalSubM;
                    // Download each collected image into the local images directory.
                    string[] img_Url = new PublicFun().split( rePicStr,"||" );
                    for ( int i=0;i<img_Url.Length;i++ )
                    {
                        if ( img_Url[i] != "" )
                        {
                          &nbs p; new PublicFun().Get_Img( img_Url[i],10000,root + "images" + img_Url[i].Substring( img_Url[i].LastIndexOf("/")+1 ) );
                        }
                    }
                }
                return reContentStr;
            }
    
    以上方法返回要取得的信息,包括标题内容,图片地址等。
      下载页面中图片:
    双击代码全选
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    // Download an image to a local file (下载图片).
        /// <summary>
        /// Downloads the image at <paramref name="a_strUrl"/> and saves it to
        /// <paramref name="filepath"/>, creating the target directory if needed.
        /// Failures are never thrown; they are appended to "error.log" in the
        /// target directory instead (matching the original best-effort behavior).
        /// </summary>
        /// <param name="a_strUrl">Absolute URL of the image to download.</param>
        /// <param name="timeout">Request timeout in milliseconds.</param>
        /// <param name="filepath">Full local path (directory + file name) to write.</param>
        public void Get_Img(string a_strUrl, int timeout, string filepath)
        {
            // Was filepath.Substring(0, filepath.LastIndexOf("")) in the corrupted
            // listing — the separator literal ("\\") was lost; Path.GetDirectoryName
            // is the robust equivalent.
            string dir = Path.GetDirectoryName(filepath);
            try
            {
                HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strUrl);
                myReq.Timeout = timeout;
                // Stream the response bytes straight to disk. The original decoded the
                // image through Bitmap + PictureBox, which re-encoded it on save (lossy
                // for GIF/JPEG) and leaked the response, stream and bitmap.
                using (HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse())
                using (Stream myStream = HttpWResp.GetResponseStream())
                {
                    if (!Directory.Exists(dir))
                    {
                        CreateDir(dir);
                    }
                    using (FileStream fs = new FileStream(filepath, FileMode.Create, FileAccess.Write))
                    {
                        byte[] buffer = new byte[8192];
                        int read;
                        while ((read = myStream.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            fs.Write(buffer, 0, read);
                        }
                    }
                }
            }
            catch (Exception exp)
            {
                // Best-effort logging only — this method must never throw to its caller.
                WriteLog(Path.Combine(dir, "error.log"), a_strUrl + "--" + exp.Message + Environment.NewLine);
            }
        }
      保存文件或入库
      上面取得的信息可以按自己的要求保存。
  ****设计的时候没有使用url按层次循环抓取,这样定义抓取url效率更高,速度更快。
  注:此版本只提供静态文件存储功能,不提供数据库接口,不提供自定义网站功能。
  本程序运行需要先安装 .NET 框架 1.1
    View Code

    c# 抓取网页类(获取网页中所有信息)

    c# 抓取网页类(获取网页中所有信息)
    分类: c#程序设计2011-08-05 09:14 2362人阅读 评论(4) 收藏 举报
     
    [csharp] view plaincopyprint?
    1. using System;  
    2. using System.Data;  
    3. using System.Configuration;  
    4. using System.Net;  
    5. using System.IO;  
    6. using System.Text;  
    7. using System.Collections.Generic;  
    8. using System.Text.RegularExpressions;  
    9. using System.Threading;  
    10. using System.Web;  
    11. using System.Web.UI.MobileControls;  
    12.     /// <summary>  
    13.     /// 网页类  
    14.     /// </summary>  
    15.     public class WebPage  
    16.     {  
    17.         #region 私有成员  
    18.         private Uri m_uri;   //url  
    19.         private List<Link> m_links;    //此网页上的链接  
    20.         private string m_title;        //标题  
    21.         private string m_html;         //HTML代码  
    22.         private string m_outstr;       //网页可输出的纯文本  
    23.         private bool m_good;           //网页是否可用  
    24.         private int m_pagesize;       //网页的大小  
    25.         private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();//存放所有网页的Cookie  
    26.         
    27.         #endregion  
    28.  
    29.         #region 属性  
    30.   
    31.         /// <summary>  
    32.         /// 通过此属性可获得本网页的网址,只读  
    33.         /// </summary>  
    34.         public string URL  
    35.         {  
    36.             get  
    37.             {  
    38.                 return m_uri.AbsoluteUri;  
    39.             }  
    40.         }  
    41.   
    42.         /// <summary>  
    43.         /// 通过此属性可获得本网页的标题,只读  
    44.         /// </summary>  
    45.         public string Title  
    46.         {  
    47.             get  
    48.             {  
    49.                 if (m_title == "")  
    50.                 {  
    51.                     Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);  
    52.                     Match mc = reg.Match(m_html);  
    53.                     if (mc.Success)  
    54.                         m_title = mc.Groups["title"].Value.Trim();  
    55.                 }  
    56.                 return m_title;  
    57.             }  
    58.         }  
    59.         public string M_html  
    60.         {  
    61.             get  
    62.             {  
    63.                 if (m_html == null)  
    64.                 {  
    65.                     m_html = "";  
    66.                 }  
    67.                 return m_html;  
    68.             }  
    69.         }  
    70.         /// <summary>  
    71.         /// 此属性获得本网页的所有链接信息,只读  
    72.         /// </summary>  
    73.         public List<Link> Links  
    74.         {  
    75.             get  
    76.             {  
    77.                 if (m_links.Count == 0) getLinks();  
    78.                 return m_links;  
    79.             }  
    80.         }  
    81.   
    82.   
    83.         /// <summary>  
    84.         /// 此属性返回本网页的全部纯文本信息,只读  
    85.         /// </summary>  
    86.         public string Context  
    87.         {  
    88.             get  
    89.             {  
    90.                 if (m_outstr == "") getContext(Int16.MaxValue);  
    91.                 return m_outstr;  
    92.             }  
    93.         }  
    94.   
    95.         /// <summary>  
    96.         /// 此属性获得本网页的大小  
    97.         /// </summary>  
    98.         public int PageSize  
    99.         {  
    100.             get  
    101.             {  
    102.                 return m_pagesize;  
    103.             }  
    104.         }  
    105.         /// <summary>  
    106.         /// 此属性获得本网页的所有站内链接  
    107.         /// </summary>  
    108.         public List<Link> InsiteLinks  
    109.         {  
    110.             get  
    111.             {  
    112.                 return getSpecialLinksByUrl("^http://" + m_uri.Host, Int16.MaxValue);  
    113.             }  
    114.         }  
    115.   
    116.         /// <summary>  
    117.         /// 此属性表示本网页是否可用  
    118.         /// </summary>  
    119.         public bool IsGood  
    120.         {  
    121.             get  
    122.             {  
    123.                 return m_good;  
    124.             }  
    125.         }  
    126.         /// <summary>  
    127.         /// 此属性表示网页的所在的网站  
    128.         /// </summary>  
    129.         public string Host  
    130.         {  
    131.             get  
    132.             {  
    133.                 return m_uri.Host;  
    134.             }  
    135.         }  
    136.         #endregion  
    137.   
    138.   
    139.         /// <summary>  
    140.         /// 从HTML代码中分析出链接信息  
    141.         /// </summary>  
    142.         /// <returns>List<Link></returns>  
    143.         private List<Link> getLinks()  
    144.         {  
    145.             if (m_links.Count == 0)  
    146.             {  
    147.                 Regex[] regex = new Regex[2];  
    148.                 regex[0] = new Regex(@"<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);  
    149.                 regex[1] = new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);  
    150.   
    151.                 for (int i = 0; i < 2; i++)  
    152.                 {  
    153.                     Match match = regex[i].Match(m_html);  
    154.                     while (match.Success)  
    155.                     {  
    156.                         try  
    157.                         {  
    158.                             string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri);  
    159.   
    160.                             string text = "";  
    161.                             if (i == 0) text = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&amp;|&quot;", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, "");  
    162.   
    163.                             Link link = new Link();  
    164.                             link.Text = text;  
    165.                             link.NavigateUrl = url;  
    166.   
    167.                             m_links.Add(link);  
    168.                         }  
    169.                         catch (Exception ex) { Console.WriteLine(ex.Message); };  
    170.                         match = match.NextMatch();  
    171.                     }  
    172.                 }  
    173.             }  
    174.             return m_links;  
    175.         }  
    176.         /// <summary>  
    177.         /// 此私有方法从一段HTML文本中提取出一定字数的纯文本  
    178.         /// </summary>  
    179.         /// <param name="instr">HTML代码</param>  
    180.         /// <param name="firstN">提取从头数多少个字</param>  
    181.         /// <param name="withLink">是否要链接里面的字</param>  
    182.         /// <returns>纯文本</returns>  
    183.         private string getFirstNchar(string instr, int firstN, bool withLink)  
    184.         {  
    185.             if (m_outstr == "")  
    186.             {  
    187.                 m_outstr = instr.Clone() as string;  
    188.                 m_outstr = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");  
    189.                 m_outstr = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");  
    190.                 m_outstr = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");  
    191.                 if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");  
    192.                 Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);  
    193.                 m_outstr = objReg.Replace(m_outstr, "");  
    194.                 Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);  
    195.                 m_outstr = objReg2.Replace(m_outstr, " ");  
    196.   
    197.             }  
    198.             return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;  
    199.         }  
    200.  
    201.  
    202.         #region 公有文法  
    203.         /// <summary>  
    204.         /// 此公有方法提取网页中一定字数的纯文本,包括链接文字  
    205.         /// </summary>  
    206.         /// <param name="firstN">字数</param>  
    207.         /// <returns></returns>  
    208.         public string getContext(int firstN)  
    209.         {  
    210.             return getFirstNchar(m_html, firstN, true);  
    211.         }  
    212.   
    213.         /// <summary>  
    214.         /// 此公有方法从本网页的链接中提取一定数量的链接,该链接的URL满足某正则式  
    215.         /// </summary>  
    216.         /// <param name="pattern">正则式</param>  
    217.         /// <param name="count">返回的链接的个数</param>  
    218.         /// <returns>List<Link></returns>  
    219.         public List<Link> getSpecialLinksByUrl(string pattern, int count)  
    220.         {  
    221.             if (m_links.Count == 0) getLinks();  
    222.             List<Link> SpecialLinks = new List<Link>();  
    223.             List<Link>.Enumerator i;  
    224.             i = m_links.GetEnumerator();  
    225.             int cnt = 0;  
    226.             while (i.MoveNext() && cnt < count)  
    227.             {  
    228.                 if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.NavigateUrl).Success)  
    229.                 {  
    230.                     SpecialLinks.Add(i.Current);  
    231.                     cnt++;  
    232.                 }  
    233.             }  
    234.             return SpecialLinks;  
    235.         }  
    236.   
    237.         /// <summary>  
    238.         /// 此公有方法从本网页的链接中提取一定数量的链接,该链接的文字满足某正则式  
    239.         /// </summary>  
    240.         /// <param name="pattern">正则式</param>  
    241.         /// <param name="count">返回的链接的个数</param>  
    242.         /// <returns>List<Link></returns>  
    243.         public List<Link> getSpecialLinksByText(string pattern, int count)  
    244.         {  
    245.             if (m_links.Count == 0) getLinks();  
    246.             List<Link> SpecialLinks = new List<Link>();  
    247.             List<Link>.Enumerator i;  
    248.             i = m_links.GetEnumerator();  
    249.             int cnt = 0;  
    250.             while (i.MoveNext() && cnt < count)  
    251.             {  
    252.                 if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.Text).Success)  
    253.                 {  
    254.                     SpecialLinks.Add(i.Current);  
    255.                     cnt++;  
    256.                 }  
    257.             }  
    258.             return SpecialLinks;  
    259.         }  
    260.   
    261.         /// <summary>  
    262.         /// 这公有方法提取本网页的纯文本中满足某正则式的文字  
    263.         /// </summary>  
    264.         /// <param name="pattern">正则式</param>  
    265.         /// <returns>返回文字</returns>  
    266.         public string getSpecialWords(string pattern)  
    267.         {  
    268.             if (m_outstr == "") getContext(Int16.MaxValue);  
    269.             Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);  
    270.             Match mc = regex.Match(m_outstr);  
    271.             if (mc.Success)  
    272.                 return mc.Groups[1].Value;  
    273.             return string.Empty;  
    274.         }  
    275.         #endregion  
    276.  
    277.         #region 构造函数  
    278.   
    279.         private void Init(string _url)  
    280.         {  
    281.             try  
    282.             {  
    283.                 m_uri = new Uri(_url);  
    284.                 m_links = new List<Link>();  
    285.                 m_html = "";  
    286.                 m_outstr = "";  
    287.                 m_title = "";  
    288.                 m_good = true;  
    289.                 if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi"))  
    290.                 {  
    291.                     m_good = false;  
    292.                     return;  
    293.                 }  
    294.                 HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);  
    295.                 rqst.AllowAutoRedirect = true;  
    296.                 rqst.MaximumAutomaticRedirections = 3;  
    297.                 rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";  
    298.                 rqst.KeepAlive = true;  
    299.                 rqst.Timeout = 10000;  
    300.                 lock (WebPage.webcookies)  
    301.                 {  
    302.                     if (WebPage.webcookies.ContainsKey(m_uri.Host))  
    303.                         rqst.CookieContainer = WebPage.webcookies[m_uri.Host];  
    304.                     else  
    305.                     {  
    306.                         CookieContainer cc = new CookieContainer();  
    307.                         WebPage.webcookies[m_uri.Host] = cc;  
    308.                         rqst.CookieContainer = cc;  
    309.                     }  
    310.                 }  
    311.                 HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse();  
    312.                 Stream sm = rsps.GetResponseStream();  
    313.                 if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22)  
    314.                 {  
    315.                     rsps.Close();  
    316.                     m_good = false;  
    317.                     return;  
    318.                 }  
    319.                 Encoding cding = System.Text.Encoding.Default;  
    320.                 string contenttype = rsps.ContentType.ToLower();  
    321.                 int ix = contenttype.IndexOf("charset=");  
    322.                 if (ix != -1)  
    323.                 {  
    324.                     try  
    325.                     {  
    326.                         cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1));  
    327.                     }  
    328.                     catch  
    329.                     {  
    330.                         cding = Encoding.Default;  
    331.                     }  
    332.                      
    333.                     //该处视情况而定 有的需要解码  
    334.                     //m_html = HttpUtility.HtmlDecode(new StreamReader(sm, cding).ReadToEnd());  
    335.                     m_html = new StreamReader(sm, cding).ReadToEnd();  
    336.                       
    337.                 }  
    338.                 else  
    339.                 {  
    340.                   //该处视情况而定 有的需要解码  
    341.                    //m_html = HttpUtility.HtmlDecode(new StreamReader(sm, cding).ReadToEnd());  
    342.                       
    343.                     m_html = new StreamReader(sm, cding).ReadToEnd();  
    344.                     Regex regex = new Regex("charset=(?<cding>[^=\"]+)?\"", RegexOptions.IgnoreCase);  
    345.                     string strcding = regex.Match(m_html).Groups["cding"].Value;  
    346.                     try  
    347.                     {  
    348.                         cding = Encoding.GetEncoding(strcding);  
    349.                     }  
    350.                     catch  
    351.                     {  
    352.                         cding = Encoding.Default;  
    353.                     }  
    354.                     byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray());  
    355.                     m_html = cding.GetString(bytes);  
    356.                     if (m_html.Split('?').Length > 100)  
    357.                     {  
    358.                         m_html = Encoding.Default.GetString(bytes);  
    359.                     }  
    360.                 }  
    361.                 m_pagesize = m_html.Length;  
    362.                 m_uri = rsps.ResponseUri;  
    363.                 rsps.Close();  
    364.             }  
    365.             catch (Exception ex)  
    366.             {  
    367.                  
    368.             }  
    369.         }  
    370.         public WebPage(string _url)  
    371.         {  
    372.             string uurl = "";  
    373.             try  
    374.             {  
    375.                 uurl = Uri.UnescapeDataString(_url);  
    376.                 _url = uurl;  
    377.             }  
    378.             catch { };  
    379.             Init(_url);  
    380.         }  
    381.         #endregion  
    382.     }  
     
    View Code

    使用 FtpWebRequest 获取完整文件列表时速度缓慢

    我想获取服务器上每个文件的文件名、大小和最后修改时间，然后把它们汇总成一个完整的列表。
    
    它原本运行得很好，直到我更换了主机；现在变得非常缓慢，尽管从客户端看新主机的速度与原来一样快。
    
    没有任何明显的理由为何?
    
    此外，是否每次请求都必须重新提供登录凭据？
    
    我先用第一个方法获取文件名的字符串数组，然后遍历该数组，对每一项调用另一个方法获取文件大小：
    public static string[] GetFileList()
    {
        // Lists the names of all entries in the FTP root directory.
        // Returns one array element per line of the server's LIST response,
        // or null when the request fails (an error dialog is shown first),
        // matching the original contract.
        try
        {
            FtpWebRequest request = (FtpWebRequest)FtpWebRequest.Create(new Uri("ftp://mysite.se/"));
            request.UseBinary = true;
            request.Credentials = new NetworkCredential(settings.Username, settings.Password);
            request.Method = WebRequestMethods.Ftp.ListDirectory;

            // Collect lines directly instead of appending "\n" separators to a
            // StringBuilder and splitting afterwards — simpler and avoids the
            // trailing-separator cleanup the original needed.
            List<string> files = new List<string>();
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    files.Add(line);
                }
            }
            return files.ToArray();
        }
        catch (Exception ex)
        {
            System.Windows.Forms.MessageBox.Show(ex.Message);
            return null;
        }
    }
    
        public static int GetFileSize(string file)
        {
            //MessageBox.Show("getting filesize...");
    
            StringBuilder result = new StringBuilder();
            FtpWebRequest request;
            try
            {
                request = (FtpWebRequest)FtpWebRequest.Create(new Uri("ftp://mysite.se/" + file));
                request.UseBinary = true;
                request.Credentials = new NetworkCredential(settings.Username, settings.Password);
                request.Method = WebRequestMethods.Ftp.GetFileSize;
    
                int dataLength = (int)request.GetResponse().ContentLength;
    
                return dataLength;
            }
            catch (Exception ex)
            {
                //System.Windows.Forms.MessageBox.Show(ex.Message);
                return 1337;
            }
        }
    View Code
  • 相关阅读:
    围棋术语中英文对照
    修改grub及console的分别率 Linux-Ubuntu
    内核crash (Linux)
    pthread_create build
    内联函数定义的关键字inline及由此产生的编译问题简析
    debian家族重量级成员Ubuntu 20.04下载链接开启了。。。
    stm32 GPIO 输出配置参照
    Linux安装应用程序后,点击图标没法应,怎么解决呢?
    c语言中的引用使用
    QA Issue: PN: startUp is upper case, this can result in unexpected behavior. [uppercase-pn]
  • 原文地址:https://www.cnblogs.com/blogpro/p/11458363.html
Copyright © 2011-2022 走看看