zoukankan      html  css  js  c++  java
  • 正则抓取页面信息

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Xml.Linq;

    namespace CollectingInformation
    {
        public partial class Form1 : Form
        {
            public Form1()
            {
                InitializeComponent();
            }

            private string XMLPath = Application.StartupPath.ToString() + "/58.xml";
            private string HTMLPath = Application.StartupPath.ToString() + "/58.html";

            private void btnOK_Click(object sender, EventArgs e)
            {
                string pagePath = textBox1.Text.Trim();
                try
                {
                    if (!File.Exists(XMLPath))
                    {
                        XElement xeCreateFile = new XElement("InfoBy58");
                        xeCreateFile.Save(XMLPath);
                    }

                    //开始抓取数据
                    //获得指定页面的内容  
                    WebRequest hwr = WebRequest.Create(pagePath);
                    HttpWebResponse hwp = hwr.GetResponse() as HttpWebResponse;
                    StreamReader sr;
                    string code = hwp.ContentType;
                    //得到编码了
                    //如果取不到则默认为gb2312
                    try
                    {
                        code = code.Split('=')[1];
                    }
                    catch
                    {
                        code = "gb2312";
                    }
                    Stream rep = hwp.GetResponseStream();
                    sr = new StreamReader(rep, Encoding.GetEncoding(code));
                    string strSource = sr.ReadToEnd();

                    Regex rx = new Regex("<h1>" + @"([Ss]*?)" + "<h2>"
                            , RegexOptions.Compiled | RegexOptions.IgnoreCase);

                    MatchCollection matchs = rx.Matches(strSource);
                    if (matchs.Count > 0)
                    {
                        strSource = matchs[0].Value;//@all</td><td>(.*)@all</td>
                        string pattern = "<h1>(.*)</h1>@allusername:'(.*)'@all<img src='(.*)'@all";
                        pattern = pattern.Replace("@all", @"[Ss]*?");
                        rx = new Regex(pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);

                        matchs = rx.Matches(strSource);
                        if (matchs.Count == 1)
                        {
                            XDocument root = XDocument.Load(XMLPath);
                            XElement xele = root.Element("InfoBy58");
                            xele.Add(new XElement("UserInfo", new XElement("Title", matchs[0].Groups[1].Value), new XElement("Name", matchs[0].Groups[2].Value), new XElement("Tel", matchs[0].Groups[3].Value)));
                            root.Save(XMLPath);
                        }
                    }
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }

                // pictureBox1.ImageLocation = "http://image.58.com/showphone.aspx?t=v55&v=3041A034B4AF246DD511D9E44B08582D7";

            }

            private void btnExport_Click(object sender, EventArgs e)
            {
                try
                {
                    XDocument root = XDocument.Load(XMLPath);
                    XElement xele = root.Element("InfoBy58");

                    StringBuilder strBuilder = new StringBuilder();
                    strBuilder.Append("<html>");
                    strBuilder.Append("<body>");
                    strBuilder.Append("<table border="1">");
                    strBuilder.Append("<th>");
                    strBuilder.Append("<td>标题</td>");
                    strBuilder.Append("<td>联系人</td>");
                    strBuilder.Append("<td>电话</td>");
                    strBuilder.Append("</th>");

                    foreach (var item in root.Elements("UserInfo"))
                    {
                        strBuilder.Append("<tr>");
                        strBuilder.Append("<td>" + item.Element("CategoryName").Value + "</td>");
                        strBuilder.Append("<td>" + item.Element("CategoryName").Value + "</td>");
                        strBuilder.Append("<td><img src='" + item.Element("CategoryName").Value + "'/></td>");
                        strBuilder.Append("</tr>");
                    }
                    strBuilder.Append("</body></html>");

                    if (!File.Exists(HTMLPath))
                    {
                        File.Create(HTMLPath);
                    }
                    FileStream fs = new FileStream(HTMLPath, FileMode.Open, FileAccess.ReadWrite);
                    StreamWriter sw = new StreamWriter(fs);
                    fs.SetLength(0);//首先把文件清空了。
                    sw.Write(strBuilder.ToString());//写你的字符串。
                    sw.Close();

                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }
        }
    }

  • 相关阅读:
    使用shutdown命令实现局域网内远程关机、重启整蛊他人
    在foxmail和outlook中设置QQ邮箱、gmail邮箱、新浪邮箱、微软邮箱、网易邮箱等的方法
    万能驱动助理篡改主页为2345的解决办法
    巧用UserAgent来解决浏览器的各种问题
    各大浏览器保存密码的文件
    使用代理软件之后其他软件不能联网的解决方法
    windows xp/7/8/8.1/10安全模式详解和系统修复讲解
    VirtualBox更改默认路径
    Virtualbox中不能为虚拟机打开一个新任务的原因及解决方法
    xampp打开显示缺少运行库的解决方法
  • 原文地址:https://www.cnblogs.com/contain/p/3285699.html
Copyright © 2011-2022 走看看