zoukankan      html  css  js  c++  java
  • 正则抓取页面信息

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Xml.Linq;

    namespace CollectingInformation
    {
        /// <summary>
        /// Main form of the collector: downloads a listing page, extracts the
        /// title, contact name and phone-image URL with regular expressions,
        /// stores them in an XML file next to the executable, and can export
        /// the stored records as an HTML table.
        /// </summary>
        public partial class Form1 : Form
        {
            private const string RootElementName = "InfoBy58";
            private const string RecordElementName = "UserInfo";
            private const string TitleElementName = "Title";
            private const string NameElementName = "Name";
            private const string TelElementName = "Tel";

            // Encoding used when the response does not declare a usable charset.
            private const string DefaultCharset = "gb2312";

            // Cuts out the page fragment from the first <h1> up to the following <h2>.
            // [\s\S] is used instead of '.' so the match can span line breaks.
            private static readonly Regex HeaderSectionRegex = new Regex(
                @"<h1>([\s\S]*?)<h2>",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            // Within that fragment: group 1 = title, group 2 = user name,
            // group 3 = source URL of the image that follows.
            private static readonly Regex UserInfoRegex = new Regex(
                @"<h1>(.*)</h1>[\s\S]*?username:'(.*)'[\s\S]*?<img src='(.*)'[\s\S]*?",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            private readonly string XMLPath = Path.Combine(Application.StartupPath, "58.xml");
            private readonly string HTMLPath = Path.Combine(Application.StartupPath, "58.html");

            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>
            /// Downloads the page whose address is in <c>textBox1</c>, extracts one
            /// record from it and appends that record to the XML store. Nothing is
            /// stored unless the fragment yields exactly one match. Any failure is
            /// reported to the user in a message box.
            /// </summary>
            private void btnOK_Click(object sender, EventArgs e)
            {
                string pagePath = textBox1.Text.Trim();
                try
                {
                    EnsureXmlStoreExists();

                    string pageSource = DownloadPage(pagePath);

                    Match section = HeaderSectionRegex.Match(pageSource);
                    if (!section.Success)
                    {
                        return;
                    }

                    MatchCollection records = UserInfoRegex.Matches(section.Value);
                    if (records.Count != 1)
                    {
                        return;
                    }

                    GroupCollection fields = records[0].Groups;
                    XDocument document = XDocument.Load(XMLPath);
                    XElement store = GetStoreRoot(document);
                    store.Add(new XElement(RecordElementName,
                        new XElement(TitleElementName, fields[1].Value),
                        new XElement(NameElementName, fields[2].Value),
                        new XElement(TelElementName, fields[3].Value)));
                    document.Save(XMLPath);
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }

            /// <summary>
            /// Renders every stored record as a row of an HTML table and writes the
            /// result to the HTML file, replacing any previous export. Any failure
            /// is reported to the user in a message box.
            /// </summary>
            private void btnExport_Click(object sender, EventArgs e)
            {
                try
                {
                    XDocument document = XDocument.Load(XMLPath);
                    XElement store = GetStoreRoot(document);

                    // WriteAllText creates or truncates the file and closes it again,
                    // so no handle is left open between exports.
                    File.WriteAllText(HTMLPath, BuildHtmlReport(store), Encoding.UTF8);
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }

            /// <summary>Creates an empty XML store if the file does not exist yet.</summary>
            private void EnsureXmlStoreExists()
            {
                if (!File.Exists(XMLPath))
                {
                    new XElement(RootElementName).Save(XMLPath);
                }
            }

            /// <summary>Returns the store's root element, or throws if the file has a different root.</summary>
            private static XElement GetStoreRoot(XDocument document)
            {
                XElement store = document.Element(RootElementName);
                if (store == null)
                {
                    throw new InvalidDataException(
                        "The XML store does not contain the expected <" + RootElementName + "> root element.");
                }
                return store;
            }

            /// <summary>
            /// Fetches <paramref name="url"/> and returns the response body as text,
            /// decoded with the charset announced in the Content-Type header.
            /// </summary>
            private static string DownloadPage(string url)
            {
                WebRequest request = WebRequest.Create(url);
                using (WebResponse response = request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, ResolveEncoding(response.ContentType)))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Picks the text encoding named by the <c>charset</c> parameter of a
            /// Content-Type value; falls back to gb2312 when the parameter is
            /// missing or names an encoding the runtime does not know.
            /// </summary>
            private static Encoding ResolveEncoding(string contentType)
            {
                const string charsetKey = "charset=";

                if (!string.IsNullOrEmpty(contentType))
                {
                    foreach (string part in contentType.Split(';'))
                    {
                        string parameter = part.Trim();
                        if (!parameter.StartsWith(charsetKey, StringComparison.OrdinalIgnoreCase))
                        {
                            continue;
                        }

                        // The value may be quoted, e.g. charset="utf-8".
                        string charsetName = parameter.Substring(charsetKey.Length).Trim().Trim('"', '\'');
                        try
                        {
                            return Encoding.GetEncoding(charsetName);
                        }
                        catch (ArgumentException)
                        {
                            // Unknown or empty charset name: use the default below.
                            break;
                        }
                    }
                }

                return Encoding.GetEncoding(DefaultCharset);
            }

            /// <summary>
            /// Builds the HTML document listing all records under
            /// <paramref name="store"/>. Stored values are HTML-encoded because
            /// they originate from downloaded pages.
            /// </summary>
            private static string BuildHtmlReport(XElement store)
            {
                StringBuilder html = new StringBuilder();
                html.Append("<html>");
                html.Append("<head><meta charset=\"utf-8\"/></head>");
                html.Append("<body>");
                html.Append("<table border=\"1\">");
                html.Append("<tr>");
                html.Append("<th>标题</th>");
                html.Append("<th>联系人</th>");
                html.Append("<th>电话</th>");
                html.Append("</tr>");

                foreach (XElement item in store.Elements(RecordElementName))
                {
                    // The explicit string conversion yields null (rendered as empty)
                    // when a child element is missing, instead of throwing.
                    string title = WebUtility.HtmlEncode((string)item.Element(TitleElementName));
                    string name = WebUtility.HtmlEncode((string)item.Element(NameElementName));
                    string tel = WebUtility.HtmlEncode((string)item.Element(TelElementName));

                    html.Append("<tr>");
                    html.Append("<td>" + title + "</td>");
                    html.Append("<td>" + name + "</td>");
                    // The stored "Tel" value is an image URL, so it is shown as an image.
                    html.Append("<td><img src='" + tel + "'/></td>");
                    html.Append("</tr>");
                }

                html.Append("</table>");
                html.Append("</body></html>");
                return html.ToString();
            }
        }
    }

  • 相关阅读:
    Sum Root to Leaf Numbers
    Sum Root to Leaf Numbers
    Sort Colors
    Partition List
    Binary Tree Inorder Traversal
    Binary Tree Postorder Traversal
    Remove Duplicates from Sorted List II
    Remove Duplicates from Sorted List
    Search a 2D Matrix
    leetcode221
  • 原文地址:https://www.cnblogs.com/contain/p/3285699.html
Copyright © 2011-2022 走看看