zoukankan      html  css  js  c++  java
  • 正则抓取页面信息

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Net;
    using System.IO;
    using System.Text.RegularExpressions;
    using System.Xml.Linq;

    namespace CollectingInformation
    {
        /// <summary>
        /// Main form of the collector. Downloads the page whose address is typed into
        /// <c>textBox1</c>, extracts a title, a user name and an image URL from it with
        /// regular expressions, appends the result to an XML file, and can export all
        /// stored records as an HTML table.
        /// </summary>
        public partial class Form1 : Form
        {
            /// <summary>Name of the root element of the XML store.</summary>
            private const string RootElementName = "InfoBy58";

            /// <summary>Name of the element that holds one scraped record.</summary>
            private const string RecordElementName = "UserInfo";

            /// <summary>Charset used when the response does not announce a usable one.</summary>
            private const string FallbackCharset = "gb2312";

            /// <summary>
            /// Lazy "any run of characters, newlines included" fragment. <c>.</c> does not
            /// match line breaks without <see cref="RegexOptions.Singleline"/>, hence the
            /// <c>[\s\S]</c> class.
            /// </summary>
            private const string AnyText = @"[\s\S]*?";

            /// <summary>Isolates the part of the page between the first &lt;h1&gt; and the next &lt;h2&gt;.</summary>
            private static readonly Regex HeaderSectionRegex = new Regex(
                "<h1>" + "(" + AnyText + ")" + "<h2>",
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            /// <summary>
            /// Captures, in order: the &lt;h1&gt; text (group 1), the value of
            /// <c>username:'...'</c> (group 2) and the <c>src</c> of the following
            /// &lt;img&gt; tag (group 3).
            /// </summary>
            private static readonly Regex UserInfoRegex = new Regex(
                "<h1>(.*)</h1>" + AnyText + "username:'(.*)'" + AnyText + "<img src='(.*)'" + AnyText,
                RegexOptions.Compiled | RegexOptions.IgnoreCase);

            /// <summary>Initializes the form and its designer-generated controls.</summary>
            public Form1()
            {
                InitializeComponent();
            }

            /// <summary>Path of the XML file the scraped records are appended to.</summary>
            private readonly string XMLPath = Path.Combine(Application.StartupPath, "58.xml");

            /// <summary>Path of the HTML file produced by the export button.</summary>
            private readonly string HTMLPath = Path.Combine(Application.StartupPath, "58.html");

            /// <summary>
            /// Downloads the page named in <c>textBox1</c> and, when exactly one record can
            /// be extracted from its header section, appends that record to the XML store.
            /// Any failure is reported to the user in a message box.
            /// </summary>
            /// <param name="sender">The control that raised the event (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void btnOK_Click(object sender, EventArgs e)
            {
                string pagePath = textBox1.Text.Trim();
                try
                {
                    EnsureXmlStoreExists();

                    // Start scraping: fetch the content of the requested page.
                    string pageSource = DownloadPage(pagePath);

                    Match section = HeaderSectionRegex.Match(pageSource);
                    if (!section.Success)
                    {
                        return;
                    }

                    // Only an unambiguous (single) hit is stored.
                    MatchCollection records = UserInfoRegex.Matches(section.Value);
                    if (records.Count != 1)
                    {
                        return;
                    }

                    AppendUserInfo(records[0]);
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }

            /// <summary>
            /// Writes every stored record to <see cref="HTMLPath"/> as one row of an HTML
            /// table, replacing any previous export. Any failure is reported to the user
            /// in a message box.
            /// </summary>
            /// <param name="sender">The control that raised the event (unused).</param>
            /// <param name="e">Event data (unused).</param>
            private void btnExport_Click(object sender, EventArgs e)
            {
                try
                {
                    XDocument document = XDocument.Load(XMLPath);
                    XElement storeRoot = GetStoreRoot(document);

                    // WriteAllText creates or truncates the file and releases it again,
                    // so no handle is left open between exports.
                    File.WriteAllText(HTMLPath, BuildHtmlReport(storeRoot.Elements(RecordElementName)));
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }

            /// <summary>Creates an empty XML store when the file does not exist yet.</summary>
            private void EnsureXmlStoreExists()
            {
                if (!File.Exists(XMLPath))
                {
                    new XElement(RootElementName).Save(XMLPath);
                }
            }

            /// <summary>Appends the three captured groups of <paramref name="record"/> to the XML store.</summary>
            /// <param name="record">A successful match of <see cref="UserInfoRegex"/>.</param>
            /// <exception cref="InvalidOperationException">The XML file has no expected root element.</exception>
            private void AppendUserInfo(Match record)
            {
                XDocument document = XDocument.Load(XMLPath);
                XElement storeRoot = GetStoreRoot(document);

                // NOTE(review): "Tel" receives the <img> source, presumably because the
                // site renders phone numbers as images - confirm against a live page.
                storeRoot.Add(new XElement(RecordElementName,
                    new XElement("Title", record.Groups[1].Value),
                    new XElement("Name", record.Groups[2].Value),
                    new XElement("Tel", record.Groups[3].Value)));
                document.Save(XMLPath);
            }

            /// <summary>Returns the root element of the store or fails with a descriptive error.</summary>
            /// <param name="document">The loaded XML store.</param>
            /// <returns>The <c>InfoBy58</c> element of <paramref name="document"/>.</returns>
            /// <exception cref="InvalidOperationException">The document has no such element.</exception>
            private XElement GetStoreRoot(XDocument document)
            {
                XElement storeRoot = document.Element(RootElementName);
                if (storeRoot == null)
                {
                    throw new InvalidOperationException(
                        "The XML store '" + XMLPath + "' has no <" + RootElementName + "> root element.");
                }

                return storeRoot;
            }

            /// <summary>Downloads <paramref name="url"/> and returns its body as text.</summary>
            /// <param name="url">Address accepted by <see cref="WebRequest.Create(string)"/>.</param>
            /// <returns>The decoded response body.</returns>
            private static string DownloadPage(string url)
            {
                WebRequest request = WebRequest.Create(url);

                // ContentType lives on the WebResponse base class, so no cast to
                // HttpWebResponse is needed and non-HTTP schemes work as well.
                using (WebResponse response = request.GetResponse())
                using (Stream body = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(body, ResolveEncoding(response.ContentType)))
                {
                    return reader.ReadToEnd();
                }
            }

            /// <summary>
            /// Picks the text encoding announced by the <c>charset</c> parameter of a
            /// Content-Type header, falling back to gb2312 when the parameter is missing,
            /// empty or names an encoding this runtime does not know.
            /// </summary>
            /// <param name="contentType">Raw Content-Type header value; may be null or empty.</param>
            /// <returns>The encoding to decode the response body with.</returns>
            private static Encoding ResolveEncoding(string contentType)
            {
                const string CharsetPrefix = "charset=";
                string charset = FallbackCharset;

                if (!string.IsNullOrEmpty(contentType))
                {
                    foreach (string part in contentType.Split(';'))
                    {
                        string parameter = part.Trim();
                        if (parameter.StartsWith(CharsetPrefix, StringComparison.OrdinalIgnoreCase))
                        {
                            // The value may be quoted, e.g. charset="utf-8".
                            string value = parameter.Substring(CharsetPrefix.Length).Trim().Trim('"', '\'');
                            if (value.Length > 0)
                            {
                                charset = value;
                            }

                            break;
                        }
                    }
                }

                try
                {
                    return Encoding.GetEncoding(charset);
                }
                catch (ArgumentException)
                {
                    // Unknown charset name announced by the server.
                    return Encoding.GetEncoding(FallbackCharset);
                }
            }

            /// <summary>Renders the given records as a complete HTML document containing one table.</summary>
            /// <param name="records">The <c>UserInfo</c> elements to render, one table row each.</param>
            /// <returns>The HTML markup.</returns>
            private static string BuildHtmlReport(IEnumerable<XElement> records)
            {
                StringBuilder html = new StringBuilder();
                html.Append("<html>");
                // The file is written as UTF-8; declare it so the header text is decoded correctly.
                html.Append("<head><meta charset=\"utf-8\"></head>");
                html.Append("<body>");
                html.Append("<table border=\"1\">");
                html.Append("<tr>");
                html.Append("<th>标题</th>");
                html.Append("<th>联系人</th>");
                html.Append("<th>电话</th>");
                html.Append("</tr>");

                foreach (XElement record in records)
                {
                    html.Append("<tr>");
                    html.Append("<td>" + EncodedChild(record, "Title") + "</td>");
                    html.Append("<td>" + EncodedChild(record, "Name") + "</td>");
                    html.Append("<td><img src='" + EncodedChild(record, "Tel") + "'/></td>");
                    html.Append("</tr>");
                }

                html.Append("</table>");
                html.Append("</body></html>");
                return html.ToString();
            }

            /// <summary>
            /// Returns the HTML-encoded text of a child element, or an empty string when
            /// the child is missing. Encoding matters because the values were scraped
            /// from an external page.
            /// </summary>
            /// <param name="record">The record element to read from.</param>
            /// <param name="childName">Name of the child element.</param>
            /// <returns>Text that is safe to place in element content or a quoted attribute.</returns>
            private static string EncodedChild(XElement record, string childName)
            {
                // The explicit XElement-to-string conversion yields null for a missing element.
                return WebUtility.HtmlEncode((string)record.Element(childName) ?? string.Empty);
            }
        }
    }

  • 相关阅读:
    (转)TextView 设置背景和文本颜色的问题
    (转)Android强制设置横屏或竖屏
    android栈和队列
    关于android开发环境中sdk和adt更新到22.6之后多了appcompat_v7
    (转)android平台下使用点九PNG技术
    (转)多重背包
    (转)完全背包
    (转)01背包
    Longest Palindromic Substring
    Median of Two Sorted Arrays
  • 原文地址:https://www.cnblogs.com/contain/p/3285699.html
Copyright © 2011-2022 走看看