zoukankan      html  css  js  c++  java
  • 图片抓取器web + winform

    原文发布时间为:2009-11-21 —— 来源于本人的百度文章 [由搬家工具导入]

    请先学习:http://hi.baidu.com/handboy/blog/item/bfef61000a67ea16738b6565.html

    // Named-group demo: capture five lowercase letters as group "g1", then
    // require the same text again after "some" via the backreference \1
    // (g1 is the only group in the pattern, so it is also group number 1).
    string x = "Live for nothing,die for something";
    Regex r = new Regex(@"^Live for no(?<g1>[a-z]{5}),die for some\1$");
    // Match once and reuse the result instead of calling IsMatch and then Match.
    Match m = r.Match(x);
    if (m.Success)
    {
        Console.WriteLine("group1 value:" + m.Groups["g1"].Value); // Output: thing
    }
    //可根据组名进行索引。使用以下格式为标识一个组的名称(?<groupname>…)。

    // Collapse an immediately repeated word: ([a-z]+) captures a word and \1
    // requires the same word again after a single space.
    string x = "Live for nothing nothing";
    Regex r = new Regex(@"([a-z]+) \1");
    if (r.IsMatch(x))
    {
        // In the replacement string, "$1" refers to the first captured group.
        x = r.Replace(x, "$1");
        Console.WriteLine("var x:" + x); // Output: Live for nothing
    }
    //删除原字符串中重复出现的“nothing”。在表达式之外,使用“$1”来引用第一个组;下面则是通过组名来引用:
    // Same duplicate-word removal, but the captured word is named "g1".
    // \1 still refers to it inside the pattern because it is the only group.
    string x = "Live for nothing nothing";
    Regex r = new Regex(@"(?<g1>[a-z]+) \1");
    if (r.IsMatch(x))
    {
        // In the replacement string, "${g1}" refers to the group by name.
        x = r.Replace(x, "${g1}");
        Console.WriteLine("var x:" + x); // Output: Live for nothing
    }

    // Non-capturing group demo: (?: ... ) groups the sub-pattern without
    // storing what it matched, so group 1 does not exist and reads as empty.
    string input = "Live for nothing";
    Regex nonCapturing = new Regex(@"^Live for no(?:[a-z]{5})$");
    Match result = nonCapturing.Match(input);
    if (result.Success)
    {
        string firstGroup = result.Groups[1].Value;
        Console.WriteLine("group1 value:" + firstGroup); // Output: (empty)
    }
    //在组前加上“?:”表示这是个“非捕获组”,即引擎将不保存该组的内容。

    ========

    最近闲来无事,重温了一下正则表达式,然后做了这个 图片抓取器。
    原理是:分析新浪博文页面的共同特征,用正则表达式匹配出其中的图片地址,并自动下载到本地。如果有一天新浪博文的网页格式变了,这个程序可能就用不了了,但可以通过修改正则表达式来适配。这只是一个范例,O(∩_∩)O哈!
    winform下载预览:http://www.xmaspx.com/Services/FileAttachment.ashx?AttachmentID=51
    首先
    在根目录下,建一个名为 DownLoadImages 的文件夹

    前台:
    <%@ Page Language="C#" AutoEventWireup="true" CodeFile="SinaImage.aspx.cs" Inherits="SinaImage" %>
    <%-- Page for the image grabber; its server-side logic lives in the SinaImage class (SinaImage.aspx.cs). --%>

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

    <html xmlns="http://www.w3.org/1999/xhtml" >
    <head runat="server">
    <title>无标题页</title>
    </head>
    <body>
    <form id="form1" runat="server">
    <div>
    <%-- TextBox1: URL of the page to scan; pre-filled with a Sina blog article-list URL. --%>
    <asp:TextBox ID="TextBox1" runat="server" Width="495px">http://blog.sina.com.cn/s/articlelist_1270540911_0_1.html</asp:TextBox>
    <%-- Button1: shows a client-side alert first, then posts back to the Button1_Click handler. --%>
    <asp:Button ID="Button1" runat="server" OnClick="Button1_Click" Text="Button" OnClientClick="javascript:alert('开始下载,可能要等几分钟,请勿关闭')" /><br />
    <%-- TextBox2: multi-line text box; the code-behind writes the image URLs it attempts to download here. --%>
    <asp:TextBox ID="TextBox2" runat="server" Height="296px" TextMode="MultiLine" Width="498px"></asp:TextBox></div>
    </form>
    </body>
    </html>

    后台

    using System;
    using System.Web;
    using System.Web.UI.WebControls;
    using System.Net;
    using System.IO;
    using System.Text;
    using System.Collections;
    using System.Text.RegularExpressions;

    /// <summary>
    /// Code-behind for SinaImage.aspx. Reads an article-list URL from TextBox1,
    /// collects the blog article links found on that page, scans each article
    /// for URLs that look like images and saves them into the DownLoadImages
    /// folder. The URLs that were tried are written to TextBox2.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the URL in TextBox1 is user input and is fetched by the
    /// server. Only HTTP(S) requests get through (the HttpWebRequest cast in
    /// GetUrl rejects other schemes), but internal hosts are still reachable;
    /// restrict the allowed hosts before exposing this page publicly.
    /// </remarks>
    public partial class SinaImage : System.Web.UI.Page
    {
        // Links to individual Sina blog articles on an article-list page.
        private const string ArticleUrlPattern = @"http://blog\.sina\.com\.cn/s/blog_([\w])*\.html";

        // Any absolute http:// URL, with an optional path/query part.
        private const string AnyUrlPattern = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

        protected void Page_Load(object sender, EventArgs e)
        {

        }

        /// <summary>
        /// Click handler for Button1: runs the whole crawl-and-download pass
        /// and reports the number of saved images through a startup alert.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        protected void Button1_Click(object sender, EventArgs e)
        {
            int num = 0;
            TextBox2.Text = "";

            // Collect the output once instead of re-assigning TextBox2.Text per URL.
            StringBuilder output = new StringBuilder();

            ArrayList arrUrl = GetUrl(this.TextBox1.Text, ArticleUrlPattern);

            for (int i = 0; i < arrUrl.Count; i++)
            {
                string imgPage = arrUrl[i].ToString();

                ArrayList arrImgUrl;
                try
                {
                    arrImgUrl = GetUrl(imgPage, AnyUrlPattern);
                }
                catch (WebException ex)
                {
                    // One unreachable article must not abort the remaining ones.
                    output.Append("[page failed: " + imgPage + " - " + ex.Message + "] ");
                    continue;
                }

                for (int j = 0; j < arrImgUrl.Count; j++)
                {
                    string imgUrl = arrImgUrl[j].ToString();
                    if (!IsCandidateImageUrl(imgUrl))
                    {
                        continue;
                    }

                    output.Append(imgUrl + " ");
                    try
                    {
                        // Include the article index as well as the URL index so that
                        // images from different articles saved within the same second
                        // do not overwrite each other.
                        DownLoadImage(imgUrl, i.ToString() + "_" + j.ToString());
                        num++;
                    }
                    catch (Exception ex)
                    {
                        // Downloading stays best-effort, but the failure is now
                        // reported instead of being silently discarded.
                        output.Append("[download failed: " + ex.Message + "] ");
                    }
                }
            }

            TextBox2.Text = output.ToString();
            ClientScript.RegisterStartupScript(this.GetType(), "alert", "alert('下载了" + num.ToString() + "张,请打开文件夹DownLoadImages,以缩略图形式进行筛选')", true);
        }

        /// <summary>
        /// Decides from the URL text alone whether a URL should be downloaded.
        /// </summary>
        /// <param name="imgUrl">Absolute URL found on an article page.</param>
        /// <returns>
        /// True when the URL contains "photo", "image" or "img" and does not
        /// contain "simg", "sinaimg" or ".js".
        /// </returns>
        private static bool IsCandidateImageUrl(string imgUrl)
        {
            // NOTE(review): "simg"/"sinaimg" are presumably Sina's own static
            // assets rather than post images - confirm before changing the filter.
            if (imgUrl.Contains("simg") || imgUrl.Contains("sinaimg") || imgUrl.Contains(".js"))
            {
                return false;
            }

            return imgUrl.Contains("photo") || imgUrl.Contains("image") || imgUrl.Contains("img");
        }

        /// <summary>
        /// Downloads one resource into the DownLoadImages folder as
        /// &lt;timestamp&gt;&lt;fileName&gt;.jpg.
        /// </summary>
        /// <param name="fromUrl">Address of the resource to download.</param>
        /// <param name="fileName">Suffix appended to the timestamp in the file name.</param>
        protected void DownLoadImage(string fromUrl, string fileName)
        {
            // "HH" is the 24-hour clock; the former "hh" produced the same name
            // for morning and afternoon downloads.
            string savePath = Path.Combine(
                Server.MapPath("DownLoadImages/"),
                DateTime.Now.ToString("yyyyMMddHHmmss") + fileName + ".jpg");

            using (WebClient myWebClient = new WebClient())
            {
                myWebClient.DownloadFile(fromUrl, savePath);
            }
        }

        /// <summary>
        /// Fetches a page and returns the distinct substrings that match a pattern.
        /// </summary>
        /// <param name="web_url">Address of the page to fetch; must be an HTTP(S) URL.</param>
        /// <param name="p">Regular expression, applied case-insensitively to the page text.</param>
        /// <returns>
        /// An ArrayList of strings holding each distinct match once, in order of
        /// first appearance.
        /// </returns>
        protected ArrayList GetUrl(string web_url, string p)
        {
            string all_code;
            HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create(web_url);

            // Dispose the response and reader even when reading throws.
            using (WebResponse all_codeResponse = all_codeRequest.GetResponse())
            using (StreamReader the_Reader = new StreamReader(all_codeResponse.GetResponseStream(), Encoding.GetEncoding("GB2312")))
            {
                all_code = the_Reader.ReadToEnd();
            }

            ArrayList arrUrl = new ArrayList();
            Regex re = new Regex(p, RegexOptions.IgnoreCase);

            foreach (Match match in re.Matches(all_code))
            {
                string name = match.Value;

                // The old code compared against a list that was never filled, so
                // duplicates were never filtered; check the result list itself.
                if (!arrUrl.Contains(name))
                {
                    arrUrl.Add(name);
                }
            }

            return arrUrl;
        }
    }

  • 相关阅读:
    学指令 emacs篇
    c 排序 总结
    C# WinForm源码下载
    PetShop下载
    Visual Studio常用技巧
    人事信息管理系统(PMS)
    css实现细表格原理
    GridView导出Excel/Word
    dotnet程序员装机必备工具
    容器练习一
  • 原文地址:https://www.cnblogs.com/handboy/p/7158314.html
Copyright © 2011-2022 走看看