zoukankan      html  css  js  c++  java
  • .net版 类似火车头的网页采集

    最近因工作需要,需写一个类似火车头的web采集器

    各位有什么建议啊?

    由于不熟悉正则表达式,只能简单地写一段测试代码,代码如下

    代码
    /// <summary>
    /// Crawls the list pages index_{exp1}.html .. index_{exp2}.html on souky.eol.cn,
    /// follows every link whose href starts with "/HomePage/takeinfo/{page}",
    /// cuts the article fragment out of each detail page, strips its markup with
    /// <c>NoHTML</c> and inserts the text into the <c>temp</c> table.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event arguments (unused).</param>
    /// <remarks>
    /// Any exception outside the per-row insert (bad page numbers, download failure)
    /// stops the crawl and its message is shown in <c>tb</c>.
    /// </remarks>
    protected void Button1_Click(object sender, EventArgs e)
    {
        const string siteRoot = "http://souky.eol.cn";

        // Captures each anchor's href ("url") and inner text ("text"); a list page holds many links.
        Regex regex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

        try
        {
            using (WebClient client = new WebClient())
            {
                int firstPage = Convert.ToInt32(exp1.Text);
                int lastPage = Convert.ToInt32(exp2.Text);

                for (int i = firstPage; i <= lastPage; i++)
                {
                    string listUrl = siteRoot + "/HomePage/index_" + i + ".html";

                    // Skip list pages that do not exist or cannot be reached.
                    if (!test(listUrl))
                    {
                        continue;
                    }

                    string content = client.DownloadString(listUrl);

                    foreach (Match m in regex.Matches(content))
                    {
                        string href = m.Groups["url"].Value;
                        if (!href.StartsWith("/HomePage/takeinfo/" + i, StringComparison.Ordinal))
                        {
                            continue;
                        }

                        tb.Text += href + "\n";

                        string detailPage = client.DownloadString(siteRoot + href);
                        string fragment = ExtractContentFragment(detailPage);
                        if (fragment == null)
                        {
                            // Neither known layout matched; skip this page instead of
                            // letting Substring throw and abort the whole crawl.
                            continue;
                        }

                        // NOTE(review): the statement is built by concatenation and relies on
                        // NoHTML removing both quote characters; switch to a parameterized
                        // command if ULCode.XSql offers one.
                        string sql = "insert into temp (subContent) values('" + NoHTML(fragment) + "')";
                        try
                        {
                            ULCode.XSql.MsSql.Execute(sql);
                        }
                        catch (Exception ex)
                        {
                            // Best effort: one failed insert must not stop the crawl,
                            // but leave a trace so the failure is not silent.
                            System.Diagnostics.Trace.TraceWarning("Button1_Click: insert failed: " + ex.Message);
                            continue;
                        }
                        finally
                        {
                            tb.Text = "输出";
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            tb.Text = ex.Message;
        }
    }

    /// <summary>
    /// Returns the part of a detail page between the content start marker and the
    /// first matching end marker that follows it, or null when the markers are absent.
    /// </summary>
    /// <param name="page">Full HTML of a detail page.</param>
    private static string ExtractContentFragment(string page)
    {
        // Preferred layout: table cell, terminated by the next "<td height=50>".
        string endMarker = "<td height=50>";
        int start = page.IndexOf("<td class=\"font14\" style=\"word-wrap:break-word;\">", StringComparison.Ordinal);

        if (start < 0)
        {
            // Fallback layout: div container, terminated by its closing tag.
            start = page.IndexOf("<div class=\"line_24 pad_c\">", StringComparison.Ordinal);
            endMarker = "</div>";
        }

        if (start < 0)
        {
            return null;
        }

        // Search for the end marker only after the start marker so the length is never negative.
        int end = page.IndexOf(endMarker, start, StringComparison.Ordinal);
        if (end < 0)
        {
            return null;
        }

        return page.Substring(start, end - start);
    }
    /// <summary>
    /// Passes the SQL text to <c>ULCode.XSql.MsSql.Execute</c> and returns its integer result.
    /// </summary>
    /// <param name="sql">SQL statement to execute.</param>
    /// <returns>The value returned by <c>ULCode.XSql.MsSql.Execute</c>.</returns>
    private int exe(string sql)
    {
        return ULCode.XSql.MsSql.Execute(sql);
    }
    /// <summary>
    /// Checks whether a URL can be fetched by issuing a request for it.
    /// </summary>
    /// <param name="url">Absolute HTTP URL to probe.</param>
    /// <returns>
    /// <c>true</c> when a response is received whose status is not 404;
    /// <c>false</c> when the status is 404 or the request fails for any reason.
    /// </returns>
    private bool test(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        try
        {
            // 'using' guarantees the response (and its connection) is released on every path.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                return response.StatusCode != HttpStatusCode.NotFound;
            }
        }
        catch (WebException ex)
        {
            // Protocol errors such as 404 surface as WebException; the response
            // attached to the exception still holds a connection and must be closed.
            if (ex.Response != null)
            {
                ex.Response.Close();
            }
            return false;
        }
        catch (Exception)
        {
            // Any other failure is treated as "not available".
            return false;
        }
    }
    /// <summary>
    /// Strips HTML markup from a fragment and returns the remaining text, HTML-encoded and trimmed.
    /// </summary>
    /// <param name="Htmlstring">HTML fragment to clean; may be null or empty.</param>
    /// <returns>
    /// The text with script blocks, tags, quote characters, comments and angle brackets
    /// removed, a set of common entities decoded, then HTML-encoded and trimmed.
    /// Returns an empty string for null or empty input.
    /// </returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks. Singleline lets '.' cross line breaks so that
        // multi-line scripts are removed together with their bodies.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Turns <p>, </p> and <br...> into [--tag--] markers, but only when the tag is
        // immediately followed by ';'.
        // NOTE(review): the trailing ';' looks unintended; kept as-is to avoid changing output.
        Htmlstring = Regex.Replace(Htmlstring, @"<(/?p|br[^>]*)>;", "[--$1--]", RegexOptions.IgnoreCase);

        // Drop double and single quotes (callers embed the result in a quoted SQL literal).
        Htmlstring = Htmlstring.Replace("\"", "").Replace("'", "");

        // Remove all remaining tags.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

        // Collapse a line break together with the whitespace that follows it.
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

        // Remove leftover comment delimiters (an unterminated comment is cut to end of line).
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode common named/numeric entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

        // Any other numeric entity is dropped.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // Strings are immutable: the result of Replace must be assigned back,
        // otherwise angle brackets produced by the entity decoding above survive.
        Htmlstring = Htmlstring.Replace("<", "").Replace(">", "");

        // HttpUtility.HtmlEncode performs the same encoding as Server.HtmlEncode
        // without requiring an active HttpContext.
        return HttpUtility.HtmlEncode(Htmlstring).Trim();
    }
  • 相关阅读:
    科学计算和可视化
    利用Python制作GIF图片
    模拟体育竞技分析
    词云(傲慢与偏见)
    词频统计+词云(傲慢与偏见)
    汉诺塔问题
    Python 的turtle笔记
    有进度条的圆周率计算
    Python 第二周练习
    warning: deprecated conversion from string constant to ‘char*’
  • 原文地址:https://www.cnblogs.com/OK_Blog/p/1822426.html
Copyright © 2011-2022 走看看