zoukankan html css js c++ java
.net版类似火车头的网页采集

最近因工作需要，需要写一个类似火车头的 Web 采集器。
各位有什么建议吗？
由于还不太会正则表达式，只能简单地写一段测试代码，代码如下：
代码
 protected void Button1_Click(object sender, EventArgs e)
    {
        string content,sql;
        WebClient client = new WebClient();
        string content2 = null;
        MatchCollection matches;
        int iStart, iEnd;
        Regex regex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");  //分组捕获url链接以及对应的标题，一个列表页中有多个网页链接
        try
        {
            for (int i = Convert.ToInt32(exp1.Text); i <= Convert.ToInt32(exp2.Text); i++)
            {
                
                bool bl = test("http://souky.eol.cn/HomePage/index_" + i + ".html");
                
                if (bl == false)
                {
                    continue;
                }
                else
                {
                    content = client.DownloadString("http://souky.eol.cn/HomePage/index_" + i + ".html");
                    
                    matches = regex.Matches(content);
                    foreach (Match m in matches)
                    {
                        if (m.Groups["url"].Value.StartsWith("/HomePage/takeinfo/" + i))
                        {
                            tb.Text += m.Groups["url"].Value + "\n";
                            content2 = client.DownloadString("http://souky.eol.cn" + m.Groups["url"].Value);
                            //int iStart = content2.IndexOf("<td class=\"font14\" style=\"word-wrap:break-word;\">");
                            
                            if (content2.IndexOf("<td class=\"font14\" style=\"word-wrap:break-word;\">") == -1)
                            {
                                iStart = content2.IndexOf("<div class=\"line_24 pad_c\">");
                                iEnd = content2.IndexOf("</div>");
                            }
                            else
                            {
                                iStart = content2.IndexOf("<td class=\"font14\" style=\"word-wrap:break-word;\">");
                                iEnd = content2.IndexOf("<td height=50>");
                            }

                            //tbcontent.Text += content2.Substring(iStart, iEnd - iStart);
                            sql = "insert into temp (subContent) values('" + NoHTML(content2.Substring(iStart, iEnd - iStart)) + "')";
                            try
                            {
                                ULCode.XSql.MsSql.Execute(sql);
                            }
                            catch (Exception EX)
                            {
                                continue;
                            }
                            finally 
                            {
                                tb.Text = "输出";
                            }
                            //if (exe(sql)!=1)
                            //{
                            //    ULCode.Debug.Alert(Page,"123");
                            //    Response.Write("http://souky.eol.cn/HomePage/index_" + i + ".html");
                            //    continue;
                            //}
                            //else
                            //{
                            //    ULCode.XSql.MsSql.Execute(sql);
                            //}
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            tb.Text = ex.Message;
        }
        finally 
        {
            client.Dispose();
        }

    }
    private int exe(string sql) {
       
        int IR = ULCode.XSql.MsSql.Execute(sql);
        return IR;
    }
    private bool test(string url) {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        HttpWebResponse response;
        //request.KeepAlive = false;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
            if (response.StatusCode == HttpStatusCode.NotFound)
            {
                response.Close();
                
                return false;
               
            }
            response.Close();
            return true;
        }
        catch (Exception ex)
        {
            //response.Close();
            return false;
        }
        
    }
    //清除HTML函数 
    public static string NoHTML(string Htmlstring)
    {

        //删除脚本

        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);

        ////删除HTML
        Htmlstring = Regex.Replace(Htmlstring, @"<(/?p|br[^>]*)>;", "[--$1--]", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring,"\"", ""); //去掉引号 
        //Htmlstring = Regex.Replace(Htmlstring, "\"", ""); //去掉引号 
        Htmlstring = Regex.Replace(Htmlstring, "“", ""); //去掉引号 
        Htmlstring = Regex.Replace(Htmlstring, "”", ""); //去掉引号 
        Htmlstring = Regex.Replace(Htmlstring, "'", ""); //去掉引号 
        //Htmlstring = Regex.Replace(Htmlstring, @"\+", ""); //去掉引号
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);

        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        Htmlstring.Replace("<", "");

        Htmlstring.Replace(">", "");

        //Htmlstring.Replace("<BR>", "\r\n");
        //Htmlstring = Regex.Replace(Htmlstring, "<[^>]*?>", "");
        Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();

        return Htmlstring;
        
    }
查看全文
相关阅读:
day3 集合
 进度条
 day3 文件操作 seek tell 修改
 day3 函数
 同学满分代码，购物车。
day2杂---三元运算 is
模块sys os
day2--列表/元组/字符串/字典
 一、Git配置
 四、TestNG 批量执行脚本Runner.xml
原文地址：https://www.cnblogs.com/OK_Blog/p/1822426.html
.net版 类似火车头的网页采集

.net版类似火车头的网页采集