zoukankan      html  css  js  c++  java
  • 正则抓取网页所有href和src

    根据抓取到的页面内容，用正则表达式匹配页面中所有的 href 和 src 属性。

    // Value sent as the User-Agent header on the outgoing request made by getVerificationCode.
    string UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:29.0) Gecko/20100101 Firefox/29.0";
        // Overwritten by getVerificationCode with the Content-Type response header of the last successful download.
        string ContentType = "";
    
        // Base address against which Application_BeginRequest resolves the incoming Request.RawUrl.
        Uri strReqUrl = new Uri("http://m.lhrb.ufstone.net/");
        /// <summary>
        /// Fetches the page that corresponds to the incoming request from the upstream
        /// site (<c>strReqUrl</c>), runs it through <c>GetHtmlUrl</c>, writes the resulting
        /// HTML to the response and ends the request.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        protected void Application_BeginRequest(object sender, EventArgs e)
        {
            // Resolve the requested path and query against the upstream base address.
            Uri target = new Uri(strReqUrl, Request.RawUrl);
            byte[] payload = getVerificationCode(target);

            // getVerificationCode returns null when the download fails. Decoding a null
            // buffer would throw ArgumentNullException, so answer with 502 instead.
            if (payload == null)
            {
                Response.StatusCode = (int)System.Net.HttpStatusCode.BadGateway;
                Response.End();
                return;
            }

            // The payload is always decoded as gb2312, regardless of the Content-Type
            // header that the upstream server reported.
            StringBuilder strHtml = new StringBuilder(Encoding.GetEncoding("gb2312").GetString(payload));
            GetHtmlUrl(ref strHtml);
            Response.Write(strHtml.ToString());
            Response.End();
        }
        /// <summary>
        /// Downloads the raw bytes of <paramref name="url"/> using browser-like request
        /// headers and stores the response's Content-Type header in <c>ContentType</c>.
        /// </summary>
        /// <param name="url">Absolute address of the resource to download.</param>
        /// <returns>
        /// The response body, or <c>null</c> when the download fails with a
        /// <see cref="WebException"/>.
        /// </returns>
        public byte[] getVerificationCode(Uri url)
        {
            // WebClient is IDisposable; the using block releases it on every exit path.
            using (WebClient client = new WebClient())
            {
                client.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
                client.Headers.Add("Accept-Language", "    zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
                client.Headers.Add("User-Agent", this.UserAgent);
                client.Credentials = CredentialCache.DefaultCredentials;
                try
                {
                    byte[] pageData = client.DownloadData(url.AbsoluteUri);
                    ContentType = client.ResponseHeaders["Content-Type"];
                    return pageData;
                }
                catch (WebException)
                {
                    // Callers treat null as "download failed". Only network/HTTP errors
                    // are mapped to null, so programming errors are no longer hidden.
                    return null;
                }
            }
        }
    View Code
        /// <summary>
        /// Scans the HTML for <c>src=...</c> and <c>href=...</c> attributes and collects
        /// the text of every match, one per line. The HTML itself is not modified.
        /// </summary>
        /// <param name="strHtml">HTML to scan; a null builder is ignored.</param>
        void GetHtmlUrl(ref StringBuilder strHtml)
        {
            if (strHtml == null)
            {
                return;
            }

            // Attribute name, optional whitespace around '=', then either a double-quoted
            // value or an unquoted run of non-whitespace characters. Written as a verbatim
            // string so the backslashes and embedded quotes survive intact.
            const string pattern = @"(src|href)\s*=\s*(?:""(?<1>[^""]*)""|(?<1>\S+))";

            StringBuilder found = new StringBuilder();
            foreach (Match match in Regex.Matches(strHtml.ToString(), pattern))
            {
                // The whole "name=value" text of the match is recorded.
                found.AppendLine(match.Value);
            }

            // NOTE(review): the collected list is not returned or stored anywhere, so
            // callers currently observe no effect - confirm what should consume it.
        }
  • 相关阅读:
    剑指offer-整数中1出现的次数
    剑指offer-连续子数组的最大和
    剑指offer-最小的k个数
    剑指offer-数组中超过一半的数字
    剑指offer-二叉搜索树与双向链表
    剑指offer-复杂链表的复制
    剑指offer-二叉树中和为某一值的路径
    剑指offer-二叉搜索树的后序遍历
    Alpha 冲刺 (7/10)
    Alpha 冲刺 (6/10)
  • 原文地址:https://www.cnblogs.com/xuxiaoshuan/p/3817662.html
Copyright © 2011-2022 走看看