// Collects the href targets of <a>/<area> tags found in htmlData into two lists:
//   linkExternal - absolute http:// or https:// URLs, stored unchanged
//   linkLocal    - everything else, stored with startingUrl prepended
// htmlData, startingUrl and link are assumed to be declared by the surrounding code.
ArrayList linkLocal = new ArrayList();
ArrayList linkExternal = new ArrayList();
// Dodgy Regex will find *some* links: only tags whose first attribute is a
// double-quoted href are recognised.
foreach (Match match in Regex.Matches(htmlData
    , @"(?<=<(a|area)\s+href="").*?(?=""\s*/?>)"
    , RegexOptions.IgnoreCase | RegexOptions.ExplicitCapture)) {
    // The match starts right after the opening quote and lazily runs up to the
    // quote that closes the tag, so it can still carry trailing attributes.
    link = match.Value;

    // The URL ends at the first space or double quote, whichever comes first
    // (no spaces in URLs, no single-quoted attributes). IndexOfAny returns -1
    // only when neither character is present; taking the smaller of two
    // separate IndexOf results would let a missing character (-1) win and
    // skip the cut entirely.
    int chopPos = link.IndexOfAny(new char[] { ' ', '"' });
    if (chopPos > 0) {
        link = link.Substring(0, chopPos);
    }

    // Ordinal, case-insensitive scheme test: no culture-dependent lowercasing
    // and no temporary strings.
    bool isAbsoluteHttp =
        link.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase) ||
        link.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase);

    if ((link.Length > 8) && isAbsoluteHttp) {
        // Assumes all links beginning with http:// or https:// are _external_
        linkExternal.Add(link);
    } else {
        // Otherwise they are treated as "relative"/internal links, so the base
        // URL is prepended. Other schemes (mailto:, ftp://, ...) still end up here.
        link = startingUrl + link;
        linkLocal.Add(link);
    }
}
.*? 非贪婪或最小匹配.
?<= 正向后行断言（lookbehind）：只用来定位，断言内容不包含在匹配值里.
?= 正向先行断言（lookahead）：同样不包含在匹配值里.. (注意：上面那个 ?<= 如果去掉了 < 号就变成 ?= 了，会造成不同的结果)
"" 因为前面加了个@ 所以这边的""变成了"的意思.
(a|area) 匹配其中的任意一个，即 a 或 area.
RegexOptions.ExplicitCapture 指的是没有命名的组不会被捕获.. 其实这里 ?<= 和 ?= 已经代替了它的作用了
另一种捕获的方法：括号加命名，例如 (?<banyi>.*?)，到时候就可以用 match.Groups["banyi"].Value 这种形式来获得了；Replace 的时候也可以指定这个命名组.
? 的另一个作用就是匹配 0 次或 1 次；+ 号是 1 次或多次；* 号是 0 次或多次.