一:前言:网页源码中有很多嵌套的标签
例如div标签嵌套如:bUTP<DIV>finally<div>aurora</div>@126.com</div><div class=\"Cited1\">ggff</div>
我们的网页解析工作中有时候需要解嵌套。通俗的讲就是把嵌套的标签以线性表的形式表示出来。还拿上面的例子来说明。即解嵌套为
<div>aurora</div>
<DIV>finally<div>aurora</div>@126.com</div>
<div class=\"Cited1\">ggff</div>
核心代码如下:
Code
/// <summary>
/// Helpers for flattening nested markup into a linear list of elements.
/// </summary>
class ThemeIRAssist
{
    /// <summary>
    /// Finds every balanced element with the given tag name in
    /// <paramref name="rawtext"/>, nested ones included, and appends the full
    /// text of each element (start tag through end tag) to
    /// <paramref name="result"/>.
    /// </summary>
    /// <param name="rawtext">
    /// Markup to scan. Declared <c>ref</c> only for compatibility with existing
    /// callers; it is never reassigned.
    /// </param>
    /// <param name="tags">
    /// Tag name to look for, e.g. "div". Matched case-insensitively. The value
    /// is inserted into a regular expression as-is.
    /// </param>
    /// <param name="result">
    /// List that receives the elements in the order their end tags occur, so an
    /// inner element is emitted before the element that encloses it. Declared
    /// <c>ref</c> only for compatibility; it is never reassigned.
    /// </param>
    public static void GetNodesByTags(ref string rawtext, string tags, ref List<string> result)
    {
        RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // The lookahead requires the tag name to end here, so "<div" no longer
        // matches longer names such as "<divider>".
        Regex regexBeginTag = new Regex("<(?:" + tags + @")(?=[\s/>])", options);
        // Optional whitespace before '>' is legal in an end tag.
        Regex regexEndTag = new Regex("</(?:" + tags + @")\s*>", options);

        MatchCollection beginMatches = regexBeginTag.Matches(rawtext);
        MatchCollection endMatches = regexEndTag.Matches(rawtext);

        // Offsets of start tags seen so far that have not been paired yet;
        // the top of the stack is always the innermost open element.
        Stack<int> openTags = new Stack<int>();
        int nextBegin = 0;

        foreach (Match endTag in endMatches)
        {
            // Register every start tag located before this end tag.
            while (nextBegin < beginMatches.Count && beginMatches[nextBegin].Index < endTag.Index)
            {
                openTags.Push(beginMatches[nextBegin].Index);
                nextBegin++;
            }

            // A stray end tag with no open start tag is ignored.
            if (openTags.Count == 0)
            {
                continue;
            }

            int start = openTags.Pop();
            // Use the real end-tag length rather than a constant so tag names
            // other than "div" are extracted completely.
            result.Add(rawtext.Substring(start, endTag.Index + endTag.Length - start));
        }
    }
}
/// <summary>
/// Records where a tag was found in the source text and whether that tag has
/// already been paired with a counterpart.
/// </summary>
class Position
{
    /// <summary>Character offset of the tag within the scanned text.</summary>
    public int nPos { get; set; }

    /// <summary>True once the tag has been consumed by a match.</summary>
    public bool VistStatus { get; set; }
}
/// <summary>
/// Helpers for flattening nested markup into a linear list of elements.
/// </summary>
class ThemeIRAssist
{
    /// <summary>
    /// Finds every balanced element with the given tag name in
    /// <paramref name="rawtext"/>, nested ones included, and appends the full
    /// text of each element (start tag through end tag) to
    /// <paramref name="result"/>.
    /// </summary>
    /// <param name="rawtext">
    /// Markup to scan. Declared <c>ref</c> only for compatibility with existing
    /// callers; it is never reassigned.
    /// </param>
    /// <param name="tags">
    /// Tag name to look for, e.g. "div". Matched case-insensitively. The value
    /// is inserted into a regular expression as-is.
    /// </param>
    /// <param name="result">
    /// List that receives the elements in the order their end tags occur, so an
    /// inner element is emitted before the element that encloses it. Declared
    /// <c>ref</c> only for compatibility; it is never reassigned.
    /// </param>
    public static void GetNodesByTags(ref string rawtext, string tags, ref List<string> result)
    {
        RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // The lookahead requires the tag name to end here, so "<div" no longer
        // matches longer names such as "<divider>".
        Regex regexBeginTag = new Regex("<(?:" + tags + @")(?=[\s/>])", options);
        // Optional whitespace before '>' is legal in an end tag.
        Regex regexEndTag = new Regex("</(?:" + tags + @")\s*>", options);

        MatchCollection beginMatches = regexBeginTag.Matches(rawtext);
        MatchCollection endMatches = regexEndTag.Matches(rawtext);

        // Offsets of start tags seen so far that have not been paired yet;
        // the top of the stack is always the innermost open element.
        Stack<int> openTags = new Stack<int>();
        int nextBegin = 0;

        foreach (Match endTag in endMatches)
        {
            // Register every start tag located before this end tag.
            while (nextBegin < beginMatches.Count && beginMatches[nextBegin].Index < endTag.Index)
            {
                openTags.Push(beginMatches[nextBegin].Index);
                nextBegin++;
            }

            // A stray end tag with no open start tag is ignored.
            if (openTags.Count == 0)
            {
                continue;
            }

            int start = openTags.Pop();
            // Use the real end-tag length rather than a constant so tag names
            // other than "div" are extracted completely.
            result.Add(rawtext.Substring(start, endTag.Index + endTag.Length - start));
        }
    }
}
/// <summary>
/// Pairs the offset of a tag in the scanned text with a flag saying whether
/// the tag has already been used in a match.
/// </summary>
class Position
{
    // Offset of the tag in the source string.
    private int _offset;

    // Set once the tag has been matched.
    private bool _consumed;

    /// <summary>Character offset of the tag within the scanned text.</summary>
    public int nPos
    {
        get
        {
            return _offset;
        }
        set
        {
            _offset = value;
        }
    }

    /// <summary>True once the tag has been consumed by a match.</summary>
    public bool VistStatus
    {
        get
        {
            return _consumed;
        }
        set
        {
            _consumed = value;
        }
    }
}
Main函数测试如下:
Code
/// <summary>
/// Demo driver: reads a markup file, extracts every div element with
/// <c>ThemeIRAssist.GetNodesByTags</c>, keeps only the elements whose start
/// tag carries class="Cited1", and prints the counts and the survivors.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Sample content for the input file:
    //   bUTP<DIV>finally<div>aurora</div>@126.com</div><div class="Cited1">ggff</div>
    string rawtext;
    // Dispose the reader as soon as the text is loaded so the file handle
    // is not held for the remainder of the program.
    using (StreamReader sr = new StreamReader("D:\\finally.txt", Encoding.GetEncoding("utf-8")))
    {
        rawtext = sr.ReadToEnd();
    }

    List<string> result = new List<string>();
    ThemeIRAssist.GetNodesByTags(ref rawtext, "div", ref result);
    Console.WriteLine(result.Count);

    // Keep only elements that start with <div class="Cited1" ...
    Regex regexStadard = new Regex(@"^<div\s+class=""Cited1""", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    result.RemoveAll(node => !regexStadard.IsMatch(node));
    Console.WriteLine(result.Count);

    foreach (string s in result)
    {
        Console.WriteLine("***************************************************");
        Console.Write(s);
        Console.WriteLine("***************************************************");
    }

    // Wait for a key press so the console window stays open.
    Console.Read();
}
/// <summary>
/// Demo driver: reads a markup file, extracts every div element with
/// <c>ThemeIRAssist.GetNodesByTags</c>, keeps only the elements whose start
/// tag carries class="Cited1", and prints the counts and the survivors.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Sample content for the input file:
    //   bUTP<DIV>finally<div>aurora</div>@126.com</div><div class="Cited1">ggff</div>
    string rawtext;
    // Dispose the reader as soon as the text is loaded so the file handle
    // is not held for the remainder of the program.
    using (StreamReader sr = new StreamReader("D:\\finally.txt", Encoding.GetEncoding("utf-8")))
    {
        rawtext = sr.ReadToEnd();
    }

    List<string> result = new List<string>();
    ThemeIRAssist.GetNodesByTags(ref rawtext, "div", ref result);
    Console.WriteLine(result.Count);

    // Keep only elements that start with <div class="Cited1" ...
    Regex regexStadard = new Regex(@"^<div\s+class=""Cited1""", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    result.RemoveAll(node => !regexStadard.IsMatch(node));
    Console.WriteLine(result.Count);

    foreach (string s in result)
    {
        Console.WriteLine("***************************************************");
        Console.Write(s);
        Console.WriteLine("***************************************************");
    }

    // Wait for a key press so the console window stays open.
    Console.Read();
}