本例使用类与TList相结合,用简洁的方法,实现了一个 HTML 解析与格式化功能。
所用到的知识点如下:
1.类的提前申明
2.TList用法
3.String的指针操作
4.单例设计
5.递归用法
编程是综合实力的较量,把单个技术小点,结合起来,实现一个具体的功能才能创造价值。
为了让代码漂亮,需要反复修改,善用重构工具。
写完本例后的思考:
此类解析文本的工作,不适合用Class来实现,应该用接口。
原因是,如果要取Class中的Item并使用,此时Item到底由谁来负责释放的问题变得复杂了。
如:SuperObject.pas 解析JSON就是用的接口。系统自带单元,解析HTML Document 也是用的接口。
本例源码下载(XE8)
unit uHtmlItem; interface uses uSimpleList; type THtmlItem = class; // 类型提前申明 THtmlItemList = class(TSimpleList<THtmlItem>) private function FindIndexByTagName(ATagName: string): integer; protected procedure FreeItem(Item: THtmlItem); override; end; THtmlItem = class private FTagName: string; Taghead: string; TagTail: string; TagHeadBegin: integer; TagHeadEnd: integer; TagTailBegin: integer; TagTailEnd: integer; FLevel: integer; // 层级数 private FChildren: THtmlItemList; // 为递归做准备 FParent: THtmlItem; FHtml: string; // FHtml 单例 function GetHtml: string; procedure SetHtml(const Value: string); function AddChild: THtmlItem; overload; function SpaceTimes(ATimes: integer): string; function InnerGetHtmlText: string; public constructor Create; destructor Destroy; override; protected property Html: string read GetHtml write SetHtml; public function GetHtmlText: string; function GetFormatedHtmlText: string; public class function ParseHtml(AHtml: string): THtmlItem; end; implementation { THtmlItemList } uses System.SysUtils; // 跳过所有的空白 char ,直至找到一个非空白的char function SkipBlankChar(const S: string; AStartPos: integer): integer; const BlankChars: array [0 .. 3] of char = (#$20, #$09, #$0A, #$0D); var D: PChar; C: char; i: integer; begin Result := AStartPos; D := @S[AStartPos]; for i := AStartPos to length(S) do begin for C in BlankChars do if D^ <> C then // 指针的使用 begin Result := i; exit; end; inc(D); end; end; // 搜索 Char function SearchChar(const S: string; AStartPos: integer; C: char): integer; var i: integer; D: PChar; begin Result := 0; D := @S[AStartPos]; for i := AStartPos to length(S) do begin if D^ = C then begin Result := i; exit; end; inc(D); end; end; // 搜 <html > function SearchTagHead(const S: string; AStartPos: integer; var ABeginPos, AEndPos: integer): boolean; var nPos, nStrLen: integer; begin Result := false; nStrLen := length(S); ABeginPos := SearchChar(S, AStartPos, '<'); nPos := ABeginPos + 1; if (ABeginPos > 0) and (nPos < nStrLen) then begin AEndPos := SearchChar(S, nPos, '>'); Result := AEndPos > 0; end; end; function InnerGetTagName(const S: string; AStartPos: integer = 2): string; const TailChar: array [0 .. 4] of char = (#$20, #$09, #$0A, #$0D, '>'); var i, nPos, nStrLen: integer; D: PChar; C: char; nBegin: integer; begin Result := ''; nStrLen := length(S); nPos := AStartPos; nBegin := SkipBlankChar(S, nPos); nPos := nBegin + 1; if (nBegin > 0) and (nPos < nStrLen) then begin D := @S[nPos]; for i := nPos to nStrLen do begin for C in TailChar do if D^ = C then begin Result := copy(S, nBegin, i - nBegin); exit; end; inc(D); end; end; end; // ATagHead -- <html xx=123> ,输出:html function GetTagNameByHead(const ATagHead: string): string; inline; begin Result := InnerGetTagName(ATagHead, 2); end; // ATagTail </html> ,输出 html function GetTagNameByTail(const ATagTail: string): string; inline; begin Result := InnerGetTagName(ATagTail, 3); end; function THtmlItemList.FindIndexByTagName(ATagName: string): integer; var i: integer; begin Result := -1; for i := Self.Count - 1 downto 0 do begin if (Self[i].TagTail = '') and (Self[i].FTagName = ATagName) then begin Result := i; exit; end; end; end; procedure THtmlItemList.FreeItem(Item: THtmlItem); begin inherited; Item.Free; end; { THtmlItem } function THtmlItem.AddChild: THtmlItem; // 函数的类型为本类型,这是类型提前申明的用法。 begin Result := THtmlItem.Create; Result.FParent := Self; // 为找到顶级父类提供线索 FChildren.Add(Result); end; constructor THtmlItem.Create; begin inherited; FChildren := THtmlItemList.Create; FLevel := -1; end; destructor THtmlItem.Destroy; begin FChildren.Free; inherited; end; function THtmlItem.GetFormatedHtmlText: string; var Q: THtmlItem; sTemp: string; sHtmlText: string; begin Result := ''; if FChildren.Count = 0 then begin if length(TagTail) = 0 then // 没有 TagTail 的 HtmlItem Result := SpaceTimes(FLevel) + Taghead else Result := SpaceTimes(FLevel) + Taghead + InnerGetHtmlText + TagTail; end else begin sHtmlText := ''; for Q in FChildren do begin Q.FLevel := FLevel + 1; sTemp := Q.GetFormatedHtmlText; // 递归 if length(sTemp) > 0 then begin if length(sHtmlText) > 0 then sHtmlText := sHtmlText + #13#10; sHtmlText := sHtmlText + sTemp; end; end; Result := Result + SpaceTimes(FLevel) + Taghead + #13#10 + sHtmlText + #13#10 + SpaceTimes(FLevel) + TagTail; end; end; function THtmlItem.GetHtml: string; begin // 根 Item 才有 Html ,其它都是引用此 html if not Assigned(FParent) then Result := FHtml else Result := FParent.Html; // 实现 Html 内容为单例 end; function THtmlItem.GetHtmlText: string; var Q: THtmlItem; sHtmlText: string; begin Result := ''; if (length(TagTail) > 0) and (FChildren.Count = 0) then Result := InnerGetHtmlText; for Q in FChildren do begin sHtmlText := Q.GetHtmlText; // 递归 if length(sHtmlText) > 0 then begin if (length(Result) > 0) then Result := Result + #13#10; Result := Result + sHtmlText; end; end; end; function THtmlItem.InnerGetHtmlText: string; var nLeft, nRight: integer; begin Result := ''; if Assigned(FParent) then begin nLeft := TagHeadEnd + 1; nRight := TagTailBegin - 1; Result := Result + copy(Html, nLeft, nRight - nLeft + 1); end; end; class function THtmlItem.ParseHtml(AHtml: string): THtmlItem; var i, nPos, HtmlItemIndex: integer; LeftAngleBracketPos: integer; // >位置 RightAngleBracketPos: integer; // <位置 nStrLen: integer; sTag, sTagName: string; Q, M: THtmlItem; L: THtmlItemList; begin Result := THtmlItem.Create; nStrLen := length(AHtml); nPos := 1; Result.Html := AHtml; L := Result.FChildren; while nPos < nStrLen do begin // 找 <html > if SearchTagHead(AHtml, nPos, LeftAngleBracketPos, RightAngleBracketPos) then begin // 得到 <html > 或 </html > sTag := copy(AHtml, LeftAngleBracketPos, RightAngleBracketPos - LeftAngleBracketPos + 1); nPos := RightAngleBracketPos + 1; if sTag[2] = '/' then // 如果是</html>,往回找 <html> begin sTagName := UpperCase(GetTagNameByTail(sTag)); HtmlItemIndex := L.FindIndexByTagName(sTagName); // 找与之配对的 <html 位置 if HtmlItemIndex > -1 then // 回找时,路过的 HtmlItem 都是 Child begin Q := L[HtmlItemIndex]; Q.TagTail := sTag; Q.TagTailBegin := LeftAngleBracketPos; Q.TagTailEnd := RightAngleBracketPos; for i := L.Count - 1 downto HtmlItemIndex + 1 do begin M := L.PopLast; M.FParent := Q; // 指定 Q 的 Parent Q.FChildren.Insert(0, M); // 把顺序放对 // 从 List 取出并放进 Q 的 Children 中。 end; end; end else begin // <html> Q := Result.AddChild; Q.FTagName := UpperCase(GetTagNameByHead(sTag)); Q.Taghead := sTag; Q.TagHeadBegin := LeftAngleBracketPos; Q.TagHeadEnd := RightAngleBracketPos; end; end else break; end; end; procedure THtmlItem.SetHtml(const Value: string); begin if not Assigned(FParent) then FHtml := Value end; function THtmlItem.SpaceTimes(ATimes: integer): string; var i: integer; D: PChar; begin Result := ''; if ATimes > 0 then begin SetLength(Result, ATimes * 4); D := PChar(Result); for i := 0 to ATimes * 4 - 1 do D[i] := ' '; end; end; end.