写了一个类,主要用于解析html文本的对称的标签结构。
通过输入 tag 名称,解析对应的 HTML 文本,计算该 tag 的嵌套层级数,并可以根据层级数取出对应的 tag 内容。写得比较粗糙,后续如果用到再慢慢改进。
代码如下:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import re


class htmltaganalysis(object):
    """Find every balanced ``<tag ...>...</tag>`` element and its nesting depth.

    The analysis is purely textual (regular expressions, no real HTML
    parser), so tags that appear inside comments, scripts or attribute
    values are treated like ordinary tags.
    """

    def __init__(self, html, tag):
        """Store the HTML text and the tag name (e.g. ``'div'``) to analyse."""
        self.html = html
        self.tag = tag

    def tagdec(self, html, tag):
        """Return an iterator of matches of the regex ``tag`` in ``html``.

        ``tag`` is used as a regular expression as-is; matching is
        case-insensitive, with DOTALL and MULTILINE enabled.
        """
        pattern = re.compile(tag, re.I | re.S | re.M)
        return pattern.finditer(html)

    def GetTagContent(self):
        """Return ``[{'content': str, 'layer': int}, ...]`` for the tag.

        Each entry is one element whose opening tag has a matching closing
        tag; ``content`` spans from the ``<`` of the opening tag to the
        ``>`` of the closing tag, and ``layer`` is 1 for outermost elements,
        2 for elements nested inside one of those, and so on.  Entries are
        ordered by the position of their opening tag.  Closing tags with no
        open element and opening tags that are never closed are ignored.
        """
        name = re.escape(self.tag)
        # The lookahead keeps 'div' from matching '<divider'; the closing
        # alternative comes first so '</div>' is never read as an opener.
        token = re.compile('</' + name + r'\s*>|<' + name + r'(?=[\s/>])', re.I)

        # Pair tags with a stack: each closing tag belongs to the most
        # recently opened element that is still open.
        open_starts = []
        spans = []
        for match in token.finditer(self.html):
            if match.group().startswith('</'):
                if open_starts:
                    spans.append((open_starts.pop(), match.end()))
            else:
                open_starts.append(match.start())
        spans.sort()

        # Walk the spans in document order while tracking the end offsets
        # of the elements that still enclose the current position; the
        # number of tracked ends is the nesting depth.
        result = []
        enclosing_ends = []
        for start, end in spans:
            while enclosing_ends and enclosing_ends[-1] <= start:
                enclosing_ends.pop()
            enclosing_ends.append(end)
            result.append({
                'content': self.html[start:end],
                'layer': len(enclosing_ends),
            })
        return result

    def Getlayer(self, arr, i, layer):
        """Return ``layer`` plus the number of spans enclosing ``arr[i]``.

        ``arr`` is a list of ``[start, end]`` pairs sorted by start.  From
        index ``i`` the nearest earlier span that strictly contains the
        current one is located repeatedly, adding one to ``layer`` per step.
        """
        while 0 < i < len(arr):
            parent = None
            for j in range(i - 1, -1, -1):
                if arr[j][0] < arr[i][0] and arr[i][1] < arr[j][1]:
                    parent = j
                    break
            if parent is None:
                break
            layer += 1
            i = parent
        return layer

    def GetContentForLayer(self, layer=1):
        """Return the text of every element found at nesting depth ``layer``."""
        return [item['content'] for item in self.GetTagContent()
                if item['layer'] == layer]

    def GetTopLayer(self):
        """Return the deepest nesting depth found, or 0 when there is none."""
        return max((item['layer'] for item in self.GetTagContent()), default=0)
使用示例:
html = '<div id="cnblogs_post_body"><div class="x-wiki-content x-content"></div></div>'
# Bind the instance to its own name so the class itself is not shadowed
# and can still be instantiated again later.
analyzer = htmltaganalysis(html, 'div')
print(analyzer.GetTopLayer())            # deepest nesting level
print(analyzer.GetContentForLayer(1))    # outermost elements
print(analyzer.GetContentForLayer(2))    # elements nested one level down
结果:
2
['<div id="cnblogs_post_body"><div class="x-wiki-content x-content"></div></div>']
['<div class="x-wiki-content x-content"></div>']
备注:欢迎任何形式的转载,但请务必注明出处。
限于本人水平,如果文章和代码有表述不当之处,还请不吝赐教。