使用BS4解析XML文件用法
1. html.parser
# Parse the markup held in `html` using the parser named "html.parser".
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
两个参数:第一个参数是要解析的html文本,第二个参数是使用哪种解析器,对于HTML来讲就是html.parser,这个是bs4自带的解析器
2. lxml
# Same constructor call, but selecting the "lxml" parser by name.
soup = BeautifulSoup(html, "lxml")
查找所有符合条件的标签
a)使用tag查找
# Look up tags by a single tag name, here 'b'.
soup.find_all('b')
b)正则表达式查找
# Look up tags with a compiled regular expression (pattern "^b": starts with "b").
# NOTE: this snippet needs `import re` to be in scope.
soup.find_all(re.compile("^b"))
c)按列表中提供的tag查找
# Look up tags using a list of tag names, here "a" and "b".
soup.find_all(["a", "b"])
d)实例
解析html,将dt的class和ul中的text找到并放入字典中
b.html内容:
<div class="MuneDown"> <dl> <dt class="menuListBox"> <ul class="a"> xxx</ul> </dt> <dd class="adMuneBox validateArea" _bamboo_rep_transid="150166001-1" _bamboo_rep_menuid="" modulecode="module_nav_myunicom" _bamboo_rep_productlink="" validatetype="module_logo_area_2" rowrecordid="b6303d42ec84468badfd05cb88ef20d2"></dd> </dl> </div> </div>
代码如下:
#!/usr/bin/env python
#coding:utf-8
"""Map the class of each menu's <dt> to the text of its <ul class="a">.

Usage: python <this script> <html_file>

For every ``<div class="MuneDown">`` in the input file, the first class of
its ``<dt>`` becomes a dictionary key and the text of its first
``<ul class="a">`` becomes the value. Each pair is printed as it is found,
followed by the complete dictionary.
"""
import os
import sys

import lxml  # noqa: F401 -- not used directly; the "lxml" parser named below must be installed
from bs4 import BeautifulSoup


def iter_menu_pairs(markup):
    """Yield ``(dt_class, ul_text)`` for every ``div.MuneDown`` in *markup*.

    *markup* may be ``str`` or ``bytes``. Divs that lack a classed ``<dt>``
    or a ``<ul class="a">`` are reported on stderr and skipped instead of
    aborting the whole run with a TypeError/IndexError.
    """
    soup = BeautifulSoup(markup, 'lxml')
    for item in soup.select("div[class='MuneDown']"):
        dt_tag = item.dt
        dt_classes = dt_tag.get("class") if dt_tag is not None else None
        ul_tags = item.select("ul[class='a']")
        if not dt_classes or not ul_tags:
            sys.stderr.write("skipping MuneDown div without dt class / ul.a\n")
            continue
        yield dt_classes[0], ul_tags[0].get_text()


def main(argv=None):
    """Parse the file named on the command line and print the result."""
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        sys.exit("usage: {} <html_file>".format(os.path.basename(argv[0])))

    # Read raw bytes so BeautifulSoup detects the document encoding itself
    # rather than relying on the platform's default text encoding.
    with open(argv[1], 'rb') as f:
        markup = f.read()

    result = {}
    for dt, ul in iter_menu_pairs(markup):
        print("dt:{}".format(dt))
        print("ul: {}".format(ul))
        result[dt] = ul
    print(result)


if __name__ == "__main__":
    main()
运行结果:
dt:menuListBox
ul: xxx
{'menuListBox': ' xxx'}
参考链接:
https://www.cnblogs.com/gl1573/p/9480022.html
https://www.jianshu.com/p/9254bdc467b2
第二部分 生成XML文件
代码示例:
#!/usr/bin/env python
#coding:utf-8
"""Write one detection result (label, bounding box, score) to parameter.xml.

The generated document has the shape::

    <Objects>
      <person>
        <label name="lala">15</label>
        <xmin>..</xmin> <ymin>..</ymin> <xmax>..</xmax> <ymax>..</ymax>
        <score>..</score>
      </person>
    </Objects>
"""
from xml.dom.minidom import Document
import xml.dom.minidom


def _append_text_element(doc, parent, tag, value):
    """Append ``<tag>str(value)</tag>`` to *parent* and return the new element."""
    element = doc.createElement(tag)
    element.appendChild(doc.createTextNode(str(value)))
    parent.appendChild(element)
    return element


def build_detection_document(label_name, label, label_attribute, box, score):
    """Build the DOM document describing a single detection.

    Args:
        label_name: Tag name of the element that wraps the detection.
        label: Value written as the text of the ``<label>`` element.
        label_attribute: ``(name, value)`` pair set as an attribute on ``<label>``.
        box: ``(xmin, ymin, xmax, ymax)``; must contain exactly four values.
        score: Value written as the text of the ``<score>`` element.

    Returns:
        The populated ``xml.dom.minidom.Document``.
    """
    xmin, ymin, xmax, ymax = box  # raises ValueError on a malformed box

    doc = Document()
    root = doc.createElement('Objects')
    doc.appendChild(root)

    detection = doc.createElement(label_name)
    root.appendChild(detection)

    label_element = _append_text_element(doc, detection, 'label', label)
    label_element.setAttribute(label_attribute[0], label_attribute[1])

    for tag, value in (('xmin', xmin), ('ymin', ymin),
                       ('xmax', xmax), ('ymax', ymax),
                       ('score', score)):
        _append_text_element(doc, detection, tag, value)
    return doc


def write_document(doc, path, encoding='utf-8'):
    """Pretty-print *doc* to *path*.

    The file is opened with the same encoding that is written into the XML
    declaration, so the two cannot disagree on platforms whose default text
    encoding is not UTF-8.
    """
    with open(path, 'w', encoding=encoding) as f:
        doc.writexml(f, indent='', addindent='\t', newl='\n', encoding=encoding)


def main():
    """Generate parameter.xml for the sample detection."""
    doc = build_detection_document(
        label_name='person',
        label=15,
        label_attribute=("name", "lala"),
        box=(207, 106, 489, 381),
        score=0.99566,
    )
    write_document(doc, 'parameter.xml')


if __name__ == "__main__":
    main()
生成xml文件如下
<?xml version="1.0" encoding="utf-8"?> <Objects> <person> <label name="lala">15</label> <xmin>207</xmin> <ymin>106</ymin> <xmax>489</xmax> <ymax>381</ymax> <score>0.99566</score> </person> </Objects>
参考链接:
https://blog.csdn.net/jqw11/article/details/70670003
https://www.cnblogs.com/zhugaopeng/p/9665365.html