import json
from lxml import etree
class HtmlParser(object):
    """Parse an HTML page and list the absolute links found in it.

    ``parser`` is the public entry point. The ``_get_new_urls`` and
    ``_get_new_data`` hooks are unimplemented placeholders, so the tuple
    returned by ``parser`` currently contains ``(None, None)``.
    """

    def _get_new_urls(self):
        """Extract URLs. Placeholder: not implemented, returns ``None``."""
        return None

    def _get_new_data(self):
        """Extract page content. Placeholder: not implemented, returns ``None``."""
        return None

    def parser(self, page_url, html_cont_str):
        """Parse *html_cont_str* and print every absolute link it contains.

        Prints the number of matching ``<a>`` elements, followed by one
        numbered line per link (numbering starts at 1).

        Args:
            page_url: URL the page was fetched from. Only checked against
                ``None`` here; it is not otherwise used.
            html_cont_str: HTML markup handed to ``etree.HTML``.

        Returns:
            ``None`` when either argument is ``None`` or when no document
            tree could be built; otherwise the tuple
            ``(new_urls, new_data)`` produced by ``_get_new_urls`` and
            ``_get_new_data``.
        """
        if page_url is None or html_cont_str is None:
            return None

        # Build an element tree from the raw markup.
        html_etree = etree.HTML(html_cont_str)
        if html_etree is None:
            # NOTE(review): etree.HTML appears to yield None for markup with
            # no parseable content -- confirm against the lxml version in
            # use. Without this guard the .xpath call below would raise
            # AttributeError.
            return None

        # Select anchors whose href is absolute ("http...") or
        # protocol-relative ("//...").
        node_list = html_etree.xpath(
            "//a[starts-with(@href,'http')]|//a[starts-with(@href,'//')]"
        )
        print(len(node_list))

        for index, node in enumerate(node_list, 1):
            # The XPath filter above guarantees an href attribute exists,
            # so indexing [0] is safe.
            a_href = node.xpath("./@href")[0]
            print('No.%3s: %s' % (index, a_href))

        new_urls = self._get_new_urls()
        new_data = self._get_new_data()
        return new_urls, new_data