import lxml.etree as le

# XPath for the text of each <a> that is a direct child of a <div> whose
# class attribute contains 'threadlist_title pull_left j_th_tit'.
TITLE_XPATH = "//div[contains(@class,'threadlist_title pull_left j_th_tit')]/a/text()"

# Load the saved page as text.
with open('edu.html', 'r', encoding='utf-8') as page_file:
    page_source = page_file.read()

# Parse the markup and print every matched text node on its own line.
document = le.HTML(page_source)
for title in document.xpath(TITLE_XPATH):
    print(title)
如果目标 HTML 被注释掉(位于 <!-- ... --> 之中),XPath 将无法匹配其中的元素,此时应改用正则表达式:
import re

# Matches the <div> whose class attribute is exactly
# "threadlist_title pull_left j_th_tit " (trailing space included) and captures
# the text of the first <a> element that follows it.
# re.DOTALL lets ".*?" cross line breaks, so the raw HTML can be searched as-is.
# The earlier approach of deleting every space from the HTML made this pattern
# (which itself contains spaces) impossible to match.
TITLE_PATTERN = re.compile(
    r'<div class="threadlist_title pull_left j_th_tit ">.*?<a.*?>(.*?)</a>',
    re.DOTALL,
)


def extract_titles(html):
    """Return the link texts found after each matching title <div>.

    Works on raw HTML text, so it also finds markup that sits inside an HTML
    comment.

    Args:
        html: The page source as a string.

    Returns:
        A list with the captured <a> text of every match, in document order;
        an empty list when nothing matches.
    """
    return TITLE_PATTERN.findall(html)


if __name__ == "__main__":
    with open('edu.html', 'r', encoding='utf-8') as f:
        html = f.read()
    for a in extract_titles(html):
        print(a)