zoukankan      html  css  js  c++  java
  • html抽取文本信息-java版(适合lucene建立索引)

    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.beans.StringBean;
    import org.htmlparser.filters.CssSelectorNodeFilter;
    import org.htmlparser.util.NodeList;
    
    public class HtmlUtil {
    	public static String getText(String html, String id) {
    		try {
    			Parser parser = new Parser(html);
    			NodeFilter filter = new CssSelectorNodeFilter("#" + id);
    			NodeList nList = parser.extractAllNodesThatMatch(filter);
    			return nList == null || nList.size() == 0 ? null : nList.elementAt(
    					0).toPlainTextString();
    		} catch (Exception e) {
    			e.printStackTrace();
    			return null;
    		}
    	}
    
    	public static String getTextByClass(String html, String css_class) {
    		try {
    			Parser parser = new Parser(html);
    			NodeFilter filter = new CssSelectorNodeFilter("." + css_class);
    			NodeList nList = parser.extractAllNodesThatMatch(filter);
    			return nList == null || nList.size() == 0 ? null : nList.elementAt(
    					0).toPlainTextString();
    		} catch (Exception e) {
    			e.printStackTrace();
    			return null;
    		}
    	}
    
    	public static String filterText(String text) {
    		if (text == null)
    			return null;
    		text = text.replace(">", ">");
    		text = text.replace("<", "<");
    		text = text.replace(""", """);
    		text = text.replace(" ", " ");
    		text = text.replace("&", "&");
    		text = text.replace("&copy;", "©");
    		text = text.replace(" ", "");
    		return text;
    	}
    
    	/**
    	 * 获取网页中纯文本信息
    	 * 
    	 * @param html
    	 * @param id
    	 * @return
    	 * @throws Exception
    	 * @throws Exception
    	 */
    	public static String getText(String html) throws Exception {
    		StringBean bean = new StringBean();
    		bean.setLinks(false);
    		bean.setReplaceNonBreakingSpaces(true);
    		bean.setCollapse(true);
    
    		// 返回解析后的网页纯文本信息
    		Parser parser = Parser.createParser(html, "utf-8");
    		parser.visitAllNodesWith(bean);
    		parser.reset();
    		return bean.getStrings();
    	}
    }
    

    需要用到 htmlparser.jar 库,调用方式如下:

    HtmlUtil.getText(htmlStr);

  • 相关阅读:
    司法相关学习网站视频资料
    小型网站如何防范DDoS攻击
    教你9招 破解多种系统登陆密码方法 (1)
    加快Win7整体运行速度的12个小技巧
    手机指令大全,需要可以看看
    Java、fileless恶意软件威胁桌面安全
    浅谈Android手机木马手工查杀
    Win7路由器设置过程
    如何加强移动应用开发安全?
    FTP常用故障代码注解
  • 原文地址:https://www.cnblogs.com/wzjhoutai/p/6811951.html
Copyright © 2011-2022 走看看