zoukankan      html  css  js  c++  java
  • jsoup针对html工具类

    //		<div class="menu-list"> 
    //		   <div class="margin-auto min-width1200"> 
    //		    <div class="menu-item active">
    //		     <a href="/">首页</a>
    //		    </div> 
    //		    <div style="width: 145px;" class="menu-item "> 
    //		     <a href="//zjjnts.ghlearning.com:80/node/5040.jspx">技能提升培训课程 </a> 
    //		    </div> 
    //		    <div style="width: 145px;" class="menu-item "> 
    //		     <a href="//zjjnts.ghlearning.com:80/node/5479.jspx">特色视频展示区 </a> 
    //		    </div> 
    //		    <div style="width: 145px;" class="menu-item "> 
    //		     <a href="//zjjnts.ghlearning.com:80/node/5473.jspx">技能人才招聘专区 </a> 
    //		    </div> 
    //		    <div style="width: 145px;" class="menu-item "> 
    //		     <a href="//zjjnts.ghlearning.com:80/node/5478.jspx">院校及培训机构招生区 </a> 
    //		    </div> 
    //		    <div style="" class="menu-item "> 
    //		     <a href="//zjjnts.ghlearning.com:80/node/5472.jspx">政策通知动态专区 </a> 
    //		    </div> 
    //		    <div style="" class="menu-item "> 
    //		     <a href="//zjjnts.ghlearning.com:80/node/5374.jspx">在线模考 </a> 
    //		    </div> 
    //		    <div style="" class="menu-item "> 
    //		     <a href="http://v.qq.com/vplus/df399a8d1cf80ae06f356522325b0902?page=video" target="_blank">技能体验 </a> 
    //		    </div> 
    //		    <div style="" class="menu-item "> 
    //		     <a href="//zjjnts.ghlearning.com:80/node/4516.jspx">帮助中心 </a> 
    //		    </div> 
    //		   </div> 
    //		  </div> 
    		// Target address to fetch.
    		String url = "http://42.51.69.234:8001/";
    		try {
    			// Fetch and parse the page, sending an explicit user-agent header with the GET request.
    			Document document = Jsoup.connect(url).header("user-agent",
    					"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36")
    					.get();
    			// Walk every element carrying the "menu-list" class (see the sample markup above).
    			for (Element menuList : document.getElementsByClass("menu-list")) {
    				// Each menu entry is an <a> nested inside an element with class "menu-item".
    				for (Element link : menuList.select(".menu-item a")) {
    					// Print the link text and its raw href attribute on separate lines.
    					System.out.println(link.text() + "\n" + link.attr("href"));
    				}
    			}
    		} catch (IOException e) {
    			System.out.println("出现错误:" + e.getMessage());
    		}
    
    	}
    

      

    public class HtmlUtil {

    /**
     * Cleans the given HTML so that only plain text passes through.
     *
     * @param html the HTML to clean; may be {@code null}
     * @return the cleaned text, or {@code null} when {@code html} is {@code null}
     */
    public static String getText(String html) {
        return html == null ? null : Jsoup.clean(html, Whitelist.none());
    }

    /**
     * Cleans the given HTML, letting only simple text formatting through.
     *
     * <p>Tags allowed through: b, em, i, strong, u (plus plain text).
     *
     * @param html the HTML to clean; may be {@code null}
     * @return the cleaned HTML, or {@code null} when {@code html} is {@code null}
     */
    public static String getSimpleHtml(String html) {
        if (html != null) {
            return Jsoup.clean(html, Whitelist.simpleText());
        }
        return null;
    }

    /**
     * Cleans the given HTML using the basic whitelist.
     *
     * <p>Tags allowed through: a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, ol, p,
     * pre, q, small, strike, strong, sub, sup, u, ul.
     *
     * @param html the HTML to clean; may be {@code null}
     * @return the cleaned HTML, or {@code null} when {@code html} is {@code null}
     */
    public static String getBasicHtml(String html) {
        return html == null ? null : Jsoup.clean(html, Whitelist.basic());
    }

    /**
     * Cleans the given HTML like the basic whitelist does, but additionally lets images through.
     *
     * @param html the HTML to clean; may be {@code null}
     * @return the cleaned HTML, or {@code null} when {@code html} is {@code null}
     */
    public static String getBasicHtmlandimage(String html) {
        if (html != null) {
            return Jsoup.clean(html, Whitelist.basicWithImages());
        }
        return null;
    }
    /**
     * Cleans the given HTML using the relaxed whitelist.
     *
     * <p>Tags allowed through: a, b, blockquote, br, caption, cite, code, col, colgroup, dd, dl,
     * dt, em, h1, h2, h3, h4, h5, h6, i, img, li, ol, p, pre, q, small, strike, strong, sub, sup,
     * table, tbody, td, tfoot, th, thead, tr, u, ul.
     *
     * @param html the HTML to clean; may be {@code null}
     * @return the cleaned HTML, or {@code null} when {@code html} is {@code null}
     */
    public static String getFullHtml(String html) {
        return html == null ? null : Jsoup.clean(html, Whitelist.relaxed());
    }

    /**
     * Cleans the given HTML so that only the explicitly listed tags are allowed through.
     *
     * @param html the HTML to clean; may be {@code null}
     * @param tags the tag names to allow
     * @return the cleaned HTML, or {@code null} when {@code html} is {@code null}
     */
    public static String clearTags(String html, String... tags) {
        // Mirror the null handling of the other sanitizers in this class instead of
        // handing a null body to Jsoup.clean.
        if (html == null) {
            return null;
        }
        Whitelist allowed = new Whitelist().addTags(tags);
        return Jsoup.clean(html, allowed);
    }

    // // 对关键字加上颜色
    // public static String markKeywods (String keywords, String target) {
    // if (StringKit.notBlank(keywords)) {
    // String[] arr = keywords.split(" ");
    // for (String s : arr) {
    // if (StringKit.notBlank(s)) {
    // String temp = "<span class=\"highlight\">" + s + "</span>";
    // if(temp!=null)
    // target = target.replaceAll(s, temp);
    // }
    // }
    // }
    // return target;
    // }

    /**
     * Extracts the {@code src} attribute of the first {@code img} element found in the HTML.
     *
     * @param html the HTML fragment to inspect; may be {@code null}
     * @return the {@code src} attribute of the first {@code img} element, or {@code null} when
     *     {@code html} is {@code null} or no {@code img} element is found
     */
    public static String getImgSrc(String html) {
        if (html == null) {
            return null;
        }
        Element firstImage = Jsoup.parseBodyFragment(html).select("img").first();
        if (firstImage == null) {
            return null;
        }
        return firstImage.attr("src");
    }


  • 相关阅读:
    CODEVS4650 破损的键盘
    洛谷P1656 炸铁路
    洛谷 P3225 [HNOI2012]矿场搭建
    1265 四点共面
    1406: [AHOI2007]密码箱
    1193: [HNOI2006]马步距离
    1800: [Ahoi2009]fly 飞行棋
    1923: [Sdoi2010]外星千足虫
    I
    2017CCPC秦皇岛G ZOJ 3987Numbers(大数+贪心)
  • 原文地址:https://www.cnblogs.com/java-llp/p/11378166.html
Copyright © 2011-2022 走看看