zoukankan      html  css  js  c++  java
  • 用htmlParser把HTML页面信息解析到文本中

    1   html源码

    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
    <!-- saved from url=(0045)http://10.42.158.2/info/news.asp -->
    <HTML><HEAD><TITLE>散文</TITLE>
    <SCRIPT language=JavaScript>
    self.moveTo(0,0); 
    self.resizeTo(screen.width,screen.height); 
    </SCRIPT>
    
    <STYLE type=text/css>A:link {
    	FONT-FAMILY: "仿宋_GB2312"; COLOR: #ff6600; FONT-SIZE: 16px; TEXT-DECORATION: underline
    }
    A:active {
    	FONT-FAMILY: "仿宋_GB2312"; COLOR: #ff6600; FONT-SIZE: 16px; TEXT-DECORATION: underline
    }
    A:visited {
    	FONT-FAMILY: "仿宋_GB2312"; COLOR: #ff6600; FONT-SIZE: 16px; TEXT-DECORATION: underline
    }
    A:hover {
    	FONT-FAMILY: "仿宋_GB2312"; COLOR: #ff6600; FONT-SIZE: 16px; TEXT-DECORATION: underline
    }
    </STYLE>
    
    <META name=GENERATOR content="MSHTML 8.00.6001.18702"></HEAD>
    <BODY >
    <SCRIPT language=JavaScript>
    </SCRIPT>
    <TABLE border=0 cellSpacing=0 cellPadding=0 width=778 align=center>
      <TBODY>
      <TR>
        <TD class=text>  </TD></TR>
      <TR>
        <TD height=30>
          <TABLE class=text border=0 cellSpacing=0 cellPadding=0 width="89%" 
          align=center>
            <TBODY>
            <TR>
              <TD>当前位置:首页 >> 散文</TD>
              <TD ></TD></TR>
      <TR>
        <TD >
          <TABLE border=0 cellSpacing=0 cellPadding=0 width="90%" align=center>
            <TBODY>
            <TR>
              <TD>
                <TABLE border=0 cellSpacing=0 cellPadding=0 width="83%" 
    align=center>
                  <TBODY>
                  <TR>
                    <TD height=160 colSpan=2></TD></TR>
                  <TR>
                    <TD class=briefingred colSpan=2 align=middle>大海</TD></TR>
                  
                  <TR>
                    <TD class=news height=20 colSpan=2 align=middle>
                      <HR color=#ff0000 SIZE=5>
                    </TD></TR>
                  <TR>
                    <TD height=10 colSpan=2></TD></TR>
                  <TR>
                    <TD class=briefingtext colSpan=2 align=justify>
                      <DIV> </DIV>
                
                      <DIV 
                      align=left>   <BR>   一滴水怎样才能不干涸”相传,古代有一位学者这样问他的弟子。
    孤零零的一滴水,论容量只能以毫升计,体积也微乎其微,风能吹干它,阳光也能晒干它,其寿命能有几何……弟子答不上来。 :  <BR>    学者说:“把它放到大海里去。; <BR>    是的,一滴水的寿命是短暂的。但当它汇入大海,与浩瀚的大海融为一体时,就获得了新的生命。大海永远不会干涸,一滴水就永存于大海之中,<BR>    雷锋同志说:“一滴水只有放进大海里才能永远不干,一个人只有当他把自己和集体事业融合在一起的时候才能有力量。” <BR>    可见,团结就有力量。; <BR>    大海,总是冥冥之中给予人一种澎湃的感觉。但见,大海,一浪未平一浪又起,如同人的命运,时起时落,不可能有唾手可得的成功。人的一生尚不可能永远风平浪静,更何况一个国家,一个民族呢欲速则不达,如果我们的民族不是经历了风风雨雨,大挫大折的锻炼,又怎么能如此的经久不衰呢 </DIV>
                     
                                
                     </TD></TR>
                  <TR>
                    <TD height=120 
          colSpan=2></TD></TR></TBODY></TABLE></TD></TR></TBODY></TABLE></TD></TR></TBODY></TABLE>
    <TABLE cellSpacing=0 cellPadding=0 width=676 align=center>
     </TABLE></BODY></HTML>
    


    2  config.properties文件配置

    # execution interval, default three minutes
    #时间(单位:分钟)
    interval=2
    #bayonet share dir
    #文件输出的父路径
    path=F:\\ftp_root
    #url=http://10.42.158.2/info/list3.asp?id=8&page=1&num=30&date=1
    #html的路径
    url=http://10.42.158.2/info/list3.asp?id=8&page=1&num=30&date=1
    #输出文件名
    fileName=sea.txt

    3 java代码

    package com.odin.cn.warning;
    
    import java.io.BufferedOutputStream;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileWriter;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    import java.util.regex.Pattern;
    
    import jcifs.smb.SmbFile;
    import jcifs.smb.SmbFileOutputStream;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpResponse;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.impl.client.DefaultHttpClient;
    import org.apache.http.util.EntityUtils;
    import org.apache.log4j.Logger;
    import org.apache.log4j.PropertyConfigurator;
    import org.htmlparser.Node;
    import org.htmlparser.NodeFilter;
    import org.htmlparser.Parser;
    import org.htmlparser.nodes.TextNode;
    import org.htmlparser.tags.TableColumn;
    import org.htmlparser.util.NodeList;
    
    import com.odin.cn.util.ConfigUtils;
    import com.odin.cn.util.PropertiesUtils;
    import com.odin.cn.util.Sea;
    
    public class SetArticle {
    	private static SimpleDateFormat longFormate = new SimpleDateFormat(
    			"yyyy-MM-dd HH:mm:ss");
    	private static final Logger logger = Logger
    			.getLogger(SetArticle.class);
    	public ConfigUtils config = new ConfigUtils();
    
    	public SetArticle(ConfigUtils config) {
    		this.config = config;
    	}
    
    	
    
    	// http 网络解析
    	public List<Sea> parseHtmlByHttp(String url) {
    		String body = "{}";
    		List<Sea> sea = new ArrayList<Sea>();
    		DefaultHttpClient httpclient = new DefaultHttpClient();
    		try {
    			HttpPost httpget = new HttpPost(url);
    			HttpResponse response = httpclient.execute(httpget);
    			HttpEntity entity = response.getEntity();
    			body = EntityUtils.toString(entity, "GBK");
    			Parser parser = new Parser(body);
    			// 2.1、自定义一个Filter,用于过滤<Frame >标签,然后取得标签中的src属性值
    			NodeFilter frameNodeFilter = new NodeFilter() {
    				@Override
    				public boolean accept(Node node) {
    					if (node.getText().startsWith("td")) {
    						return true;
    					} else {
    						return false;
    					}
    				}
    			};
    			// 3、使用parser根据filter来取得所有符合条件的节点
    			NodeList nodeList = parser
    					.extractAllNodesThatMatch(frameNodeFilter);
    			for (int i = 0; i < nodeList.size(); i++) {
    				Node node = nodeList.elementAt(i);
    				if (node instanceof TableColumn) {
    					String attr = ((TableColumn) node).getAttribute("class");
    					String align = ((TableColumn) node).getAttribute("align");
    					String colspan = ((TableColumn) node)
    							.getAttribute("colspan");
    					if ("2".equals(colspan) && "justify".equals(align)
    							&& "briefingtext".equals(attr)) {
    						NodeList n = node.getChildren();
    						for (int j = 0; j < n.size(); j++) {
    							Node bNode = n.elementAt(j);
    							sea = foreachNode(sea, bNode);
    						}
    						List<Sea> list = new ArrayList<Sea>();
    						for(int j=1;j<sea.size();j++){
    							Sea pw=sea.get(j);
    							String message=pw.getMessage();
    							if(message!=null&&!"".equals(message)){
    								list.add(pw);
    							}
    						}
    						sea=list;
    					}
    				}
    			}
    			
    		} catch (Exception e) {
    			e.printStackTrace();
    		} finally {
    			httpclient.getConnectionManager().shutdown();
    		}
    		return sea;
    	}
    
    	//递归获取到节点下的文本信息
    	private List<Sea> foreachNode(List<Sea> list, Node node) {
    		if (node == null) {
    			return list;
    		}
    		if (node.getChildren() != null) {
    			NodeList nodeList = node.getChildren();
    			for (int i = 0; i < nodeList.size(); i++) {
    				Node bNode = nodeList.elementAt(i);
    				foreachNode(list, bNode);
    			}
    		} else if (node instanceof TextNode) {
    			String message = node.getText();
    			message = node.getText();
    			message = message.replaceAll(",", ",").replaceAll(" ", "")
    					.replaceAll("
    ", "");
    			int index = message.indexOf(",");
    			if (index > 0&&list.size()!=1) {
    				message = message.substring(0, index);
    			}
    			if (!"".equals(message)) {
    				Sea pw = new Sea();
    				pw.setMessage(message);
    				list.add(pw);
    			}
    		}
    		return list;
    
    	}
    
    	
    	// 保存文件
    	public void saveFile() {
    		BufferedWriter bw = null;
    		try {
    			List<Sea> sea = parseHtmlByHttp(config.getUrl());
    			if(sea==null||sea.size()==0){
    				logger.info("保存到文件夹失败:未获取到网页信息");
    				return;
    			}
    			// List<Leader> leaders=parseHtmlByFile();
    			StringBuffer sb = new StringBuffer();
    			SimpleDateFormat dFormat = new SimpleDateFormat("yyyy-MM-dd");
    			String date = dFormat.format(new Date());
    			sb.append("{date:"" + date + "",data:[");
    			if (!sea.isEmpty()) {
    				for (int i = 0; i < sea.size(); i++) {
    					Sea pw = sea.get(i);
    					if (i == sea.size() - 1) {
    						sb.append("{message:"" + pw.getMessage() + ""}");
    					} else {
    						sb.append("{message:"" + pw.getMessage() + ""},");
    					}
    				}
    			}
    			sb.append("]}");
    			String filename = config.path + "\" + config.fileName;
    			bw = new BufferedWriter(new FileWriter(filename));
    			bw.write(sb.toString(), 0, sb.length());
    			bw.flush(); // 刷新缓冲的输出流
    			logger.info("成功保存文件");
    		} catch (Exception e) {
    			logger.info("保存文件失败" + e.getMessage());
    		} finally {
    			try {
    				if (bw != null) {
    					
    					bw.close();
    				}
    			} catch (Exception e2) {
    				logger.info("关闭连接发送异常" + e2.getMessage());
    			}
    		}
    
    	}
    
    	public static void main(String[] args) {
    		PropertyConfigurator.configure("conf/log4j.properties");
    		final ConfigUtils config = PropertiesUtils.getConfig();
    		final SetArticle leader = new SetArticle(config);
    		logger.info("启动成功");
    		/*
    		 * try { leader.shareFile(); } catch (Exception e) {
    		 * e.printStackTrace(); }
    		 */
    		try {
    			// 启用线程调用
    			new Thread(new Runnable() {
    				boolean initRun = true;
    
    				@Override
    				public void run() {
    					while (initRun) {
    						try {
    							// 5秒调用一次
    							leader.saveFile();
    							Thread.currentThread();
    							// 5分钟调用一次
    							Thread.sleep(1000 * 60 * config.interval);
    						} catch (Exception e) {
    							logger.info("线程执行出现异常:"
    									+ longFormate.format(new java.util.Date())
    									+ e.getMessage());
    						}
    					}
    				}
    			}).start();
    		} catch (Exception e) {
    			logger.info("线程执行出现异常:" + longFormate.format(new java.util.Date())
    					+ e.getMessage());
    		} finally {
    			logger.info("线程终止时间2:" + longFormate.format(new java.util.Date()));
    		}
    	}
    }
    

    4 属性文件加载类

    package com.odin.cn.util;
    
    import java.io.BufferedInputStream;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.util.Properties;
    
    import org.apache.log4j.Logger;
    
    
    public class PropertiesUtils {
    private static Logger logger = Logger.getLogger(PropertiesUtils.class);
    	
    	public static ConfigUtils getConfig(){
    		ConfigUtils db=new ConfigUtils();
    		String relativelyPath = System.getProperty("user.dir").replace("\", "/")+"/conf/"+"config.properties"; 
    		InputStream is = null;
    		Properties dbProps = new Properties();
    		try {
    			is = new BufferedInputStream(new FileInputStream(relativelyPath));
    			dbProps.load(is);
        		logger.info("读取数据库config.properties配置文件成功!");
    		} catch (Exception e) {
    			logger.error(e.getMessage());
    		}
    		db.setPath(dbProps.getProperty("path"));
    		db.setInterval(Integer.parseInt(dbProps.getProperty("interval")));
    		
    		db.setFileName(dbProps.getProperty("fileName"));
    		db.setUrl(dbProps.getProperty("url"));
    		return db;
    	}
    }
    

  • 相关阅读:
    剑指Offer对答如流系列
    剑指Offer对答如流系列
    KMP算法
    殊途同归
    从m个数中取top n
    用红黑树实现500万数据的动态排序
    返璞归真
    second blog编程之美------控制cpu曲线
    first blog编程之美-----计算1的个数
    mathematica入门学习记录:
  • 原文地址:https://www.cnblogs.com/t0404/p/10290976.html
Copyright © 2011-2022 走看看