zoukankan      html  css  js  c++  java
  • Programming a Spider in Java 源码帖

    Programming a Spider in Java 源码帖
    
    Listing 1: Finding the bad links (CheckLinks.java)
    import java.awt.*;
    import javax.swing.*;
    import java.net.*;
    import java.io.*;
    /**
    * This example uses a Java spider to scan a Web site
    * and check for broken links. Written by Jeff Heaton.
    * Jeff Heaton is the author of "Programming Spiders,
    * Bots, and Aggregators" by Sybex. Jeff can be contacted
    * through his Web site at http://www.jeffheaton.com.
    * 
    * @author Jeff Heaton(http://www.jeffheaton.com)
    * @version 1.0
    */
    public class CheckLinks extends javax.swing.JFrame implements Runnable,ISpiderReportable {
       /**
        * The constructor. Perform setup here.
        */
       public CheckLinks() {
         //{{INIT_CONTROLS
         setTitle("Find Broken Links");
         getContentPane().setLayout(null);
         setSize(405,288);
         setVisible(false);
         label1.setText("Enter a URL:");
         getContentPane().add(label1);
         label1.setBounds(12,12,84,12);
         begin.setText("Begin");
         begin.setActionCommand("Begin");
         getContentPane().add(begin);
         begin.setBounds(12,36,84,24);
         getContentPane().add(url);
         url.setBounds(108,36,288,24);
         errorScroll.setAutoscrolls(true);
         errorScroll.setHorizontalScrollBarPolicy(javax.swing.ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);
         errorScroll.setVerticalScrollBarPolicy(javax.swing.ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
         errorScroll.setOpaque(true);
         getContentPane().add(errorScroll);
         errorScroll.setBounds(12,120,384,156);
         errors.setEditable(false);
         errorScroll.getViewport().add(errors);
         errors.setBounds(0,0,366,138);
         current.setText("Currently Processing: ");
         getContentPane().add(current);
         current.setBounds(12,72,384,12);
         goodLinksLabel.setText("Good Links: 0");
         getContentPane().add(goodLinksLabel);
         goodLinksLabel.setBounds(12,96,192,12);
         badLinksLabel.setText("Bad Links: 0");
         getContentPane().add(badLinksLabel);
         badLinksLabel.setBounds(216,96,96,12);
         //}}
         //{{INIT_MENUS
         //}}
         //{{REGISTER_LISTENERS
         SymAction lSymAction = new SymAction();
         begin.addActionListener(lSymAction);
         //}}
       }
       /**
        * Main method for the application
        * 
        * @param args Not used
        */
       static public void main(String args[]){
         (new CheckLinks()).setVisible(true);
       }
       /**
        * Add notifications.
        */
       public void addNotify(){
         // Record the size of the window prior to calling parent's
         // addNotify.
         Dimension size = getSize();
         super.addNotify();
         if ( frameSizeAdjusted )
           return;
         frameSizeAdjusted = true;
         // Adjust size of frame according to the insets and menu bar
         Insets insets = getInsets();
         javax.swing.JMenuBar menuBar = getRootPane().getJMenuBar();
         int menuBarHeight = 0;
         if ( menuBar != null )
           menuBarHeight = menuBar.getPreferredSize().height;
         setSize(insets.left + insets.right + size.width, insets.top + insets.bottom + size.height + menuBarHeight);
       }
       // Used by addNotify
       boolean frameSizeAdjusted = false;
       //{{DECLARE_CONTROLS
       javax.swing.JLabel label1 = new javax.swing.JLabel();
       /**
        * The begin or cancel button
        */
       javax.swing.JButton begin = new javax.swing.JButton();
       /**
        * The URL being processed
        */
       javax.swing.JTextField url = new javax.swing.JTextField();
       /**
        * Scroll the errors.
        */
       javax.swing.JScrollPane errorScroll = new javax.swing.JScrollPane();
       /**
        * A place to store the errors created
        */
       javax.swing.JTextArea errors = new javax.swing.JTextArea();
       javax.swing.JLabel current = new javax.swing.JLabel();
       javax.swing.JLabel goodLinksLabel = new javax.swing.JLabel();
       javax.swing.JLabel badLinksLabel = new javax.swing.JLabel();
       //}}
       //{{DECLARE_MENUS
       //}}
       /**
        * The background spider thread
        */
       protected Thread backgroundThread;
       /**
        * The spider object being used
        */
       protected Spider spider;
       /**
        * The URL that the spider began with
        */
       protected URL base;
       /**
        * How many bad links have been found
        */
       protected int badLinksCount = 0;
       /**
        * How many good links have been found
        */
       protected int goodLinksCount = 0;
    
       /**
        * Internal class used to dispatch events
        * 
        * @author Jeff Heaton
        * @version 1.0
        */
       class SymAction implements java.awt.event.ActionListener {
         public void actionPerformed(java.awt.event.ActionEvent event){
           Object object = event.getSource();
           if ( object == begin )
             begin_actionPerformed(event);
         }
       }
       /**
        * Called when the begin or cancel buttons are clicked
        * 
        * @param event The event associated with the button.
        */
       void begin_actionPerformed(java.awt.event.ActionEvent event){
         if ( backgroundThread==null ) {
           begin.setLabel("Cancel");
           backgroundThread = new Thread(this);
           backgroundThread.start();
           goodLinksCount=0;
           badLinksCount=0;
         } else {
           spider.cancel();
         }
       }
       /**
        * Perform the background thread operation. This method
        * actually starts the background thread.
        */
       public void run(){
         try {
           errors.setText("");
           spider = new Spider(this);
           spider.clear();
           base = new URL(url.getText());
           spider.addURL(base);
           spider.begin();
           Runnable doLater = new Runnable(){
             public void run(){
               begin.setText("Begin");
             }
           };
           SwingUtilities.invokeLater(doLater);
           backgroundThread=null;
         } catch ( MalformedURLException e ) {
           UpdateErrors err = new UpdateErrors();
           err.msg = "Bad address.";
           SwingUtilities.invokeLater(err);
         }
       }
       /**
        * Called by the spider when a URL is found. It is here
        * that links are validated.
        * 
        * @param base The page that the link was found on.
        * @param url The actual link address.
        */
       public boolean spiderFoundURL(URL base,URL url){
         UpdateCurrentStats cs = new UpdateCurrentStats();
         cs.msg = url.toString();
         SwingUtilities.invokeLater(cs);
         if ( !checkLink(url) ) {
           UpdateErrors err = new UpdateErrors();
           err.msg = url+"(on page " + base + ")
    ";
           SwingUtilities.invokeLater(err);
           badLinksCount++;
           return false;
         }
         goodLinksCount++;
         if ( !url.getHost().equalsIgnoreCase(base.getHost()) )
           return false;
         else
           return true;
       }
       /**
        * Called when a URL error is found
        * 
        * @param url The URL that resulted in an error.
        */
       public void spiderURLError(URL url){
       }
       /**
        * Called internally to check whether a link is good
        * 
        * @param url The link that is being checked.
        * @return True if the link was good, false otherwise.
        */
       protected boolean checkLink(URL url){
         try {
           URLConnection connection = url.openConnection();
           connection.connect();
           return true;
         } catch ( IOException e ) {
           return false;
         }
       }
       /**
        * Called when the spider finds an e-mail address
        * 
        * @param email The email address the spider found.
        */
       public void spiderFoundEMail(String email){
       }
       /**
        * Internal class used to update the error information
        * in a Thread-Safe way
        * 
        * @author Jeff Heaton
        * @version 1.0
        */
       class UpdateErrors implements Runnable {
         public String msg;
         public void run(){
           errors.append(msg);
         }
       }
       /**
        * Used to update the current status information
        * in a "Thread-Safe" way
        * 
        * @author Jeff Heaton
        * @version 1.0
        */
       class UpdateCurrentStats implements Runnable {
         public String msg;
         public void run(){
           current.setText("Currently Processing: " + msg );
           goodLinksLabel.setText("Good Links: " + goodLinksCount);
           badLinksLabel.setText("Bad Links: " + badLinksCount);
         }
       }
    }
    
    Listing 2: Reporting spider events (ISpiderReportable.java)
    import java.net.*;
    /**
     * Callbacks through which a spider reports what it encounters.
     */
    interface ISpiderReportable {
       /**
        * Reports a link discovered on a page.
        *
        * @param page The page the link was found on.
        * @param link The address the link points to.
        * @return True if the link should be added to the spider's workload.
        */
       boolean spiderFoundURL(URL page, URL link);

       /**
        * Reports a URL that could not be processed.
        *
        * @param failed The URL that resulted in an error.
        */
       void spiderURLError(URL failed);

       /**
        * Reports an e-mail link discovered on a page.
        *
        * @param address The e-mail link text as found.
        */
       void spiderFoundEMail(String address);
    }
    
    Listing 3: A reusable spider (Spider.java)
    import java.util.*;
    import java.net.*;
    import java.io.*;
    import javax.swing.text.*;
    import javax.swing.text.html.*;
    /**
     * This class implements a reusable spider. URLs are kept in three
     * workloads (waiting, processed, error); {@link #begin()} processes
     * waiting URLs until none remain or {@link #cancel()} is called, and
     * every link found is offered to an {@link ISpiderReportable}.
     * 
     * @author Jeff Heaton(http://www.jeffheaton.com)
     * @version 1.0
     */
    public class Spider {
       /**
        * A collection of URLs that resulted in an error
        */
       protected Collection workloadError = new ArrayList(3);
       /**
        * A collection of URLs that are waiting to be processed
        */
       protected Collection workloadWaiting = new ArrayList(3);
       /**
        * A collection of URLs that were processed
        */
       protected Collection workloadProcessed = new ArrayList(3);
       /**
        * The class that the spider should report its URLs to
        */
       protected ISpiderReportable report;
       /**
        * A flag that indicates whether this process
        * should be canceled. Volatile because cancel() is meant to be
        * called from a different thread than the one running begin().
        */
       protected volatile boolean cancel = false;
       /**
        * The constructor
        * 
        * @param report A class that implements the ISpiderReportable
        * interface, that will receive information that the
        * spider finds.
        */
       public Spider(ISpiderReportable report){
         this.report = report;
       }
       /**
        * Get the URLs that resulted in an error.
        * 
        * @return The spider's own collection of URLs (not a copy).
        */
       public Collection getWorkloadError(){
         return workloadError;
       }
       /**
        * Get the URLs that were waiting to be processed.
        * You should add one URL to this collection to
        * begin the spider.
        * 
        * @return The spider's own collection of URLs (not a copy).
        */
       public Collection getWorkloadWaiting(){
         return workloadWaiting;
       }
       /**
        * Get the URLs that were processed by this spider.
        * 
        * @return The spider's own collection of URLs (not a copy).
        */
       public Collection getWorkloadProcessed(){
         return workloadProcessed;
       }    
       /**
        * Clear all of the workloads.
        */
       public void clear(){
         getWorkloadError().clear();
         getWorkloadWaiting().clear();
         getWorkloadProcessed().clear();
       }
       /**
        * Set a flag that will cause the begin
        * method to return before it is done.
        */
       public void cancel(){
         cancel = true;
       }
       /**
        * Add a URL for processing. A URL already present in any of the
        * three workloads is ignored, so each URL is processed at most once.
        * 
        * @param url The URL to queue.
        */
       public void addURL(URL url){
         if ( getWorkloadWaiting().contains(url) )
           return;
         if ( getWorkloadError().contains(url) )
           return;
         if ( getWorkloadProcessed().contains(url) )
           return;
         log("Adding to workload: " + url );
         getWorkloadWaiting().add(url);
       }
       /**
        * Called internally to process a URL. Content whose type does not
        * start with "text/" is marked processed without being read; other
        * content is parsed as HTML and its links are reported. On an I/O
        * failure the URL is moved to the error workload and reported
        * through spiderURLError.
        * 
        * @param url The URL to be processed.
        */
       public void processURL(URL url){
         try {
           log("Processing: " + url );
           // get the URL's contents
           URLConnection connection = url.openConnection();
           String contentType = connection.getContentType();
           if ( (contentType!=null) && !contentType.toLowerCase().startsWith("text/") ) {
             // The body is never read, so release the connection here.
             if ( connection instanceof HttpURLConnection )
               ((HttpURLConnection)connection).disconnect();
             getWorkloadWaiting().remove(url);
             getWorkloadProcessed().add(url);
             log("Not processing because content type is: " + contentType );
             return;
           }
          
           // read the URL
           InputStream is = connection.getInputStream();
           try {
             Reader r = new InputStreamReader(is);
             // parse the URL
             HTMLEditorKit.Parser parse = new HTMLParse().getParser();
             parse.parse(r,new Parser(url),true);
           } finally {
             try {
               is.close();
             } catch ( IOException ignored ) {
               // A failed close must not turn a parsed page into an error.
             }
           }
         } catch ( IOException e ) {
           getWorkloadWaiting().remove(url);
           getWorkloadError().add(url);
           log("Error: " + url );
           report.spiderURLError(url);
           return;
         }
         // mark URL as complete
         getWorkloadWaiting().remove(url);
         getWorkloadProcessed().add(url);
         log("Complete: " + url );
       }
       /**
        * Called to start the spider. Returns when the waiting workload is
        * empty or cancel() has been called.
        */
       public void begin(){
         cancel = false;
         while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
           // Work on a snapshot: processURL removes from and (through the
           // parser callback) adds to the waiting workload.
           Object list[] = getWorkloadWaiting().toArray();
           for ( int i=0;(i<list.length)&&!cancel;i++ )
             processURL((URL)list[i]);
         }
       }
       /**
        * A HTML parser callback used by this class to detect links
        * 
        * @author Jeff Heaton
        * @version 1.0
        */
       protected class Parser
       extends HTMLEditorKit.ParserCallback {
         /**
          * The page being parsed; relative links are resolved against it.
          */
         protected URL base;
         public Parser(URL base){
           this.base = base;
         }
         /**
          * Extracts the link target of a tag (HREF, or SRC for a FRAME
          * tag without HREF), strips any "#fragment", and reports it as
          * an e-mail address or as a link.
          */
         public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a,int pos){
           String href = (String)a.getAttribute(HTML.Attribute.HREF);
          
           if( (href==null) && (t==HTML.Tag.FRAME) )
             href = (String)a.getAttribute(HTML.Attribute.SRC);
            
           if ( href==null )
             return;
           int i = href.indexOf('#');
           if ( i!=-1 )
             href = href.substring(0,i);
           // Match the whole scheme so that ordinary links such as
           // "mailtest.html" are not mistaken for e-mail addresses.
           if ( href.toLowerCase().startsWith("mailto:") ) {
             report.spiderFoundEMail(href);
             return;
           }
           handleLink(base,href);
         }
         public void handleStartTag(HTML.Tag t, MutableAttributeSet a,int pos){
           handleSimpleTag(t,a,pos);     // handle the same way
         }
         /**
          * Resolves a link against its page, reports it, and queues it
          * when the report asks for it to be followed.
          */
         protected void handleLink(URL base,String str){
           try {
             URL url = new URL(base,str);
             if ( report.spiderFoundURL(base,url) )
               addURL(url);
           } catch ( MalformedURLException e ) {
             log("Found malformed URL: " + str );
           }
         }
       }
       /**
        * Called internally to log information
        * This basic method just writes the log
        * out to the stdout.
        * 
        * @param entry The information to be written to the log.
        */
       public void log(String entry){
         System.out.println( (new Date()) + ":" + entry );
       }
    }
    
    Listing 4: Parsing HTML (HTMLParse.java)
    import javax.swing.text.html.*;
    public class HTMLParse extends HTMLEditorKit {
       public HTMLEditorKit.Parser getParser(){
         return super.getParser();
       }
    }
  • 相关阅读:
    安装rqalpha的日志
    从github上下载一个csv文件
    PyQt4 里的表格部件的使用方法: QTableWidget
    markdown里的多层次列表项
    打包python脚本为exe的坎坷经历, by pyinstaller方法
    Spyder docstrings文档字符串的标准
    Plot Candlestick Charts in Research of quantopian
    另类之将ipython notebook嵌入blog方法
    Jupyter Notebook Tutorial: Introduction, Setup, and Walkthrough
    爬虫视频讲座
  • 原文地址:https://www.cnblogs.com/cRaZy-TyKeIo/p/3543294.html
Copyright © 2011-2022 走看看