zoukankan html css js c++ java

Programming a Spider in Java 源码帖

Programming a Spider in Java 源码帖

Listing 1: Finding the bad links (CheckLinks.java)
import java.awt.*;
import javax.swing.*;
import java.net.*;
import java.io.*;
/**
* This example uses a Java spider to scan a Web site
* and check for broken links. Written by Jeff Heaton.
* Jeff Heaton is the author of "Programming Spiders,
* Bots, and Aggregators" by Sybex. Jeff can be contacted
* through his Web site at http://www.jeffheaton.com.
* 
* @author Jeff Heaton(http://www.jeffheaton.com)
* @version 1.0
*/
public class CheckLinks extends javax.swing.JFrame implements Runnable,ISpiderReportable {
   /**
    * The constructor. Perform setup here.
    */
   public CheckLinks() {
     //{{INIT_CONTROLS
     setTitle("Find Broken Links");
     getContentPane().setLayout(null);
     setSize(405,288);
     setVisible(false);
     label1.setText("Enter a URL:");
     getContentPane().add(label1);
     label1.setBounds(12,12,84,12);
     begin.setText("Begin");
     begin.setActionCommand("Begin");
     getContentPane().add(begin);
     begin.setBounds(12,36,84,24);
     getContentPane().add(url);
     url.setBounds(108,36,288,24);
     errorScroll.setAutoscrolls(true);
     errorScroll.setHorizontalScrollBarPolicy(javax.swing.ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);
     errorScroll.setVerticalScrollBarPolicy(javax.swing.ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
     errorScroll.setOpaque(true);
     getContentPane().add(errorScroll);
     errorScroll.setBounds(12,120,384,156);
     errors.setEditable(false);
     errorScroll.getViewport().add(errors);
     errors.setBounds(0,0,366,138);
     current.setText("Currently Processing: ");
     getContentPane().add(current);
     current.setBounds(12,72,384,12);
     goodLinksLabel.setText("Good Links: 0");
     getContentPane().add(goodLinksLabel);
     goodLinksLabel.setBounds(12,96,192,12);
     badLinksLabel.setText("Bad Links: 0");
     getContentPane().add(badLinksLabel);
     badLinksLabel.setBounds(216,96,96,12);
     //}}
     //{{INIT_MENUS
     //}}
     //{{REGISTER_LISTENERS
     SymAction lSymAction = new SymAction();
     begin.addActionListener(lSymAction);
     //}}
   }
   /**
    * Main method for the application
    * 
    * @param args Not used
    */
   static public void main(String args[]){
     (new CheckLinks()).setVisible(true);
   }
   /**
    * Add notifications.
    */
   public void addNotify(){
     // Record the size of the window prior to calling parent's
     // addNotify.
     Dimension size = getSize();
     super.addNotify();
     if ( frameSizeAdjusted )
       return;
     frameSizeAdjusted = true;
     // Adjust size of frame according to the insets and menu bar
     Insets insets = getInsets();
     javax.swing.JMenuBar menuBar = getRootPane().getJMenuBar();
     int menuBarHeight = 0;
     if ( menuBar != null )
       menuBarHeight = menuBar.getPreferredSize().height;
     setSize(insets.left + insets.right + size.width, insets.top + insets.bottom + size.height + menuBarHeight);
   }
   // Used by addNotify
   boolean frameSizeAdjusted = false;
   //{{DECLARE_CONTROLS
   javax.swing.JLabel label1 = new javax.swing.JLabel();
   /**
    * The begin or cancel button
    */
   javax.swing.JButton begin = new javax.swing.JButton();
   /**
    * The URL being processed
    */
   javax.swing.JTextField url = new javax.swing.JTextField();
   /**
    * Scroll the errors.
    */
   javax.swing.JScrollPane errorScroll = new javax.swing.JScrollPane();
   /**
    * A place to store the errors created
    */
   javax.swing.JTextArea errors = new javax.swing.JTextArea();
   javax.swing.JLabel current = new javax.swing.JLabel();
   javax.swing.JLabel goodLinksLabel = new javax.swing.JLabel();
   javax.swing.JLabel badLinksLabel = new javax.swing.JLabel();
   //}}
   //{{DECLARE_MENUS
   //}}
   /**
    * The background spider thread
    */
   protected Thread backgroundThread;
   /**
    * The spider object being used
    */
   protected Spider spider;
   /**
    * The URL that the spider began with
    */
   protected URL base;
   /**
    * How many bad links have been found
    */
   protected int badLinksCount = 0;
   /**
    * How many good links have been found
    */
   protected int goodLinksCount = 0;

   /**
    * Internal class used to dispatch events
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class SymAction implements java.awt.event.ActionListener {
     public void actionPerformed(java.awt.event.ActionEvent event){
       Object object = event.getSource();
       if ( object == begin )
         begin_actionPerformed(event);
     }
   }
   /**
    * Called when the begin or cancel buttons are clicked
    * 
    * @param event The event associated with the button.
    */
   void begin_actionPerformed(java.awt.event.ActionEvent event){
     if ( backgroundThread==null ) {
       begin.setLabel("Cancel");
       backgroundThread = new Thread(this);
       backgroundThread.start();
       goodLinksCount=0;
       badLinksCount=0;
     } else {
       spider.cancel();
     }
   }
   /**
    * Perform the background thread operation. This method
    * actually starts the background thread.
    */
   public void run(){
     try {
       errors.setText("");
       spider = new Spider(this);
       spider.clear();
       base = new URL(url.getText());
       spider.addURL(base);
       spider.begin();
       Runnable doLater = new Runnable(){
         public void run(){
           begin.setText("Begin");
         }
       };
       SwingUtilities.invokeLater(doLater);
       backgroundThread=null;
     } catch ( MalformedURLException e ) {
       UpdateErrors err = new UpdateErrors();
       err.msg = "Bad address.";
       SwingUtilities.invokeLater(err);
     }
   }
   /**
    * Called by the spider when a URL is found. It is here
    * that links are validated.
    * 
    * @param base The page that the link was found on.
    * @param url The actual link address.
    */
   public boolean spiderFoundURL(URL base,URL url){
     UpdateCurrentStats cs = new UpdateCurrentStats();
     cs.msg = url.toString();
     SwingUtilities.invokeLater(cs);
     if ( !checkLink(url) ) {
       UpdateErrors err = new UpdateErrors();
       err.msg = url+"(on page " + base + ")
";
       SwingUtilities.invokeLater(err);
       badLinksCount++;
       return false;
     }
     goodLinksCount++;
     if ( !url.getHost().equalsIgnoreCase(base.getHost()) )
       return false;
     else
       return true;
   }
   /**
    * Called when a URL error is found
    * 
    * @param url The URL that resulted in an error.
    */
   public void spiderURLError(URL url){
   }
   /**
    * Called internally to check whether a link is good
    * 
    * @param url The link that is being checked.
    * @return True if the link was good, false otherwise.
    */
   protected boolean checkLink(URL url){
     try {
       URLConnection connection = url.openConnection();
       connection.connect();
       return true;
     } catch ( IOException e ) {
       return false;
     }
   }
   /**
    * Called when the spider finds an e-mail address
    * 
    * @param email The email address the spider found.
    */
   public void spiderFoundEMail(String email){
   }
   /**
    * Internal class used to update the error information
    * in a Thread-Safe way
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateErrors implements Runnable {
     public String msg;
     public void run(){
       errors.append(msg);
     }
   }
   /**
    * Used to update the current status information
    * in a "Thread-Safe" way
    * 
    * @author Jeff Heaton
    * @version 1.0
    */
   class UpdateCurrentStats implements Runnable {
     public String msg;
     public void run(){
       current.setText("Currently Processing: " + msg );
       goodLinksLabel.setText("Good Links: " + goodLinksCount);
       badLinksLabel.setText("Bad Links: " + badLinksCount);
     }
   }
}

Listing 2: Reporting spider events (ISpiderReportable.java)
import java.net.*;
/**
 * Callback contract through which a spider reports what it encounters
 * while crawling. Implementations decide what to do with each finding.
 */
interface ISpiderReportable {
   /**
    * Reports a link discovered on a page.
    *
    * @param base The page on which the link was found.
    * @param url The resolved address of the link.
    * @return True if the spider should queue the link for crawling,
    * false to have it ignored.
    */
   boolean spiderFoundURL(URL base,URL url);

   /**
    * Reports a URL that could not be processed.
    *
    * @param url The URL that resulted in an error.
    */
   void spiderURLError(URL url);

   /**
    * Reports a mail link discovered on a page.
    *
    * @param email The link text as found by the spider.
    */
   void spiderFoundEMail(String email);
}

Listing 3: A reusable spider (Spider.java)
import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
* This class implements a reusable spider. URLs are kept in three
* workloads (waiting, processed, error); {@link #begin()} keeps
* processing waiting URLs, reporting every link it finds to an
* {@link ISpiderReportable}, until nothing is waiting or
* {@link #cancel()} is called.
* 
* @author Jeff Heaton(http://www.jeffheaton.com)
* @version 1.0
*/
public class Spider {
   /**
    * A collection of URLs that resulted in an error
    */
   protected Collection workloadError = new ArrayList(3);
   /**
    * A collection of URLs that are waiting to be processed
    */
   protected Collection workloadWaiting = new ArrayList(3);
   /**
    * A collection of URLs that were processed
    */
   protected Collection workloadProcessed = new ArrayList(3);
   /**
    * The class that the spider should report its URLs to
    */
   protected ISpiderReportable report;
   /**
    * A flag that indicates whether this process
    * should be canceled. Volatile because cancel() is
    * typically called from a different thread than the
    * one running begin().
    */
   protected volatile boolean cancel = false;
   /**
    * The constructor
    * 
    * @param report A class that implements the ISpiderReportable
    * interface, that will receive information that the
    * spider finds.
    */
   public Spider(ISpiderReportable report){
     this.report = report;
   }
   /**
    * Get the URLs that resulted in an error.
    * 
    * @return A collection of URL's.
    */
   public Collection getWorkloadError(){
     return workloadError;
   }
   /**
    * Get the URLs that were waiting to be processed.
    * You should add one URL to this collection to
    * begin the spider.
    * 
    * @return A collection of URLs.
    */
   public Collection getWorkloadWaiting(){
     return workloadWaiting;
   }
   /**
    * Get the URLs that were processed by this spider.
    * 
    * @return A collection of URLs.
    */
   public Collection getWorkloadProcessed(){
     return workloadProcessed;
   }    
   /**
    * Clear all of the workloads.
    */
   public void clear(){
     getWorkloadError().clear();
     getWorkloadWaiting().clear();
     getWorkloadProcessed().clear();
   }
   /**
    * Set a flag that will cause the begin
    * method to return before it is done.
    */
   public void cancel(){
     cancel = true;
   }
   /**
    * Add a URL for processing. URLs already present in any
    * of the three workloads are ignored, so each URL is
    * processed at most once.
    * 
    * @param url The URL to queue.
    */
   public void addURL(URL url){
     if ( getWorkloadWaiting().contains(url) )
       return;
     if ( getWorkloadError().contains(url) )
       return;
     if ( getWorkloadProcessed().contains(url) )
       return;
     log("Adding to workload: " + url );
     getWorkloadWaiting().add(url);
   }
   /**
    * Called internally to process a URL: download it, parse it as
    * HTML and report every link found. Content whose type is known
    * and is not text is not downloaded. The URL is moved from the
    * waiting workload to the processed workload, or to the error
    * workload if an I/O error occurs.
    * 
    * @param url The URL to be processed.
    */
   public void processURL(URL url){
     Reader r = null;
     try {
       log("Processing: " + url );
       // get the URL's contents
       URLConnection connection = url.openConnection();
       String contentType = connection.getContentType();
       if ( (contentType!=null) && !contentType.toLowerCase(Locale.ENGLISH).startsWith("text/") ) {
         getWorkloadWaiting().remove(url);
         getWorkloadProcessed().add(url);
         log("Not processing because content type is: " + contentType );
         return;
       }
      
       // read the URL
       InputStream is = connection.getInputStream();
       r = new InputStreamReader(is);
       // parse the URL
       HTMLEditorKit.Parser parse = new HTMLParse().getParser();
       parse.parse(r,new Parser(url),true);
     } catch ( IOException e ) {
       getWorkloadWaiting().remove(url);
       getWorkloadError().add(url);
       log("Error: " + url );
       report.spiderURLError(url);
       return;
     } finally {
       // Always release the stream, on success and on failure alike.
       closeQuietly(r);
     }
     // mark URL as complete
     getWorkloadWaiting().remove(url);
     getWorkloadProcessed().add(url);
     log("Complete: " + url );
   }
   /**
    * Close a reader, ignoring any failure to do so.
    * 
    * @param reader The reader to close, may be null.
    */
   private void closeQuietly(Reader reader){
     if ( reader==null )
       return;
     try {
       reader.close();
     } catch ( IOException ignored ) {
       // The page has already been handled; a failed close
       // leaves nothing to recover.
     }
   }
   /**
    * Called to start the spider. Returns when no URLs are
    * waiting any more or when cancel() has been called.
    */
   public void begin(){
     cancel = false;
     while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
       // Work on a snapshot: processURL modifies the waiting workload.
       Object list[] = getWorkloadWaiting().toArray();
       for ( int i=0;(i<list.length)&&!cancel;i++ )
         processURL((URL)list[i]);
     }
   }
/**
* A HTML parser callback used by this class to detect links
* 
* @author Jeff Heaton
* @version 1.0
*/
   protected class Parser
   extends HTMLEditorKit.ParserCallback {
     /**
      * The page being parsed; relative links are resolved against it.
      */
     protected URL base;
     public Parser(URL base){
       this.base = base;
     }
     /**
      * Inspect a tag for a link: its HREF attribute, or the SRC
      * attribute of a FRAME tag. Any fragment ("#...") is removed.
      * mailto: links are reported as e-mail addresses, everything
      * else is handled as a link.
      */
     public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a,int pos){
       String href = (String)a.getAttribute(HTML.Attribute.HREF);
      
       if( (href==null) && (t==HTML.Tag.FRAME) )
         href = (String)a.getAttribute(HTML.Attribute.SRC);
        
       if ( href==null )
         return;
       int i = href.indexOf('#');
       if ( i!=-1 )
         href = href.substring(0,i);
       // Match the complete scheme so that ordinary links such as
       // "mailtips.html" are still crawled, and lower-case with a
       // fixed locale so "MAILTO:" is recognised everywhere.
       if ( href.toLowerCase(Locale.ENGLISH).startsWith("mailto:") ) {
         report.spiderFoundEMail(href);
         return;
       }
       handleLink(base,href);
     }
     public void handleStartTag(HTML.Tag t, MutableAttributeSet a,int pos){
       handleSimpleTag(t,a,pos);     // handle the same way
     }
     /**
      * Resolve a link against the page it was found on, report it,
      * and queue it if the report asks for that.
      * 
      * @param base The page the link was found on.
      * @param str The link as written in the page.
      */
     protected void handleLink(URL base,String str){
       try {
         URL url = new URL(base,str);
         if ( report.spiderFoundURL(base,url) )
           addURL(url);
       } catch ( MalformedURLException e ) {
         log("Found malformed URL: " + str );
       }
     }
   }
   /**
    * Called internally to log information
    * This basic method just writes the log
    * out to the stdout.
    * 
    * @param entry The information to be written to the log.
    */
   public void log(String entry){
     System.out.println( (new Date()) + ":" + entry );
   }
}

Listing 4: Parsing HTML (HTMLParse.java)
import javax.swing.text.html.*;
/**
 * A minimal HTMLEditorKit subclass that exposes the kit's HTML parser
 * through a public method so that other classes can obtain it.
 */
public class HTMLParse extends HTMLEditorKit {
   /**
    * Get the HTML parser provided by HTMLEditorKit.
    *
    * @return The parser returned by the superclass.
    */
   public HTMLEditorKit.Parser getParser(){
     HTMLEditorKit.Parser inherited = super.getParser();
     return inherited;
   }
}

查看全文

相关阅读:
只是为了好玩——Linux之父林纳斯自传
 Unity Sprite Atlas Compression
Bitmap动画
 UnityShader：HSV(色相，饱和度，亮度)转换
 Using Flash Builder with Flash Professional
Flash Decompiler
One Night Ultimate Werewolf Daybreak
Visual Studio CLR Profiler
Photoshop 融合属性 Unity Shader
.NET GC

原文地址：https://www.cnblogs.com/cRaZy-TyKeIo/p/3543294.html